Example #1
def strip_reward_array(buffer):

    fresh_buffer = ReplayBuffer(len(buffer))

    print("Copying environment buffer: ")
    for i in range(len(buffer)):
        obs_t, action, reward, obs_tp1, done = buffer._storage[i]
        fresh_buffer.add(obs_t, action, reward[0], obs_tp1, done)

    return fresh_buffer
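
The helper above assumes each stored reward is a length-1 array (as produced by some vectorized environments) and copies every transition into a fresh buffer with the scalar reward only. A minimal usage sketch, assuming the stable-baselines 2.x ReplayBuffer (later releases moved the class to stable_baselines.common.buffers):

# Hedged usage sketch; the import path and the array-valued rewards are assumptions.
import numpy as np
from stable_baselines.deepq.replay_buffer import ReplayBuffer

old_buffer = ReplayBuffer(100)
old_buffer.add(np.zeros(4), 0, np.array([1.0]), np.ones(4), False)

flat_buffer = strip_reward_array(old_buffer)
assert flat_buffer._storage[0][2] == 1.0  # reward is now a plain scalar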
Example #2
    def init_buffer(self, fpath=None, buffer_size=None):

        with open(fpath, 'rb') as f:
            buffer_env = pickle.load(f)

        buffer_model = ReplayBuffer(buffer_size)

        print("Copying environment buffer: ")
        for i in tqdm(range(len(buffer_env))):
            obs_t, action, reward, obs_tp1, done = buffer_env._storage[i]
            buffer_model.add(obs_t, action, reward, obs_tp1, done)

        return buffer_env, buffer_model
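
init_buffer() expects fpath to point at a pickled environment replay buffer. A hedged sketch of how such a file could be produced beforehand; the filename and buffer contents are placeholders:

# Hypothetical counterpart that writes the pickle init_buffer() later loads.
import pickle
from stable_baselines.deepq.replay_buffer import ReplayBuffer  # assumed import path

buffer_env = ReplayBuffer(50000)
# ... fill buffer_env by stepping the environment and calling buffer_env.add(...) ...
with open('env_buffer.pkl', 'wb') as f:
    pickle.dump(buffer_env, f)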
Example #3
def test_extend_uniform():
    nvals = 16
    states = [np.random.rand(2, 2) for _ in range(nvals)]
    actions = [np.random.rand(2) for _ in range(nvals)]
    rewards = [np.random.rand() for _ in range(nvals)]
    newstate = [np.random.rand(2, 2) for _ in range(nvals)]
    done = [np.random.randint(0, 2) for _ in range(nvals)]

    size = 32
    baseline = ReplayBuffer(size)
    ext = ReplayBuffer(size)
    for data in zip(states, actions, rewards, newstate, done):
        baseline.add(*data)

    states, actions, rewards, newstates, done = map(
        np.array, [states, actions, rewards, newstate, done])

    ext.extend(states, actions, rewards, newstates, done)
    assert len(baseline) == len(ext)

    # Check buffers have same values
    for i in range(nvals):
        for j in range(5):
            condition = (baseline.storage[i][j] == ext.storage[i][j])
            if isinstance(condition, np.ndarray):
                # for obs_t, obs_tp1
                assert np.all(condition)
            else:
                # for done, reward, action
                assert condition
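
The test relies on extend() inserting a batch of transitions in the same order as repeated add() calls. A minimal sketch of that semantics, assuming a simple list-backed buffer (the real implementation may be vectorized):

# Equivalent behaviour the test checks for, expressed with repeated add() calls.
def extend_via_add(buffer, obses_t, actions, rewards, obses_tp1, dones):
    for transition in zip(obses_t, actions, rewards, obses_tp1, dones):
        buffer.add(*transition)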
Example #4
def main(fpath):
    train_data = pd.read_csv(fpath)
    n_products = train_data['product'].max() + 1
    n_regions = train_data['region'].max() + 1

    buffer = ReplayBuffer(size=100000)
    grouped = train_data.groupby(by='date')

    prev_state = None

    for date, chunk in grouped:
        board_config = np.zeros([n_regions, n_products])
        prev_sales = np.zeros([n_regions, n_products])

        day = chunk.iloc[0, 8]

        prev_sales_product = {}
        prev_placement_cnts = {}
        for idx, row in chunk.iterrows():

            region = row['region']
            product = row['product']

            prev_sales_product[product] = row['prev_sales']

            if row['quantity'] > 0:
                board_config[region, product] = 1.0

                if product not in prev_placement_cnts:
                    prev_placement_cnts[product] = 0

                prev_placement_cnts[product] += 1

        for p in range(n_products):

            if p not in prev_placement_cnts:
                continue

            sales = prev_sales_product[p]
            cnt = prev_placement_cnts[p]
            avg_spatial_sales = sales / cnt
            regions = board_config[:, p]

            prev_sales[:, p] = regions * avg_spatial_sales

        day_vec = State.get_day_vec(day)

        state = {
            "day_vec": day_vec,
            "prev_sales": prev_sales,
            "board_config": board_config
        }

        if prev_state is not None:
            action = state['board_config'] - prev_state['board_config']

        prev_state = state
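
The loop derives action as the change in board configuration between consecutive dates but never writes anything to buffer, so the snippet is presumably truncated. A hedged continuation sketch that would sit inside the date loop; reward is a placeholder, since the original does not show how it is computed:

# Hedged continuation sketch: store the derived transition in the replay buffer.
if prev_state is not None:
    action = state['board_config'] - prev_state['board_config']
    reward = float(chunk['quantity'].sum())  # placeholder/assumed reward signal
    buffer.add(prev_state, action, reward, state, False)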
Example #5
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space,
                                                     **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32,
                                                     shape=(None, ) +
                                                     self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        self.actions_ph,
                        create_qf=True,
                        create_vf=True)  # Q(s,a)
                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        policy_out,
                        create_qf=True,
                        create_vf=False,
                        reuse=True)  # Q(s, pi(a|s))

                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == 'auto':
                        # automatically set target entropy if needed
                        self.target_entropy = -np.prod(
                            self.env.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)

                    # The entropy coefficient or entropy can be learned automatically
                    # see Automating Entropy Adjustment for Maximum Entropy RL section
                    # of https://arxiv.org/abs/1812.05905
                    if isinstance(self.ent_coef,
                                  str) and self.ent_coef.startswith('auto'):
                        # Default initial value of ent_coef when learned
                        init_value = 1.0
                        if '_' in self.ent_coef:
                            init_value = float(self.ent_coef.split('_')[1])
                            assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                        self.log_ent_coef = tf.get_variable(
                            'log_ent_coef',
                            dtype=tf.float32,
                            initializer=np.log(init_value).astype(np.float32))
                        self.ent_coef = tf.exp(self.log_ent_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.ent_coef = float(self.ent_coef)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(
                        self.processed_next_obs_ph,
                        create_qf=False,
                        create_vf=True)
                    self.value_target = value_target

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Target for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * self.value_target)

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2)
                    qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2)

                    # Compute the entropy temperature loss
                    # it is used when the entropy coefficient is learned
                    ent_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.ent_coef, float):
                        ent_coef_loss = -tf.reduce_mean(
                            self.log_ent_coef *
                            tf.stop_gradient(logp_pi + self.target_entropy))
                        entropy_optimizer = tf.train.AdamOptimizer(
                            learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi -
                                                    qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the Gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # Target for value fn regression
                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    v_backup = tf.stop_gradient(min_qf_pi -
                                                self.ent_coef * logp_pi)
                    value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss, var_list=get_vars('model/pi'))

                    # Value train op
                    value_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    values_params = get_vars('model/values_fn')

                    source_params = get_vars("model/values_fn/vf")
                    target_params = get_vars("target/values_fn/vf")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates ops in a nondeterministic order,
                    # and we need to compute the policy action before computing the Q-value losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(
                            values_losses, var_list=values_params)

                        self.infos_names = [
                            'policy_loss', 'qf1_loss', 'qf2_loss',
                            'value_loss', 'entropy'
                        ]
                        # All ops to call during one training step
                        self.step_ops = [
                            policy_loss, qf1_loss, qf2_loss, value_loss, qf1,
                            qf2, value_fn, logp_pi, self.entropy,
                            policy_train_op, train_values_op
                        ]

                        # Add entropy coefficient optimization operation if needed
                        if ent_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                ent_coef_op = entropy_optimizer.minimize(
                                    ent_coef_loss, var_list=self.log_ent_coef)
                                self.infos_names += [
                                    'ent_coef_loss', 'ent_coef'
                                ]
                                self.step_ops += [
                                    ent_coef_op, ent_coef_loss, self.ent_coef
                                ]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('value_loss', value_loss)
                    tf.summary.scalar('entropy', self.entropy)
                    if ent_coef_loss is not None:
                        tf.summary.scalar('ent_coef_loss', ent_coef_loss)
                        tf.summary.scalar('ent_coef', self.ent_coef)

                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = get_vars("model")
                self.target_params = get_vars("target/values_fn/vf")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
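
The Polyak averaging ops above move each target variable toward its source counterpart at rate tau. The same update outside TensorFlow, as a small NumPy reference sketch:

# target <- (1 - tau) * target + tau * source, the update built in target_update_op.
import numpy as np

def polyak_update(target_params, source_params, tau):
    return [(1.0 - tau) * target + tau * source
            for target, source in zip(target_params, source_params)]

targets = polyak_update([np.zeros(3)], [np.ones(3)], tau=0.005)  # each entry becomes 0.005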
Example #6
class DQN(OffPolicyRLModel):
    """
    The DQN model class. DQN paper: https://arxiv.org/pdf/1312.5602.pdf

    :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) discount factor
    :param learning_rate: (float) learning rate for adam optimizer
    :param buffer_size: (int) size of the replay buffer
    :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is
            annealed
    :param exploration_final_eps: (float) final value of random action probability
    :param train_freq: (int) update the model every `train_freq` steps
    :param batch_size: (int) size of a batch sampled from the replay buffer for training
    :param checkpoint_freq: (int) how often to save the model. This is so that the best version is restored at the
            end of the training. If you do not wish to restore the best version
            at the end of the training, set this variable to None.
    :param checkpoint_path: (str) replacement path used if you need to log to somewhere else than a temporary
            directory.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps.
    :param prioritized_replay: (bool) if True, a prioritized replay buffer will be used.
    :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer
    :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
    :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from its initial
            value to 1.0. If set to None, it defaults to max_timesteps.
    :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
    :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy.
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    """
    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 learning_rate=5e-4,
                 buffer_size=50000,
                 exploration_fraction=0.1,
                 exploration_final_eps=0.02,
                 train_freq=1,
                 batch_size=32,
                 checkpoint_freq=10000,
                 checkpoint_path=None,
                 learning_starts=1000,
                 target_network_update_freq=500,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6,
                 param_noise=False,
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True):

        # TODO: replay_buffer refactoring
        super(DQN, self).__init__(policy=policy,
                                  env=env,
                                  replay_buffer=None,
                                  verbose=verbose,
                                  policy_base=DQNPolicy,
                                  requires_vec_env=False)

        self.checkpoint_path = checkpoint_path
        self.param_noise = param_noise
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_eps = prioritized_replay_eps
        self.batch_size = batch_size
        self.target_network_update_freq = target_network_update_freq
        self.checkpoint_freq = checkpoint_freq
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.prioritized_replay_beta0 = prioritized_replay_beta0
        self.prioritized_replay_beta_iters = prioritized_replay_beta_iters
        self.exploration_final_eps = exploration_final_eps
        self.exploration_fraction = exploration_fraction
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tensorboard_log = tensorboard_log

        self.graph = None
        self.sess = None
        self._train_step = None
        self.step_model = None
        self.update_target = None
        self.act = None
        self.proba_step = None
        self.replay_buffer = None
        self.beta_schedule = None
        self.exploration = None
        self.params = None
        self.summary = None
        self.episode_reward = None

        if _init_setup_model:
            self.setup_model()

    def setup_model(self):
        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling)
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = deepq.build_train(
                    q_func=self.policy,
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess)
                self.proba_step = self.step_model.proba_step
                self.params = find_trainable_variables("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN"):
        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None
            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))

            for step in range(total_timesteps):
                if callback is not None:
                    callback(locals(), globals())
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(step)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(step) +
                                self.exploration.value(step) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, _ = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer, step)

                episode_rewards[-1] += rew
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                if step > self.learning_starts and step % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(step))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + step) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(run_metadata,
                                                    'step%d' % step)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, step)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if step > self.learning_starts and step % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", step)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(step)))
                    logger.dump_tabular()

        return self

    def predict(self, observation, state=None, mask=None, deterministic=True):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        with self.sess.as_default():
            actions, _, _ = self.step_model.step(observation,
                                                 deterministic=deterministic)

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def action_probability(self, observation, state=None, mask=None):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        actions_proba = self.proba_step(observation, state, mask)

        if not vectorized_env:
            if state is not None:
                raise ValueError(
                    "Error: The environment must be vectorized when using recurrent policies."
                )
            actions_proba = actions_proba[0]

        return actions_proba

    def save(self, save_path):
        # params
        data = {
            "checkpoint_path": self.checkpoint_path,
            "param_noise": self.param_noise,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "prioritized_replay": self.prioritized_replay,
            "prioritized_replay_eps": self.prioritized_replay_eps,
            "batch_size": self.batch_size,
            "target_network_update_freq": self.target_network_update_freq,
            "checkpoint_freq": self.checkpoint_freq,
            "prioritized_replay_alpha": self.prioritized_replay_alpha,
            "prioritized_replay_beta0": self.prioritized_replay_beta0,
            "prioritized_replay_beta_iters":
            self.prioritized_replay_beta_iters,
            "exploration_final_eps": self.exploration_final_eps,
            "exploration_fraction": self.exploration_fraction,
            "learning_rate": self.learning_rate,
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "_vectorize_action": self._vectorize_action
        }

        params = self.sess.run(self.params)

        self._save_to_file(save_path, data=data, params=params)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        data, params = cls._load_from_file(load_path)

        model = cls(policy=data["policy"], env=env, _init_setup_model=False)
        model.__dict__.update(data)
        model.__dict__.update(kwargs)
        model.set_env(env)
        model.setup_model()

        restores = []
        for param, loaded_p in zip(model.params, params):
            restores.append(param.assign(loaded_p))
        model.sess.run(restores)

        return model
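
Typical end-to-end usage of this DQN class, assuming it mirrors the stable-baselines 2.x API shown above (import paths may differ between versions):

# Hedged usage sketch; 'CartPole-v1' and the hyperparameters are arbitrary examples.
import gym
from stable_baselines import DQN
from stable_baselines.deepq.policies import MlpPolicy

env = gym.make('CartPole-v1')
model = DQN(MlpPolicy, env, prioritized_replay=True, verbose=1)
model.learn(total_timesteps=10000)
model.save('dqn_cartpole')

loaded = DQN.load('dqn_cartpole', env=env)
action, _ = loaded.predict(env.reset())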
Example #7
class DqnAtml(DQN):
    def setup_model(self):

        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling)
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = build_train_atml(
                    q_func=partial(self.policy, **self.policy_kwargs),
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess,
                    full_tensorboard_log=self.full_tensorboard_log)
                self.proba_step = self.step_model.proba_step
                self.params = tf_util.get_trainable_vars("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()

    def get_actions_vec(self, actions_prims, actions_inputs, actions_mf):
        with self.sess.as_default():
            self.embedd_matrix = self.step_model.embedding.get_weights()
        invalid_action = np.zeros(self.embedd_matrix[0].shape[1]) - 1
        self.embedd_matrix = np.vstack([self.embedd_matrix[0], invalid_action])

        embedded_steps = self.embedd_matrix[actions_prims.astype(int)]
        actions_inputs = actions_inputs.reshape(len(actions_prims), -1)
        actions_mf = actions_mf.reshape(len(actions_prims), -1)

        concat_actions = np.concatenate(
            (embedded_steps, actions_inputs, actions_mf), axis=1)
        flatten_act = concat_actions.reshape(-1)

        return flatten_act

    def process_state_vec(self, obs, state_info):
        # transform actions representation with embeddings
        with self.sess.as_default():
            self.embedd_matrix = self.step_model.embedding.get_weights()
        ind1 = state_info['grid_prims_size']
        ind2 = ind1 + state_info['relations_size']
        ind3 = ind2 + state_info['ff_state_size']
        ind4 = ind3 + state_info['action_prims']
        ind5 = ind4 + state_info['action_inputs']
        ind6 = ind5 + state_info['action_mf']
        cells_num = state_info['cells_num']

        actions_prims = obs[ind3:ind4]
        actions_inputs = obs[ind4:ind5]
        actions_mf = obs[ind5:]
        flatten_act = self.get_actions_vec(actions_prims, actions_inputs,
                                           actions_mf)
        final_obs = np.concatenate((obs[:ind3], flatten_act))

        return final_obs

    def hierarchical_step(self, obs, ds_rewards, cnt, kwargs, update_eps):
        register = False
        while not register:
            with self.sess.as_default():
                action = self.predict(np.array(obs)[None])[0][0]
            env_action = action
            reset = False
            new_obs, rew, done, info = self.env.step(env_action)
            level = info.get('hier_level')
            register = info.get('register')

            self.actions_container.append(env_action)
            self.actions_weights.append(level)

            if rew < 0 or register:
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]
                if rew < 0 and not register:
                    self.actions_container = self.actions_container[:-1]
                    self.actions_weights = self.actions_weights[:-1]
                    rep_action = np.zeros(self.action_space.n)
                    rep_action[action] = 1.0

                if register:
                    if rew > 0:
                        ds_rewards.append([cnt, rew])
                        cnt += 1
                    self.actions_container = np.array(self.actions_container)
                    self.actions_weights = np.array(
                        self.actions_weights) / level
                    b = np.zeros(
                        (len(self.actions_container), self.action_space.n))
                    b[np.arange(len(self.actions_container)),
                      self.actions_container.astype(int)] = 1
                    act_replay = np.sum((self.actions_weights * b.T).T, axis=0)
                    rep_action = act_replay / np.sum(act_replay)
                    self.actions_container = []
                    self.actions_weights = []
                self.replay_buffer.add(obs, rep_action, rew, new_obs,
                                       float(done))
                break
            obs = new_obs
        obs = new_obs
        return obs, new_obs, rew, action, done, reset

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              initial_p=1.0):

        self.actions_weights = []
        self.actions_container = []

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        cnt = 0
        ds_rewards = [[0, 0]]
        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None
            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=initial_p,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            obs = self.env.reset()

            reset = True
            self.episode_reward = np.zeros((1, ))

            for _ in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                ''' Hierarchical Step (Start) '''

                obs, new_obs, rew, action, done, reset = self.hierarchical_step(
                    obs, ds_rewards, cnt, kwargs, update_eps)
                ''' Hierarchical Step (End) '''

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        try:
                            new_priorities = np.array([
                                abs(x) for x in td_errors.tolist()
                            ]) + self.prioritized_replay_eps
                            self.replay_buffer.update_priorities(
                                batch_idxes, new_priorities)
                        except AssertionError:
                            print(td_errors)

                if self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

                self.num_timesteps += 1
        return self, ds_rewards
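
The action stored by hierarchical_step() is a soft, level-weighted average of the one-hot actions collected until a registration event. The same computation in isolation, with made-up container contents for illustration:

# Stand-alone NumPy sketch of the act_replay weighting in hierarchical_step().
import numpy as np

n_actions = 4
actions_container = np.array([1, 3, 3])       # actions taken at successive hierarchy steps
actions_weights = np.array([1, 2, 2]) / 2.0   # levels, normalised by the level at registration

b = np.zeros((len(actions_container), n_actions))
b[np.arange(len(actions_container)), actions_container] = 1
act_replay = np.sum((actions_weights * b.T).T, axis=0)
rep_action = act_replay / np.sum(act_replay)  # -> [0., 0.2, 0., 0.8]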
Example #8
# TRY NOT TO MODIFY: setup the environment
env = gym.make(args.gym_id)
env.seed(args.seed)
env.action_space.np_random.seed(args.seed)
env.observation_space.np_random.seed(args.seed)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
input_shape, preprocess_obs_fn = preprocess_obs_space(env.observation_space)
output_shape, preprocess_ac_fn = preprocess_ac_space(env.action_space,
                                                     stochastic=False)

# TODO: initialize agent here:
er = ReplayBuffer(args.buffer_size)


class QNetwork(nn.Module):
    def __init__(self):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(input_shape, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, output_shape)

    def forward(self, x):
        x = preprocess_obs_fn(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
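
QNetwork above is a plain three-layer MLP over the preprocessed observation. A hedged sketch of epsilon-greedy action selection against it, reusing the env and er objects created earlier; the buffer's add() signature is assumed to match the other examples on this page:

# Hedged sketch of one interaction step with epsilon-greedy exploration.
import random
import torch

q_network = QNetwork()
obs = env.reset()
epsilon = 0.1

if random.random() < epsilon:
    action = env.action_space.sample()
else:
    with torch.no_grad():
        # forward() applies preprocess_obs_fn, so a raw observation is assumed to be accepted
        action = torch.argmax(q_network.forward(obs.reshape((1,) + obs.shape))).item()

new_obs, reward, done, _ = env.step(action)
er.add(obs, action, reward, new_obs, float(done))  # assumed signature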
Example #9
    def learn(self,
              total_timesteps,
              seed=None,
              tb_log_name='DQN',
              test_interval=1,
              reset_num_timesteps=True):
        if reset_num_timesteps:
            self.num_timesteps = 0

        with TensorboardWriter(self.graph, self.tensorboard_log,
                               tb_log_name) as writer:
            self._setup_learn(seed)

            self.replay_buffer = ReplayBuffer(size=self.buffer_size)
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)
            episode_rewards = [0.0]
            obs = self.env.reset(train=True)

            best_train_score = None
            best_test_score = None
            self.reward_curve = []

            for _ in range(total_timesteps):
                update_eps = self.exploration.value(self.num_timesteps)
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps)[0]
                new_obs, rew, done, _ = self.env.step(action)

                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                episode_rewards[-1] += rew

                if self.num_timesteps > self.learning_starts:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                        self.batch_size)
                    weights = np.ones_like(rewards)
                    if writer is not None:
                        if (1 + self.num_timesteps) % 100 == 0:
                            summary, td_errors = self.train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                            writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self.train_step(obses_t,
                                                       actions,
                                                       rewards,
                                                       obses_tp1,
                                                       obses_tp1,
                                                       dones,
                                                       weights,
                                                       sess=self.sess)

                if self.num_timesteps > self.learning_starts and self.num_timesteps % self.target_network_update_freq == 0:
                    self.update_target(sess=self.sess)

                if done:

                    print('-------------------------------------')
                    print('steps                     | {}'.format(
                        self.num_timesteps))
                    print('episodes                  | {}'.format(
                        len(episode_rewards)))
                    epsilon = int(100 *
                                  self.exploration.value(self.num_timesteps))
                    print('% time spent exploring    | {}'.format(epsilon))
                    print('--')

                    mean_100ep_reward = -np.inf if len(
                        episode_rewards[-16:-1]) == 0 else round(
                            float(np.mean(episode_rewards[-16:-1])), 1)
                    self.reward_curve.append(mean_100ep_reward)
                    print('mean 15 episode reward    | {:.1f}'.format(
                        mean_100ep_reward))

                    journal = self.env.sim.journal
                    print('Total operations          | {}'.format(
                        len(self.env.sim.journal)))
                    longs = [x for x in journal if x['Type'] == 'LONG']
                    shorts = [x for x in journal if x['Type'] == 'SHORT']
                    print('Long/Short                | {}/{}'.format(
                        len(longs), len(shorts)))
                    print('Avg duration trades       | {:.2f}'.format(
                        np.mean([j['Trade Duration'] for j in journal])))
                    total_profit = sum([j['Profit'] for j in journal])
                    print('Total profit              | {:.2f}'.format(
                        total_profit))
                    print('Avg profit per trade      | {:.3f}'.format(
                        total_profit / self.env.sim.total_trades))

                    if epsilon <= self.exploration_final_eps * 100:
                        if best_train_score is None or total_profit > best_train_score:
                            self.save('saves/best_model_train.pkl')
                            best_train_score = total_profit

                    if self.num_timesteps % test_interval == 0:
                        print('--')
                        test_episode_rewards, test_longs, test_shorts, test_ave_profit_per_trade = self.test(
                        )
                        print('Total profit test         > {:.2f}'.format(
                            test_episode_rewards))
                        print('Long/Short test           > {}/{}'.format(
                            test_longs, test_shorts))
                        print('Avg profit per trade test > {:.3f}'.format(
                            test_ave_profit_per_trade))

                        if epsilon <= self.exploration_final_eps * 100:
                            if best_test_score is None or test_episode_rewards > best_test_score:
                                self.save('saves/best_model_test.pkl')
                                best_test_score = test_episode_rewards
                    print('-------------------------------------')

                    obs = self.env.reset()
                    episode_rewards.append(0.0)

                    if self.num_timesteps + (
                            self.num_timesteps /
                            len(episode_rewards)) >= total_timesteps:
                        self.save('saves/final_model.pkl')
                        break

                self.num_timesteps += 1
        return self
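
Note that the reward summary in this learn() averages the 15 most recently completed episodes (episode_rewards[-16:-1] excludes the in-progress one), despite the variable name. A tiny illustration of that window:

# Illustration of the [-16:-1] averaging window used above.
episode_rewards = [float(r) for r in range(20)] + [0.0]  # 20 finished episodes + current one
window = episode_rewards[-16:-1]                         # the 15 most recent finished episodes
print(len(window), sum(window) / len(window))            # 15 12.0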
Example #10
    def train(self, args, callback, env_kwargs=None, train_kwargs=None):
        env = self.makeEnv(args, env_kwargs=env_kwargs)

        # set hyperparameters
        args.__dict__.update(train_kwargs)

        self.cuda = th.cuda.is_available() and not args.no_cuda
        self.device = th.device("cuda" if self.cuda else "cpu")
        self.using_images = args.srl_model == "raw_pixels"

        assert not (args.log_states and self.using_images), "SRL logger can only be used with SRL models"

        if args.log_states:
            srl_logger = LogRLStates(args.log_dir)
        else:
            srl_logger = None

        self.continuous_actions = args.continuous_actions

        if args.continuous_actions:
            action_space = np.prod(env.action_space.shape)
        else:
            action_space = env.action_space.n

        if args.srl_model != "raw_pixels":
            input_dim = np.prod(env.observation_space.shape)
        else:
            n_channels = env.observation_space.shape[-1]
            # We use an additional CNN when using images
            # to extract features
            self.encoder_net = NatureCNN(n_channels).to(self.device)
            input_dim = 512  # output dim of the encoder net

        self.policy_net = MLPPolicy(input_dim, action_space).to(self.device)
        self.q_value_net = MLPQValueNetwork(input_dim, action_space, args.continuous_actions).to(self.device)
        self.value_net = MLPValueNetwork(input_dim).to(self.device)
        self.target_value_net = MLPValueNetwork(input_dim).to(self.device)

        # Make sure target net has the same weights as value_net
        hardUpdate(source=self.value_net, target=self.target_value_net)

        value_criterion = nn.MSELoss()
        q_value_criterion = nn.MSELoss()

        replay_buffer = ReplayBuffer(args.buffer_size)

        policy_optimizer = th.optim.Adam(self.policy_net.parameters(), lr=args.learning_rate)
        value_optimizer = th.optim.Adam(self.value_net.parameters(), lr=args.learning_rate)
        q_optimizer = th.optim.Adam(self.q_value_net.parameters(), lr=args.learning_rate)

        obs = env.reset()
        start_time = time.time()
        if srl_logger is not None:
            srl_logger.reset(obs, env.getOriginalObs())

        for step in range(args.num_timesteps):
            action = self.getAction(obs[None])
            new_obs, reward, done, info = env.step(action)
            # Log states
            if srl_logger is not None:
                srl_logger.step(new_obs, env.getOriginalObs(), action, reward, done)

            # Fill the replay buffer
            replay_buffer.add(obs, action, reward, new_obs, float(done))
            obs = new_obs

            # Callback for plotting and saving best model
            if callback is not None:
                callback(locals(), globals())

            if done:
                obs = env.reset()
                if srl_logger is not None:
                    srl_logger.reset(obs, env.getOriginalObs())
            # Update the different networks
            for _ in range(args.gradient_steps):
                # Check that there is enough data in the buffer replay
                if step < args.batch_size:
                    break

                # Sample a minibatch from the replay buffer
                batch_obs, actions, rewards, batch_next_obs, dones = map(lambda x: self.toFloatTensor(x),
                                                                         replay_buffer.sample(args.batch_size))

                if self.using_images:
                    # Extract features from the images
                    batch_obs = self.encoder_net(channelFirst(batch_obs))
                    batch_next_obs = self.encoder_net(channelFirst(batch_next_obs))

                rewards = rewards.unsqueeze(1)
                dones = dones.unsqueeze(1)

                value_pred = self.value_net(batch_obs)
                q_value = self.q_value_net(batch_obs, actions)
                # Sample actions and retrieve log proba
                # pre_tanh_value, mean_policy and log_std are only used for regularization
                new_actions, log_pi, pre_tanh_value, mean_policy, log_std = self.sampleAction(batch_obs)

                # Q-Value function loss
                target_value_pred = self.target_value_net(batch_next_obs)
                # TD error with reward scaling
                next_q_value = args.reward_scale * rewards + (1 - dones) * args.gamma * target_value_pred.detach()
                loss_q_value = 0.5 * q_value_criterion(q_value, next_q_value.detach())

                # Value Function loss
                q_value_new_actions = self.q_value_net(batch_obs, new_actions)
                next_value = q_value_new_actions - log_pi
                loss_value = 0.5 * value_criterion(value_pred, next_value.detach())

                # Policy Loss
                # why not log_pi.exp_() ?
                loss_policy = (log_pi * (log_pi - q_value_new_actions + value_pred).detach()).mean()
                # Regularization
                if self.continuous_actions:
                    loss_policy += args.w_reg * sum(map(l2Loss, [mean_policy, log_std]))

                q_optimizer.zero_grad()
                # Retain graph if we are using a CNN for extracting features
                loss_q_value.backward(retain_graph=self.using_images)
                q_optimizer.step()

                value_optimizer.zero_grad()
                loss_value.backward(retain_graph=self.using_images)
                value_optimizer.step()

                policy_optimizer.zero_grad()
                loss_policy.backward()
                policy_optimizer.step()

                # Softly update target value_pred network
                softUpdate(source=self.value_net, target=self.target_value_net, factor=args.soft_update_factor)

            if (step + 1) % args.print_freq == 0:
                print("{} steps - {:.2f} FPS".format(step, step / (time.time() - start_time)))
Example #11
0
    target_loss,
    target_train_opt,
    target_saver,
    target_write_op,
    target_q_value_index,
) = build_neural_network("target_network")

# Start the training process
sess = tf.Session()
sess.run(tf.global_variables_initializer())
writer = tf.summary.FileWriter("./logs", sess.graph_def)
restore_training_variables(
    "target_network", backup_training_variables("q_network", sess), sess
)
random_actions_taken = 0
er = ReplayBuffer(50000)
episode_rewards = []
finished_episodes_count = 0
target_network_update_counter = 0
total_timesteps = 0
for i_episode in range(NUM_EPISODES):
    raw_state = env.reset()
    done = False
    episode_reward = 0
    skipping_count = 0
    for t in range(MAX_NUM_STEPS):
        total_timesteps += 1
        if SKIP_FRAMES == 0 or skipping_count == 0:
            epsilon = get_explore_rate(total_timesteps)
            target_network_update_counter += 1
            # env.render()
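The fragment above samples a fresh epsilon from get_explore_rate(total_timesteps) on every non-skipped frame. The schedule itself is not part of the snippet; a common choice, shown here only as a sketch with illustrative default values, is a linear anneal from an initial to a final exploration rate:

def get_explore_rate(t, eps_start=1.0, eps_end=0.05, anneal_steps=100000):
    # Linearly decay epsilon from eps_start to eps_end over anneal_steps
    # timesteps, then hold it constant at eps_end.
    fraction = min(1.0, t / float(anneal_steps))
    return eps_start + fraction * (eps_end - eps_start)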
Example #12
0
class SAC(OffPolicyRLModel):
    """
    Soft Actor-Critic (SAC)
    Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor,
    This implementation borrows code from the original implementation (https://github.com/haarnoja/sac),
    from OpenAI Spinning Up (https://github.com/openai/spinningup) and from the Softlearning repo
    (https://github.com/rail-berkeley/softlearning/)
    Paper: https://arxiv.org/abs/1801.01290
    Introduction to SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html

    :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount factor
    :param learning_rate: (float or callable) learning rate for adam optimizer,
        the same learning rate will be used for all networks (Q-Values, Actor and Value function)
        it can be a function of the current progress (from 1 to 0)
    :param buffer_size: (int) size of the replay buffer
    :param batch_size: (int) Minibatch size for each gradient update
    :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1)
    :param ent_coef: (str or float) Entropy regularization coefficient (equivalent to the
        inverse of the reward scale in the original SAC paper), controlling the exploration/exploitation trade-off.
        Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value)
    :param train_freq: (int) Update the model every `train_freq` steps.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_update_interval: (int) update the target network every `target_update_interval` steps.
    :param gradient_steps: (int) How many gradient updates to perform after each step
    :param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto')
    :param action_noise: (ActionNoise) the action noise type (None by default), this can help
        for hard exploration problem. Cf DDPG for the different action noise type.
    :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy)
        This is not needed for SAC normally but can help with exploration when using HER + SAC.
        This hack was present in the original OpenAI Baselines repo (DDPG + HER)
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        Note: this has no effect on SAC logging for now
    """
    def __init__(self,
                 policy,
                 env,
                 args,
                 gamma=0.99,
                 learning_rate=3e-4,
                 buffer_size=50000,
                 learning_starts=200,
                 train_freq=1,
                 batch_size=64,
                 tau=0.005,
                 ent_coef='auto',
                 target_update_interval=1,
                 gradient_steps=1,
                 target_entropy='auto',
                 action_noise=None,
                 random_exploration=0.0,
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False):

        super(SAC, self).__init__(policy=policy,
                                  env=env,
                                  replay_buffer=None,
                                  verbose=verbose,
                                  policy_base=SACPolicy,
                                  requires_vec_env=False,
                                  policy_kwargs=policy_kwargs)

        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.tau = tau
        # In the original paper, same learning rate is used for all networks
        # self.policy_lr = learning_rate
        # self.qf_lr = learning_rate
        # self.vf_lr = learning_rate
        # Entropy coefficient / Entropy temperature
        # Inverse of the reward scale
        self.ent_coef = ent_coef
        self.target_update_interval = target_update_interval
        self.gradient_steps = gradient_steps
        self.gamma = gamma
        self.action_noise = action_noise
        self.random_exploration = random_exploration

        self.value_fn = None
        self.graph = None
        self.replay_buffer = None
        self.episode_reward = None
        self.sess = None
        self.tensorboard_log = tensorboard_log
        self.verbose = verbose
        self.params = None
        self.summary = None
        self.policy_tf = None
        self.target_entropy = target_entropy
        self.full_tensorboard_log = full_tensorboard_log

        self.obs_target = None
        self.target_policy = None
        self.actions_ph = None
        self.rewards_ph = None
        self.terminals_ph = None
        self.observations_ph = None
        self.action_target = None
        self.next_observations_ph = None
        self.value_target = None
        self.step_ops = None
        self.target_update_op = None
        self.infos_names = None
        self.entropy = None
        self.target_params = None
        self.learning_rate_ph = None
        self.processed_obs_ph = None
        self.processed_next_obs_ph = None
        self.log_ent_coef = None

        if _init_setup_model:
            self.setup_model()

        self.args = args
        self.reward_type = args.reward_type
        self.name = self.reward_type + '_'

        self.skew_explore = SkewExploreKDE(env, args)

        self.goal_update_frequency = 5000

    def _get_pretrain_placeholders(self):
        policy = self.policy_tf
        # Rescale
        deterministic_action = self.deterministic_action * np.abs(
            self.action_space.low)
        return policy.obs_ph, self.actions_ph, deterministic_action

    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                n_cpu = multiprocessing.cpu_count()
                if sys.platform == 'darwin':
                    n_cpu //= 2
                self.sess = tf_util.make_session(num_cpu=n_cpu,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space,
                                                     **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32,
                                                     shape=(None, ) +
                                                     self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        self.actions_ph,
                        create_qf=True,
                        create_vf=True)
                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        policy_out,
                        create_qf=True,
                        create_vf=False,
                        reuse=True)

                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == 'auto':
                        # automatically set target entropy if needed
                        self.target_entropy = -np.prod(
                            self.env.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)

                    # The entropy coefficient or entropy can be learned automatically
                    # see Automating Entropy Adjustment for Maximum Entropy RL section
                    # of https://arxiv.org/abs/1812.05905
                    if isinstance(self.ent_coef,
                                  str) and self.ent_coef.startswith('auto'):
                        # Default initial value of ent_coef when learned
                        init_value = 1.0
                        if '_' in self.ent_coef:
                            init_value = float(self.ent_coef.split('_')[1])
                            assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                        self.log_ent_coef = tf.get_variable(
                            'log_ent_coef',
                            dtype=tf.float32,
                            initializer=np.log(init_value).astype(np.float32))
                        self.ent_coef = tf.exp(self.log_ent_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.ent_coef = float(self.ent_coef)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(
                        self.processed_next_obs_ph,
                        create_qf=False,
                        create_vf=True)
                    self.value_target = value_target

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Targets for Q and V regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * self.value_target)

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2)
                    qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2)

                    # Compute the entropy temperature loss
                    # it is used when the entropy coefficient is learned
                    ent_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.ent_coef, float):
                        ent_coef_loss = -tf.reduce_mean(
                            self.log_ent_coef *
                            tf.stop_gradient(logp_pi + self.target_entropy))
                        entropy_optimizer = tf.train.AdamOptimizer(
                            learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi -
                                                    qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    v_backup = tf.stop_gradient(min_qf_pi -
                                                self.ent_coef * logp_pi)
                    value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss, var_list=get_vars('model/pi'))

                    # Value train op
                    value_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    values_params = get_vars('model/values_fn')

                    source_params = get_vars("model/values_fn/vf")
                    target_params = get_vars("target/values_fn/vf")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
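                    # With the default tau=0.005, each target variable moves
                    # only 0.5% of the way toward its source per update:
                    # target <- 0.995 * target + 0.005 * source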
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(
                            values_losses, var_list=values_params)

                        self.infos_names = [
                            'policy_loss', 'qf1_loss', 'qf2_loss',
                            'value_loss', 'entropy'
                        ]
                        # All ops to call during one training step
                        self.step_ops = [
                            policy_loss, qf1_loss, qf2_loss, value_loss, qf1,
                            qf2, value_fn, logp_pi, self.entropy,
                            policy_train_op, train_values_op
                        ]

                        # Add entropy coefficient optimization operation if needed
                        if ent_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                ent_coef_op = entropy_optimizer.minimize(
                                    ent_coef_loss, var_list=self.log_ent_coef)
                                self.infos_names += [
                                    'ent_coef_loss', 'ent_coef'
                                ]
                                self.step_ops += [
                                    ent_coef_op, ent_coef_loss, self.ent_coef
                                ]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('value_loss', value_loss)
                    tf.summary.scalar('entropy', self.entropy)
                    if ent_coef_loss is not None:
                        tf.summary.scalar('ent_coef_loss', ent_coef_loss)
                        tf.summary.scalar('ent_coef', self.ent_coef)

                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = find_trainable_variables("model")
                self.target_params = find_trainable_variables(
                    "target/values_fn/vf")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()

    def _train_step(self, step, writer, learning_rate):
        # Sample a batch from the replay buffer
        batch = self.replay_buffer.sample(self.batch_size)
        batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch

        feed_dict = {
            self.observations_ph: batch_obs,
            self.actions_ph: batch_actions,
            self.next_observations_ph: batch_next_obs,
            self.rewards_ph: batch_rewards.reshape(self.batch_size, -1),
            self.terminals_ph: batch_dones.reshape(self.batch_size, -1),
            self.learning_rate_ph: learning_rate
        }

        # out  = [policy_loss, qf1_loss, qf2_loss,
        #         value_loss, qf1, qf2, value_fn, logp_pi,
        #         self.entropy, policy_train_op, train_values_op]

        # Do one gradient step
        # and optionally compute log for tensorboard
        if writer is not None:
            out = self.sess.run([self.summary] + self.step_ops, feed_dict)
            summary = out.pop(0)
            writer.add_summary(summary, step)
        else:
            out = self.sess.run(self.step_ops, feed_dict)

        # Unpack to monitor losses and entropy
        policy_loss, qf1_loss, qf2_loss, value_loss, *values = out
        # qf1, qf2, value_fn, logp_pi, entropy, *_ = values
        entropy = values[4]

        if self.log_ent_coef is not None:
            ent_coef_loss, ent_coef = values[-2:]
            return policy_loss, qf1_loss, qf2_loss, value_loss, entropy, ent_coef_loss, ent_coef

        return policy_loss, qf1_loss, qf2_loss, value_loss, entropy

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="SAC",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            tra_obs = []
            ep_count = 0
            selected_goal = None
            tra_count = 0
            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                        or np.random.rand() < self.random_exploration):
                    # No need to rescale when sampling random action
                    rescaled_action = action = self.env.action_space.sample()
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)
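                    # Note: scaling by |low| assumes a symmetric action space
                    # (high == -low), which is the usual setting for the
                    # tanh-squashed SAC policy.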

                assert action.shape == self.env.action_space.shape
                new_obs, reward, done, info = self.env.step(rescaled_action)

                #################################################################
                # fit density model and update goal proposing model
                skew_explore_obs = obs.copy()
                if isinstance(self.env, HERGoalEnvWrapper):
                    skew_explore_obs_dict = self.env.convert_obs_to_dict(
                        skew_explore_obs)
                    skew_explore_obs = np.array(
                        [skew_explore_obs_dict['observation']])
                    tra_obs.append(skew_explore_obs[0])
                    if selected_goal is None:
                        selected_goal = np.array(
                            skew_explore_obs_dict['desired_goal'])
                else:
                    tra_obs.append(skew_explore_obs)

                self.skew_explore.update_history(skew_explore_obs, [done])
                if (step % self.goal_update_frequency == 0
                        and step != 0) or step == 2000:
                    logging.info('update buffer')
                    self.skew_explore.activate_buffer()
                #################################################################

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step +
                                grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    self.plot_tra(tra_count, tra_obs, selected_goal)
                    tra_obs = []
                    selected_goal = None

                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()

                    ep_count += 1
                    episode_rewards.append(0.0)
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                    tra_count += 1
                    self.save(self.args.save_path + '/model')

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self

    def test(self, steps):
        # self.env.set_goals([[-0.00497384,  0.11419979,  0.32127943, 0.003,  2.5, 0.02]])
        # goal = np.array([[-0.07863348, -0.00893711,  0.2746492,  -0.0135142,  -1.52]]) # door
        goal = np.array([[-1, 3.5]])
        self.env.set_goals(goal)
        trajectory = []
        obs_dict = self.env.reset()
        obs = np.concatenate(
            (obs_dict['observation'], obs_dict['achieved_goal'],
             obs_dict['desired_goal']))
        for i in range(steps):
            action = self.policy_tf.step(obs[None],
                                         deterministic=True).flatten()
            # rescaled_action = action = self.env.action_space.sample()
            rescaled_action = action * np.abs(self.action_space.low)
            # if i > 20 and i < 30:
            #     rescaled_action[-1] = -0.9
            # if i > 40 and i < 60:
            #     rescaled_action[-1] = 0.9
            new_obs, reward, done, info = self.env.step(rescaled_action)
            obs_dict = new_obs
            obs = np.concatenate(
                (obs_dict['observation'], obs_dict['achieved_goal'],
                 obs_dict['desired_goal']))
        #     state = self.env.sim.get_state().qpos[:8]
        #     if state[-1] > -0.02:
        #         state[-1] = 0
        #     else:
        #         state[-1] = 1
        #     # print('[', state[0], ',', state[1], ',', state[2], ',', state[3], ',', state[4], ',', state[5], ',', state[6], ',', state[7], ']')
        #     trajectory.append(state)

        # np.save('./trajectory', np.array(trajectory))

    def plot_tra(self, t, tra_obs, sampled_goal):
        ## plot
        g_states = sampled_goal
        t_states = np.array(tra_obs)

        fig, (ax1) = plt.subplots(1, 1, figsize=(5, 5))

        if self.args.env == 'maze':
            ax1.set_xlim([-12, 4])
            ax1.set_ylim([-6, 6])
        elif self.args.env == 'yumi' or self.args.env == 'yumi_box_pick':
            ax1.set_xlim([
                self.skew_explore.x_start - 0.05,
                self.skew_explore.x_end + 0.05
            ])
            ax1.set_ylim([
                self.skew_explore.y_start - 0.05,
                self.skew_explore.y_end + 0.05
            ])

        # scale = (tra_rewards - tra_rewards.min())/(tra_rewards.max() - tra_rewards.min())
        ax1.scatter(t_states[:, 0], t_states[:, 1], c='g', s=5)
        ax1.scatter(g_states[0], g_states[1], c='r')

        plt.savefig(self.args.save_path + '/' + self.name + str(int(t % 20)) +
                    '.svg')
        logging_info = 'save trajectory plot as: ' + self.args.save_path + '/' + self.name + str(
            int(t % 20)) + '.svg'
        logging.info(logging_info)

        plt.close()

    def action_probability(self,
                           observation,
                           state=None,
                           mask=None,
                           actions=None):
        if actions is None:
            warnings.warn(
                "Even though SAC has a Gaussian policy, it cannot return a distribution as it "
                "is squashed by a tanh before being scaled and output. Therefore 'action_probability' "
                "will only work with the 'actions' keyword argument being used. Returning None."
            )
            return None

        observation = np.array(observation)

        warnings.warn(
            "The probability of taking a given action is exactly zero for a continuous distribution. "
            "See http://blog.christianperone.com/2019/01/ for a good explanation"
        )

        return np.zeros((observation.shape[0], 1), dtype=np.float32)

    def predict(self, observation, state=None, mask=None, deterministic=True):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        actions = self.policy_tf.step(observation, deterministic=deterministic)
        actions = actions.reshape(
            (-1, ) +
            self.action_space.shape)  # reshape to the correct action shape
        actions = actions * np.abs(
            self.action_space.low)  # scale the output for the prediction

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def save(self, save_path):
        data = {
            "learning_rate": self.learning_rate,
            "buffer_size": self.buffer_size,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "batch_size": self.batch_size,
            "tau": self.tau,
            "ent_coef":
            self.ent_coef if isinstance(self.ent_coef, float) else 'auto',
            "target_entropy": self.target_entropy,
            # Should we also store the replay buffer?
            # this may lead to high memory usage
            # with all transitions inside
            # "replay_buffer": self.replay_buffer
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "action_noise": self.action_noise,
            "random_exploration": self.random_exploration,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs
        }

        params = self.sess.run(self.params)
        target_params = self.sess.run(self.target_params)

        self._save_to_file(save_path, data=data, params=params + target_params)

    @classmethod
    def load(cls, load_path, env=None, args=None, **kwargs):
        data, params = cls._load_from_file(load_path)

        if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data[
                'policy_kwargs']:
            raise ValueError(
                "The specified policy kwargs do not equal the stored policy kwargs. "
                "Stored kwargs: {}, specified kwargs: {}".format(
                    data['policy_kwargs'], kwargs['policy_kwargs']))

        model = cls(policy=data["policy"],
                    env=env,
                    _init_setup_model=False,
                    args=args)
        model.__dict__.update(data)
        model.__dict__.update(kwargs)
        # model.set_env(env)
        model.setup_model()

        restores = []
        for param, loaded_p in zip(model.params + model.target_params, params):
            restores.append(param.assign(loaded_p))
        model.sess.run(restores)

        return model
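The loss section in setup_model above regresses the two Q-networks toward a one-step bootstrapped target and the value network toward the entropy-regularized minimum of the two Q-estimates, with tf.stop_gradient keeping both targets fixed during the gradient step. The same targets written out in plain NumPy, as a sketch of the math rather than of the graph code:

import numpy as np

def sac_targets(rewards, dones, next_values, q1_pi, q2_pi, logp_pi, gamma, ent_coef):
    # Q-function target: r + (1 - done) * gamma * V_target(s')
    q_backup = rewards + (1.0 - dones) * gamma * next_values
    # Value-function target: min(Q1(s, a~pi), Q2(s, a~pi)) - ent_coef * log pi(a|s)
    v_backup = np.minimum(q1_pi, q2_pi) - ent_coef * logp_pi
    return q_backup, v_backup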
Example #13
0
class CLAC(OffPolicyRLModel):
    """
    Capacity-Limited Actor-Critic (CLAC)
    Off-Policy Capacity Limited Deep Reinforcement Learning with a Stochastic Actor,
    This implementation borrows code from the Soft Actor-Critic implementation (https://github.com/haarnoja/sac),
    from OpenAI Spinning Up (https://github.com/openai/spinningup), from the Softlearning repo
    (https://github.com/rail-berkeley/softlearning/) and from the Stable-Baselines implementation
    (https://github.com/hill-a/stable-baselines/tree/master/stable_baselines/sac)

    Paper: In Preparation for ICML 2020

    :param policy: (CLACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount factor
    :param learning_rate: (float or callable) learning rate for adam optimizer,
        the same learning rate will be used for all networks (Q-Values, Actor and Value function)
        it can be a function of the current progress (from 1 to 0)
    :param buffer_size: (int) size of the replay buffer
    :param batch_size: (int) Minibatch size for each gradient update
    :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1)
    :param mut_inf_coef: (str or float) Mutual Information regularization coefficient, controlling the
        performance/generalization trade-off. Set it to 'auto' to learn it automatically (still in development)
        (and 'auto_0.1' for using 0.1 as initial value)
    :param train_freq: (int) Update the model every `train_freq` steps.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_update_interval: (int) update the target network every `target_update_interval` steps.
    :param gradient_steps: (int) How many gradient updates to perform after each step
    :param target_entropy: (str or float) target entropy / mutual information used when learning mut_inf_coef (mut_inf_coef = 'auto')
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        Note: this has no effect on CLAC logging for now
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations
        If None, the number of cpu of the current machine will be used.
    """
    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 learning_rate=3e-4,
                 buffer_size=1000000,
                 learning_rate_phi=2e-3,
                 learning_starts=100,
                 train_freq=1,
                 batch_size=256,
                 tau=0.005,
                 mut_inf_coef='auto',
                 target_update_interval=1,
                 coef_schedule=None,
                 gradient_steps=1,
                 target_entropy='auto',
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False,
                 seed=None,
                 n_cpu_tf_sess=None):

        super(CLAC, self).__init__(policy=policy,
                                   env=env,
                                   replay_buffer=None,
                                   verbose=verbose,
                                   policy_base=CLACPolicy,
                                   requires_vec_env=False,
                                   policy_kwargs=policy_kwargs,
                                   seed=seed,
                                   n_cpu_tf_sess=n_cpu_tf_sess)

        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.tau = tau
        # Same learning rate is used for all networks
        # self.policy_lr = learning_rate
        # self.qf_lr = learning_rate
        # self.vf_lr = learning_rate
        self.mut_inf_coef = mut_inf_coef
        self.target_update_interval = target_update_interval
        self.gradient_steps = gradient_steps
        self.gamma = gamma

        self.coef_schedule = coef_schedule
        self.init_mut_inf_coef = self.mut_inf_coef

        # Options for MI approximation and related parameters
        self.learning_rate_phi = learning_rate_phi  # Taken from MIRL paper, not altered
        self.multivariate_mean = None
        self.multivariate_cov = None

        self.value_fn = None
        self.graph = None
        self.replay_buffer = None
        self.episode_reward = None
        self.sess = None
        self.tensorboard_log = tensorboard_log
        self.verbose = verbose
        self.params = None
        self.summary = None
        self.policy_tf = None
        self.target_entropy = target_entropy
        self.full_tensorboard_log = full_tensorboard_log

        self.obs_target = None
        self.target_policy = None
        self.actions_ph = None
        self.rewards_ph = None
        self.terminals_ph = None
        self.observations_ph = None
        self.action_target = None
        self.next_observations_ph = None
        self.value_target = None
        self.step_ops = None
        self.target_update_op = None
        self.infos_names = None
        self.entropy = None
        self.target_params = None
        self.learning_rate_ph = None
        self.processed_obs_ph = None
        self.processed_next_obs_ph = None
        self.log_mut_inf_coef = None
        self.logp_phi = None
        self.logp_pi = None
        self.tf_logged_reward = float("-inf")

        self.auto_mut_inf_coef = False
        if not isinstance(self.mut_inf_coef, float):
            self.auto_mut_inf_coef = True

        self.action_history = None
        self.action_entropy = 1

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        policy = self.policy_tf
        # Rescale
        deterministic_action = self.deterministic_action * np.abs(
            self.action_space.low)
        return policy.obs_ph, self.actions_ph, deterministic_action

    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space,
                                                     **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')

                    # If the action space is discrete we use one-hot sized
                    # action placeholders of shape (None, n_actions)
                    if (isinstance(self.env.action_space, Discrete)):
                        self.action_history = np.zeros(
                            (self.env.action_space.n))
                        self.actions_ph = tf.placeholder(
                            tf.float32,
                            shape=(None, self.env.action_space.n),
                            name='actions')
                    else:
                        self.actions_ph = tf.placeholder(
                            tf.float32,
                            shape=(None, ) + self.action_space.shape,
                            name='actions')

                    self.logp_phi = tf.placeholder(tf.float32,
                                                   shape=(None, ),
                                                   name='logp_phi')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                    self.mut_inf_coef_tensor = tf.placeholder(
                        tf.float32, shape=(), name='mut_inf_coef')

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    _, policy_out, logp_pi = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # self.logp_pi = logp_pi
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        self.actions_ph,
                        create_qf=True,
                        create_vf=True)

                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        policy_out,
                        create_qf=True,
                        create_vf=False,
                        reuse=True)

                    #phi_proba, log_phi_proba = self.policy_tf.make_marginal()
                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == 'auto':
                        # automatically set target entropy if needed
                        self.target_entropy = np.prod(
                            self.env.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)

                    # Automatic mutual information coefficient setting is not fully tested
                    if isinstance(
                            self.mut_inf_coef,
                            str) and self.mut_inf_coef.startswith('auto'):
                        # Default initial value of mut_inf_coef when learned
                        init_value = 1.0
                        if '_' in self.mut_inf_coef:
                            init_value = float(self.mut_inf_coef.split('_')[1])
                            assert init_value > 0., "The initial value of mut_inf_coef must be greater than 0"

                        self.log_mut_inf_coef = tf.get_variable(
                            'log_mut_inf_coef',
                            dtype=tf.float32,
                            initializer=np.log(init_value).astype(np.float32))
                        self.mut_inf_coef = tf.exp(self.log_mut_inf_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.mut_inf_coef = float(self.mut_inf_coef)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(
                        self.processed_next_obs_ph,
                        create_qf=False,
                        create_vf=True)
                    self.value_target = value_target

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Targets for Q and V regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * self.value_target)

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2)
                    qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2)

                    # Compute the mutual information coefficient loss
                    # it is used when the coefficient is learned automatically
                    mut_inf_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.mut_inf_coef, float):
                        mut_inf_coef_loss = -tf.reduce_mean(
                            # self.log_mut_inf_coef * tf.stop_gradient(logp_pi + self.target_entropy))
                            # self.log_mut_inf_coef * tf.stop_gradient((-1 * (self.logp_phi - logp_pi)) - self.target_entropy))
                            self.log_mut_inf_coef *
                            tf.stop_gradient(self.logp_phi - logp_pi -
                                             self.target_entropy))
                        entropy_optimizer = tf.train.AdamOptimizer(
                            learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    #policy_kl_loss = tf.reduce_mean(self.mut_inf_coef * logp_pi - qf1_pi)
                    policy_kl_loss = tf.reduce_mean(
                        (-1 * self.mut_inf_coef_tensor *
                         (self.logp_phi - logp_pi)) - qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    # v_backup = tf.stop_gradient(min_qf_pi - self.mut_inf_coef * logp_pi)
                    # previous tests
                    # v_backup = tf.stop_gradient(min_qf_pi - self.mut_inf_coef * (self.logp_phi - logp_pi))
                    # Minimizing mutual information
                    v_backup = tf.stop_gradient(min_qf_pi +
                                                (self.mut_inf_coef_tensor *
                                                 (self.logp_phi - logp_pi)))
                    value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2)

                    values_losses = qf1_loss + qf2_loss + value_loss
                    discrete_loss = policy_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)

                    if (isinstance(self.env.action_space, Discrete)):
                        policy_train_op = policy_optimizer.minimize(
                            discrete_loss, var_list=get_vars('model/pi'))
                    else:
                        policy_train_op = policy_optimizer.minimize(
                            policy_loss, var_list=get_vars('model/pi'))

                    # Value train op
                    value_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    values_params = get_vars('model/values_fn')
                    source_params = get_vars("model/values_fn/vf")
                    target_params = get_vars("target/values_fn/vf")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(
                            values_losses, var_list=values_params)

                        self.infos_names = [
                            'policy_loss', 'qf1_loss', 'qf2_loss',
                            'value_loss', 'entropy', 'mut_inf_coef_loss',
                            'log_policy', 'log_marginal'
                        ]
                        # All ops to call during one training step
                        self.step_ops = [
                            policy_loss, qf1_loss, qf2_loss, value_loss, qf1,
                            qf2, value_fn, logp_pi, self.entropy,
                            policy_train_op, train_values_op
                        ]  #, phi_train_op]

                        # Add mutual information coefficient optimization operation if needed
                        if mut_inf_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                mut_inf_coef_op = entropy_optimizer.minimize(
                                    mut_inf_coef_loss,
                                    var_list=self.log_mut_inf_coef)
                                self.infos_names += ['mut_inf_coef']
                                self.step_ops += [
                                    mut_inf_coef_op, mut_inf_coef_loss,
                                    self.mut_inf_coef
                                ]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('value_loss', value_loss)
                    tf.summary.scalar('entropy', self.entropy)
                    if mut_inf_coef_loss is not None:
                        tf.summary.scalar('mut_inf_coef_loss',
                                          mut_inf_coef_loss)
                    tf.summary.scalar('mut_inf_coef', self.mut_inf_coef)
                    tf.summary.scalar('log_policy', tf.reduce_mean(logp_pi))
                    tf.summary.scalar('log_marginal',
                                      tf.reduce_mean(self.logp_phi))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    tf.summary.scalar('episode_reward', self.tf_logged_reward)

                # Retrieve parameters that must be saved
                self.params = get_vars("model")
                self.target_params = get_vars("target/values_fn/vf")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()

    def _train_step(self, step, writer, learning_rate):
        # Sample a batch from the replay buffer
        batch = self.replay_buffer.sample(self.batch_size)
        batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch

        if (isinstance(self.env.action_space, Discrete)):
            batch_actions = batch_actions.reshape(self.batch_size,
                                                  self.env.action_space.n)
        else:
            batch_actions = batch_actions.reshape(
                self.batch_size, self.env.action_space.shape[0])

        # Determine the logp_phi based on the current batch:
        if (isinstance(self.env.action_space, Discrete)):
            raise NotImplementedError(
                "logp_phi is not implemented for discrete action spaces")
            # The estimate below is not correct for discrete actions
            action_count = [
                np.count_nonzero(batch_actions == action)
                for action in batch_actions
            ]
            action_count = action_count / len(batch_actions)
            # assert all values are percentages in: action_count
            logp_phi = np.log(action_count)
        else:
            EPS = 1e-6  # Avoid NaN (prevents division by zero or log of zero)

            #mu =  np.mean(batch_actions,axis=0)
            #cov = np.cov(batch_actions, rowvar=False) + (np.identity(self.env.action_space.shape[0]) * EPS)

            mu = self.multivariate_mean
            cov = self.multivariate_cov

            if (len(mu) == 1):
                mu = mu[0]

            try:
                multivar = multivariate_normal(mu, cov)
                logp_phi = multivar.logpdf(batch_actions)  # * -1
                logp_phi = logp_phi.reshape(self.batch_size, )
            except Exception:
                # Mutual information coefficient is too small to contribute anything
                logp_phi = np.zeros(self.batch_size, )

        mut_inf_coef = self.mut_inf_coef

        # If coinrunner environment
        #batch_obs = np.squeeze(batch_obs, axis=1)
        #batch_next_obs  = np.squeeze(batch_next_obs, axis=1)
        feed_dict = {
            self.observations_ph: batch_obs,
            self.actions_ph: batch_actions,
            self.next_observations_ph: batch_next_obs,
            self.rewards_ph: batch_rewards.reshape(self.batch_size, -1),
            self.terminals_ph: batch_dones.reshape(self.batch_size, -1),
            self.learning_rate_ph: learning_rate,
            self.logp_phi: logp_phi,
            self.mut_inf_coef_tensor: mut_inf_coef
        }

        # out  = [policy_loss, qf1_loss, qf2_loss,
        #         value_loss, qf1, qf2, value_fn, logp_pi,
        #         self.entropy, policy_train_op, train_values_op]

        # Do one gradient step
        # and optionally compute log for tensorboard
        if writer is not None:
            out = self.sess.run([self.summary] + self.step_ops, feed_dict)
            summary = out.pop(0)
            writer.add_summary(summary, step)
        else:
            out = self.sess.run(self.step_ops, feed_dict)

        # Unpack to monitor losses and entropy
        policy_loss, qf1_loss, qf2_loss, value_loss, *values = out

        #qf1, qf2, value_fn, logp_pi, entropy, *_ = values
        entropy = values[4]

        if self.log_mut_inf_coef is not None:
            mut_inf_coef_loss, mut_inf_coef = values[-2:]
            return policy_loss, qf1_loss, qf2_loss, value_loss, entropy, mut_inf_coef_loss, mut_inf_coef

        return policy_loss, qf1_loss, qf2_loss, value_loss, entropy

    def sample(self, num_samples=1000):
        samples = [[] for _ in range(self.observation_space.n)]  # one list per discrete state
        for state in range(self.observation_space.n):
            mean = []

            for _ in range(num_samples):
                action = (self.predict(state)[0][0] -
                          self.action_space.low) / (self.action_space.high -
                                                    self.action_space.low)[0]
                mean.append(action[0])

            samples[state].append(np.mean(mean))

        return samples

    def run(self,
            total_timesteps,
            callback=None,
            seed=None,
            log_interval=4,
            tb_log_name="CLAC",
            reset_num_timesteps=True,
            randomization=0):

        start_time = time.time()
        # run() does not create a TensorboardWriter; disable tensorboard summaries here
        writer = None
        episode_rewards = [0.0]
        learning_results = pd.DataFrame()
        obs = self.env.reset()
        self.episode_reward = np.zeros((1, ))
        ep_info_buf = deque(maxlen=100)
        n_updates = 0
        infos_values = []

        reward_data = pd.DataFrame()

        for step in range(total_timesteps):
            if (isinstance(self.env.action_space, Discrete)):
                actions = list(range(self.env.action_space.n))
                action = self.policy_tf.step(obs[None],
                                             deterministic=False).flatten()
                rescaled_action = np.random.choice(actions, 1, p=action)[0]
            else:
                action = self.policy_tf.step(obs[None],
                                             deterministic=False).flatten()
                # Rescale from [-1, 1] to the correct bounds
                rescaled_action = action * np.abs(self.action_space.low)

            new_obs, reward, done, info = self.env.step(rescaled_action)

            act_mu, act_std = self.policy_tf.proba_step(obs[None])
            obs = new_obs

            # Retrieve reward and episode length if using Monitor wrapper
            # info = info[0]
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_info_buf.extend([maybe_ep_info])

            if writer is not None:
                # Write reward per episode to tensorboard
                ep_reward = np.array([reward]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(
                    self.episode_reward, ep_reward, ep_done, writer,
                    self.num_timesteps)

            episode_rewards[-1] += reward
            if done:
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()

                    if (randomization == 1):
                        try:
                            for env in self.env.unwrapped.envs:
                                env.randomize()
                        except:
                            print(
                                "Trying to randomize an environment that is not set up for randomization, check environment file"
                            )
                            assert (False)

                    if (randomization == 2):
                        try:
                            for env in self.env.unwrapped.envs:
                                env.randomize_extreme()
                        except:
                            print(
                                "Trying to extremely randomize an environment that is not set up for randomization, check environment file"
                            )
                            assert (False)

                Model_String = "CLAC"
                if not self.auto_mut_inf_coef:
                    Model_String = "CLAC " + str(self.init_mut_inf_coef)

                env_name = self.env.unwrapped.envs[0].spec.id

                mut_inf_coef = self.init_mut_inf_coef
                if (isinstance(self.mut_inf_coef, tf.Tensor)
                        or np.isnan(mut_inf_coef)):
                    mut_inf_coef = "auto"
                Model_String = "CLAC" + str(mut_inf_coef)
                d = {
                    'Episode Reward': episode_rewards[-1],
                    'Coefficient': mut_inf_coef,
                    'Timestep': self.num_timesteps,
                    'Episode Number': len(episode_rewards) - 1,
                    'Env': env_name,
                    'Randomization': randomization,
                    'Model': "CLAC"
                }
                learning_results = learning_results.append(d,
                                                           ignore_index=True)

                self.tf_logged_reward = episode_rewards[-1]

                episode_rewards.append(0.0)

        return (self, learning_results)

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="CLAC",
              reset_num_timesteps=True,
              randomization=0):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            learning_results = pd.DataFrame()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            reward_data = pd.DataFrame()

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if self.num_timesteps < self.learning_starts:
                    if (isinstance(self.env.action_space, Discrete)):
                        action = []
                        for _ in range(self.env.action_space.n):
                            action.append(1 / self.env.action_space.n)
                        rescaled_action = self.env.action_space.sample()
                    else:
                        action = self.env.action_space.sample()
                        # No need to rescale when sampling random action
                        rescaled_action = action
                else:
                    if (isinstance(self.env.action_space, Discrete)):
                        actions = list(range(self.env.action_space.n))
                        action = self.policy_tf.step(
                            obs[None], deterministic=False).flatten()
                        rescaled_action = np.random.choice(actions,
                                                           1,
                                                           p=action)[0]
                    else:
                        action = self.policy_tf.step(
                            obs[None], deterministic=False).flatten()
                        # Rescale from [-1, 1] to the correct bounds
                        rescaled_action = action * np.abs(
                            self.action_space.low)

                if (not isinstance(self.env.action_space, Discrete)):
                    assert action.shape == self.env.action_space.shape

                # If coinrunner environment
                # rescaled_action = np.array(rescaled_action, ndmin=1)

                new_obs, reward, done, info = self.env.step(rescaled_action)

                act_mu, act_std = self.policy_tf.proba_step(obs[None])

                if (len(act_std) == 1):
                    act_std = act_std[0]

                #print("ACT MU FROM PROBA STEP", act_mu)
                #print("ACT STD FROM PROBA STEP", act_std)
                if self.num_timesteps > self.learning_starts:
                    # Only update the marginal approximation once the warmup (learning_starts) phase is complete
                    if (self.multivariate_mean is None):
                        self.multivariate_mean = act_mu
                    else:
                        previous_mean = self.multivariate_mean
                        self.multivariate_mean = (
                            (1 - self.learning_rate_phi) *
                            self.multivariate_mean) + (self.learning_rate_phi *
                                                       act_mu)
                    if (self.multivariate_cov is None):
                        self.multivariate_cov = np.diag(act_std)
                    else:
                        cov = (self.learning_rate_phi * np.diag(act_std) +
                               (1 - self.learning_rate_phi) *
                               self.multivariate_cov)
                        mom_1 = (self.learning_rate_phi *
                                 np.square(np.diag(act_mu))) + (
                                     (1 - self.learning_rate_phi) *
                                     np.square(np.diag(previous_mean)))
                        mom_2 = np.square((self.learning_rate_phi *
                                           np.diag(act_mu)) +
                                          (1 - self.learning_rate_phi) *
                                          np.diag(previous_mean))
                        self.multivariate_cov = cov + mom_1 - mom_2

                    # Update Beta parameter if coef_schedule is set
                    if (self.coef_schedule is not None
                            and self.mut_inf_coef > 1e-12):
                        # (1 - a) * B + a * (1 / L()): loss-based update schedule, for later

                        # Currently using a multiplicative (exponential) decay schedule:
                        self.mut_inf_coef *= (1 - self.coef_schedule)
                    """if(self.num_timesteps % 1000 == 0):
                        print("updated mut_inf_coef: ", self.mut_inf_coef, " at time step ", self.num_timesteps)"""

                # Store transition in the replay buffer.
                #print("adding action to replay buffer: ", action)
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                # info = info[0]
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step +
                                grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        for mb_info_val in mb_infos_vals:
                            for mb_info in mb_info_val:
                                if mb_info is not None:
                                    infos_values.append(np.mean(mb_info))
                        #infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()

                        if (randomization == 1):
                            try:
                                for env in self.env.unwrapped.envs:
                                    env.randomize()
                            except:
                                print(
                                    "Trying to randomize an environment that is not set up for randomization, check environment file"
                                )
                                assert (False)

                        if (randomization == 2):
                            try:
                                for env in self.env.unwrapped.envs:
                                    env.randomize_extreme()
                            except:
                                print(
                                    "Trying to extremely randomize an environment that is not set up for randomization, check environment file"
                                )
                                assert (False)

                    Model_String = "CLAC"
                    if not self.auto_mut_inf_coef:
                        Model_String = "CLAC " + str(self.mut_inf_coef)

                    env_name = self.env.unwrapped.envs[0].spec.id

                    mut_inf_coef = self.init_mut_inf_coef
                    if (isinstance(self.mut_inf_coef, tf.Tensor)
                            or np.isnan(mut_inf_coef)):
                        mut_inf_coef = "auto"
                    Model_String = "CLAC" + str(mut_inf_coef)
                    d = {
                        'Episode Reward': episode_rewards[-1],
                        'Coefficient': mut_inf_coef,
                        'Timestep': self.num_timesteps,
                        'Episode Number': len(episode_rewards) - 1,
                        'Env': env_name,
                        'Randomization': randomization,
                        'Model': "CLAC"
                    }
                    learning_results = learning_results.append(
                        d, ignore_index=True)

                    self.tf_logged_reward = episode_rewards[-1]

                    episode_rewards.append(0.0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return (self, learning_results)

    def action_probability(self,
                           observation,
                           state=None,
                           mask=None,
                           actions=None):
        if actions is None:
            warnings.warn(
                "Even thought CLAC has a Gaussian policy, it cannot return a distribution as it "
                "is squashed by an tanh before being scaled and ouputed. Therefore 'action_probability' "
                "will only work with the 'actions' keyword argument being used. Returning None."
            )
            return None

        observation = np.array(observation)

        warnings.warn(
            "The probabilty of taken a given action is exactly zero for a continuous distribution."
        )

        return np.zeros((observation.shape[0], 1), dtype=np.float32)

    def predict(self, observation, state=None, mask=None, deterministic=False):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)
        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)

        if (isinstance(self.env.action_space, Discrete)):
            # could replace this with map apply
            actions = []
            action_distributions = self.policy_tf.step(observation,
                                                       deterministic=False)
            available_actions = list(range(self.env.action_space.n))

            for action_distribution in action_distributions:
                action = np.random.choice(available_actions,
                                          1,
                                          p=action_distribution)[0]
                actions.append(action)
        else:
            actions = self.policy_tf.step(observation, deterministic=deterministic)
            actions = actions.reshape(
                (-1, ) +
                self.action_space.shape)  # reshape to the correct action shape
            actions = actions * np.abs(
                self.action_space.low)  # scale the output for the prediction

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def get_parameter_list(self):
        return (self.params + self.target_params)

    def save(self, save_path, cloudpickle=False):
        data = {
            "learning_rate":
            self.learning_rate,
            "buffer_size":
            self.buffer_size,
            "learning_starts":
            self.learning_starts,
            "multivariate_mean":
            self.multivariate_mean,
            "multivariate_cov":
            self.multivariate_cov,
            "train_freq":
            self.train_freq,
            "batch_size":
            self.batch_size,
            "tau":
            self.tau,
            "mut_inf_coef":
            self.mut_inf_coef
            if isinstance(self.mut_inf_coef, float) else 'auto',
            "target_entropy":
            self.target_entropy,
            "num_timesteps":
            self.num_timesteps,
            #"replay_buffer": self.replay_buffer,
            "gamma":
            self.gamma,
            "verbose":
            self.verbose,
            "observation_space":
            self.observation_space,
            "action_space":
            self.action_space,
            "policy":
            self.policy,
            "n_envs":
            self.n_envs,
            "n_cpu_tf_sess":
            self.n_cpu_tf_sess,
            "seed":
            self.seed,
            "_vectorize_action":
            self._vectorize_action,
            "policy_kwargs":
            self.policy_kwargs,
            "coef_schedule":
            self.coef_schedule,
            "init_mut_inf_coef":
            self.init_mut_inf_coef
        }

        params_to_save = self.get_parameters()

        self._save_to_file(save_path,
                           data=data,
                           params=params_to_save,
                           cloudpickle=cloudpickle)
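
The logp_phi term fed into _train_step above comes from a running Gaussian approximation of the policy's marginal action distribution, updated in learn() and evaluated with scipy's multivariate_normal. The following is a minimal standalone sketch of that bookkeeping; the step size lr_phi and the toy inputs are illustrative assumptions, not values taken from the class above.

import numpy as np
from scipy.stats import multivariate_normal

lr_phi = 0.01      # assumed step size for the running estimate
action_dim = 2
mean = np.zeros(action_dim)
cov = np.eye(action_dim)

for _ in range(100):
    # Stand-ins for the per-step policy mean/std (proba_step in the class above)
    act_mu = np.random.uniform(-1.0, 1.0, size=action_dim)
    act_std = np.random.uniform(0.1, 0.5, size=action_dim)

    prev_mean = mean
    mean = (1 - lr_phi) * mean + lr_phi * act_mu
    # Exponentially weighted covariance update, mirroring the learn() loop above
    cov = (1 - lr_phi) * cov + lr_phi * np.diag(act_std)
    cov += lr_phi * np.square(np.diag(act_mu)) + (1 - lr_phi) * np.square(np.diag(prev_mean))
    cov -= np.square(lr_phi * np.diag(act_mu) + (1 - lr_phi) * np.diag(prev_mean))

# Evaluate log phi(a) on a batch of actions, falling back to zeros on failure
batch_actions = np.random.uniform(-1.0, 1.0, size=(64, action_dim))
try:
    logp_phi = multivariate_normal(mean, cov).logpdf(batch_actions)
except Exception:
    logp_phi = np.zeros(len(batch_actions))
print(logp_phi.shape)  # (64,)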
Example #14
0
                   ])  # The algorithms require a vectorized environment to run

model = DQN(MlpPolicy,
            env,
            verbose=2,
            learning_starts=LEARNING_START,
            gamma=.2,
            exploration_fraction=0.35,
            exploration_final_eps=0.2)
model.learn(total_timesteps=TIME_STEPS, learning_curve=False, test_t=TEST_T)

with open(f"../data/{store_id}-buffer-d-test.p", 'wb') as f:
    pickle.dump(model.replay_buffer, f)

results = {'rewards': [0.0]}
buffer = ReplayBuffer(size=50000)

for j in range(100):

    obs = env.reset()

    for i in range(TEST_T):
        feasible_actions = AllocationEnv.get_feasible_actions(
            obs["board_config"])
        action_mask = AllocationEnv.get_action_mask(feasible_actions,
                                                    n_actions)
        action, _states = model.predict(obs, mask=action_mask)

        action = AllocationEnv.check_action(obs['board_config'], action)
        new_obs, r, dones, info = env.step([action])
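
The mask passed to model.predict above restricts sampling to feasible slots. AllocationEnv's helpers are not shown here, so as a rough, assumption-based illustration of what such a mask typically does, the sketch below zeroes out infeasible action probabilities and renormalizes before sampling.

import numpy as np

def masked_sample(action_probs, action_mask, rng=None):
    """Sample an action index, restricted to entries where action_mask == 1.

    Assumes at least one action is marked feasible in action_mask.
    """
    rng = rng or np.random.default_rng()
    probs = np.asarray(action_probs, dtype=np.float64) * np.asarray(action_mask, dtype=np.float64)
    if probs.sum() == 0.0:
        # No feasible action carries probability mass: fall back to uniform over the feasible set
        probs = np.asarray(action_mask, dtype=np.float64)
    probs /= probs.sum()
    return int(rng.choice(len(probs), p=probs))

probs = np.array([0.5, 0.2, 0.2, 0.1])
mask = np.array([0, 1, 1, 0])        # only actions 1 and 2 are feasible
print(masked_sample(probs, mask))    # prints 1 or 2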
Example #15
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None):
        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_successes = []
            Globals.env = self.env
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))
            timesteps_last_log = 0
            avr_ep_len_per_log = None
            sleep = 0.045

            for _ in range(total_timesteps):

                if Globals.loading:
                    Globals.loading = False

                while Globals.pause_game:
                    pass

                if Globals.exit_learning:
                    break

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, info = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)

                if can_sample:
                    sleep = 0.035

                time.sleep(sleep)

                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                if len(episode_rewards) % log_interval == 0:
                    avr_ep_len_per_log = (self.num_timesteps -
                                          timesteps_last_log) / log_interval
                    timesteps_last_log = self.num_timesteps

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.record_tabular("avr length of last logged ep",
                                          avr_ep_len_per_log)
                    logger.dump_tabular()

                self.num_timesteps += 1
                Globals.steps -= 1

        return self
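
Both the exploration epsilon and the prioritized-replay beta above are annealed with LinearSchedule. Below is a small self-contained re-implementation of the same arithmetic for illustration; it is not stable-baselines' class, and the example step counts are assumptions.

class LinearAnneal:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps, then hold final_p."""

    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule elapsed, clipped to [0, 1]
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. epsilon annealed from 1.0 to 0.02 over the first 10% of 100,000 steps
exploration = LinearAnneal(schedule_timesteps=int(0.1 * 100_000), initial_p=1.0, final_p=0.02)
print(exploration.value(0))        # 1.0
print(exploration.value(5_000))    # roughly 0.51, halfway through the schedule
print(exploration.value(50_000))   # 0.02, schedule finished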
Example #16
0
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                n_cpu = multiprocessing.cpu_count()
                if sys.platform == 'darwin':
                    n_cpu //= 2
                self.sess = tf_util.make_session(num_cpu=n_cpu,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space)
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32,
                                                     shape=(None, ) +
                                                     self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # mu corresponds to deterministic actions
                    # pi corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of action pi
                    _, policy_out, logp_pi = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        self.actions_ph,
                        create_qf=True,
                        create_vf=True)
                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        policy_out,
                        create_qf=True,
                        create_vf=False,
                        reuse=True)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(
                        self.processed_next_obs_ph,
                        create_qf=False,
                        create_vf=True)
                    self.value_target = value_target

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Targets for Q and V regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * self.value_target)

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2)
                    qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi -
                                                    qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    v_backup = tf.stop_gradient(min_qf_pi -
                                                self.ent_coef * logp_pi)
                    value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss, var_list=get_vars('model/pi'))

                    # Value train op
                    # (control dep of policy_train_op because sess.run otherwise
                    # evaluates in nondeterministic order)
                    value_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    values_params = get_vars('model/values_fn')

                    source_params = get_vars("model/values_fn/vf")
                    target_params = get_vars("target/values_fn/vf")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(
                            values_losses, var_list=values_params)

                        self.infos_names = [
                            'policy_loss', 'qf1_loss', 'qf2_loss',
                            'value_loss', 'entropy'
                        ]
                        # All ops to call during one training step
                        self.step_ops = [
                            policy_loss, qf1_loss, qf2_loss, value_loss, qf1,
                            qf2, value_fn, logp_pi, self.entropy,
                            policy_train_op, train_values_op
                        ]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('value_loss', value_loss)
                    tf.summary.scalar('entropy', self.entropy)
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = find_trainable_variables("model")
                self.target_params = find_trainable_variables(
                    "target/values_fn/vf")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
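
target_update_op above implements a Polyak (soft) target update: target <- (1 - tau) * target + tau * source. The plain-NumPy sketch below shows the same rule in isolation; the array shapes and tau values are assumptions chosen only to make the effect visible.

import numpy as np

def polyak_update(target_params, source_params, tau=0.005):
    """Move every target array a small step (tau) toward its source counterpart."""
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_params, source_params)]

source_params = [np.ones((2, 2)), np.full(3, 4.0)]
target_params = [np.zeros((2, 2)), np.zeros(3)]

for _ in range(10):
    target_params = polyak_update(target_params, source_params, tau=0.5)

# After repeated soft updates the target parameters converge toward the source
print(target_params[0])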
Example #17
0
class SAC(OffPolicyRLModel):
    """
    Soft Actor-Critic (SAC)
    Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor.
    This implementation borrows code from the original implementation (https://github.com/haarnoja/sac)
    and from OpenAI Spinning Up (https://github.com/openai/spinningup)
    Paper: https://arxiv.org/abs/1801.01290
    Introduction to SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html

    :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount factor
    :param learning_rate: (float or callable) learning rate for the Adam optimizer;
        the same learning rate is used for all networks (Q-Values, Actor and Value function).
        It can be a function of the current progress (from 1 to 0)
    :param buffer_size: (int) size of the replay buffer
    :param batch_size: (int) Minibatch size for each gradient update
    :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1)
    :param ent_coef: (float) Entropy regularization coefficient (equivalent to the
        inverse of the reward scale in the original SAC paper). Controls the exploration/exploitation trade-off.
    :param train_freq: (int) Update the model every `train_freq` steps.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_update_interval: (int) update the target network every `target_update_interval` steps.
    :param gradient_steps: (int) How many gradient updates to perform after each step
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    """
    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 learning_rate=3e-3,
                 buffer_size=50000,
                 learning_starts=100,
                 train_freq=1,
                 batch_size=64,
                 tau=0.005,
                 ent_coef=0.1,
                 target_update_interval=1,
                 gradient_steps=1,
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True):
        super(SAC, self).__init__(policy=policy,
                                  env=env,
                                  replay_buffer=None,
                                  verbose=verbose,
                                  policy_base=SACPolicy,
                                  requires_vec_env=False)

        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.tau = tau
        # In the original paper, same learning rate is used for all networks
        # self.policy_lr = learning_rate
        # self.qf_lr = learning_rate
        # self.vf_lr = learning_rate
        # Entropy coefficient / Entropy temperature
        # Inverse of the reward scale
        self.ent_coef = ent_coef
        self.target_update_interval = target_update_interval
        self.gradient_steps = gradient_steps
        self.gamma = gamma

        self.value_fn = None
        self.graph = None
        self.replay_buffer = None
        self.episode_reward = None
        self.sess = None
        self.tensorboard_log = tensorboard_log
        self.verbose = verbose
        self.params = None
        self.summary = None
        self.policy_tf = None

        self.obs_target = None
        self.target_policy = None
        self.actions_ph = None
        self.rewards_ph = None
        self.terminals_ph = None
        self.observations_ph = None
        self.action_target = None
        self.next_observations_ph = None
        self.value_target = None
        self.step_ops = None
        self.target_update_op = None
        self.infos_names = None
        self.entropy = None
        self.target_params = None
        self.learning_rate_ph = None

        if _init_setup_model:
            self.setup_model()

    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                n_cpu = multiprocessing.cpu_count()
                if sys.platform == 'darwin':
                    n_cpu //= 2
                self.sess = tf_util.make_session(num_cpu=n_cpu,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space)
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32,
                                                     shape=(None, ) +
                                                     self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # mu corresponds to deterministic actions
                    # pi  corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of the action pi
                    _, policy_out, logp_pi = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        self.actions_ph,
                        create_qf=True,
                        create_vf=True)
                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        policy_out,
                        create_qf=True,
                        create_vf=False,
                        reuse=True)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(
                        self.processed_next_obs_ph,
                        create_qf=False,
                        create_vf=True)
                    self.value_target = value_target

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Targets for Q and V regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * self.value_target)

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2)
                    qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi -
                                                    qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    v_backup = tf.stop_gradient(min_qf_pi -
                                                self.ent_coef * logp_pi)
                    value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because qf1_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss, var_list=get_vars('model/pi'))

                    # Value train op
                    # (control dep of policy_train_op because sess.run otherwise
                    # evaluates in nondeterministic order)
                    value_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    values_params = get_vars('model/values_fn')

                    source_params = get_vars("model/values_fn/vf")
                    target_params = get_vars("target/values_fn/vf")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(
                            values_losses, var_list=values_params)

                        self.infos_names = [
                            'policy_loss', 'qf1_loss', 'qf2_loss',
                            'value_loss', 'entropy'
                        ]
                        # All ops to call during one training step
                        self.step_ops = [
                            policy_loss, qf1_loss, qf2_loss, value_loss, qf1,
                            qf2, value_fn, logp_pi, self.entropy,
                            policy_train_op, train_values_op
                        ]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('value_loss', value_loss)
                    tf.summary.scalar('entropy', self.entropy)
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = find_trainable_variables("model")
                self.target_params = find_trainable_variables(
                    "target/values_fn/vf")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
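    # Editor's note (hedged summary, not part of the original source): the targets
    # built in setup_model above follow the standard soft actor-critic update,
    # with ent_coef playing the role of the temperature alpha:
    #   q_backup    = r + gamma * (1 - done) * V_target(s')
    #   v_backup    = min(Q1(s, a~pi), Q2(s, a~pi)) - ent_coef * log pi(a|s)
    #   policy_loss = E[ent_coef * log pi(a|s) - Q1(s, a~pi)]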

    def _train_step(self, step, writer, learning_rate):
        # Sample a batch from the replay buffer
        batch = self.replay_buffer.sample(self.batch_size)
        batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch

        feed_dict = {
            self.observations_ph: batch_obs,
            self.actions_ph: batch_actions,
            self.next_observations_ph: batch_next_obs,
            self.rewards_ph: batch_rewards.reshape(self.batch_size, -1),
            self.terminals_ph: batch_dones.reshape(self.batch_size, -1),
            self.learning_rate_ph: learning_rate
        }

        # out  = [policy_loss, qf1_loss, qf2_loss,
        #         value_loss, qf1, qf2, value_fn, logp_pi,
        #         self.entropy, policy_train_op, train_values_op]

        # Do one gradient step
        # and optionally compute log for tensorboard
        if writer is not None:
            out = self.sess.run([self.summary] + self.step_ops, feed_dict)
            summary = out.pop(0)
            writer.add_summary(summary, step)
        else:
            out = self.sess.run(self.step_ops, feed_dict)

        # Unpack to monitor losses and entropy
        policy_loss, qf1_loss, qf2_loss, value_loss, *values = out
        # qf1, qf2, value_fn, logp_pi, entropy, *_ = values
        entropy = values[4]

        return policy_loss, qf1_loss, qf2_loss, value_loss, entropy

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="SAC"):
        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if step < self.learning_starts:
                    action = self.env.action_space.sample()
                    # No need to rescale when sampling random action
                    rescaled_action = action
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer, step)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        if step < self.batch_size or step < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step +
                                grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", step)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
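    # Editor's note (hedged): get_schedule_fn is assumed to wrap a constant
    # learning rate into a callable; inside learn() the callable receives
    # frac = 1 - step / total_timesteps, which decays from 1 towards 0, so any
    # schedule expressed on [0, 1] (constant, linear, ...) can be plugged in.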

    def action_probability(self, observation, state=None, mask=None):
        # There are no action probabilities here, as SAC works with continuous
        # actions, so we return the deterministic action vector instead
        return self.predict(observation, state, mask, deterministic=True)[0]

    def predict(self, observation, state=None, mask=None, deterministic=True):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        actions = self.policy_tf.step(observation, deterministic=deterministic)
        actions = actions.reshape(
            (-1, ) +
            self.action_space.shape)  # reshape to the correct action shape
        actions = actions * np.abs(
            self.action_space.low)  # scale the output for the prediction

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def save(self, save_path):
        data = {
            "learning_rate": self.learning_rate,
            "buffer_size": self.buffer_size,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "batch_size": self.batch_size,
            "tau": self.tau,
            "ent_coef": self.ent_coef,
            # Should we also store the replay buffer?
            # this may lead to high memory usage
            # with all transition inside
            # "replay_buffer": self.replay_buffer
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "_vectorize_action": self._vectorize_action
        }

        params = self.sess.run(self.params)
        target_params = self.sess.run(self.target_params)

        self._save_to_file(save_path, data=data, params=params + target_params)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        data, params = cls._load_from_file(load_path)

        model = cls(policy=data["policy"], env=env, _init_setup_model=False)
        model.__dict__.update(data)
        model.__dict__.update(kwargs)
        model.set_env(env)
        model.setup_model()

        restores = []
        for param, loaded_p in zip(model.params + model.target_params, params):
            restores.append(param.assign(loaded_p))
        model.sess.run(restores)

        return model
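A minimal usage sketch for the SAC example above. This is a hedged illustration, not part of the original listing: it assumes the class is importable as SAC from stable-baselines and that a registered continuous-control Gym environment is available; names such as "Pendulum-v0" and "MlpPolicy" are placeholders.

import gym
from stable_baselines import SAC  # assumed import path

env = gym.make("Pendulum-v0")
model = SAC("MlpPolicy", env, learning_rate=3e-4, buffer_size=50000, verbose=1)
model.learn(total_timesteps=10000, log_interval=4)
model.save("sac_pendulum")

# Reload the model and run the deterministic policy for one step
loaded = SAC.load("sac_pendulum", env=env)
obs = env.reset()
action, _ = loaded.predict(obs, deterministic=True)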
Example #18
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy_tf = self.policy(
                        self.sess, self.observation_space, self.action_space,
                        **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy_tf.obs_ph
                    self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                    self.action_target = self.target_policy_tf.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32,
                                                     shape=(None, ) +
                                                     self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    self.policy_out = policy_out = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # Use two Q-functions to improve performance by reducing overestimation bias
                    qf1, qf2 = self.policy_tf.make_critics(
                        self.processed_obs_ph, self.actions_ph)
                    # Q value when following the current policy
                    qf1_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph, policy_out, reuse=True)

                with tf.variable_scope("target", reuse=False):
                    # Create target networks
                    target_policy_out = self.target_policy_tf.make_actor(
                        self.processed_next_obs_ph)
                    # Target policy smoothing, by adding clipped noise to target actions
                    target_noise = tf.random_normal(
                        tf.shape(target_policy_out),
                        stddev=self.target_policy_noise)
                    target_noise = tf.clip_by_value(target_noise,
                                                    -self.target_noise_clip,
                                                    self.target_noise_clip)
                    # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                    noisy_target_action = tf.clip_by_value(
                        target_policy_out + target_noise, -1, 1)
                    # Q values when following the target policy
                    qf1_target, qf2_target = self.target_policy_tf.make_critics(
                        self.processed_next_obs_ph, noisy_target_action)

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two target Q-Values (clipped Double-Q Learning)
                    min_qf_target = tf.minimum(qf1_target, qf2_target)

                    # Targets for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * min_qf_target)

                    # Compute Q-Function loss
                    qf1_loss = tf.reduce_mean((q_backup - qf1)**2)
                    qf2_loss = tf.reduce_mean((q_backup - qf2)**2)

                    qvalues_losses = qf1_loss + qf2_loss

                    # Policy loss: maximise q value
                    self.policy_loss = policy_loss = -tf.reduce_mean(qf1_pi)

                    # Policy train op
                    # will be called only every n training steps,
                    # where n is the policy delay
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss, var_list=get_vars('model/pi'))
                    self.policy_train_op = policy_train_op

                    # Q Values optimizer
                    qvalues_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    qvalues_params = get_vars('model/values_fn/')

                    # Q Values and policy target params
                    source_params = get_vars("model/")
                    target_params = get_vars("target/")

                    # Polyak averaging for target variables
                    self.target_ops = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    train_values_op = qvalues_optimizer.minimize(
                        qvalues_losses, var_list=qvalues_params)

                    self.infos_names = ['qf1_loss', 'qf2_loss']
                    # All ops to call during one training step
                    self.step_ops = [
                        qf1_loss, qf2_loss, qf1, qf2, train_values_op
                    ]

                    # Monitor losses in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = get_vars("model")
                self.target_params = get_vars("target/")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
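The target-network machinery in the snippet above reduces to two small numerical ideas: a Polyak (exponential-moving-average) update of the target weights, theta_target <- (1 - tau) * theta_target + tau * theta, and clipped Gaussian noise added to the target action for smoothing. A self-contained NumPy sketch of both follows; function and variable names here are illustrative only, not taken from the original code.

import numpy as np


def polyak_update(target_params, source_params, tau=0.005):
    # theta_target <- (1 - tau) * theta_target + tau * theta_source
    return [(1.0 - tau) * t + tau * s
            for t, s in zip(target_params, source_params)]


def smoothed_target_action(target_policy_out, noise_std=0.2, noise_clip=0.5):
    # Add clipped Gaussian noise, then clip back to the tanh range [-1, 1]
    noise = np.clip(
        np.random.normal(0.0, noise_std, size=target_policy_out.shape),
        -noise_clip, noise_clip)
    return np.clip(target_policy_out + noise, -1.0, 1.0)


# A single target weight drifting towards the online weight
target, source = np.array([0.0]), np.array([1.0])
for _ in range(5):
    target, = polyak_update([target], [source], tau=0.5)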
Example #19
class TradingDQN(DQN):
    def __init__(self,
                 policy,
                 env,
                 gamma=0.9,
                 batch_size=32,
                 buffer_size=100000,
                 learning_starts=10000,
                 learning_rate=0.0001,
                 target_network_update_freq=1000,
                 exploration_final_eps=0.02,
                 exploration_fraction=0.1,
                 tensorboard_log=None,
                 _init_setup_model=True):

        super().__init__(policy=policy,
                         env=env,
                         gamma=gamma,
                         batch_size=batch_size,
                         buffer_size=buffer_size,
                         learning_starts=learning_starts,
                         learning_rate=learning_rate,
                         target_network_update_freq=target_network_update_freq,
                         exploration_final_eps=exploration_final_eps,
                         exploration_fraction=exploration_fraction,
                         tensorboard_log=tensorboard_log,
                         _init_setup_model=_init_setup_model)

    def setup_model(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(graph=self.graph)

            # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/deepq/build_graph.py
            self.act, self.train_step, self.update_target, self.step_model = deepq.build_train(
                q_func=self.policy,
                ob_space=self.env.observation_space,
                ac_space=self.env.action_space,
                optimizer=tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate),
                gamma=self.gamma,
                # grad_norm_clipping=1,
                sess=self.sess)
            self.params = find_trainable_variables('deepq')

            tf_util.initialize(self.sess)
            self.update_target(sess=self.sess)
            self.summary = tf.summary.merge_all()

    def learn(self,
              total_timesteps,
              seed=None,
              tb_log_name='DQN',
              test_interval=1,
              reset_num_timesteps=True):
        if reset_num_timesteps:
            self.num_timesteps = 0

        with TensorboardWriter(self.graph, self.tensorboard_log,
                               tb_log_name) as writer:
            self._setup_learn(seed)

            self.replay_buffer = ReplayBuffer(size=self.buffer_size)
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)
            episode_rewards = [0.0]
            obs = self.env.reset(train=True)

            best_train_score = None
            best_test_score = None
            self.reward_curve = []

            for _ in range(total_timesteps):
                update_eps = self.exploration.value(self.num_timesteps)
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps)[0]
                new_obs, rew, done, _ = self.env.step(action)

                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                episode_rewards[-1] += rew

                if self.num_timesteps > self.learning_starts:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                        self.batch_size)
                    weights = np.ones_like(rewards)
                    # Do one gradient step; the train op also returns the
                    # tensorboard summary for that step
                    summary, td_errors = self.train_step(obses_t, actions,
                                                         rewards, obses_tp1,
                                                         obses_tp1, dones,
                                                         weights,
                                                         sess=self.sess)
                    # Only write the summary every 100 steps to limit overhead
                    if writer is not None and (1 + self.num_timesteps) % 100 == 0:
                        writer.add_summary(summary, self.num_timesteps)

                if self.num_timesteps > self.learning_starts and self.num_timesteps % self.target_network_update_freq == 0:
                    self.update_target(sess=self.sess)

                if done:

                    print('-------------------------------------')
                    print('steps                     | {}'.format(
                        self.num_timesteps))
                    print('episodes                  | {}'.format(
                        len(episode_rewards)))
                    epsilon = int(100 *
                                  self.exploration.value(self.num_timesteps))
                    print('% time spent exploring    | {}'.format(epsilon))
                    print('--')

                    # Mean reward over the last 15 completed episodes
                    mean_ep_reward = -np.inf if len(
                        episode_rewards[-16:-1]) == 0 else round(
                            float(np.mean(episode_rewards[-16:-1])), 1)
                    self.reward_curve.append(mean_ep_reward)
                    print('mean 15 episode reward    | {:.1f}'.format(
                        mean_ep_reward))

                    journal = self.env.sim.journal
                    print('Total operations          | {}'.format(
                        len(self.env.sim.journal)))
                    longs = [x for x in journal if x['Type'] == 'LONG']
                    shorts = [x for x in journal if x['Type'] == 'SHORT']
                    print('Long/Short                | {}/{}'.format(
                        len(longs), len(shorts)))
                    print('Avg duration trades       | {:.2f}'.format(
                        np.mean([j['Trade Duration'] for j in journal])))
                    total_profit = sum([j['Profit'] for j in journal])
                    print('Total profit              | {:.2f}'.format(
                        total_profit))
                    print('Avg profit per trade      | {:.3f}'.format(
                        total_profit / self.env.sim.total_trades))

                    if epsilon <= self.exploration_final_eps * 100:
                        if best_train_score is None or total_profit > best_train_score:
                            self.save('saves/best_model_train.pkl')
                            best_train_score = total_profit

                    if self.num_timesteps % test_interval == 0:
                        print('--')
                        test_episode_rewards, test_longs, test_shorts, test_ave_profit_per_trade = self.test(
                        )
                        print('Total profit test         > {:.2f}'.format(
                            test_episode_rewards))
                        print('Long/Short test           > {}/{}'.format(
                            test_longs, test_shorts))
                        print('Avg profit per trade test > {:.3f}'.format(
                            test_ave_profit_per_trade))

                        if epsilon <= self.exploration_final_eps * 100:
                            if best_test_score is None or test_episode_rewards > best_test_score:
                                self.save('saves/best_model_test.pkl')
                                best_test_score = test_episode_rewards
                    print('-------------------------------------')

                    obs = self.env.reset()
                    episode_rewards.append(0.0)

                    if self.num_timesteps + (
                            self.num_timesteps /
                            len(episode_rewards)) >= total_timesteps:
                        self.save('saves/final_model.pkl')
                        break

                self.num_timesteps += 1
        return self

    def test(self):
        obs = self.env.reset(train=False)
        done = False
        while not done:
            action, _ = self.predict(obs)
            obs, reward, done, info = self.env.step(action)
        # Compute the statistics once the test episode is over
        journal = self.env.sim.journal
        longs = len([x for x in journal if x['Type'] == 'LONG'])
        shorts = len([x for x in journal if x['Type'] == 'SHORT'])
        test_episode_rewards = sum(j['Profit'] for j in journal)
        test_ave_profit_per_trade = (test_episode_rewards / self.env.sim.total_trades
                                     if self.env.sim.total_trades > 0 else -np.inf)
        return test_episode_rewards, longs, shorts, test_ave_profit_per_trade

    def save(self, save_path):
        data = {
            'batch_size': self.batch_size,
            'learning_starts': self.learning_starts,
            'learning_rate': self.learning_rate,
            'target_network_update_freq': self.target_network_update_freq,
            'exploration_final_eps': self.exploration_final_eps,
            'exploration_fraction': self.exploration_fraction,
            'gamma': self.gamma,
            'policy': self.policy,
            'journal': self.env.sim.journal,
            'reward_curve': self.reward_curve
        }

        params = self.sess.run(self.params)

        self._save_to_file(save_path, data=data, params=params)
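For reference, the exploration schedule that TradingDQN.learn queries via `self.exploration.value(...)` anneals epsilon linearly from 1.0 down to `exploration_final_eps` over the first `exploration_fraction` of the timesteps, then holds it constant. A stand-alone sketch of that behaviour (an illustration of the schedule, not the stable-baselines LinearSchedule implementation itself):

def linear_epsilon(step, total_timesteps, exploration_fraction=0.1,
                   initial_p=1.0, final_p=0.02):
    # Linearly anneal from initial_p to final_p over the first
    # exploration_fraction * total_timesteps steps, then stay at final_p.
    schedule_timesteps = int(exploration_fraction * total_timesteps)
    fraction = min(float(step) / max(schedule_timesteps, 1), 1.0)
    return initial_p + fraction * (final_p - initial_p)


# With 100000 total steps: epsilon is 1.0 at step 0 and 0.02 from step 10000 on
assert abs(linear_epsilon(0, 100000) - 1.0) < 1e-8
assert abs(linear_epsilon(10000, 100000) - 0.02) < 1e-8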
Example #20
class TD3(OffPolicyRLModel):
    """
    Twin Delayed DDPG (TD3)
    Addressing Function Approximation Error in Actor-Critic Methods.

    Original implementation: https://github.com/sfujim/TD3
    Paper: https://arxiv.org/pdf/1802.09477.pdf
    Introduction to TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html

    :param policy: (TD3Policy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount factor
    :param learning_rate: (float or callable) learning rate for adam optimizer,
        the same learning rate will be used for all networks (Q-Values and Actor networks)
        it can be a function of the current progress (from 1 to 0)
    :param buffer_size: (int) size of the replay buffer
    :param batch_size: (int) Minibatch size for each gradient update
    :param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1)
    :param policy_delay: (int) Policy and target networks will only be updated once every `policy_delay`
        training steps. The Q values will be updated `policy_delay` times more often (i.e. every training step).
    :param action_noise: (ActionNoise) the action noise type. Cf DDPG for the different action noise type.
    :param target_policy_noise: (float) Standard deviation of gaussian noise added to target policy
        (smoothing noise)
    :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise.
    :param train_freq: (int) Update the model every `train_freq` steps.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param gradient_steps: (int) How many gradient updates to perform after each step
    :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy)
        This is not needed for TD3 normally but can help exploring when using HER + TD3.
        This hack was present in the original OpenAI Baselines repo (DDPG + HER)
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        Note: this has no effect on TD3 logging for now
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations.
        If None, the number of CPUs of the current machine will be used.
    """
    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 learning_rate=3e-4,
                 buffer_size=50000,
                 learning_starts=100,
                 train_freq=100,
                 gradient_steps=100,
                 batch_size=128,
                 tau=0.005,
                 policy_delay=2,
                 action_noise=None,
                 target_policy_noise=0.2,
                 target_noise_clip=0.5,
                 random_exploration=0.0,
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False,
                 seed=None,
                 n_cpu_tf_sess=None):

        super(TD3, self).__init__(policy=policy,
                                  env=env,
                                  replay_buffer=None,
                                  verbose=verbose,
                                  policy_base=TD3Policy,
                                  requires_vec_env=False,
                                  policy_kwargs=policy_kwargs,
                                  seed=seed,
                                  n_cpu_tf_sess=n_cpu_tf_sess)

        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.tau = tau
        self.gradient_steps = gradient_steps
        self.gamma = gamma
        self.action_noise = action_noise
        self.random_exploration = random_exploration
        self.policy_delay = policy_delay
        self.target_noise_clip = target_noise_clip
        self.target_policy_noise = target_policy_noise

        self.graph = None
        self.replay_buffer = None
        self.episode_reward = None
        self.sess = None
        self.tensorboard_log = tensorboard_log
        self.verbose = verbose
        self.params = None
        self.summary = None
        self.policy_tf = None
        self.full_tensorboard_log = full_tensorboard_log

        self.obs_target = None
        self.target_policy_tf = None
        self.actions_ph = None
        self.rewards_ph = None
        self.terminals_ph = None
        self.observations_ph = None
        self.action_target = None
        self.next_observations_ph = None
        self.step_ops = None
        self.target_ops = None
        self.infos_names = None
        self.target_params = None
        self.learning_rate_ph = None
        self.processed_obs_ph = None
        self.processed_next_obs_ph = None
        self.policy_out = None
        self.policy_train_op = None
        self.policy_loss = None

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        policy = self.policy_tf
        # Rescale
        policy_out = unscale_action(self.action_space, self.policy_out)
        return policy.obs_ph, self.actions_ph, policy_out
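    # Editor's note (hedged): scale_action / unscale_action are assumed to be the
    # usual stable-baselines linear mappings between the environment's Box bounds
    # and the tanh range [-1, 1] used by the policy, roughly
    #   scale_action(space, a)   = 2 * (a - low) / (high - low) - 1
    #   unscale_action(space, a) = low + 0.5 * (a + 1) * (high - low)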

    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy_tf = self.policy(
                        self.sess, self.observation_space, self.action_space,
                        **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy_tf.obs_ph
                    self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                    self.action_target = self.target_policy_tf.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32,
                                                     shape=(None, ) +
                                                     self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    self.policy_out = policy_out = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # Use two Q-functions to improve performance by reducing overestimation bias
                    qf1, qf2 = self.policy_tf.make_critics(
                        self.processed_obs_ph, self.actions_ph)
                    # Q value when following the current policy
                    qf1_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph, policy_out, reuse=True)

                with tf.variable_scope("target", reuse=False):
                    # Create target networks
                    target_policy_out = self.target_policy_tf.make_actor(
                        self.processed_next_obs_ph)
                    # Target policy smoothing, by adding clipped noise to target actions
                    target_noise = tf.random_normal(
                        tf.shape(target_policy_out),
                        stddev=self.target_policy_noise)
                    target_noise = tf.clip_by_value(target_noise,
                                                    -self.target_noise_clip,
                                                    self.target_noise_clip)
                    # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                    noisy_target_action = tf.clip_by_value(
                        target_policy_out + target_noise, -1, 1)
                    # Q values when following the target policy
                    qf1_target, qf2_target = self.target_policy_tf.make_critics(
                        self.processed_next_obs_ph, noisy_target_action)

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two target Q-Values (clipped Double-Q Learning)
                    min_qf_target = tf.minimum(qf1_target, qf2_target)

                    # Targets for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * min_qf_target)

                    # Compute Q-Function loss
                    qf1_loss = tf.reduce_mean((q_backup - qf1)**2)
                    qf2_loss = tf.reduce_mean((q_backup - qf2)**2)

                    qvalues_losses = qf1_loss + qf2_loss

                    # Policy loss: maximise q value
                    self.policy_loss = policy_loss = -tf.reduce_mean(qf1_pi)

                    # Policy train op
                    # will be called only every n training steps,
                    # where n is the policy delay
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss, var_list=get_vars('model/pi'))
                    self.policy_train_op = policy_train_op

                    # Q Values optimizer
                    qvalues_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    qvalues_params = get_vars('model/values_fn/')

                    # Q Values and policy target params
                    source_params = get_vars("model/")
                    target_params = get_vars("target/")

                    # Polyak averaging for target variables
                    self.target_ops = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    train_values_op = qvalues_optimizer.minimize(
                        qvalues_losses, var_list=qvalues_params)

                    self.infos_names = ['qf1_loss', 'qf2_loss']
                    # All ops to call during one training step
                    self.step_ops = [
                        qf1_loss, qf2_loss, qf1, qf2, train_values_op
                    ]

                    # Monitor losses in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = get_vars("model")
                self.target_params = get_vars("target/")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()

    def _train_step(self, step, writer, learning_rate, update_policy):
        # Sample a batch from the replay buffer
        batch = self.replay_buffer.sample(self.batch_size)
        batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch

        feed_dict = {
            self.observations_ph: batch_obs,
            self.actions_ph: batch_actions,
            self.next_observations_ph: batch_next_obs,
            self.rewards_ph: batch_rewards.reshape(self.batch_size, -1),
            self.terminals_ph: batch_dones.reshape(self.batch_size, -1),
            self.learning_rate_ph: learning_rate
        }

        step_ops = self.step_ops
        if update_policy:
            # Update policy and target networks
            step_ops = step_ops + [
                self.policy_train_op, self.target_ops, self.policy_loss
            ]

        # Do one gradient step
        # and optionally compute log for tensorboard
        if writer is not None:
            out = self.sess.run([self.summary] + step_ops, feed_dict)
            summary = out.pop(0)
            writer.add_summary(summary, step)
        else:
            out = self.sess.run(step_ops, feed_dict)

        # Unpack to monitor losses
        qf1_loss, qf2_loss, *_values = out

        return qf1_loss, qf2_loss
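    # Editor's note: the update_policy flag implements TD3's delayed policy
    # updates. The critics are trained on every call to _train_step, while the
    # actor and the target networks are only updated when learn() passes
    # (step + grad_step) % self.policy_delay == 0, i.e. every policy_delay steps.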

    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=4,
              tb_log_name="TD3",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if self.num_timesteps < self.learning_starts or np.random.rand(
                ) < self.random_exploration:
                    # Actions sampled from the action space are in the range specific to the environment,
                    # but the algorithm operates on tanh-squashed actions, so a simple rescaling is used
                    unscaled_action = self.env.action_space.sample()
                    action = scale_action(self.action_space, unscaled_action)
                else:
                    action = self.policy_tf.step(obs[None]).flatten()
                    # Add noise to the action, as the policy
                    # is deterministic, this is required for exploration
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    unscaled_action = unscale_action(self.action_space, action)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(unscaled_action)

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                                or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        # Note: the policy is updated less frequently than the Q functions
                        # this is controlled by the `policy_delay` parameter
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr,
                                             (step + grad_step) %
                                             self.policy_delay == 0))

                    # Log losses, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self

    def action_probability(self,
                           observation,
                           state=None,
                           mask=None,
                           actions=None,
                           logp=False):
        _ = np.array(observation)

        if actions is not None:
            raise ValueError("Error: TD3 does not have action probabilities.")

        # here there are no action probabilities, as TD3 does not use a probability distribution
        warnings.warn(
            "Warning: action probability is meaningless for TD3. Returning None"
        )
        return None

    def predict(self, observation, state=None, mask=None, deterministic=True):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        actions = self.policy_tf.step(observation)

        if self.action_noise is not None and not deterministic:
            actions = np.clip(actions + self.action_noise(), -1, 1)

        actions = actions.reshape(
            (-1, ) +
            self.action_space.shape)  # reshape to the correct action shape
        actions = unscale_action(
            self.action_space, actions)  # scale the output for the prediction

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def get_parameter_list(self):
        return (self.params + self.target_params)

    def save(self, save_path, cloudpickle=False):
        data = {
            "learning_rate": self.learning_rate,
            "buffer_size": self.buffer_size,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "batch_size": self.batch_size,
            "tau": self.tau,
            # Should we also store the replay buffer?
            # This may lead to high memory usage
            # with all transitions inside
            # "replay_buffer": self.replay_buffer
            "policy_delay": self.policy_delay,
            "target_noise_clip": self.target_noise_clip,
            "target_policy_noise": self.target_policy_noise,
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "n_cpu_tf_sess": self.n_cpu_tf_sess,
            "seed": self.seed,
            "action_noise": self.action_noise,
            "random_exploration": self.random_exploration,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs
        }

        params_to_save = self.get_parameters()

        self._save_to_file(save_path,
                           data=data,
                           params=params_to_save,
                           cloudpickle=cloudpickle)
Example #21
0
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=self.exploration_initial_eps,
                final_p=self.exploration_final_eps)
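            # For example (hypothetical numbers): with total_timesteps=100000 and
            # exploration_fraction=0.1, epsilon is annealed linearly from
            # exploration_initial_eps to exploration_final_eps over the first
            # 10000 steps and held constant afterwards.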

            episode_rewards = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True

            ############################################################
            # MODIFICATION:
            # Track list of actions taken each episode. This is
            # intentionally not a set so that we can use np.isin.
            action_list = list()
            ############################################################

            for _ in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    ####################################################
                    # MODIFICATION:
                    # Rename variable from original, since it's now
                    # going to come back as an array due to the
                    # modified build_act function being used to
                    # construct everything.
                    action_arr = self.act(np.array(obs)[None],
                                          update_eps=update_eps,
                                          **kwargs)[0]
                    ####################################################
                    # ORIGINAL:
                    # action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]

                ########################################################
                # MODIFICATION:
                # Get the best action that has not yet been taken this
                # episode.
                action = \
                    action_arr[np.argmin(np.isin(action_arr, action_list))]
                # Add this action to the list.
                action_list.append(action)
                ########################################################
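                # Worked example (hypothetical values): if action_arr = [3, 1, 2]
                # ranks actions best-first and action_list = [3], np.isin yields
                # [True, False, False] and np.argmin picks index 1, so action 1 is
                # chosen, i.e. the best action not yet taken this episode. If every
                # action has already been taken, np.argmin returns 0 and the
                # top-ranked action is reused.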

                env_action = action
                reset = False
                new_obs, rew, done, info = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    total_episode_reward_logger(self.episode_reward, ep_rew,
                                                ep_done, writer,
                                                self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    ####################################################
                    # MODIFICATION:
                    # Clear the list.
                    action_list.clear()
                    ####################################################
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                               "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
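                        # With uniform replay every sample gets importance weight 1
                        # and no indices are needed for priority updates.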
                    # pytype:enable=bad-unpacking

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        assert isinstance(self.replay_buffer,
                                          PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

                self.num_timesteps += 1

        return self
Example #22
0
class OurDDPG(OffPolicyRLModel):
    def __init__(self,
                 policy,
                 env,
                 seed=0,
                 eval_env=None,
                 eval_freq=5000,
                 gamma=0.99,
                 tau=0.005,
                 action_noise=None,
                 normalize_observations=False,
                 normalize_returns=False,
                 observation_range=(-np.inf, np.inf),
                 return_range=(-np.inf, np.inf),
                 reward_scale=1.,
                 critic_l2_reg=0.,
                 clip_norm=None,
                 actor_lr=1e-3,
                 critic_lr=1e-3,
                 buffer_size=1e6,
                 batch_size=128,
                 verbose=0,
                 policy_kwargs=None,
                 tensorboard_log=None,
                 full_tensorboard_log=False,
                 _init_setup_model=True,
                 ro=True,
                 sample_number=128,
                 adjust_lr=False):

        super(OurDDPG, self).__init__(policy=policy,
                                      env=env,
                                      replay_buffer=None,
                                      verbose=verbose,
                                      policy_base=DDPGPolicy,
                                      requires_vec_env=False,
                                      policy_kwargs=policy_kwargs)

        # Parameters.
        self.seed = seed
        self.gamma = gamma
        self.tau = tau
        self.ro = ro
        self.sample_number = sample_number
        self.eval_freq = eval_freq
        self.adjust_lr = adjust_lr
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.return_range = return_range
        self.observation_range = observation_range
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.critic_l2_reg = critic_l2_reg
        self.eval_env = eval_env
        self.buffer_size = buffer_size
        self.tensorboard_log = tensorboard_log
        self.full_tensorboard_log = full_tensorboard_log

        # init
        self.graph = None
        self.stats_sample = None
        self.replay_buffer = None
        self.policy_tf = None
        self.target_init_updates = None
        self.target_soft_updates = None
        self.critic_loss = None
        self.critic_optimizer = None
        self.critic_optimize_op = None
        self.sess = None
        self.stats_ops = None
        self.stats_names = None
        self.perturbed_actor_tf = None
        self.perturb_policy_ops = None
        self.perturb_adaptive_policy_ops = None
        self.adaptive_policy_distance = None
        self.actor_loss = None
        self.actor_optimizer = None
        self.actor_optimize_op = None
        self.old_std = None
        self.old_mean = None
        self.renormalize_q_outputs_op = None
        self.obs_rms = None
        self.ret_rms = None
        self.target_policy = None
        self.actor_tf = None
        self.critic_tf = None
        self.critic_with_actor_tf = None
        self.target_q = None
        self.obs_train_ph = None
        self.action_train_ph = None
        self.obs_target = None
        self.action_target = None
        self.obs_noise = None
        self.action_noise_ph = None
        self.obs_adapt_noise = None
        self.action_adapt_noise = None
        self.terminals1 = None
        self.rewards = None
        self.critic_target = None
        self.param_noise_stddev = None
        self.param_noise_actor = None
        self.adaptive_param_noise_actor = None
        self.params = None
        self.summary = None
        self.episode_reward = None
        self.tb_seen_steps = None

        self.target_params = None
        self.obs_rms_params = None
        self.ret_rms_params = None

        # Randomized Optimization
        self.augmented_obs0 = None
        self.augmented_action_raw = None
        self.augmented_action = None
        self.augmented_critic_with_actor_tf = None
        self.reward_summary = None
        self.actor_loss_summary = None
        self.critic_loss_summary = None
        self.obs_summary = None

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        raise NotImplementedError
        # policy = self.policy_tf
        # # Rescale
        # deterministic_action = self.actor_tf * np.abs(self.action_space.low)
        # return policy.obs_ph, self.actions, deterministic_action

    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert isinstance(self.action_space, gym.spaces.Box), \
                "Error: DDPG cannot output a {} action space, only spaces.Box is supported.".format(self.action_space)
            assert issubclass(self.policy, DDPGPolicy), "Error: the input policy for the DDPG model must be " \
                                                        "an instance of DDPGPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self._setup_learn(self.seed)
                # self.sess = tf_util.single_threaded_session(graph=self.graph)
                self.sess = tf_util.make_session()

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Observation normalization.
                    # if self.normalize_observations:
                    #     with tf.variable_scope('obs_rms'):
                    #         self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
                    # else:
                    #     self.obs_rms = None

                    # Return normalization.
                    # if self.normalize_returns:
                    #     with tf.variable_scope('ret_rms'):
                    #         self.ret_rms = RunningMeanStd()
                    # else:
                    #     self.ret_rms = None

                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space, 1, 1, None,
                                                 **self.policy_kwargs)

                    # Create target networks.
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space, 1, 1,
                                                     None,
                                                     **self.policy_kwargs)
                    self.obs_target = self.target_policy.obs_ph
                    self.action_target = self.target_policy.action_ph

                    # normalized_obs0 = tf.clip_by_value(normalize(self.policy_tf.processed_obs, self.obs_rms),
                    #                                    self.observation_range[0], self.observation_range[1])
                    # normalized_obs1 = tf.clip_by_value(normalize(self.target_policy.processed_obs, self.obs_rms),
                    #                                    self.observation_range[0], self.observation_range[1])

                    # Inputs.
                    self.obs_train_ph = self.policy_tf.obs_ph
                    self.action_train_ph = self.policy_tf.action_ph
                    self.terminals1 = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='terminals1')
                    self.rewards = tf.placeholder(tf.float32,
                                                  shape=(None, 1),
                                                  name='rewards')
                    self.critic_target = tf.placeholder(tf.float32,
                                                        shape=(None, 1),
                                                        name='critic_target')

                # Create networks and core TF parts that are shared across setup parts.
                with tf.variable_scope("model", reuse=False):
                    self.actor_tf = self.policy_tf.make_actor(
                        self.policy_tf.processed_obs)
                    self.critic_tf = self.policy_tf.make_critic(
                        self.policy_tf.processed_obs, self.action_train_ph)
                    self.critic_with_actor_tf = self.policy_tf.make_critic(
                        self.policy_tf.processed_obs,
                        self.actor_tf,
                        reuse=True)

                    if self.ro:

                        def tf_repeat(tensor_to_repeat, repeat_num):
                            tiled = tf.tile(tensor_to_repeat, [1, repeat_num])
                            repeated = tf.reshape(
                                tiled,
                                shape=[
                                    self.batch_size * repeat_num,
                                    tensor_to_repeat.shape[1]
                                ])
                            return repeated

                        self.augmented_obs0 = tf_repeat(
                            self.policy_tf.processed_obs, self.sample_number)
                        self.augmented_action_raw = tf_repeat(
                            self.actor_tf, self.sample_number)
                        noises = []
                        for b_index in range(self.batch_size):
                            noises.append(
                                tf.random_uniform((self.sample_number - 1, ) +
                                                  self.action_space.shape,
                                                  -0.1, 0.1))
                            noises.append(
                                tf.zeros((1, ) + self.action_space.shape))
                        noises = tf.concat(noises, axis=0)
                        self.augmented_action = self.augmented_action_raw + noises
                        self.augmented_action = tf.clip_by_value(
                            self.augmented_action, -1, 1)
                        self.augmented_critic_with_actor_tf = self.policy_tf.make_critic(
                            self.augmented_obs0,
                            self.augmented_action,
                            reuse=True)[:, 0]
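                        # Shape summary: each row of processed_obs and actor_tf is
                        # repeated sample_number times, every copy except the last
                        # per batch element is perturbed with uniform noise in
                        # [-0.1, 0.1], and the critic then scores all
                        # batch_size * sample_number candidate actions in one pass.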

                with tf.variable_scope("target", reuse=False):
                    critic_target = \
                        self.target_policy.make_critic(self.target_policy.processed_obs,
                                                       self.target_policy.make_actor(self.target_policy.processed_obs))

                with tf.variable_scope("loss", reuse=False):
                    # self.critic_tf = denormalize(
                    #     tf.clip_by_value(self.critic_tf, self.return_range[0], self.return_range[1]),
                    #     self.ret_rms)
                    #
                    # self.critic_with_actor_tf = denormalize(
                    #     tf.clip_by_value(self.critic_with_actor_tf,
                    #                      self.return_range[0], self.return_range[1]),
                    #     self.ret_rms)
                    #
                    # q_obs1 = denormalize(critic_target, self.ret_rms)
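                    # Bellman backup: y = r + gamma * (1 - done) * Q_target(s', pi_target(s'))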
                    self.target_q = self.rewards + (
                        1. - self.terminals1) * self.gamma * critic_target

                    # tf.summary.scalar('critic_target', tf.reduce_mean(self.critic_target))
                    if self.full_tensorboard_log:
                        tf.summary.histogram('critic_target',
                                             self.critic_target)

                    # Set up parts.
                    self._setup_stats()
                    self._setup_target_network_updates()

                with tf.variable_scope("input_info", reuse=False):
                    self.reward_summary = tf.summary.scalar(
                        'rewards', tf.reduce_mean(self.rewards))
                    self.obs_summary = tf.summary.scalar(
                        'obs', tf.reduce_mean(self.obs_train_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('rewards', self.rewards)
                        if len(self.observation_space.shape
                               ) == 3 and self.observation_space.shape[0] in [
                                   1, 3, 4
                               ]:
                            tf.summary.image('observation', self.obs_train_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 self.obs_train_ph)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self._setup_actor_optimizer()
                    self._setup_critic_optimizer()
                    self.actor_loss_summary = tf.summary.scalar(
                        'actor_loss', self.actor_loss)
                    self.critic_loss_summary = tf.summary.scalar(
                        'critic_loss', self.critic_loss)

                self.params = tf_util.get_trainable_vars("model")

                self.target_params = tf_util.get_trainable_vars("target")
                self.obs_rms_params = [
                    var for var in tf.global_variables()
                    if "obs_rms" in var.name
                ]
                self.ret_rms_params = [
                    var for var in tf.global_variables()
                    if "ret_rms" in var.name
                ]

                with self.sess.as_default():
                    self._initialize(self.sess)

                # self.summary = tf.summary.merge_all()

    def _setup_target_network_updates(self):
        """
        set the target update operations
        """
        init_updates, soft_updates = get_target_updates(
            tf_util.get_trainable_vars('model/'),
            tf_util.get_trainable_vars('target/'), self.tau, self.verbose)
        self.target_init_updates = init_updates
        self.target_soft_updates = soft_updates

    def _setup_actor_optimizer(self):
        """
        setup the optimizer for the actor
        """
        if self.verbose >= 2:
            logger.info('setting up actor optimizer')

        if self.ro:
            split_group_action_raw = tf.split(self.augmented_action_raw,
                                              self.batch_size,
                                              axis=0)
            split_group_action = tf.split(self.augmented_action,
                                          self.batch_size,
                                          axis=0)
            split_group_q = tf.split(self.augmented_critic_with_actor_tf,
                                     self.batch_size,
                                     axis=0)

            self.actor_loss = 0
            q_stds = []
            for idx in range(self.batch_size):
                # softmax = tf.nn.softmax(split_group_q[idx] -
                #                         tf.reduce_max(split_group_q[idx], axis=0, keepdims=True), axis=0)
                # self.actor_loss = self.actor_loss + tf.reduce_sum(
                #     tf.reduce_sum(tf.square(split_group_action_raw[idx] -
                #                             tf.stop_gradient(split_group_action[idx])),
                #                   axis=1)
                #     * tf.stop_gradient(softmax))

                max_index = tf.argmax(split_group_q[idx], axis=0)
                q_std = tf.math.reduce_std(split_group_q[idx]) * 20
                target_action = split_group_action[idx][max_index, :]
                if self.adjust_lr:
                    self.actor_loss = self.actor_loss + \
                        tf.reduce_mean(tf.square(self.actor_tf[idx, :] - tf.stop_gradient(target_action))) \
                        / tf.stop_gradient(q_std)
                else:
                    self.actor_loss = self.actor_loss + \
                                      tf.reduce_mean(tf.square(self.actor_tf[idx, :] - tf.stop_gradient(target_action)))
                q_stds.append(q_std)
            # tf.summary.histogram("q_std", tf.stack(q_stds, axis=0))
        else:
            self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
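        # In the randomized-optimization (ro) branch above, the actor is regressed
        # toward the sampled action with the highest critic value for each state;
        # when adjust_lr is set, each term is divided by 20 times the std of that
        # state's sampled Q-values, damping the update where the Q estimates spread
        # widely. Without ro, the standard DDPG objective of maximizing the critic
        # output is used.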

        actor_shapes = [
            var.get_shape().as_list()
            for var in tf_util.get_trainable_vars('model/pi/')
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        if self.verbose >= 2:
            logger.info('  actor shapes: {}'.format(actor_shapes))
            logger.info('  actor params: {}'.format(actor_nb_params))
        # self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'),
        #                                     clip_norm=self.clip_norm)
        # self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'), beta1=0.9, beta2=0.999,
        #                                epsilon=1e-08)
        self.actor_optimizer = tf.train.AdamOptimizer(
            learning_rate=self.actor_lr)
        self.actor_gradients = self.actor_optimizer.compute_gradients(
            self.actor_loss, var_list=tf_util.get_trainable_vars("model/pi/"))
        hist_summary = []
        for gradient, variable in self.actor_gradients:
            if gradient is not None:
                hist_summary.append(
                    tf.summary.histogram("gradients/" + variable.name,
                                         gradient))
                hist_summary.append(
                    tf.summary.histogram("variables/" + variable.name,
                                         variable))
        self.actor_gradient_summary = tf.summary.merge(hist_summary)
        self.actor_optimize_op = self.actor_optimizer.apply_gradients(
            self.actor_gradients)
        # self.actor_optimize_op = self.actor_optimizer.minimize(self.actor_loss,
        #                                                        var_list=tf_util.get_trainable_vars("model/pi/"))

    def _setup_critic_optimizer(self):
        """
        setup the optimizer for the critic
        """
        if self.verbose >= 2:
            logger.info('setting up critic optimizer')
        # normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
        #                                                self.return_range[0], self.return_range[1])
        # self.critic_loss = tf.reduce_mean(tf.square(self.critic_tf - normalized_critic_target_tf))
        self.critic_loss = tf.reduce_mean(
            tf.square(self.critic_tf - self.critic_target))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in tf_util.get_trainable_vars('model/qf/')
                if 'bias' not in var.name and 'qf_output' not in var.name
                and 'b' not in var.name
            ]
            if self.verbose >= 2:
                for var in critic_reg_vars:
                    logger.info('  regularizing: {}'.format(var.name))
                logger.info('  applying l2 regularization with {}'.format(
                    self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list()
            for var in tf_util.get_trainable_vars('model/qf/')
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        if self.verbose >= 2:
            logger.info('  critic shapes: {}'.format(critic_shapes))
            logger.info('  critic params: {}'.format(critic_nb_params))
        # self.critic_grads = tf_util.flatgrad(self.critic_loss, tf_util.get_trainable_vars('model/qf/'),
        #                                      clip_norm=self.clip_norm)
        # self.critic_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/qf/'), beta1=0.9, beta2=0.999,
        #                                 epsilon=1e-08)
        self.critic_optimizer = tf.train.AdamOptimizer(
            learning_rate=self.critic_lr)
        self.critic_optimize_op = self.critic_optimizer.minimize(
            self.critic_loss, var_list=tf_util.get_trainable_vars("model/qf/"))

    def _setup_stats(self):
        """
        setup the running means and std of the inputs and outputs of the model
        """
        ops = []
        names = []

        # if self.normalize_returns:
        #     ops += [self.ret_rms.mean, self.ret_rms.std]
        #     names += ['ret_rms_mean', 'ret_rms_std']
        #
        # if self.normalize_observations:
        #     ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
        #     names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def _policy(self, obs, apply_noise=True, compute_q=True):
        """
        Get the actions and critic output, from a given observation

        :param obs: ([float] or [int]) the observation
        :param apply_noise: (bool) enable the noise
        :param compute_q: (bool) compute the critic output
        :return: ([float], float) the action and critic value
        """
        obs = np.array(obs).reshape((-1, ) + self.observation_space.shape)
        feed_dict = {self.obs_train_ph: obs}
        actor_tf = self.actor_tf

        if compute_q:
            action, q_value = self.sess.run(
                [actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q_value = None

        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, -1, 1)
        return action, q_value

    def _store_transition(self, obs0, action, reward, obs1, terminal1):
        """
        Store a transition in the replay buffer

        :param obs0: ([float] or [int]) the last observation
        :param action: ([float]) the action
        :param reward: (float) the reward
        :param obs1: ([float] or [int]) the current observation
        :param terminal1: (bool) Whether the episode is over
        """
        reward *= self.reward_scale
        self.replay_buffer.add(obs0, action, reward, obs1, float(terminal1))
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def _train_step(self, step, writer, do_actor_update):
        """
        run a step of training from batch

        :param step: (int) the current step iteration
        :param writer: (TensorFlow Summary.writer) the writer for tensorboard
        :param do_actor_update: (bool) whether to update the actor (and its summaries) on this step
        :return: (float, float) critic loss, actor loss
        """
        # Get a batch
        obs0, actions, rewards, obs1, terminals1 = self.replay_buffer.sample(
            batch_size=self.batch_size)
        # Reshape to match previous behavior and placeholder shape
        rewards = rewards.reshape(-1, 1)
        terminals1 = terminals1.reshape(-1, 1)

        target_q = self.sess.run(self.target_q,
                                 feed_dict={
                                     self.obs_target: obs1,
                                     self.rewards: rewards,
                                     self.terminals1: terminals1
                                 })

        # Get all gradients and perform a synced update.
        # ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        td_map = {
            self.obs_train_ph: obs0,
            # self.actions: actions,
            self.action_train_ph: actions,
            self.rewards: rewards,
            self.critic_target: target_q,
        }

        critic_loss_summary, reward_summary, obs_summary, critic_loss, _ = \
            self.sess.run([self.critic_loss_summary, self.reward_summary, self.obs_summary,
                           self.critic_loss, self.critic_optimize_op], td_map)
        # self.critic_optimizer.update(critic_grads, learning_rate=self.critic_lr)
        writer.add_summary(critic_loss_summary, step)
        writer.add_summary(reward_summary, step)
        writer.add_summary(obs_summary, step)

        actor_loss = None
        if do_actor_update:
            actor_loss_summary, actor_gradient_summary, actor_loss, _ = \
                self.sess.run([self.actor_loss_summary, self.actor_gradient_summary, self.actor_loss, self.actor_optimize_op], td_map)
            # self.actor_optimizer.update(actor_grads, learning_rate=self.actor_lr)
            writer.add_summary(actor_gradient_summary, step)
            writer.add_summary(actor_loss_summary, step)

        return critic_loss, actor_loss

    def _initialize(self, sess):
        """
        initialize the model parameters and optimizers

        :param sess: (TensorFlow Session) the current TensorFlow session
        """
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        # self.actor_optimizer.sync()
        # self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def _update_target_net(self):
        """
        run target soft update operation
        """
        self.sess.run(self.target_soft_updates)

    def _get_stats(self):
        """
        Get the mean and standard deviation of the model's inputs and outputs

        :return: (dict) the means and stds
        """
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            obs0, actions, rewards, obs1, terminals1 = self.replay_buffer.sample(
                batch_size=self.batch_size)
            self.stats_sample = {
                'obs0': obs0,
                'actions': actions,
                'rewards': rewards,
                'obs1': obs1,
                'terminals1': terminals1
            }

        # feed_dict = {
        #     self.actions: self.stats_sample['actions']
        # }
        feed_dict = {}

        for placeholder in [self.action_train_ph, self.action_target]:
            if placeholder is not None:
                feed_dict[placeholder] = self.stats_sample['actions']

        for placeholder in [self.obs_train_ph, self.obs_target]:
            if placeholder is not None:
                feed_dict[placeholder] = self.stats_sample['obs0']

        values = self.sess.run(self.stats_ops, feed_dict=feed_dict)

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        return stats

    def _reset(self):
        """
        Reset internal state after an episode is complete.
        """
        if self.action_noise is not None:
            self.action_noise.reset()

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=None,
              tb_log_name="DDPG",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            # a list for tensorboard logging, to prevent logging the same step number twice if it has already occurred
            self.tb_seen_steps = []

            # rank = MPI.COMM_WORLD.Get_rank()
            # we assume symmetric actions.
            assert np.all(
                np.abs(self.env.action_space.low) ==
                self.env.action_space.high)
            if self.verbose >= 2:
                logger.log('Using agent with the following configuration:')
                logger.log(str(self.__dict__.items()))

            with self.sess.as_default(), self.graph.as_default():
                # Prepare everything.
                self._reset()
                obs = self.env.reset()
                eval_obs = None
                if self.eval_env is not None:
                    eval_obs = self.eval_env.reset()

                episode_rewards_deque = deque(maxlen=100)
                eval_episode_rewards_deque = deque(maxlen=100)
                self.episode_reward = np.zeros((1, ))

                episode_successes = []
                episode_rewards_all = []
                episode_steps_all = []
                episode_reward = 0.
                episode_step = 0
                total_steps = 0
                step_since_eval = 0
                total_episode_num = 0

                start_time = time.time()

                while True:
                    # Perform rollouts.
                    qs_this_rollout_period = []
                    actions_this_rollout_period = []
                    while True:
                        if total_steps >= total_timesteps:
                            return self

                        # Predict next action.
                        if total_steps <= 10000:
                            action = self.env.action_space.sample()
                            q_value = 0
                        else:
                            action, q_value = self._policy(obs,
                                                           apply_noise=True,
                                                           compute_q=True)
                        assert action.shape == self.env.action_space.shape

                        rescaled_action = action * np.abs(
                            self.action_space.low)
                        new_obs, reward, done, info = self.env.step(
                            rescaled_action)

                        if writer is not None:
                            ep_rew = np.array([reward]).reshape((1, -1))
                            ep_done = np.array([done]).reshape((1, -1))
                            self.episode_reward = total_episode_reward_logger(
                                self.episode_reward, ep_rew, ep_done, writer,
                                self.num_timesteps)
                        total_steps += 1
                        self.num_timesteps += 1
                        episode_reward += reward
                        episode_step += 1
                        step_since_eval += 1

                        # Book-keeping.
                        actions_this_rollout_period.append(action)
                        qs_this_rollout_period.append(q_value)
                        self._store_transition(obs, action, reward, new_obs,
                                               done)
                        obs = new_obs

                        if done:
                            # Episode done.
                            episode_rewards_all.append(episode_reward)
                            episode_rewards_deque.append(episode_reward)
                            episode_steps_all.append(episode_step)
                            episode_reward = 0.
                            episode_step = 0
                            total_episode_num += 1

                            maybe_is_success = info.get('is_success')
                            if maybe_is_success is not None:
                                episode_successes.append(
                                    float(maybe_is_success))

                            self._reset()
                            if not isinstance(self.env, VecEnv):
                                obs = self.env.reset()
                            break

                    # Train.
                    actor_losses_this_train_period = []
                    critic_losses_this_train_period = []
                    last_episode_step = int(episode_steps_all[-1])
                    for t_train in range(last_episode_step):
                        # Not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size):
                            break

                        # offset the training-step index so tensorboard summaries line up
                        # with environment steps, since nb_train_steps generally differs
                        # from nb_rollout_steps

                        step = total_steps - last_episode_step + t_train

                        critic_loss, actor_loss = self._train_step(
                            step, writer, do_actor_update=t_train % 2 == 0)
                        critic_losses_this_train_period.append(critic_loss)
                        if actor_loss is not None:
                            actor_losses_this_train_period.append(actor_loss)
                            self._update_target_net()

                    # Evaluate.
                    eval_episode_rewards = []
                    eval_qs = []
                    if self.eval_env is not None and step_since_eval >= self.eval_freq:
                        step_since_eval %= self.eval_freq
                        eval_episode_reward = 0.
                        eval_episode = 0
                        while eval_episode < 10:
                            eval_action, eval_q = self._policy(
                                eval_obs, apply_noise=False, compute_q=True)
                            eval_obs, eval_r, eval_done, _ = self.eval_env.step(
                                eval_action * np.abs(self.action_space.low))
                            eval_episode_reward += eval_r

                            eval_qs.append(eval_q)
                            if eval_done:
                                if not isinstance(self.env, VecEnv):
                                    eval_obs = self.eval_env.reset()
                                eval_episode_rewards.append(
                                    eval_episode_reward)
                                eval_episode_rewards_deque.append(
                                    eval_episode_reward)
                                eval_episode_reward = 0.
                                eval_episode += 1

                    if callback is not None:
                        # Only stop training if return value is False, not when it is None.
                        # This is for backwards compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            return self

                    # mpi_size = MPI.COMM_WORLD.Get_size()
                    # Log stats.
                    # XXX shouldn't call np.mean on variable length lists
                    duration = time.time() - start_time
                    stats = self._get_stats()
                    combined_stats = stats.copy()
                    combined_stats['rollout/return'] = episode_rewards_all[-1]
                    combined_stats['rollout/return_last_100'] = np.mean(
                        episode_rewards_deque)
                    combined_stats[
                        'rollout/episode_steps'] = episode_steps_all[-1]
                    combined_stats['debug/actions_mean'] = np.mean(
                        actions_this_rollout_period)
                    combined_stats['debug/actions_std'] = np.std(
                        actions_this_rollout_period)
                    combined_stats['debug/Q_mean'] = np.mean(
                        qs_this_rollout_period)
                    combined_stats['train/loss_actor'] = np.mean(
                        actor_losses_this_train_period)
                    combined_stats['train/loss_critic'] = np.mean(
                        critic_losses_this_train_period)
                    combined_stats['total/duration'] = duration
                    combined_stats['total/steps_per_second'] = float(
                        total_steps) / float(duration)
                    # Evaluation statistics.
                    if self.eval_env is not None and eval_episode_rewards:
                        combined_stats['eval/return'] = np.mean(
                            eval_episode_rewards)
                        combined_stats['eval/return_history'] = np.mean(
                            eval_episode_rewards_deque)
                        combined_stats['eval/Q'] = np.mean(eval_qs)
                        combined_stats['eval/episodes'] = len(
                            eval_episode_rewards)

                    def as_scalar(scalar):
                        """
                        check and return the input if it is a scalar, otherwise raise ValueError

                        :param scalar: (Any) the object to check
                        :return: (Number) the value if it is a scalar
                        """
                        if isinstance(scalar, np.ndarray):
                            assert scalar.size == 1
                            return scalar[0]
                        elif np.isscalar(scalar):
                            return scalar
                        else:
                            raise ValueError('expected scalar, got %s' %
                                             scalar)

                    # combined_stats_sums = MPI.COMM_WORLD.allreduce(
                    #     np.array([as_scalar(x) for x in combined_stats.values()]))
                    # combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

                    # Total statistics.
                    combined_stats['total/episodes'] = total_episode_num
                    combined_stats['total/steps'] = total_steps

                    for key in sorted(combined_stats.keys()):
                        logger.record_tabular(key, combined_stats[key])
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.dump_tabular()
                    logger.info('')
                    logdir = logger.get_dir()
                    # if rank == 0 and logdir:
                    #     if hasattr(self.env, 'get_state'):
                    #         with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler:
                    #             pickle.dump(self.env.get_state(), file_handler)
                    #     if self.eval_env and hasattr(self.eval_env, 'get_state'):
                    #         with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler:
                    #             pickle.dump(self.eval_env.get_state(), file_handler)

    def predict(self, observation, state=None, mask=None, deterministic=True):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        actions, _ = self._policy(observation,
                                  apply_noise=not deterministic,
                                  compute_q=False)
        actions = actions.reshape(
            (-1, ) +
            self.action_space.shape)  # reshape to the correct action shape
        actions = actions * np.abs(
            self.action_space.low)  # scale the output for the prediction
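        # Note: rescaling by |action_space.low| assumes the action bounds are symmetric
        # around zero (low == -high) and that the policy outputs actions in [-1, 1].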

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def action_probability(self,
                           observation,
                           state=None,
                           mask=None,
                           actions=None):
        observation = np.array(observation)

        if actions is not None:
            raise ValueError("Error: DDPG does not have action probabilities.")

        # here there are no action probabilities, as DDPG does not use a probability distribution
        warnings.warn(
            "Warning: action probability is meaningless for DDPG. Returning None"
        )
        return None

    def get_parameter_list(self):
        return (self.params + self.target_params + self.obs_rms_params +
                self.ret_rms_params)

    def save(self, save_path):
        data = {
            "ro": self.ro,
            "seed": self.seed,
            "sample_number": self.sample_number,
            "eval_freq": self.eval_freq,
            "adjust_lr": self.adjust_lr,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "verbose": self.verbose,
            "action_noise": self.action_noise,
            "gamma": self.gamma,
            "tau": self.tau,
            "normalize_returns": self.normalize_returns,
            "normalize_observations": self.normalize_observations,
            "batch_size": self.batch_size,
            "observation_range": self.observation_range,
            "return_range": self.return_range,
            "critic_l2_reg": self.critic_l2_reg,
            "actor_lr": self.actor_lr,
            "critic_lr": self.critic_lr,
            "clip_norm": self.clip_norm,
            "reward_scale": self.reward_scale,
            "buffer_size": self.buffer_size,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs
        }

        params_to_save = self.get_parameters()

        self._save_to_file(save_path, data=data, params=params_to_save)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        data, params = cls._load_from_file(load_path)

        if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data[
                'policy_kwargs']:
            raise ValueError(
                "The specified policy kwargs do not equal the stored policy kwargs. "
                "Stored kwargs: {}, specified kwargs: {}".format(
                    data['policy_kwargs'], kwargs['policy_kwargs']))

        model = cls(None, env, _init_setup_model=False)
        model.__dict__.update(data)
        model.__dict__.update(kwargs)
        model.set_env(env)
        model.setup_model()
        # Patch for version < v2.6.0, duplicated keys were saved
        if len(params) > len(model.get_parameter_list()):
            n_params = len(model.params)
            n_target_params = len(model.target_params)
            n_normalisation_params = len(model.obs_rms_params) + len(
                model.ret_rms_params)
            # Check that the issue is the one from
            # https://github.com/hill-a/stable-baselines/issues/363
            assert len(params) == 2 * (n_params + n_target_params) + n_normalisation_params,\
                "The number of parameter saved differs from the number of parameters"\
                " that should be loaded: {}!={}".format(len(params), len(model.get_parameter_list()))
            # Remove duplicates
            params_ = params[:n_params + n_target_params]
            if n_normalisation_params > 0:
                params_ += params[-n_normalisation_params:]
            params = params_
        model.load_parameters(params)

        return model
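
A minimal save/load round trip for the class above (a hedged sketch: the class name DDPG, MlpPolicy and env are assumed to be defined elsewhere in the project, and constructor defaults are relied on):

model = DDPG(MlpPolicy, env)
model.learn(total_timesteps=10000)
model.save("./ddpg_inventory")        # stores the data dict plus the network parameters

# keyword arguments passed to load() override the stored hyperparameters
loaded = DDPG.load("./ddpg_inventory", env=env, verbose=1)
action, _ = loaded.predict(env.reset(), deterministic=True)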
Example #23
0
class DQN(OffPolicyRLModel):
    """
    The DQN model class.
    DQN paper: https://arxiv.org/abs/1312.5602
    Dueling DQN: https://arxiv.org/abs/1511.06581
    Double-Q Learning: https://arxiv.org/abs/1509.06461
    Prioritized Experience Replay: https://arxiv.org/abs/1511.05952

    :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) discount factor
    :param learning_rate: (float) learning rate for adam optimizer
    :param buffer_size: (int) size of the replay buffer
    :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is
            annealed
    :param exploration_final_eps: (float) final value of random action probability
    :param exploration_initial_eps: (float) initial value of random action probability
    :param train_freq: (int) update the model every `train_freq` steps.
    :param batch_size: (int) size of a batch sampled from the replay buffer for training
    :param double_q: (bool) Whether to enable Double-Q learning or not.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps.
    :param prioritized_replay: (bool) if True prioritized replay buffer will be used.
    :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer.
        It determines how much prioritization is used, with alpha=0 corresponding to the uniform case.
    :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
    :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial
            value to 1.0. If set to None, it defaults to total_timesteps.
    :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
    :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy.
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        WARNING: this logging can take a lot of space quickly
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations.
        If None, the number of CPUs of the current machine will be used.
    """
    def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1,
                 exploration_final_eps=0.02, exploration_initial_eps=1.0, train_freq=1, batch_size=32, double_q=True,
                 learning_starts=1000, target_network_update_freq=500, prioritized_replay=False,
                 prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6, param_noise=False,
                 n_cpu_tf_sess=None, verbose=0, tensorboard_log=None,
                 _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None,
                 use_rmsprop=False, rmsprop_alpha=0.95, rmsprop_epsilon=0.01, exploration_offset=0):

        # TODO: replay_buffer refactoring
        super(DQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy,
                                  requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

        self.param_noise = param_noise
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_eps = prioritized_replay_eps
        self.batch_size = batch_size
        self.target_network_update_freq = target_network_update_freq
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.prioritized_replay_beta0 = prioritized_replay_beta0
        self.prioritized_replay_beta_iters = prioritized_replay_beta_iters
        self.exploration_final_eps = exploration_final_eps
        self.exploration_initial_eps = exploration_initial_eps
        self.exploration_fraction = exploration_fraction
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tensorboard_log = tensorboard_log
        self.full_tensorboard_log = full_tensorboard_log
        self.double_q = double_q
        self.use_rmsprop = use_rmsprop
        self.rmsprop_alpha = rmsprop_alpha
        self.rmsprop_epsilon = rmsprop_epsilon
        self.exploration_offset = exploration_offset

        self.graph = None
        self.sess = None
        self._train_step = None
        self.step_model = None
        self.update_target = None
        self.act = None
        self.proba_step = None
        self.replay_buffer = None
        self.beta_schedule = None
        self.exploration = None
        self.params = None
        self.summary = None
        self.episode_reward = None

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        policy = self.step_model
        return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values

    def setup_model(self):

        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                if self.use_rmsprop:
                    optimizer = tf.train.RMSPropOptimizer(
                        learning_rate=self.learning_rate, decay=self.rmsprop_alpha, epsilon=self.rmsprop_epsilon,
                        centered=True
                    )
                else:
                    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = build_train(
                    q_func=partial(self.policy, **self.policy_kwargs),
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess,
                    full_tensorboard_log=self.full_tensorboard_log,
                    double_q=self.double_q
                )
                self.proba_step = self.step_model.proba_step
                self.params = tf_util.get_trainable_vars("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()

    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN",
              reset_num_timesteps=True, replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                    initial_p=self.prioritized_replay_beta0,
                                                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                              initial_p=self.exploration_initial_eps,
                                              final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1,))

            for _ in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps - self.exploration_offset)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps - self.exploration_offset) +
                                self.exploration.value(self.num_timesteps - self.exploration_offset) / float(self.env.action_space.n))
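                    # e.g. with exploration eps = 0.1 and 4 discrete actions, the target threshold is
                    # -log(1 - 0.1 + 0.1 / 4) = -log(0.925) ~= 0.078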
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, info = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
                                                                      self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                               "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(self.batch_size,
                                                               beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
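                        # uniform replay: every sampled transition gets importance weight 1
                        # and there are no priority indices to update afterwards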
                    # pytype:enable=bad-unpacking

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                                  dones, weights, sess=self.sess, options=run_options,
                                                                  run_metadata=run_metadata)
                            writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                                  dones, weights, sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
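                        # proportional prioritization (Schaul et al., 2015): p_i = |td_error_i| + eps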
                        assert isinstance(self.replay_buffer, PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                    logger.record_tabular("% time spent exploring",
                                          int(100 * self.exploration.value(self.num_timesteps - self.exploration_offset)))
                    logger.dump_tabular()

                self.num_timesteps += 1

        return episode_rewards

    def predict(self, observation, state=None, mask=None, deterministic=True):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(observation, self.observation_space)

        observation = observation.reshape((-1,) + self.observation_space.shape)
        with self.sess.as_default():
            actions, _, _ = self.step_model.step(observation, deterministic=deterministic)

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def action_probability(self, observation, state=None, mask=None, actions=None, logp=False):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(observation, self.observation_space)

        observation = observation.reshape((-1,) + self.observation_space.shape)
        actions_proba = self.proba_step(observation, state, mask)

        if actions is not None:  # comparing the action distribution, to given actions
            actions = np.array([actions])
            assert isinstance(self.action_space, gym.spaces.Discrete)
            actions = actions.reshape((-1,))
            assert observation.shape[0] == actions.shape[0], "Error: batch sizes differ for actions and observations."
            actions_proba = actions_proba[np.arange(actions.shape[0]), actions]
            # normalize action proba shape
            actions_proba = actions_proba.reshape((-1, 1))
            if logp:
                actions_proba = np.log(actions_proba)

        if not vectorized_env:
            if state is not None:
                raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
            actions_proba = actions_proba[0]

        return actions_proba

    def get_parameter_list(self):
        return self.params

    def save(self, save_path, cloudpickle=False):
        # params
        data = {
            "double_q": self.double_q,
            "param_noise": self.param_noise,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "prioritized_replay": self.prioritized_replay,
            "prioritized_replay_eps": self.prioritized_replay_eps,
            "batch_size": self.batch_size,
            "target_network_update_freq": self.target_network_update_freq,
            "prioritized_replay_alpha": self.prioritized_replay_alpha,
            "prioritized_replay_beta0": self.prioritized_replay_beta0,
            "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters,
            "exploration_final_eps": self.exploration_final_eps,
            "exploration_fraction": self.exploration_fraction,
            "learning_rate": self.learning_rate,
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "n_cpu_tf_sess": self.n_cpu_tf_sess,
            "seed": self.seed,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs
        }

        params_to_save = self.get_parameters()

        self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
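
A short usage sketch for the DQN class above (hedged: it assumes a Gym-registered discrete-action environment and the usual stable-baselines-style 'MlpPolicy' string):

import gym

env = gym.make('CartPole-v1')
model = DQN('MlpPolicy', env, exploration_fraction=0.1,
            exploration_final_eps=0.02, verbose=1)
model.learn(total_timesteps=100000, log_interval=10)
model.save("dqn_cartpole")

obs = env.reset()
action, _ = model.predict(obs, deterministic=True)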
Example #24
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert isinstance(self.action_space, gym.spaces.Box), \
                "Error: DDPG cannot output a {} action space, only spaces.Box is supported.".format(self.action_space)
            assert issubclass(self.policy, DDPGPolicy), "Error: the input policy for the DDPG model must be " \
                                                        "an instance of DDPGPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self._setup_learn(self.seed)
                # self.sess = tf_util.single_threaded_session(graph=self.graph)
                self.sess = tf_util.make_session()

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Observation normalization.
                    # if self.normalize_observations:
                    #     with tf.variable_scope('obs_rms'):
                    #         self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
                    # else:
                    #     self.obs_rms = None

                    # Return normalization.
                    # if self.normalize_returns:
                    #     with tf.variable_scope('ret_rms'):
                    #         self.ret_rms = RunningMeanStd()
                    # else:
                    #     self.ret_rms = None

                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space, 1, 1, None,
                                                 **self.policy_kwargs)

                    # Create target networks.
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space, 1, 1,
                                                     None,
                                                     **self.policy_kwargs)
                    self.obs_target = self.target_policy.obs_ph
                    self.action_target = self.target_policy.action_ph

                    # normalized_obs0 = tf.clip_by_value(normalize(self.policy_tf.processed_obs, self.obs_rms),
                    #                                    self.observation_range[0], self.observation_range[1])
                    # normalized_obs1 = tf.clip_by_value(normalize(self.target_policy.processed_obs, self.obs_rms),
                    #                                    self.observation_range[0], self.observation_range[1])

                    # Inputs.
                    self.obs_train_ph = self.policy_tf.obs_ph
                    self.action_train_ph = self.policy_tf.action_ph
                    self.terminals1 = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='terminals1')
                    self.rewards = tf.placeholder(tf.float32,
                                                  shape=(None, 1),
                                                  name='rewards')
                    self.critic_target = tf.placeholder(tf.float32,
                                                        shape=(None, 1),
                                                        name='critic_target')

                # Create networks and core TF parts that are shared across setup parts.
                with tf.variable_scope("model", reuse=False):
                    self.actor_tf = self.policy_tf.make_actor(
                        self.policy_tf.processed_obs)
                    self.critic_tf = self.policy_tf.make_critic(
                        self.policy_tf.processed_obs, self.action_train_ph)
                    self.critic_with_actor_tf = self.policy_tf.make_critic(
                        self.policy_tf.processed_obs,
                        self.actor_tf,
                        reuse=True)

                    if self.ro:

                        def tf_repeat(tensor_to_repeat, repeat_num):
                            tiled = tf.tile(tensor_to_repeat, [1, repeat_num])
                            repeated = tf.reshape(
                                tiled,
                                shape=[
                                    self.batch_size * repeat_num,
                                    tensor_to_repeat.shape[1]
                                ])
                            return repeated

                        self.augmented_obs0 = tf_repeat(
                            self.policy_tf.processed_obs, self.sample_number)
                        self.augmented_action_raw = tf_repeat(
                            self.actor_tf, self.sample_number)
                        noises = []
                        for b_index in range(self.batch_size):
                            noises.append(
                                tf.random_uniform((self.sample_number - 1, ) +
                                                  self.action_space.shape,
                                                  -0.1, 0.1))
                            noises.append(
                                tf.zeros((1, ) + self.action_space.shape))
                        noises = tf.concat(noises, axis=0)
                        self.augmented_action = self.augmented_action_raw + noises
                        self.augmented_action = tf.clip_by_value(
                            self.augmented_action, -1, 1)
                        self.augmented_critic_with_actor_tf = self.policy_tf.make_critic(
                            self.augmented_obs0,
                            self.augmented_action,
                            reuse=True)[:, 0]
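                        # Shape sketch (batch_size=B, sample_number=K, action_dim=A):
                        # augmented_obs0 is (B*K, obs_dim) and augmented_action is (B*K, A);
                        # for each state, K-1 copies of the actor's action get uniform noise in
                        # [-0.1, 0.1] and one copy stays unperturbed, so the critic scores K
                        # candidate actions per state.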

                with tf.variable_scope("target", reuse=False):
                    critic_target = \
                        self.target_policy.make_critic(self.target_policy.processed_obs,
                                                       self.target_policy.make_actor(self.target_policy.processed_obs))

                with tf.variable_scope("loss", reuse=False):
                    # self.critic_tf = denormalize(
                    #     tf.clip_by_value(self.critic_tf, self.return_range[0], self.return_range[1]),
                    #     self.ret_rms)
                    #
                    # self.critic_with_actor_tf = denormalize(
                    #     tf.clip_by_value(self.critic_with_actor_tf,
                    #                      self.return_range[0], self.return_range[1]),
                    #     self.ret_rms)
                    #
                    # q_obs1 = denormalize(critic_target, self.ret_rms)
                    self.target_q = self.rewards + (
                        1. - self.terminals1) * self.gamma * critic_target
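                    # standard Bellman backup: y = r + (1 - done) * gamma * Q_target(s', mu_target(s'))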

                    # tf.summary.scalar('critic_target', tf.reduce_mean(self.critic_target))
                    if self.full_tensorboard_log:
                        tf.summary.histogram('critic_target',
                                             self.critic_target)

                    # Set up parts.
                    self._setup_stats()
                    self._setup_target_network_updates()

                with tf.variable_scope("input_info", reuse=False):
                    self.reward_summary = tf.summary.scalar(
                        'rewards', tf.reduce_mean(self.rewards))
                    self.obs_summary = tf.summary.scalar(
                        'obs', tf.reduce_mean(self.obs_train_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('rewards', self.rewards)
                        if len(self.observation_space.shape) == 3 and \
                                self.observation_space.shape[0] in [1, 3, 4]:
                            tf.summary.image('observation', self.obs_train_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 self.obs_train_ph)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self._setup_actor_optimizer()
                    self._setup_critic_optimizer()
                    self.actor_loss_summary = tf.summary.scalar(
                        'actor_loss', self.actor_loss)
                    self.critic_loss_summary = tf.summary.scalar(
                        'critic_loss', self.critic_loss)

                self.params = tf_util.get_trainable_vars("model")

                self.target_params = tf_util.get_trainable_vars("target")
                self.obs_rms_params = [
                    var for var in tf.global_variables()
                    if "obs_rms" in var.name
                ]
                self.ret_rms_params = [
                    var for var in tf.global_variables()
                    if "ret_rms" in var.name
                ]

                with self.sess.as_default():
                    self._initialize(self.sess)
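
The tf_repeat helper above keeps the K copies of each sample adjacent rather than stacking whole batches; a quick NumPy sketch of the intended layout (an illustration only, not part of the original graph code):

import numpy as np

def np_repeat(x, k):
    # mirror tf.tile(x, [1, k]) followed by a reshape to (rows * k, cols)
    return np.reshape(np.tile(x, [1, k]), (x.shape[0] * k, x.shape[1]))

obs = np.array([[1.0, 2.0], [3.0, 4.0]])   # batch_size=2, obs_dim=2
print(np_repeat(obs, 3))
# rows: [1,2],[1,2],[1,2],[3,4],[3,4],[3,4]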
Example #25
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              initial_p=1.0):

        self.actions_weights = []
        self.actions_container = []

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        cnt = 0
        ds_rewards = [[0, 0]]
        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None
            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=initial_p,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            obs = self.env.reset()

            reset = True
            self.episode_reward = np.zeros((1, ))

            for _ in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                ''' Hierarchical Step (Start) '''

                obs, new_obs, rew, action, done, reset = self.hierarchical_step(
                    obs, ds_rewards, cnt, kwargs, update_eps)
                ''' Hierarchical Step (End) '''

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        try:
                            new_priorities = np.array([
                                abs(x) for x in td_errors.tolist()
                            ]) + self.prioritized_replay_eps
                            self.replay_buffer.update_priorities(
                                batch_idxes, new_priorities)
                        except AssertionError:
                            print(td_errors)

                if self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

                self.num_timesteps += 1
        return self, ds_rewards
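
The exploration rate used in the loop above comes from LinearSchedule; a minimal stand-alone sketch of the expected annealing behaviour (assuming the usual stable-baselines semantics, where the value interpolates linearly and is then held at final_p):

def linear_eps(step, schedule_timesteps, initial_p=1.0, final_p=0.02):
    # linear interpolation from initial_p to final_p, then constant at final_p
    fraction = min(float(step) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# e.g. exploration_fraction=0.1 with total_timesteps=100000 gives schedule_timesteps=10000:
# linear_eps(0, 10000) == 1.0, linear_eps(5000, 10000) == 0.51, linear_eps(20000, 10000) == 0.02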
Example #26
0
class MADQN(OffPolicyRLModel):
    """
    The multi-agent DQN (MADQN) model class.
    DQN paper: https://arxiv.org/abs/1312.5602
    Dueling DQN: https://arxiv.org/abs/1511.06581
    Double-Q Learning: https://arxiv.org/abs/1509.06461
    Prioritized Experience Replay: https://arxiv.org/abs/1511.05952

    :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) discount factor
    :param learning_rate: (float) learning rate for adam optimizer
    :param buffer_size: (int) size of the replay buffer
    :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is
            annealed
    :param exploration_final_eps: (float) final value of random action probability
    :param exploration_initial_eps: (float) initial value of random action probability
    :param train_freq: (int) update the model every `train_freq` steps.
    :param batch_size: (int) size of a batch sampled from the replay buffer for training
    :param double_q: (bool) Whether to enable Double-Q learning or not.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps.
    :param prioritized_replay: (bool) if True prioritized replay buffer will be used.
    :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer.
        It determines how much prioritization is used, with alpha=0 corresponding to the uniform case.
    :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
    :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial
            value to 1.0. If set to None, it defaults to total_timesteps.
    :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
    :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy.
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        WARNING: this logging can take a lot of space quickly
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations.
        If None, the number of CPUs of the current machine will be used.
    """
    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 learning_rate=5e-4,
                 buffer_size=50000,
                 exploration_fraction=0.1,
                 exploration_final_eps=0.02,
                 exploration_initial_eps=1.0,
                 train_freq=1,
                 batch_size=32,
                 double_q=True,
                 learning_starts=1000,
                 target_network_update_freq=500,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6,
                 param_noise=False,
                 n_cpu_tf_sess=None,
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False,
                 seed=None,
                 num_agents=1):  # MA-MOD

        # TODO: replay_buffer refactoring
        super(MADQN, self).__init__(policy=policy,
                                    env=env,
                                    replay_buffer=None,
                                    verbose=verbose,
                                    policy_base=DQNPolicy,
                                    requires_vec_env=False,
                                    policy_kwargs=policy_kwargs,
                                    seed=seed,
                                    n_cpu_tf_sess=n_cpu_tf_sess)
        # print("POLICY TYPE", policy)
        if self.observation_space:
            obs_sp_low = self.observation_space.low[0, :]
            obs_sp_high = self.observation_space.high[0, :]
            self.observation_space = gym.spaces.Box(low=obs_sp_low,
                                                    high=obs_sp_high)

        self.param_noise = param_noise
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_eps = prioritized_replay_eps
        self.batch_size = batch_size
        self.target_network_update_freq = target_network_update_freq
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.prioritized_replay_beta0 = prioritized_replay_beta0
        self.prioritized_replay_beta_iters = prioritized_replay_beta_iters
        self.exploration_final_eps = exploration_final_eps
        self.exploration_initial_eps = exploration_initial_eps
        self.exploration_fraction = exploration_fraction
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tensorboard_log = tensorboard_log
        self.full_tensorboard_log = full_tensorboard_log
        self.double_q = double_q
        self.num_agents = num_agents

        self.graph = None
        self.sess = None
        self._train_step = []  # MA-MOD
        self.step_model = []  # MA-MOD
        self.update_target = []  # MA-MOD
        self.act = []  # MA-MOD
        self.proba_step = []  # MA-MOD
        self.replay_buffer = None  # TODO: Possibly try a separate replay buffer per agent. If everything is symmetric, one is OK.
        # If the agents share the same value function, one buffer is fine. If they have separate functions but share one replay buffer, they learn from the same data.
        self.beta_schedule = None
        self.exploration = None
        self.params = None
        self.summary = None

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        assert False, "MAKE SURE THIS FUNCTION ISN'T CALLED"
        policy = self.step_model
        return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values

    def setup_model(self):

        with SetVerbosity(self.verbose):
            for i in range(self.num_agents):
                assert not isinstance(self.action_space, gym.spaces.Box), \
                    "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            # print(test_policy.type)
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)
                self.params = []

                print("AC SPC", self.action_space)
                for i in range(self.num_agents):
                    with tf.variable_scope("agent" + str(i)):
                        optimizer = tf.train.AdamOptimizer(
                            learning_rate=self.learning_rate)
                        act, _train_step, update_target, step_model = build_train(
                            q_func=partial(self.policy, **self.policy_kwargs),
                            ob_space=self.observation_space,
                            ac_space=self.action_space,
                            optimizer=optimizer,
                            gamma=self.gamma,
                            grad_norm_clipping=10,
                            param_noise=self.param_noise,
                            sess=self.sess,
                            full_tensorboard_log=False,  # self.full_tensorboard_log,
                            double_q=self.double_q)
                        self.act.append(act)
                        self._train_step.append(_train_step)
                        self.step_model.append(step_model)
                        self.proba_step.append(step_model.proba_step)
                        self.update_target.append(update_target)
                        self.params.extend(
                            tf_util.get_trainable_vars("agent" + str(i) +
                                                       "/deepq"))

                print(self.params)

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)  # TODO: copy this file, make two versions of the algorithm.
                for i in range(self.num_agents):
                    # TODO: Not sure; it seems best to try using each agent's own target first.
                    self.update_target[i](sess=self.sess)

                # self.summary = tf.summary.merge_all()

    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        # callback = self._init_callback(callback)

        # with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
        #         as writer:
        self._setup_learn()

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        if replay_wrapper is not None:
            assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(self.exploration_fraction *
                                   total_timesteps),
            initial_p=self.exploration_initial_eps,
            final_p=self.exploration_final_eps)

        episode_rewards = [[0.0] * self.num_agents]  #MA-MOD
        episode_successes = []

        #callback.on_training_start(locals(), globals())
        #callback.on_rollout_start()

        reset = True
        obs = self.env.reset()

        for _ in range(total_timesteps):
            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            with self.sess.as_default():
                env_action = []  # MA-MOD
                # MA-MOD. This is fine for one policy.
                for i in range(self.num_agents):
                    # TODO: Is this the correct way to get the correct agent obs?
                    action = self.act[i](np.array(obs[i])[None],
                                         update_eps=update_eps,
                                         **kwargs)[0]
                    env_action.append(action)
            reset = False
            # NOUPDATE - env.step should take a vector of actions
            new_obs, rew, done, info = self.env.step(env_action)
            '''
            Obs: x_me, x_opp --- agent 1. In env: x_1, x_2
            Obs: x_me, x_opp -- agent 2. In env: x_2, x_1
            Env: (n_agents, state_dim)
            '''
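            # e.g. with num_agents == 2, obs, rew and done are indexed per agent:
            # obs[0]/rew[0]/done[0] belong to agent 0, obs[1]/rew[1]/done[1] to agent 1.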

            self.num_timesteps += 1

            # Stop training if return value is False
            # if callback.on_step() is False:
            #    break

            # Store transition in the replay buffer.
            # Loop for replay buffer -- either separate or joined. obs[agent_index], action[agent_index], reward[agent_index]
            # Joey: Does this look right to you?
            # print(obs, action, rew, new_obs, done)
            #print("obs",obs[0])
            #print(action)
            #print("ac", action[0])
            #print("rew", rew[0])
            #print("done", done[0])
            for num_agent in range(self.num_agents):
                self.replay_buffer.add(obs[num_agent], env_action[num_agent],
                                       rew[num_agent], new_obs[num_agent],
                                       float(done[num_agent]))
            obs = new_obs

            # if writer is not None:
            #     ep_rew = np.array([rew]).reshape((1, -1))
            #     ep_done = np.array([done]).reshape((1, -1))
            #     tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
            #                                         self.num_timesteps)

            # episode_rewards is a list of per-episode lists; the last entry holds
            # the running per-agent reward for the current episode.
            for num_agent in range(self.num_agents):  # MA-MOD
                episode_rewards[-1][num_agent] += rew[num_agent]

            # Handle end of episode once, outside the per-agent loop, so a new
            # per-agent reward list is appended exactly once.
            if done.any():
                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append([0.0] * self.num_agents)
                reset = True

            # Do not train if the warmup phase is not over
            # or if there are not enough samples in the replay buffer
            can_sample = self.replay_buffer.can_sample(self.batch_size)
            if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:

                # callback.on_rollout_end()

                for i in range(self.num_agents):  # MA-MOD
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                                "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    # pytype:enable=bad-unpacking

                    # if writer is not None:
                    #     # run loss backprop with summary, but once every 100 steps save the metadata
                    #     # (memory, compute time, ...)
                    #     if (1 + self.num_timesteps) % 100 == 0:
                    #         run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    #         run_metadata = tf.RunMetadata()
                    #         summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1,
                    #                                               dones, weights, sess=self.sess, options=run_options,
                    #                                               run_metadata=run_metadata)
                    #         writer.add_run_metadata(run_metadata, 'step%d_agent%d' % (self.num_timesteps, i))
                    #     else:
                    #         summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1,
                    #                                               dones, weights, sess=self.sess)
                    #     writer.add_summary(summary, self.num_timesteps)
                    # else:
                    td_errors = self._train_step[i](obses_t,
                                                    actions,
                                                    rewards,
                                                    obses_tp1,
                                                    obses_tp1,
                                                    dones,
                                                    weights,
                                                    sess=self.sess)
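                    # (obses_tp1 is passed twice on purpose, mirroring the
                    # single-agent DQN train op: the next observation feeds both
                    # the target network and the online network used for double-Q
                    # action selection.)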

                if self.prioritized_replay:  # NOUPDATE - not inside main agent for loop
                    new_priorities = np.abs(
                        td_errors) + self.prioritized_replay_eps  # NOUPDATE
                    assert isinstance(self.replay_buffer,
                                      PrioritizedReplayBuffer)
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)

                # callback.on_rollout_start()

            if can_sample and self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                for i in range(self.num_agents):
                    self.update_target[i](sess=self.sess)  # MA-MOD

            if len(episode_rewards[-101:-1]) == 0:  # MA-MOD
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(
                    float(np.mean(episode_rewards[-101:-1])), 1)  #MA-MOD

            # below is what's logged in terminal.
            num_episodes = len(episode_rewards)  #MA-MOD
            if self.verbose >= 1 and done.any() and log_interval is not None \
                    and len(episode_rewards) % log_interval == 0:  # MA-MOD
                logger.record_tabular("steps", self.num_timesteps)
                logger.record_tabular("episodes", num_episodes)
                if len(episode_successes) > 0:
                    logger.logkv("success rate",
                                 np.mean(episode_successes[-100:]))
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular(
                    "% time spent exploring",
                    int(100 * self.exploration.value(self.num_timesteps)))
                logger.dump_tabular()

        return self

    def predict(
            self,
            observation,
            agent_idx,
            state=None,
            mask=None,
            deterministic=True):  # MA-MOD - added `agent_idx` as a parameter
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        with self.sess.as_default():
            actions, _, _ = self.step_model[agent_idx].step(
                observation, deterministic=deterministic)

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    # This method is never called in this codebase, so it is left as a stub.
    def action_probability(self,
                           observation,
                           state=None,
                           mask=None,
                           actions=None,
                           logp=False):
        print("Should not be called")
        return None
        '''
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(observation, self.observation_space)

        observation = observation.reshape((-1,) + self.observation_space.shape)
        actions_proba = self.proba_step(observation, state, mask)

        if actions is not None:  # comparing the action distribution, to given actions
            actions = np.array([actions])
            assert isinstance(self.action_space, gym.spaces.Discrete)
            actions = actions.reshape((-1,))
            assert observation.shape[0] == actions.shape[0], "Error: batch sizes differ for actions and observations."
            actions_proba = actions_proba[np.arange(actions.shape[0]), actions]
            # normalize action proba shape
            actions_proba = actions_proba.reshape((-1, 1))
            if logp:
                actions_proba = np.log(actions_proba)

        if not vectorized_env:
            if state is not None:
                raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
            actions_proba = actions_proba[0]

        return actions_proba
        '''

    def get_parameter_list(self):
        print(self.params)
        return self.params

    def save(self, save_path, cloudpickle=False):
        # params
        data = {
            "double_q": self.double_q,
            "param_noise": self.param_noise,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "prioritized_replay": self.prioritized_replay,
            "prioritized_replay_eps": self.prioritized_replay_eps,
            "batch_size": self.batch_size,
            "target_network_update_freq": self.target_network_update_freq,
            "prioritized_replay_alpha": self.prioritized_replay_alpha,
            "prioritized_replay_beta0": self.prioritized_replay_beta0,
            "prioritized_replay_beta_iters":
            self.prioritized_replay_beta_iters,
            "exploration_final_eps": self.exploration_final_eps,
            "exploration_fraction": self.exploration_fraction,
            "learning_rate": self.learning_rate,
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "n_cpu_tf_sess": self.n_cpu_tf_sess,
            "seed": self.seed,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs,
            "num_agents": self.num_agents
        }

        params_to_save = self.get_parameters()
        # print(params_to_save)

        self._save_to_file(save_path,
                           data=data,
                           params=params_to_save,
                           cloudpickle=cloudpickle)
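# A minimal usage sketch for the multi-agent DQN variant above. The class name
# `MADQN`, the policy `MlpPolicy` and `make_multiagent_env()` are placeholders,
# and the constructor arguments are assumptions based on the attributes used in
# the methods above, not names defined in this snippet:
#
#   env = make_multiagent_env()          # must return per-agent obs/rew/done
#   model = MADQN(MlpPolicy, env, num_agents=2, prioritized_replay=False)
#   model.learn(total_timesteps=100000, log_interval=100)
#   obs = env.reset()
#   actions = [model.predict(obs[i], agent_idx=i)[0] for i in range(2)]
#   model.save("madqn_checkpoint")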
Example #27
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN"):
        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None
            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))

            for step in range(total_timesteps):
                if callback is not None:
                    callback(locals(), globals())
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(step)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(step) +
                                self.exploration.value(step) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = \
                        update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, _ = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer, step)

                episode_rewards[-1] += rew
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                if step > self.learning_starts and step % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(step))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
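                        # (with uniform replay every transition gets weight 1.0 and
                        # batch_idxes stays None, so no priority update happens below.)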

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + step) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(run_metadata,
                                                    'step%d' % step)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, step)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if step > self.learning_starts and step % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", step)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(step)))
                    logger.dump_tabular()

        return self
Example #29
0
class SAC(OffPolicyRLModel):
    """
    Soft Actor-Critic (SAC)
    Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor,
    This implementation borrows code from original implementation (https://github.com/haarnoja/sac)
    from OpenAI Spinning Up (https://github.com/openai/spinningup) and from the Softlearning repo
    (https://github.com/rail-berkeley/softlearning/)
    Paper: https://arxiv.org/abs/1801.01290
    Introduction to SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html

    :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount factor
    :param learning_rate: (float or callable) learning rate for adam optimizer,
        the same learning rate will be used for all networks (Q-Values, Actor and Value function)
        it can be a function of the current progress (from 1 to 0)
    :param buffer_size: (int) size of the replay buffer
    :param batch_size: (int) Minibatch size for each gradient update
    :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1)
    :param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to
        inverse of reward scale in the original SAC paper.)  Controlling exploration/exploitation trade-off.
        Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value)
    :param train_freq: (int) Update the model every `train_freq` steps.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_update_interval: (int) update the target network every `target_update_interval` steps.
    :param gradient_steps: (int) How many gradient updates to perform after each rollout step
    :param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto')
    :param action_noise: (ActionNoise) the action noise type (None by default), this can help
        for hard exploration problem. Cf DDPG for the different action noise type.
    :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy)
        This is not needed for SAC normally but can help exploring when using HER + SAC.
        This hack was present in the original OpenAI Baselines repo (DDPG + HER)
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        Note: this has no effect on SAC logging for now
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations
        If None, the number of cpu of the current machine will be used.
    """
    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 learning_rate=3e-4,
                 buffer_size=50000,
                 learning_starts=100,
                 train_freq=1,
                 batch_size=64,
                 tau=0.005,
                 ent_coef='auto',
                 target_update_interval=1,
                 gradient_steps=1,
                 target_entropy='auto',
                 action_noise=None,
                 random_exploration=0.0,
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False,
                 seed=None,
                 n_cpu_tf_sess=None):

        super(SAC, self).__init__(policy=policy,
                                  env=env,
                                  replay_buffer=None,
                                  verbose=verbose,
                                  policy_base=SACPolicy,
                                  requires_vec_env=False,
                                  policy_kwargs=policy_kwargs,
                                  seed=seed,
                                  n_cpu_tf_sess=n_cpu_tf_sess)

        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.tau = tau
        # In the original paper, same learning rate is used for all networks
        # self.policy_lr = learning_rate
        # self.qf_lr = learning_rate
        # self.vf_lr = learning_rate
        # Entropy coefficient / Entropy temperature
        # Inverse of the reward scale
        self.ent_coef = ent_coef
        self.target_update_interval = target_update_interval
        self.gradient_steps = gradient_steps
        self.gamma = gamma
        self.action_noise = action_noise
        self.random_exploration = random_exploration

        self.value_fn = None
        self.graph = None
        self.replay_buffer = None
        self.episode_reward = None
        self.sess = None
        self.tensorboard_log = tensorboard_log
        self.verbose = verbose
        self.params = None
        self.summary = None
        self.policy_tf = None
        self.target_entropy = target_entropy
        self.full_tensorboard_log = full_tensorboard_log

        self.obs_target = None
        self.target_policy = None
        self.actions_ph = None
        self.rewards_ph = None
        self.terminals_ph = None
        self.observations_ph = None
        self.action_target = None
        self.next_observations_ph = None
        self.value_target = None
        self.step_ops = None
        self.target_update_op = None
        self.infos_names = None
        self.entropy = None
        self.target_params = None
        self.learning_rate_ph = None
        self.processed_obs_ph = None
        self.processed_next_obs_ph = None
        self.log_ent_coef = None

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        policy = self.policy_tf
        # Rescale
        deterministic_action = unscale_action(self.action_space,
                                              self.deterministic_action)
        return policy.obs_ph, self.actions_ph, deterministic_action

    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space,
                                                     **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32,
                                                     shape=(None, ) +
                                                     self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        self.actions_ph,
                        create_qf=True,
                        create_vf=True)  # Q(s,a)
                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        policy_out,
                        create_qf=True,
                        create_vf=False,
                        reuse=True)  # Q(s, pi(a|s))

                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == 'auto':
                        # automatically set target entropy if needed
                        self.target_entropy = -np.prod(
                            self.env.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)
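                    # e.g. for a Box action space of shape (2,), the automatic
                    # target entropy above evaluates to -2.0 (minus the number of
                    # action dimensions).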

                    # The entropy coefficient or entropy can be learned automatically
                    # see Automating Entropy Adjustment for Maximum Entropy RL section
                    # of https://arxiv.org/abs/1812.05905
                    if isinstance(self.ent_coef,
                                  str) and self.ent_coef.startswith('auto'):
                        # Default initial value of ent_coef when learned
                        init_value = 1.0
                        if '_' in self.ent_coef:
                            init_value = float(self.ent_coef.split('_')[1])
                            assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                        self.log_ent_coef = tf.get_variable(
                            'log_ent_coef',
                            dtype=tf.float32,
                            initializer=np.log(init_value).astype(np.float32))
                        self.ent_coef = tf.exp(self.log_ent_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.ent_coef = float(self.ent_coef)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(
                        self.processed_next_obs_ph,
                        create_qf=False,
                        create_vf=True)
                    self.value_target = value_target

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Target for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * self.value_target)

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2)
                    qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2)

                    # Compute the entropy temperature loss
                    # it is used when the entropy coefficient is learned
                    ent_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.ent_coef, float):
                        ent_coef_loss = -tf.reduce_mean(
                            self.log_ent_coef *
                            tf.stop_gradient(logp_pi + self.target_entropy))
                        entropy_optimizer = tf.train.AdamOptimizer(
                            learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi -
                                                    qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the Gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # Target for value fn regression
                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    v_backup = tf.stop_gradient(min_qf_pi -
                                                self.ent_coef * logp_pi)
                    value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2)

                    values_losses = qf1_loss + qf2_loss + value_loss
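                    # In equation form, the targets wired up above are:
                    #   q_backup    = r + gamma * (1 - done) * V_target(s')
                    #   v_backup    = min(Q1, Q2)(s, a~pi) - ent_coef * log pi(a|s)
                    #   policy loss = E[ent_coef * log pi(a|s) - Q1(s, a~pi)]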

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss, var_list=get_vars('model/pi'))

                    # Value train op
                    value_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    values_params = get_vars('model/values_fn')

                    source_params = get_vars("model/values_fn/vf")
                    target_params = get_vars("target/values_fn/vf")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(
                            values_losses, var_list=values_params)

                        self.infos_names = [
                            'policy_loss', 'qf1_loss', 'qf2_loss',
                            'value_loss', 'entropy'
                        ]
                        # All ops to call during one training step
                        self.step_ops = [
                            policy_loss, qf1_loss, qf2_loss, value_loss, qf1,
                            qf2, value_fn, logp_pi, self.entropy,
                            policy_train_op, train_values_op
                        ]

                        # Add entropy coefficient optimization operation if needed
                        if ent_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                ent_coef_op = entropy_optimizer.minimize(
                                    ent_coef_loss, var_list=self.log_ent_coef)
                                self.infos_names += [
                                    'ent_coef_loss', 'ent_coef'
                                ]
                                self.step_ops += [
                                    ent_coef_op, ent_coef_loss, self.ent_coef
                                ]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('value_loss', value_loss)
                    tf.summary.scalar('entropy', self.entropy)
                    if ent_coef_loss is not None:
                        tf.summary.scalar('ent_coef_loss', ent_coef_loss)
                        tf.summary.scalar('ent_coef', self.ent_coef)

                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = get_vars("model")
                self.target_params = get_vars("target/values_fn/vf")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()

    def _train_step(self, step, writer, learning_rate):
        # Sample a batch from the replay buffer
        batch = self.replay_buffer.sample(self.batch_size)
        batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch

        feed_dict = {
            self.observations_ph: batch_obs,
            self.actions_ph: batch_actions,
            self.next_observations_ph: batch_next_obs,
            self.rewards_ph: batch_rewards.reshape(self.batch_size, -1),
            self.terminals_ph: batch_dones.reshape(self.batch_size, -1),
            self.learning_rate_ph: learning_rate
        }

        # out  = [policy_loss, qf1_loss, qf2_loss,
        #         value_loss, qf1, qf2, value_fn, logp_pi,
        #         self.entropy, policy_train_op, train_values_op]

        # Do one gradient step
        # and optionally compute log for tensorboard
        if writer is not None:
            out = self.sess.run([self.summary] + self.step_ops, feed_dict)
            summary = out.pop(0)
            writer.add_summary(summary, step)
        else:
            out = self.sess.run(self.step_ops, feed_dict)

        # Unpack to monitor losses and entropy
        policy_loss, qf1_loss, qf2_loss, value_loss, *values = out
        # qf1, qf2, value_fn, logp_pi, entropy, *_ = values
        entropy = values[4]

        if self.log_ent_coef is not None:
            ent_coef_loss, ent_coef = values[-2:]
            return policy_loss, qf1_loss, qf2_loss, value_loss, entropy, ent_coef_loss, ent_coef

        return policy_loss, qf1_loss, qf2_loss, value_loss, entropy

    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=4,
              tb_log_name="SAC",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if self.num_timesteps < self.learning_starts \
                        or np.random.rand() < self.random_exploration:
                    # actions sampled from action space are from range specific to the environment
                    # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                    unscaled_action = self.env.action_space.sample()
                    action = scale_action(self.action_space, unscaled_action)
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # inferred actions need to be transformed to environment action_space before stepping
                    unscaled_action = unscale_action(self.action_space, action)
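                    # e.g. for a Box(low=-2, high=2) action space, the tanh-squashed
                    # action in [-1, 1] is mapped back to [-2, 2] by unscale_action
                    # before being passed to env.step().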

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(unscaled_action)

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                           or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step + grad_step) % self.target_update_interval == 0:
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful to monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

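                # Mean reward over the last 100 completed episodes;
                # the slice [-101:-1] excludes the episode still in progress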
                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if (self.verbose >= 1 and done and log_interval is not None
                        and len(episode_rewards) % log_interval == 0):
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv("episode reward", episode_rewards[-2])
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self

    def action_probability(self,
                           observation,
                           state=None,
                           mask=None,
                           actions=None,
                           logp=False):
        if actions is not None:
            raise ValueError("Error: SAC does not have action probabilities.")

        warnings.warn(
            "Even though SAC has a Gaussian policy, it cannot return a distribution as it "
            "is squashed by a tanh before being scaled and output.")

        return None

    def predict(self, observation, state=None, mask=None, deterministic=True):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        actions = self.policy_tf.step(observation, deterministic=deterministic)
        # reshape to the correct action shape
        actions = actions.reshape((-1,) + self.action_space.shape)
        # rescale the output from [-1, 1] to the action space bounds
        actions = unscale_action(self.action_space, actions)

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def get_parameter_list(self):
        return (self.params + self.target_params)

    def save(self, save_path, cloudpickle=False):
        data = {
            "learning_rate": self.learning_rate,
            "buffer_size": self.buffer_size,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "batch_size": self.batch_size,
            "tau": self.tau,
            "ent_coef":
            self.ent_coef if isinstance(self.ent_coef, float) else 'auto',
            "target_entropy": self.target_entropy,
            # Should we also store the replay buffer?
            # this may lead to high memory usage
            # with all transition inside
            # "replay_buffer": self.replay_buffer
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "n_cpu_tf_sess": self.n_cpu_tf_sess,
            "seed": self.seed,
            "action_noise": self.action_noise,
            "random_exploration": self.random_exploration,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs
        }

        params_to_save = self.get_parameters()

        self._save_to_file(save_path,
                           data=data,
                           params=params_to_save,
                           cloudpickle=cloudpickle)
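As a usage note, the learn/save/predict methods above are normally driven through the stable-baselines SAC wrapper rather than called directly. The sketch below shows that flow under stated assumptions: the environment name and hyperparameter values are illustrative, not taken from the example above.

import gym

from stable_baselines import SAC
from stable_baselines.sac.policies import MlpPolicy

# SAC needs a continuous (Box) action space
env = gym.make("Pendulum-v0")

# random_exploration and learning_starts control the warmup branch in learn()
model = SAC(MlpPolicy, env, verbose=1, buffer_size=50000,
            learning_starts=100, random_exploration=0.0)
model.learn(total_timesteps=10000, log_interval=10)

# save() stores the hyperparameters listed in the data dict plus the weights
model.save("sac_pendulum")

# predict() returns (action, state); state is None for this feed-forward policy
obs = env.reset()
action, _ = model.predict(obs, deterministic=True)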
Example #30
0
def main(args):
    """
    Train a DQN agent on cartpole env
    :param args: (Parsed Arguments) the input arguments
    """
    with tf_utils.make_session(8) as sess:
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, _ = deepq.build_train(
            q_func=CustomPolicy,
            ob_space=env.observation_space,
            ac_space=env.action_space,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            sess=sess)
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000,
                                     initial_p=1.0,
                                     final_p=0.02)

        # Initialize the parameters and copy them to the target network.
        tf_utils.initialize()
        update_target()

        episode_rewards = [0.0]
        obs = env.reset()
        for step in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(step))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)

            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(
                    float(np.mean(episode_rewards[-101:-1])), 1)

            is_solved = step > 100 and mean_100ep_reward >= 200

            if args.no_render and step > args.max_timesteps:
                break

            if is_solved:
                if args.no_render:
                    break
                # Show off the result
                env.render()
            else:
                # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
                if step > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = \
                        replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones,
                          np.ones_like(rewards))
                # Update target network periodically.
                if step % 1000 == 0:
                    update_target()

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(step)))
                logger.dump_tabular()
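All of the examples above revolve around the same ReplayBuffer interface (add, can_sample, sample), so a self-contained round trip is sketched below; the transition shapes and buffer size are illustrative assumptions.

import numpy as np

from stable_baselines.deepq.replay_buffer import ReplayBuffer

buffer = ReplayBuffer(size=1000)

# Fill the buffer with dummy (obs_t, action, reward, obs_tp1, done) transitions
for _ in range(200):
    obs_t = np.random.rand(4)
    action = np.random.randint(2)
    reward = np.random.rand()
    obs_tp1 = np.random.rand(4)
    done = float(np.random.randint(2))
    buffer.add(obs_t, action, reward, obs_tp1, done)

# sample() returns one stacked array per field, each with a batch-size leading axis
if buffer.can_sample(32):
    obses_t, actions, rewards, obses_tp1, dones = buffer.sample(32)
    assert obses_t.shape == (32, 4)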