# Assumes `import numpy as np` and a stable-baselines-style ReplayBuffer
# (e.g. stable_baselines.common.buffers.ReplayBuffer; the exact import path depends on the version).
def test_extend_uniform():
    nvals = 16
    states = [np.random.rand(2, 2) for _ in range(nvals)]
    actions = [np.random.rand(2) for _ in range(nvals)]
    rewards = [np.random.rand() for _ in range(nvals)]
    newstate = [np.random.rand(2, 2) for _ in range(nvals)]
    done = [np.random.randint(0, 2) for _ in range(nvals)]

    size = 32
    baseline = ReplayBuffer(size)
    ext = ReplayBuffer(size)
    for data in zip(states, actions, rewards, newstate, done):
        baseline.add(*data)

    states, actions, rewards, newstates, done = map(
        np.array, [states, actions, rewards, newstate, done])

    ext.extend(states, actions, rewards, newstates, done)
    assert len(baseline) == len(ext)

    # Check buffers have same values
    for i in range(nvals):
        for j in range(5):
            condition = (baseline.storage[i][j] == ext.storage[i][j])
            if isinstance(condition, np.ndarray):
                # for obs, obs_t1
                assert np.all(condition)
            else:
                # for done, reward, action
                assert condition
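
For reference: every example on this page exercises the same small buffer interface (add, extend, sample, can_sample, storage, __len__). Below is a minimal, self-contained sketch of that interface with stable-baselines-style circular-overwrite semantics; it is a reading aid, not the library's implementation.

import random

import numpy as np


class MinimalReplayBuffer:
    """Sketch of the ReplayBuffer interface used in these examples (illustrative only)."""

    def __init__(self, size):
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

    def __len__(self):
        return len(self._storage)

    @property
    def storage(self):
        return self._storage

    def can_sample(self, n_samples):
        return len(self._storage) >= n_samples

    def add(self, obs_t, action, reward, obs_tp1, done):
        data = (obs_t, action, reward, obs_tp1, done)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)            # buffer not yet full
        else:
            self._storage[self._next_idx] = data  # overwrite the oldest slot
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def extend(self, obs_t, action, reward, obs_tp1, done):
        # Add a batch of transitions, one by one, in order
        for data in zip(obs_t, action, reward, obs_tp1, done):
            self.add(*data)

    def sample(self, batch_size):
        idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
        obs_t, actions, rewards, obs_tp1, dones = zip(*(self._storage[i] for i in idxes))
        return (np.array(obs_t), np.array(actions), np.array(rewards),
                np.array(obs_tp1), np.array(dones))
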
Example #2
    def fill_expert_buffer(self, num_trajectories=500, max_expert_steps=300):
        self.pretrained = True

        buffer_size = int(1e6)
        self.demo_buffer = ReplayBuffer(buffer_size)

        obs = self.env.reset()
        i = 0

        last_obs = None
        env_steps = 0

        while i < num_trajectories:
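            # Query the scripted expert for an action on the current observation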
            a = self.env.env.expert(self.env.convert_obs_to_dict(obs))

            last_obs = deepcopy(obs)
            obs, reward, done, info = self.env.step(a)
            env_steps += 1
            if last_obs is not None:
                self.demo_buffer.add(last_obs, a, reward, obs, done)

            if info['is_success'] or env_steps > max_expert_steps:
                env_steps = 0
                last_obs = None
                obs = self.env.reset()
                i += 1
    def initializeExpertBuffer(self, obs, act):
        for i in range(len(self._models)):
            self._models[i].expert_buffer = ReplayBuffer(
                sum([len(elem) for elem in obs])
            )
            for j in range(len(obs)):
                self._models[i]._initializeExpertBuffer(obs[j], act[j][:, i])
Example #4
    def _setup_model(self):
        self._setup_learning_rate()
        obs_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0]
        self.set_random_seed(self.seed)
        self.replay_buffer = ReplayBuffer(self.buffer_size, obs_dim, action_dim)
        self.policy = self.policy_class(self.observation_space, self.action_space,
                                        self.learning_rate, **self.policy_kwargs)
        self._create_aliases()
    def initializeExpertBuffer(self, ar_list_obs, ar_list_act):
        """
        Initialize the expert buffer
        """
        self.expert_buffer = ReplayBuffer(
            sum([len(elem) for elem in ar_list_act]))

        for i in range(0, len(ar_list_act)):
            self._initializeExpertBuffer(ar_list_obs[i], ar_list_act[i])
Example #6
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy_tf = self.policy(
                        self.sess, self.observation_space, self.action_space,
                        **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy_tf.obs_ph
                    self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                    self.action_target = self.target_policy_tf.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32,
                                                     shape=(None, ) +
                                                     self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")
                    self.risk_factor_ph = tf.placeholder(tf.float32, [],
                                                         name='risk_factor_ph')

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    self.policy_out = policy_out = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    #double_policy = self.policy_tf.make_actor(self.processed_next_obs_ph,reuse=True)
                    # Use two Q-functions to improve performance by reducing overestimation bias
                    if self.model_type == "QR":
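                        # QR: fixed quantile fractions tau_i = (i + 0.5) / n_support, tiled across the batch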
                        self.qrtau = tf.tile(
                            tf.reshape(
                                tf.range(0.5 / self.n_support, 1,
                                         1 / self.n_support),
                                [1, self.n_support]),
                            [tf.shape(self.processed_obs_ph)[0], 1])
                        qf1, qf2 = self.policy_tf.make_critics(
                            self.processed_obs_ph,
                            self.actions_ph,
                            n_support=self.n_support)
                        # Q value when following the current policy
                        qrtau_pi = self.qrtau
                        qf1_pi, _ = self.policy_tf.make_critics(
                            self.processed_obs_ph,
                            policy_out,
                            reuse=True,
                            n_support=self.n_support)
                    elif self.model_type == "IQN":
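                        # IQN: quantile fractions sampled uniformly in [tau_clamp, 1 - tau_clamp] each forward pass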
                        self.qrtau = tf.random_uniform([
                            tf.shape(self.processed_obs_ph)[0], self.n_support
                        ],
                                                       minval=self.tau_clamp,
                                                       maxval=1.0 -
                                                       self.tau_clamp)
                        qf1, qf2 = self.policy_tf.make_critics(
                            self.processed_obs_ph,
                            self.actions_ph,
                            model_type=self.model_type,
                            iqn_tau=self.qrtau,
                            n_support=self.n_support)
                        # Q value when following the current policy
                        qrtau_pi = tf.random_uniform([
                            tf.shape(self.processed_obs_ph)[0], self.n_support
                        ],
                                                     minval=self.tau_clamp,
                                                     maxval=1.0 -
                                                     self.tau_clamp)
                        qf1_pi, _ = self.policy_tf.make_critics(
                            self.processed_obs_ph,
                            policy_out,
                            model_type=self.model_type,
                            iqn_tau=qrtau_pi,
                            reuse=True,
                            n_support=self.n_support)

                with tf.variable_scope("target", reuse=False):
                    # Create target networks
                    target_policy_out = self.target_policy_tf.make_actor(
                        self.processed_next_obs_ph)
                    # Target policy smoothing, by adding clipped noise to target actions
                    target_noise = tf.random_normal(
                        tf.shape(target_policy_out),
                        stddev=self.target_policy_noise)
                    target_noise = tf.clip_by_value(target_noise,
                                                    -self.target_noise_clip,
                                                    self.target_noise_clip)
                    # Clip the noisy action to stay strictly inside the tanh output range, here [-1 + 1e-2, 1 - 1e-2]
                    noisy_target_action = tf.clip_by_value(
                        target_policy_out + target_noise, -1 + 1e-2, 1 - 1e-2)
                    # Q values when following the target policy
                    if self.model_type == "QR":
                        target_qrtau = self.qrtau
                        qf1_target, qf2_target = self.target_policy_tf.make_critics(
                            self.processed_next_obs_ph,
                            noisy_target_action,
                            n_support=self.n_support)
                    elif self.model_type == "IQN":
                        target_qrtau = tf.random_uniform([
                            tf.shape(self.processed_next_obs_ph)[0],
                            self.n_support
                        ],
                                                         minval=self.tau_clamp,
                                                         maxval=1.0 -
                                                         self.tau_clamp)
                        qf1_target, qf2_target = self.target_policy_tf.make_critics(
                            self.processed_next_obs_ph,
                            noisy_target_action,
                            model_type=self.model_type,
                            iqn_tau=target_qrtau,
                            n_support=self.n_support)

                with tf.variable_scope("loss", reuse=False):

                    quantile_weight = 1.0 - self.risk_factor_ph * (
                        2.0 * qrtau_pi - 1.0)
                    min_quantile = tf.reduce_mean(qf1_pi[:, 0])
                    max_quantile = tf.reduce_mean(qf1_pi[:, -1])

                    #min_qf_target = tf.minimum(qf1_target, qf2_target)
                    #max_arg = tf.argmax(target_qrtau,axis=-1)
                    qf1_t_flag = qf1_target[:, -1]
                    qf2_t_flag = qf2_target[:, -1]
                    #qf1_t_flag = qf1_target[:,max_arg]
                    #qf2_t_flag = qf2_target[:,max_arg]
                    #qf1_t_flag = tf.reduce_max(qf1_target,axis=-1)
                    #qf2_t_flag = tf.reduce_max(qf2_target,axis=-1)
                    #min_flag = qf1_t_flag > qf2_t_flag
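                    # Per sample, keep the full quantile vector of whichever critic has the smaller top quantile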
                    min_flag = qf1_t_flag < qf2_t_flag
                    min_qf_target = tf.where(min_flag, qf1_target, qf2_target)

                    # Targets for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1.0 - self.terminals_ph) *
                                                self.gamma * min_qf_target)

                    # Compute Q-Function loss
                    qrtau = tf.tile(tf.expand_dims(self.qrtau, axis=2),
                                    [1, 1, self.n_support])
                    #qrtau = tf.tile(tf.expand_dims(self.qrtau, axis=1), [1, self.n_support, 1])
                    #mulmax = 2.0
                    logit_valid_tile = tf.tile(
                        tf.expand_dims(q_backup, axis=1),
                        [1, self.n_support, 1])

                    theta_loss_tile = tf.tile(tf.expand_dims(qf1, axis=2),
                                              [1, 1, self.n_support])
                    Huber_loss = tf.compat.v1.losses.huber_loss(
                        logit_valid_tile,
                        theta_loss_tile,
                        reduction=tf.losses.Reduction.NONE,
                        delta=self.kappa) / self.kappa
                    bellman_errors = logit_valid_tile - theta_loss_tile
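                    # Quantile Huber loss: |tau - 1{u < 0}| * Huber_kappa(u) / kappa, with u the Bellman error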
                    Loss = tf.abs(qrtau - tf.stop_gradient(
                        tf.to_float(bellman_errors < 0))) * Huber_loss
                    qf1_losses = tf.reduce_mean(tf.reduce_sum(Loss, axis=1),
                                                axis=1)
                    #qf1_gmul = qf1_losses - tf.reduce_min(qf1_losses)
                    #qf1_gmul = 1.0 + mulmax*qf1_gmul/tf.reduce_max(qf1_gmul)#(1.0 - mulmax) + 2*mulmax*qf1_gmul/tf.reduce_max(qf1_gmul)
                    qf1_loss = tf.reduce_mean(qf1_losses)

                    theta_loss_tile = tf.tile(tf.expand_dims(qf2, axis=2),
                                              [1, 1, self.n_support])
                    Huber_loss = tf.compat.v1.losses.huber_loss(
                        logit_valid_tile,
                        theta_loss_tile,
                        reduction=tf.losses.Reduction.NONE,
                        delta=self.kappa) / self.kappa
                    bellman_errors = logit_valid_tile - theta_loss_tile
                    Loss = tf.abs(qrtau - tf.stop_gradient(
                        tf.to_float(bellman_errors < 0))) * Huber_loss
                    qf2_losses = tf.reduce_mean(tf.reduce_sum(Loss, axis=1),
                                                axis=1)
                    #qf2_gmul = qf2_losses - tf.reduce_min(qf2_losses)
                    #qf2_gmul = 1.0 + mulmax*qf2_gmul/tf.reduce_max(qf2_gmul)#(1.0 - mulmax) + 2*mulmax*qf2_gmul/tf.reduce_max(qf2_gmul)
                    qf2_loss = tf.reduce_mean(qf2_losses)
                    qvalues_losses = qf1_loss + qf2_loss

                    # Policy loss: maximise q value
                    self.policy_loss = policy_loss = -tf.reduce_mean(
                        tf.multiply(qf1_pi,
                                    quantile_weight))  # + policy_update_ratio
                    # Q Values optimizer
                    #qvalues_optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph)
                    qvalues_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    #qvalues_optimizer = tf.contrib.opt.NadamOptimizer(learning_rate=self.learning_rate_ph)
                    qvalues_params = tf_util.get_trainable_vars(
                        'model/values_fn/')

                    # Q Values and policy target params
                    source_params = tf_util.get_trainable_vars("model/")
                    target_params = tf_util.get_trainable_vars("target/")

                    # Policy train op
                    # will be called only every n training steps,
                    # where n is the policy delay
                    #policy_optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph)
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    #policy_optimizer = tf.contrib.opt.NadamOptimizer(learning_rate=self.learning_rate_ph)

                    # Initializing target to match source variables
                    self.target_init_op = tf.group([
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ])

                    train_values_op = qvalues_optimizer.minimize(
                        qvalues_losses, var_list=qvalues_params)
                    #grad_values = tf.gradients(qvalues_losses, qvalues_params)
                    #grad_values = list(zip(grad_values, qvalues_params))

                    with tf.control_dependencies([train_values_op]):
                        policy_train_op = policy_optimizer.minimize(
                            policy_loss,
                            var_list=tf_util.get_trainable_vars('model/pi'))
                        #grad_policy = tf.gradients(policy_loss, tf_util.get_trainable_vars('model/pi'))
                        #grad_policy = list(zip(grad_policy, tf_util.get_trainable_vars('model/pi')))
                        self.policy_train_op = policy_train_op

                    with tf.control_dependencies([self.policy_train_op]):
                        # Polyak averaging for target variables
                        self.target_ops = tf.group([
                            tf.assign(target, (1.0 - self.tau) * target +
                                      self.tau * source) for target, source in
                            zip(target_params, source_params)
                        ])

                    self.infos_names = ['qf1_loss', 'qf2_loss']
                    # All ops to call during one training step
                    self.step_ops = [
                        qf1_loss, qf2_loss, qf1, qf2, train_values_op
                    ]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('min_quantile', min_quantile)
                    tf.summary.scalar('max_quantile', max_quantile)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    '''
                    for grad, var in grad_values + grad_policy:
                        tf.summary.histogram(var.name, var)
                        tf.summary.histogram(var.name + '/gradient', grad)
                    '''

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars("target/")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(self.target_init_op)

                self.summary = tf.summary.merge_all()
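
The qf1/qf2 losses assembled in the "loss" scope above implement a quantile Huber (quantile-regression) loss. As a reading aid, here is a NumPy restatement of the same tiling and reductions; quantile_huber_loss and its argument names are illustrative, not part of the original module.

import numpy as np


def quantile_huber_loss(quantiles, targets, taus, kappa=1.0):
    """quantiles, targets, taus: arrays of shape (batch, n_support).

    Mirrors the graph above: Huber(target_j - theta_i) weighted by
    |tau_i - 1{target_j - theta_i < 0}|, summed over predicted quantiles i,
    averaged over target samples j and over the batch.
    """
    theta = quantiles[:, :, None]   # predicted quantile theta_i, varies along axis 1
    target = targets[:, None, :]    # target sample z_j, varies along axis 2
    tau = taus[:, :, None]          # quantile fraction tau_i, paired with theta_i

    u = target - theta              # Bellman errors
    abs_u = np.abs(u)
    huber = np.where(abs_u <= kappa, 0.5 * u ** 2, kappa * (abs_u - 0.5 * kappa)) / kappa
    loss = np.abs(tau - (u < 0).astype(np.float64)) * huber
    return np.mean(np.mean(np.sum(loss, axis=1), axis=1))
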
    def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="BDQ",
              reset_num_timesteps=True, replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                    initial_p=self.prioritized_replay_beta0,
                                                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)


            if self.epsilon_greedy:
                approximate_num_iters = 2e6 / 4
                # TODO Decide which schedule type to use
                # self.exploration = PiecewiseSchedule([(0, 1.0),
                #                                 (approximate_num_iters / 50, 0.1), 
                #                                 (approximate_num_iters / 5, 0.01) 
                #                                 ], outside_value=0.01)
                self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                                  initial_p=self.exploration_initial_eps,
                                                  final_p=self.exploration_final_eps)
            else:
                self.exploration = ConstantSchedule(value=0.0) # greedy policy
                std_schedule = LinearSchedule(schedule_timesteps=self.timesteps_std,
                                              initial_p=self.initial_std,
                                              final_p=self.final_std)

            episode_rewards = [0.0]
            episode_successes = []

            callback.on_training_start(locals(), globals())
            callback.on_rollout_start()

            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1,))
            # Retrieve unnormalized observation for saving into the buffer
            if self._vec_normalize_env is not None:
                obs_ = self._vec_normalize_env.get_original_obs().squeeze()

            for _ in range(total_timesteps):
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    # action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
                    # print("time step {} and update eps {}".format(self.num_timesteps, update_eps))
                    action_idxes = np.array(self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)) #update_eps=exploration.value(t)))
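                    # Map discretized action indexes back to continuous actions in [low, low + actions_range]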
                    action = action_idxes / self.num_action_grains * self.actions_range + self.low
                
                if not self.epsilon_greedy: # Gaussian noise
                    actions_greedy = action
                    action_idx_stoch = []
                    action = []
                    for index in range(len(actions_greedy)): 
                        a_greedy = actions_greedy[index]
                        out_of_range_action = True 
                        while out_of_range_action:
                            # Sample from a Gaussian with mean at the greedy action and a std following a schedule of choice  
                            a_stoch = np.random.normal(loc=a_greedy, scale=std_schedule.value(self.num_timesteps))

                            # Convert sampled cont action to an action idx
                            a_idx_stoch = np.rint((a_stoch + self.high[index]) / self.actions_range[index] * self.num_action_grains)

                            # Check if action is in range
                            if a_idx_stoch >= 0 and a_idx_stoch < self.num_actions_pad:
                                action_idx_stoch.append(a_idx_stoch)
                                action.append(a_stoch)
                                out_of_range_action = False

                    action_idxes = action_idx_stoch
                env_action = action
                reset = False
                new_obs, rew, done, info = self.env.step(env_action)

                self.num_timesteps += 1

                # Stop training if return value is False
                if callback.on_step() is False:
                    break
                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs().squeeze()
                    reward_ = self._vec_normalize_env.get_original_reward().squeeze()
                else:
                    # Avoid changing the original ones
                    obs_, new_obs_, reward_ = obs, new_obs, rew
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs_, action_idxes, reward_, new_obs_, float(done))
                obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    obs_ = new_obs_

                if writer is not None:
                    ep_rew = np.array([reward_]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
                                                        self.num_timesteps)
                    # self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
                    #                                                   self.num_timesteps)

                # episode_rewards[-1] += rew
                episode_rewards[-1] += reward_
                if done:
                    # print("ep number", len(episode_rewards))
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:
                    
                    callback.on_rollout_end()
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                               "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(self.batch_size,
                                                               beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    # pytype:enable=bad-unpacking
                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors, mean_loss = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                                  dones, weights, sess=self.sess, options=run_options,
                                                                  run_metadata=run_metadata)
                            writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors, mean_loss = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                                  dones, weights, sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors, mean_loss = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                        assert isinstance(self.replay_buffer, PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(batch_idxes, new_priorities)

                    callback.on_rollout_start()

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                # Log training infos
                kvs = {}
                if self.verbose >= 1 and done and log_interval is not None \
                    and len(episode_rewards) % log_interval == 0 \
                    and self.num_timesteps > self.train_freq \
                    and self.num_timesteps > self.learning_starts:
                    
                    if self.log_dir is not None:
                        kvs["episodes"] = num_episodes
                        kvs["mean_100rew"] = mean_100ep_reward
                        kvs["current_lr"] = self.learning_rate
                        kvs["success_rate"] = np.mean(episode_successes[-100:])
                        kvs["total_timesteps"] = self.num_timesteps
                        kvs["mean_loss"] = mean_loss
                        kvs["mean_td_errors"] = np.mean(td_errors)
                        kvs["time_spent_exploring"] = int(100 * self.exploration.value(self.num_timesteps))
                        self.log_csv.writekvs(kvs) 

                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                    logger.record_tabular("% time spent exploring",
                                          int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

        callback.on_training_end()
        return self
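
learn() drives epsilon-greedy exploration, the Gaussian-noise std and the prioritized-replay beta through schedule objects that expose a value(step) method. A minimal sketch of that interface, assuming the usual stable-baselines linear-interpolation semantics, is given below for reference.

class ConstantSchedule:
    """value(step) always returns the same constant."""

    def __init__(self, value):
        self._value = value

    def value(self, step):
        return self._value


class LinearSchedule:
    """Linear interpolation from initial_p to final_p over schedule_timesteps, then flat."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, step):
        fraction = min(float(step) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
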
Example #8
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, 
                                                config=self.config, pretrained_model=self.pretrained_model, **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess, self.observation_space, self.action_space,
                                                     config=self.config, pretrained_model=self.pretrained_model, **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph,
                                                                     create_qf=True, create_vf=True)
                    
                    self._qf1 = qf1
                    self._qf2 = qf2
                    self._value_fn = value_fn

                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph,
                                                                    policy_out, create_qf=True, create_vf=False,
                                                                    reuse=True)

                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == 'auto':
                        # automatically set target entropy if needed
                        self.target_entropy = -np.prod(self.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)

                    # The entropy coefficient or entropy can be learned automatically
                    # see Automating Entropy Adjustment for Maximum Entropy RL section
                    # of https://arxiv.org/abs/1812.05905
                    if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'):
                        # Default initial value of ent_coef when learned
                        init_value = 1.0
                        if '_' in self.ent_coef:
                            init_value = float(self.ent_coef.split('_')[1])
                            assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                        self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32,
                                                            initializer=np.log(init_value).astype(np.float32))
                        self.ent_coef = tf.exp(self.log_ent_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.ent_coef = float(self.ent_coef)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(self.processed_next_obs_ph,
                                                                         create_qf=False, create_vf=True)
                    self.value_target = value_target

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Target for Q value regression
                    q_backup = tf.stop_gradient(
                        self.rewards_ph +
                        (1 - self.terminals_ph) * self.gamma * self.value_target
                    )

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1) ** 2)
                    qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2) ** 2)

                    # Compute the entropy temperature loss
                    # it is used when the entropy coefficient is learned
                    ent_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.ent_coef, float):
                        ent_coef_loss = -tf.reduce_mean(
                            self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy))
                        entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the Gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # Target for value fn regression
                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi)
                    value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup) ** 2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(policy_loss, var_list=tf_util.get_trainable_vars('model/pi'))

                    # policy_train_op = tf.contrib.layers.optimize_loss(
                    #     policy_loss,
                    #     None,
                    #     self.learning_rate_ph,
                    #     "Adam",
                    #     variables=tf_util.get_trainable_vars('model/pi'),
                    #     summaries=["gradients"],
                    #     increment_global_step=False
                    # )


                    # Value train op
                    value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    values_params = tf_util.get_trainable_vars('model/values_fn')

                    source_params = tf_util.get_trainable_vars("model/values_fn")
                    target_params = tf_util.get_trainable_vars("target/values_fn")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target, (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(values_losses, var_list=values_params)

                        self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy']
                        # All ops to call during one training step
                        self.step_ops = [policy_loss, qf1_loss, qf2_loss,
                                         value_loss, qf1, qf2, value_fn, logp_pi,
                                         self.entropy, policy_train_op, train_values_op]

                        # Add entropy coefficient optimization operation if needed
                        if ent_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef)
                                self.infos_names += ['ent_coef_loss', 'ent_coef']
                                self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('value_loss', value_loss)
                    tf.summary.scalar('entropy', self.entropy)
                    if ent_coef_loss is not None:
                        tf.summary.scalar('ent_coef_loss', ent_coef_loss)
                        tf.summary.scalar('ent_coef', self.ent_coef)

                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))


                    # for var in tf.trainable_variables():
                    #     tf.summary.histogram(var.name, var)
                

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars("target/values_fn")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())

                    if self.pretrained_model is not None: 
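                        # Restore pretrained CNN feature-extractor weights into both the policy and the value networks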
                        list_of_vars_to_load = ['cnn_model/BatchNorm/beta',
                                                'cnn_model/BatchNorm/moving_mean',
                                                'cnn_model/BatchNorm/moving_variance',
                                                'cnn_model/c1/w',
                                                'cnn_model/c1/b',
                                                'cnn_model/c2/w',
                                                'cnn_model/c2/b',
                                                'cnn_model/c3/w',
                                                'cnn_model/c3/b',
                                                'cnn_model/fc1/w',
                                                'cnn_model/fc1/b',
                                                'cnn_model/dense/kernel',
                                                'cnn_model/dense/bias']

                        def _load_vars(var_dict, ckpt_path):
                            saver = tf.train.Saver(var_list=var_dict)
                            ckpt  = tf.train.get_checkpoint_state(ckpt_path)
                            saver.restore(self.sess, ckpt.model_checkpoint_path)

                        all_tensors = [x.op.name for x in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)]
                        # all_tensors = [x.op.name for x in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)]

                        var_dict_pi = {x: self.graph.get_tensor_by_name(f"model/pi/{x}:0")
                                       for x in list_of_vars_to_load if f"model/pi/{x}" in all_tensors}
                        var_dict_values_fn = {x: self.graph.get_tensor_by_name(f"model/values_fn/{x}:0")
                                              for x in list_of_vars_to_load if f"model/values_fn/{x}" in all_tensors}


                        _load_vars(var_dict_pi, self.pretrained_model)
                        _load_vars(var_dict_values_fn, self.pretrained_model)
                    
                    self.sess.run(target_init_op)


                self.summary = tf.summary.merge_all()
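
Each setup_model above pairs a hard target_init_op with a soft (Polyak-averaged) target_update_op. Numerically the two updates reduce to the following, sketched here in NumPy with illustrative helper names (tau is the mixing coefficient, not a quantile fraction).

import numpy as np


def hard_update(target_params, source_params):
    """target <- source, run once right after variable initialization."""
    for target, source in zip(target_params, source_params):
        target[...] = source


def polyak_update(target_params, source_params, tau):
    """target <- (1 - tau) * target + tau * source, run every training step."""
    for target, source in zip(target_params, source_params):
        target[...] = (1.0 - tau) * target + tau * source


# Tiny usage example with two fake parameter tensors
source = [np.ones((2, 2)), np.full(3, 2.0)]
target = [np.zeros((2, 2)), np.zeros(3)]
hard_update(target, source)           # target now equals source
polyak_update(target, source, 0.005)  # later calls track source slowly
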
Example #9
    def setup_model(self):

        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess, self.observation_space, self.action_space,
                                                     **self.policy_kwargs)
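                    # Atoms of the categorical value distribution: evenly spaced by delta over [v_min, v_max]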
                    self.support = tf.constant(np.arange(self.v_min, self.v_max + 1e-6, self.delta), dtype=tf.float32)
                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")
                    self.projection_ph = tf.placeholder(tf.float32, (None, self.n_spt), name="v_projection")
                    self.q_projection_ph = tf.placeholder(tf.float32, (None, self.n_spt), name="q_projection")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1_distr, qf2_distr, value_fn_distr = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph,
                                                                     create_qf=True, create_vf=True)
                    qf1_pi_distr, qf2_pi_distr, _ = self.policy_tf.make_critics(self.processed_obs_ph,
                                                                    policy_out, create_qf=True, create_vf=False,
                                                                    reuse=True)

                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == 'auto':
                        # automatically set target entropy if needed
                        self.target_entropy = -np.prod(self.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)

                    # The entropy coefficient or entropy can be learned automatically
                    # see Automating Entropy Adjustment for Maximum Entropy RL section
                    # of https://arxiv.org/abs/1812.05905
                    if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'):
                        # Default initial value of ent_coef when learned
                        init_value = 1.0
                        if '_' in self.ent_coef:
                            init_value = float(self.ent_coef.split('_')[1])
                            assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                        self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32,
                                                            initializer=np.log(init_value).astype(np.float32))
                        self.ent_coef = tf.exp(self.log_ent_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.ent_coef = float(self.ent_coef)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target_distr = self.target_policy.make_critics(self.processed_next_obs_ph,
                                                                         create_qf=False, create_vf=True)
                    self.value_target_distr = value_target_distr

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    # compute qf_pi, qf2_pi with pdf
                    min_qf_pi_distr = tf.where(tf.less(tf.reduce_mean(qf1_pi_distr * self.support),
                                                       tf.reduce_mean(qf2_pi_distr * self.support)),
                                               qf1_pi_distr, qf2_pi_distr)

                    min_qf_pi = tf.reduce_mean(tf.reduce_sum(min_qf_pi_distr * self.support, axis=-1))
                    self.min_qf_pi = min_qf_pi
                    q_backup_op = tf.stop_gradient(
                        self.rewards_ph +
                        (1 - self.terminals_ph) * self.gamma * self.support
                    )
                    q_backup_op = tf.clip_by_value(q_backup_op, self.v_min, self.v_max)
                    self.q_backup_op = q_backup_op
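                    # Cross-entropy between each critic's predicted distribution and the projected target distribution (fed in via projection_ph)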
                    qf1_loss = -tf.reduce_mean(tf.log(qf1_distr + 1e-12) * tf.stop_gradient(self.projection_ph))
                    qf2_loss = -tf.reduce_mean(tf.log(qf2_distr + 1e-12) * tf.stop_gradient(self.projection_ph))

                    # Compute the entropy temperature loss
                    # it is used when the entropy coefficient is learned
                    ent_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.ent_coef, float):
                        ent_coef_loss = -tf.reduce_mean(
                            self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy))
                        entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    # to clip policy loss
                    qf_pi = tf.reduce_mean(self.support * min_qf_pi_distr, axis=-1, keepdims=True)
                    policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the Gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # Target for value fn regression
                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.

                    value_loss = -tf.reduce_mean(tf.log(value_fn_distr + 1e-12) * tf.stop_gradient(min_qf_pi_distr)) \
                                                 - tf.stop_gradient(tf.reduce_mean(self.ent_coef * logp_pi))
                    value_fn = tf.reduce_sum(value_fn_distr * self.support, axis=-1)
                    # value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup) ** 2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(policy_loss, var_list=tf_util.get_trainable_vars('model/pi'))

                    # Value train op
                    value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    values_params = tf_util.get_trainable_vars('model/values_fn')

                    source_params = tf_util.get_trainable_vars("model/values_fn")
                    target_params = tf_util.get_trainable_vars("target/values_fn")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target, (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    qf1, qf2 = tf.reduce_mean(tf.reduce_sum(self.support * qf1_distr, axis=-1)), tf.reduce_mean(tf.reduce_sum(self.support * qf2_distr, axis=-1))
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(values_losses, var_list=values_params)

                        self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy']
                        # All ops to call during one training step
                        self.step_ops = [policy_loss, qf1_loss, qf2_loss,
                                         value_loss, qf1, qf2, value_fn, logp_pi,
                                         self.entropy, policy_train_op, train_values_op]

                        # Add entropy coefficient optimization operation if needed
                        if ent_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef)
                                self.infos_names += ['ent_coef_loss', 'ent_coef']
                                self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('value_loss', value_loss)
                    tf.summary.scalar('entropy', self.entropy)
                    if ent_coef_loss is not None:
                        tf.summary.scalar('ent_coef_loss', ent_coef_loss)
                        tf.summary.scalar('ent_coef', self.ent_coef)

                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars("target/values_fn")

                # Initialize Variables and target network
                self.projection_op = Projection(self.sess, self.graph, self.n_spt, self.v_min, self.v_max, self.delta,
                                                self.batch_size)
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
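
# The distributional variant above stores each critic as a categorical distribution over a
# fixed support and recovers scalar Q-values by taking the expectation over that support,
# while the value distribution is trained with a cross-entropy loss against the min of the
# two Q distributions. A minimal NumPy sketch of those two operations (illustrative names
# and shapes, not part of the original graph):
import numpy as np

def expected_q(probs, support):
    # probs: (batch, n_atoms), each row sums to 1; support: (n_atoms,) fixed atom locations
    return np.sum(probs * support, axis=-1)

def categorical_value_loss(value_probs, target_probs, eps=1e-12):
    # Cross-entropy between the (stop-gradient) target distribution and the value distribution
    return -np.mean(np.log(value_probs + eps) * target_probs)

if __name__ == "__main__":
    n_atoms = 51
    support = np.linspace(-10.0, 10.0, n_atoms)
    logits = np.random.randn(4, n_atoms)
    probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)
    print(expected_q(probs, support))            # one scalar Q per batch element
    print(categorical_value_loss(probs, probs))  # cross-entropy of a distribution with itself
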
Exemple #10
0
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                # Reuse a non-empty replay buffer if one already exists (continual learning),
                # otherwise create a fresh one.
                # TODO: maybe substitute with a prioritized buffer to give preference to the transitions added
                # during continual learning
                if self.replay_buffer is None or len(self.replay_buffer) == 0:
                    self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space,
                                                     **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name="terminals")
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name="rewards")
                    self.actions_ph = tf.placeholder(
                        tf.float32,
                        shape=(None, ) + self.action_space.shape,
                        name="actions",
                    )
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    (
                        self.deterministic_action,
                        policy_out,
                        logp_pi,
                    ) = self.policy_tf.make_actor(self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        self.actions_ph,
                        create_qf=True,
                        create_vf=True,
                    )
                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        policy_out,
                        create_qf=True,
                        create_vf=False,
                        reuse=True,
                    )

                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == "auto":
                        # automatically set target entropy if needed
                        self.target_entropy = -np.prod(
                            self.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)

                    # The entropy coefficient or entropy can be learned automatically
                    # see Automating Entropy Adjustment for Maximum Entropy RL section
                    # of https://arxiv.org/abs/1812.05905
                    if isinstance(self.ent_coef,
                                  str) and self.ent_coef.startswith("auto"):
                        # Default initial value of ent_coef when learned
                        init_value = 1.0
                        if "_" in self.ent_coef:
                            init_value = float(self.ent_coef.split("_")[1])
                            assert init_value > 0.0, "The initial value of ent_coef must be greater than 0"

                        self.log_ent_coef = tf.get_variable(
                            "log_ent_coef",
                            dtype=tf.float32,
                            initializer=np.log(init_value).astype(np.float32),
                        )
                        self.ent_coef = tf.exp(self.log_ent_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.ent_coef = float(self.ent_coef)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(
                        self.processed_next_obs_ph,
                        create_qf=False,
                        create_vf=True)
                    self.value_target = value_target

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Target for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * self.value_target)

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2)
                    qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2)

                    # Compute the entropy temperature loss
                    # it is used when the entropy coefficient is learned
                    ent_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.ent_coef, float):
                        ent_coef_loss = -tf.reduce_mean(
                            self.log_ent_coef *
                            tf.stop_gradient(logp_pi + self.target_entropy))
                        entropy_optimizer = tf.train.AdamOptimizer(
                            learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi -
                                                    qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the Gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # Target for value fn regression
                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    v_backup = tf.stop_gradient(min_qf_pi -
                                                self.ent_coef * logp_pi)
                    value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss,
                        var_list=tf_util.get_trainable_vars("model/pi"))

                    # Value train op
                    value_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    values_params = tf_util.get_trainable_vars(
                        "model/values_fn")

                    source_params = tf_util.get_trainable_vars(
                        "model/values_fn/vf")
                    target_params = tf_util.get_trainable_vars(
                        "target/values_fn/vf")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(
                            values_losses, var_list=values_params)

                        self.infos_names = [
                            "policy_loss",
                            "qf1_loss",
                            "qf2_loss",
                            "value_loss",
                            "entropy",
                        ]
                        # All ops to call during one training step
                        self.step_ops = [
                            policy_loss,
                            qf1_loss,
                            qf2_loss,
                            value_loss,
                            qf1,
                            qf2,
                            value_fn,
                            logp_pi,
                            self.entropy,
                            policy_train_op,
                            train_values_op,
                        ]

                        # Add entropy coefficient optimization operation if needed
                        if ent_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                ent_coef_op = entropy_optimizer.minimize(
                                    ent_coef_loss, var_list=self.log_ent_coef)
                                self.infos_names += [
                                    "ent_coef_loss", "ent_coef"
                                ]
                                self.step_ops += [
                                    ent_coef_op,
                                    ent_coef_loss,
                                    self.ent_coef,
                                ]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar("policy_loss", policy_loss)
                    tf.summary.scalar("qf1_loss", qf1_loss)
                    tf.summary.scalar("qf2_loss", qf2_loss)
                    tf.summary.scalar("value_loss", value_loss)
                    tf.summary.scalar("entropy", self.entropy)
                    if ent_coef_loss is not None:
                        tf.summary.scalar("ent_coef_loss", ent_coef_loss)
                        tf.summary.scalar("ent_coef", self.ent_coef)

                    tf.summary.scalar("learning_rate",
                                      tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars(
                    "target/values_fn/vf")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
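
# Both SAC variants above keep a target value network that tracks the online one with
# Polyak averaging: target <- (1 - tau) * target + tau * source. A minimal NumPy sketch of
# that soft update, with plain arrays standing in for the TF variables:
import numpy as np

def polyak_update(target_params, source_params, tau):
    # Move each target parameter a small step (tau) towards its source counterpart, in place
    for target, source in zip(target_params, source_params):
        target *= (1.0 - tau)
        target += tau * source

if __name__ == "__main__":
    source = [np.ones((2, 2)), np.full(3, 2.0)]
    target = [np.zeros((2, 2)), np.zeros(3)]
    for _ in range(1000):
        polyak_update(target, source, tau=0.005)
    print(target[0])  # slowly converging towards the source values
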
Exemple #11
0
    def setup_model(self):
        # print("setup model ",self.observation_space.shape)
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                        **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy_tf.obs_ph
                    self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                    self.action_target = self.target_policy_tf.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                     name='actions')
                    self.qvalues_ph = tf.placeholder(tf.float32,
                                                     shape=(None, self.num_q),
                                                     name='qvalues')
                    self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph)
                    # Use an ensemble of num_q Q-functions to improve performance by reducing overestimation bias
                    qfs = self.policy_tf.make_many_critics(self.processed_obs_ph, self.actions_ph,
                                                           scope="buffer_values_fn", num_q=self.num_q)
                    # Q value when following the current policy
                    self.qfs = qfs
                    self.qfs_pi = self.policy_tf.make_many_critics(self.processed_obs_ph,
                                                                   policy_out, scope="buffer_values_fn",
                                                                   num_q=self.num_q, reuse=True)

                with tf.variable_scope("target", reuse=False):
                    # Create target networks
                    target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph)

                    # Target policy smoothing, by adding clipped noise to target actions
                    target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise)
                    target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip)
                    # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                    noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1)

                    # Q values when following the target policy
                    qfs_target = self.target_policy_tf.make_many_critics(self.processed_next_obs_ph,
                                                                         # target_policy_out,
                                                                         noisy_target_action,
                                                                         scope="buffer_values_fn",
                                                                         num_q=self.num_q,
                                                                         reuse=False)

                    self.qfs_target = qfs_target
                    self.qfs_target_no_pi = self.target_policy_tf.make_many_critics(
                        self.processed_obs_ph,
                        self.actions_ph,
                        scope="buffer_values_fn", num_q=self.num_q, reuse=True)

                with tf.variable_scope("loss", reuse=False):
                    # Average the target Q-values over the ensemble and subtract the baseline q_base
                    # (instead of the clipped Double-Q minimum used with two critics)
                    min_qf_target = tf.reduce_mean(qfs_target, axis=0) - self.q_base
                    # min_qf_target = tf.minimum(qf1_target, qf2_target)
                    # Targets for Q value regression
                    q_backup = tf.stop_gradient(
                        self.rewards_ph +
                        (1 - self.terminals_ph) * self.gamma * min_qf_target
                    )
                    self.q_backup = q_backup
                    # Compute Q-Function loss

                    # Asymmetric Q-value loss ("Method 2"): a squared leaky ReLU on the TD difference,
                    # with alpha controlling how strongly over- vs under-estimation is penalized
                    alpha = self.alpha
                    if alpha > 1:
                        alpha = 1. / alpha
                        sign = 1
                    else:
                        sign = -1

                    if self.double_type == "inner":
                        qfs = tf.reshape(qfs, (self.num_q, self.batch_size, 2))
                        qfs = tf.transpose(qfs, [1, 2, 0])
                        qfs = tf.reshape(qfs, (self.batch_size, 2, self.num_q // 2, 2))
                        qfs = tf.stack([qfs[:, 0, :, 0], qfs[:, 1, :, 1]], axis=-1)
                        qfs = tf.reshape(qfs, (self.batch_size, self.num_q))
                    elif self.double_type == "both":
                        qfs = tf.reshape(qfs, (self.num_q, self.batch_size, self.num_q))
                        qfs = tf.transpose(qfs, [1, 2, 0])
                        qfs = tf.stack([qfs[:, i, i] for i in range(self.num_q)], axis=-1)
                        qfs = tf.reshape(qfs, (self.batch_size, self.num_q))
                    # elif self.double_type == "identical":
                    #     qfs = tf.transpose(qfs, [1, 0])
                    diff = self.qvalues_ph - qfs + self.q_base
                    qfs_loss = tf.reduce_mean(
                        tf.nn.leaky_relu(sign * diff, alpha=alpha) ** 2) / alpha

                    qvalues_losses = qfs_loss
                    # Policy loss: maximise the Q-value under the current policy
                    self.policy_loss = policy_loss = -tf.reduce_mean(self.qfs_pi)

                    # Policy train op
                    # will be called only every n training steps,
                    # where n is the policy delay
                    policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(policy_loss,
                                                                var_list=tf_util.get_trainable_vars('model/pi'))
                    self.policy_train_op = policy_train_op

                    # Q Values optimizer
                    qvalues_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    qvalues_params = tf_util.get_trainable_vars('model/values_fn/') + tf_util.get_trainable_vars(
                        'model/buffer_values_fn/')

                    # Q Values and policy target params
                    source_params = tf_util.get_trainable_vars("model/")
                    target_params = tf_util.get_trainable_vars("target/")

                    # Polyak averaging for target variables
                    # self.target_ops = [
                    #     tf.assign(target, (1 - self.tau) * target + self.tau * source)
                    #     for target, source in zip(target_params, source_params)
                    # ]
                    self.target_ops = [
                        tf.assign(target,
                                  (1 - self.tau) ** (self.gradient_steps * 1) * target +
                                  (1 - (1 - self.tau) ** (self.gradient_steps * 1)) * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # self.target_ops = [
                    #     tf.assign(target, source)
                    #     for target, source in zip(target_params, source_params)
                    # ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    grads = tf.gradients(qvalues_losses, qvalues_params)
                    grad_norm = tf.linalg.global_norm(grads)
                    # if self.clip_norm is not None:
                    #     grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in grads]
                    # train_values_op = qvalues_optimizer.apply_gradients(grads)
                    train_values_op = qvalues_optimizer.minimize(qvalues_losses, var_list=qvalues_params)

                    self.infos_names = ['qfs_loss', 'q_grad_norm']
                    # All ops to call during one training step
                    self.step_ops = [qfs_loss, grad_norm,
                                     qfs, train_values_op]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qfs_loss', qfs_loss)
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars("target/")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()

                self.memory = EpisodicMemoryTBP(self.buffer_size, state_dim=1,
                                                obs_space=self.observation_space,
                                                action_shape=self.action_space.shape,
                                                q_func=self.qfs_target, repr_func=None,
                                                obs_ph=self.processed_next_obs_ph,
                                                action_ph=self.actions_ph, sess=self.sess, gamma=self.gamma,
                                                max_step=self.max_step)
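
# The target block above applies TD3-style target policy smoothing: clipped Gaussian noise
# is added to the target action before it is fed to the target critics, and the result is
# clipped back into the tanh bounds [-1, 1]. A small NumPy sketch of that step, with the
# noise settings below standing in for self.target_policy_noise / self.target_noise_clip:
import numpy as np

def smooth_target_action(target_action, noise_std=0.2, noise_clip=0.5):
    # Clipped Gaussian noise, then clip the noisy action back into [-1, 1]
    noise = np.clip(np.random.normal(0.0, noise_std, size=target_action.shape),
                    -noise_clip, noise_clip)
    return np.clip(target_action + noise, -1.0, 1.0)

if __name__ == "__main__":
    actions = np.array([[0.9, -0.95], [0.1, 0.0]])
    print(smooth_target_action(actions))
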
Exemple #12
0
def main(args):
    """
    Train a DQN agent on cartpole env
    :param args: (Parsed Arguments) the input arguments
    """
    with tf_utils.make_session(8) as sess:
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, _ = deepq.build_train(
            q_func=CustomPolicy,
            ob_space=env.observation_space,
            ac_space=env.action_space,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            sess=sess
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

        # Initialize the parameters and copy them to the target network.
        tf_utils.initialize()
        update_target()

        episode_rewards = [0.0]
        obs = env.reset()
        for step in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(step))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)

            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            is_solved = step > 100 and mean_100ep_reward >= 200

            if args.no_render and step > args.max_timesteps:
                break

            if is_solved:
                if args.no_render:
                    break
                # Show off the result
                env.render()
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if step > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                # Update target network periodically.
                if step % 1000 == 0:
                    update_target()

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(step)))
                logger.dump_tabular()
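
# The CartPole example above anneals epsilon with LinearSchedule(schedule_timesteps=10000,
# initial_p=1.0, final_p=0.02). A simplified stand-in (not the library class) showing how
# such a linear schedule can be implemented:
class SimpleLinearSchedule:
    # Linearly interpolate from initial_p to final_p over schedule_timesteps, then stay constant

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, step):
        fraction = min(float(step) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

if __name__ == "__main__":
    exploration = SimpleLinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
    print(exploration.value(0), exploration.value(5000), exploration.value(20000))
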
    def learn(
        self,
        total_timesteps,
        model_coworker,
        role,
        callback=None,
        log_interval=100,
        tb_log_name="DQN",
        reset_num_timesteps=True,
        replay_wrapper=None,
        clipping_during_training=True,
    ):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name,
                new_tb_log) as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0,
                )
            else:
                if self.replay_buffer is None:
                    self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert (not self.prioritized_replay
                        ), "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=self.exploration_initial_eps,
                final_p=self.exploration_final_eps,
            )

            episode_rewards = [0.0]
            episode_successes = []

            callback.on_training_start(locals(), globals())
            callback.on_rollout_start()

            reset = True
            obs = self.env.reset()
            # Retrieve unnormalized observation for saving into the buffer
            if self._vec_normalize_env is not None:
                obs_ = self._vec_normalize_env.get_original_obs().squeeze()

            for _ in range(total_timesteps):
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.0
                else:
                    update_eps = 0.0
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1.0 - self.exploration.value(self.num_timesteps) +
                        self.exploration.value(self.num_timesteps) /
                        float(self.env.action_space.n))
                    kwargs["reset"] = reset
                    kwargs[
                        "update_param_noise_threshold"] = update_param_noise_threshold
                    kwargs["update_param_noise_scale"] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]

                turn, speed = None, None
                if role == "turn":
                    turn = action
                    speed, _ = model_coworker.predict(np.array(obs))
                else:
                    turn, _ = model_coworker.predict(np.array(obs))
                    speed = action

                if clipping_during_training:
                    # check if next state (after action) would be outside of fish tank (CLIPPING)
                    env_state = self.env.get_state()
                    turn_speed = self.env.action([turn, speed])
                    global_turn = env_state[0][2] + turn_speed[0]
                    coords = np.array([
                        env_state[0][0] + turn_speed[1] * np.cos(global_turn),
                        env_state[0][1] + turn_speed[1] * np.sin(global_turn),
                    ])
                    changed = False
                    if coords[0] < -0.49:
                        coords[0] = -0.47
                        changed = True
                    elif coords[0] > 0.49:
                        coords[0] = 0.47
                        changed = True

                    if coords[1] < -0.49:
                        coords[1] = -0.47
                        changed = True
                    elif coords[1] > 0.49:
                        coords[1] = 0.47
                        changed = True

                    if changed:
                        diff = coords - env_state[0, :2]
                        speed = np.linalg.norm(diff)
                        angles = np.arctan2(diff[1], diff[0])
                        turn = angles - env_state[0, 2]
                        turn = turn - 2 * np.pi if turn > np.pi else turn
                        turn = turn + 2 * np.pi if turn < -np.pi else turn

                        # convert to DQN output
                        dist_turn = np.abs(self.env.turn_rate_bins - turn)
                        dist_speed = np.abs(self.env.speed_bins - speed)

                        # convert to bins
                        turn = np.argmin(dist_turn, axis=0)
                        speed = np.argmin(dist_speed, axis=0)

                        if role == "turn":
                            action = turn
                        else:
                            action = speed

                reset = False
                new_obs, rew, done, info = self.env.step([turn, speed])

                self.num_timesteps += 1

                # Stop training if return value is False
                if callback.on_step() is False:
                    break

                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs(
                    ).squeeze()
                    reward_ = self._vec_normalize_env.get_original_reward(
                    ).squeeze()
                else:
                    # Avoid changing the original ones
                    obs_, new_obs_, reward_ = obs, new_obs, rew

                # Store transition in the replay buffer, but change reward to 0 (use it for plot later though)
                self.replay_buffer.add(obs_, action, 0, new_obs_, float(done))

                # Also give transition to model coworker
                if model_coworker.replay_buffer is None:
                    model_coworker.replay_buffer = ReplayBuffer(
                        self.buffer_size)
                if role == "turn":
                    model_coworker.replay_buffer.add(obs_, speed, 0, new_obs_,
                                                     float(done))
                else:
                    model_coworker.replay_buffer.add(obs_, turn, 0, new_obs_,
                                                     float(done))

                obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    obs_ = new_obs_

                if writer is not None:
                    ep_rew = np.array([reward_]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    tf_util.total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += reward_
                if done:
                    maybe_is_success = info.get("is_success")
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if (can_sample and self.num_timesteps > self.learning_starts
                        and self.num_timesteps % self.train_freq == 0):

                    callback.on_rollout_end()
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert (
                            self.beta_schedule is not None
                        ), "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps),
                            env=self._vec_normalize_env,
                        )
                        (
                            obses_t,
                            actions,
                            rewards,
                            obses_tp1,
                            dones,
                            weights,
                            batch_idxes,
                        ) = experience
                    else:
                        (
                            obses_t,
                            actions,
                            rewards,
                            obses_tp1,
                            dones,
                        ) = self.replay_buffer.sample(
                            self.batch_size, env=self._vec_normalize_env)
                        # also sample from expert buffer
                        (
                            obses_t_exp,
                            actions_exp,
                            rewards_exp,
                            obses_tp1_exp,
                            dones_exp,
                        ) = self.expert_buffer.sample(
                            self.batch_size, env=self._vec_normalize_env)
                        weights, batch_idxes = np.ones_like(rewards), None
                        weights_exp, batch_idxes_exp = np.ones_like(
                            rewards_exp), None
                    # pytype:enable=bad-unpacking

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                np.append(obses_t, obses_t_exp, axis=0),
                                np.append(actions,
                                          actions_exp.flatten(),
                                          axis=0),
                                np.append(rewards,
                                          rewards_exp.flatten(),
                                          axis=0),
                                np.append(obses_tp1, obses_tp1_exp, axis=0),
                                np.append(obses_tp1, obses_tp1_exp, axis=0),
                                np.append(dones.flatten(),
                                          dones_exp.flatten(),
                                          axis=0),
                                np.append(weights, weights_exp),
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata,
                            )
                            writer.add_run_metadata(
                                run_metadata, "step%d" % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                np.append(obses_t, obses_t_exp, axis=0),
                                np.append(actions,
                                          actions_exp.flatten(),
                                          axis=0),
                                np.append(rewards,
                                          rewards_exp.flatten(),
                                          axis=0),
                                np.append(obses_tp1, obses_tp1_exp, axis=0),
                                np.append(obses_tp1, obses_tp1_exp, axis=0),
                                np.append(dones.flatten(),
                                          dones_exp.flatten(),
                                          axis=0),
                                np.append(weights, weights_exp),
                                sess=self.sess,
                            )
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(
                            np.append(obses_t, obses_t_exp, axis=0),
                            np.append(actions, actions_exp.flatten(), axis=0),
                            np.append(rewards, rewards_exp.flatten(), axis=0),
                            np.append(obses_tp1, obses_tp1_exp, axis=0),
                            np.append(obses_tp1, obses_tp1_exp, axis=0),
                            np.append(dones.flatten(),
                                      dones_exp.flatten(),
                                      axis=0),
                            np.append(weights, weights_exp),
                            sess=self.sess,
                        )

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        assert isinstance(self.replay_buffer,
                                          PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                    callback.on_rollout_start()

                if (can_sample and self.num_timesteps > self.learning_starts
                        and self.num_timesteps %
                        self.target_network_update_freq == 0):
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if (self.verbose >= 1 and done and log_interval is not None
                        and len(episode_rewards) % log_interval == 0):
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)),
                    )
                    logger.dump_tabular()

        callback.on_training_end()
        return self
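
# When clipping pushes the agent back inside the tank, the learn() method above wraps the
# resulting turn angle into [-pi, pi] and maps the continuous (turn, speed) pair back to
# discrete DQN bins with a nearest-bin lookup. A small NumPy sketch of that conversion; the
# bin arrays below are made-up stand-ins for self.env.turn_rate_bins / self.env.speed_bins:
import numpy as np

def wrap_angle(angle):
    # Wrap an angle into the interval [-pi, pi]
    if angle > np.pi:
        angle -= 2 * np.pi
    elif angle < -np.pi:
        angle += 2 * np.pi
    return angle

def to_bins(turn, speed, turn_rate_bins, speed_bins):
    # Indices of the discrete bins closest to the continuous turn and speed values
    turn_idx = int(np.argmin(np.abs(turn_rate_bins - turn)))
    speed_idx = int(np.argmin(np.abs(speed_bins - speed)))
    return turn_idx, speed_idx

if __name__ == "__main__":
    turn_rate_bins = np.linspace(-np.pi / 4, np.pi / 4, 20)
    speed_bins = np.linspace(0.0, 0.1, 10)
    print(to_bins(wrap_angle(0.3), 0.04, turn_rate_bins, speed_bins))
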
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=self.exploration_initial_eps,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_successes = []
            obs = self.env.reset()
            obs_hdqn_old = None
            action_hdqn = None
            reset = True
            F = 0

            for _ in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).

                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True

                # Check if agent is busy or idle
                # NOTE: placeholder flag, the agent is currently always treated as idle
                OBS_IS_IDLE = True
                if OBS_IS_IDLE:
                    if not reset:
                        # Store HDQN transition
                        self.replay_buffer.add(obs_hdqn_old, action_hdqn, F,
                                               obs, float(done))

                    # Select new goal for the agent using the current Q function
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]
                    env_action = action

                    # Update bookkeepping for next HDQN buffer update
                    obs_hdqn_old = obs
                    action_hdqn = env_action
                    F = 0.
                else:
                    # Agent is busy, so select a dummy action (it will be ignored anyway)
                    env_action = 0

                reset = False
                new_obs, rew, done, info = self.env.step(env_action)
                F = F + rew

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    total_episode_reward_logger(self.episode_reward, ep_rew,
                                                ep_done, writer,
                                                self.num_timesteps)

                episode_rewards[-1] += rew

                if done:
                    # Store HDQN transition
                    self.replay_buffer.add(obs_hdqn_old, action_hdqn, F, obs,
                                           float(done))

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                               "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    # pytype:enable=bad-unpacking

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        assert isinstance(self.replay_buffer,
                                          PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

                self.num_timesteps += 1

        return self
Exemple #15
0
    def initDemoBuffer(self, demoDataFile, update_stats=True):

        # set up dims (the commented block below derives them from the env; fixed values are used instead)
        '''
        env = self.env
        env.reset()
        obs, _, _, info = env.step(env.action_space.sample())

        dims = {
            'o': obs['observation'].shape[0],
            'u': env.action_space.shape[0],
            'g': obs['desired_goal'].shape[0],
        }
        
        dims ={'o': 25, 'u': 4, 'g': 3}

        for key, value in info.items():
            value = np.array(value)
            if value.ndim == 0:
                value = value.reshape(1)
            dims['info_{}'.format(key)] = value.shape[0]
        '''

        input_dims = {'o': 25, 'u': 4, 'g': 3}
        buffer_size = int(1E6)
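        # flat replay buffer used to store the demonstration transitions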
        self.demo_buffer = ReplayBuffer(buffer_size)
        update_stats = True

        T = 50  # max episode steps
        rollout_batch_size = 1

        demoData = np.load(demoDataFile, allow_pickle=True)
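        # the demo file is expected to provide 'obs', 'acs' and 'info' entries indexed as [episode][timestep]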
        info_keys = [
            key.replace('info_', '') for key in input_dims.keys()
            if key.startswith('info_')
        ]
        info_values = [
            np.empty((T, rollout_batch_size, input_dims['info_' + key]),
                     np.float32) for key in info_keys
        ]

        num_demo = np.shape(demoData['obs'])[0]

        #num_demo = 3

        print('===================================================')

        for epsd in range(num_demo):
            print('Filling the demonstration buffer: (' + str(epsd) + "/" +
                  str(num_demo) + ")")
            cat_obs, obs, acts, goals, achieved_goals = [], [], [], [], []
            i = 0
            for transition in range(T):
                obs.append(
                    [demoData['obs'][epsd][transition].get('observation')])
                acts.append([demoData['acs'][epsd][transition]])
                goals.append(
                    [demoData['obs'][epsd][transition].get('desired_goal')])
                achieved_goals.append(
                    [demoData['obs'][epsd][transition].get('achieved_goal')])

                cat_obs.append(
                    np.concatenate(obs[-1] + achieved_goals[-1] + goals[-1]))
                for idx, key in enumerate(info_keys):
                    info_values[idx][
                        transition,
                        i] = demoData['info'][epsd][transition][key]

                if transition > 0:
                    # def add(self, obs_t, action, reward, obs_tp1, done):
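                    # demonstration transitions are stored with a constant reward of 1.0 and done flag of 1.0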
                    self.demo_buffer.add(cat_obs[transition - 1],
                                         acts[transition - 1], 1.0,
                                         cat_obs[transition], 1.0)

            obs.append([demoData['obs'][epsd][T].get('observation')])
            achieved_goals.append(
                [demoData['obs'][epsd][T].get('achieved_goal')])

            episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
            for key, value in zip(info_keys, info_values):
                episode['info_{}'.format(key)] = value

            episode = self.convert_episode_to_batch_major(episode)

            #demo_buffer = ReplayBuffer(buffer_size)
            #self.demo_buffer.store_episode(episode)
            #self.demo_buffer.add(obs_t, action, reward, obs_tp1, done)

            update_stats = False
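            # NOTE: the line above overrides the update_stats argument, so the normalizer update below is skipped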
            if update_stats:
                # add transitions to normalizer to normalize the demo data as well
                episode['o_2'] = episode['o'][:, 1:, :]
                episode['ag_2'] = episode['ag'][:, 1:, :]
                num_normalizing_transitions = self.transitions_in_episode_batch(
                    episode)
                transitions = self.sample_transitions(
                    episode, num_normalizing_transitions)

                o, o_2, g, ag = transitions['o'], transitions[
                    'o_2'], transitions['g'], transitions['ag']
                transitions['o'], transitions['g'] = self._preprocess_og(
                    o, ag, g)
                # No need to preprocess the o_2 and g_2 since this is only used for stats

                self.o_stats.update(transitions['o'])
                self.g_stats.update(transitions['g'])

                self.o_stats.recompute_stats()
                self.g_stats.recompute_stats()
            episode.clear()
Exemple #16
0
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                #self.replay_buffer = DiscrepancyReplayBuffer(self.buffer_size, scorer=self.policy_tf.get_q_discrepancy)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    if self.recurrent_policy:
                        import inspect
                        policy_tf_args = inspect.signature(self.policy).parameters

                        policy_tf_kwargs = {}
                        if "my_size" in policy_tf_args:
                            policy_tf_kwargs["my_size"] = len(self._get_env_parameters())
                        if "goal_size" in policy_tf_args:
                            policy_tf_kwargs["goal_size"] = self.env.goal_dim  # TODO: need to get this some other way or save it

                        if self.buffer_kwargs is not None:
                            sequence_length = self.buffer_kwargs.get("sequence_length", 1)
                        else:
                            sequence_length = 1

                        self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                     n_batch=self.batch_size,
                                                     n_steps=sequence_length,
                                                     **policy_tf_kwargs, **self.policy_kwargs)
                        self.policy_tf_act = self.policy(self.sess, self.observation_space, self.action_space,
                                                         n_batch=1, **policy_tf_kwargs,
                                                         **self.policy_kwargs)
                        self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                            n_batch=self.batch_size,
                                                            n_steps=sequence_length, **policy_tf_kwargs,
                                                            **self.policy_kwargs)

                        self.dones_ph = self.policy_tf.dones_ph
                    else:
                        self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                     **self.policy_kwargs)
                        self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                            **self.policy_kwargs)

                    if hasattr(self.policy_tf, "extra_phs"):
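                        # collect any extra placeholders declared by the policy so they can be wired into the training graph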
                        for ph_name in self.policy_tf.extra_phs:
                            if "target_" in ph_name:
                                self.train_extra_phs[ph_name] = getattr(self.target_policy_tf,
                                                                        ph_name.replace("target_", "") + "_ph")
                            else:
                                self.train_extra_phs[ph_name] = getattr(self.policy_tf, ph_name + "_ph")

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy_tf.obs_ph
                    self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                    self.action_target = self.target_policy_tf.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                self.buffer_is_prioritized = self.buffer_type.__name__ in ["PrioritizedReplayBuffer",
                                                                           "RankPrioritizedReplayBuffer"]

                if self.replay_buffer is None:
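                    # build the replay buffer only when one has not already been supplied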
                    if self.buffer_is_prioritized:
                        if (self.num_timesteps is not None and self.prioritization_starts > self.num_timesteps) \
                                or self.prioritization_starts > 0:
                            self.replay_buffer = ReplayBuffer(self.buffer_size)
                        else:
                            buffer_kw = {"size": self.buffer_size, "alpha": 0.7}
                            if self.buffer_type.__name__ == "RankPrioritizedReplayBuffer":
                                buffer_kw.update(
                                    {"learning_starts": self.prioritization_starts, "batch_size": self.batch_size})
                            self.replay_buffer = self.buffer_type(**buffer_kw)
                    else:
                        replay_buffer_kw = {"size": self.buffer_size}
                        if self.buffer_kwargs is not None:
                            replay_buffer_kw.update(self.buffer_kwargs)
                        if self.recurrent_policy:
                            replay_buffer_kw["rnn_inputs"] = self.policy_tf.rnn_inputs
                        if hasattr(self.policy_tf, "extra_data_names"):
                            replay_buffer_kw["extra_data_names"] = self.policy_tf.extra_data_names
                        self.replay_buffer = self.buffer_type(**replay_buffer_kw)

                if self.recurrent_policy:
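                    # the warm-up (scan) length must be a whole multiple of the stored sequence length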
                    self.sequence_length = self.replay_buffer.sequence_length
                    self.scan_length = self.replay_buffer.scan_length
                    assert self.scan_length % self.sequence_length == 0

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    if self.recurrent_policy:
                        actor_args = inspect.signature(self.policy_tf.make_actor).parameters
                        critic_args = inspect.signature(self.policy_tf.make_critics).parameters
                        actor_kws = {k: v for k, v in self.train_extra_phs.items() if k in actor_args}
                        critic_kws = {k: v for k, v in self.train_extra_phs.items() if k in critic_args}
                        self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph, **actor_kws)
                        self.policy_act = policy_act = self.policy_tf_act.make_actor(reuse=True)
                        # Use two Q-functions to improve performance by reducing overestimation bias
                        qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph, **critic_kws)
                        _, _ = self.policy_tf_act.make_critics(None, self.actions_ph, reuse=True)
                        # Q value when following the current policy
                        qf1_pi, qf2_pi = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, **critic_kws,
                                                                     reuse=True)

                        train_params = [var for var in tf_util.get_trainable_vars("model/pi") if "act" not in var.name]
                        act_params = [var for var in tf_util.get_trainable_vars("model/pi") if "act" in var.name]
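                        # ops that copy the trained actor weights into the single-step acting network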
                        self.act_ops = [
                            tf.assign(act, train)
                            for act, train in zip(act_params, train_params)
                        ]

                    else:
                        self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph)
                        # Use two Q-functions to improve performance by reducing overestimation bias
                        qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph)
                        # Q value when following the current policy
                        qf1_pi, qf2_pi = self.policy_tf.make_critics(self.processed_obs_ph,
                                                                policy_out, reuse=True)

                with tf.variable_scope("target", reuse=False):
                    if self.recurrent_policy:
                        # Create target networks
                        target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph,
                                                                             **actor_kws,
                                                                             dones=self.dones_ph)
                        # Target policy smoothing, by adding clipped noise to target actions
                        target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise)
                        target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip)
                        # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                        noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1)
                        # Q values when following the target policy
                        qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph,
                                                                                    noisy_target_action,
                                                                                    dones=self.dones_ph,
                                                                                    **critic_kws)
                    else:
                        # Create target networks
                        target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph)
                        # Target policy smoothing, by adding clipped noise to target actions
                        target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise)
                        target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip)
                        # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                        noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1)
                        # Q values when following the target policy
                        qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph,
                                                                                    noisy_target_action)

                policy_pre_activation = self.policy_tf.policy_pre_activation
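                # the pre-tanh policy output above feeds the action L2 penalty (action_l2_scale) in the loss below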

                if self.full_tensorboard_log:
                    for var in tf_util.get_trainable_vars("model"):
                        tf.summary.histogram(var.name, var)
                if self.recurrent_policy and self.policy_tf.keras_reuse:
                    tf.summary.histogram("rnn/PI state", self.policy_tf.pi_state)
                    tf.summary.histogram("rnn/QF1 state", self.policy_tf.qf1_state)
                    tf.summary.histogram("rnn/QF2 state", self.policy_tf.qf2_state)

                # TODO: introduce somewhere around here the placeholder for history which updates the internal state?
                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two target Q-Values (clipped Double-Q Learning)
                    min_qf_target = tf.minimum(qf1_target, qf2_target)

                    # Targets for Q value regression
                    q_backup = tf.stop_gradient(
                        self.rewards_ph +
                        (1 - self.terminals_ph) * self.gamma * min_qf_target
                    )

                    if self.clip_q_target is not None:
                        q_backup = tf.clip_by_value(q_backup, self.clip_q_target[0], self.clip_q_target[1], name="q_backup_clipped")

                    # Compute Q-Function loss
                    if self.buffer_is_prioritized:
                        self.train_extra_phs["is_weights"] = tf.placeholder(tf.float32, shape=(None, 1), name="is_weights")
                        qf1_loss = tf.reduce_mean(self.is_weights_ph * (q_backup - qf1) ** 2)
                        qf2_loss = tf.reduce_mean(self.is_weights_ph * (q_backup - qf2) ** 2)
                    else:
                        qf1_loss = tf.reduce_mean((q_backup - qf1) ** 2)
                        qf2_loss = tf.reduce_mean((q_backup - qf2) ** 2)

                    qvalues_losses = qf1_loss + qf2_loss

                    rew_loss = tf.reduce_mean(qf1_pi)
                    action_loss = self.action_l2_scale * tf.nn.l2_loss(policy_pre_activation)

                    # Policy loss: maximise the Q value while penalising large pre-tanh actions
                    self.policy_loss = policy_loss = -rew_loss + action_loss
                    if hasattr(self.policy_tf, "policy_loss"):
                        tf.summary.scalar("custom_policy_loss", self.policy_tf.policy_loss)
                        self.policy_loss += self.policy_tf.policy_loss
                        policy_loss = self.policy_loss

                    # Policy train op
                    # will be called only every n training steps,
                    # where n is the policy delay
                    policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    policy_vars = tf_util.get_trainable_vars("model/pi") + tf_util.get_trainable_vars("model/shared")
                    policy_train_op = policy_optimizer.minimize(policy_loss, var_list=policy_vars)
                    self.policy_train_op = policy_train_op
                    # Q Values optimizer
                    qvalues_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    qvalues_params = tf_util.get_trainable_vars('model/values_fn/') + tf_util.get_trainable_vars("model/shared/")


                    # Q Values and policy target params
                    source_params = tf_util.get_trainable_vars("model/")
                    target_params = tf_util.get_trainable_vars("target/")

                    if self.recurrent_policy:
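                        # exclude the single-step acting copies from the target update; they are synced separately via act_ops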
                        source_params = [var for var in tf_util.get_trainable_vars("model/") if "act" not in var.name]

                    # Polyak averaging for target variables
                    self.target_ops = [
                        tf.assign(target, (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    train_values_op = qvalues_optimizer.minimize(qvalues_losses, var_list=qvalues_params)

                    self.infos_names = ['qf1_loss', 'qf2_loss']
                    # All ops to call during one training step
                    self.step_ops = [qf1_loss, qf2_loss,
                                     qf1, qf2, train_values_op]
                    if hasattr(self.policy_tf, "step_ops"):
                        self.step_ops.extend(self.policy_tf.step_ops)
                    self.policy_step_ops = [self.policy_train_op, self.target_ops, self.policy_loss]
                    if hasattr(self.policy_tf, "policy_step_ops"):
                        self.policy_step_ops.extend(self.policy_tf.policy_step_ops)

                    if self.recurrent_policy and self.policy_tf.save_state:
                        if self.policy_tf.share_lstm:
                            state_objects = [self.policy_tf.state]
                            if self.target_policy_tf.save_target_state:
                                state_objects.append(self.target_policy_tf.state)
                        else:
                            state_objects = [self.policy_tf.pi_state, self.policy_tf.qf1_state, self.policy_tf.qf2_state]
                            if self.target_policy_tf.save_target_state:
                                state_objects.extend([self.target_policy_tf.pi_state, self.target_policy_tf.qf1_state,
                                                      self.target_policy_tf.qf2_state])
                        self.step_ops.extend(state_objects)

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar("rew_loss", rew_loss)
                    tf.summary.scalar("action_loss", action_loss)
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars("target/")

                if self.full_tensorboard_log:
                    policy_grads = policy_optimizer.compute_gradients(policy_loss)
                    for g in policy_grads:
                        if g[0] is not None and g[1] in policy_vars:
                            tf.summary.histogram("grad-policy/{}".format(g[1].name), g[0])

                    qf_grads = qvalues_optimizer.compute_gradients(qvalues_losses)
                    for g in qf_grads:
                        if g[0] is not None and g[1] in qvalues_params:
                            tf.summary.histogram("grad-qf/{}".format(g[1].name), g[0])

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
Exemple #17
0
    def __init__(self,
                 env,
                 network: str = "cnn",
                 intrinsic_reward_weight: float = 1.0,
                 buffer_size: int = 65536,
                 train_freq: int = 16384,
                 gradient_steps: int = 4,
                 batch_size: int = 4096,
                 learning_starts: int = 100,
                 filter_end_of_episode: bool = True,
                 filter_reward: bool = False,
                 norm_obs: bool = True,
                 norm_ext_reward: bool = True,
                 gamma: float = 0.99,
                 learning_rate: float = 0.0001,
                 training: bool = True,
                 _init_setup_model=True):

        super().__init__(env, _init_setup_model)

        self.network_type = network
        self.buffer = ReplayBuffer(buffer_size)
        self.train_freq = train_freq
        self.gradient_steps = gradient_steps
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.intrinsic_reward_weight = intrinsic_reward_weight
        self.filter_end_of_episode = filter_end_of_episode
        self.filter_extrinsic_reward = filter_reward
        self.clip_obs = 5
        self.norm_obs = norm_obs
        self.norm_ext_reward = norm_ext_reward
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.training = training

        self.epsilon = 1e-8
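        # running statistics used to normalize observations and the intrinsic/extrinsic reward streams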
        self.int_rwd_rms = RunningMeanStd(shape=(), epsilon=self.epsilon)
        self.ext_rwd_rms = RunningMeanStd(shape=(), epsilon=self.epsilon)
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.int_ret = np.zeros(
            self.num_envs)  # discounted return for intrinsic reward
        self.ext_ret = np.zeros(
            self.num_envs)  # discounted return for extrinsic reward

        self.updates = 0
        self.steps = 0
        self.last_action = None
        self.last_obs = None
        self.last_update = 0

        self.graph = None
        self.sess = None
        self.observation_ph = None
        self.processed_obs = None
        self.predictor_network = None
        self.target_network = None
        self.params = None
        self.int_reward = None
        self.aux_loss = None
        self.optimizer = None
        self.training_op = None

        if _init_setup_model:
            self.setup_model()
Exemple #18
0
    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN",
              reset_num_timesteps=True, replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                    initial_p=self.prioritized_replay_beta0,
                                                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                              initial_p=self.exploration_initial_eps,
                                              final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_successes = []

            callback.on_training_start(locals(), globals())
            callback.on_rollout_start()

            reset = True
            obs = self.env.reset()
            # Retrieve unnormalized observation for saving into the buffer
            if self._vec_normalize_env is not None:
                obs_ = self._vec_normalize_env.get_original_obs().squeeze()

            for _ in range(total_timesteps):
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, info = self.env.step(env_action)

                self.num_timesteps += 1

                # Stop training if return value is False
                if callback.on_step() is False:
                    break

                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs().squeeze()
                    reward_ = self._vec_normalize_env.get_original_reward().squeeze()
                else:
                    # Avoid changing the original ones
                    obs_, new_obs_, reward_ = obs, new_obs, rew
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs_, action, reward_, new_obs_, float(done))
                if self.expert_exp is not None:
                    self.add_expert_exp()
                obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    obs_ = new_obs_

                if writer is not None:
                    ep_rew = np.array([reward_]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
                                                        self.num_timesteps)

                episode_rewards[-1] += reward_
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:

                    callback.on_rollout_end()
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                               "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(self.batch_size,
                                                               beta=self.beta_schedule.value(self.num_timesteps),
                                                               env=self._vec_normalize_env)
                        (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size,
                                                                                                env=self._vec_normalize_env)
                        weights, batch_idxes = np.ones_like(rewards), None
                    # pytype:enable=bad-unpacking

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                                  dones, weights, sess=self.sess, options=run_options,
                                                                  run_metadata=run_metadata)
                            writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                                  dones, weights, sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                        assert isinstance(self.replay_buffer, PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(batch_idxes, new_priorities)

                    callback.on_rollout_start()

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                    logger.record_tabular("% time spent exploring",
                                          int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

        callback.on_training_end()
        return self
Exemple #19
0
    def exec(self):
        """
        Train a DQN agent on cartpole env
        :param args: (Parsed Arguments) the input arguments
        """
        with tf_utils.make_session(8) as sess:
            # Create the environment
            env = self.env
            # Create all the functions necessary to train the model
            act, train, update_target, _ = deepq.build_train(
                q_func=CustomPolicy,
                ob_space=env.observation_space,
                ac_space=env.action_space,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
                sess=sess,
                double_q=False,
            )
            # Create the replay buffer
            replay_buffer = ReplayBuffer(50000)
            # Exploration epsilon is not scheduled here: it is decayed by self.linear_decay
            # (step-based) when mode_rbed is off, or by self.rb_decay_epsilon (reward-based) after each episode.
            solved_yet = False
            is_solved = False
            steps_so_far = 0
            # Initialize the parameters and copy them to the target network.
            tf_utils.initialize()
            update_target()

            episode_rewards = [0.0]
            obs = env.reset()
            for i in trange(self.episode_count):

                step = 0
                done = False
                while not done:
                    step += 1
                    steps_so_far += 1

                    if not self.mode_rbed:
                        self.linear_decay(step=steps_so_far)
                    # Take action and update exploration to the newest value
                    action = act(obs[None], update_eps=self.epsilon)[0]
                    new_obs, rew, done, _ = env.step(action)
                    # Store transition in the replay buffer.
                    replay_buffer.add(obs, action, rew, new_obs, float(done))
                    obs = new_obs

                    episode_rewards[-1] += rew

                    if done:
                        obs = env.reset()

                        last_reward = episode_rewards[-1]

                        if self.mode_rbed:
                            self.rb_decay_epsilon(current_reward=last_reward)

                        if len(episode_rewards[-101:-1]) == 0:
                            mean_100ep_reward = sum(episode_rewards)/100
                        else:
                            mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                        # is_solved = step > 100 and mean_100ep_reward >= self.env_target

                        # log epsilon
                        self.ex.log_scalar(self.VAL_EPSILON, self.epsilon)

                        # log reward
                        self.ex.log_scalar(self.VAL_REWARD, last_reward)

                        # log average reward
                        self.ex.log_scalar(self.VAL_AVG100, mean_100ep_reward)

                        # log solved at
                        if mean_100ep_reward >= self.env_target and (not solved_yet):
                            solved_yet = True
                            self.ex.log_scalar(self.VAL_SOLVEDAT, i)

                        # For next run
                        episode_rewards.append(0)

                    # Do not train further once solved. Keeping consistent with the original scheme
                    if not solved_yet:
                        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                        if steps_so_far > 1000:
                            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                            train(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                dones,
                                np.ones_like(rewards))
                        # Update target network periodically.
                        if steps_so_far % 1000 == 0:
                            update_target()
Exemple #20
0
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy_tf = self.policy(
                        self.sess, self.observation_space, self.action_space,
                        **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy_tf.obs_ph
                    self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                    self.action_target = self.target_policy_tf.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32,
                                                     shape=(None, ) +
                                                     self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    self.policy_out = policy_out = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # Use two Q-functions to improve performance by reducing overestimation bias
                    qf1, qf2 = self.policy_tf.make_critics(
                        self.processed_obs_ph, self.actions_ph)
                    # Q value when following the current policy
                    qf1_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph, policy_out, reuse=True)

                with tf.variable_scope("target", reuse=False):
                    # Create target networks
                    target_policy_out = self.target_policy_tf.make_actor(
                        self.processed_next_obs_ph)
                    # Target policy smoothing, by adding clipped noise to target actions
                    target_noise = tf.random_normal(
                        tf.shape(target_policy_out),
                        stddev=self.target_policy_noise)
                    target_noise = tf.clip_by_value(target_noise,
                                                    -self.target_noise_clip,
                                                    self.target_noise_clip)
                    # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                    noisy_target_action = tf.clip_by_value(
                        target_policy_out + target_noise, -1, 1)
                    # Q values when following the target policy
                    qf1_target, qf2_target = self.target_policy_tf.make_critics(
                        self.processed_next_obs_ph, noisy_target_action)

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two target Q-Values (clipped Double-Q Learning)
                    min_qf_target = tf.minimum(qf1_target, qf2_target)

                    # Targets for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * min_qf_target)

                    # Compute Q-Function loss
                    qf1_loss = tf.reduce_mean((q_backup - qf1)**2)
                    qf2_loss = tf.reduce_mean((q_backup - qf2)**2)

                    qvalues_losses = qf1_loss + qf2_loss

                    # Policy loss: maximise q value
                    self.policy_loss = policy_loss = -tf.reduce_mean(qf1_pi)

                    # Policy train op
                    # will be called only every n training steps,
                    # where n is the policy delay
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss,
                        var_list=tf_util.get_trainable_vars('model/pi'))
                    self.policy_train_op = policy_train_op

                    # Q Values optimizer
                    qvalues_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    qvalues_params = tf_util.get_trainable_vars(
                        'model/values_fn/')

                    # Q Values and policy target params
                    source_params = tf_util.get_trainable_vars("model/")
                    target_params = tf_util.get_trainable_vars("target/")

                    # Polyak averaging for target variables
                    self.target_ops = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    train_values_op = qvalues_optimizer.minimize(
                        qvalues_losses, var_list=qvalues_params)

                    self.infos_names = ['qf1_loss', 'qf2_loss']
                    # All ops to call during one training step
                    self.step_ops = [
                        qf1_loss, qf2_loss, qf1, qf2, train_values_op
                    ]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars("target/")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
Exemple #21
0
    def initializeExpertBuffer(self, obs_list, acts_list):
        self.expert_buffer = ReplayBuffer(
            sum([len(elem) for elem in acts_list]))

        for i in range(0, len(obs_list)):
            self._initializeExpertBuffer(obs_list[i], acts_list[i])
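The helper _initializeExpertBuffer called above is repository-specific, so the standalone sketch below only illustrates the same sizing and filling idea with a plain ReplayBuffer and the add(obs_t, action, reward, obs_tp1, done) signature used throughout these examples; the trajectory shapes, the import fallback, and the placeholder reward/done values are assumptions rather than part of the original code.

import numpy as np

try:
    from stable_baselines.common.buffers import ReplayBuffer  # newer stable-baselines layout
except ImportError:
    from stable_baselines.deepq.replay_buffer import ReplayBuffer  # older layout

# Hypothetical expert trajectories: one (T, obs_dim) observation array and one
# (T, act_dim) action array per episode; the shapes are illustrative only.
obs_list = [np.random.rand(50, 25) for _ in range(4)]
acts_list = [np.random.rand(50, 4) for _ in range(4)]

# Size the buffer to hold every expert transition, mirroring the sizing used above.
expert_buffer = ReplayBuffer(sum(len(acts) for acts in acts_list))

for obs, acts in zip(obs_list, acts_list):
    # Store consecutive observation pairs with their action; the reward and done
    # values are placeholders, not taken from the original class.
    for t in range(len(acts) - 1):
        expert_buffer.add(obs[t], acts[t], 0.0, obs[t + 1], 0.0)

print(len(expert_buffer))  # number of stored expert transitions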