def fill_expert_buffer(self, num_trajectories=500, max_expert_steps=300):
    """Roll out the environment's built-in expert and store its transitions in a
    separate demonstration buffer (later mixed with online batches during training)."""
    self.pretrained = True
    buffer_size = int(1e6)
    self.demo_buffer = ReplayBuffer(buffer_size)
    obs = self.env.reset()
    i = 0
    last_obs = None
    env_steps = 0
    while i < num_trajectories:
        a = self.env.env.expert(self.env.convert_obs_to_dict(obs))
        last_obs = deepcopy(obs)
        obs, reward, done, info = self.env.step(a)
        env_steps += 1
        if last_obs is not None:
            self.demo_buffer.add(last_obs, a, reward, obs, done)
        # Start the next trajectory once the expert succeeds or hits the step limit
        if info['is_success'] or env_steps > max_expert_steps:
            env_steps = 0
            last_obs = None
            obs = self.env.reset()
            i += 1
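# A minimal usage sketch for the demonstration pre-fill above (hedged: it assumes a
# HER-style flattened-observation env whose inner env exposes an `expert(obs_dict)`
# oracle, which is what the method calls; the env handle below is purely illustrative):
#
#   model = SAC('MlpPolicy', expert_env)
#   model.fill_expert_buffer(num_trajectories=100, max_expert_steps=50)
#   model.learn(total_timesteps=50000)  # with self.pretrained set, batches mix demo data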
def _setup_model(self):
    self._setup_learning_rate()
    obs_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0]
    self.set_random_seed(self.seed)
    self.replay_buffer = ReplayBuffer(self.buffer_size, obs_dim, action_dim)
    self.policy = self.policy_class(self.observation_space, self.action_space,
                                    self.learning_rate, **self.policy_kwargs)
    self._create_aliases()
def initializeExpertBuffer(self, ar_list_obs, ar_list_act):
    """Initialize the expert buffer, sized to hold every expert transition."""
    self.expert_buffer = ReplayBuffer(sum(len(elem) for elem in ar_list_act))
    for i in range(len(ar_list_act)):
        self._initializeExpertBuffer(ar_list_obs[i], ar_list_act[i])
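# `_initializeExpertBuffer` itself does not appear in this file. Below is a hedged,
# hypothetical sketch of what both callers seem to assume (one trajectory of
# observations/actions pushed into `expert_buffer`); the reward of 0.0 and the
# terminal flag on the last transition are assumptions, not the author's code:
def _initializeExpertBuffer(self, obs_traj, act_traj):
    """Hypothetical helper: store one expert trajectory as consecutive transitions."""
    for t in range(len(act_traj) - 1):
        done = float(t == len(act_traj) - 2)
        self.expert_buffer.add(obs_traj[t], act_traj[t], 0.0, obs_traj[t + 1], done)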
def test_extend_uniform():
    nvals = 16
    states = [np.random.rand(2, 2) for _ in range(nvals)]
    actions = [np.random.rand(2) for _ in range(nvals)]
    rewards = [np.random.rand() for _ in range(nvals)]
    newstate = [np.random.rand(2, 2) for _ in range(nvals)]
    done = [np.random.randint(0, 2) for _ in range(nvals)]
    size = 32
    baseline = ReplayBuffer(size)
    ext = ReplayBuffer(size)
    for data in zip(states, actions, rewards, newstate, done):
        baseline.add(*data)
    states, actions, rewards, newstates, done = map(
        np.array, [states, actions, rewards, newstate, done])
    ext.extend(states, actions, rewards, newstates, done)
    assert len(baseline) == len(ext)
    # Check that both buffers hold the same transitions
    for i in range(nvals):
        for j in range(5):
            condition = (baseline.storage[i][j] == ext.storage[i][j])
            if isinstance(condition, np.ndarray):
                # obs and obs_tp1 are arrays
                assert np.all(condition)
            else:
                # action, reward and done are scalars
                assert condition
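# The behaviour the test above relies on: `extend` should be equivalent to calling
# `add` once per row of the batched arrays. A minimal reference sketch, assuming the
# `add(obs_t, action, reward, obs_tp1, done)` signature used throughout this file:
def extend_via_add(buffer, obses_t, actions, rewards, obses_tp1, dones):
    """Fallback `extend`: push each transition of a batch individually."""
    for data in zip(obses_t, actions, rewards, obses_tp1, dones):
        buffer.add(*data)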
def initializeExpertBuffer(self, obs, act):
    for i in range(len(self._models)):
        self._models[i].expert_buffer = ReplayBuffer(
            sum(len(elem) for elem in obs))
        for j in range(len(obs)):
            self._models[i]._initializeExpertBuffer(obs[j], act[j][:, i])
def initDemoBuffer(self, demoDataFile, update_stats=True):
    # set up dims
    '''
    env = self.env
    env.reset()
    obs, _, _, info = env.step(env.action_space.sample())
    dims = {
        'o': obs['observation'].shape[0],
        'u': env.action_space.shape[0],
        'g': obs['desired_goal'].shape[0],
    }
    dims = {'o': 25, 'u': 4, 'g': 3}
    for key, value in info.items():
        value = np.array(value)
        if value.ndim == 0:
            value = value.reshape(1)
        dims['info_{}'.format(key)] = value.shape[0]
    '''
    input_dims = {'o': 25, 'u': 4, 'g': 3}
    buffer_size = int(1E6)
    self.demo_buffer = ReplayBuffer(buffer_size)
    update_stats = True
    T = 50  # max episode steps
    rollout_batch_size = 1
    demoData = np.load(demoDataFile, allow_pickle=True)
    info_keys = [
        key.replace('info_', '') for key in input_dims.keys()
        if key.startswith('info_')
    ]
    info_values = [
        np.empty((T, rollout_batch_size, input_dims['info_' + key]), np.float32)
        for key in info_keys
    ]
    num_demo = np.shape(demoData['obs'])[0]
    # num_demo = 3
    print('===================================================')
    for epsd in range(num_demo):
        print('Filling the demonstration buffer: (' + str(epsd) + "/" + str(num_demo) + ")")
        cat_obs, obs, acts, goals, achieved_goals = [], [], [], [], []
        i = 0
        for transition in range(T):
            obs.append([demoData['obs'][epsd][transition].get('observation')])
            acts.append([demoData['acs'][epsd][transition]])
            goals.append([demoData['obs'][epsd][transition].get('desired_goal')])
            achieved_goals.append([demoData['obs'][epsd][transition].get('achieved_goal')])
            # Flatten the goal-env observation: observation + achieved_goal + desired_goal
            cat_obs.append(np.concatenate(obs[-1] + achieved_goals[-1] + goals[-1]))
            for idx, key in enumerate(info_keys):
                info_values[idx][transition, i] = demoData['info'][epsd][transition][key]
            if transition > 0:
                # def add(self, obs_t, action, reward, obs_tp1, done):
                # Demo transitions are stored with reward and done hard-coded to 1.0
                self.demo_buffer.add(cat_obs[transition - 1], acts[transition - 1], 1.0,
                                     cat_obs[transition], 1.0)
        obs.append([demoData['obs'][epsd][T].get('observation')])
        achieved_goals.append([demoData['obs'][epsd][T].get('achieved_goal')])
        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(info_keys, info_values):
            episode['info_{}'.format(key)] = value
        episode = self.convert_episode_to_batch_major(episode)
        # demo_buffer = ReplayBuffer(buffer_size)
        # self.demo_buffer.store_episode(episode)
        # self.demo_buffer.add(obs_t, action, reward, obs_tp1, done)
        update_stats = False
        if update_stats:
            # Add transitions to the normalizer to normalize the demo data as well
            # (note: update_stats is forced to False above, so this block is skipped)
            episode['o_2'] = episode['o'][:, 1:, :]
            episode['ag_2'] = episode['ag'][:, 1:, :]
            num_normalizing_transitions = self.transitions_in_episode_batch(episode)
            transitions = self.sample_transitions(episode, num_normalizing_transitions)
            o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag']
            transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
            # No need to preprocess the o_2 and g_2 since this is only used for stats
            self.o_stats.update(transitions['o'])
            self.g_stats.update(transitions['g'])
            self.o_stats.recompute_stats()
            self.g_stats.recompute_stats()
        episode.clear()
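# A hedged sketch of a synthetic demonstration file matching the layout the loader
# above reads: `obs` holds T+1 goal-env observation dicts per episode, `acs` holds
# T actions, and `info` holds T per-step info dicts. The dimensions (o=25, u=4, g=3)
# mirror the hard-coded `input_dims`; the file name and info key are illustrative.
import numpy as np

def make_dummy_demo_file(path, num_episodes=2, T=50):
    obs, acs, info = [], [], []
    for _ in range(num_episodes):
        obs.append([{'observation': np.zeros(25),
                     'desired_goal': np.zeros(3),
                     'achieved_goal': np.zeros(3)} for _ in range(T + 1)])
        acs.append([np.zeros(4) for _ in range(T)])
        info.append([{'is_success': 0.0} for _ in range(T)])
    # dict-valued entries are stored as pickled object arrays, hence allow_pickle=True on load
    np.savez_compressed(path, obs=obs, acs=acs, info=info)

# make_dummy_demo_file('dummy_demo.npz')   # then: model.initDemoBuffer('dummy_demo.npz')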
class SAC(OffPolicyRLModel): """ Soft Actor-Critic (SAC) Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor, This implementation borrows code from original implementation (https://github.com/haarnoja/sac) from OpenAI Spinning Up (https://github.com/openai/spinningup) and from the Softlearning repo (https://github.com/rail-berkeley/softlearning/) Paper: https://arxiv.org/abs/1801.01290 Introduction to SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) the discount factor :param learning_rate: (float or callable) learning rate for adam optimizer, the same learning rate will be used for all networks (Q-Values, Actor and Value function) it can be a function of the current progress (from 1 to 0) :param buffer_size: (int) size of the replay buffer :param batch_size: (int) Minibatch size for each gradient update :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1) :param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off. Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value) :param train_freq: (int) Update the model every `train_freq` steps. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_update_interval: (int) update the target network every `target_network_update_freq` steps. :param gradient_steps: (int) How many gradient update after each step :param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto') :param action_noise: (ActionNoise) the action noise type (None by default), this can help for hard exploration problem. Cf DDPG for the different action noise type. :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy) This is not needed for SAC normally but can help exploring when using HER + SAC. This hack was present in the original OpenAI Baselines repo (DDPG + HER) :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard Note: this has no effect on SAC logging for now :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). If None (default), use random seed. Note that if you want completely deterministic results, you must set `n_cpu_tf_sess` to 1. :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations If None, the number of cpu of the current machine will be used. 
""" def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000, learning_starts=100, train_freq=1, batch_size=64, tau=0.005, ent_coef='auto', target_update_interval=1, gradient_steps=1, target_entropy='auto', action_noise=None, random_exploration=0.0, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None): super(SAC, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=SACPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.buffer_size = buffer_size self.learning_rate = learning_rate self.learning_starts = learning_starts self.train_freq = train_freq self.batch_size = batch_size self.tau = tau # In the original paper, same learning rate is used for all networks # self.policy_lr = learning_rate # self.qf_lr = learning_rate # self.vf_lr = learning_rate # Entropy coefficient / Entropy temperature # Inverse of the reward scale self.ent_coef = ent_coef self.target_update_interval = target_update_interval self.gradient_steps = gradient_steps self.gamma = gamma self.action_noise = action_noise self.random_exploration = random_exploration self.value_fn = None self.graph = None self.replay_buffer = None self.sess = None self.tensorboard_log = tensorboard_log self.verbose = verbose self.params = None self.summary = None self.policy_tf = None self.target_entropy = target_entropy self.full_tensorboard_log = full_tensorboard_log self.obs_target = None self.target_policy = None self.actions_ph = None self.rewards_ph = None self.terminals_ph = None self.observations_ph = None self.action_target = None self.next_observations_ph = None self.value_target = None self.step_ops = None self.target_update_op = None self.infos_names = None self.entropy = None self.target_params = None self.learning_rate_ph = None self.processed_obs_ph = None self.processed_next_obs_ph = None self.log_ent_coef = None self.pretrained = False if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale deterministic_action = self.deterministic_action * np.abs( self.action_space.low) return policy.obs_ph, self.actions_ph, None, None, None, deterministic_action ######################################################## def convert_episode_to_batch_major(self, episode): """Converts an episode to have the batch dimension in the major (first) dimension. """ episode_batch = {} for key in episode.keys(): val = np.array(episode[key]).copy() # make inputs batch-major instead of time-major episode_batch[key] = val.swapaxes(0, 1) return episode_batch def transitions_in_episode_batch(self, episode_batch): """Number of transitions in a given episode batch. 
""" shape = episode_batch['u'].shape return shape[0] * shape[1] def fill_expert_buffer(self, num_trajectories=500, max_expert_steps=300): self.pretrained = True buffer_size = int(1e6) self.demo_buffer = ReplayBuffer(buffer_size) obs = self.env.reset() i = 0 last_obs = None env_steps = 0 while i < num_trajectories: a = self.env.env.expert(self.env.convert_obs_to_dict(obs)) last_obs = deepcopy(obs) obs, reward, done, info = self.env.step(a) env_steps += 1 if last_obs is not None: self.demo_buffer.add(last_obs, a, reward, obs, done) if info['is_success'] or env_steps > max_expert_steps: env_steps = 0 last_obs = None obs = self.env.reset() i += 1 def initDemoBuffer(self, demoDataFile, update_stats=True): #setupz dims ''' env = self.env env.reset() obs, _, _, info = env.step(env.action_space.sample()) dims = { 'o': obs['observation'].shape[0], 'u': env.action_space.shape[0], 'g': obs['desired_goal'].shape[0], } dims ={'o': 25, 'u': 4, 'g': 3} for key, value in info.items(): value = np.array(value) if value.ndim == 0: value = value.reshape(1) dims['info_{}'.format(key)] = value.shape[0] ''' input_dims = {'o': 25, 'u': 4, 'g': 3} buffer_size = int(1E6) self.demo_buffer = ReplayBuffer(buffer_size) update_stats = True T = 50 #max_episode steps rollout_batch_size = 1 demoData = np.load(demoDataFile, allow_pickle=True) info_keys = [ key.replace('info_', '') for key in input_dims.keys() if key.startswith('info_') ] info_values = [ np.empty((T, rollout_batch_size, input_dims['info_' + key]), np.float32) for key in info_keys ] num_demo = np.shape(demoData['obs'])[0] #num_demo = 3 print('===================================================') for epsd in range(num_demo): print('Filling the demonstration buffer: (' + str(epsd) + "/" + str(num_demo) + ")") cat_obs, obs, acts, goals, achieved_goals = [], [], [], [], [] i = 0 for transition in range(T): obs.append( [demoData['obs'][epsd][transition].get('observation')]) acts.append([demoData['acs'][epsd][transition]]) goals.append( [demoData['obs'][epsd][transition].get('desired_goal')]) achieved_goals.append( [demoData['obs'][epsd][transition].get('achieved_goal')]) cat_obs.append( np.concatenate(obs[-1] + achieved_goals[-1] + goals[-1])) for idx, key in enumerate(info_keys): info_values[idx][ transition, i] = demoData['info'][epsd][transition][key] if transition > 0: # def add(self, obs_t, action, reward, obs_tp1, done): self.demo_buffer.add(cat_obs[transition - 1], acts[transition - 1], 1.0, cat_obs[transition], 1.0) obs.append([demoData['obs'][epsd][T].get('observation')]) achieved_goals.append( [demoData['obs'][epsd][T].get('achieved_goal')]) episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) for key, value in zip(info_keys, info_values): episode['info_{}'.format(key)] = value episode = self.convert_episode_to_batch_major(episode) #demo_buffer = ReplayBuffer(buffer_size) #self.demo_buffer.store_episode(episode) #self.demo_buffer.add(obs_t, action, reward, obs_tp1, done) update_stats = False if update_stats: # add transitions to normalizer to normalize the demo data as well episode['o_2'] = episode['o'][:, 1:, :] episode['ag_2'] = episode['ag'][:, 1:, :] num_normalizing_transitions = self.transitions_in_episode_batch( episode) transitions = self.sample_transitions( episode, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions[ 'o_2'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og( o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats 
self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() episode.clear() def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probability of actions taken by the policy self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor( self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. 
qf1, qf2, value_fn = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) qf1_pi, qf2_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) # Target entropy is used when learning the entropy coefficient if self.target_entropy == 'auto': # automatically set target entropy if needed self.target_entropy = -np.prod( self.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'): # Default initial value of ent_coef when learned init_value = 1.0 if '_' in self.ent_coef: init_value = float(self.ent_coef.split('_')[1]) assert init_value > 0., "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable( 'log_ent_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32)) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics( self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Target for Q value regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. 
v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) values_params = tf_util.get_trainable_vars( 'model/values_fn') source_params = tf_util.get_trainable_vars( "model/values_fn") target_params = tf_util.get_trainable_vars( "target/values_fn") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize( values_losses, var_list=values_params) self.infos_names = [ 'policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy' ] # All ops to call during one training step self.step_ops = [ policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op ] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize( ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += [ 'ent_coef_loss', 'ent_coef' ] self.step_ops += [ ent_coef_op, ent_coef_loss, self.ent_coef ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) if ent_coef_loss is not None: tf.summary.scalar('ent_coef_loss', ent_coef_loss) tf.summary.scalar('ent_coef', self.ent_coef) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars( "target/values_fn") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all() def _train_step(self, step, writer, learning_rate): # Sample a batch from the replay buffer if self.pretrained: batch = self.samplelicous(self.batch_size, env=self._vec_normalize_env) else: batch = self.replay_buffer.sample(self.batch_size, env=self._vec_normalize_env) batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch feed_dict = { self.observations_ph: batch_obs, self.actions_ph: batch_actions, self.next_observations_ph: batch_next_obs, self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), self.terminals_ph: batch_dones.reshape(self.batch_size, -1), self.learning_rate_ph: learning_rate } # out = [policy_loss, qf1_loss, qf2_loss, # value_loss, qf1, qf2, value_fn, logp_pi, # self.entropy, policy_train_op, 
train_values_op] # Do one gradient step # and optionally compute log for tensorboard if writer is not None: out = self.sess.run([self.summary] + self.step_ops, feed_dict) summary = out.pop(0) writer.add_summary(summary, step) else: out = self.sess.run(self.step_ops, feed_dict) # Unpack to monitor losses and entropy policy_loss, qf1_loss, qf2_loss, value_loss, *values = out # qf1, qf2, value_fn, logp_pi, entropy, *_ = values entropy = values[4] if self.log_ent_coef is not None: ent_coef_loss, ent_coef = values[-2:] return policy_loss, qf1_loss, qf2_loss, value_loss, entropy, ent_coef_loss, ent_coef return policy_loss, qf1_loss, qf2_loss, value_loss, entropy def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC_LR_CYCLED", reset_num_timesteps=True, replay_wrapper=None, lr_cycler=False): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) def cyclic_lr(step, num_cycle_steps=10000, base_lr=5e-4, max_lr=1e-2): mod_step = step % num_cycle_steps half = num_cycle_steps / 2 mod_step_half = mod_step % half pct = mod_step_half / half if mod_step < half: diff = max_lr - base_lr diff = pct * diff return base_lr + diff else: diff = max_lr - base_lr diff = pct * diff return max_lr - diff with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() n_updates = 0 infos_values = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() for step in range(total_timesteps): # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # inferred actions need to be transformed to environment action_space before stepping unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) self.num_timesteps += 1 # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. 
if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, reward # Store transition in the replay buffer. self.replay_buffer.add(obs_, action, reward_, new_obs_, float(done)) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: self.ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: callback.on_rollout_end() mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate if lr_cycler: current_lr = cyclic_lr(step) else: frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) callback.on_rollout_start() episode_rewards[-1] += reward_ if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'eplenmean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] callback.on_training_end() return self def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): if actions is not None: raise 
ValueError("Error: SAC does not have action probabilities.") warnings.warn( "Even though SAC has a Gaussian policy, it cannot return a distribution as it " "is squashed by a tanh before being scaled and outputed.") return None def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) actions = self.policy_tf.step(observation, deterministic=deterministic) actions = actions.reshape( (-1, ) + self.action_space.shape) # reshape to the correct action shape actions = unscale_action( self.action_space, actions) # scale the output for the prediction if not vectorized_env: actions = actions[0] return actions, None def get_parameter_list(self): return (self.params + self.target_params) def save(self, save_path, cloudpickle=False): data = { "learning_rate": self.learning_rate, "buffer_size": self.buffer_size, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "batch_size": self.batch_size, "tau": self.tau, "ent_coef": self.ent_coef if isinstance(self.ent_coef, float) else 'auto', "target_entropy": self.target_entropy, # Should we also store the replay buffer? # this may lead to high memory usage # with all transition inside # "replay_buffer": self.replay_buffer "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "action_noise": self.action_noise, "random_exploration": self.random_exploration, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle) def samplelicous(self, batch_size, env): #print('lets get it doe') #print('batch_size: ' + str(batch_size)) self.demo_batch_size = 32 batch = [] if True: # batch = self.replay_buffer.sample(batch_size - self.demo_batch_size, env=env) transitions = self.replay_buffer.sample(batch_size - self.demo_batch_size, env=env) transitionsDemo = self.demo_buffer.sample(self.demo_batch_size, env=env) #import pickle #pickle.dump( transitions, open( "transitions.p", "wb" ) ) #pickle.dump( transitionsDemo, open( "transitionsDemo.p", "wb" ) ) #hyb_transitions = transitions.copy() hyb_obs = np.concatenate((transitions[0], transitionsDemo[0])) hyb_acts = np.concatenate( (transitions[1], (np.array(transitionsDemo[1])).reshape( self.demo_batch_size, *self.env.action_space.shape))) hyb_rews = np.concatenate((transitions[2], transitionsDemo[2])) hyb_obs_next = np.concatenate((transitions[3], transitionsDemo[3])) hyb_dones = np.concatenate((transitions[4], transitionsDemo[4])) transitions = deepcopy( (hyb_obs, hyb_acts, hyb_rews, hyb_obs_next, hyb_dones)) else: transitions = self.replay_buffer.sample(batch_size, env=env) ''' o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] ''' return transitions
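# What `samplelicous` above does, reduced to a standalone sketch: concatenate a slice
# of demonstration transitions with an online batch so every gradient step sees both.
# The tuple layout (obs, actions, rewards, next_obs, dones) matches ReplayBuffer.sample;
# the per-field reshape of demo actions done in the method is omitted here.
import numpy as np

def mix_batches(online_batch, demo_batch):
    """Concatenate two (obs, actions, rewards, next_obs, dones) tuples along axis 0."""
    return tuple(np.concatenate((online, demo))
                 for online, demo in zip(online_batch, demo_batch))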
class SAC(OffPolicyRLModel): """ Soft Actor-Critic (SAC) Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor, This implementation borrows code from original implementation (https://github.com/haarnoja/sac) from OpenAI Spinning Up (https://github.com/openai/spinningup) and from the Softlearning repo (https://github.com/rail-berkeley/softlearning/) Paper: https://arxiv.org/abs/1801.01290 Introduction to SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) the discount factor :param learning_rate: (float or callable) learning rate for adam optimizer, the same learning rate will be used for all networks (Q-Values, Actor and Value function) it can be a function of the current progress (from 1 to 0) :param buffer_size: (int) size of the replay buffer :param batch_size: (int) Minibatch size for each gradient update :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1) :param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off. Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value) :param train_freq: (int) Update the model every `train_freq` steps. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_update_interval: (int) update the target network every `target_network_update_freq` steps. :param gradient_steps: (int) How many gradient update after each step :param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto') :param action_noise: (ActionNoise) the action noise type (None by default), this can help for hard exploration problem. Cf DDPG for the different action noise type. :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy) This is not needed for SAC normally but can help exploring when using HER + SAC. This hack was present in the original OpenAI Baselines repo (DDPG + HER) :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard Note: this has no effect on SAC logging for now :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). If None (default), use random seed. Note that if you want completely deterministic results, you must set `n_cpu_tf_sess` to 1. :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations If None, the number of cpu of the current machine will be used. 
""" def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000, learning_starts=100, train_freq=1, batch_size=64, tau=0.005, ent_coef='auto', target_update_interval=1, gradient_steps=1, target_entropy='auto', action_noise=None, random_exploration=0.0, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None, pretrained_model=None, config=None): super(SAC, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=SACPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.buffer_size = buffer_size self.learning_rate = learning_rate self.learning_starts = learning_starts self.train_freq = train_freq self.batch_size = batch_size self.tau = tau # In the original paper, same learning rate is used for all networks # self.policy_lr = learning_rate # self.qf_lr = learning_rate # self.vf_lr = learning_rate # Entropy coefficient / Entropy temperature # Inverse of the reward scale self.ent_coef = ent_coef self.target_update_interval = target_update_interval self.gradient_steps = gradient_steps self.gamma = gamma self.action_noise = action_noise self.random_exploration = random_exploration self.value_fn = None self.graph = None self.replay_buffer = None self.sess = None self.tensorboard_log = tensorboard_log self.verbose = verbose self.params = None self.summary = None self.policy_tf = None self.target_entropy = target_entropy self.full_tensorboard_log = full_tensorboard_log self.obs_target = None self.target_policy = None self.actions_ph = None self.rewards_ph = None self.terminals_ph = None self.observations_ph = None self.action_target = None self.next_observations_ph = None self.value_target = None self.step_ops = None self.target_update_op = None self.infos_names = None self.entropy = None self.target_params = None self.learning_rate_ph = None self.processed_obs_ph = None self.processed_next_obs_ph = None self.log_ent_coef = None self.pretrained_model = pretrained_model self.config = config self.data_counter = 0 if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale deterministic_action = unscale_action(self.action_space, self.deterministic_action) return policy.obs_ph, self.actions_ph, deterministic_action def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, config=self.config, pretrained_model=self.pretrained_model, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, config=self.config, pretrained_model=self.pretrained_model, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') 
self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probability of actions taken by the policy self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. qf1, qf2, value_fn = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) self._qf1 = qf1 self._qf2 = qf2 self._value_fn = value_fn qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) # Target entropy is used when learning the entropy coefficient if self.target_entropy == 'auto': # automatically set target entropy if needed self.target_entropy = -np.prod(self.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'): # Default initial value of ent_coef when learned init_value = 1.0 if '_' in self.ent_coef: init_value = float(self.ent_coef.split('_')[1]) assert init_value > 0., "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32)) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics(self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Target for Q value regression q_backup = tf.stop_gradient( self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target ) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1) ** 2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2) ** 2) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = 
(policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup) ** 2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize(policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) # policy_train_op = tf.contrib.layers.optimize_loss( # policy_loss, # None, # self.learning_rate_ph, # "Adam", # variables=tf_util.get_trainable_vars('model/pi'), # summaries=["gradients"], # increment_global_step=False # ) # Value train op value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) values_params = tf_util.get_trainable_vars('model/values_fn') source_params = tf_util.get_trainable_vars("model/values_fn") target_params = tf_util.get_trainable_vars("target/values_fn") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize(values_losses, var_list=values_params) self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy'] # All ops to call during one training step self.step_ops = [policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += ['ent_coef_loss', 'ent_coef'] self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) if ent_coef_loss is not None: tf.summary.scalar('ent_coef_loss', ent_coef_loss) tf.summary.scalar('ent_coef', self.ent_coef) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # for var in tf.trainable_variables(): # tf.summary.histogram(var.name, var) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/values_fn") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) if self.pretrained_model is not None: list_of_vars_to_load = ['cnn_model/BatchNorm/beta', 'cnn_model/BatchNorm/moving_mean', 'cnn_model/BatchNorm/moving_variance', 'cnn_model/c1/w', 'cnn_model/c1/b', 'cnn_model/c2/w', 'cnn_model/c2/b', 'cnn_model/c3/w', 'cnn_model/c3/b', 'cnn_model/fc1/w', 
'cnn_model/fc1/b', 'cnn_model/dense/kernel', 'cnn_model/dense/bias'] def _load_vars(var_dict, ckpt_path): saver = tf.train.Saver(var_list=var_dict) ckpt = tf.train.get_checkpoint_state(ckpt_path) saver.restore(self.sess, ckpt.model_checkpoint_path) all_tensors = [x.op.name for x in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)] # all_tensors = [x.op.name for x in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)] var_dict_pi = {x: self.graph.get_tensor_by_name(f"model/pi/{x}:0") for x in list_of_vars_to_load if f"model/pi/{x}" in all_tensors} var_dict_values_fn ={x: self.graph.get_tensor_by_name(f"model/values_fn/{x}:0") for x in list_of_vars_to_load if f"model/values_fn/{x}" in all_tensors} _load_vars(var_dict_pi, self.pretrained_model) _load_vars(var_dict_values_fn, self.pretrained_model) self.sess.run(target_init_op) self.summary = tf.summary.merge_all() def _train_step(self, step, writer, learning_rate): # Sample a batch from the replay buffer batch = self.replay_buffer.sample(self.batch_size, env=self._vec_normalize_env) batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones, batch_objStates = batch batch_objStates_array = np.array(batch_objStates) feed_dict = { self.observations_ph: batch_obs, self.actions_ph: batch_actions, self.next_observations_ph: batch_next_obs, self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), self.terminals_ph: batch_dones.reshape(self.batch_size, -1), self.learning_rate_ph: learning_rate } # self.get_values_at_state(batch_obs[0]) # out = [policy_loss, qf1_loss, qf2_loss, # value_loss, qf1, qf2, value_fn, logp_pi, # self.entropy, policy_train_op, train_values_op] # Do one gradient step # and optionally compute log for tensorboard if writer is not None: if self.pretrained_model is not None: out = self.sess.run([self.summary] + self.step_ops + [self.policy_tf.model.layer_9], feed_dict) coordinates_dict = {0: "X", 1: "Y", 2:"Theta", 3:"dX", 4:"dY", 5:"dTheta"} num_coords = 6 error = np.linalg.norm(batch_objStates_array - out[-1], axis = 0) for coord in range(num_coords): coord_summary = tf.Summary(value=[tf.Summary.Value(tag=coordinates_dict[coord], simple_value=error[coord])]) writer.add_summary(coord_summary, step) else: out = self.sess.run([self.summary] + self.step_ops, feed_dict) summary = out.pop(0) if (step % 10 == 0): writer.add_summary(summary, step) else: out = self.sess.run(self.step_ops, feed_dict) # Unpack to monitor losses and entropy policy_loss, qf1_loss, qf2_loss, value_loss, *values = out # qf1, qf2, value_fn, logp_pi, entropy, *_ = values entropy = values[4] if self.log_ent_coef is not None: ent_coef_loss, ent_coef = values[-2:] return policy_loss, qf1_loss, qf2_loss, value_loss, entropy, ent_coef_loss, ent_coef return policy_loss, qf1_loss, qf2_loss, value_loss, entropy def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = 
self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() n_updates = 0 infos_values = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() for step in range(total_timesteps): # start_time = time.time() if self.num_timesteps == self.learning_starts: print("START LEARNING") # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step(obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # inferred actions need to be transformed to environment action_space before stepping unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info, objState = self.env.step(unscaled_action) self.num_timesteps += 1 # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() reward_ = self._vec_normalize_env.get_original_reward().squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, reward # Store transition in the replay buffer. 
self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info, objState) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: self.ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() mb_infos_vals = [] # Update policy, critics and target networks # learnStartTime = time.time() for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append(self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # learnEndTime = time.time() - learnStartTime # learnTimeSummary = tf.Summary(value=[tf.Summary.Value(tag='learnTime', simple_value=learnEndTime)]) # writer.add_summary(learnTimeSummary, step) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) callback.on_rollout_start() episode_rewards[-1] += reward_ if done: callback.on_episode_end() if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) # Display training infos if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] # end_time = time.time()-start_time # timeSummary = tf.Summary(value=[tf.Summary.Value(tag='trainTime', simple_value=end_time)]) # writer.add_summary(timeSummary, step) # if ((len(self.env.intersection) + self.env.numCollisions) > 1000): # print("INTERSECTION!!") # break; # else: # print((len(self.env.intersection) + 
self.env.numCollisions)) callback.on_training_end() return self def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): if actions is not None: raise ValueError("Error: SAC does not have action probabilities.") warnings.warn("Even though SAC has a Gaussian policy, it cannot return a distribution as it " "is squashed by a tanh before being scaled and outputed.") return None def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) actions = self.policy_tf.step(observation, deterministic=deterministic) actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape actions = unscale_action(self.action_space, actions) # scale the output for the prediction if not vectorized_env: actions = actions[0] return actions, None def get_value_at_state(self,state): feed_dict = { self.observations_ph: state } value = self.sess.run(self._value_fn, feed_dict=feed_dict) return value def get_qvalues_at_state(self, state): from itertools import product num_actions = self.action_space.shape[0] all_action_coords = [] for a in range(num_actions): action_coords = np.linspace(self.action_space.low[a], self.action_space.high[a], 25) all_action_coords.append(action_coords) all_action_possibilities = np.array(tuple(product(*all_action_coords))) obs = np.tile(state, (len(all_action_possibilities),1)) feed_dict = { self.observations_ph: obs, self.actions_ph: all_action_possibilities, # self.next_observations_ph: batch_next_obs, # self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), # self.terminals_ph: batch_dones.reshape(self.batch_size, -1), # self.learning_rate_ph: learning_rate } q_values_1 = self.sess.run(self._qf1, feed_dict=feed_dict) q_values_2 = self.sess.run(self._qf2, feed_dict=feed_dict) q_value = np.array([q_values_1, q_values_2]).min(axis=0) return q_value, all_action_possibilities def get_parameter_list(self): return (self.params + self.target_params) def save(self, save_path, cloudpickle=False): data = { "learning_rate": self.learning_rate, "buffer_size": self.buffer_size, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "batch_size": self.batch_size, "tau": self.tau, "ent_coef": self.ent_coef if isinstance(self.ent_coef, float) else 'auto', "target_entropy": self.target_entropy, # Should we also store the replay buffer? # this may lead to high memory usage # with all transition inside # "replay_buffer": self.replay_buffer "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "action_noise": self.action_noise, "random_exploration": self.random_exploration, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs, "config": self.config } # if self.pretrained_model is not None: # data["pretrained_model"] = "/Users/marion/mnt/ws/planet/" + self.pretrained_model params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
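# The action grid that `get_qvalues_at_state` evaluates, as a standalone sketch: each
# action dimension is discretised into 25 points and the Q-networks are queried on the
# Cartesian product of those points. The Box bounds in the usage comment are illustrative.
import numpy as np
from itertools import product

def action_grid(low, high, points_per_dim=25):
    """Build the dense grid of candidate actions used when plotting Q-values."""
    axes = [np.linspace(l, h, points_per_dim) for l, h in zip(low, high)]
    return np.array(list(product(*axes)))

# e.g. a 2-D action space in [-1, 1]^2 yields a (625, 2) grid:
# grid = action_grid(np.array([-1., -1.]), np.array([1., 1.]))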
class CustomDQN(DQN): """ Custom version of DQN (DQN). It is adapted from the stable-baselines version. Notable changes: - save replay buffer and restore it while loading """ def __init__(self, save_replay_buffer: bool = True, **kwargs): super(CustomDQN, self).__init__(**kwargs) self.save_replay_buffer = save_replay_buffer def learn( self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None, ): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: if self.replay_buffer and len(self.replay_buffer) > 0: # TODO: maybe substitute with a prioritized buffer to give preference to the transitions added # during continual learning pass else: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0, ) else: if self.replay_buffer and len(self.replay_buffer) > 0: # TODO: maybe substitute with a prioritized buffer to give preference to the transitions added # during continual learning pass else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps, ) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() reset = True obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0.0 else: update_eps = 0.0 # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. 
update_param_noise_threshold = -np.log( 1.0 - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs["reset"] = reset kwargs[ "update_param_noise_threshold"] = update_param_noise_threshold kwargs["update_param_noise_scale"] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) self.num_timesteps += 1 # Stop training if return value is False if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer. self.replay_buffer.add(obs_, action, reward_, new_obs_, float(done)) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward_ if done: maybe_is_success = info.get("is_success") if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert ( self.beta_schedule is not None ), "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps), env=self._vec_normalize_env, ) ( obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes, ) = experience else: ( obses_t, actions, rewards, obses_tp1, dones, ) = self.replay_buffer.sample( self.batch_size, env=self._vec_normalize_env) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) 
if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata, ) writer.add_run_metadata( run_metadata, "step%d" % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, ) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, ) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps assert isinstance( self.replay_buffer, PrioritizedReplayBuffer ), "replay_buffer should be an instance of PrioritizedReplayBuffer: {}".format( type(self.replay_buffer)) self.replay_buffer.update_priorities( batch_idxes, new_priorities) callback.on_rollout_start() if (can_sample and self.num_timesteps > self.learning_starts and self.num_timesteps % self.target_network_update_freq == 0): # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps)), ) logger.dump_tabular() callback.on_training_end() return self def save(self, save_path, cloudpickle=False): if self.save_replay_buffer: data = { "double_q": self.double_q, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "replay_buffer": self.replay_buffer, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs, } else: data = { "double_q": self.double_q, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": 
self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs, } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
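# --- Hypothetical usage sketch for the replay-buffer-preserving save/load above. It assumes the
# stock stable-baselines `load` restores every entry of the `data` dict (including "replay_buffer")
# as an attribute, which is how the upstream BaseRLModel.load behaves; environment id and paths
# are placeholders.
import gym

def continual_learning_demo():
    env = gym.make("CartPole-v1")
    model = CustomDQN(policy="MlpPolicy", env=env, save_replay_buffer=True)
    model.learn(total_timesteps=5000)
    model.save("dqn_with_buffer")            # replay buffer is pickled alongside the weights

    # Later: resume training without starting from an empty buffer
    resumed = CustomDQN.load("dqn_with_buffer", env=env)
    assert len(resumed.replay_buffer) > 0    # transitions from the first run are still there
    resumed.learn(total_timesteps=5000, reset_num_timesteps=False)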
def learn( self, total_timesteps, model_coworker, role, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None, clipping_during_training=True, ): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0, ) else: if self.replay_buffer is None: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert (not self.prioritized_replay ), "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps, ) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() reset = True obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0.0 else: update_eps = 0.0 # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. 
update_param_noise_threshold = -np.log( 1.0 - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs["reset"] = reset kwargs[ "update_param_noise_threshold"] = update_param_noise_threshold kwargs["update_param_noise_scale"] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] turn, speed = None, None if role == "turn": turn = action speed, nothing = model_coworker.predict(np.array(obs)) else: turn, nothing = model_coworker.predict(np.array(obs)) speed = action if clipping_during_training: # check if next state (after action) would be outside of fish tank (CLIPPING) env_state = self.env.get_state() turn_speed = self.env.action([turn, speed]) global_turn = env_state[0][2] + turn_speed[0] coords = np.array([ env_state[0][0] + turn_speed[1] * np.cos(global_turn), env_state[0][1] + turn_speed[1] * np.sin(global_turn), ]) changed = False if coords[0] < -0.49: coords[0] = -0.47 changed = True elif coords[0] > 0.49: coords[0] = 0.47 changed = True if coords[1] < -0.49: coords[1] = -0.47 changed = True elif coords[1] > 0.49: coords[1] = 0.47 changed = True if changed: diff = coords - env_state[0, :2] speed = np.linalg.norm(diff) angles = np.arctan2(diff[1], diff[0]) turn = angles - env_state[0, 2] turn = turn - 2 * np.pi if turn > np.pi else turn turn = turn + 2 * np.pi if turn < -np.pi else turn # convert to DQN output dist_turn = np.abs(self.env.turn_rate_bins - turn) dist_speed = np.abs(self.env.speed_bins - speed) # convert to bins turn = np.argmin(dist_turn, axis=0) speed = np.argmin(dist_speed, axis=0) if role == "turn": action = turn else: action = speed reset = False new_obs, rew, done, info = self.env.step([turn, speed]) self.num_timesteps += 1 # Stop training if return value is False if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer, but change reward to 0 (use it for plot later though) self.replay_buffer.add(obs_, action, 0, new_obs_, float(done)) # Also give transition to model coworker if model_coworker.replay_buffer is None: model_coworker.replay_buffer = ReplayBuffer( self.buffer_size) if role == "turn": model_coworker.replay_buffer.add(obs_, speed, 0, new_obs_, float(done)) else: model_coworker.replay_buffer.add(obs_, turn, 0, new_obs_, float(done)) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward_ if done: maybe_is_success = info.get("is_success") if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if (can_sample and self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0): callback.on_rollout_end() # Minimize the 
error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert (
                            self.beta_schedule is not None
                        ), "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps),
                            env=self._vec_normalize_env,
                        )
                        (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    else:
                        (obses_t, actions, rewards, obses_tp1, dones) = self.replay_buffer.sample(
                            self.batch_size, env=self._vec_normalize_env)
                        # also sample from expert buffer
                        (obses_t_exp, actions_exp, rewards_exp, obses_tp1_exp, dones_exp) = self.expert_buffer.sample(
                            self.batch_size, env=self._vec_normalize_env)
                        weights, batch_idxes = np.ones_like(rewards), None
                        weights_exp, batch_idxes_exp = np.ones_like(rewards_exp), None
                    # pytype:enable=bad-unpacking

                    # NOTE: the expert-augmented batches below are only defined on the
                    # non-prioritized path, so this code assumes prioritized_replay is False.
                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                np.append(obses_t, obses_t_exp, axis=0),
                                np.append(actions, actions_exp.flatten(), axis=0),
                                np.append(rewards, rewards_exp.flatten(), axis=0),
                                np.append(obses_tp1, obses_tp1_exp, axis=0),
                                np.append(obses_tp1, obses_tp1_exp, axis=0),
                                np.append(dones.flatten(), dones_exp.flatten(), axis=0),
                                np.append(weights, weights_exp),
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata,
                            )
                            writer.add_run_metadata(run_metadata, "step%d" % self.num_timesteps)
                        else:
                            # run_options/run_metadata only exist on the profiling branch above,
                            # so they must not be passed here
                            summary, td_errors = self._train_step(
                                np.append(obses_t, obses_t_exp, axis=0),
                                np.append(actions, actions_exp.flatten(), axis=0),
                                np.append(rewards, rewards_exp.flatten(), axis=0),
                                np.append(obses_tp1, obses_tp1_exp, axis=0),
                                np.append(obses_tp1, obses_tp1_exp, axis=0),
                                np.append(dones.flatten(), dones_exp.flatten(), axis=0),
                                np.append(weights, weights_exp),
                                sess=self.sess,
                            )
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(
                            np.append(obses_t, obses_t_exp, axis=0),
                            np.append(actions, actions_exp.flatten(), axis=0),
                            np.append(rewards, rewards_exp.flatten(), axis=0),
                            np.append(obses_tp1, obses_tp1_exp, axis=0),
                            np.append(obses_tp1, obses_tp1_exp, axis=0),
                            np.append(dones.flatten(), dones_exp.flatten(), axis=0),
                            np.append(weights, weights_exp),
                            sess=self.sess,
                        )

                    if self.prioritized_replay:
                        new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                        assert isinstance(self.replay_buffer, PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(batch_idxes, new_priorities)

                    callback.on_rollout_start()

                if (can_sample and self.num_timesteps > self.learning_starts
                        and self.num_timesteps % self.target_network_update_freq == 0):
                    # Update target network periodically.
self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if (self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0): logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps)), ) logger.dump_tabular() callback.on_training_end() return self
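# --- Minimal sketch of the batch mixing used by the expert-augmented _train_step calls above:
# a replay batch and an expert batch are simply concatenated before one TD update, with uniform
# importance weights. Shapes and names are illustrative only.
import numpy as np

def mix_agent_and_expert_batches(agent_batch, expert_batch):
    obs, acts, rews, next_obs, dones = agent_batch
    obs_e, acts_e, rews_e, next_obs_e, dones_e = expert_batch
    mixed = (
        np.append(obs, obs_e, axis=0),
        np.append(acts, acts_e.flatten(), axis=0),
        np.append(rews, rews_e.flatten(), axis=0),
        np.append(next_obs, next_obs_e, axis=0),
        np.append(dones.flatten(), dones_e.flatten(), axis=0),
    )
    weights = np.ones_like(mixed[2])   # uniform weights over the doubled batch
    return mixed, weights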
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy_tf = self.policy( self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy_tf.obs_ph self.processed_next_obs_ph = self.target_policy_tf.processed_obs self.action_target = self.target_policy_tf.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy self.policy_out = policy_out = self.policy_tf.make_actor( self.processed_obs_ph) # Use two Q-functions to improve performance by reducing overestimation bias qf1, qf2 = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph) # Q value when following the current policy qf1_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, reuse=True) with tf.variable_scope("target", reuse=False): # Create target networks target_policy_out = self.target_policy_tf.make_actor( self.processed_next_obs_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal( tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value( target_policy_out + target_noise, -1, 1) # Q values when following the target policy qf1_target, qf2_target = self.target_policy_tf.make_critics( self.processed_next_obs_ph, noisy_target_action) with tf.variable_scope("loss", reuse=False): # Take the min of the two target Q-Values (clipped Double-Q Learning) min_qf_target = tf.minimum(qf1_target, qf2_target) # Targets for Q value regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * min_qf_target) # Compute Q-Function loss qf1_loss = tf.reduce_mean((q_backup - qf1)**2) qf2_loss = tf.reduce_mean((q_backup - qf2)**2) qvalues_losses = qf1_loss + qf2_loss # Policy loss: maximise q value self.policy_loss = policy_loss = -tf.reduce_mean(qf1_pi) # Policy train op # will be called only every n training steps, # where n is the policy delay policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) self.policy_train_op = policy_train_op # Q Values optimizer qvalues_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) qvalues_params = tf_util.get_trainable_vars( 'model/values_fn/') # Q Values and policy target params source_params = tf_util.get_trainable_vars("model/") target_params = 
tf_util.get_trainable_vars("target/") # Polyak averaging for target variables self.target_ops = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] train_values_op = qvalues_optimizer.minimize( qvalues_losses, var_list=qvalues_params) self.infos_names = ['qf1_loss', 'qf2_loss'] # All ops to call during one training step self.step_ops = [ qf1_loss, qf2_loss, qf1, qf2, train_values_op ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
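# --- Stand-alone numpy sketch of the Polyak ("soft") target update built above with tf.assign:
# target <- (1 - tau) * target + tau * source. Parameters are represented as flat arrays here.
import numpy as np

def polyak_update(target_params, source_params, tau=0.005):
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_params, source_params)]

if __name__ == "__main__":
    source = [np.ones(3)]
    target = [np.zeros(3)]
    for _ in range(1000):                       # after roughly 1/tau updates the target catches up
        target = polyak_update(target, source, tau=0.005)
    print(target[0])                            # close to [1., 1., 1.]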
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) if self.replay_buffer and len(self.replay_buffer) > 0: # TODO: maybe substitute with a prioritized buffer to give preference to the transitions added # during continual learning pass else: self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name="terminals") self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name="rewards") self.actions_ph = tf.placeholder( tf.float32, shape=(None, ) + self.action_space.shape, name="actions", ) self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probability of actions taken by the policy ( self.deterministic_action, policy_out, logp_pi, ) = self.policy_tf.make_actor(self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. 
qf1, qf2, value_fn = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True, ) qf1_pi, qf2_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True, ) # Target entropy is used when learning the entropy coefficient if self.target_entropy == "auto": # automatically set target entropy if needed self.target_entropy = -np.prod( self.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith("auto"): # Default initial value of ent_coef when learned init_value = 1.0 if "_" in self.ent_coef: init_value = float(self.ent_coef.split("_")[1]) assert init_value > 0.0, "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable( "log_ent_coef", dtype=tf.float32, initializer=np.log(init_value).astype(np.float32), ) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics( self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Target for Q value regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. 
v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=tf_util.get_trainable_vars("model/pi")) # Value train op value_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) values_params = tf_util.get_trainable_vars( "model/values_fn") source_params = tf_util.get_trainable_vars( "model/values_fn/vf") target_params = tf_util.get_trainable_vars( "target/values_fn/vf") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize( values_losses, var_list=values_params) self.infos_names = [ "policy_loss", "qf1_loss", "qf2_loss", "value_loss", "entropy", ] # All ops to call during one training step self.step_ops = [ policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op, ] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize( ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += [ "ent_coef_loss", "ent_coef" ] self.step_ops += [ ent_coef_op, ent_coef_loss, self.ent_coef, ] # Monitor losses and entropy in tensorboard tf.summary.scalar("policy_loss", policy_loss) tf.summary.scalar("qf1_loss", qf1_loss) tf.summary.scalar("qf2_loss", qf2_loss) tf.summary.scalar("value_loss", value_loss) tf.summary.scalar("entropy", self.entropy) if ent_coef_loss is not None: tf.summary.scalar("ent_coef_loss", ent_coef_loss) tf.summary.scalar("ent_coef", self.ent_coef) tf.summary.scalar("learning_rate", tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars( "target/values_fn/vf") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
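# --- Illustrative numpy sketch (not the TF graph above) of the automatic entropy-coefficient
# adjustment: target entropy defaults to -dim(action_space), and log(ent_coef) is pushed up when
# the policy's entropy is too low (log-probs too high) and down otherwise. The sampled log-probs
# are placeholders.
import numpy as np

action_dim = 4
target_entropy = -float(np.prod((action_dim,)))           # e.g. -4.0, as in the 'auto' branch

log_ent_coef = np.log(1.0)                                # initial ent_coef = 1.0
learning_rate = 3e-4
for logp_pi in np.random.uniform(-6.0, -2.0, size=1000):  # stand-in per-batch mean log-probs
    # gradient of  -log_ent_coef * (logp_pi + target_entropy)  w.r.t. log_ent_coef
    grad = -(logp_pi + target_entropy)
    log_ent_coef -= learning_rate * grad
print(np.exp(log_ent_coef))                               # the adapted entropy temperature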
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() reset = True obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) self.num_timesteps += 1 # Stop training if return value is False if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() reward_ = self._vec_normalize_env.get_original_reward().squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer. 
self.replay_buffer.add(obs_, action, reward_, new_obs_, float(done)) if self.expert_exp is not None: self.add_expert_exp() obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward_ if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(self.num_timesteps), env=self._vec_normalize_env) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size, env=self._vec_normalize_env) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs(td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities(batch_idxes, new_priorities) callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() callback.on_training_end() return self
class CuriosityWrapper(BaseTFWrapper):
    """
    Random Network Distillation (RND) curiosity reward.
    https://arxiv.org/abs/1810.12894

    :param env: (gym.Env) Environment to wrap.
    :param network: (str) Network type. Can be a "cnn" or a "mlp".
    :param intrinsic_reward_weight: (float) Weight for the intrinsic reward.
    :param buffer_size: (int) Size of the replay buffer for predictor training.
    :param train_freq: (int) Frequency of predictor training in steps.
    :param gradient_steps: (int) Number of optimization epochs.
    :param batch_size: (int) Number of samples to draw from the replay buffer per optimization epoch.
    :param learning_starts: (int) Number of steps to wait before training the predictor for the first time.
    :param filter_end_of_episode: (bool) Whether or not to filter end of episode signals (dones).
    :param filter_reward: (bool) Whether or not to filter extrinsic reward from the environment.
    :param norm_obs: (bool) Whether or not to normalize and clip obs for the target/predictor network.
        Note that obs returned will be unaffected.
    :param norm_ext_reward: (bool) Whether or not to normalize extrinsic reward.
    :param gamma: (float) Reward discount factor for intrinsic reward normalization.
    :param learning_rate: (float) Learning rate for the Adam optimizer of the predictor network.
    """

    def __init__(self, env, network: str = "cnn", intrinsic_reward_weight: float = 1.0,
                 buffer_size: int = 65536, train_freq: int = 16384, gradient_steps: int = 4,
                 batch_size: int = 4096, learning_starts: int = 100, filter_end_of_episode: bool = True,
                 filter_reward: bool = False, norm_obs: bool = True, norm_ext_reward: bool = True,
                 gamma: float = 0.99, learning_rate: float = 0.0001, training: bool = True,
                 _init_setup_model=True):
        super().__init__(env, _init_setup_model)

        self.network_type = network
        self.buffer = ReplayBuffer(buffer_size)
        self.train_freq = train_freq
        self.gradient_steps = gradient_steps
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.intrinsic_reward_weight = intrinsic_reward_weight
        self.filter_end_of_episode = filter_end_of_episode
        self.filter_extrinsic_reward = filter_reward
        self.clip_obs = 5
        self.norm_obs = norm_obs
        self.norm_ext_reward = norm_ext_reward
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.training = training
        self.epsilon = 1e-8

        self.int_rwd_rms = RunningMeanStd(shape=(), epsilon=self.epsilon)
        self.ext_rwd_rms = RunningMeanStd(shape=(), epsilon=self.epsilon)
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.int_ret = np.zeros(self.num_envs)  # discounted return for intrinsic reward
        self.ext_ret = np.zeros(self.num_envs)  # discounted return for extrinsic reward

        self.updates = 0
        self.steps = 0
        self.last_action = None
        self.last_obs = None
        self.last_update = 0

        self.graph = None
        self.sess = None
        self.observation_ph = None
        self.processed_obs = None
        self.predictor_network = None
        self.target_network = None
        self.params = None
        self.int_reward = None
        self.aux_loss = None
        self.optimizer = None
        self.training_op = None

        if _init_setup_model:
            self.setup_model()

    def setup_model(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(num_cpu=None, graph=self.graph)

            self.observation_ph, self.processed_obs = observation_input(
                self.venv.observation_space, scale=(self.network_type == "cnn"))

            with tf.variable_scope("target_model"):
                if self.network_type == 'cnn':
                    self.target_network = small_convnet(
                        self.processed_obs, tf.nn.leaky_relu)
                elif self.network_type == 'mlp':
                    self.target_network = tf_layers.mlp(
self.processed_obs, [1024, 512]) self.target_network = tf_layers.linear( self.target_network, "out", 512) else: raise ValueError("Unknown network type {}!".format( self.network_type)) with tf.variable_scope("predictor_model"): if self.network_type == 'cnn': self.predictor_network = tf.nn.relu( small_convnet(self.processed_obs, tf.nn.leaky_relu)) elif self.network_type == 'mlp': self.predictor_network = tf_layers.mlp( self.processed_obs, [1024, 512]) self.predictor_network = tf.nn.relu( tf_layers.linear(self.predictor_network, "pred_fc1", 512)) self.predictor_network = tf_layers.linear( self.predictor_network, "out", 512) with tf.name_scope("loss"): self.int_reward = tf.reduce_mean(tf.square( tf.stop_gradient(self.target_network) - self.predictor_network), axis=1) self.aux_loss = tf.reduce_mean( tf.square( tf.stop_gradient(self.target_network) - self.predictor_network)) with tf.name_scope("train"): self.optimizer = tf.train.AdamOptimizer(self.learning_rate) self.training_op = self.optimizer.minimize(self.aux_loss) self.params = tf.trainable_variables() tf.global_variables_initializer().run(session=self.sess) def reset(self): obs = self.venv.reset() self.last_obs = obs return obs def step_async(self, actions): super().step_async(actions) self.last_action = actions self.steps += self.num_envs def step_wait(self): obs, rews, dones, infos = self.venv.step_wait() self.buffer.extend(self.last_obs, self.last_action, rews, obs, dones) if self.filter_extrinsic_reward: rews = np.zeros(rews.shape) if self.filter_end_of_episode: dones = np.zeros(dones.shape) if self.training: self.obs_rms.update(obs) obs_n = self.normalize_obs(obs) loss = self.sess.run([self.int_reward], {self.observation_ph: obs_n}) if self.training: self._update_ext_reward_rms(rews) self._update_int_reward_rms(loss) intrinsic_reward = np.array(loss) / np.sqrt(self.int_rwd_rms.var + self.epsilon) if self.norm_ext_reward: extrinsic_reward = np.array(rews) / np.sqrt(self.ext_rwd_rms.var + self.epsilon) else: extrinsic_reward = rews reward = np.squeeze(extrinsic_reward + self.intrinsic_reward_weight * intrinsic_reward) if self.training and self.steps > self.learning_starts and self.steps - self.last_update > self.train_freq: self.updates += 1 self.last_update = self.steps self.learn() return obs, reward, dones, infos def close(self): VecEnvWrapper.close(self) def learn(self): total_loss = 0 for _ in range(self.gradient_steps): obs_batch, act_batch, rews_batch, next_obs_batch, done_mask = self.buffer.sample( self.batch_size) obs_batch = self.normalize_obs(obs_batch) test = self.sess.run(self.aux_loss, {self.observation_ph: obs_batch}) train, loss = self.sess.run([self.training_op, self.aux_loss], {self.observation_ph: obs_batch}) total_loss += loss logging.info("Trained predictor. Avg loss: {}".format( total_loss / self.gradient_steps)) def _update_int_reward_rms(self, reward: np.ndarray) -> None: """Update reward normalization statistics.""" self.int_ret = self.gamma * self.int_ret + reward self.int_rwd_rms.update(self.int_ret) def _update_ext_reward_rms(self, reward: np.ndarray) -> None: """Update reward normalization statistics.""" self.ext_ret = self.gamma * self.ext_ret + reward self.ext_rwd_rms.update(self.ext_ret) def normalize_obs(self, obs: np.ndarray) -> np.ndarray: """ Normalize observations using observations statistics. Calling this method does not update statistics. 
""" if self.norm_obs: obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon), -self.clip_obs, self.clip_obs) return obs def get_parameter_list(self): return self.params def save(self, save_path): #os.makedirs(os.path.dirname(save_path), exist_ok=True) #self.saver.save(self.sess, save_path) data = { 'network': self.network_type, 'intrinsic_reward_weight': self.intrinsic_reward_weight, 'buffer_size': self.buffer.buffer_size, 'train_freq': self.train_freq, 'gradient_steps': self.gradient_steps, 'batch_size': self.batch_size, 'learning_starts': self.learning_starts, 'filter_end_of_episode': self.filter_end_of_episode, 'filter_extrinsic_reward': self.filter_extrinsic_reward, 'norm_obs': self.norm_obs, 'norm_ext_reward': self.norm_ext_reward, 'gamma': self.gamma, 'learning_rate': self.learning_rate, 'int_rwd_rms': self.int_rwd_rms, 'ext_rwd_rms': self.ext_rwd_rms, 'obs_rms': self.obs_rms } params_to_save = self.get_parameters() self._save_to_file_zip(save_path, data=data, params=params_to_save)
def setup_model(self): # print("setup model ",self.observation_space.shape) with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy_tf.obs_ph self.processed_next_obs_ph = self.target_policy_tf.processed_obs self.action_target = self.target_policy_tf.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions') self.qvalues_ph = tf.placeholder(tf.float32, shape=(None, self.num_q), name='qvalues') self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph) # Use two Q-functions to improve performance by reducing overestimation bias qfs = self.policy_tf.make_many_critics(self.processed_obs_ph, self.actions_ph, scope="buffer_values_fn", num_q=self.num_q) # Q value when following the current policy self.qfs = qfs self.qfs_pi = self.policy_tf.make_many_critics(self.processed_obs_ph, policy_out, scope="buffer_values_fn", num_q=self.num_q, reuse=True) with tf.variable_scope("target", reuse=False): # Create target networks target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1) # Q values when following the target policy qfs_target = self.target_policy_tf.make_many_critics(self.processed_next_obs_ph, # target_policy_out, noisy_target_action, scope="buffer_values_fn", num_q=self.num_q, reuse=False) self.qfs_target = qfs_target self.qfs_target_no_pi = self.target_policy_tf.make_many_critics( self.processed_obs_ph, self.actions_ph, scope="buffer_values_fn", num_q=self.num_q, reuse=True) with tf.variable_scope("loss", reuse=False): # Take the min of the two target Q-Values (clipped Double-Q Learning) min_qf_target = tf.reduce_mean(qfs_target, axis=0) - self.q_base # min_qf_target = tf.minimum(qf1_target, qf2_target) print("here", min_qf_target.shape) # Targets for Q value regression q_backup = tf.stop_gradient( self.rewards_ph + (1 - self.terminals_ph) * self.gamma * min_qf_target ) self.q_backup = q_backup # Compute Q-Function loss # Method 2 alpha = self.alpha if alpha > 1: alpha = 1. 
/ alpha sign = 1 else: sign = -1 if self.double_type == "inner": qfs = tf.reshape(qfs, (self.num_q, self.batch_size, 2)) qfs = tf.transpose(qfs, [1, 2, 0]) qfs = tf.reshape(qfs, (self.batch_size, 2, self.num_q // 2, 2)) qfs = tf.stack([qfs[:, 0, :, 0], qfs[:, 1, :, 1]], axis=-1) qfs = tf.reshape(qfs, (self.batch_size, self.num_q)) elif self.double_type == "both": qfs = tf.reshape(qfs, (self.num_q, self.batch_size, self.num_q)) qfs = tf.transpose(qfs, [1, 2, 0]) qfs = tf.stack([qfs[:, i, i] for i in range(self.num_q)], axis=-1) qfs = tf.reshape(qfs, (self.batch_size, self.num_q)) # elif self.double_type == "identical": # qfs = tf.transpose(qfs, [1, 0]) diff = self.qvalues_ph - qfs + self.q_base qfs_loss = tf.reduce_mean( tf.nn.leaky_relu(sign * diff, alpha=alpha) ** 2) / alpha qvalues_losses = qfs_loss self.policy_loss = policy_loss = -tf.reduce_mean(self.qfs_pi) # Policy loss: maximise q value # Policy train op # will be called only every n training steps, # where n is the policy delay policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize(policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) self.policy_train_op = policy_train_op # Q Values optimizer qvalues_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) qvalues_params = tf_util.get_trainable_vars('model/values_fn/') + tf_util.get_trainable_vars( 'model/buffer_values_fn/') # Q Values and policy target params source_params = tf_util.get_trainable_vars("model/") target_params = tf_util.get_trainable_vars("target/") # Polyak averaging for target variables # self.target_ops = [ # tf.assign(target, (1 - self.tau) * target + self.tau * source) # for target, source in zip(target_params, source_params) # ] self.target_ops = [ tf.assign(target, (1 - self.tau) ** (self.gradient_steps * 1) * target + (1 - (1 - self.tau) ** (self.gradient_steps * 1)) * source) for target, source in zip(target_params, source_params) ] # self.target_ops = [ # tf.assign(target, source) # for target, source in zip(target_params, source_params) # ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] grads = tf.gradients(qvalues_losses, qvalues_params) grad_norm = tf.linalg.global_norm(grads) # if self.clip_norm is not None: # grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in grads] # train_values_op = qvalues_optimizer.apply_gradients(grads) train_values_op = qvalues_optimizer.minimize(qvalues_losses, var_list=qvalues_params) self.infos_names = ['qfs_loss', 'q_grad_norm'] # All ops to call during one training step self.step_ops = [qfs_loss, grad_norm, qfs, train_values_op] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qfs_loss', qfs_loss) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all() self.memory = EpisodicMemoryTBP(self.buffer_size, state_dim=1, obs_space=self.observation_space, action_shape=self.action_space.shape, q_func=self.qfs_target, repr_func=None, obs_ph=self.processed_next_obs_ph, action_ph=self.actions_ph, sess=self.sess, 
gamma=self.gamma, max_step=self.max_step)
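# --- Small numpy illustration of the asymmetric critic loss constructed above: squaring a
# leaky-ReLU of the TD difference weights one sign of the error by 1 and the other by alpha**2
# (before the overall 1/alpha scale), i.e. an expectile-like asymmetric squared loss. Names are
# local to this sketch.
import numpy as np

def leaky_relu(x, alpha):
    return np.where(x > 0, x, alpha * x)

def asymmetric_q_loss(q_target, q_pred, alpha_raw):
    # mirrors the alpha/sign handling above: alpha > 1 penalizes one side, alpha < 1 the other
    alpha, sign = (1.0 / alpha_raw, 1) if alpha_raw > 1 else (alpha_raw, -1)
    diff = q_target - q_pred
    return np.mean(leaky_relu(sign * diff, alpha) ** 2) / alpha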
class SAC(OffPolicyRLModel):
    """
    Soft Actor-Critic (SAC)
    Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor.
    This implementation borrows code from the original implementation (https://github.com/haarnoja/sac),
    from OpenAI Spinning Up (https://github.com/openai/spinningup)
    and from the Softlearning repo (https://github.com/rail-berkeley/softlearning/).
    Paper: https://arxiv.org/abs/1801.01290
    Introduction to SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html

    :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount factor
    :param learning_rate: (float or callable) learning rate for adam optimizer, the same learning rate
        will be used for all networks (Q-Values, Actor and Value function); it can be a function
        of the current progress (from 1 to 0)
    :param buffer_size: (int) size of the replay buffer
    :param batch_size: (int) Minibatch size for each gradient update
    :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1)
    :param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to inverse of reward
        scale in the original SAC paper.) Controls the exploration/exploitation trade-off.
        Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value)
    :param train_freq: (int) Update the model every `train_freq` steps.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_update_interval: (int) update the target network every `target_update_interval` steps.
    :param gradient_steps: (int) How many gradient updates to perform after each step
    :param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto')
    :param action_noise: (ActionNoise) the action noise type (None by default), this can help
        for hard exploration problems. Cf DDPG for the different action noise types.
    :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy)
        This is not needed for SAC normally but can help exploring when using HER + SAC.
        This hack was present in the original OpenAI Baselines repo (DDPG + HER)
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        Note: this has no effect on SAC logging for now
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations
        If None, the number of cpu of the current machine will be used.
    :param non_vec_env: (env) an alternate, non-vectorized gym environment if needed.
:param action_to_prices_fn: (fn action -> prices) function that takes action and outputs price curve """ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000, learning_starts=100, train_freq=1, batch_size=64, tau=0.005, ent_coef='auto', target_update_interval=1, gradient_steps=1, target_entropy='auto', action_noise=None, random_exploration=0.0, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None, non_vec_env=None, plotter_person_reaction=None, people_reaction_log_dir=None, action_to_prices_fn=lambda x: x): super(SAC, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=SACPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.buffer_size = buffer_size self.learning_rate = learning_rate self.learning_starts = learning_starts self.train_freq = train_freq self.batch_size = batch_size self.tau = tau # In the original paper, same learning rate is used for all networks # self.policy_lr = learning_rate # self.qf_lr = learning_rate # self.vf_lr = learning_rate # Entropy coefficient / Entropy temperature # Inverse of the reward scale self.ent_coef = ent_coef self.target_update_interval = target_update_interval self.gradient_steps = gradient_steps self.gamma = gamma self.action_noise = action_noise self.random_exploration = random_exploration self.value_fn = None self.graph = None self.replay_buffer = None self.sess = None self.tensorboard_log = tensorboard_log self.verbose = verbose self.params = None self.summary = None self.policy_tf = None self.target_entropy = target_entropy self.full_tensorboard_log = full_tensorboard_log self.obs_target = None self.target_policy = None self.actions_ph = None self.rewards_ph = None self.terminals_ph = None self.observations_ph = None self.action_target = None self.next_observations_ph = None self.value_target = None self.step_ops = None self.target_update_op = None self.infos_names = None self.entropy = None self.target_params = None self.learning_rate_ph = None self.processed_obs_ph = None self.processed_next_obs_ph = None self.log_ent_coef = None # lucas added these self.non_vec_env = non_vec_env self.people_reaction_log_dir = people_reaction_log_dir self.plotter_person_reaction = plotter_person_reaction self.action_to_prices_fn = action_to_prices_fn if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale deterministic_action = unscale_action(self.action_space, self.deterministic_action) return policy.obs_ph, self.actions_ph, deterministic_action def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = 
self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probability of actions taken by the policy self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor( self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. qf1, qf2, value_fn = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) qf1_pi, qf2_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) # Target entropy is used when learning the entropy coefficient if self.target_entropy == 'auto': # automatically set target entropy if needed self.target_entropy = -np.prod( self.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'): # Default initial value of ent_coef when learned init_value = 1.0 if '_' in self.ent_coef: init_value = float(self.ent_coef.split('_')[1]) assert init_value > 0., "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable( 'log_ent_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32)) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics( self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Target for Q value regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi) # NOTE: in the original 
implementation, they have an additional # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) values_params = tf_util.get_trainable_vars( 'model/values_fn') source_params = tf_util.get_trainable_vars( "model/values_fn") target_params = tf_util.get_trainable_vars( "target/values_fn") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize( values_losses, var_list=values_params) self.infos_names = [ 'policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy' ] # All ops to call during one training step self.step_ops = [ policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op ] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize( ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += [ 'ent_coef_loss', 'ent_coef' ] self.step_ops += [ ent_coef_op, ent_coef_loss, self.ent_coef ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) if ent_coef_loss is not None: tf.summary.scalar('ent_coef_loss', ent_coef_loss) tf.summary.scalar('ent_coef', self.ent_coef) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars( "target/values_fn") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all() def _train_step(self, step, writer, learning_rate): # Sample a batch from the replay buffer batch = self.replay_buffer.sample(self.batch_size, env=self._vec_normalize_env) batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch feed_dict = { self.observations_ph: batch_obs, self.actions_ph: batch_actions, self.next_observations_ph: batch_next_obs, 
self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), self.terminals_ph: batch_dones.reshape(self.batch_size, -1), self.learning_rate_ph: learning_rate } # out = [policy_loss, qf1_loss, qf2_loss, # value_loss, qf1, qf2, value_fn, logp_pi, # self.entropy, policy_train_op, train_values_op] # Do one gradient step # and optionally compute log for tensorboard if writer is not None: out = self.sess.run([self.summary] + self.step_ops, feed_dict) summary = out.pop(0) writer.add_summary(summary, step) else: out = self.sess.run(self.step_ops, feed_dict) # Unpack to monitor losses and entropy policy_loss, qf1_loss, qf2_loss, value_loss, *values = out # qf1, qf2, value_fn, logp_pi, entropy, *_ = values entropy = values[4] if self.log_ent_coef is not None: ent_coef_loss, ent_coef = values[-2:] return policy_loss, qf1_loss, qf2_loss, value_loss, entropy, ent_coef_loss, ent_coef return policy_loss, qf1_loss, qf2_loss, value_loss, entropy def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None, planning_steps=0): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) # TODO: use builtin log writer instead of this old lib tb_configure(self.tensorboard_log) action_log_csv = self.tensorboard_log + "_actions.csv" action_log_df = pd.DataFrame(columns=np.concatenate(( ["iteration"], ["p" + str(i) for i in range(24)], ["b" + str(i) for i in range(24)], ["e" + str(i) for i in range(24)], ))) action_log_index = 0 steps_in_real_env = 0 person_data_dict = {} if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() n_updates = 0 infos_values = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() for step in range(total_timesteps): # Before training starts, randomly sample actions # from a uniform distribution for better exploration. 
# Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # inferred actions need to be transformed to environment action_space before stepping unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape # if not planning: # new_obs, reward, done, info = self.env.step(unscaled_action) # else: if not self.num_timesteps % (planning_steps + 1): ## TODO: work on this? # if self.num_timesteps ==1: # # form the control # from sklearn.preprocessing import MinMaxScaler # grid_price = self.non_vec_env.prices[self.non_vec_env.day - 1] # scaler = MinMaxScaler(feature_range = (0, 10)) # scaled_grid_price = scaler.fit_transform(np.array(grid_price).reshape(-1, 1)) # scaled_grid_price = np.squeeze(scaled_grid_price) # energy_consumptions = self.non_vec_env._simulate_humans(scaled_grid_price) # person_data_dict["control"] = { # "x" : list(range(8, 18)), # "grid_price" : scaled_grid_price, # "energy_consumption" : energy_consumptions["avg"], # "reward" : self.non_vec_env._get_reward(price = grid_price, energy_consumptions = energy_consumptions), # } # # form the data_dict # if self.num_timesteps in [100, 1000, 9500]: # person_data_dict["Step " + str(self.num_timesteps)] = { # "x" : list(range(8, 18)), # "grid_price" : self.non_vec_env.prices[self.non_vec_env.day - 1], # "action" : unscaled_action, # "energy_consumption" : self.non_vec_env.prev_energy, # "reward" : reward, # } # if self.num_timesteps == 9501 and self.people_reaction_log_dir and self.plotter_person_reaction: # # call the plotting statement # self.plotter_person_reaction(person_data_dict, self.people_reaction_log_dir) new_obs, reward, done, info = self.env.step( unscaled_action) #, step_num = self.num_timesteps) steps_in_real_env += 1 else: print("planning step") new_obs, reward, done, info = self.non_vec_env.planning_step( unscaled_action) # write the action to a csv # if ((not self.num_timesteps % 10) & (self.num_timesteps > 10000)) or self.num_timesteps>19500: # ### get the battery charging # battery_op = {} # total_battery_consumption = np.zeros(24) # total_energy_consumption = np.zeros(24) # for prosumer_name in self.non_vec_env.prosumer_dict: # #Get players response to agent's actions # day = self.non_vec_env.day # price = self.non_vec_env.price # prosumer = self.non_vec_env.prosumer_dict[prosumer_name] # prosumer_battery = prosumer.get_battery_operation(day, price) # prosumer_demand = prosumer.get_response(day, price) # total_battery_consumption += prosumer_battery # total_energy_consumption += prosumer_demand # action_log_df.loc[action_log_index] = np.concatenate( # ([self.num_timesteps], # price, # total_battery_consumption, # total_energy_consumption,)) # action_log_index += 1 # action_log_df.to_csv(action_log_csv) # print("Iteration: " + str(self.num_timesteps)) # Only stop training if return value is False, not when it is None. 
This is for backwards # compatibility with callbacks that have no return statement. callback.update_locals(locals()) if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, reward if not self.num_timesteps % (planning_steps + 1): tb_log_value("reward_in_environment", reward_, steps_in_real_env) # tb_log_value("reward_planning", reward_, self.num_timesteps) self.num_timesteps += 1 # Store transition in the replay buffer. self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: self.ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if self.num_timesteps % 100 == 0 and not np.any( unscaled_action == np.inf): if self.action_to_prices_fn: prices = self.action_to_prices_fn(unscaled_action) # tf_util.log_histogram(writer, "action_vec_hist", unscaled_action, self.num_timesteps, bins=10, flush=False) # tb_log_value("constant_load_price", np.sum(prices), self.num_timesteps) # tf_util.log_vec_as_histogram(writer, "prices", prices, self.num_timesteps, flush=True) if self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) callback.on_rollout_start() episode_rewards[-1] += reward_ if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) # substract 1 as we appended a new term just now num_episodes = len(episode_rewards) - 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean([ 
ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'eplenmean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] callback.on_training_end() return self #, ep_reward #, reward_ def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): if actions is not None: raise ValueError("Error: SAC does not have action probabilities.") warnings.warn( "Even though SAC has a Gaussian policy, it cannot return a distribution as it " "is squashed by a tanh before being scaled and outputed.") return None def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) actions = self.policy_tf.step(observation, deterministic=deterministic) actions = actions.reshape( (-1, ) + self.action_space.shape) # reshape to the correct action shape actions = unscale_action( self.action_space, actions) # scale the output for the prediction if not vectorized_env: actions = actions[0] return actions, None def get_parameter_list(self): return (self.params + self.target_params) def save(self, save_path, cloudpickle=False): data = { "learning_rate": self.learning_rate, "buffer_size": self.buffer_size, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "batch_size": self.batch_size, "tau": self.tau, "ent_coef": self.ent_coef if isinstance(self.ent_coef, float) else 'auto', "target_entropy": self.target_entropy, # Should we also store the replay buffer? # this may lead to high memory usage # with all transition inside # "replay_buffer": self.replay_buffer "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "action_noise": self.action_noise, "random_exploration": self.random_exploration, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } params_to_save = self.get_parameters()
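# NOTE: hypothetical usage sketch for the SAC class above. The MlpPolicy import path and the
# "Pendulum-v0" environment id are assumptions, not taken from this file. tensorboard_log is
# passed explicitly because learn() above derives the action CSV path from it, and
# planning_steps > 0 would additionally require a non_vec_env exposing planning_step().
import gym
from stable_baselines.sac.policies import MlpPolicy  # assumed import path

env = gym.make("Pendulum-v0")
model = SAC(MlpPolicy, env,
            learning_rate=3e-4,
            buffer_size=50000,
            batch_size=64,
            ent_coef='auto',
            tensorboard_log="./sac_pendulum",
            verbose=1)
model.learn(total_timesteps=10000, log_interval=4, planning_steps=0)

obs = env.reset()
action, _ = model.predict(obs, deterministic=True)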
def main(args): """ Train a DQN agent on cartpole env :param args: (Parsed Arguments) the input arguments """ with tf_utils.make_session(8) as sess: # Create the environment env = gym.make("CartPole-v0") # Create all the functions necessary to train the model act, train, update_target, _ = deepq.build_train( q_func=CustomPolicy, ob_space=env.observation_space, ac_space=env.action_space, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), sess=sess ) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. tf_utils.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for step in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(step))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) is_solved = step > 100 and mean_100ep_reward >= 200 if args.no_render and step > args.max_timesteps: break if is_solved: if args.no_render: break # Show off the result env.render() else: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if step > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if step % 1000 == 0: update_target() if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", step) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(step))) logger.dump_tabular()
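# NOTE: hypothetical entry point for main() above. The flag names mirror the attributes the
# function reads (args.no_render, args.max_timesteps); the defaults are assumptions.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Train a DQN agent on CartPole-v0")
    parser.add_argument("--no-render", action="store_true", default=False,
                        help="Do not render the environment and stop after --max-timesteps steps")
    parser.add_argument("--max-timesteps", type=int, default=int(1e5),
                        help="Step budget used when --no-render is set")
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())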
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) #self.replay_buffer = DiscrepancyReplayBuffer(self.buffer_size, scorer=self.policy_tf.get_q_discrepancy) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects if self.recurrent_policy: import inspect policy_tf_args = inspect.signature(self.policy).parameters policy_tf_kwargs = {} if "my_size" in policy_tf_args: policy_tf_kwargs["my_size"] = len(self._get_env_parameters()) if "goal_size" in policy_tf_args: policy_tf_kwargs["goal_size"] = self.env.goal_dim # TODO: need to get this some other way or save it if self.buffer_kwargs is not None: sequence_length = self.buffer_kwargs.get("sequence_length", 1) else: sequence_length = 1 self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, n_batch=self.batch_size, n_steps=sequence_length, **policy_tf_kwargs, **self.policy_kwargs) self.policy_tf_act = self.policy(self.sess, self.observation_space, self.action_space, n_batch=1, **policy_tf_kwargs, **self.policy_kwargs) self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space, n_batch=self.batch_size, n_steps=sequence_length, **policy_tf_kwargs, **self.policy_kwargs) self.dones_ph = self.policy_tf.dones_ph else: self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) if hasattr(self.policy_tf, "extra_phs"): for ph_name in self.policy_tf.extra_phs: if "target_" in ph_name: self.train_extra_phs[ph_name] = getattr(self.target_policy_tf, ph_name.replace("target_", "") + "_ph") else: self.train_extra_phs[ph_name] = getattr(self.policy_tf, ph_name + "_ph") # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy_tf.obs_ph self.processed_next_obs_ph = self.target_policy_tf.processed_obs self.action_target = self.target_policy_tf.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") self.buffer_is_prioritized = self.buffer_type.__name__ in ["PrioritizedReplayBuffer", "RankPrioritizedReplayBuffer"] if self.replay_buffer is None: if self.buffer_is_prioritized: if self.num_timesteps is not None and self.prioritization_starts > self.num_timesteps or self.prioritization_starts > 0: self.replay_buffer = ReplayBuffer(self.buffer_size) else: buffer_kw = {"size": self.buffer_size, "alpha": 0.7} if self.buffer_type.__name__ == "RankPrioritizedReplayBuffer": buffer_kw.update( {"learning_starts": self.prioritization_starts, "batch_size": self.batch_size}) self.replay_buffer = self.buffer_type(**buffer_kw) else: replay_buffer_kw = {"size": self.buffer_size} if self.buffer_kwargs is not None: replay_buffer_kw.update(self.buffer_kwargs) if self.recurrent_policy: replay_buffer_kw["rnn_inputs"] = self.policy_tf.rnn_inputs if hasattr(self.policy_tf, "extra_data_names"): replay_buffer_kw["extra_data_names"] = 
self.policy_tf.extra_data_names self.replay_buffer = self.buffer_type(**replay_buffer_kw) if self.recurrent_policy: self.sequence_length = self.replay_buffer.sequence_length self.scan_length = self.replay_buffer.scan_length assert self.scan_length % self.sequence_length == 0 with tf.variable_scope("model", reuse=False): # Create the policy if self.recurrent_policy: actor_args = inspect.signature(self.policy_tf.make_actor).parameters critic_args = inspect.signature(self.policy_tf.make_critics).parameters actor_kws = {k: v for k, v in self.train_extra_phs.items() if k in actor_args} critic_kws = {k: v for k, v in self.train_extra_phs.items() if k in critic_args} self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph, **actor_kws) self.policy_act = policy_act = self.policy_tf_act.make_actor(reuse=True) # Use two Q-functions to improve performance by reducing overestimation bias qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph, **critic_kws) _, _ = self.policy_tf_act.make_critics(None, self.actions_ph, reuse=True) # Q value when following the current policy qf1_pi, qf2_pi = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, **critic_kws, reuse=True) train_params = [var for var in tf_util.get_trainable_vars("model/pi") if "act" not in var.name] act_params = [var for var in tf_util.get_trainable_vars("model/pi") if "act" in var.name] self.act_ops = [ tf.assign(act, train) for act, train in zip(act_params, train_params) ] else: self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph) # Use two Q-functions to improve performance by reducing overestimation bias qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph) # Q value when following the current policy qf1_pi, qf2_pi = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, reuse=True) with tf.variable_scope("target", reuse=False): if self.recurrent_policy: # Create target networks target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph, **actor_kws, dones=self.dones_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1) # Q values when following the target policy qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph, noisy_target_action, dones=self.dones_ph, **critic_kws) else: # Create target networks target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1) # Q values when following the target policy qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph, noisy_target_action) policy_pre_activation = self.policy_tf.policy_pre_activation if self.full_tensorboard_log: for var in tf_util.get_trainable_vars("model"): tf.summary.histogram(var.name, var) if 
self.recurrent_policy and self.policy_tf.keras_reuse: tf.summary.histogram("rnn/PI state", self.policy_tf.pi_state) tf.summary.histogram("rnn/QF1 state", self.policy_tf.qf1_state) tf.summary.histogram("rnn/QF2 state", self.policy_tf.qf2_state) # TODO: introduce somwehere here the placeholder for history which updates internal state? with tf.variable_scope("loss", reuse=False): # Take the min of the two target Q-Values (clipped Double-Q Learning) min_qf_target = tf.minimum(qf1_target, qf2_target) # Targets for Q value regression q_backup = tf.stop_gradient( self.rewards_ph + (1 - self.terminals_ph) * self.gamma * min_qf_target ) if self.clip_q_target is not None: q_backup = tf.clip_by_value(q_backup, self.clip_q_target[0], self.clip_q_target[1], name="q_backup_clipped") # Compute Q-Function loss if self.buffer_is_prioritized: self.train_extra_phs["is_weights"] = tf.placeholder(tf.float32, shape=(None, 1), name="is_weights") qf1_loss = tf.reduce_mean(self.is_weights_ph * (q_backup - qf1) ** 2) qf2_loss = tf.reduce_mean(self.is_weights_ph * (q_backup - qf2) ** 2) else: qf1_loss = tf.reduce_mean((q_backup - qf1) ** 2) qf2_loss = tf.reduce_mean((q_backup - qf2) ** 2) qvalues_losses = qf1_loss + qf2_loss rew_loss = tf.reduce_mean(qf1_pi) action_loss = self.action_l2_scale * tf.nn.l2_loss(policy_pre_activation) self.policy_loss = policy_loss = -rew_loss + action_loss # Policy loss: maximise q value if hasattr(self.policy_tf, "policy_loss"): tf.summary.scalar("custom_policy_loss", self.policy_tf.policy_loss) self.policy_loss += self.policy_tf.policy_loss policy_loss = self.policy_loss # Policy train op # will be called only every n training steps, # where n is the policy delay policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) policy_vars = tf_util.get_trainable_vars("model/pi") + tf_util.get_trainable_vars("model/shared") policy_train_op = policy_optimizer.minimize(policy_loss, var_list=policy_vars) self.policy_train_op = policy_train_op # Q Values optimizer qvalues_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) qvalues_params = tf_util.get_trainable_vars('model/values_fn/') + tf_util.get_trainable_vars("model/shared/") # Q Values and policy target params source_params = tf_util.get_trainable_vars("model/") target_params = tf_util.get_trainable_vars("target/") if self.recurrent_policy: source_params = [var for var in tf_util.get_trainable_vars("model/") if "act" not in var.name] # Polyak averaging for target variables self.target_ops = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] train_values_op = qvalues_optimizer.minimize(qvalues_losses, var_list=qvalues_params) self.infos_names = ['qf1_loss', 'qf2_loss'] # All ops to call during one training step self.step_ops = [qf1_loss, qf2_loss, qf1, qf2, train_values_op] if hasattr(self.policy_tf, "step_ops"): self.step_ops.extend(self.policy_tf.step_ops) self.policy_step_ops = [self.policy_train_op, self.target_ops, self.policy_loss] if hasattr(self.policy_tf, "policy_step_ops"): self.policy_step_ops.extend(self.policy_tf.policy_step_ops) if self.recurrent_policy and self.policy_tf.save_state: if self.policy_tf.share_lstm: state_objects = [self.policy_tf.state] if self.target_policy_tf.save_target_state: state_objects.append(self.target_policy_tf.state) else: state_objects = 
[self.policy_tf.pi_state, self.policy_tf.qf1_state, self.policy_tf.qf2_state] if self.target_policy_tf.save_target_state: state_objects.extend([self.target_policy_tf.pi_state, self.target_policy_tf.qf1_state, self.target_policy_tf.qf2_state]) self.step_ops.extend(state_objects) # Monitor losses and entropy in tensorboard tf.summary.scalar("rew_loss", rew_loss) tf.summary.scalar("action_loss", action_loss) tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/") if self.full_tensorboard_log: policy_grads = policy_optimizer.compute_gradients(policy_loss) for g in policy_grads: if g[0] is not None and g[1] in policy_vars: tf.summary.histogram("grad-policy/{}".format(g[1].name), g[0]) qf_grads = qvalues_optimizer.compute_gradients(qvalues_losses) for g in qf_grads: if g[0] is not None and g[1] in qvalues_params: tf.summary.histogram("grad-qf/{}".format(g[1].name), g[0]) # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
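# NOTE: illustrative NumPy sketch, not part of the class above. It mirrors how the "loss" scope
# builds the TD3 regression target: clipped Gaussian noise is added to the target policy action
# (target policy smoothing), the noisy action is kept inside the tanh bounds [-1, 1], and the
# backup uses the minimum of the two target critics (clipped double-Q learning). The stand-in
# critic callables below are placeholders, not the real target Q networks.
import numpy as np

def td3_q_backup(rewards, terminals, target_action, qf1_target_fn, qf2_target_fn,
                 gamma=0.99, target_policy_noise=0.2, target_noise_clip=0.5, seed=0):
    rs = np.random.RandomState(seed)
    noise = np.clip(rs.normal(scale=target_policy_noise, size=target_action.shape),
                    -target_noise_clip, target_noise_clip)
    noisy_target_action = np.clip(target_action + noise, -1.0, 1.0)
    min_qf_target = np.minimum(qf1_target_fn(noisy_target_action),
                               qf2_target_fn(noisy_target_action))
    return rewards + (1.0 - terminals) * gamma * min_qf_target

# Example with batch size 4 and constant stand-in critics:
_backup = td3_q_backup(rewards=np.zeros((4, 1)), terminals=np.zeros((4, 1)),
                       target_action=np.zeros((4, 2)),
                       qf1_target_fn=lambda a: np.ones((4, 1)),
                       qf2_target_fn=lambda a: 2.0 * np.ones((4, 1)))
assert np.allclose(_backup, 0.99)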
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.support = tf.constant(np.arange(self.v_min, self.v_max + 1e-6, self.delta), dtype=tf.float32) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") self.projection_ph = tf.placeholder(tf.float32, (None, self.n_spt), name="v_projection") self.q_projection_ph = tf.placeholder(tf.float32, (None, self.n_spt), name="q_projection") with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probability of actions taken by the policy self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. 
qf1_distr, qf2_distr, value_fn_distr = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) qf1_pi_distr, qf2_pi_distr, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) # Target entropy is used when learning the entropy coefficient if self.target_entropy == 'auto': # automatically set target entropy if needed self.target_entropy = -np.prod(self.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'): # Default initial value of ent_coef when learned init_value = 1.0 if '_' in self.ent_coef: init_value = float(self.ent_coef.split('_')[1]) assert init_value > 0., "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32)) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target_distr = self.target_policy.make_critics(self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target_distr = value_target_distr with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) # compute qf_pi, qf2_pi with pdf min_qf_pi_distr = tf.where(tf.less(tf.reduce_mean(qf1_pi_distr * self.support), tf.reduce_mean(qf2_pi_distr * self.support)), qf1_pi_distr, qf2_pi_distr) min_qf_pi = tf.reduce_mean(tf.reduce_sum(min_qf_pi_distr * self.support, axis=-1)) self.min_qf_pi = min_qf_pi q_backup_op = tf.stop_gradient( self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.support ) q_backup_op = tf.clip_by_value(q_backup_op, self.v_min, self.v_max) self.q_backup_op = q_backup_op qf1_loss = -tf.reduce_mean(tf.log(qf1_distr + 1e-12) * tf.stop_gradient(self.projection_ph)) qf2_loss = -tf.reduce_mean(tf.log(qf2_distr + 1e-12) * tf.stop_gradient(self.projection_ph)) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) # to clip policy loss qf_pi = tf.reduce_mean(self.support * min_qf_pi_distr, axis=-1, keepdims=True) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. 
value_loss = -tf.reduce_mean(tf.log(value_fn_distr + 1e-12) * tf.stop_gradient(min_qf_pi_distr)) \ - tf.stop_gradient(tf.reduce_mean(self.ent_coef * logp_pi)) value_fn = tf.reduce_sum(value_fn_distr * self.support, axis=-1) # value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup) ** 2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize(policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) values_params = tf_util.get_trainable_vars('model/values_fn') source_params = tf_util.get_trainable_vars("model/values_fn") target_params = tf_util.get_trainable_vars("target/values_fn") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses qf1, qf2 = tf.reduce_mean(tf.reduce_sum(self.support * qf1_distr, axis=-1)), tf.reduce_mean(tf.reduce_sum(self.support * qf2_distr, axis=-1)) with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize(values_losses, var_list=values_params) self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy'] # All ops to call during one training step self.step_ops = [policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += ['ent_coef_loss', 'ent_coef'] self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) if ent_coef_loss is not None: tf.summary.scalar('ent_coef_loss', ent_coef_loss) tf.summary.scalar('ent_coef', self.ent_coef) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/values_fn") # Initialize Variables and target network self.projection_op = Projection(self.sess, self.graph, self.n_spt, self.v_min, self.v_max, self.delta, self.batch_size) with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
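# NOTE: illustrative sketch of the categorical (C51-style) projection that projection_ph is
# expected to carry; the actual values come from the separate Projection op created above, so
# this NumPy version is an assumption about its behaviour. The Bellman-shifted support
# r + (1 - done) * gamma * z is clipped to [v_min, v_max] (as in q_backup_op), and each atom's
# probability mass is split between the two neighbouring atoms of the fixed support.
import numpy as np

def categorical_projection(rewards, terminals, target_probs, v_min, v_max, n_spt, gamma):
    delta = (v_max - v_min) / (n_spt - 1)
    support = v_min + delta * np.arange(n_spt)                 # fixed atoms z_i
    tz = np.clip(rewards + (1.0 - terminals) * gamma * support, v_min, v_max)
    b = (tz - v_min) / delta                                   # fractional atom index of each T z_j
    lower, upper = np.floor(b).astype(int), np.ceil(b).astype(int)
    projected = np.zeros_like(target_probs)
    for i in range(target_probs.shape[0]):
        for j in range(n_spt):
            if lower[i, j] == upper[i, j]:                     # T z_j falls exactly on an atom
                projected[i, lower[i, j]] += target_probs[i, j]
            else:                                              # split mass between neighbours
                projected[i, lower[i, j]] += target_probs[i, j] * (upper[i, j] - b[i, j])
                projected[i, upper[i, j]] += target_probs[i, j] * (b[i, j] - lower[i, j])
    return projected

# Each projected row remains a probability distribution over the fixed support:
_p = np.full((2, 5), 0.2)
_proj = categorical_projection(np.array([[1.0], [0.0]]), np.zeros((2, 1)), _p,
                               v_min=-10.0, v_max=10.0, n_spt=5, gamma=0.99)
assert np.allclose(_proj.sum(axis=1), 1.0)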
class TD3(OffPolicyRLModel): """ Twin Delayed DDPG (TD3) Addressing Function Approximation Error in Actor-Critic Methods. Original implementation: https://github.com/sfujim/TD3 Paper: https://arxiv.org/pdf/1802.09477.pdf Introduction to TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html :param policy: (TD3Policy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) the discount factor :param learning_rate: (float or callable) learning rate for adam optimizer, the same learning rate will be used for all networks (Q-Values and Actor networks); it can be a function of the current progress (from 1 to 0) :param buffer_size: (int) size of the replay buffer :param batch_size: (int) Minibatch size for each gradient update :param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1) :param policy_delay: (int) Policy and target networks will only be updated once every `policy_delay` training steps. The Q values will be updated `policy_delay` times more often (updated every training step). :param action_noise: (ActionNoise) the action noise type. Cf. DDPG for the different action noise types. :param target_policy_noise: (float) Standard deviation of Gaussian noise added to target policy (smoothing noise) :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise. :param train_freq: (int) Update the model every `train_freq` steps. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param gradient_steps: (int) How many gradient updates to perform after each step :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy) This is not needed for TD3 normally but can help exploring when using HER + TD3. This hack was present in the original OpenAI Baselines repo (DDPG + HER) :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard Note: this has no effect on TD3 logging for now :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). If None (default), use random seed. Note that if you want completely deterministic results, you must set `n_cpu_tf_sess` to 1. :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations. If None, the number of CPUs of the current machine will be used.
""" def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000, buffer_type=ReplayBuffer, buffer_kwargs=None, prioritization_starts=0, beta_schedule=None, learning_starts=100, train_freq=100, gradient_steps=100, batch_size=128, tau=0.005, policy_delay=2, action_noise=None, action_l2_scale=0, target_policy_noise=0.2, target_noise_clip=0.5, random_exploration=0.0, verbose=0, write_freq=1, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None, time_aware=False, reward_transformation=None, clip_q_target=None): super(TD3, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, write_freq=write_freq, policy_base=TD3Policy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.prioritization_starts = prioritization_starts self.beta_schedule = beta_schedule self.buffer_is_prioritized = buffer_type.__name__ in ["PrioritizedReplayBuffer", "RankPrioritizedReplayBuffer"] self.loss_history = None self.buffer_type = buffer_type self.buffer_size = buffer_size self.buffer_kwargs = buffer_kwargs self.learning_rate = learning_rate self.learning_starts = learning_starts self.train_freq = train_freq self.batch_size = batch_size self.tau = tau self.gradient_steps = gradient_steps self.gamma = gamma self.action_noise = action_noise self.action_l2_scale = action_l2_scale self.random_exploration = random_exploration self.policy_delay = policy_delay self.target_noise_clip = target_noise_clip self.target_policy_noise = target_policy_noise self.time_aware = time_aware self.reward_transformation = reward_transformation self.graph = None self.replay_buffer = None self.sess = None self.tensorboard_log = tensorboard_log self.verbose = verbose self.params = None self.summary = None self.policy_tf = None self.full_tensorboard_log = full_tensorboard_log self.obs_target = None self.target_policy_tf = None self.actions_ph = None self.rewards_ph = None self.terminals_ph = None self.observations_ph = None self.action_target = None self.next_observations_ph = None self.is_weights_ph = None self.step_ops = None self.policy_step_ops = None self.target_ops = None self.infos_names = None self.target_params = None self.learning_rate_ph = None self.processed_obs_ph = None self.processed_next_obs_ph = None self.policy_out = None self.policy_train_op = None self.policy_loss = None self.clip_q_target = clip_q_target assert clip_q_target is None or len(clip_q_target) == 2 self.recurrent_policy = getattr(self.policy, "recurrent", False) if self.recurrent_policy: self.policy_tf_act = None self.policy_act = None self.act_ops = None self.dones_ph = None self.sequence_length = None self.scan_length = None self.train_extra_phs = {} self.active_sampling = False if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale policy_out = unscale_action(self.action_space, self.policy_out) return policy.obs_ph, self.actions_ph, policy_out def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) #self.replay_buffer = DiscrepancyReplayBuffer(self.buffer_size, scorer=self.policy_tf.get_q_discrepancy) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects if self.recurrent_policy: import inspect policy_tf_args = inspect.signature(self.policy).parameters 
policy_tf_kwargs = {} if "my_size" in policy_tf_args: policy_tf_kwargs["my_size"] = len(self._get_env_parameters()) if "goal_size" in policy_tf_args: policy_tf_kwargs["goal_size"] = self.env.goal_dim # TODO: need to get this some other way or save it if self.buffer_kwargs is not None: sequence_length = self.buffer_kwargs.get("sequence_length", 1) else: sequence_length = 1 self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, n_batch=self.batch_size, n_steps=sequence_length, **policy_tf_kwargs, **self.policy_kwargs) self.policy_tf_act = self.policy(self.sess, self.observation_space, self.action_space, n_batch=1, **policy_tf_kwargs, **self.policy_kwargs) self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space, n_batch=self.batch_size, n_steps=sequence_length, **policy_tf_kwargs, **self.policy_kwargs) self.dones_ph = self.policy_tf.dones_ph else: self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) if hasattr(self.policy_tf, "extra_phs"): for ph_name in self.policy_tf.extra_phs: if "target_" in ph_name: self.train_extra_phs[ph_name] = getattr(self.target_policy_tf, ph_name.replace("target_", "") + "_ph") else: self.train_extra_phs[ph_name] = getattr(self.policy_tf, ph_name + "_ph") # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy_tf.obs_ph self.processed_next_obs_ph = self.target_policy_tf.processed_obs self.action_target = self.target_policy_tf.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") self.buffer_is_prioritized = self.buffer_type.__name__ in ["PrioritizedReplayBuffer", "RankPrioritizedReplayBuffer"] if self.replay_buffer is None: if self.buffer_is_prioritized: if self.num_timesteps is not None and self.prioritization_starts > self.num_timesteps or self.prioritization_starts > 0: self.replay_buffer = ReplayBuffer(self.buffer_size) else: buffer_kw = {"size": self.buffer_size, "alpha": 0.7} if self.buffer_type.__name__ == "RankPrioritizedReplayBuffer": buffer_kw.update( {"learning_starts": self.prioritization_starts, "batch_size": self.batch_size}) self.replay_buffer = self.buffer_type(**buffer_kw) else: replay_buffer_kw = {"size": self.buffer_size} if self.buffer_kwargs is not None: replay_buffer_kw.update(self.buffer_kwargs) if self.recurrent_policy: replay_buffer_kw["rnn_inputs"] = self.policy_tf.rnn_inputs if hasattr(self.policy_tf, "extra_data_names"): replay_buffer_kw["extra_data_names"] = self.policy_tf.extra_data_names self.replay_buffer = self.buffer_type(**replay_buffer_kw) if self.recurrent_policy: self.sequence_length = self.replay_buffer.sequence_length self.scan_length = self.replay_buffer.scan_length assert self.scan_length % self.sequence_length == 0 with tf.variable_scope("model", reuse=False): # Create the policy if self.recurrent_policy: actor_args = inspect.signature(self.policy_tf.make_actor).parameters critic_args = inspect.signature(self.policy_tf.make_critics).parameters actor_kws = {k: v 
for k, v in self.train_extra_phs.items() if k in actor_args} critic_kws = {k: v for k, v in self.train_extra_phs.items() if k in critic_args} self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph, **actor_kws) self.policy_act = policy_act = self.policy_tf_act.make_actor(reuse=True) # Use two Q-functions to improve performance by reducing overestimation bias qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph, **critic_kws) _, _ = self.policy_tf_act.make_critics(None, self.actions_ph, reuse=True) # Q value when following the current policy qf1_pi, qf2_pi = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, **critic_kws, reuse=True) train_params = [var for var in tf_util.get_trainable_vars("model/pi") if "act" not in var.name] act_params = [var for var in tf_util.get_trainable_vars("model/pi") if "act" in var.name] self.act_ops = [ tf.assign(act, train) for act, train in zip(act_params, train_params) ] else: self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph) # Use two Q-functions to improve performance by reducing overestimation bias qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph) # Q value when following the current policy qf1_pi, qf2_pi = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, reuse=True) with tf.variable_scope("target", reuse=False): if self.recurrent_policy: # Create target networks target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph, **actor_kws, dones=self.dones_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1) # Q values when following the target policy qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph, noisy_target_action, dones=self.dones_ph, **critic_kws) else: # Create target networks target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1) # Q values when following the target policy qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph, noisy_target_action) policy_pre_activation = self.policy_tf.policy_pre_activation if self.full_tensorboard_log: for var in tf_util.get_trainable_vars("model"): tf.summary.histogram(var.name, var) if self.recurrent_policy and self.policy_tf.keras_reuse: tf.summary.histogram("rnn/PI state", self.policy_tf.pi_state) tf.summary.histogram("rnn/QF1 state", self.policy_tf.qf1_state) tf.summary.histogram("rnn/QF2 state", self.policy_tf.qf2_state) # TODO: introduce somwehere here the placeholder for history which updates internal state? 
with tf.variable_scope("loss", reuse=False): # Take the min of the two target Q-Values (clipped Double-Q Learning) min_qf_target = tf.minimum(qf1_target, qf2_target) # Targets for Q value regression q_backup = tf.stop_gradient( self.rewards_ph + (1 - self.terminals_ph) * self.gamma * min_qf_target ) if self.clip_q_target is not None: q_backup = tf.clip_by_value(q_backup, self.clip_q_target[0], self.clip_q_target[1], name="q_backup_clipped") # Compute Q-Function loss if self.buffer_is_prioritized: self.train_extra_phs["is_weights"] = tf.placeholder(tf.float32, shape=(None, 1), name="is_weights") qf1_loss = tf.reduce_mean(self.is_weights_ph * (q_backup - qf1) ** 2) qf2_loss = tf.reduce_mean(self.is_weights_ph * (q_backup - qf2) ** 2) else: qf1_loss = tf.reduce_mean((q_backup - qf1) ** 2) qf2_loss = tf.reduce_mean((q_backup - qf2) ** 2) qvalues_losses = qf1_loss + qf2_loss rew_loss = tf.reduce_mean(qf1_pi) action_loss = self.action_l2_scale * tf.nn.l2_loss(policy_pre_activation) self.policy_loss = policy_loss = -rew_loss + action_loss # Policy loss: maximise q value if hasattr(self.policy_tf, "policy_loss"): tf.summary.scalar("custom_policy_loss", self.policy_tf.policy_loss) self.policy_loss += self.policy_tf.policy_loss policy_loss = self.policy_loss # Policy train op # will be called only every n training steps, # where n is the policy delay policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) policy_vars = tf_util.get_trainable_vars("model/pi") + tf_util.get_trainable_vars("model/shared") policy_train_op = policy_optimizer.minimize(policy_loss, var_list=policy_vars) self.policy_train_op = policy_train_op # Q Values optimizer qvalues_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) qvalues_params = tf_util.get_trainable_vars('model/values_fn/') + tf_util.get_trainable_vars("model/shared/") # Q Values and policy target params source_params = tf_util.get_trainable_vars("model/") target_params = tf_util.get_trainable_vars("target/") if self.recurrent_policy: source_params = [var for var in tf_util.get_trainable_vars("model/") if "act" not in var.name] # Polyak averaging for target variables self.target_ops = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] train_values_op = qvalues_optimizer.minimize(qvalues_losses, var_list=qvalues_params) self.infos_names = ['qf1_loss', 'qf2_loss'] # All ops to call during one training step self.step_ops = [qf1_loss, qf2_loss, qf1, qf2, train_values_op] if hasattr(self.policy_tf, "step_ops"): self.step_ops.extend(self.policy_tf.step_ops) self.policy_step_ops = [self.policy_train_op, self.target_ops, self.policy_loss] if hasattr(self.policy_tf, "policy_step_ops"): self.policy_step_ops.extend(self.policy_tf.policy_step_ops) if self.recurrent_policy and self.policy_tf.save_state: if self.policy_tf.share_lstm: state_objects = [self.policy_tf.state] if self.target_policy_tf.save_target_state: state_objects.append(self.target_policy_tf.state) else: state_objects = [self.policy_tf.pi_state, self.policy_tf.qf1_state, self.policy_tf.qf2_state] if self.target_policy_tf.save_target_state: state_objects.extend([self.target_policy_tf.pi_state, self.target_policy_tf.qf1_state, self.target_policy_tf.qf2_state]) self.step_ops.extend(state_objects) # Monitor losses and entropy in tensorboard 
tf.summary.scalar("rew_loss", rew_loss) tf.summary.scalar("action_loss", action_loss) tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/") if self.full_tensorboard_log: policy_grads = policy_optimizer.compute_gradients(policy_loss) for g in policy_grads: if g[0] is not None and g[1] in policy_vars: tf.summary.histogram("grad-policy/{}".format(g[1].name), g[0]) qf_grads = qvalues_optimizer.compute_gradients(qvalues_losses) for g in qf_grads: if g[0] is not None and g[1] in qvalues_params: tf.summary.histogram("grad-qf/{}".format(g[1].name), g[0]) # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all() def _train_step(self, step, writer, learning_rate, update_policy): # Sample a batch from the replay buffer sample_kw = {} if self.buffer_is_prioritized and self.num_timesteps >= self.prioritization_starts: sample_kw["beta"] = self.beta_schedule(self.num_timesteps) batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones, *batch_extra = self.replay_buffer.sample(self.batch_size, **sample_kw) batch_extra = batch_extra[0] feed_dict = { self.observations_ph: batch_obs, self.actions_ph: batch_actions, self.next_observations_ph: batch_next_obs, self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), self.terminals_ph: batch_dones.reshape(self.batch_size, -1), self.learning_rate_ph: learning_rate } if self.recurrent_policy and self.scan_length > 0: obs_scan = batch_extra.pop("scan_obs") # TODO: ensure that target network gets state calculated for that batch sample by main network, or fix separate target state saving and calculation if self.target_policy_tf.save_target_state: obs_tp1_scan = batch_extra.pop("scan_obs_tp1") for seq_i in range(self.scan_length // self.sequence_length): seq_data_idxs = np.zeros(shape=(self.scan_length,), dtype=np.bool) seq_data_idxs[seq_i * self.sequence_length:(seq_i + 1) * self.sequence_length] = True seq_data_idxs = np.tile(seq_data_idxs, self.batch_size // self.sequence_length) feed_dict_scan = {self.observations_ph: obs_scan[seq_data_idxs]} if self.target_policy_tf.save_target_state: feed_dict_scan[self.next_observations_ph] = obs_tp1_scan[seq_data_idxs] feed_dict_scan.update({self.train_extra_phs[k.replace("scan_", "")]: v[seq_data_idxs] for k, v in batch_extra.items() if "scan_" in k}) if self.policy_tf.share_lstm: state_objects = [self.policy_tf.state] state_names = ["state"] if self.target_policy_tf.save_target_state: state_objects.append(self.target_policy_tf.state) state_names.append("target_state") else: state_objects = [self.policy_tf.pi_state, self.policy_tf.qf1_state, self.policy_tf.qf2_state] state_names = ["pi_state", "qf1_state", "qf2_state"] if self.target_policy_tf.save_target_state: state_objects.extend([self.target_policy_tf.pi_state, self.target_policy_tf.qf1_state, self.target_policy_tf.qf2_state]) state_names.extend(["target_" + state_name for state_name in state_names]) states = self.sess.run(state_objects, feed_dict_scan) updated_states = {k: states[i] for i, k in enumerate(state_names)} batch_extra.update(updated_states) if self.policy_tf.save_state: self.replay_buffer.update_state([(idx[0], idx[1] - 
self.scan_length + self.sequence_length * seq_i) for idx in batch_extra["state_idxs_scan"]], updated_states) if self.recurrent_policy and not self.target_policy_tf.save_target_state: # If target states are not saved to replay buffer and/or computed with scan then set target network hidden state to output from the previous network if self.policy_tf.share_lstm: state_names = ["state"] else: state_names = ["pi_state", "qf1_state", "qf2_state"] batch_extra.update({"target_" + state_name: getattr(self.policy_tf, state_name) for state_name in state_names}) feed_dict.update({v: batch_extra[k] for k, v in self.train_extra_phs.items()}) step_ops = self.step_ops if update_policy: # Update policy and target networks step_ops = step_ops + self.policy_step_ops # Do one gradient step # and optionally compute log for tensorboard if writer is not None: out = self.sess.run([self.summary] + step_ops, feed_dict) summary = out.pop(0) writer.add_summary(summary, step) else: out = self.sess.run(step_ops, feed_dict) if self.recurrent_policy and self.policy_tf.save_state: if self.policy_tf.share_lstm: state_names = ["state"] else: state_names = ["pi_state", "qf1_state", "qf2_state"] if self.target_policy_tf.save_target_state: state_names.extend(["target_" + state_name for state_name in state_names]) states = {k: out[5 + i] for i, k in enumerate(state_names)} self.replay_buffer.update_state(batch_extra["state_idxs"], states) # Unpack to monitor losses qf1_loss, qf2_loss, *_values = out return qf1_loss, qf2_loss def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="TD3", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) last_replay_update = 0 if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) if isinstance(self.train_freq, tuple): # TODO: bug with optuna please FIX self.train_freq = self.train_freq[0] self.gradient_steps = self.gradient_steps[0] with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() n_updates = 0 infos_values = [] self.active_sampling = False initial_step = self.num_timesteps episode_data = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() if self.buffer_is_prioritized and \ ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "ReplayBuffer") or (replay_wrapper is None and self.replay_buffer.__name__ == "ReplayBuffer")) \ and self.num_timesteps >= self.prioritization_starts: self._set_prioritized_buffer() if self.recurrent_policy: done = False policy_state = self.policy_tf_act.initial_state prev_policy_state = self.policy_tf_act.initial_state # Keep track of this so it doesnt have to be recalculated when saving it to replay buffer for step in range(initial_step, total_timesteps): # Before training starts, randomly sample actions # from a uniform distribution for better exploration. 
# Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: if self.recurrent_policy: action, policy_state = self.policy_tf_act.step(obs[None], state=policy_state, mask=np.array(done)[None]) action = action.flatten() else: action = self.policy_tf.step(obs[None]).flatten() # Add noise to the action, as the policy # is deterministic, this is required for exploration if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) self.num_timesteps += 1 # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() reward_ = self._vec_normalize_env.get_original_reward().squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, reward if self.reward_transformation is not None: reward = self.reward_transformation(reward) # Store transition in the replay buffer. extra_data = {} if self.time_aware: bootstrap = True if done: info_time_limit = info.get("TimeLimit.truncated", None) bootstrap = info.get("termination", None) == "steps" or \ (info_time_limit is not None and info_time_limit) extra_data["bootstrap"] = bootstrap if hasattr(self.policy, "collect_data"): if self.recurrent_policy: extra_data.update(self.policy_tf_act.collect_data(locals(), globals())) if self.policy_tf.save_target_state: extra_data.update({"target_" + state_name: self.target_policy_tf.initial_state[0, :] for state_name in (["state"] if self.target_policy_tf.share_lstm else ["pi_state", "qf1_state", "qf2_state"])}) else: extra_data.update(self.policy_tf.collect_data(locals(), globals())) self.replay_buffer.add(obs, action, reward, new_obs, done, **extra_data) # Extra data must be sent as kwargs to support separate bootstrap and done signals (needed for HER style algorithms) episode_data.append({"obs": obs, "action": action, "reward": reward, "obs_tp1": new_obs, "done": done, **extra_data}) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "RankPrioritizedReplayBuffer")\ or self.replay_buffer.__name__ == "RankPrioritizedReplayBuffer") and \ self.num_timesteps % self.buffer_size == 0: self.replay_buffer.rebalance() # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None and self.num_timesteps >= self.learning_starts: self.ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, 
self.num_timesteps) if self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - self.num_timesteps / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) # Note: the policy is updated less frequently than the Q functions # this is controlled by the `policy_delay` parameter step_writer = writer if grad_step % self.write_freq == 0 else None mb_infos_vals.append( self._train_step(step, step_writer, current_lr, (step + grad_step) % self.policy_delay == 0)) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) callback.on_rollout_start() episode_rewards[-1] += reward if self.recurrent_policy: prev_policy_state = policy_state if done: if isinstance(self.replay_buffer, DiscrepancyReplayBuffer) and n_updates - last_replay_update >= 5000: self.replay_buffer.update_priorities() last_replay_update = n_updates if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): if self.active_sampling: sample_obs, sample_state = self.env.get_random_initial_states(25) obs_discrepancies = self.policy_tf.get_q_discrepancy(sample_obs) obs = self.env.reset(**sample_state[np.argmax(obs_discrepancies)]) else: obs = self.env.reset() episode_data = [] episode_rewards.append(0.0) if self.recurrent_policy: prev_policy_state = self.policy_tf_act.initial_state maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 if self.buffer_is_prioritized and \ ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "ReplayBuffer") or (replay_wrapper is None and self.replay_buffer.__name__ == "ReplayBuffer"))\ and self.num_timesteps >= self.prioritization_starts: self._set_prioritized_buffer() # Display training infos if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] callback.on_training_end() return self def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): _ = np.array(observation) if actions is not None: raise 
ValueError("Error: TD3 does not have action probabilities.") # here there are no action probabilities, as DDPG does not use a probability distribution warnings.warn("Warning: action probability is meaningless for TD3. Returning None") return None def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) if self.recurrent_policy: actions, state = self.policy_tf_act.step(observation, state=state, mask=mask) else: actions = self.policy_tf.step(observation, mask=mask) state = None if self.action_noise is not None and not deterministic: actions = np.clip(actions + self.action_noise(), -1, 1) actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape actions = unscale_action(self.action_space, actions) # scale the output for the prediction if not vectorized_env: actions = actions[0] return actions, state def _get_env(self): env = self.env env = env.env return env def _get_env_parameters(self): return np.zeros((37,)) if isinstance(self.env, HERGoalEnvWrapper): return self.env.env.get_simulator_parameters() else: return self.env.get_simulator_parameters() def _set_prioritized_buffer(self): buffer_kw = {"size": self.buffer_size, "alpha": 0.7} if self.buffer_type.__name__ == "RankPrioritizedReplayBuffer": buffer_kw.update({"learning_starts": self.prioritization_starts, "batch_size": self.batch_size}) r_buf = self.buffer_type(**buffer_kw) for i, transition in enumerate(self.replay_buffer._storage): r_buf.add(*transition) r_buf.update_priorities([i], self.policy_tf.get_q_discrepancy(transition[0])[0]) if r_buf.__name__ == "RankPrioritizedReplayBuffer": r_buf.rebalance() if isinstance(self.replay_buffer, HindsightExperienceReplayWrapper): self.replay_buffer.replay_buffer = r_buf else: self.replay_buffer = r_buf self.learning_rate = get_schedule_fn(self.learning_rate(1) / 4) # TODO: will not work with non-constant self.beta_schedule = get_schedule_fn(self.beta_schedule) print("Enabled prioritized replay buffer") def get_parameter_list(self): return (self.params + self.target_params) def save(self, save_path, cloudpickle=False, save_replay_buffer=False): data = { "learning_rate": self.learning_rate, "buffer_size": self.buffer_size, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "batch_size": self.batch_size, "tau": self.tau, # Should we also store the replay buffer? # this may lead to high memory usage # with all transition inside "policy_delay": self.policy_delay, "target_noise_clip": self.target_noise_clip, "target_policy_noise": self.target_policy_noise, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "action_noise": self.action_noise, "random_exploration": self.random_exploration, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs, "num_timesteps": self.num_timesteps, "buffer_type": self.buffer_type, "buffer_kwargs": self.buffer_kwargs } if save_replay_buffer: data["replay_buffer"] = self.replay_buffer params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] obs = self.env.reset() obs_hdqn_old = None action_hdqn = None reset = True F = 0 for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # Check if agent is busy or idle OBS_IS_IDLE = True if (OBS_IS_IDLE): if not reset: # Store HDQN transition self.replay_buffer.add(obs_hdqn_old, action_hdqn, F, obs, float(done)) # Select new goal for the agent using the current Q function action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action # Update bookkeepping for next HDQN buffer update obs_hdqn_old = obs action_hdqn = env_action F = 0. 
else: # Agent is busy, so select a dummy action (it will be ignored anyway) env_action = 0 reset = False new_obs, rew, done, info = self.env.step(env_action) F = F + rew if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: # Store HDQN transition self.replay_buffer.add(obs_hdqn_old, action_hdqn, F, obs, float(done)) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() self.num_timesteps += 1 return self
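# Standalone sketch of the transition bookkeeping used in the hierarchical loop above:
# while the agent is "busy" rewards are accumulated into F, and a single transition is
# stored only at the next decision point or at episode end. The env, choose_goal(),
# is_idle() and buffer objects are hypothetical stand-ins, not part of the code above.
def run_macro_action_loop(env, choose_goal, is_idle, buffer, n_steps=1000):
    obs = env.reset()
    obs_old, action_old, F = None, None, 0.0
    for _ in range(n_steps):
        if is_idle(obs):
            if obs_old is not None:
                # Store the accumulated reward F for the previous macro-action
                buffer.add(obs_old, action_old, F, obs, False)
            obs_old, action_old, F = obs, choose_goal(obs), 0.0
            action = action_old
        else:
            action = 0  # dummy action while busy; ignored downstream in the sketch
        obs, reward, done, _ = env.step(action)
        F += reward
        if done:
            if obs_old is not None:
                buffer.add(obs_old, action_old, F, obs, True)
            obs, obs_old, action_old, F = env.reset(), None, None, 0.0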
class TD3(BaseRLModel): """ Twin Delayed DDPG (TD3) Addressing Function Approximation Error in Actor-Critic Methods. Original implementation: https://github.com/sfujim/TD3 Paper: https://arxiv.org/abs/1802.09477 Introduction to TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html :param policy: (TD3Policy or str) The policy model to use (MlpPolicy, CnnPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param buffer_size: (int) size of the replay buffer :param learning_rate: (float or callable) learning rate for the Adam optimizer; the same learning rate is used for all networks (Q-Value and Actor networks). It can be a function of the current progress (from 1 to 0). :param policy_delay: (int) The policy and target networks are only updated once every policy_delay training steps; the Q values are updated policy_delay times more often (every training step). :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param gamma: (float) the discount factor :param batch_size: (int) Minibatch size for each gradient update :param train_freq: (int) Update the model every `train_freq` steps. :param gradient_steps: (int) How many gradient updates to perform after each rollout step :param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1) :param action_noise: (ActionNoise) the action noise type. See common.noise for the different action noise types. :param target_policy_noise: (float) Standard deviation of the Gaussian noise added to the target policy (smoothing noise) :param target_noise_clip: (float) Limit for the absolute value of the target policy smoothing noise. :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. 

(Only available when passing string for the environment) :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param seed: (int) Seed for the pseudo random generators :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ def __init__(self, policy, env, buffer_size=int(1e6), learning_rate=1e-3, policy_delay=2, learning_starts=100, gamma=0.99, batch_size=100, train_freq=-1, gradient_steps=-1, n_episodes_rollout=1, tau=0.005, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, tensorboard_log=None, create_eval_env=False, policy_kwargs=None, verbose=0, seed=None, _init_setup_model=True): super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, create_eval_env=create_eval_env, seed=seed) self.buffer_size = buffer_size self.learning_rate = learning_rate self.learning_starts = learning_starts self.train_freq = train_freq self.gradient_steps = gradient_steps self.n_episodes_rollout = n_episodes_rollout self.batch_size = batch_size self.tau = tau self.gamma = gamma self.action_noise = action_noise self.policy_delay = policy_delay self.target_noise_clip = target_noise_clip self.target_policy_noise = target_policy_noise if _init_setup_model: self._setup_model() def _setup_model(self): self._setup_learning_rate() obs_dim, action_dim = self.observation_space.shape[ 0], self.action_space.shape[0] self.set_random_seed(self.seed) self.replay_buffer = ReplayBuffer(self.buffer_size, obs_dim, action_dim) self.policy = self.policy_class(self.observation_space, self.action_space, self.learning_rate, **self.policy_kwargs) self._create_aliases() def _create_aliases(self): self.actor = self.policy.actor self.actor_target = self.policy.actor_target self.critic = self.policy.critic self.critic_target = self.policy.critic_target def predict(self, observation, state=None, mask=None, deterministic=True): """ Get the model's action from an observation :param observation: (np.ndarray) the input observation :param state: (np.ndarray) The last states (can be None, used in recurrent policies) :param mask: (np.ndarray) The last masks (can be None, used in recurrent policies) :param deterministic: (bool) Whether or not to return deterministic actions. :return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) """ return self.unscale_action( self.actor(np.array(observation).reshape(1, -1)).numpy()) @tf.function def critic_loss(self, obs, action, next_obs, done, reward): # Select action according to policy and add clipped noise noise = tf.random.normal(shape=action.shape) * self.target_policy_noise noise = tf.clip_by_value(noise, -self.target_noise_clip, self.target_noise_clip) next_action = tf.clip_by_value( self.actor_target(next_obs) + noise, -1., 1.) 
# Compute the target Q value target_q1, target_q2 = self.critic_target(next_obs, next_action) target_q = tf.minimum(target_q1, target_q2) target_q = reward + tf.stop_gradient( (1 - done) * self.gamma * target_q) # Get current Q estimates current_q1, current_q2 = self.critic(obs, action) # Compute critic loss return tf.keras.losses.MSE(current_q1, target_q) + tf.keras.losses.MSE( current_q2, target_q) @tf.function def actor_loss(self, obs): return -tf.reduce_mean(self.critic.q1_forward(obs, self.actor(obs))) @tf.function def update_targets(self): self.critic_target.soft_update(self.critic, self.tau) self.actor_target.soft_update(self.actor, self.tau) def train(self, gradient_steps, batch_size=100, policy_delay=2): # self._update_learning_rate() for gradient_step in range(gradient_steps): # Sample replay buffer obs, action, next_obs, done, reward = self.replay_buffer.sample( batch_size) with tf.GradientTape() as critic_tape: critic_tape.watch(self.critic.trainable_variables) critic_loss = self.critic_loss(obs, action, next_obs, done, reward) # Optimize the critic grads_critic = critic_tape.gradient( critic_loss, self.critic.trainable_variables) self.critic.optimizer.apply_gradients( zip(grads_critic, self.critic.trainable_variables)) # Delayed policy updates if gradient_step % policy_delay == 0: with tf.GradientTape() as actor_tape: actor_tape.watch(self.actor.trainable_variables) # Compute actor loss actor_loss = self.actor_loss(obs) # Optimize the actor grads_actor = actor_tape.gradient( actor_loss, self.actor.trainable_variables) self.actor.optimizer.apply_gradients( zip(grads_actor, self.actor.trainable_variables)) # Update the frozen target models self.update_targets() def learn(self, total_timesteps, callback=None, log_interval=4, eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="TD3", reset_num_timesteps=True): timesteps_since_eval, episode_num, evaluations, obs, eval_env = self._setup_learn( eval_env) while self.num_timesteps < total_timesteps: if callback is not None: # Only stop training if return value is False, not when it is None. if callback(locals(), globals()) is False: break rollout = self.collect_rollouts( self.env, n_episodes=self.n_episodes_rollout, n_steps=self.train_freq, action_noise=self.action_noise, deterministic=False, callback=None, learning_starts=self.learning_starts, num_timesteps=self.num_timesteps, replay_buffer=self.replay_buffer, obs=obs, episode_num=episode_num, log_interval=log_interval) # Unpack episode_reward, episode_timesteps, n_episodes, obs = rollout episode_num += n_episodes self.num_timesteps += episode_timesteps timesteps_since_eval += episode_timesteps self._update_current_progress(self.num_timesteps, total_timesteps) if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: gradient_steps = self.gradient_steps if self.gradient_steps > 0 else episode_timesteps self.train(gradient_steps, batch_size=self.batch_size, policy_delay=self.policy_delay) # Evaluate the agent timesteps_since_eval = self._eval_policy(eval_freq, eval_env, n_eval_episodes, timesteps_since_eval, deterministic=True) return self def collect_rollouts(self, env, n_episodes=1, n_steps=-1, action_noise=None, deterministic=False, callback=None, learning_starts=0, num_timesteps=0, replay_buffer=None, obs=None, episode_num=0, log_interval=None): """ Collect rollout using the current policy (and possibly fill the replay buffer) TODO: move this method to off-policy base class. 
:param env: (VecEnv) :param n_episodes: (int) :param n_steps: (int) :param action_noise: (ActionNoise) :param deterministic: (bool) :param callback: (callable) :param learning_starts: (int) :param num_timesteps: (int) :param replay_buffer: (ReplayBuffer) :param obs: (np.ndarray) :param episode_num: (int) :param log_interval: (int) """ episode_rewards = [] total_timesteps = [] total_steps, total_episodes = 0, 0 assert isinstance(env, VecEnv) assert env.num_envs == 1 while total_steps < n_steps or total_episodes < n_episodes: done = False # Reset environment: not needed for VecEnv # obs = env.reset() episode_reward, episode_timesteps = 0.0, 0 while not done: # Select action randomly or according to policy if num_timesteps < learning_starts: # Warmup phase unscaled_action = np.array([self.action_space.sample()]) else: unscaled_action = self.predict(obs) # Rescale the action from [low, high] to [-1, 1] scaled_action = self.scale_action(unscaled_action) # Add noise to the action (improve exploration) if action_noise is not None: scaled_action = np.clip(scaled_action + action_noise(), -1, 1) # Rescale and perform action new_obs, reward, done, infos = env.step( self.unscale_action(scaled_action)) done_bool = [float(done[0])] episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper self._update_info_buffer(infos) # Store data in replay buffer if replay_buffer is not None: replay_buffer.add(obs, new_obs, scaled_action, reward, done_bool) obs = new_obs num_timesteps += 1 episode_timesteps += 1 total_steps += 1 if 0 < n_steps <= total_steps: break if done: total_episodes += 1 episode_rewards.append(episode_reward) total_timesteps.append(episode_timesteps) if action_noise is not None: action_noise.reset() # Display training infos if self.verbose >= 1 and log_interval is not None and ( episode_num + total_episodes) % log_interval == 0: fps = int(num_timesteps / (time.time() - self.start_time)) logger.logkv("episodes", episode_num + total_episodes) if len(self.ep_info_buffer) > 0 and len( self.ep_info_buffer[0]) > 0: logger.logkv( 'ep_rew_mean', self.safe_mean([ ep_info['r'] for ep_info in self.ep_info_buffer ])) logger.logkv( 'ep_len_mean', self.safe_mean([ ep_info['l'] for ep_info in self.ep_info_buffer ])) # logger.logkv("n_updates", n_updates) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - self.start_time)) logger.logkv("total timesteps", num_timesteps) logger.dumpkvs() mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 return mean_reward, total_steps, total_episodes, obs
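# For reference, a minimal sketch of the [low, high] <-> [-1, 1] rescaling that
# scale_action/unscale_action are assumed to perform in the rollout loop above,
# given gym.spaces.Box bounds. This is an illustrative reconstruction, not the
# exact helper implementations.
import numpy as np

def scale_action(action_space, action):
    # Map an action from [low, high] to the tanh range [-1, 1].
    low, high = action_space.low, action_space.high
    return 2.0 * (action - low) / (high - low) - 1.0

def unscale_action(action_space, scaled_action):
    # Map an action from [-1, 1] back to the environment bounds [low, high].
    low, high = action_space.low, action_space.high
    return low + 0.5 * (scaled_action + 1.0) * (high - low)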
def exec(self): """ Train a DQN agent on cartpole env :param args: (Parsed Arguments) the input arguments """ with tf_utils.make_session(8) as sess: # Create the environment env = self.env # Create all the functions necessary to train the model act, train, update_target, _ = deepq.build_train( q_func=CustomPolicy, ob_space=env.observation_space, ac_space=env.action_space, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), sess=sess, double_q = False, ) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). solved_yet = False is_solved = False steps_so_far = 0 # Initialize the parameters and copy them to the target network. tf_utils.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for i in trange(self.episode_count): step = 0 done = False while not done: step += 1 steps_so_far += 1 if not self.mode_rbed: self.linear_decay(step=steps_so_far) # Take action and update exploration to the newest value action = act(obs[None], update_eps=self.epsilon)[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() last_reward = episode_rewards[-1] if self.mode_rbed: self.rb_decay_epsilon(current_reward=last_reward) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = sum(episode_rewards)/100 else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) # is_solved = step > 100 and mean_100ep_reward >= self.env_target # log epsilon self.ex.log_scalar(self.VAL_EPSILON, self.epsilon) # log reward self.ex.log_scalar(self.VAL_REWARD, last_reward) # log average reward self.ex.log_scalar(self.VAL_AVG100, mean_100ep_reward) # log solved at if mean_100ep_reward >= self.env_target and (not solved_yet): solved_yet = True self.ex.log_scalar(self.VAL_SOLVEDAT, i) # For next run episode_rewards.append(0) # Do not train further once solved. Keeping consistent with the original scheme if not solved_yet: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if steps_so_far > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) train( obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if steps_so_far % 1000 == 0: update_target()
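# Sketch of the linear epsilon decay driving exploration above (equivalent in spirit
# to linear_decay / LinearSchedule). The 10000-step horizon and 0.02 floor are
# illustrative values, not taken from the code.
def linear_epsilon(step, schedule_timesteps=10000, initial_eps=1.0, final_eps=0.02):
    # Interpolate linearly from initial_eps to final_eps, then hold at final_eps.
    frac = min(float(step) / schedule_timesteps, 1.0)
    return initial_eps + frac * (final_eps - initial_eps)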
class DQN(OffPolicyRLModel): """ The DQN model class. DQN paper: https://arxiv.org/abs/1312.5602 Dueling DQN: https://arxiv.org/abs/1511.06581 Double-Q Learning: https://arxiv.org/abs/1509.06461 Prioritized Experience Replay: https://arxiv.org/abs/1511.05952 :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) discount factor :param learning_rate: (float) learning rate for the Adam optimizer :param buffer_size: (int) size of the replay buffer :param exploration_fraction: (float) fraction of the entire training period over which the exploration rate is annealed :param exploration_final_eps: (float) final value of random action probability :param exploration_initial_eps: (float) initial value of random action probability :param train_freq: (int) update the model every `train_freq` steps. :param batch_size: (int) size of a batch sampled from the replay buffer for training :param double_q: (bool) Whether to enable Double-Q learning or not. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. :param prioritized_replay: (bool) if True, a prioritized replay buffer will be used. :param prioritized_replay_alpha: (float) alpha parameter for the prioritized replay buffer. It determines how much prioritization is used, with alpha=0 corresponding to the uniform case. :param prioritized_replay_beta0: (float) initial value of beta for the prioritized replay buffer :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from its initial value to 1.0. If set to None, it defaults to max_timesteps. :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy. :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). If None (default), use a random seed. Note that if you want completely deterministic results, you must set `n_cpu_tf_sess` to 1. :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations. If None, the number of CPUs of the current machine will be used. 
""" def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, exploration_initial_eps=1.0, train_freq=1, batch_size=32, double_q=True, learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, n_cpu_tf_sess=None, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, expert_exp=None): # TODO: replay_buffer refactoring super(DQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.param_noise = param_noise self.learning_starts = learning_starts self.train_freq = train_freq self.prioritized_replay = prioritized_replay self.prioritized_replay_eps = prioritized_replay_eps self.batch_size = batch_size self.target_network_update_freq = target_network_update_freq self.prioritized_replay_alpha = prioritized_replay_alpha self.prioritized_replay_beta0 = prioritized_replay_beta0 self.prioritized_replay_beta_iters = prioritized_replay_beta_iters self.exploration_final_eps = exploration_final_eps self.exploration_initial_eps = exploration_initial_eps self.exploration_fraction = exploration_fraction self.buffer_size = buffer_size self.learning_rate = learning_rate self.gamma = gamma self.tensorboard_log = tensorboard_log self.full_tensorboard_log = full_tensorboard_log self.double_q = double_q self.expert_exp = expert_exp self.expert_ix = 0 self.graph = None self.sess = None self._train_step = None self.step_model = None self.update_target = None self.act = None self.proba_step = None self.replay_buffer = None self.beta_schedule = None self.exploration = None self.params = None self.summary = None if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.step_model return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) self.act, self._train_step, self.update_target, self.step_model = build_train( q_func=partial(self.policy, **self.policy_kwargs), ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess, full_tensorboard_log=self.full_tensorboard_log, double_q=self.double_q ) self.proba_step = self.step_model.proba_step self.params = tf_util.get_trainable_vars("deepq") # Initialize the parameters and copy them to the target network. 
tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all() def add_expert_exp(self): # doesn't work with vec_normalized environments obs, action, reward, new_obs, done = self.expert_exp[self.expert_ix] self.replay_buffer.add(obs, action, reward, new_obs, float(done)) self.expert_ix = (self.expert_ix - 1) % len(self.expert_exp) def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() reset = True obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) self.num_timesteps += 1 # Stop training if return value is False if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() reward_ = self._vec_normalize_env.get_original_reward().squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer. 
self.replay_buffer.add(obs_, action, reward_, new_obs_, float(done)) if self.expert_exp is not None: self.add_expert_exp() obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward_ if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(self.num_timesteps), env=self._vec_normalize_env) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size, env=self._vec_normalize_env) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs(td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities(batch_idxes, new_priorities) callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() callback.on_training_end() return self def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) with self.sess.as_default(): actions, _, _ = self.step_model.step(observation, deterministic=deterministic) if not vectorized_env: actions = actions[0] return actions, None def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) actions_proba = self.proba_step(observation, state, mask) if actions is not None: # comparing the action distribution, to given actions actions = np.array([actions]) assert isinstance(self.action_space, gym.spaces.Discrete) actions = actions.reshape((-1,)) assert observation.shape[0] == actions.shape[0], "Error: batch sizes differ for actions and observations." actions_proba = actions_proba[np.arange(actions.shape[0]), actions] # normalize action proba shape actions_proba = actions_proba.reshape((-1, 1)) if logp: actions_proba = np.log(actions_proba) if not vectorized_env: if state is not None: raise ValueError("Error: The environment must be vectorized when using recurrent policies.") actions_proba = actions_proba[0] return actions_proba def get_parameter_list(self): return self.params def save(self, save_path, cloudpickle=False): # params data = { "double_q": self.double_q, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
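# Illustrative NumPy sketch of the proportional prioritization math behind
# PrioritizedReplayBuffer as it is used in the training loop above: priorities
# |td_error| + eps, sampling probabilities p_i^alpha / sum_j p_j^alpha, and
# importance-sampling weights (N * P(i))^(-beta) normalised by the maximum weight.
# This mirrors the standard formulation; it is not the buffer's actual code.
import numpy as np

def priorities_from_td_errors(td_errors, eps=1e-6):
    # Same rule as new_priorities = np.abs(td_errors) + prioritized_replay_eps above.
    return np.abs(td_errors) + eps

def sampling_probabilities(priorities, alpha=0.6):
    scaled = priorities ** alpha
    return scaled / scaled.sum()

def importance_weights(probabilities, idxes, beta=0.4):
    # Weights correct the bias introduced by non-uniform sampling; beta is annealed to 1.
    n = len(probabilities)
    weights = (n * probabilities[np.asarray(idxes)]) ** (-beta)
    return weights / weights.max()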
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="BDQ", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) if self.epsilon_greedy: approximate_num_iters = 2e6 / 4 # TODO Decide which schedule type to use # self.exploration = PiecewiseSchedule([(0, 1.0), # (approximate_num_iters / 50, 0.1), # (approximate_num_iters / 5, 0.01) # ], outside_value=0.01) self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) else: self.exploration = ConstantSchedule(value=0.0) # greedy policy std_schedule = LinearSchedule(schedule_timesteps=self.timesteps_std, initial_p=self.initial_std, final_p=self.final_std) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() obs = self.env.reset() reset = True self.episode_reward = np.zeros((1,)) # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. 
- self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): # action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] # print("time step {} and update eps {}".format(self.num_timesteps, update_eps)) action_idxes = np.array(self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)) #update_eps=exploration.value(t))) action = action_idxes / self.num_action_grains * self.actions_range + self.low if not self.epsilon_greedy: # Gaussian noise actions_greedy = action action_idx_stoch = [] action = [] for index in range(len(actions_greedy)): a_greedy = actions_greedy[index] out_of_range_action = True while out_of_range_action: # Sample from a Gaussian with mean at the greedy action and a std following a schedule of choice a_stoch = np.random.normal(loc=a_greedy, scale=std_schedule.value(self.num_timesteps)) # Convert sampled cont action to an action idx a_idx_stoch = np.rint((a_stoch + self.high[index]) / self.actions_range[index] * self.num_action_grains) # Check if action is in range if a_idx_stoch >= 0 and a_idx_stoch < self.num_actions_pad: action_idx_stoch.append(a_idx_stoch) action.append(a_stoch) out_of_range_action = False action_idxes = action_idx_stoch env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) self.num_timesteps += 1 # Stop training if return value is False if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() reward_ = self._vec_normalize_env.get_original_reward().squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer. self.replay_buffer.add(obs_, action_idxes, reward_, new_obs_, float(done)) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) # self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, # self.num_timesteps) # episode_rewards[-1] += rew episode_rewards[-1] += reward_ if done: # print("ep number", len(episode_rewards)) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
# pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors, mean_loss = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors, mean_loss = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors, mean_loss = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs(td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities(batch_idxes, new_priorities) callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) # Log training infos kvs = {} if self.verbose >= 1 and done and log_interval is not None \ and len(episode_rewards) % log_interval == 0 \ and self.num_timesteps > self.train_freq \ and self.num_timesteps > self.learning_starts: if self.log_dir is not None: kvs["episodes"] = num_episodes kvs["mean_100rew"] = mean_100ep_reward kvs["current_lr"] = self.learning_rate kvs["success_rate"] = np.mean(episode_successes[-100:]) kvs["total_timesteps"] = self.num_timesteps kvs["mean_loss"] = mean_loss kvs["mean_td_errors"] = np.mean(td_errors) kvs["time_spent_exploring"] = int(100 * self.exploration.value(self.num_timesteps)) self.log_csv.writekvs(kvs) logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() callback.on_training_end() return self
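# Illustrative sketch of the proportional prioritized-replay arithmetic used in the
# learn() loop above: sampling probabilities from priorities with exponent alpha,
# importance-sampling weights with the annealed exponent beta, and the
# |td_error| + eps priority update. This is a NumPy-only sketch under assumed shapes,
# not the PrioritizedReplayBuffer class itself.
import numpy as np

def sample_indices_and_weights(priorities, batch_size, alpha=0.6, beta=0.4, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    probs = priorities ** alpha
    probs = probs / probs.sum()                             # P(i) = p_i^alpha / sum_k p_k^alpha
    idxes = rng.choice(len(priorities), size=batch_size, p=probs)
    weights = (len(priorities) * probs[idxes]) ** (-beta)   # w_i = (N * P(i))^-beta
    weights = weights / weights.max()                       # normalize for stability
    return idxes, weights

def updated_priorities(td_errors, eps=1e-6):
    # Mirrors: new_priorities = np.abs(td_errors) + prioritized_replay_eps
    return np.abs(td_errors) + eps

# Example
prio = np.array([1.0, 0.5, 2.0, 0.1])
idx, w = sample_indices_and_weights(prio, batch_size=2)
prio[idx] = updated_priorities(np.array([0.3, -1.2]))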
class C51SAC(OffPolicyRLModel):
    """
    Soft Actor-Critic (SAC)
    Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor.
    This implementation borrows code from the original implementation (https://github.com/haarnoja/sac),
    from OpenAI Spinning Up (https://github.com/openai/spinningup)
    and from the Softlearning repo (https://github.com/rail-berkeley/softlearning/).
    Paper: https://arxiv.org/abs/1801.01290
    Introduction to SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html

    :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount factor
    :param learning_rate: (float or callable) learning rate for adam optimizer,
        the same learning rate will be used for all networks (Q-Values, Actor and Value function),
        it can be a function of the current progress (from 1 to 0)
    :param buffer_size: (int) size of the replay buffer
    :param batch_size: (int) Minibatch size for each gradient update
    :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1)
    :param ent_coef: (str or float) Entropy regularization coefficient (equivalent to the inverse of the
        reward scale in the original SAC paper). It controls the exploration/exploitation trade-off.
        Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as the initial value)
    :param train_freq: (int) Update the model every `train_freq` steps.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_update_interval: (int) update the target network every `target_update_interval` gradient steps.
    :param gradient_steps: (int) How many gradient updates after each step
    :param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto')
    :param action_noise: (ActionNoise) the action noise type (None by default), this can help
        for hard exploration problems. Cf DDPG for the different action noise types.
    :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy).
        This is not needed for SAC normally but can help exploring when using HER + SAC.
        This hack was present in the original OpenAI Baselines repo (DDPG + HER)
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        Note: this has no effect on SAC logging for now
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations.
        If None, the number of CPUs of the current machine will be used.
""" def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000, learning_starts=100, train_freq=1, batch_size=64, tau=0.005, ent_coef='auto', target_update_interval=1, gradient_steps=1, target_entropy='auto', action_noise=None, random_exploration=0.0, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None, v_min=-400, v_max=400, n_spt=256): if policy_kwargs is None: policy_kwargs = {"n_spt": n_spt} else: policy_kwargs["n_spt"] = n_spt super(C51SAC, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=SACPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.buffer_size = buffer_size self.learning_rate = learning_rate self.learning_starts = learning_starts self.train_freq = train_freq self.batch_size = batch_size self.tau = tau # In the original paper, same learning rate is used for all networks # self.policy_lr = learning_rate # self.qf_lr = learning_rate # self.vf_lr = learning_rate # Entropy coefficient / Entropy temperature # Inverse of the reward scale self.ent_coef = ent_coef self.target_update_interval = target_update_interval self.gradient_steps = gradient_steps self.gamma = gamma self.action_noise = action_noise self.random_exploration = random_exploration self.n_spt = n_spt self.v_min = v_min self.v_max = v_max self.delta = (v_max - v_min) /(n_spt - 1) self.value_fn = None self.graph = None self.replay_buffer = None self.sess = None self.tensorboard_log = tensorboard_log self.verbose = verbose self.params = None self.summary = None self.policy_tf = None self.target_entropy = target_entropy self.full_tensorboard_log = full_tensorboard_log self.obs_target = None self.target_policy = None self.actions_ph = None self.rewards_ph = None self.terminals_ph = None self.observations_ph = None self.action_target = None self.next_observations_ph = None self.value_target = None self.step_ops = None self.target_update_op = None self.infos_names = None self.entropy = None self.target_params = None self.learning_rate_ph = None self.processed_obs_ph = None self.processed_next_obs_ph = None self.log_ent_coef = None self.value_target_distr = None self.support = None self.projection_ph = None self.q_projection_ph = None self.q_backup_op = None self.projection_op = None self.min_qf_pi = None if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale deterministic_action = unscale_action(self.action_space, self.deterministic_action) return policy.obs_ph, self.actions_ph, deterministic_action def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.support = tf.constant(np.arange(self.v_min, self.v_max + 1e-6, self.delta), dtype=tf.float32) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph 
self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") self.projection_ph = tf.placeholder(tf.float32, (None, self.n_spt), name="v_projection") self.q_projection_ph = tf.placeholder(tf.float32, (None, self.n_spt), name="q_projection") with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probability of actions taken by the policy self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. qf1_distr, qf2_distr, value_fn_distr = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) qf1_pi_distr, qf2_pi_distr, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) # Target entropy is used when learning the entropy coefficient if self.target_entropy == 'auto': # automatically set target entropy if needed self.target_entropy = -np.prod(self.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'): # Default initial value of ent_coef when learned init_value = 1.0 if '_' in self.ent_coef: init_value = float(self.ent_coef.split('_')[1]) assert init_value > 0., "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32)) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target_distr = self.target_policy.make_critics(self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target_distr = value_target_distr with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) # compute qf_pi, qf2_pi with pdf min_qf_pi_distr = tf.where(tf.less(tf.reduce_mean(qf1_pi_distr * self.support), tf.reduce_mean(qf2_pi_distr * self.support)), qf1_pi_distr, qf2_pi_distr) min_qf_pi = tf.reduce_mean(tf.reduce_sum(min_qf_pi_distr * self.support, axis=-1)) self.min_qf_pi = min_qf_pi q_backup_op = tf.stop_gradient( self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.support ) q_backup_op = tf.clip_by_value(q_backup_op, self.v_min, self.v_max) self.q_backup_op = q_backup_op qf1_loss = -tf.reduce_mean(tf.log(qf1_distr + 1e-12) * tf.stop_gradient(self.projection_ph)) qf2_loss = 
-tf.reduce_mean(tf.log(qf2_distr + 1e-12) * tf.stop_gradient(self.projection_ph)) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) # to clip policy loss qf_pi = tf.reduce_mean(self.support * min_qf_pi_distr, axis=-1, keepdims=True) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. value_loss = -tf.reduce_mean(tf.log(value_fn_distr + 1e-12) * tf.stop_gradient(min_qf_pi_distr)) \ - tf.stop_gradient(tf.reduce_mean(self.ent_coef * logp_pi)) value_fn = tf.reduce_sum(value_fn_distr * self.support, axis=-1) # value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup) ** 2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize(policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) values_params = tf_util.get_trainable_vars('model/values_fn') source_params = tf_util.get_trainable_vars("model/values_fn") target_params = tf_util.get_trainable_vars("target/values_fn") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses qf1, qf2 = tf.reduce_mean(tf.reduce_sum(self.support * qf1_distr, axis=-1)), tf.reduce_mean(tf.reduce_sum(self.support * qf2_distr, axis=-1)) with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize(values_losses, var_list=values_params) self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy'] # All ops to call during one training step self.step_ops = [policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += ['ent_coef_loss', 'ent_coef'] self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) 
tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) if ent_coef_loss is not None: tf.summary.scalar('ent_coef_loss', ent_coef_loss) tf.summary.scalar('ent_coef', self.ent_coef) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/values_fn") # Initialize Variables and target network self.projection_op = Projection(self.sess, self.graph, self.n_spt, self.v_min, self.v_max, self.delta, self.batch_size) with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all() def _train_step(self, step, writer, learning_rate): # Sample a batch from the replay buffer batch = self.replay_buffer.sample(self.batch_size, env=self._vec_normalize_env) batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch target_support, vf_target = self.sess.run([self.q_backup_op, self.value_target_distr], feed_dict={self.next_observations_ph: batch_next_obs, self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), self.terminals_ph: batch_dones.reshape(self.batch_size, -1) }) projection = self.projection_op(batch_rewards, batch_dones, target_support, vf_target) feed_dict = { self.observations_ph: batch_obs, self.actions_ph: batch_actions, self.next_observations_ph: batch_next_obs, self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), self.terminals_ph: batch_dones.reshape(self.batch_size, -1), self.learning_rate_ph: learning_rate, self.projection_ph: projection } # out = [policy_loss, qf1_loss, qf2_loss, # value_loss, qf1, qf2, value_fn, logp_pi, # self.entropy, policy_train_op, train_values_op] # Do one gradient step # and optionally compute log for tensorboard if writer is not None: out = self.sess.run([self.summary] + self.step_ops, feed_dict) summary = out.pop(0) writer.add_summary(summary, step) else: out = self.sess.run(self.step_ops, feed_dict) # Unpack to monitor losses and entropy policy_loss, qf1_loss, qf2_loss, value_loss, *values = out # qf1, qf2, value_fn, logp_pi, entropy, *_ = values entropy = values[4] if self.log_ent_coef is not None: ent_coef_loss, ent_coef = values[-2:] return policy_loss, qf1_loss, qf2_loss, value_loss, entropy, ent_coef_loss, ent_coef return policy_loss, qf1_loss, qf2_loss, value_loss, entropy def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() n_updates = 0 infos_values = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() for step in range(total_timesteps): # Before training starts, 
randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step(obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # inferred actions need to be transformed to environment action_space before stepping unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) self.num_timesteps += 1 # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() reward_ = self._vec_normalize_env.get_original_reward().squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, reward # Store transition in the replay buffer. self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: self.ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append(self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) callback.on_rollout_start() episode_rewards[-1] += reward_ if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) # substract 1 
as we appended a new term just now num_episodes = len(episode_rewards) - 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] callback.on_training_end() return self def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): if actions is not None: raise ValueError("Error: SAC does not have action probabilities.") warnings.warn("Even though SAC has a Gaussian policy, it cannot return a distribution as it " "is squashed by a tanh before being scaled and outputed.") return None def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) actions = self.policy_tf.step(observation, deterministic=deterministic) actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape actions = unscale_action(self.action_space, actions) # scale the output for the prediction if not vectorized_env: actions = actions[0] return actions, None def get_parameter_list(self): return (self.params + self.target_params) def save(self, save_path, cloudpickle=False): data = { "learning_rate": self.learning_rate, "buffer_size": self.buffer_size, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "batch_size": self.batch_size, "tau": self.tau, "ent_coef": self.ent_coef if isinstance(self.ent_coef, float) else 'auto', "target_entropy": self.target_entropy, # Should we also store the replay buffer? 
# this may lead to high memory usage # with all transition inside # "replay_buffer": self.replay_buffer "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "action_noise": self.action_noise, "random_exploration": self.random_exploration, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs, "model_kwargs": { "v_min":self.v_min, "v_max":self.v_max, "n_spt":self.n_spt } } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle) def replay_buffer_add(self, obs_t, action, reward, obs_tp1, done, info): """ Add a new transition to the replay buffer :param obs_t: (np.ndarray) the last observation :param action: ([float]) the action :param reward: (float) the reward of the transition :param obs_tp1: (np.ndarray) the new observation :param done: (bool) is the episode done :param info: (dict) extra values used to compute the reward when using HER """ # Pass info dict when using HER, as it can be used to compute the reward kwargs = dict(info=info) if self.is_using_her() else {} self.replay_buffer.add(obs_t, action, reward, obs_tp1, float(done), **kwargs) def is_using_her(self) -> bool: """ Check if is using HER :return: (bool) Whether is using HER or not """ # Avoid circular import from stable_baselines.her.replay_buffer import HindsightExperienceReplayWrapper return isinstance(self.replay_buffer, HindsightExperienceReplayWrapper) @classmethod def load(cls, load_path, env=None, custom_objects=None, **kwargs): """ Load the model from file :param load_path: (str or file-like) the saved parameter location :param env: (Gym Environment) the new environment to run the loaded model on (can be None if you only need prediction from a trained model) :param custom_objects: (dict) Dictionary of objects to replace upon loading. If a variable is present in this dictionary as a key, it will not be deserialized and the corresponding item will be used instead. Similar to custom_objects in `keras.models.load_model`. Useful when you have an object in file that can not be deserialized. :param kwargs: extra arguments to change the model when loading """ data, params = cls._load_from_file(load_path, custom_objects=custom_objects) model_kwargs = data["model_kwargs"] del data["model_kwargs"] if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data['policy_kwargs']: raise ValueError("The specified policy kwargs do not equal the stored policy kwargs. " "Stored kwargs: {}, specified kwargs: {}".format(data['policy_kwargs'], kwargs['policy_kwargs'])) model = cls(policy=data["policy"], env=None, _init_setup_model=False, **model_kwargs) model.__dict__.update(data) model.__dict__.update(kwargs) model.set_env(env) model.setup_model() model.load_parameters(params) return model
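# Illustrative sketch of the categorical (C51-style) projection that the Projection op
# above is expected to produce for `projection_ph`: the target distribution's probability
# mass, defined on the shifted support r + gamma * (1 - done) * z (clipped to [v_min, v_max]
# as in q_backup_op), is redistributed onto the fixed support z. The actual Projection class
# is not shown in this file, so this NumPy version is an assumption about its behaviour.
import numpy as np

def categorical_projection(rewards, dones, target_probs, v_min, v_max, n_atoms, gamma):
    """rewards, dones: (batch,);  target_probs: (batch, n_atoms) distribution at s'."""
    delta = (v_max - v_min) / (n_atoms - 1)
    support = np.linspace(v_min, v_max, n_atoms)                      # fixed atoms z_j
    tz = rewards[:, None] + gamma * (1.0 - dones[:, None]) * support  # shifted atoms
    tz = np.clip(tz, v_min, v_max)
    b = (tz - v_min) / delta                                          # fractional atom index
    lower, upper = np.floor(b).astype(int), np.ceil(b).astype(int)
    projected = np.zeros_like(target_probs)
    for i in range(target_probs.shape[0]):
        for j in range(n_atoms):
            if lower[i, j] == upper[i, j]:
                # shifted atom lands exactly on a support point: keep all its mass there
                projected[i, lower[i, j]] += target_probs[i, j]
            else:
                # split the mass between the two neighbouring atoms
                projected[i, lower[i, j]] += target_probs[i, j] * (upper[i, j] - b[i, j])
                projected[i, upper[i, j]] += target_probs[i, j] * (b[i, j] - lower[i, j])
    return projected  # rows sum to 1 and could be fed as `projection_ph`

# Example with 3 atoms on [-1, 1]
p = categorical_projection(np.array([0.5]), np.array([0.0]),
                           np.array([[0.2, 0.6, 0.2]]), -1.0, 1.0, 3, gamma=0.9)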
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy_tf = self.policy( self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy_tf.obs_ph self.processed_next_obs_ph = self.target_policy_tf.processed_obs self.action_target = self.target_policy_tf.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") self.risk_factor_ph = tf.placeholder(tf.float32, [], name='risk_factor_ph') with tf.variable_scope("model", reuse=False): # Create the policy self.policy_out = policy_out = self.policy_tf.make_actor( self.processed_obs_ph) #double_policy = self.policy_tf.make_actor(self.processed_next_obs_ph,reuse=True) # Use two Q-functions to improve performance by reducing overestimation bias if self.model_type == "QR": self.qrtau = tf.tile( tf.reshape( tf.range(0.5 / self.n_support, 1, 1 / self.n_support), [1, self.n_support]), [tf.shape(self.processed_obs_ph)[0], 1]) qf1, qf2 = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph, n_support=self.n_support) # Q value when following the current policy qrtau_pi = self.qrtau qf1_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, reuse=True, n_support=self.n_support) elif self.model_type == "IQN": self.qrtau = tf.random_uniform([ tf.shape(self.processed_obs_ph)[0], self.n_support ], minval=self.tau_clamp, maxval=1.0 - self.tau_clamp) qf1, qf2 = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph, model_type=self.model_type, iqn_tau=self.qrtau, n_support=self.n_support) # Q value when following the current policy qrtau_pi = tf.random_uniform([ tf.shape(self.processed_obs_ph)[0], self.n_support ], minval=self.tau_clamp, maxval=1.0 - self.tau_clamp) qf1_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, model_type=self.model_type, iqn_tau=qrtau_pi, reuse=True, n_support=self.n_support) with tf.variable_scope("target", reuse=False): # Create target networks target_policy_out = self.target_policy_tf.make_actor( self.processed_next_obs_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal( tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value( target_policy_out + target_noise, -1 + 1e-2, 1 - 1e-2) # Q values when following the target policy if self.model_type == "QR": target_qrtau = self.qrtau qf1_target, qf2_target = self.target_policy_tf.make_critics( self.processed_next_obs_ph, 
noisy_target_action, n_support=self.n_support) elif self.model_type == "IQN": target_qrtau = tf.random_uniform([ tf.shape(self.processed_next_obs_ph)[0], self.n_support ], minval=self.tau_clamp, maxval=1.0 - self.tau_clamp) qf1_target, qf2_target = self.target_policy_tf.make_critics( self.processed_next_obs_ph, noisy_target_action, model_type=self.model_type, iqn_tau=target_qrtau, n_support=self.n_support) with tf.variable_scope("loss", reuse=False): quantile_weight = 1.0 - self.risk_factor_ph * ( 2.0 * qrtau_pi - 1.0) min_quantile = tf.reduce_mean(qf1_pi[:, 0]) max_quantile = tf.reduce_mean(qf1_pi[:, -1]) #min_qf_target = tf.minimum(qf1_target, qf2_target) #max_arg = tf.argmax(target_qrtau,axis=-1) qf1_t_flag = qf1_target[:, -1] qf2_t_flag = qf2_target[:, -1] #qf1_t_flag = qf1_target[:,max_arg] #qf2_t_flag = qf2_target[:,max_arg] #qf1_t_flag = tf.reduce_max(qf1_target,axis=-1) #qf2_t_flag = tf.reduce_max(qf2_target,axis=-1) #min_flag = qf1_t_flag > qf2_t_flag min_flag = qf1_t_flag < qf2_t_flag min_qf_target = tf.where(min_flag, qf1_target, qf2_target) # Targets for Q value regression q_backup = tf.stop_gradient(self.rewards_ph + (1.0 - self.terminals_ph) * self.gamma * min_qf_target) # Compute Q-Function loss qrtau = tf.tile(tf.expand_dims(self.qrtau, axis=2), [1, 1, self.n_support]) #qrtau = tf.tile(tf.expand_dims(self.qrtau, axis=1), [1, self.n_support, 1]) #mulmax = 2.0 logit_valid_tile = tf.tile( tf.expand_dims(q_backup, axis=1), [1, self.n_support, 1]) theta_loss_tile = tf.tile(tf.expand_dims(qf1, axis=2), [1, 1, self.n_support]) Huber_loss = tf.compat.v1.losses.huber_loss( logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE, delta=self.kappa) / self.kappa bellman_errors = logit_valid_tile - theta_loss_tile Loss = tf.abs(qrtau - tf.stop_gradient( tf.to_float(bellman_errors < 0))) * Huber_loss qf1_losses = tf.reduce_mean(tf.reduce_sum(Loss, axis=1), axis=1) #qf1_gmul = qf1_losses - tf.reduce_min(qf1_losses) #qf1_gmul = 1.0 + mulmax*qf1_gmul/tf.reduce_max(qf1_gmul)#(1.0 - mulmax) + 2*mulmax*qf1_gmul/tf.reduce_max(qf1_gmul) qf1_loss = tf.reduce_mean(qf1_losses) theta_loss_tile = tf.tile(tf.expand_dims(qf2, axis=2), [1, 1, self.n_support]) Huber_loss = tf.compat.v1.losses.huber_loss( logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE, delta=self.kappa) / self.kappa bellman_errors = logit_valid_tile - theta_loss_tile Loss = tf.abs(qrtau - tf.stop_gradient( tf.to_float(bellman_errors < 0))) * Huber_loss qf2_losses = tf.reduce_mean(tf.reduce_sum(Loss, axis=1), axis=1) #qf2_gmul = qf2_losses - tf.reduce_min(qf2_losses) #qf2_gmul = 1.0 + mulmax*qf2_gmul/tf.reduce_max(qf2_gmul)#(1.0 - mulmax) + 2*mulmax*qf2_gmul/tf.reduce_max(qf2_gmul) qf2_loss = tf.reduce_mean(qf2_losses) qvalues_losses = qf1_loss + qf2_loss # Policy loss: maximise q value self.policy_loss = policy_loss = -tf.reduce_mean( tf.multiply(qf1_pi, quantile_weight)) # + policy_update_ratio # Q Values optimizer #qvalues_optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph) qvalues_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) #qvalues_optimizer = tf.contrib.opt.NadamOptimizer(learning_rate=self.learning_rate_ph) qvalues_params = tf_util.get_trainable_vars( 'model/values_fn/') # Q Values and policy target params source_params = tf_util.get_trainable_vars("model/") target_params = tf_util.get_trainable_vars("target/") # Policy train op # will be called only every n training steps, # where n is the policy delay #policy_optimizer = 
tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph) policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) #policy_optimizer = tf.contrib.opt.NadamOptimizer(learning_rate=self.learning_rate_ph) # Initializing target to match source variables self.target_init_op = tf.group([ tf.assign(target, source) for target, source in zip(target_params, source_params) ]) train_values_op = qvalues_optimizer.minimize( qvalues_losses, var_list=qvalues_params) #grad_values = tf.gradients(qvalues_losses, qvalues_params) #grad_values = list(zip(grad_values, qvalues_params)) with tf.control_dependencies([train_values_op]): policy_train_op = policy_optimizer.minimize( policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) #grad_policy = tf.gradients(policy_loss, tf_util.get_trainable_vars('model/pi')) #grad_policy = list(zip(grad_policy, tf_util.get_trainable_vars('model/pi'))) self.policy_train_op = policy_train_op with tf.control_dependencies([self.policy_train_op]): # Polyak averaging for target variables self.target_ops = tf.group([ tf.assign(target, (1.0 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ]) self.infos_names = ['qf1_loss', 'qf2_loss'] # All ops to call during one training step self.step_ops = [ qf1_loss, qf2_loss, qf1, qf2, train_values_op ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('min_quantile', min_quantile) tf.summary.scalar('max_quantile', max_quantile) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) ''' for grad, var in grad_values + grad_policy: tf.summary.histogram(var.name, var) tf.summary.histogram(var.name + '/gradient', grad) ''' # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(self.target_init_op) self.summary = tf.summary.merge_all()
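# Illustrative NumPy sketch of the quantile Huber loss built in the loss scope above:
# tile the target and predicted quantiles into (batch, n_support, n_support) grids,
# apply a Huber loss with threshold kappa (divided by kappa, as in the TF code), and
# weight it by |tau - 1{bellman_error < 0}|. Shapes and reductions mirror the graph,
# but this is a standalone sketch, not the model itself.
import numpy as np

def quantile_huber_loss(pred_quantiles, target_quantiles, taus, kappa=1.0):
    """pred_quantiles, taus: (batch, n);  target_quantiles: (batch, n)."""
    theta = pred_quantiles[:, :, None]       # predictions tiled along the target axis
    target = target_quantiles[:, None, :]    # targets tiled along the prediction axis
    bellman_errors = target - theta          # (batch, n, n)
    abs_err = np.abs(bellman_errors)
    huber = np.where(abs_err <= kappa,
                     0.5 * bellman_errors ** 2,
                     kappa * (abs_err - 0.5 * kappa)) / kappa
    tau = taus[:, :, None]                   # quantile level of each predicted quantile
    loss = np.abs(tau - (bellman_errors < 0).astype(np.float64)) * huber
    # sum over predicted quantiles, mean over target samples, then mean over the batch
    return np.mean(np.mean(np.sum(loss, axis=1), axis=1))

# Example: QR-style fixed quantile midpoints for n_support = 4
n = 4
taus = np.tile((np.arange(n) + 0.5) / n, (2, 1))
loss = quantile_huber_loss(np.random.randn(2, n), np.random.randn(2, n), taus)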
class TD3(OffPolicyRLModel):
    """
    Twin Delayed DDPG (TD3)
    Addressing Function Approximation Error in Actor-Critic Methods.

    Original implementation: https://github.com/sfujim/TD3
    Paper: https://arxiv.org/pdf/1802.09477.pdf
    Introduction to TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html

    :param policy: (TD3Policy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount factor
    :param learning_rate: (float or callable) learning rate for adam optimizer,
        the same learning rate will be used for all networks (Q-Values and Actor networks),
        it can be a function of the current progress (from 1 to 0)
    :param buffer_size: (int) size of the replay buffer
    :param batch_size: (int) Minibatch size for each gradient update
    :param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1)
    :param policy_delay: (int) Policy and target networks will only be updated once every `policy_delay`
        training steps. The Q values will be updated `policy_delay` times more often (i.e., every training step).
    :param action_noise: (ActionNoise) the action noise type. Cf DDPG for the different action noise types.
    :param target_policy_noise: (float) Standard deviation of Gaussian noise added to target policy
        (smoothing noise)
    :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise.
    :param train_freq: (int) Update the model every `train_freq` steps.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param gradient_steps: (int) How many gradient updates after each step
    :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy).
        This is not needed for TD3 normally but can help exploring when using HER + TD3.
        This hack was present in the original OpenAI Baselines repo (DDPG + HER)
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        Note: this has no effect on TD3 logging for now
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations.
        If None, the number of CPUs of the current machine will be used.
""" def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000, learning_starts=100, train_freq=100, gradient_steps=100, batch_size=128, tau=0.005, policy_delay=2, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, random_exploration=0.0, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None): super(TD3, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=TD3Policy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.buffer_size = buffer_size self.learning_rate = learning_rate self.learning_starts = learning_starts self.train_freq = train_freq self.batch_size = batch_size self.tau = tau self.gradient_steps = gradient_steps self.gamma = gamma self.action_noise = action_noise self.random_exploration = random_exploration self.policy_delay = policy_delay self.target_noise_clip = target_noise_clip self.target_policy_noise = target_policy_noise self.graph = None self.replay_buffer = None self.sess = None self.tensorboard_log = tensorboard_log self.verbose = verbose self.params = None self.summary = None self.policy_tf = None self.full_tensorboard_log = full_tensorboard_log self.obs_target = None self.target_policy_tf = None self.actions_ph = None self.rewards_ph = None self.terminals_ph = None self.observations_ph = None self.action_target = None self.next_observations_ph = None self.step_ops = None self.target_ops = None self.infos_names = None self.target_params = None self.learning_rate_ph = None self.processed_obs_ph = None self.processed_next_obs_ph = None self.policy_out = None self.policy_train_op = None self.policy_loss = None if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale policy_out = unscale_action(self.action_space, self.policy_out) return policy.obs_ph, self.actions_ph, policy_out def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy_tf = self.policy( self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy_tf.obs_ph self.processed_next_obs_ph = self.target_policy_tf.processed_obs self.action_target = self.target_policy_tf.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy self.policy_out = policy_out = self.policy_tf.make_actor( self.processed_obs_ph) # Use two Q-functions to improve performance by reducing overestimation bias qf1, qf2 = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph) # Q value 
when following the current policy qf1_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, reuse=True) with tf.variable_scope("target", reuse=False): # Create target networks target_policy_out = self.target_policy_tf.make_actor( self.processed_next_obs_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal( tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value( target_policy_out + target_noise, -1, 1) # Q values when following the target policy qf1_target, qf2_target = self.target_policy_tf.make_critics( self.processed_next_obs_ph, noisy_target_action) with tf.variable_scope("loss", reuse=False): # Take the min of the two target Q-Values (clipped Double-Q Learning) min_qf_target = tf.minimum(qf1_target, qf2_target) # Targets for Q value regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * min_qf_target) # Compute Q-Function loss qf1_loss = tf.reduce_mean((q_backup - qf1)**2) qf2_loss = tf.reduce_mean((q_backup - qf2)**2) qvalues_losses = qf1_loss + qf2_loss # Policy loss: maximise q value self.policy_loss = policy_loss = -tf.reduce_mean(qf1_pi) # Policy train op # will be called only every n training steps, # where n is the policy delay policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) self.policy_train_op = policy_train_op # Q Values optimizer qvalues_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) qvalues_params = tf_util.get_trainable_vars( 'model/values_fn/') # Q Values and policy target params source_params = tf_util.get_trainable_vars("model/") target_params = tf_util.get_trainable_vars("target/") # Polyak averaging for target variables self.target_ops = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] train_values_op = qvalues_optimizer.minimize( qvalues_losses, var_list=qvalues_params) self.infos_names = ['qf1_loss', 'qf2_loss'] # All ops to call during one training step self.step_ops = [ qf1_loss, qf2_loss, qf1, qf2, train_values_op ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all() def _train_step(self, step, writer, learning_rate, update_policy): # Sample a batch from the replay buffer batch = self.replay_buffer.sample(self.batch_size) batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch feed_dict = { self.observations_ph: batch_obs, self.actions_ph: batch_actions, self.next_observations_ph: batch_next_obs, self.rewards_ph: 
batch_rewards.reshape(self.batch_size, -1), self.terminals_ph: batch_dones.reshape(self.batch_size, -1), self.learning_rate_ph: learning_rate } step_ops = self.step_ops if update_policy: # Update policy and target networks step_ops = step_ops + [ self.policy_train_op, self.target_ops, self.policy_loss ] # Do one gradient step # and optionally compute log for tensorboard if writer is not None: out = self.sess.run([self.summary] + step_ops, feed_dict) summary = out.pop(0) writer.add_summary(summary, step) else: out = self.sess.run(step_ops, feed_dict) # Unpack to monitor losses qf1_loss, qf2_loss, *_values = out return qf1_loss, qf2_loss def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="TD3", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() n_updates = 0 infos_values = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() for step in range(total_timesteps): # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step(obs[None]).flatten() # Add noise to the action, as the policy # is deterministic, this is required for exploration if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) self.num_timesteps += 1 # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback.on_step() is False: break # Store transition in the replay buffer. 
            self.replay_buffer.add(obs, action, reward, new_obs, float(done))
            obs = new_obs

            # Retrieve reward and episode length if using Monitor wrapper
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                self.ep_info_buf.extend([maybe_ep_info])

            if writer is not None:
                # Write reward per episode to tensorboard
                ep_reward = np.array([reward]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                tf_util.total_episode_reward_logger(self.episode_reward, ep_reward, ep_done,
                                                    writer, self.num_timesteps)

            if step % self.train_freq == 0:
                callback.on_rollout_end()

                mb_infos_vals = []
                # Update policy, critics and target networks
                for grad_step in range(self.gradient_steps):
                    # Break if the warmup phase is not over
                    # or if there are not enough samples in the replay buffer
                    if not self.replay_buffer.can_sample(self.batch_size) \
                            or self.num_timesteps < self.learning_starts:
                        break
                    n_updates += 1
                    # Compute current learning_rate
                    frac = 1.0 - step / total_timesteps
                    current_lr = self.learning_rate(frac)
                    # Update policy and critics (q functions)
                    # Note: the policy is updated less frequently than the Q functions
                    # this is controlled by the `policy_delay` parameter
                    mb_infos_vals.append(
                        self._train_step(step, writer, current_lr, (step + grad_step) % self.policy_delay == 0))

                # Log losses and entropy, useful for monitor training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

                callback.on_rollout_start()

            episode_rewards[-1] += reward
            if done:
                if self.action_noise is not None:
                    self.action_noise.reset()
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)

                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))

            if len(episode_rewards[-101:-1]) == 0:
                mean_reward = -np.inf
            else:
                mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)

            # Display training infos
            if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                fps = int(step / (time.time() - start_time))
                logger.logkv("episodes", num_episodes)
                logger.logkv("mean 100 episode reward", mean_reward)
                if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                logger.logkv("n_updates", n_updates)
                logger.logkv("current_lr", current_lr)
                logger.logkv("fps", fps)
                logger.logkv('time_elapsed', int(time.time() - start_time))
                if len(episode_successes) > 0:
                    logger.logkv("success rate", np.mean(episode_successes[-100:]))
                if len(infos_values) > 0:
                    for (name, val) in zip(self.infos_names, infos_values):
                        logger.logkv(name, val)
                logger.logkv("total timesteps", self.num_timesteps)
                logger.dumpkvs()
                # Reset infos:
                infos_values = []

        callback.on_training_end()
        return self


def action_probability(self, observation, state=None, mask=None, actions=None, logp=False):
    _ = np.array(observation)

    if actions is not None:
        raise ValueError("Error: TD3 does not have action probabilities.")

    # there are no action probabilities here, as TD3 uses a deterministic policy
    # rather than a probability distribution over actions
    warnings.warn("Warning: action probability is meaningless for TD3. Returning None")
    return None


def predict(self, observation, state=None, mask=None, deterministic=True):
    observation = np.array(observation)
    vectorized_env = self._is_vectorized_observation(observation, self.observation_space)

    observation = observation.reshape((-1,) + self.observation_space.shape)
    actions = self.policy_tf.step(observation)

    if self.action_noise is not None and not deterministic:
        actions = np.clip(actions + self.action_noise(), -1, 1)

    actions = actions.reshape((-1,) + self.action_space.shape)  # reshape to the correct action shape
    actions = unscale_action(self.action_space, actions)  # scale the output for the prediction

    if not vectorized_env:
        actions = actions[0]

    return actions, None


def get_parameter_list(self):
    return (self.params + self.target_params)


def save(self, save_path, cloudpickle=False):
    data = {
        "learning_rate": self.learning_rate,
        "buffer_size": self.buffer_size,
        "learning_starts": self.learning_starts,
        "train_freq": self.train_freq,
        "batch_size": self.batch_size,
        "tau": self.tau,
        # Should we also store the replay buffer?
        # this may lead to high memory usage
        # with all transition inside
        # "replay_buffer": self.replay_buffer
        "policy_delay": self.policy_delay,
        "target_noise_clip": self.target_noise_clip,
        "target_policy_noise": self.target_policy_noise,
        "gamma": self.gamma,
        "verbose": self.verbose,
        "observation_space": self.observation_space,
        "action_space": self.action_space,
        "policy": self.policy,
        "n_envs": self.n_envs,
        "n_cpu_tf_sess": self.n_cpu_tf_sess,
        "seed": self.seed,
        "action_noise": self.action_noise,
        "random_exploration": self.random_exploration,
        "_vectorize_action": self._vectorize_action,
        "policy_kwargs": self.policy_kwargs
    }

    params_to_save = self.get_parameters()

    self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
def setup_model(self):
    with SetVerbosity(self.verbose):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            self.replay_buffer = ReplayBuffer(self.buffer_size)

            with tf.variable_scope("input", reuse=False):
                # Create policy and target TF objects
                self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                             config=self.config, pretrained_model=self.pretrained_model,
                                             **self.policy_kwargs)
                self.target_policy = self.policy(self.sess, self.observation_space, self.action_space,
                                                 config=self.config, pretrained_model=self.pretrained_model,
                                                 **self.policy_kwargs)

                # Initialize Placeholders
                self.observations_ph = self.policy_tf.obs_ph
                # Normalized observation for pixels
                self.processed_obs_ph = self.policy_tf.processed_obs
                self.next_observations_ph = self.target_policy.obs_ph
                self.processed_next_obs_ph = self.target_policy.processed_obs
                self.action_target = self.target_policy.action_ph
                self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                 name='actions')
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

            with tf.variable_scope("model", reuse=False):
                # Create the policy
                # first return value corresponds to deterministic actions
                # policy_out corresponds to stochastic actions, used for training
                # logp_pi is the log probability of actions taken by the policy
                self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph)
                # Monitor the entropy of the policy,
                # this is not used for training
                self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                # Use two Q-functions to improve performance by reducing overestimation bias.
                qf1, qf2, value_fn = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph,
                                                                 create_qf=True, create_vf=True)
                self._qf1 = qf1
                self._qf2 = qf2
                self._value_fn = value_fn
                qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_out,
                                                                create_qf=True, create_vf=False, reuse=True)

                # Target entropy is used when learning the entropy coefficient
                if self.target_entropy == 'auto':
                    # automatically set target entropy if needed
                    self.target_entropy = -np.prod(self.action_space.shape).astype(np.float32)
                else:
                    # Force conversion
                    # this will also throw an error for unexpected string
                    self.target_entropy = float(self.target_entropy)

                # The entropy coefficient or entropy can be learned automatically
                # see Automating Entropy Adjustment for Maximum Entropy RL section
                # of https://arxiv.org/abs/1812.05905
                if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'):
                    # Default initial value of ent_coef when learned
                    init_value = 1.0
                    if '_' in self.ent_coef:
                        init_value = float(self.ent_coef.split('_')[1])
                        assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                    self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32,
                                                        initializer=np.log(init_value).astype(np.float32))
                    self.ent_coef = tf.exp(self.log_ent_coef)
                else:
                    # Force conversion to float
                    # this will throw an error if a malformed string (different from 'auto')
                    # is passed
                    self.ent_coef = float(self.ent_coef)

            with tf.variable_scope("target", reuse=False):
                # Create the value network
                _, _, value_target = self.target_policy.make_critics(self.processed_next_obs_ph,
                                                                     create_qf=False, create_vf=True)
                self.value_target = value_target

            with tf.variable_scope("loss", reuse=False):
                # Take the min of the two Q-Values (Double-Q Learning)
                min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                # Target for Q value regression
                q_backup = tf.stop_gradient(
                    self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target
                )

                # Compute Q-Function loss
                # TODO: test with huber loss (it would avoid too high values)
                qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1) ** 2)
                qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2) ** 2)

                # Compute the entropy temperature loss
                # it is used when the entropy coefficient is learned
                ent_coef_loss, entropy_optimizer = None, None
                if not isinstance(self.ent_coef, float):
                    ent_coef_loss = -tf.reduce_mean(
                        self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy))
                    entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)

                # Compute the policy loss
                # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi)

                # NOTE: in the original implementation, they have an additional
                # regularization loss for the Gaussian parameters
                # this is not used for now
                # policy_loss = (policy_kl_loss + policy_regularization_loss)
                policy_loss = policy_kl_loss

                # Target for value fn regression
                # We update the vf towards the min of two Q-functions in order to
                # reduce overestimation bias from function approximation error.
                v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi)
                value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup) ** 2)

                values_losses = qf1_loss + qf2_loss + value_loss

                # Policy train op
                # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                policy_train_op = policy_optimizer.minimize(policy_loss,
                                                            var_list=tf_util.get_trainable_vars('model/pi'))
                # policy_train_op = tf.contrib.layers.optimize_loss(
                #     policy_loss,
                #     None,
                #     self.learning_rate_ph,
                #     "Adam",
                #     variables=tf_util.get_trainable_vars('model/pi'),
                #     summaries=["gradients"],
                #     increment_global_step=False
                # )

                # Value train op
                value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                values_params = tf_util.get_trainable_vars('model/values_fn')

                source_params = tf_util.get_trainable_vars("model/values_fn")
                target_params = tf_util.get_trainable_vars("target/values_fn")

                # Polyak averaging for target variables
                self.target_update_op = [
                    tf.assign(target, (1 - self.tau) * target + self.tau * source)
                    for target, source in zip(target_params, source_params)
                ]
                # Initializing target to match source variables
                target_init_op = [
                    tf.assign(target, source)
                    for target, source in zip(target_params, source_params)
                ]

                # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                # and we first need to compute the policy action before computing q values losses
                with tf.control_dependencies([policy_train_op]):
                    train_values_op = value_optimizer.minimize(values_losses, var_list=values_params)

                    self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy']
                    # All ops to call during one training step
                    self.step_ops = [policy_loss, qf1_loss, qf2_loss,
                                     value_loss, qf1, qf2, value_fn, logp_pi,
                                     self.entropy, policy_train_op, train_values_op]

                    # Add entropy coefficient optimization operation if needed
                    if ent_coef_loss is not None:
                        with tf.control_dependencies([train_values_op]):
                            ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef)
                            self.infos_names += ['ent_coef_loss', 'ent_coef']
                            self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef]

                # Monitor losses and entropy in tensorboard
                tf.summary.scalar('policy_loss', policy_loss)
                tf.summary.scalar('qf1_loss', qf1_loss)
                tf.summary.scalar('qf2_loss', qf2_loss)
                tf.summary.scalar('value_loss', value_loss)
                tf.summary.scalar('entropy', self.entropy)
                if ent_coef_loss is not None:
                    tf.summary.scalar('ent_coef_loss', ent_coef_loss)
                    tf.summary.scalar('ent_coef', self.ent_coef)
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))
                # for var in tf.trainable_variables():
                #     tf.summary.histogram(var.name, var)

            # Retrieve parameters that must be saved
            self.params = tf_util.get_trainable_vars("model")
            self.target_params = tf_util.get_trainable_vars("target/values_fn")

            # Initialize Variables and target network
            with self.sess.as_default():
                self.sess.run(tf.global_variables_initializer())

                if self.pretrained_model is not None:
                    list_of_vars_to_load = ['cnn_model/BatchNorm/beta', 'cnn_model/BatchNorm/moving_mean',
                                            'cnn_model/BatchNorm/moving_variance',
                                            'cnn_model/c1/w', 'cnn_model/c1/b',
                                            'cnn_model/c2/w', 'cnn_model/c2/b',
                                            'cnn_model/c3/w', 'cnn_model/c3/b',
                                            'cnn_model/fc1/w', 'cnn_model/fc1/b',
                                            'cnn_model/dense/kernel', 'cnn_model/dense/bias']

                    def _load_vars(var_dict, ckpt_path):
                        saver = tf.train.Saver(var_list=var_dict)
                        ckpt = tf.train.get_checkpoint_state(ckpt_path)
                        saver.restore(self.sess, ckpt.model_checkpoint_path)

                    all_tensors = [x.op.name for x in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)]
                    # all_tensors = [x.op.name for x in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)]
                    var_dict_pi = {x: self.graph.get_tensor_by_name(f"model/pi/{x}:0")
                                   for x in list_of_vars_to_load if f"model/pi/{x}" in all_tensors}
                    var_dict_values_fn = {x: self.graph.get_tensor_by_name(f"model/values_fn/{x}:0")
                                          for x in list_of_vars_to_load if f"model/values_fn/{x}" in all_tensors}
                    _load_vars(var_dict_pi, self.pretrained_model)
                    _load_vars(var_dict_values_fn, self.pretrained_model)

                self.sess.run(target_init_op)

            self.summary = tf.summary.merge_all()
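
# The target value network above is first hard-copied from the online network (target_init_op)
# and then kept in sync through Polyak averaging (target_update_op). Below is a minimal,
# self-contained NumPy sketch of that soft-update rule; the function name, tau value and toy
# parameter vectors are illustrative assumptions and are not part of the original snippet.
import numpy as np


def _polyak_update_demo(target, source, tau):
    # target <- (1 - tau) * target + tau * source, the same rule as target_update_op
    return (1.0 - tau) * target + tau * source


if __name__ == "__main__":
    target = np.zeros(3)   # stands in for a target-network parameter vector
    source = np.ones(3)    # stands in for the online-network parameters
    for _ in range(5):
        target = _polyak_update_demo(target, source, tau=0.005)
    print(target)          # the target parameters drift slowly towards the source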