def ude_ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, policy_delay=2, max_ep_len=1000, n_post_action=10, sample_action_with_dropout=True, dropout_rate=0.1, action_choose_method='random_sample', uncertainty_noise_type='std_noise', a_var_clip_max=1, a_var_clip_min=0.1, a_std_clip_max=1, a_std_clip_min=0.1, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, pi_post_samplers, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs, create_post_samplers=True, n_post=n_post_action, dropout_rate=dropout_rate) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). pi_targ, pi_targ_post_samplers, _, q_pi_targ = actor_critic( x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q': q }) def get_action_test(o): """Get action in test phase.""" a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] return np.clip(a, -act_limit, act_limit) def get_action_train(o): """Get action in training phase""" a_var_uncertainty = 0 a_var_uncertainty_clipped = 0 a_std_uncertainty = 0 a_std_uncertainty_clipped = 0 if sample_action_with_dropout: # Collect post samples into a ndarray of size (n_post, act_dim) pi_weights = sess.run( get_vars('main/pi')) # Get current policy weights a_post = np.array( ray.get([ p_s.sample_action.remote(pi_weights, o) for p_s in pi_post_samplers ])) # TODO: var and std must been scaled or clipped. # Otherwise, a huge variance will always cause action out of act_lim and then be clipped to -1 or 1. # we also need to set a lower bound to enforce a minimum exploration a_mean = np.mean(a_post, axis=0) a_median = np.median(a_post, axis=0) a_var = np.var(a_post, axis=0) a_var_clipped = np.clip(a_var, a_var_clip_min, a_var_clip_max) a_var_noise = a_var_clipped * np.random.randn(act_dim) a_std = np.std(a_post, axis=0) a_std_clipped = np.clip(a_std, a_std_clip_min, a_std_clip_max) a_std_noise = a_std_clipped * np.random.randn(act_dim) # TODO: define uncertainty to a value that is not affect by action dimension. 
a_var_uncertainty = np.mean(a_var) # np.sum(a_var) a_var_uncertainty_clipped = np.mean( a_var_clipped) # np.sum(a_var_clipped) a_std_uncertainty = np.mean(a_std) # np.sum(a_std) a_std_uncertainty_clipped = np.mean( a_std_clipped) # np.sum(a_std_clipped) # TODO: clip noise within a range. Maybe not necessary. if uncertainty_noise_type == 'var_noise': noise = a_var_noise elif uncertainty_noise_type == 'std_noise': noise = a_std_noise else: raise ValueError('Please choose a proper noise_type.') a = np.zeros((act_dim, )) if action_choose_method == 'random_sample': # Method 1: randomly sample one from post sampled actions a = a_post[np.random.choice(n_post_action)] elif action_choose_method == 'gaussian_sample': # Method 2: estimate mean and std, then sample from a Gaussian distribution for a_i in range(act_dim): a[a_i] = np.random.normal(a_mean[a_i], a_std_clipped[a_i], 1) elif action_choose_method == 'mean_of_samples': a = a_mean elif action_choose_method == 'median_of_sample': pass elif action_choose_method == 'mean_and_variance_based_noise': a = a_mean + noise elif action_choose_method == 'median_and_variance_based_noise': a = a_median + noise elif action_choose_method == 'prediction_and_variance_based_noise': a_prediction = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a = a_prediction + noise else: pass else: a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += act_noise * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit), \ a_var_uncertainty, a_var_uncertainty_clipped, a_std_uncertainty, a_std_uncertainty_clipped def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action_test(o)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len, \ ep_a_var_uncertainty,ep_a_var_uncertainty_clipped, \ ep_a_std_uncertainty, ep_a_std_uncertainty_clipped = env.reset(), 0, False, 0, 0, 0, 0, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a, a_var_uncertainty, a_var_uncertainty_clipped, \ a_std_uncertainty, a_std_uncertainty_clipped = get_action_train(o) else: a = env.action_space.sample() # TODO: a_var_uncertainty = 0 a_var_uncertainty_clipped = 0 a_std_uncertainty = 0 a_std_uncertainty_clipped = 0 # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 ep_a_var_uncertainty += a_var_uncertainty ep_a_var_uncertainty_clipped += a_var_uncertainty_clipped ep_a_std_uncertainty += a_std_uncertainty ep_a_std_uncertainty_clipped += a_std_uncertainty_clipped # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) # Policy update if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len, EpVarUncertainty=ep_a_var_uncertainty, EpVarUncertaintyClipped=ep_a_var_uncertainty_clipped, EpStdUncertainty=ep_a_std_uncertainty, EpStdUncertaintyClipped=ep_a_std_uncertainty_clipped) o, r, d, ep_ret, ep_len, \ ep_a_var_uncertainty, ep_a_var_uncertainty_clipped, \ ep_a_std_uncertainty, ep_a_std_uncertainty_clipped = env.reset(), 0, False, 0, 0, 0, 0, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('EpVarUncertainty', with_min_and_max=True) logger.log_tabular('EpVarUncertaintyClipped', with_min_and_max=True) logger.log_tabular('EpStdUncertainty', with_min_and_max=True) logger.log_tabular('EpStdUncertaintyClipped', with_min_and_max=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def oac(env_fn, actor_critic=mlp_actor_critic, logger_kwargs=dict(), network_params=dict(), rl_params=dict()): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # control params seed = rl_params['seed'] epochs = rl_params['epochs'] steps_per_epoch = rl_params['steps_per_epoch'] replay_size = rl_params['replay_size'] batch_size = rl_params['batch_size'] start_steps = rl_params['start_steps'] max_ep_len = rl_params['max_ep_len'] save_freq = rl_params['save_freq'] render = rl_params['render'] # rl params gamma = rl_params['gamma'] polyak = rl_params['polyak'] lr = rl_params['lr'] state_hist_n = rl_params['state_hist_n'] grad_clip_val = rl_params['grad_clip_val'] # entropy params alpha = rl_params['alpha'] target_entropy_start = rl_params['target_entropy_start'] target_entropy_stop = rl_params['target_entropy_stop'] target_entropy_steps = rl_params['target_entropy_steps'] # optimistic exploration params use_opt = rl_params['use_opt'] beta_UB = rl_params['beta_UB'] beta_LB = rl_params['beta_LB'] delta = rl_params['delta'] opt_lr = rl_params['opt_lr'] max_opt_steps = rl_params['max_opt_steps'] train_env, test_env = env_fn(), env_fn() obs_space = train_env.observation_space act_space = train_env.action_space try: obs_dim = obs_space.n observation_type = 'Discrete' except AttributeError as e: obs_dim = obs_space.shape[0] observation_type = 'Box' act_dim = act_space.n # set the seed tf.set_random_seed(seed) np.random.seed(seed) train_env.seed(seed) train_env.action_space.np_random.seed(seed) test_env.seed(seed) test_env.action_space.np_random.seed(seed) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim*state_hist_n, act_dim=act_dim, size=replay_size) # init a state buffer for storing last m states train_state_buffer = StateBuffer(m=state_hist_n) test_state_buffer = StateBuffer(m=state_hist_n) # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim*state_hist_n, act_dim, obs_dim*state_hist_n, None, None) # alpha and entropy setup max_target_entropy = tf.log(tf.cast(act_dim, tf.float32)) target_entropy_prop_ph = tf.placeholder(dtype=tf.float32, shape=()) target_entropy = max_target_entropy * target_entropy_prop_ph log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) if alpha == 'auto': # auto tune alpha alpha = tf.exp(log_alpha) else: # fixed alpha alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha) # Main outputs from computation graph # with tf.variable_scope('main'): # mu, pi, action_probs, log_action_probs, action_logits, q1_logits, q2_logits, q1_a, q2_a = actor_critic(x_ph, a_ph, **network_params) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, action_probs, log_action_probs, action_logits, q1_logits, q2_logits = actor_critic(x_ph, a_ph, **network_params) with tf.variable_scope('main', reuse=True): _, _, action_probs_next, log_action_probs_next, _, _, _ = actor_critic(x2_ph, a_ph, **network_params) # Target value network with tf.variable_scope('target'): _, _, _, _, _, q1_logits_targ, q2_logits_targ = actor_critic(x2_ph, a_ph, **network_params) # Count variables var_counts = tuple(count_vars(scope) for scope in ['log_alpha', 'main/pi', 'main/q1', 'main/q2', 'main']) print("""\nNumber of parameters: alpha: %d, pi: %d, q1: %d, q2: %d, total: %d\n"""%var_counts) if use_opt: # Optimistic Exploration mu_Q = (q1_logits + q2_logits) / 2.0 sigma_Q = tf.math.abs(q1_logits - q2_logits) / 2.0 Q_UB = mu_Q + beta_UB * sigma_Q Q_LB = mu_Q + beta_LB * sigma_Q Q_UB_sm = 
tf.nn.softmax(Q_UB, axis=-1) # needed to make EV and penalty proportional for optimisation R = tf.get_variable('R', dtype=tf.float32, shape=[1,act_dim], initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01)) assign_R = R.assign(action_logits) # initialises P as the same "pessimistic" action distribution P = tf.nn.softmax(R, axis=-1) expected_value = tf.reduce_sum( tf.multiply(P, Q_UB_sm) ) KL_P_PT = tf.reduce_sum( tf.multiply(P, tf.log( tf.divide(P, action_probs) ) ) ) penalty = KL_P_PT - delta relu_penalty = tf.nn.relu(penalty) penalised_opt_function = - expected_value + relu_penalty optpi_optimizer = tf.train.AdamOptimizer(learning_rate=opt_lr) train_optpi_op = optpi_optimizer.minimize(penalised_opt_function, var_list=get_vars('R')) optimistic_policy_dist = tf.distributions.Categorical(probs=P) optimistic_pi = optimistic_policy_dist.sample() else: optimistic_pi = pi # use standard SAC policy Q_LB = tf.minimum(q1_logits, q2_logits) # Min Double-Q: min_q_logits_targ = tf.minimum(q1_logits_targ, q2_logits_targ) # Targets for Q regression q_backup = r_ph + gamma*(1-d_ph)*tf.stop_gradient( tf.reduce_sum(action_probs_next * (min_q_logits_targ - alpha * log_action_probs_next), axis=-1)) # critic losses q1_a = tf.reduce_sum(tf.multiply(q1_logits, a_ph), axis=1) q2_a = tf.reduce_sum(tf.multiply(q2_logits, a_ph), axis=1) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2) value_loss = q1_loss + q2_loss # policy loss pi_backup = tf.reduce_sum(action_probs * ( alpha * log_action_probs - Q_LB ), axis=-1) pi_loss = tf.reduce_mean(pi_backup) # alpha loss for temperature parameter pi_entropy = -tf.reduce_sum(action_probs * log_action_probs, axis=-1) alpha_backup = tf.stop_gradient(target_entropy - pi_entropy) alpha_loss = -tf.reduce_mean(log_alpha * alpha_backup) # Policy train op # (has to be separate from value train op, because q1_logits appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) if grad_clip_val is not None: gvs = pi_optimizer.compute_gradients(pi_loss, var_list=get_vars('main/pi')) capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_pi_op = pi_optimizer.apply_gradients(capped_gvs) else: train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_pi_op]): if grad_clip_val is not None: gvs = value_optimizer.compute_gradients(value_loss, var_list=get_vars('main/q')) capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_value_op = value_optimizer.apply_gradients(capped_gvs) else: train_value_op = value_optimizer.minimize(value_loss, var_list=get_vars('main/q')) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_value_op]): train_alpha_op = alpha_optimizer.minimize(alpha_loss, var_list=get_vars('log_alpha')) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, q1_a, q2_a, pi_entropy, target_entropy, 
alpha_loss, alpha, train_pi_op, train_value_op, train_alpha_op, target_update] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session(config=tf_config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1_a': q1_a, 'q2_a': q2_a}) def get_action(state, deterministic=False): # # record data for printing # _ = sess.run(assign_R, feed_dict={x_ph: [state]}) # ins = sess.run([action_probs, Q_UB, P, KL_P_PT], feed_dict={x_ph: [state]}) if deterministic: act_op = mu else: if use_opt: # run a few optimisation steps to set optimistic policy _ = sess.run(assign_R, feed_dict={x_ph: [state]}) for i in range(max_opt_steps): _ = sess.run([train_optpi_op], feed_dict={x_ph: [state]}) act_op = optimistic_pi # # print difference between pessimistic and optimistic policy probabilities # outs = sess.run([P, KL_P_PT], feed_dict={x_ph: [state]}) # # print('ap: ', ins[0]) # print('Q: ', ins[1]) # print('P_in: ', ins[2]) # print('P_out: ', outs[0]) # print('KL_in: ', ins[3]) # print('KL_out: ', outs[1]) # print('') return sess.run(act_op, feed_dict={x_ph: [state]})[0] def reset(env, state_buffer): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 o = process_observation(o, obs_dim, observation_type) r = process_reward(r) state = state_buffer.init_state(init_obs=o) return o, r, d, ep_ret, ep_len, state def test_agent(n=10, render=True): global sess, mu, pi, q1_a, q2_a for j in range(n): o, r, d, ep_ret, ep_len, test_state = reset(test_env, test_state_buffer) if render: test_env.render() while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(test_state, True)) o = process_observation(o, obs_dim, observation_type) r = process_reward(r) test_state = test_state_buffer.append_state(o) ep_ret += r ep_len += 1 if render: test_env.render() logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) if render: test_env.close() start_time = time.time() o, r, d, ep_ret, ep_len, state = reset(train_env, train_state_buffer) total_steps = steps_per_epoch * epochs target_entropy_prop = linear_anneal(current_step=0, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps) # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(state) else: a = train_env.action_space.sample() # Step the env o2, r, d, _ = train_env.step(a) o2 = process_observation(o2, obs_dim, observation_type) a = process_action(a, act_dim) r = process_reward(r) next_state = train_state_buffer.append_state(o2) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(state, a, r, next_state, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 state = next_state if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], target_entropy_prop_ph: target_entropy_prop } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], PiEntropy=outs[5], TargEntropy=outs[6], LossAlpha=outs[7], Alpha=outs[8]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len, state = reset(train_env, train_state_buffer) # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # update target entropy every epoch target_entropy_prop = linear_anneal(current_step=t, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps) # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': train_env}, None) # Test the performance of the deterministic version of the agent. test_agent(n=10,render=render) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('PiEntropy', average_only=True) logger.log_tabular('TargEntropy', average_only=True) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossAlpha', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() plot_progress(os.path.join(logger_kwargs['output_dir'],'progress.txt'), show_plot=False)
def d3pg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, without_start_steps=True, batch_size=100, start_steps=10000, without_delay_train=False, reward_scale=1, uncertainty_driven_exploration=True, n_post=100, concentration_factor=0.5, uncertainty_policy_delay=5000, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # TODO: Test no start steps if without_start_steps: start_steps = batch_size logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) hidden_sizes = list(ac_kwargs['hidden_sizes']) actor_hidden_activation = tf.keras.activations.relu actor_output_activation = tf.keras.activations.tanh critic_hidden_activation = tf.keras.activations.relu critic_output_activation = tf.keras.activations.linear LOG_VAR_MIN = -20 LOG_VAR_MAX = 20 #20 # Main actor-critic with tf.variable_scope('main'): actor = MLP(hidden_sizes + [act_dim], hidden_activation=actor_hidden_activation, output_activation=tf.keras.activations.tanh) # critic = MLP(hidden_sizes + [1], # hidden_activation=critic_hidden_activation, # output_activation=critic_output_activation) dueling_critic = DuelingMLP() pi = act_limit * actor(x_ph) v_out, a_out, q_out = dueling_critic(None, state=x_ph, action=a_ph, optimal_action=pi) v_out, a_out, q_out = tf.squeeze(v_out, axis=1), tf.squeeze( a_out, axis=1), tf.squeeze(q_out, axis=1) _, a_pi_out, q_pi_out = dueling_critic(None, state=x_ph, action=pi, optimal_action=pi) a_pi_out, q_pi_out = tf.squeeze(a_pi_out, axis=1), tf.squeeze(q_pi_out, axis=1) # Target actor-critic with tf.variable_scope('target'): actor_targ = MLP(hidden_sizes + [act_dim], hidden_activation=actor_hidden_activation, output_activation=actor_output_activation) # critic_targ = MLP(hidden_sizes + [1], # hidden_activation=critic_hidden_activation, # output_activation=critic_output_activation) dueling_critic_targ = DuelingMLP() pi_targ = act_limit * actor_targ(x2_ph) _, a_pi_out_targ, q_pi_out_targ = dueling_critic(None, state=x2_ph, action=pi_targ, optimal_action=pi) q_pi_out_targ = tf.squeeze(q_pi_out_targ, axis=1) # Create LazyBernoulliDropoutMLP: # which copys weights from MLP by # sess.run(lazy_ber_drop_mlp_update) # , then post sample predictions with dropout masks. # with tf.variable_scope('LazyBernoulliDropoutUncertaintySample'): # # define placeholder for parallel sampling # # batch x n_post x dim # lazy_bernoulli_dropout_actor = BeroulliDropoutMLP(hidden_sizes + [act_dim], weight_regularizer=1e-6, dropout_rate=0.05, # hidden_activation=actor_hidden_activation, # output_activation=actor_output_activation) # lazy_ber_drop_pi = act_limit*lazy_bernoulli_dropout_actor(x_ph, training=True, duplicate_input=False) # Set training=True to sample with dropout masks # lazy_ber_drop_actor_update = tf.group([tf.assign(v_lazy_ber_drop_mlp, v_mlp) # for v_mlp, v_lazy_ber_drop_mlp in # zip(actor.variables, lazy_bernoulli_dropout_actor.variables)]) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, logger_fname='experiences_log.txt', **logger_kwargs) # Count variables print('\nNumber of parameters: \t pi: {:d}, \t q: {:d}, \t total: {:d}\n'. 
format(count_vars(actor.variables), count_vars(dueling_critic.variables), count_vars(get_vars('main')))) # Bellman backup for Q functions, using Clipped Double-Q targets backup = tf.stop_gradient(r_ph * reward_scale + gamma * (1 - d_ph) * q_pi_out_targ) # losses pi_loss = -tf.reduce_mean(q_pi_out) # pi_loss = -tf.reduce_mean(a_pi_out) q_loss = tf.reduce_mean((q_out - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=actor.variables) train_q_op = q_optimizer.minimize(q_loss, var_list=dueling_critic.variables) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # sess.run(lazy_ber_drop_actor_update) # Setup model saving # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q1': q1, 'q2': q2}) # def get_uncertainty_driven_explore_action(o): # o_post_samples = np.matlib.repmat(o.reshape(1,-1), n_post, 1) # repmat x for post sampling # # # 1. Generate action Prediction # a_pred = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] # # # 2. Generate post sampled actions # a_post = sess.run(lazy_ber_drop_pi, feed_dict={x_ph: o_post_samples}) # # a = np.zeros((act_dim,)) # if act_dim > 1: # a_cov = np.cov(a_post, rowvar=False) # a_cov_shaped = concentration_factor * a_cov # a = np.random.multivariate_normal(a_pred, a_cov_shaped, 1)[0] # unc_a = a_cov # else: # a_std = np.std(a_post, axis=0) # a_std_shaped = concentration_factor * a_std # a = np.random.normal(a_pred, a_std_shaped, 1)[0] # unc_a = a_std # # a = np.clip(a, -act_limit, act_limit) # # TODO: logdet as intrinsic reward # return a, unc_a def get_gaussian_noise_explore_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def get_action_test(o): """Get deterministic action without exploration.""" a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] # print('test a={}'.format(a)) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action_test(o)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len, ep_unc = env.reset(), 0, False, 0, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). 
""" if t > start_steps: if uncertainty_driven_exploration: # if t%uncertainty_policy_delay==0: # sess.run(lazy_ber_drop_actor_update) # a, unc_a = get_uncertainty_driven_explore_action(o) pass else: a = get_gaussian_noise_explore_action(o, act_noise) else: a = env.action_space.sample() if t < start_steps or (not uncertainty_driven_exploration): unc_a = np.zeros((act_dim, act_dim)) # print(t) # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 ep_unc += np.sum(unc_a) # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d, t, steps_per_epoch, start_time, unc_a=unc_a) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # if without_delay_train: # batch = replay_buffer.sample_batch(batch_size) # feed_dict = {x_ph: batch['obs1'], # x2_ph: batch['obs2'], # a_ph: batch['acts'], # r_ph: batch['rews'], # d_ph: batch['done'] # } # q_step_ops = [q_loss, q_mean, tf.sqrt(tf.exp(q_logvar)), train_q_op] # outs = sess.run(q_step_ops, feed_dict) # logger.store(LossQ=outs[0], QVals=outs[1], QStds=outs[2]) # # # Delayed policy update # outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) # logger.store(LossPi=outs[0]) if d or (ep_len == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). """ if not without_delay_train: for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, v_out, a_out, q_out, train_q_op] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], VVals=outs[1], AVals=outs[2], QVals=outs[3]) # print('LossQ={}, QVals={}, QLogVars={}'.format(outs[0], outs[1], outs[2])) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) logger.store(EpUnc=ep_unc) o, r, d, ep_ret, ep_len, ep_unc = env.reset(), 0, False, 0, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('AVals', with_min_and_max=True) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('EpUnc', with_min_and_max=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def sac(env_fn, env_name, test_env_fns=[], actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, load_dir=None, num_procs=1, clean_every=200): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ from spinup.examples.pytorch.eval_sac import load_pytorch_policy print(f"SAC proc_id {proc_id()}") logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) if proc_id() == 0: writer = SummaryWriter(log_dir=os.path.join( logger.output_dir, str(datetime.datetime.now())), comment=logger_kwargs["exp_name"]) torch.manual_seed(seed) np.random.seed(seed) env = SubprocVecEnv([partial(env_fn, rank=i) for i in range(num_procs)], "spawn") test_env = SubprocVecEnv(test_env_fns, "spawn") obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks if load_dir is not None: _, ac = load_pytorch_policy(load_dir, itr="", deterministic=False) else: ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info # Set up function for computing TD feats-losses def compute_loss_feats(data): o, a, r, o2, d, feats = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'], data["feats"] feats = torch.stack(list(feats.values())).T # (nbatch, nfeats) feats1 = ac.q1.predict_feats(o, a) feats2 = ac.q2.predict_feats(o, a) feats_keys = replay_buffer.feats_keys # Bellman backup for feature functions with torch.no_grad(): a2, _ = ac.pi(o2) # Target feature values feats1_targ = ac_targ.q1.predict_feats(o2, a2) feats2_targ = ac_targ.q2.predict_feats(o2, a2) feats_targ = torch.min(feats1_targ, feats2_targ) backup = feats + gamma * (1 - d[:, None]) * feats_targ # MSE loss against Bellman backup loss_feats1 = ((feats1 - backup)**2).mean(axis=0) loss_feats2 = ((feats2 - backup)**2).mean(axis=0) loss_feats = loss_feats1 + loss_feats2 # Useful info for logging feats_info = dict(Feats1Vals=feats1.detach().numpy(), 
Feats2Vals=feats2.detach().numpy()) return loss_feats, feats_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data, feats_keys): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() loss_feats, feats_info = compute_loss_feats(data) q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Feature loss keys = [f"LossFeats_{key}" for key in feats_keys] for key, val in zip(keys, loss_feats): logger.store(**dict(key, val.item())) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic) def test_agent(feats_keys): num_envs = len(test_env_fns) env_ep_rets = np.zeros(num_envs) for j in range(num_test_episodes): o, d = test_env.reset(), np.zeros(num_envs, dtype=bool) ep_len = np.zeros(num_envs) while not (np.all(d) or np.all(ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, info = test_env.step(get_action(o, True)) env_ep_rets += r ep_len += 1 # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) for ti in range(num_envs): logger.store( **{f"TestEpRet_{ti}": env_ep_rets[ti] / num_test_episodes}) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), np.zeros(num_procs), np.zeros(num_procs) # Main loop: collect experience in env and update/log each epoch epoch = 0 update_times, clean_times = 0, 0 t = 0 while t <= total_steps: # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. 
if t > start_steps: a = get_action(o) else: a = np.stack([env.action_space.sample() for _ in range(num_procs)]) # Step the env o2, r, d, info = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) if np.all(ep_len == max_ep_len): d.fill(False) # Store experience to replay buffer replay_buffer.store_vec(o, a, r, o2, d, [inf["features"] for inf in info]) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling, assumes all subenvs end at the same time if np.all(d) or np.all(ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) if clean_every > 0 and epoch // clean_every >= clean_times: env.close() test_env.close() env = SubprocVecEnv( [partial(env_fn, rank=i) for i in range(num_procs)], "spawn") test_env = SubprocVecEnv(test_env_fns, "spawn") clean_times += 1 o, ep_ret, ep_len = env.reset(), np.zeros(num_procs), np.zeros( num_procs) # Update handling if t >= update_after and t / update_every > update_times: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, feats_keys=replay_buffer.feats_keys) update_times += 1 # End of epoch handling if t // steps_per_epoch > epoch: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): # try: logger.save_state({'env_name': env_name}, None) # logger.save_state({'env': env}, None) #except: #logger.save_state({'env_name': env_name}, None) # Test the performance of the deterministic version of the agent. test_agent(replay_buffer.feats_keys) # Update tensorboard if proc_id() == 0: log_perf_board = ['EpRet', 'EpLen', 'Q1Vals', 'Q2Vals'] + [ f"TestEpRet_{ti}" for ti in range(len(test_env_fns)) ] log_loss_board = ['LogPi', 'LossPi', 'LossQ'] + [ key for key in logger.epoch_dict.keys() if "LossFeats" in key ] log_board = { 'Performance': log_perf_board, 'Loss': log_loss_board } for key, value in log_board.items(): for val in value: mean, std = logger.get_stats(val) if key == 'Performance': writer.add_scalar(key + '/Average' + val, mean, epoch) writer.add_scalar(key + '/Std' + val, std, epoch) else: writer.add_scalar(key + '/' + val, mean, epoch) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() if proc_id() == 0: writer.flush() import psutil # gives a single float value cpu_percent = psutil.cpu_percent() # gives an object with many fields mem_percent = psutil.virtual_memory().percent print(f"Used cpu avg {cpu_percent}% memory {mem_percent}%") cpu_separate = psutil.cpu_percent(percpu=True) for ci, cval in enumerate(cpu_separate): print(f"\t cpu {ci}: {cval}%") # buf_size = replay_buffer.get_size() # print(f"Replay buffer size: {buf_size//1e6}MB {buf_size // 1e3} KB {buf_size % 1e3} B") t += num_procs if proc_id() == 0: writer.close()
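# --- Illustrative sketch (not part of the original implementation) ---
# compute_loss_q in the PyTorch SAC above regresses both critics toward the
# entropy-regularized target r + gamma * (1 - d) * (min(Q1', Q2') - alpha * log pi(a'|s')).
# A minimal NumPy version of that target, convenient for unit-testing the backup
# arithmetic; the function name is hypothetical.
import numpy as np


def sac_backup(r, d, q1_targ, q2_targ, logp_a2, gamma=0.99, alpha=0.2):
    """Entropy-regularized Bellman target for a batch of transitions."""
    q_pi_targ = np.minimum(q1_targ, q2_targ)                  # clipped double-Q target
    return r + gamma * (1.0 - d) * (q_pi_targ - alpha * logp_a2)


# Example on a batch of two transitions (the second is terminal, so its target is just r):
# sac_backup(r=np.array([1.0, 0.0]), d=np.array([0.0, 1.0]),
#            q1_targ=np.array([5.0, 3.0]), q2_targ=np.array([4.0, 3.5]),
#            logp_a2=np.array([-1.2, -0.7]))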
def ddpg_multihead_n_step(env_name, actor_hidden_layers=[300, 300], critic_shared_hidden_layers=[300], critic_separated_head_hidden_layers=[300], seed=0, dropout_rate = 0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), reward_scale = 1, multi_head_multi_step_size = [1, 2, 3, 4, 5], actor_omit_top_k_Q = 2, actor_omit_low_k_Q = 1, critic_omit_top_k_Q = 2, critic_omit_low_k_Q = 1, q_loss_type = 'QLossReduceMeanMean', multihead_q_std_penalty = 0.2, separate_action_and_prediction = False, multi_head_bootstrapping = False, target_policy_smoothing=True, target_noise = 0.2, noise_clip = 0.5, random_n_step=False, random_n_step_low=1, random_n_step_high=5, gamma=0.99, without_delay_train=False, obs_noise_scale=0, nonstationary_env=False, gravity_change_pattern = 'gravity_averagely_equal', gravity_cycle = 1000, gravity_base = -9.81, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, random_action_baseline=False, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = gym.make(env_name), gym.make(env_name) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture

    # Inputs to computation graph
    multi_head_size = len(multi_head_multi_step_size)
    x_ph = tf.placeholder(dtype=tf.float32, shape=(None, obs_dim))
    a_ph = tf.placeholder(dtype=tf.float32, shape=(None, act_dim))
    # TODO: use different mini-batch
    x2_ph = tf.placeholder(dtype=tf.float32, shape=(None, max(multi_head_multi_step_size), obs_dim))
    r_ph = tf.placeholder(dtype=tf.float32, shape=(None, None))
    d_ph = tf.placeholder(dtype=tf.float32, shape=(None, None))
    n_step_ph = tf.placeholder(dtype=tf.float32, shape=())

    actor_hidden_sizes = actor_hidden_layers
    actor_hidden_activation = tf.keras.activations.relu
    actor_output_activation = tf.keras.activations.tanh
    critic_shared_hidden_sizes = critic_shared_hidden_layers
    critic_head_hidden_sizes = critic_separated_head_hidden_layers
    critic_hidden_activation = tf.keras.activations.relu
    critic_output_activation = tf.keras.activations.linear

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        actor = MLP(layer_sizes=actor_hidden_sizes + [act_dim],
                    hidden_activation=actor_hidden_activation,
                    output_activation=actor_output_activation)
        multihead_critic = MultiHeadMLP(
            shared_hidden_layer_sizes=critic_shared_hidden_sizes,
            multi_head_layer_sizes=[critic_head_hidden_sizes + [1] for i in range(multi_head_size)],
            hidden_activation=critic_hidden_activation,
            output_activation=critic_output_activation)
        # Set training=False to ignore dropout masks
        pi = act_limit * actor(x_ph, training=False)
        multihead_q = [tf.squeeze(head_out, axis=1)
                       for head_out in multihead_critic(tf.concat([x_ph, a_ph], axis=-1))]
        multihead_q_pi = [tf.squeeze(head_out, axis=1)
                          for head_out in multihead_critic(tf.concat([x_ph, pi], axis=-1))]

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
actor_targ = MLP(layer_sizes=actor_hidden_sizes+[act_dim], hidden_activation=actor_hidden_activation, output_activation=actor_output_activation) multihead_critic_targ = MultiHeadMLP(shared_hidden_layer_sizes=critic_shared_hidden_sizes, multi_head_layer_sizes=[critic_head_hidden_sizes+[1] for i in range(multi_head_size)], hidden_activation=critic_hidden_activation, output_activation=critic_output_activation) # Set training=False to ignore dropout for backup target value # Crucial: feed target networks with different next n-step observation multihead_q_pi_targ = [] # for head_i in range(multi_head_size): for h_i, n_step in enumerate(multi_head_multi_step_size): print('Head-{}: {}-step'.format(h_i, n_step)) head_x2_ph = tf.squeeze(tf.slice(x2_ph, [0, n_step-1,0], [batch_size, 1, obs_dim]), axis=1) _ = actor_targ(head_x2_ph) # just for copy parameter if separate_action_and_prediction: head_pi_targ = act_limit * actor(head_x2_ph) else: head_pi_targ = act_limit * actor_targ(head_x2_ph) if target_policy_smoothing: # Target policy smoothing, by adding clipped noise to target actions epsilon = tf.random_normal(tf.shape(head_pi_targ), stddev=target_noise) epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) head_pi_targ = head_pi_targ + epsilon head_pi_targ = tf.clip_by_value(head_pi_targ, -act_limit, act_limit) # TODO: test multi-head bootstrapping with StdQPenalty if multi_head_bootstrapping: # all heads calculate n-step bootstrapping, # omit overestimation and underestimation of n-step bootstrapped Q after_omit_overestimation = tf.math.top_k( -tf.squeeze(tf.stack(multihead_critic_targ(tf.concat([head_x2_ph, head_pi_targ], axis=-1)), axis=2), axis=1), multi_head_size - critic_omit_top_k_Q)[0] after_omit_underestimation = tf.math.top_k(-after_omit_overestimation, multi_head_size - critic_omit_top_k_Q - critic_omit_low_k_Q)[0] multihead_q_pi_targ.append(tf.reduce_mean(after_omit_underestimation, axis=1)) else: multihead_q_pi_targ.append( tf.squeeze(multihead_critic_targ(tf.concat([head_x2_ph, head_pi_targ], axis=-1))[h_i], axis=1)) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n'%var_counts) # Bellman backup for Q function multihead_q_loss_list = [] multihead_q_pi_loss_list = [] multihead_backup_list = [] for h_i, n_step in enumerate(multi_head_multi_step_size): head_q = multihead_q[h_i] head_q_pi_targ = multihead_q_pi_targ[h_i] head_q_pi = multihead_q_pi[h_i] head_backup = tf.stop_gradient(tf.reduce_sum(tf.multiply(tf.pow(gamma, tf.range(0, n_step, dtype=tf.float32)) * (1 - tf.slice(d_ph, [0, 0], [batch_size, n_step])), tf.slice(r_ph, [0, 0], [batch_size, n_step])), axis=1) + gamma ** n_step * (1 - tf.reshape(tf.slice(d_ph, [0, n_step], [batch_size, 1]), [-1])) * head_q_pi_targ) multihead_backup_list.append(head_backup) multihead_q_loss_list.append(tf.reduce_mean((head_q-head_backup)**2)) multihead_q_pi_loss_list.append(-tf.reduce_mean(head_q_pi)) # DDPG losses # 1. 
pi loss all_q_pi = tf.stack(multihead_q_pi, axis=1) # pi_loss = tf.reduce_mean(multihead_q_pi_loss_list) # Works, but not stable # Works good, need to test generalization # pi_loss = tf.reduce_mean(tf.math.top_k(-all_q_pi, multi_head_size - omit_top_k_Q)[0]) after_omit_overestimation_for_actor = tf.math.top_k(-all_q_pi, multi_head_size - actor_omit_top_k_Q)[0] after_omit_underestimation_for_actor = tf.math.top_k(-after_omit_overestimation_for_actor, multi_head_size - actor_omit_top_k_Q - actor_omit_low_k_Q)[0] pi_loss = tf.reduce_mean(tf.reduce_mean(-after_omit_underestimation_for_actor, axis=1)) # # TODO:test, seems not work # pi_loss = tf.reduce_mean(tf.reduce_mean(tf.math.top_k(-all_q_pi, multi_head_size - actor_omit_top_k_Q)[0], axis=1) + # multihead_q_std_penalty * tf.math.reduce_variance(all_q_pi, axis=1)) # # import pdb; pdb.set_trace() # pi_loss = tf.reduce_sum(tf.reduce_mean(tf.math.top_k(-tf.stack(multihead_q_pi, axis=1), multi_head_size - omit_top_k_Q)[0], axis=0)) # pi_loss = tf.reduce_mean(-multihead_q_pi[0]) # Too slow # # slow # pi_loss = tf.reduce_mean(tf.reduce_sum(tf.math.top_k(-all_q_pi, # multi_head_size - actor_omit_top_k_Q)[0], axis=1)) # 2. q loss all_q = tf.stack(multihead_q, axis=1) all_q_backup = tf.stack(multihead_backup_list, axis=1) if q_loss_type == 'QLossReduceMeanMean': q_loss = tf.reduce_mean(multihead_q_loss_list) # works elif q_loss_type == 'QLossReduceSumMean': q_loss = tf.reduce_sum(multihead_q_loss_list) # Works good for Swimmer-s3 elif q_loss_type == 'QLossReduceMeanAll': q_loss = tf.reduce_mean((all_q - all_q_backup) ** 2) # (Currently the best) Works good for Swimmer-s0 elif q_loss_type == 'QLossReduceSumAll': q_loss = tf.reduce_sum((all_q - all_q_backup) ** 2) # currently the best, and the policy has approximately monotonic improvement # TODO: multihead_q_std_penalty should be dynamically changed # q_loss = tf.reduce_mean(tf.reduce_mean((all_q - all_q_backup)**2, axis=1) + # multihead_q_std_penalty * tf.math.reduce_std(all_q, axis=1)) # # variance penalty is better than standard deviation penalty # q_loss = tf.reduce_mean(tf.reduce_mean((all_q - all_q_backup) ** 2, axis=1) + # multihead_q_std_penalty * tf.math.reduce_variance(all_q, axis=1)) # # TODO: test reduce_sum and reduce_var # q_loss = tf.reduce_mean(tf.reduce_sum((all_q - all_q_backup) ** 2, axis=1) + # multihead_q_std_penalty * tf.math.reduce_variance(all_q, axis=1)) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=actor.variables) train_q_op = q_optimizer.minimize(q_loss, var_list=multihead_critic.variables) # Polyak averaging for target variables target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(actor.variables+multihead_critic.variables, actor_targ.variables+multihead_critic_targ.variables)]) # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(actor.variables+multihead_critic.variables, actor_targ.variables+multihead_critic_targ.variables)]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # # Setup model saving # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q}) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def 
test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() # # TODO: delete env.render() # env.render() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps and not random_action_baseline: a = get_action(o, act_noise) else: a = env.action_space.sample() # env.render() # Manipulate environment change_scale = 1/8 if nonstationary_env == True: if gravity_change_pattern == 'gravity_averagely_equal': # gravity = gravity_base * 1 / 2 * (np.cos(2 * np.pi / gravity_cycle * t) + 1) + gravity_base / 2 gravity = gravity_base + np.abs(gravity_base) * change_scale * np.sin(2 * np.pi / gravity_cycle * t) elif gravity_change_pattern == 'gravity_averagely_easier': # gravity = gravity_base * 1 / 2 * (np.cos(2 * np.pi / gravity_cycle * t) + 1) gravity = gravity_base * change_scale * (np.cos(2 * np.pi / gravity_cycle * t)) + gravity_base * ( 1 - change_scale) elif gravity_change_pattern == 'gravity_averagely_harder': # gravity = gravity_base * 1 / 2 * (-np.cos(2 * np.pi / gravity_cycle * t) + 1) + gravity_base gravity = gravity_base * change_scale * (-np.cos(2 * np.pi / gravity_cycle * t)) + gravity_base * ( 1 + change_scale) else: pass if 'PyBulletEnv' in env_name: env.env._p.setGravity(0, 0, gravity) elif 'Roboschool' in env_name: pass else: env.model.opt.gravity[2] = gravity # Step the env o2, r, d, _ = env.step(a) # Add observation noise o2 += obs_noise_scale * np.random.randn(obs_dim) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, reward_scale*r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if t > batch_size and without_delay_train: if random_n_step: n_step = np.random.randint(random_n_step_low, random_n_step_high + 1, 1)[0] batch = replay_buffer.sample_batch_multihead_n_step(batch_size, n_step_end=max(multi_head_multi_step_size)) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # import pdb; pdb.set_trace() # Q-learning update outs = sess.run([multihead_q_loss_list, multihead_q, train_q_op], feed_dict) logger.store(**{'LossQ{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]): outs[0][h_i] for h_i in range(multi_head_size)}) logger.store(**{'QVals{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]): outs[1][h_i] for h_i in range(multi_head_size)}) # Policy update outs = sess.run([multihead_q_pi_loss_list, train_pi_op, target_update], feed_dict) logger.store(**{'LossPi{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]): outs[0][h_i] for h_i in range(multi_head_size)}) if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. 
""" if not without_delay_train: for _ in range(ep_len): if random_n_step: n_step = np.random.randint(random_n_step_low, random_n_step_high+1, 1)[0] batch = replay_buffer.sample_batch_multihead_n_step(batch_size, n_step_end=max(multi_head_multi_step_size)) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([multihead_q_loss_list, multihead_q, train_q_op], feed_dict) logger.store(**{'LossQ{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]): outs[0][h_i] for h_i in range(multi_head_size)}) logger.store(**{'QVals{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]): outs[1][h_i] for h_i in range(multi_head_size)}) # Policy update outs = sess.run([multihead_q_pi_loss_list, train_pi_op, target_update], feed_dict) logger.store(**{'LossPi{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]): outs[0][h_i] for h_i in range(multi_head_size)}) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) for h_i in range(multi_head_size): logger.log_tabular('QVals{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]), with_min_and_max=True) for h_i in range(multi_head_size): logger.log_tabular('LossPi{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]), average_only=True) for h_i in range(multi_head_size): logger.log_tabular('LossQ{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]), average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
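
# ----------------------------------------------------------------------
# Illustration only: a tiny numeric check of the claim in the docstring
# above that the clipped objective stops rewarding policy ratios beyond
# 1 +/- clip_ratio. Plain PyTorch, mirroring compute_loss_pi; the toy
# numbers are arbitrary.
import torch


def clipped_surrogate(ratio, adv, clip_ratio=0.2):
    """Per-sample PPO-clip objective: min(ratio * adv, clip(ratio) * adv)."""
    clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
    return torch.min(ratio * adv, clip_adv)

# For a positive advantage, pushing the ratio from 1.2 to 1.5 earns no
# extra objective -- both evaluate to 1.2 * adv once clipped:
# clipped_surrogate(torch.tensor(1.2), torch.tensor(2.0))  -> 2.4
# clipped_surrogate(torch.tensor(1.5), torch.tensor(2.0))  -> 2.4
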
def td3(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=250, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, use_grad_penalty=True, penalty_scale=.025): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device=DEVICE) ac_targ = deepcopy(ac).to(device=DEVICE) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False clip_val = 10 for p in ac.parameters(): p.register_hook(lambda grad: torch.clamp(grad, -clip_val, clip_val)) p.register_hook(lambda grad: torch.where( grad != grad, torch.tensor(0., device=DEVICE), grad)) # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing TD3 Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions if use_grad_penalty: pi_targ = ac_targ.pi(o2) # Target policy smoothing epsilon = torch.randn_like(pi_targ) * target_noise epsilon = torch.clamp(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = torch.clamp(a2, -act_limit, act_limit) # Target Q-values q1_pi_targ = gradient_penalty(ac_targ.q1, o2, a2, epsilon=penalty_scale) q2_pi_targ = gradient_penalty(ac_targ.q2, o2, a2, epsilon=penalty_scale) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * q_pi_targ else: with torch.no_grad(): pi_targ = ac_targ.pi(o2) # Target policy smoothing epsilon = torch.randn_like(pi_targ) * target_noise epsilon = torch.clamp(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = torch.clamp(a2, -act_limit, act_limit) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging loss_info = dict(Q1Vals=q1.cpu().detach().numpy(), Q2Vals=q2.cpu().detach().numpy()) return loss_q, loss_info # Set up function for computing TD3 pi loss def compute_loss_pi(data): o = data['obs'] q1_pi = ac.q1(o, ac.pi(o)) return -q1_pi.mean() # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(q_params, lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data, timer): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() torch.nn.utils.clip_grad_value_(q_params, clip_val) q_optimizer.step() 
# Record things logger.store(LossQ=loss_q.item(), **loss_info) # Possibly update pi and target networks if timer % policy_delay == 0: # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() torch.nn.utils.clip_grad_value_(ac.pi.parameters(), clip_val) pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item()) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32, device=DEVICE)) a += noise_scale * np.random.randn(act_dim) if not np.isfinite(a).all(): pdb.set_trace() return np.clip(a, -act_limit, act_limit) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) def test_agent_transfer(): worst_case = np.inf for j in range(num_test_episodes): test_env = env_fn(transfer=True) # test_env = env_fn() o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 worst_case = min(ep_ret, worst_case) logger.store(TransferEpRet=ep_ret, TransferEpLen=ep_len) # logger.store(WorstTransferEpRet=worst_case) def test_agent_random(): worst_case = np.inf for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 o += np.random.normal(0, .01, o.shape) while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) o += np.random.normal(0, .01, o.shape) ep_ret += r ep_len += 1 worst_case = min(ep_ret, worst_case) logger.store(RandomEpRet=ep_ret, RandomEpLen=ep_len) def test_agent_adversarial_noise(): def adv_step(o): tens_o = torch.as_tensor(o, device=DEVICE) v = lambda obs: ac.q1(tens_o, ac.pi(obs)) #Value of policy given perturbed observation adv_obs = state_gradient(v, tens_o, epsilon=2e-2) #Bounded adversarial perturbation to observation return adv_obs.cpu().numpy() for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 o = adv_step(o) # o += np.random.normal(1, .01, o.shape) while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) o = adv_step(o) # o += np.random.normal(1, .01, o.shape) ep_ret += r ep_len += 1 logger.store(AdvEpRet=ep_ret, AdvEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. 
Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, timer=j) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() test_agent_transfer() test_agent_random() # test_agent_adversarial_noise() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('TransferEpRet', with_min_and_max=True) logger.log_tabular('RandomEpRet', with_min_and_max=True) # logger.log_tabular('AdvEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TransferEpLen', average_only=True) logger.log_tabular('RandomEpLen', average_only=True) # logger.log_tabular('AdvEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
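
# ----------------------------------------------------------------------
# Note: `gradient_penalty` (used in compute_loss_q) and `state_gradient`
# (used in test_agent_adversarial_noise) are helpers defined elsewhere and
# not shown in this excerpt. Purely as an illustration of the second idea
# -- a small, bounded, gradient-based perturbation of the observation --
# here is an FGSM-style sketch. It is an assumption about the general
# technique, not the actual implementation of `state_gradient`.
import torch


def fgsm_perturb_observation(value_fn, obs, epsilon=2e-2):
    """Nudge `obs` by epsilon in the direction that lowers value_fn(obs)."""
    obs = obs.clone().detach().requires_grad_(True)
    value_fn(obs).sum().backward()                 # d(value) / d(obs)
    with torch.no_grad():
        return (obs - epsilon * obs.grad.sign()).detach()
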
class DQNAgent: """DQN Agent interacting with environment. Attribute: env (gym.Env): openAI Gym environment memory (ReplayBuffer): replay memory to store transitions batch_size (int): batch size for sampling epsilon (float): parameter for epsilon greedy policy max_epsilon (float): max value of epsilon min_epsilon (float): min value aof epsilon target_update (int): period for target model's hard update gamma (float): discount factor dqn (Network): model to train and select actions dqn_target (Network): target model to update optimizer (torch.optim): optimizer for training dqn transition (list): transition information including state, action, reward, next_state, done """ def __init__( self, env: gym.Env, replay_size: int, batch_size: int, target_update: int, update_after: int, update_every: int, logger_kwargs, ): self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) seed = 0 torch.manual_seed(seed) np.random.seed(seed) # obs_dim = len(env.observation_space.spaces) # action_dim = env.action_space.n obs_dim = 5 action_dim = 3 self.env = env self.replaybuffer = ReplayBuffer(obs_dim, replay_size, batch_size) self.batch_size = batch_size self.epsilon = 0.1 self.target_update = target_update self.gamma = 0.9 self.update_after = update_after self.update_every = update_every self.action_dict = { 'Do_Nothing': 0, 'Emergency_CA': 1, 'Suggested_Shift_L4': 2, 'Shift_L4': 3, 'Correct_Distraction': 4, } self.sub_action_dict = { 'Suggested_Shift_L4': 2, 'Shift_L4': 3, 'Correct_Distraction': 4, } # device: cpu / gpu self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") # networks: dqn, dqn_target self.dqn = Network(obs_dim, action_dim).to(self.device) self.dqn_target = Network(obs_dim, action_dim).to(self.device) self.dqn_target.load_state_dict(self.dqn.state_dict()) self.dqn_target.eval() # optimizer, only for self.dqn self.optimizer = Adam(self.dqn.parameters(), lr=0.001) self.scheduler = lr_scheduler.StepLR(self.optimizer, step_size=5, gamma=0.8) # Set up model saving self.logger.setup_pytorch_saver(self.dqn) # transition to store in memory self.transition = list() # mode: train / test self.is_test = False def simple_case_action(self, state: np.ndarray) -> np.ndarray: ''' Choose action in normal and critical situation ''' # observations d_att = int(state[0][0]) d_pp = int(state[0][3]) auto_mode = int(state[0][4]) c_sc = int(state[0][14]) driver_fit_manual = auto_mode == 0 and d_att == 0 collision_risk = c_sc == 1 with_preference = d_pp == 1 # Initialization of action # if action ==-10, then it is not normal nor critical. 
action = -10 # Status 1: Critical if collision_risk: action = self.action_dict[ 'Emergency_CA'] # DiscreteAction.Emergency_CA # Status 2: Normal if (driver_fit_manual and not with_preference) and (not collision_risk): action = self.action_dict[ 'Do_Nothing'] # DiscreteAction.Do_Nothing # Status: SSL4 feedback f_dc = int(state[0][18]) # 0 - no reponse; 1 - accept; 2 - reject f_as = int( state[0] [19]) # 0 - do nothing; 1 - correct; 2 - suggest shift to L4 # SSL4 and feedbacks SSL4_last_time = f_as == 2 no_response_ssl4 = f_dc == 0 and SSL4_last_time accept_ssl4 = f_dc == 1 and SSL4_last_time reject_ssl4 = f_dc == 2 and SSL4_last_time if no_response_ssl4: # <No Response> and then repreat SSL4 action = self.action_dict[ 'Suggested_Shift_L4'] # Suggested Shift L4 elif accept_ssl4: # <Accept> and Shift to L4 action = self.action_dict['Shift_L4'] # Shift_L4 elif reject_ssl4: # <Reject> and Repeat Correction Distraction action = self.action_dict[ 'Correct_Distraction'] # Correct Distraction return action def complex_case_action(self, last_state: np.ndarray, state: np.ndarray) -> np.ndarray: ''' Choose action using using decision trees in complex situation ''' # current observations d_att = int(state[0][0]) d_fat = int(state[0][1]) comfort = int(state[0][2]) d_pp = int(state[0][3]) auto_mode = int(state[0][4]) L_max_now = int(state[0][5]) L_max_next = int(state[0][6]) c_sc = int(state[0][14]) c_ue = int(state[0][15]) c_vs = int(state[0][16]) f_dc = int(state[0][18]) # 0 - no reponse; 1 - accept; 2 - reject f_as = int( state[0] [19]) # 0 - do nothing; 1 - correct; 2 - suggest shift to L4 # last observations last_d_att = int(last_state[0][0]) # initial values action = -10 #--------------------------------------------------------# # Status 3: Degraded driver behavior # Correction distraction_begin = last_d_att == 0 and d_att == 1 DN_last_time = f_as == 0 if distraction_begin and DN_last_time: action = self.action_dict['Correct_Distraction'] CD_last_time = f_as == 1 distraction_eliminate = last_d_att == 1 and d_att == 0 distraction_still = last_d_att == 1 and d_att == 1 L4_available = L_max_now == 3 if CD_last_time and distraction_eliminate: # Correction works action = self.action_dict['Do_Nothing'] elif CD_last_time and distraction_still: # Correction fails if L4_available and f_dc != 2: # L4_available and not being rejected before action = self.action_dict['Suggested_Shift_L4'] elif not L4_available: # L4 unavailable and thus repeat corrections action = self.action_dict['Correct_Distraction'] # SSL4 and feedbacks SSL4_last_time = f_as == 2 no_response_ssl4 = f_dc == 0 and SSL4_last_time accept_ssl4 = f_dc == 1 and SSL4_last_time reject_ssl4 = f_dc == 2 and SSL4_last_time if no_response_ssl4: # <No Response> and then repreat SSL4 action = self.action_dict[ 'Suggested_Shift_L4'] # Suggested Shift L4 elif accept_ssl4: # <Accept> and Shift to L4 action = self.action_dict['Shift_L4'] # Shift_L4 elif reject_ssl4: # <Reject> and Repeat Correction Distraction action = self.action_dict[ 'Correct_Distraction'] # Correct Distraction #-------------------------------------------------------# if action == -10: print('Warning: no action generated from rule-based algorithm!') action_type, action = random.choice( list(self.sub_action_dict.items())) return action def get_action(self, state: np.ndarray, test_stage) -> np.ndarray: """ Select an action from the input state based on the rl policy: epsilon greedy policy """ if not test_stage and (np.random.random() < self.epsilon): # selected_action = 
self.env.action_space.sample() action_type, selected_action = random.choice( list(self.sub_action_dict.items())) else: selected_action = self.dqn( torch.FloatTensor(state).to(self.device)).argmax() # selected_action = selected_action.detach().cpu().numpy() # TODO +2 selected_action = selected_action.detach().cpu().numpy() + 2 return selected_action def custom_obs(self, state): ''' Return customized the state ''' d_att = int(state[0][0]) f_dc = int( state[0] [18]) # 0 - no reponse; 1 - accept; 2 - reject; -1-inactivated f_as = int(state[0][19]) auto_mode = int(state[0][4]) L_max_now = int(state[0][5]) cd_activate = 1 if f_as == 1 else 0 L4_available = 1 if L_max_now == 3 else 0 ssl4_activate = 1 if f_as == 2 else 0 c_state = [] c_state.append([d_att, cd_activate, L4_available, ssl4_activate, f_dc]) return np.array(c_state) def run(self, num_epoch: int, episodes_per_epoch: int, test_episodes: int): """Train the agent. episode_reward: lists of episode rewards sum_rew: float: reward of each iteration iter_time: the number of time steps update_cnt: determine the frequency of updating the target network """ print('********************* Training Starts *********************') self.is_test = False model_loss, epsilons = [], [] episode_id, step_id, iter_time, update_cnt = 0, 0, 0, 0 sum_rew = 0.0 state = self.env.reset() last_state = state already_starts = False episode_begin = 1e8 total_episode = num_epoch * episodes_per_epoch while episode_id < total_episode: step_id += 1 cus_state = self.custom_obs(state) # 1: >>>> Choose action action = self.simple_case_action(state) if action == -10: if iter_time < 2000: action_type, action = random.choice( list(self.sub_action_dict.items())) else: action = self.get_action(cus_state, False) # 2: >>>> Run episode next_state, reward, done, d_info = self.env.step(action) # 3: >>>> To determine the moment when data starts to be saved in the replay buffer mydict = self.action_dict d_att_last = int(last_state[0][0]) d_att_now = int(state[0][0]) d_att_next = int(next_state[0][0]) l4_next = 1 if int(next_state[0][5]) == 3 else 0 degradation_begin = d_att_now == 0 and d_att_next == 1 L4_available = int(state[0][5]) == 3 TESD = state[0][12] if degradation_begin and not already_starts: episode_begin = step_id already_starts = True if not self.is_test and step_id >= episode_begin: if d_att_now == 1: # put distraction states into replay buffer cus_next_state = self.custom_obs(next_state) self.transition = [ cus_state, action, reward, cus_next_state, done ] self.replaybuffer.store(*self.transition) iter_time = iter_time + 1 print( 'Episode:{}, Step:{}, Iteration:{}, State[d_att,cd_activate,L4_available,ssl4_activate,f_dc]:{}' .format(episode_id, step_id, iter_time, cus_state[0])) print( 'Dis_Last:{}, Dis_Now:{}, Dis_Next:{},L4_Next:{}, Reward+Cost:{}, Action:{}' .format( d_att_last, d_att_now, d_att_next, l4_next, reward, list(mydict.keys())[list( mydict.values()).index(action)])) # 4: >>>> Update state and sum of rewards state = next_state sum_rew += reward # 5. >>>> End and Reset if done: print('Done infos: ', d_info) print('Return(Sum of Rewards):{}'.format(round(sum_rew, 1))) print( '-------------------------------------------------------------------------------------------------------------------------' ) # TODO self.logger.store(EpRet=sum_rew) # reset env state = self.env.reset() last_state = state sum_rew = 0.0 episode_id = episode_id + 1 step_id = 0 already_starts = False episode_begin = 1e8 # 6. 
>> Update Model Parameters if (iter_time >= self.update_after) and (iter_time % self.update_every == 0): for j in range(self.update_every): self.update_model() update_cnt += 1 if update_cnt % self.target_update == 0: self._target_hard_update() # 7. Save and log information if (iter_time >= self.update_after and done) and (episode_id + 1) % episodes_per_epoch == 0: # Epoch information epoch = episode_id // episodes_per_epoch self.scheduler.step() # self.lr_list.append(optimizer.state_dict()['param_groups'][0]['lr']) # Save model self.logger.save_state({'env': self.env}, None) # Test the performance of the agent self.test_agent(test_episodes) # Save important info self.logger.log_tabular('Epoch', epoch) self.logger.log_tabular('EpRet', with_min_and_max=True) self.logger.log_tabular('TestEpRet', with_min_and_max=True) self.logger.log_tabular('QVals', with_min_and_max=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.log_tabular('TotalEnvInteracts', iter_time) self.logger.dump_tabular() def update_model(self): """Update the model by gradient descent.""" samples = self.replaybuffer.sample_batch() loss_q, q_info = self._compute_dqn_loss(samples) self.optimizer.zero_grad() loss_q.backward() self.optimizer.step() self.logger.store(LossQ=loss_q.item(), **q_info) def _target_hard_update(self): """Hard update: target <- local.""" self.dqn_target.load_state_dict(self.dqn.state_dict()) def _compute_dqn_loss(self, samples: Dict[str, np.ndarray]) -> torch.Tensor: """Return dqn loss.""" device = self.device # for shortening the following lines state = torch.FloatTensor(samples["obs"]).to(device) next_state = torch.FloatTensor(samples["next_obs"]).to(device) action = torch.LongTensor(samples["acts"].reshape(-1, 1)).to(device) reward = torch.FloatTensor(samples["rews"].reshape(-1, 1)).to(device) done = torch.FloatTensor(samples["done"].reshape(-1, 1)).to(device) # G_t = r + gamma * v(s_{t+1}) if state != Terminal # = r otherwise curr_q_value = self.dqn(state).gather(1, action) next_q_value = self.dqn_target(next_state).gather( # Double DQN 1, self.dqn(next_state).argmax(dim=1, keepdim=True)).detach() mask = 1 - done target = (reward + self.gamma * next_q_value * mask).to( self.device) # ground truth # calculate dqn loss loss_fun = torch.nn.MSELoss().to(self.device) loss_q = loss_fun(curr_q_value, target) loss_info = dict(QVals=curr_q_value.detach().numpy()) return loss_q, loss_info def test_agent(self, test_episodes): """ Test the agent """ for j in range(test_episodes): sum_rew, done, state = 0.0, False, self.env.reset() while not done: action = self.simple_case_action(state) if action == -10: cus_state = self.custom_obs(state) action = self.get_action(cus_state, True) next_state, reward, done, infos = self.env.step(action) state = next_state sum_rew += reward self.logger.store(TestEpRet=sum_rew)
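
# ----------------------------------------------------------------------
# Illustration only: the target used in _compute_dqn_loss above is the
# Double-DQN form -- the online network selects the greedy next action,
# the target network evaluates it. A minimal stand-alone PyTorch sketch
# with toy argument names (not wired into the agent):
import torch


def double_dqn_target(dqn, dqn_target, next_state, reward, done, gamma=0.9):
    """reward + gamma * (1 - done) * Q_target(s', argmax_a Q_online(s', a))."""
    with torch.no_grad():
        best_action = dqn(next_state).argmax(dim=1, keepdim=True)   # select with online net
        next_q = dqn_target(next_state).gather(1, best_action)      # evaluate with target net
    return reward + gamma * next_q * (1 - done)
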
def ddpg_mixed_n_step(env_name, ac_kwargs=dict(), seed=0, new_mlp=True, dropout_rate = 0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), lambda_value = 0.8, n_step_start = 1, n_step_end = 5, rejection_method = 'no_rejection_average_weight', n_step=1, random_n_step=False, random_n_step_low=1, random_n_step_high=5, gamma=0.99, without_delay_train=False, obs_noise_scale=0, nonstationary_env=False, gravity_change_pattern = 'gravity_averagely_equal', gravity_cycle = 1000, gravity_base = -9.81, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, random_action_baseline=False, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = gym.make(env_name), gym.make(env_name) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph n_step_size = n_step_end - (n_step_start - 1) x_ph = tf.placeholder(dtype=tf.float32, shape=(None, obs_dim)) a_ph = tf.placeholder(dtype=tf.float32, shape=(None, act_dim)) x2_ph = tf.placeholder(dtype=tf.float32, shape=(None, n_step_size, obs_dim)) r_ph = tf.placeholder(dtype=tf.float32, shape=(None, None)) d_ph = tf.placeholder(dtype=tf.float32, shape=(None, None)) n_step_ph = tf.placeholder(dtype=tf.float32, shape=()) hidden_sizes = list(ac_kwargs['hidden_sizes']) actor_hidden_activation = tf.keras.activations.relu actor_output_activation = tf.keras.activations.tanh critic_hidden_activation = tf.keras.activations.relu critic_output_activation = tf.keras.activations.linear # Main outputs from computation graph with tf.variable_scope('main'): actor = MLP(layer_sizes=hidden_sizes + [act_dim], hidden_activation=actor_hidden_activation, output_activation=actor_output_activation) critic = MLP(layer_sizes=hidden_sizes + [1], hidden_activation=critic_hidden_activation, output_activation=critic_output_activation) # Set training=False to ignore dropout masks pi = act_limit * actor(x_ph, training=False) q = tf.squeeze(critic(tf.concat([x_ph, a_ph], axis=-1)), axis=1) q_pi = tf.squeeze(critic(tf.concat([x_ph, pi], axis=-1)), axis=1) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). actor_targ = MLP(layer_sizes=hidden_sizes + [act_dim], hidden_activation=actor_hidden_activation, output_activation=actor_output_activation) critic_targ = MLP(layer_sizes=hidden_sizes + [1], hidden_activation=critic_hidden_activation, output_activation=critic_output_activation) # Set training=False to ignore dropout for backup target value n_step_q_pi_targ = [] for n_step_i in range(n_step_size): n_step_x2 = tf.squeeze(tf.slice(x2_ph, [0, n_step_i, 0], [batch_size, 1, obs_dim]), axis=1) n_step_pi_targ = act_limit * actor_targ(n_step_x2) n_step_q_pi_targ.append( tf.squeeze(critic_targ(tf.concat([n_step_x2, n_step_pi_targ], axis=-1)), axis=1)) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n'%var_counts) # Bellman backup for Q function n_step_backup_list = [] n_step_backup_weight_list = [] n_step_backup_weighted_list = [] for n_step in range(n_step_start, n_step_end+1): print(n_step) if n_step <= n_step_end - 1: n_step_weight = (1 - lambda_value) * lambda_value ** (n_step - n_step_start) else: n_step_weight = lambda_value ** (n_step_end - n_step_start) n_step_backup = tf.stop_gradient(tf.reduce_sum(tf.multiply(tf.pow(gamma, tf.range(0, n_step, dtype=tf.float32)) * (1 - tf.slice(d_ph, [0, 0], [batch_size, n_step])), tf.slice(r_ph, [0, 0], [batch_size, n_step])), axis=1) + gamma ** n_step * (1 - tf.reshape(tf.slice(d_ph, [0, n_step], [batch_size, 1]), [-1])) * n_step_q_pi_targ[n_step-n_step_start]) n_step_backup_list.append(n_step_backup) n_step_backup_weight_list.append(n_step_weight) n_step_backup_weighted_list.append(n_step_weight*n_step_backup) # TODO: could we consider standard deviation of n-step bootstrapped Q and reject outliers? 
# because for different states the extent of overestimation might be different! # DDPG losses # 1. pi loss pi_loss = -tf.reduce_mean(q_pi) all_n_step_backup = tf.stack(n_step_backup_list, axis=1) if rejection_method == 'no_rejection_average_weight': q_loss = tf.reduce_mean((q - tf.reduce_mean(all_n_step_backup, axis=1)) ** 2) elif rejection_method == 'no_rejection_lambda_weight': q_loss = tf.reduce_mean((q - tf.reduce_sum(tf.stack(n_step_backup_weighted_list, axis=1), axis=1)) ** 2) elif rejection_method == 'mean_and_std_rejection': # 1. Meand and Standard Deviation rejection rejection_deviation_scale = 3 all_n_step_backup_std = tf.math.reduce_std(all_n_step_backup, axis=1) all_n_step_backup_mean = tf.math.reduce_mean(all_n_step_backup, axis=1) rejection_upper_bound = tf.reshape(all_n_step_backup_mean + rejection_deviation_scale * all_n_step_backup_std, shape=(batch_size, 1)) rejection_lower_bound = tf.reshape(all_n_step_backup_mean - rejection_deviation_scale * all_n_step_backup_std, shape=(batch_size, 1)) # mean-std<= kept values < mean+std kept_mask = tf.dtypes.cast(tf.math.logical_and(tf.math.less(all_n_step_backup, rejection_upper_bound), tf.math.greater(all_n_step_backup, rejection_lower_bound)), tf.float32) mean_backup_after_rejection = tf.reduce_sum(tf.math.multiply(all_n_step_backup, kept_mask), axis=1) / tf.reduce_sum( kept_mask, axis=1) q_loss = tf.reduce_mean((q - mean_backup_after_rejection) ** 2) elif rejection_method == 'interquartile_rejection': # 2. Interquartile rejection reject_low_k = 1 reject_top_k = 1 n_size = n_step_end - n_step_start + 1 after_rejct_top_k = tf.math.top_k(-all_n_step_backup, n_size - reject_top_k)[0] after_rejct_low_k = tf.math.top_k(-after_rejct_top_k, n_size - reject_low_k)[0] q_loss = tf.reduce_mean((q-tf.reduce_mean(after_rejct_low_k, axis=1))**2) # import pdb; pdb.set_trace() # q_loss = tf.reduce_mean((q - tf.reduce_sum(tf.stack(n_step_backup_weighted_list, axis=1), axis=1)) ** 2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=actor.variables) train_q_op = q_optimizer.minimize(q_loss, var_list=critic.variables) # Polyak averaging for target variables target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(actor.variables + critic.variables, actor_targ.variables + critic_targ.variables)]) # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(actor.variables + critic.variables, actor_targ.variables + critic_targ.variables)]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # # Setup model saving # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q}) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) a = get_action(o, 0) # print(a) o, r, d, _ = test_env.step(a) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() # # TODO: delete env.render() # env.render() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs 
# Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps and not random_action_baseline: a = get_action(o, act_noise) else: a = env.action_space.sample() #env.render() # Manipulate environment change_scale = 1/8 if nonstationary_env == True: if gravity_change_pattern == 'gravity_averagely_equal': # gravity = gravity_base * 1 / 2 * (np.cos(2 * np.pi / gravity_cycle * t) + 1) + gravity_base / 2 gravity = gravity_base + np.abs(gravity_base) * change_scale * np.sin(2 * np.pi / gravity_cycle * t) elif gravity_change_pattern == 'gravity_averagely_easier': # gravity = gravity_base * 1 / 2 * (np.cos(2 * np.pi / gravity_cycle * t) + 1) gravity = gravity_base * change_scale * (np.cos(2 * np.pi / gravity_cycle * t)) + gravity_base * ( 1 - change_scale) elif gravity_change_pattern == 'gravity_averagely_harder': # gravity = gravity_base * 1 / 2 * (-np.cos(2 * np.pi / gravity_cycle * t) + 1) + gravity_base gravity = gravity_base * change_scale * (-np.cos(2 * np.pi / gravity_cycle * t)) + gravity_base * ( 1 + change_scale) else: pass if 'PyBulletEnv' in env_name: env.env._p.setGravity(0, 0, gravity) elif 'Roboschool' in env_name: pass else: env.model.opt.gravity[2] = gravity # Step the env o2, r, d, _ = env.step(a) # Add observation noise o2 += obs_noise_scale * np.random.randn(obs_dim) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if t > batch_size and without_delay_train: if random_n_step: n_step = np.random.randint(random_n_step_low, random_n_step_high + 1, 1)[0] batch = replay_buffer.sample_batch_mixed_n_step(batch_size, n_step_start=n_step_start, n_step_end=n_step_end) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # import pdb; pdb.set_trace() # Q-learning update outs = sess.run([q_loss, q, n_step_backup_weighted_list, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) logger.store(LossQ=outs[0], QVals=outs[1]) logger.store(**{'{}Step_Backup'.format(i): outs[2][i-n_step_start] for i in range(n_step_start, n_step_end + 1)}) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. 
""" if not without_delay_train: for _ in range(ep_len): if random_n_step: n_step = np.random.randint(random_n_step_low, random_n_step_high+1, 1)[0] batch = replay_buffer.sample_batch_mixed_n_step(batch_size, n_step_start=n_step_start, n_step_end=n_step_end) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # mean_kept_mask = sess.run(tf.reduce_mean(tf.reduce_sum(kept_mask, axis=1)), feed_dict) # print('mean_kept_mask={}'.format(mean_kept_mask)) # all_backup = sess.run(all_n_step_backup, feed_dict) # upper_bound = sess.run(rejection_upper_bound, feed_dict) # lower_bound = sess.run(rejection_lower_bound, feed_dict) # # import pdb; pdb.set_trace() # print('all_backup[0,:]={}, [{},{}] '.format(all_backup[0,:],lower_bound[0], upper_bound[0])) # Q-learning update # Q-learning update outs = sess.run([q_loss, q, n_step_backup_weighted_list, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) logger.store(**{'{}Step_Backup'.format(i): outs[2][i-n_step_start] for i in range(n_step_start, n_step_end+1)}) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) for i in range(n_step_start, n_step_end+1): logger.log_tabular('{}Step_Backup'.format(i), average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
def elu_ddpg( env_fn, render_env=False, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, # TODO: change back to 10000 start_steps=10000, #start_steps=10000, reward_scale=5, act_noise=0.1, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph # x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) x_ph, \ a_ph, a_mu_ph, a_alpha_ph, a_beta_ph, \ x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, act_dim, act_dim, int(act_dim*(act_dim-1)/2), obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, pi_mu, pi_alpha, pi_beta, pi_cov, q, q_pi, q_pi_mu = actor_critic( x_ph, a_ph, **ac_kwargs) # pi, q, q_mu, q_sigma, q_pi, q_pi_mu, q_pi_sigma = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). pi_targ, pi_mu_targ, pi_alpha_targ, pi_beta_targ, pi_cov_targ, _, q_pi_targ, q_pi_mu_targ = actor_critic( x2_ph, a_ph, **ac_kwargs) # pi_targ, _, _, _, q_pi_targ, q_pi_mu_targ, q_pi_sigma_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, logger_fname='experiences_log.txt', **logger_kwargs) # # Count variables # var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) # print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n'%var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses # TODO: add term to penalize large variance, give penalize term cofficient # # pi_loss = tf.reduce_mean(-q_pi + # (1/act_dim) * tf.norm(pi_alpha,ord=2,axis=1) + # 1/(act_dim*(act_dim-1)/2) * tf.norm(pi_beta,ord=1,axis=1)) # Option 1. (pass) pi_loss = tf.reduce_mean(-q_pi) # Option 2. (pass) # pi_loss = tf.reduce_mean(-q_pi-q_pi_mu) # Option 3. (pass) # pi_loss = tf.reduce_mean(-q_pi-tf.linalg.logdet(pi_cov)) # Option 4. (pass) # pi_loss = tf.reduce_mean(-q_pi - tf.linalg.logdet(tf.linalg.inv(pi_cov))) # Option 5. # pi_loss = tf.reduce_mean(-q_pi/2 -q_pi_mu/2 - tf.linalg.logdet(tf.linalg.inv(pi_cov))) # Option 5. 
# pi_loss = tf.reduce_mean(-q_pi/2 -q_pi_mu/2 - 0.001*tf.linalg.logdet(pi_cov)) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() # sess = tf_debug.LocalCLIDebugWrapperSession(sess) sess.run(tf.global_variables_initializer()) sess.run(target_init) # import pdb; pdb.set_trace() writer = tf.summary.FileWriter( osp.join(logger_kwargs['output_dir'], 'graph'), sess.graph) writer.flush() # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi_mu': pi_mu, 'pi_alpha': pi_alpha, 'pi_beta': pi_beta, 'q': q }) def get_action(o): # import pdb; pdb.set_trace() # a_mu, a_alpha, a_beta, a_cov = sess.run([pi_mu, pi_alpha, pi_beta, pi_cov], feed_dict={x_ph: o.reshape(1,-1)}) # # if np.any(np.linalg.eigvals(a_cov[0])<=0): # import pdb;pdb.set_trace() a, a_mu, a_alpha, a_beta, a_cov = sess.run( [pi, pi_mu, pi_alpha, pi_beta, pi_cov], feed_dict={x_ph: o.reshape(1, -1)}) a, a_mu, a_alpha, a_beta, a_cov = a[0], a_mu[0], a_alpha[0], a_beta[ 0], a_cov[0] return a, a_mu, a_alpha, a_beta, a_cov def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) a, a_mu, a_alpha, a_beta, a_cov = get_action(o) o, r, d, _ = test_env.step(a) # o, r, d, _ = test_env.step(a_mu) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a, a_mu, a_alpha, a_beta, a_cov = get_action(o) # import pdb; pdb.set_trace() print(a_alpha) else: a = env.action_space.sample() a_mu = a a_alpha = np.zeros((act_dim, )) a_beta = np.zeros((int(act_dim * (act_dim - 1) / 2), )) a_cov = np.zeros((act_dim, act_dim)) # Step the env if render_env: env.render() o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer # TODO: use determinant as extrinsic reward print('np.linalg.det(a_cov)={}'.format(np.linalg.det(a_cov))) replay_buffer.store(o, a, a_mu, a_alpha, a_beta, a_cov, reward_scale * (r + np.linalg.det(a_cov)), o2, d, t, steps_per_epoch, start_time) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. """ # print('training ...') for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], a_mu_ph: batch['acts_mu'], a_alpha_ph: batch['acts_alpha'], a_beta_ph: batch['acts_beta'], r_ph: batch['rews'], d_ph: batch['done'] } # import pdb; pdb.set_trace() # # outs = sess.run([pi_mu, pi_alpha, pi_beta], feed_dict) # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) if outs[0] > 10000: print('q_loss={}'.format(outs[0])) # import pdb; # pdb.set_trace() # Policy update if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # print('training done.') # if t%1000 == 0: # print('step={}'.format(t)) # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. # TODO: change test number test_agent(2) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
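# ---------------------------------------------------------------------------
# Illustrative sketch of the reward shaping used in elu_ddpg above: the
# determinant of the policy's action covariance is added to the environment
# reward as an exploration bonus before scaling and storing the transition.
# The helper name is hypothetical; the formula mirrors the replay_buffer.store
# call in the training loop.
import numpy as np

def shaped_reward(r, a_cov, reward_scale):
    """Scaled environment reward plus a covariance-determinant bonus."""
    # A larger det(a_cov) means a broader policy at this state, so the bonus
    # favours states where the policy is still uncertain and exploratory.
    return reward_scale * (r + np.linalg.det(a_cov))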
class sac_discrete_class: def __init__(self, env_fn, Actor=core.DiscreteMLPActor, Critic=core.DiscreteMLPQFunction, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(5e5), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_times_every_step=50, num_test_episodes=10, max_ep_len=100000, logger_kwargs=dict(), save_freq=1, automatic_entropy_tuning=True, use_gpu=False, gpu_parallel=False, show_test_render=False, last_save_path=None, **kwargs): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_times_every_step (int): Number of env interactions that should elapse between gradient descent updates. 
Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ self.ac_kwargs = ac_kwargs self.seed = seed self.steps_per_epoch = steps_per_epoch self.epochs = epochs self.replay_size = replay_size self.gamma = gamma self.polyak = polyak self.lr = lr self.alpha = alpha self.batch_size = batch_size self.start_steps = start_steps self.update_after = update_after self.update_times_every_step = update_times_every_step self.num_test_episodes = num_test_episodes self.max_ep_len = max_ep_len self.logger_kwargs = logger_kwargs self.save_freq = save_freq self.automatic_entropy_tuning = automatic_entropy_tuning self.use_gpu = use_gpu self.gpu_parallel = gpu_parallel self.show_test_render = show_test_render self.last_save_path = last_save_path self.kwargs = kwargs self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) self.env = env_fn() self.test_env = env_fn() self.env.seed(seed) # env.seed(seed) # test_env.seed(seed) self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.n # Create actor-critic module and target networks self.actor = Actor(self.obs_dim, self.act_dim, **ac_kwargs) self.critic1 = Critic(self.obs_dim, self.act_dim, **ac_kwargs) self.critic2 = Critic(self.obs_dim, self.act_dim, **ac_kwargs) self.critic1_targ = deepcopy(self.critic1) self.critic2_targ = deepcopy(self.critic2) # gpu是否使用 if torch.cuda.is_available(): self.device = torch.device("cuda" if self.use_gpu else "cpu") if gpu_parallel: self.actor = torch.nn.DataParallel(self.actor) self.critic1 = torch.nn.DataParallel(self.critic1) self.critic2 = torch.nn.DataParallel(self.critic2) self.critic1_targ = torch.nn.DataParallel(self.critic1_targ) self.critic2_targ = torch.nn.DataParallel(self.critic2_targ) else: self.use_gpu = False self.gpu_parallel = False self.device = torch.device("cpu") # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.critic1_targ.parameters(): p.requires_grad = False for p in self.critic2_targ.parameters(): p.requires_grad = False self.actor.to(self.device) self.critic1.to(self.device) self.critic2.to(self.device) self.critic1_targ.to(self.device) self.critic2_targ.to(self.device) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=1, size=replay_size, device=self.device) # # List of parameters for both Q-networks (save this for convenience) # q_params = itertools.chain(critic1.parameters(), critic2.parameters()) if self.automatic_entropy_tuning: # we set the max possible entropy as the target entropy self.target_entropy = -np.log((1.0 / self.act_dim)) * 0.98 self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha = self.log_alpha.exp() self.alpha_optim = Adam([self.log_alpha], lr=lr, eps=1e-4) # Count variables (protip: try to get a feel for how different size networks behave!) 
var_counts = tuple( core.count_vars(module) for module in [self.actor, self.critic1, self.critic2]) self.logger.log( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.actor.parameters(), lr=lr) self.q1_optimizer = Adam(self.critic1.parameters(), lr=lr) self.q2_optimizer = Adam(self.critic2.parameters(), lr=lr) if last_save_path is not None: checkpoints = torch.load(last_save_path) self.epoch = checkpoints['epoch'] self.actor.load_state_dict(checkpoints['actor']) self.critic1.load_state_dict(checkpoints['critic1']) self.critic2.load_state_dict(checkpoints['critic2']) self.pi_optimizer.load_state_dict(checkpoints['pi_optimizer']) self.q1_optimizer.load_state_dict(checkpoints['q1_optimizer']) self.q2_optimizer.load_state_dict(checkpoints['q2_optimizer']) self.critic1_targ.load_state_dict(checkpoints['critic1_targ']) self.critic2_targ.load_state_dict(checkpoints['critic2_targ']) # last_best_Return_per_local = checkpoints['last_best_Return_per_local'] print("succesfully load last prameters") else: self.epoch = 0 print("Dont load last prameters.") # Set up function for computing SAC Q-losses def compute_loss_q(self, data): # Bellman backup for Q functions with torch.no_grad(): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] r = r.unsqueeze(-1) if r.ndim == 1 else r d = d.unsqueeze(-1) if d.ndim == 1 else d # Target actions come from *current* policy a2, (a2_p, logp_a2), _ = self.get_action(o2) # Target Q-values q1_pi_targ = self.critic1_targ(o2) q2_pi_targ = self.critic2_targ(o2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) min_qf_next_target = a2_p * (q_pi_targ - self.alpha * logp_a2) min_qf_next_target = min_qf_next_target.mean(dim=1).unsqueeze(-1) backup = r + self.gamma * (1 - d) * min_qf_next_target q1 = self.critic1(o).gather(1, a.long()) q2 = self.critic2(o).gather(1, a.long()) # MSE loss against Bellman backup loss_q1 = F.mse_loss(q1, backup) loss_q2 = F.mse_loss(q2, backup) # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q1, loss_q2, q_info # Set up function for computing SAC pi loss def compute_loss_pi(self, data): state_batch = data['obs'] action, (action_probabilities, log_action_probabilities), _ = self.get_action(state_batch) qf1_pi = self.critic1(state_batch) qf2_pi = self.critic2(state_batch) min_qf_pi = torch.min(qf1_pi, qf2_pi) inside_term = self.alpha * log_action_probabilities - min_qf_pi policy_loss = action_probabilities * inside_term policy_loss = policy_loss.mean() log_action_probabilities = torch.sum(log_action_probabilities * action_probabilities, dim=1) # Useful info for logging pi_info = dict(LogPi=log_action_probabilities.detach().cpu().numpy()) return policy_loss, log_action_probabilities, pi_info def take_optimisation_step(self, optimizer, network, loss, clipping_norm=None, retain_graph=False): if not isinstance(network, list): network = [network] optimizer.zero_grad() # reset gradients to 0 loss.backward( retain_graph=retain_graph) # this calculates the gradients if clipping_norm is not None: for net in network: torch.nn.utils.clip_grad_norm_( net.parameters(), clipping_norm) # clip gradients to help stabilise training optimizer.step() # this applies the gradients def soft_update_of_target_network(self, local_model, target_model, tau): """Updates the target network in the direction of the local network but by taking a step size less than one so the target 
network's parameter values trail the local networks. This helps stabilise training""" for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def update(self, data): # First run one gradient descent step for Q1 and Q2 loss_q1, loss_q2, q_info = self.compute_loss_q(data) self.take_optimisation_step( self.q1_optimizer, self.critic1, loss_q1, 5, ) self.take_optimisation_step( self.q2_optimizer, self.critic2, loss_q2, 5, ) # Record things self.logger.store(LossQ=(loss_q1.item() + loss_q2.item()) / 2., **q_info) # Freeze Q-networks so you don't waste computational effort # # computing gradients for them during the policy learning step. # for p in q_params: # p.requires_grad = False # Next run one gradient descent step for pi. loss_pi, log_pi, pi_info = self.compute_loss_pi(data) # Record things self.logger.store(LossPi=loss_pi.item(), **pi_info) # # Unfreeze Q-networks so you can optimize it at next DDPG step. # for p in q_params: # p.requires_grad = True if self.automatic_entropy_tuning: alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() # logger.store(alpha_loss=alpha_loss.item()) self.take_optimisation_step( self.pi_optimizer, self.actor, loss_pi, 5, ) with torch.no_grad(): for p, p_targ in zip(self.critic1.parameters(), self.critic1_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(self.polyak) p_targ.data.add_((1 - self.polyak) * p.data) for p, p_targ in zip(self.critic2.parameters(), self.critic2_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(self.polyak) p_targ.data.add_((1 - self.polyak) * p.data) if self.automatic_entropy_tuning: self.take_optimisation_step(self.alpha_optim, None, alpha_loss, None) self.alpha = self.log_alpha.exp() def get_action(self, state): """Given the state, produces an action, the probability of the action, the log probability of the action, and the argmax action""" action_probabilities = self.actor(state) max_probability_action = torch.argmax(action_probabilities).unsqueeze( 0) action_distribution = Categorical(action_probabilities) action = action_distribution.sample().cpu() # Have to deal with the situation of 0.0 probabilities because we can't do log 0 z = action_probabilities == 0.0 z = z.float() * 1e-8 log_action_probabilities = torch.log(action_probabilities + z) return action, (action_probabilities, log_action_probabilities), max_probability_action def test_agent(self): for j in range(self.num_test_episodes): o, d, ep_ret, ep_len = self.test_env.reset(), False, 0, 0 while not (d or (ep_len == self.max_ep_len)): if self.show_test_render: self.test_env.render() # Take deterministic actions at test time with torch.no_grad(): _, (_, _), a = self.get_action( torch.FloatTensor([o]).to(self.device)) o, r, d, _ = self.test_env.step(a.cpu().item()) ep_ret += r ep_len += 1 text = "\r\x1b[32mEpoch: %s, TestEp_ret: %s, Testep_len: %s.\x1b[0m" % \ (self.epoch, ep_ret, ep_len) sys.stdout.write(text) sys.stdout.flush() self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) def run(self): # Prepare for interaction with environment total_steps = self.steps_per_epoch * self.epochs start_time = time.time() o, ep_ret, ep_len = self.env.reset(), 0, 0 eps = 1 t = self.epoch * self.steps_per_epoch if self.last_save_path is not None else 0 # Main loop: collect experience in env and update/log each epoch self.actor.eval() while t < total_steps: # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t >= self.start_steps: with torch.no_grad(): a, _, _ = self.get_action(torch.FloatTensor([o]).to(self.device)) if o.shape == self.obs_dim else \ self.get_action(torch.FloatTensor(o).to(self.device)) a = a.cpu().item() else: a = np.random.randint(0, self.act_dim) # Step the env o2, r, d, _ = self.env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == self.max_ep_len else d # Store experience to replay buffer self.replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == self.max_ep_len): # ep_len == max_ep_len is the minimum episode length when the game is solved self.logger.store(EpRet=ep_ret, EpLen=ep_len) text = "\r\x1b[32mEpoch: %s, Episode: %s, Ep_ret: %s, ep_len: %s. 
[%s/%s] \x1b[0m" % \ (self.epoch, eps, ep_ret, ep_len, t+1, total_steps) sys.stdout.write(text) sys.stdout.flush() o, ep_ret, ep_len = self.env.reset(), 0, 0 # if eps % 30 == 0: # logger.log('\nEpisode: %s\n,\tEp_ret: %s,\tep_len: %s' % (eps, ep_ret,ep_len)) eps += 1 # Update handling if t >= self.update_after and t % self.update_times_every_step == 0: self.actor.train() for j in range(self.update_times_every_step): batch = self.replay_buffer.sample_batch(self.batch_size) self.update(data=batch) self.actor.eval() # logger.save_epoch_Ret_optimizer_model(save_dict) # last_best_Return_per_local = Return_per_local # End of epoch handling if ( t + 1 ) % self.steps_per_epoch == 0 and t > self.update_after: # every steps_per_epoch steps, once past update_after steps if ( t + 1 ) % self.update_times_every_step == 0: # and only on an update_times_every_step boundary self.epoch = (t + 1) // self.steps_per_epoch # Save model if proc_id() == 0 and (self.epoch) % self.save_freq == 0: save_dict = { 'epoch': self.epoch, 'actor': self.actor.state_dict(), 'critic1': self.critic1.state_dict(), 'critic2': self.critic2.state_dict(), 'pi_optimizer': self.pi_optimizer.state_dict(), 'q1_optimizer': self.q1_optimizer.state_dict(), 'q2_optimizer': self.q2_optimizer.state_dict(), 'critic1_targ': self.critic1_targ.state_dict(), 'critic2_targ': self.critic2_targ.state_dict(), } self.logger.save_epoch_Ret_optimizer_model( save_dict, self.epoch) self.actor.eval() # Test the performance of the deterministic version of the agent. self.test_agent() # Log info about epoch self.logger.log_tabular('Epoch', self.epoch) self.logger.log_tabular('EpRet', with_min_and_max=True) self.logger.log_tabular('TestEpRet', with_min_and_max=False) self.logger.log_tabular('EpLen', average_only=True) self.logger.log_tabular('TestEpLen', average_only=True) self.logger.log_tabular('TotalEnvInteracts', t) self.logger.log_tabular('Q1Vals', with_min_and_max=True) self.logger.log_tabular('Q2Vals', with_min_and_max=True) self.logger.log_tabular('LogPi', with_min_and_max=True) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.log_tabular('Time', time.time() - start_time) # if epoch > 1: # (time.time() - start_time)/epo self.logger.dump_tabular() t += 1
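# ---------------------------------------------------------------------------
# Illustrative sketch of the soft Bellman target used by sac_discrete_class:
# with a finite action space, the soft value of the next state can be written
# as an exact expectation over the policy's action probabilities rather than a
# sampled estimate. Names here are hypothetical; note that compute_loss_q above
# averages (rather than sums) the probability-weighted terms over the action
# dimension.
import torch

def discrete_soft_backup(r, d, probs2, log_probs2, q1_targ, q2_targ, alpha, gamma):
    """r, d: (batch, 1); probs2, log_probs2, q1_targ, q2_targ: (batch, act_dim)."""
    min_q = torch.min(q1_targ, q2_targ)                                  # clipped double-Q
    v_next = (probs2 * (min_q - alpha * log_probs2)).sum(dim=1, keepdim=True)
    return r + gamma * (1 - d) * v_next                                  # call under torch.no_grad()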
def ddpg(env_fn=core.ALMEnv, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=300, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=.01, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, time_horizon=80, discount_rate=.06): """ Deep Deterministic Policy Gradient (DDPG) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. In this version, the default environment is 'ALMEnv' actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, and a ``q`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q`` (batch,) | Tensor containing the current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) # env, test_env = env_fn(), env_fn() original OpenAI SpinningUp entry env = env_fn(T=time_horizon, rate=discount_rate) # Added by the author test_env = env_fn(T=time_horizon, rate=discount_rate) # Added by the author obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q]) logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts) # Set up function for computing DDPG Q-loss def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q = ac.q(o, a) # Bellman backup for Q function with torch.no_grad(): q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2)) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q = ((q - backup)**2).mean() # Useful info for logging loss_info = dict(QVals=q.detach().numpy()) return loss_q, loss_info # Set up function for computing DDPG pi loss def compute_loss_pi(data): o = data['obs'] q_pi = ac.q(o, ac.pi(o)) return -q_pi.mean() # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(ac.q.parameters(), lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q. q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. for p in ac.q.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-network so you can optimize it at next DDPG step. for p in ac.q.parameters(): p.requires_grad = True # Record things logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32)) a = a * (noise_scale * np.random.randn(act_dim) + 1 ) # Added by the author return (a / np.sum(a)) # Added by the author # a += noise_scale * np.random.randn(act_dim) Original OpenAI SpinningUp entry # return np.clip(a, -act_limit, act_limit) Original OpenAI SpinningUp entry def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for _ in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
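# ---------------------------------------------------------------------------
# Illustrative sketch of the author's modified exploration rule in get_action
# above for the ALMEnv allocation task: noise is applied multiplicatively and
# the action is renormalised so its entries sum to one (portfolio-style
# weights), replacing the original additive-noise-plus-clip scheme. The helper
# name is hypothetical and assumes the policy outputs nonnegative weights.
import numpy as np

def noisy_allocation(a, noise_scale):
    """Perturb an allocation vector a and renormalise it to sum to one."""
    a = a * (noise_scale * np.random.randn(a.shape[0]) + 1)  # multiplicative perturbation
    return a / np.sum(a)                                      # keep it a valid allocation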
def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) #=========================================================================# # # # All of your code goes in the space below. # # # #=========================================================================# # Main outputs from computation graph with tf.variable_scope('main'): ####################### # # # YOUR CODE HERE # # # ####################### # pi, q1, q2, q1_pi = pass # Target policy network with tf.variable_scope('target'): ####################### # # # YOUR CODE HERE # # # ####################### # pi_targ = pass # Target Q networks with tf.variable_scope('target', reuse=True): # Target policy smoothing, by adding clipped noise to target actions ####################### # # # YOUR CODE HERE # # # ####################### # Target Q-values, using action from smoothed target policy ####################### # # # YOUR CODE HERE # # # ####################### pass # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Bellman backup for Q functions, using Clipped Double-Q targets ####################### # # # YOUR CODE HERE # # # ####################### # TD3 losses ####################### # # # YOUR CODE HERE # # # ####################### # pi_loss = # q1_loss = # q2_loss = # q_loss = #=========================================================================# # # # All of your code goes in the space above. 
# # # #=========================================================================# # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, train_q_op] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
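# ---------------------------------------------------------------------------
# Hedged sketch of the two pieces the exercise above leaves blank: target
# policy smoothing and the clipped double-Q Bellman target. These helper
# functions are hypothetical (the exercise expects the logic inline), but the
# formulas match the complete TD3 variant later in this file.
import tensorflow as tf

def smoothed_target_action(pi_targ, target_noise, noise_clip, act_limit):
    """Add clipped Gaussian noise to the target action, then clip to the bound."""
    epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
    epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
    return tf.clip_by_value(pi_targ + epsilon, -act_limit, act_limit)

def clipped_double_q_backup(r_ph, d_ph, q1_targ, q2_targ, gamma):
    """Bellman target using the smaller of the two target Q estimates."""
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    return tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)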
def sac_adapt_fast( env_fn, hidden_sizes=[256, 256], seed=0, steps_per_epoch=1000, epochs=1000, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=3e-4, alpha=0.2, batch_size=256, start_steps=10000, max_ep_len=1000, save_freq=1, save_model=False, auto_alpha=True, grad_clip=-1, logger_store_freq=100, logger_kwargs=dict(), ): """ Largely following OpenAI documentation, but a bit different Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. hidden_sizes: number of entries is number of hidden layers each entry in this list indicate the size of that hidden layer. applies to all networks seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. Note the epoch here is just logging epoch so every this many steps a logging to stdouot and also output file will happen note: not to be confused with training epoch which is a term used often in literature for all kinds of different things epochs (int): Number of epochs to run and train agent. Usage of this term can be different in different algorithms, use caution. Here every epoch you get new logs replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. However during testing the action always come from policy max_ep_len (int): Maximum length of trajectory / episode / rollout. Environment will get reseted if timestep in an episode excedding this number save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. logger_kwargs (dict): Keyword args for EpochLogger. save_model (bool): set to True if want to save the trained agent auto_alpha: set to True to use the adaptive alpha scheme, target entropy will be set automatically grad_clip: whether to use gradient clipping. < 0 means no clipping logger_store_freq: how many steps to log debugging info, typically don't need to change """ """set up logger""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) env, test_env = env_fn(), env_fn() ## seed torch and numpy torch.manual_seed(seed) np.random.seed(seed) ## seed environment along with env action space so that everything about env is seeded env.seed(seed) env.action_space.np_random.seed(seed) test_env.seed(seed + 10000) test_env.action_space.np_random.seed(seed + 10000) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # if environment has a smaller max episode length, then use the environment's max episode length max_ep_len = env._max_episode_steps if max_ep_len > env._max_episode_steps else max_ep_len # Action limit for clamping: critically, assumes all dimensions share the same bound! 
# we need .item() to convert it from numpy float to python float act_limit = env.action_space.high[0].item() # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) """ Auto tuning alpha """ if auto_alpha: target_entropy = -np.prod(env.action_space.shape).item() # H log_alpha = torch.zeros(1, requires_grad=True) alpha_optim = optim.Adam([log_alpha], lr=lr) else: target_entropy, log_alpha, alpha_optim = None, None, None def test_agent(n=1): """ This will test the agent's performance by running n episodes During the runs, the agent only take deterministic action, so the actions are not drawn from a distribution, but just use the mean :param n: number of episodes to run the agent """ ep_return_list = np.zeros(n) for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a = policy_net.get_env_action(o, deterministic=True) o, r, d, _ = test_env.step(a) ep_ret += r ep_len += 1 ep_return_list[j] = ep_ret logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs """init all networks""" # see line 1 policy_net = TanhGaussianPolicySACAdapt(obs_dim, act_dim, hidden_sizes, action_limit=act_limit) q1_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) q2_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) q1_target_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) q2_target_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) # see line 2: copy parameters from value_net to target_value_net q1_target_net.load_state_dict(q1_net.state_dict()) q2_target_net.load_state_dict(q2_net.state_dict()) # set up optimizers policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr) q1_optimizer = optim.Adam(q1_net.parameters(), lr=lr) q2_optimizer = optim.Adam(q2_net.parameters(), lr=lr) # mean squared error loss for v and q networks mse_criterion = nn.MSELoss() # Main loop: collect experience in env and update/log each epoch # NOTE: t here is the current number of total timesteps used # it is not the number of timesteps passed in the current episode current_update_index = 0 for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = policy_net.get_env_action(o, deterministic=False) else: a = env.action_space.sample() # Step the env, get next observation, reward and done signal o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience (observation, action, reward, next observation, done) to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 """perform update""" if replay_buffer.size >= batch_size: # get data from replay buffer batch = replay_buffer.sample_batch(batch_size) obs_tensor = Tensor(batch['obs1']) obs_next_tensor = Tensor(batch['obs2']) acts_tensor = Tensor(batch['acts']) # unsqueeze is to make sure rewards and done tensors are of the shape nx1, instead of n # to prevent problems later rews_tensor = Tensor(batch['rews']).unsqueeze(1) done_tensor = Tensor(batch['done']).unsqueeze(1) """ now we do a SAC update, following the OpenAI spinup doc check the openai sac document psudocode part for reference line nubmers indicate lines in psudocode part we will first compute each of the losses and then update all the networks in the end """ # see line 12: get a_tilda, which is newly sampled action (not action from replay buffer) """get q loss""" with torch.no_grad(): a_tilda_next, _, _, log_prob_a_tilda_next, _, _ = policy_net.forward( obs_next_tensor) q1_next = q1_target_net( torch.cat([obs_next_tensor, a_tilda_next], 1)) q2_next = q2_target_net( torch.cat([obs_next_tensor, a_tilda_next], 1)) min_next_q = torch.min(q1_next, q2_next) - alpha * log_prob_a_tilda_next y_q = rews_tensor + gamma * (1 - done_tensor) * min_next_q # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] q1_prediction = q1_net(torch.cat([obs_tensor, acts_tensor], 1)) q1_loss = mse_criterion(q1_prediction, y_q) q2_prediction = q2_net(torch.cat([obs_tensor, acts_tensor], 1)) q2_loss = mse_criterion(q2_prediction, y_q) """ get policy loss """ a_tilda, mean_a_tilda, log_std_a_tilda, log_prob_a_tilda, _, _ = policy_net.forward( obs_tensor) # see line 12: second equation q1_a_tilda = q1_net(torch.cat([obs_tensor, a_tilda], 1)) q2_a_tilda = q2_net(torch.cat([obs_tensor, a_tilda], 1)) min_q1_q2_a_tilda = torch.min(q1_a_tilda, q2_a_tilda) # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))] policy_loss = (alpha * log_prob_a_tilda - min_q1_q2_a_tilda).mean() """ alpha loss, update alpha """ if auto_alpha: alpha_loss = -( log_alpha * (log_prob_a_tilda + target_entropy).detach()).mean() alpha_optim.zero_grad() alpha_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(log_alpha, grad_clip) alpha_optim.step() alpha = log_alpha.exp().item() else: alpha_loss = 0 """update networks""" q1_optimizer.zero_grad() q1_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(q1_net.parameters(), grad_clip) q1_optimizer.step() q2_optimizer.zero_grad() q2_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(q2_net.parameters(), grad_clip) q2_optimizer.step() policy_optimizer.zero_grad() policy_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(policy_net.parameters(), grad_clip) policy_optimizer.step() # see line 16: update target value network with value network soft_update_model1_with_model2(q1_target_net, q1_net, polyak) soft_update_model1_with_model2(q2_target_net, q2_net, polyak) current_update_index += 1 if current_update_index % logger_store_freq == 0: # store diagnostic info to logger logger.store(LossPi=policy_loss.item(), LossQ1=q1_loss.item(), LossQ2=q2_loss.item(), LossAlpha=alpha_loss.item(), Q1Vals=q1_prediction.detach().numpy(), Q2Vals=q2_prediction.detach().numpy(), Alpha=alpha, LogPi=log_prob_a_tilda.detach().numpy()) if d or (ep_len == max_ep_len): """when episode terminates, log info about this episode, then reset""" ## store episode return and length to logger logger.store(EpRet=ep_ret, EpLen=ep_len) ## reset environment o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if (t + 1) % 
steps_per_epoch == 0: epoch = t // steps_per_epoch """ Save pytorch model, very different from tensorflow version We need to save the environment, the state_dict of each network and also the state_dict of each optimizer """ if save_model: sac_state_dict = { 'env': env, 'policy_net': policy_net.state_dict(), 'q1_net': q1_net.state_dict(), 'q2_net': q2_net.state_dict(), 'q1_target_net': q1_target_net.state_dict(), 'q2_target_net': q2_target_net.state_dict(), 'policy_opt': policy_optimizer, 'q1_opt': q1_optimizer, 'q2_opt': q2_optimizer, 'log_alpha': log_alpha, 'alpha_opt': alpha_optim, 'target_entropy': target_entropy } if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state(sac_state_dict, None) # use joblib.load(fname) to load # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('Alpha', with_min_and_max=True) logger.log_tabular('LossAlpha', average_only=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() sys.stdout.flush()
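# ---------------------------------------------------------------------------
# Illustrative sketch of the automatic temperature tuning used above when
# auto_alpha is enabled: alpha is adjusted so the policy's entropy tracks a
# heuristic target of -dim(A). The factory function is hypothetical; the loss
# mirrors the alpha update inside the training loop.
import numpy as np
import torch
import torch.optim as optim

def make_alpha_tuner(action_shape, lr=3e-4):
    target_entropy = -np.prod(action_shape).item()        # e.g. -act_dim for a Box action space
    log_alpha = torch.zeros(1, requires_grad=True)
    alpha_optim = optim.Adam([log_alpha], lr=lr)

    def step(log_prob_a_tilda):
        """One gradient step on log_alpha given log-probs of freshly sampled actions."""
        alpha_loss = -(log_alpha * (log_prob_a_tilda + target_entropy).detach()).mean()
        alpha_optim.zero_grad()
        alpha_loss.backward()
        alpha_optim.step()
        return log_alpha.exp().item()                      # current alpha value

    return step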
def td3(env_fn, env_fn_test, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, logdir=None, nstep=None, alpha=None, beta=None, sil_weight=None): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ assert logdir is not None if not os.path.exists(logdir): os.makedirs(logdir) sess = tf.Session() logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn_test() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) x_ph_sil, a_ph_sil, x2_ph_sil, r_ph_sil, d_ph_sil = core.placeholders( obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs) with tf.variable_scope('main', reuse=True): _, q1_sil, q2_sil, _ = actor_critic(x_ph_sil, a_ph_sil, **ac_kwargs) # Target policy network with tf.variable_scope('target'): pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) with tf.variable_scope('target', reuse=True): pi_targ_sil, _, _, _ = actor_critic(x2_ph_sil, a_ph_sil, **ac_kwargs) # Target Q networks with tf.variable_scope('target', reuse=True): # Target policy smoothing, by adding clipped noise to target actions epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise) epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = tf.clip_by_value(a2, -act_limit, act_limit) # Target Q-values, using action from target policy _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs) # Target Q networks with tf.variable_scope('target', reuse=True): # Target policy smoothing, by adding clipped noise to target actions epsilon = tf.random_normal(tf.shape(pi_targ_sil), stddev=target_noise) epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) a2 = pi_targ_sil + epsilon a2 = tf.clip_by_value(a2, -act_limit, act_limit) # Target Q-values, using action from target policy _, q1_targ_sil, q2_targ_sil, _ = actor_critic(x2_ph_sil, a2, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Prioritized replay for expert data sil_replay_buffer = prioritized_buffer.PrioritizedReplayBuffer( size=replay_size, alpha=alpha) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Bellman backup for Q functions, using Clipped Double-Q targets backup_discount = gamma min_q_targ = tf.minimum(q1_targ, q2_targ) backup = tf.stop_gradient(r_ph + backup_discount * (1 - d_ph) * min_q_targ) # TD3 losses pi_loss = -tf.reduce_mean(q1_pi) q1_loss = tf.reduce_mean((q1 - backup)**2) q2_loss = tf.reduce_mean((q2 - backup)**2) q_loss = q1_loss + q2_loss # sil q loss backup_discount_nstep = gamma**nstep min_q_targ_sil = tf.minimum(q1_targ_sil, q2_targ_sil) backup_sil = tf.stop_gradient(r_ph_sil + backup_discount_nstep * (1 - d_ph_sil) * min_q_targ_sil) # TD3 losses weights_ph = tf.placeholder(tf.float32, [None]) gains_1 = tf.nn.relu(backup_sil - q1_sil) gains_2 = tf.nn.relu(backup_sil - q2_sil) q1_loss_sil = tf.reduce_mean(weights_ph * tf.square(gains_1)) q2_loss_sil = tf.reduce_mean(weights_ph * tf.square(gains_2)) q_loss_sil = q1_loss_sil + q2_loss_sil gains = gains_1 + gains_2 # add to the q loss q_loss += sil_weight * q_loss_sil # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * 
v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess.run(tf.global_variables_initializer()) sess.run(target_init) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)}) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): # test recorder ep_ret_list = [] # set up for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0).flatten()) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) ep_ret_list.append(ep_ret) return ep_ret_list start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # record training ep_ret_record = [] time_step_record = [] # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise).flatten() else: a = env.action_space.sample() # Step the env o2, r, d, info = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d if 'nstep_data_1' in info.keys(): info['nstep_data_1'][-1] = d if 'nstep_data_{}'.format(nstep) in info.keys(): info['nstep_data_{}'.format(nstep)][-1] = d # Store experience to replay buffer if 'nstep_data_1' in info.keys(): replay_buffer.store(*info['nstep_data_1']) if nstep == 1: try: assert info['nstep_data_1'] == [o, a, r, o2, d] except: import pdb pdb.set_trace() if 'nstep_data_{}'.format(nstep) in info.keys(): sil_replay_buffer.store(*info['nstep_data_{}'.format(nstep)]) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) batch_sil, weights, batch_idxes = sil_replay_buffer.sample_batch( batch_size, beta=beta) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], x_ph_sil: batch_sil['obs1'], x2_ph_sil: batch_sil['obs2'], a_ph_sil: batch_sil['acts'], r_ph_sil: batch_sil['rews'], d_ph_sil: batch_sil['done'], weights_ph: weights } q_step_ops = [q_loss, q1, q2, train_q_op] + [gains] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) # get the priorities new_priorities = outs[-1] + 1e-8 sil_replay_buffer.update_priorities(batch_idxes, new_priorities) #print_stats('new priorities', new_priorities) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Test the performance of the deterministic version of the agent. ep_rets = test_agent() ep_ret_record.append(np.mean(ep_rets)) time_step_record.append(t) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # save the records np.save(logdir + '/ep_rets', ep_ret_record) np.save(logdir + '/timesteps', time_step_record)
def oac(env_fn, actor_critic=mlp_actor_critic, logger_kwargs=dict(), network_params=dict(), rl_params=dict()): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # control params seed = rl_params['seed'] epochs = rl_params['epochs'] steps_per_epoch = rl_params['steps_per_epoch'] replay_size = rl_params['replay_size'] batch_size = rl_params['batch_size'] start_steps = rl_params['start_steps'] max_ep_len = rl_params['max_ep_len'] save_freq = rl_params['save_freq'] render = rl_params['render'] # rl params gamma = rl_params['gamma'] polyak = rl_params['polyak'] lr = rl_params['lr'] grad_clip_val = rl_params['grad_clip_val'] # entropy params alpha = rl_params['alpha'] target_entropy = rl_params['target_entropy'] # optimistic exploration params use_opt = rl_params['use_opt'] beta_UB = rl_params['beta_UB'] beta_LB = rl_params['beta_LB'] delta = rl_params['delta'] train_env, test_env = env_fn(), env_fn() obs_dim = train_env.observation_space.shape[0] act_dim = train_env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = train_env.action_space.high[0] # set the seed tf.set_random_seed(seed) np.random.seed(seed) train_env.seed(seed) train_env.action_space.np_random.seed(seed) test_env.seed(seed) test_env.action_space.np_random.seed(seed) # Share information about action space with policy architecture network_params['action_space'] = train_env.action_space # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1_a, q2_a, pretanh_mu, std = actor_critic( x_ph, a_ph, **network_params) with tf.variable_scope('main', reuse=True): # compose q with mu _, _, _, q1_mu, q2_mu, _, _ = actor_critic(x_ph, mu, **network_params) # compose q with pi, for pi-learning _, _, _, q1_pi, q2_pi, _, _ = actor_critic(x_ph, pi, **network_params) # get actions and log probs of actions for next states, for Q-learning _, pi_next, logp_pi_next, _, _, _, _ = actor_critic( x2_ph, a_ph, **network_params) # Target value network with tf.variable_scope('target'): _, _, _, q1_pi_targ, q2_pi_targ, _, _ = actor_critic( x2_ph, pi_next, **network_params) # alpha Params if target_entropy == 'auto': target_entropy = tf.cast(-act_dim, tf.float32) else: target_entropy = tf.cast(target_entropy, tf.float32) log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) if alpha == 'auto': # auto tune alpha alpha = tf.exp(log_alpha) else: # fixed alpha alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha) # Count variables var_counts = tuple( count_vars(scope) for scope in ['log_alpha', 'main/pi', 'main/q1', 'main/q2', 'main']) print("""\nNumber of parameters: alpha: %d, pi: %d, q1: %d, q2: %d, total: %d\n""" % var_counts) if use_opt: # Optimistic Exploration mu_Q = (q1_mu + q2_mu) / 2.0 sigma_Q = tf.math.abs(q1_mu - q2_mu) / 2.0 Q_UB = mu_Q + beta_UB * sigma_Q Q_LB = mu_Q + beta_LB * sigma_Q grad_Q_UB = tf.gradients(Q_UB, pretanh_mu)[0] Sigma = tf.math.pow(std, 2) denom = tf.math.sqrt( tf.math.reduce_sum( tf.math.multiply(tf.math.pow(grad_Q_UB, 2), Sigma))) + 10e-6 mu_C = np.sqrt(2.0 * delta) * tf.math.multiply(Sigma, grad_Q_UB) / denom mu_E = pretanh_mu + mu_C optimistic_pi = tf.tanh(mu_E + tf.random_normal(tf.shape(mu_E)) * std) optimistic_pi *= act_limit else: optimistic_pi = pi # use 
standard SAC policy Q_LB = tf.minimum(q1_pi, q2_pi) # Min Double-Q: min_q_pi_targ = tf.minimum(q1_pi_targ, q2_pi_targ) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * (min_q_pi_targ - alpha * logp_pi_next)) # critic losses q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2) value_loss = q1_loss + q2_loss # Soft actor losses pi_loss = tf.reduce_mean(alpha * logp_pi - Q_LB) # alpha loss for temperature parameter alpha_backup = tf.stop_gradient(logp_pi + target_entropy) alpha_loss = -tf.reduce_mean(log_alpha * alpha_backup) # Policy train op # (has to be separate from value train op, because q1_logits appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) if grad_clip_val is not None: gvs = pi_optimizer.compute_gradients(pi_loss, var_list=get_vars('main/pi')) capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_pi_op = pi_optimizer.apply_gradients(capped_gvs) else: train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_pi_op]): if grad_clip_val is not None: gvs = value_optimizer.compute_gradients( value_loss, var_list=get_vars('main/q')) capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_value_op = value_optimizer.apply_gradients(capped_gvs) else: train_value_op = value_optimizer.minimize( value_loss, var_list=get_vars('main/q')) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_value_op]): train_alpha_op = alpha_optimizer.minimize( alpha_loss, var_list=get_vars('log_alpha')) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, q1_a, q2_a, logp_pi, target_entropy, alpha_loss, alpha, train_pi_op, train_value_op, train_alpha_op, target_update ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session(config=tf_config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x_ph': x_ph, 'a_ph': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1_a': q1_a, 'q2_a': q2_a }) def get_action(o, deterministic=False): act_op = mu if deterministic else optimistic_pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0] def test_agent(n=10, render=True): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 if render: test_env.render() while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 if render: test_env.render() logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) if render: test_env.close() start_time = time.time() o, r, d, ep_ret, ep_len = train_env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main 
loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = train_env.action_space.sample() # Step the env o2, r, d, _ = train_env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5], TargEntropy=outs[6], LossAlpha=outs[7], Alpha=outs[8]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = train_env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': train_env}, None) # Test the performance of the deterministic version of the agent. test_agent(n=4, render=render) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', average_only=True) logger.log_tabular('TargEntropy', average_only=True) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossAlpha', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() plot_progress(os.path.join(logger_kwargs['output_dir'], 'progress.txt'), show_plot=False)
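# For reference, the optimistic-exploration shift built in the TF graph above can be written
# out in plain numpy. This only illustrates the formula
# mu_E = mu + sqrt(2*delta) * Sigma * grad(Q_UB) / ||grad(Q_UB)||_Sigma; in the real code
# grad_Q_UB comes from tf.gradients, and eps matches the 10e-6 stabilizer used above.
import numpy as np

def optimistic_action_sketch(pretanh_mu, std, grad_q_ub, delta, act_limit, eps=1e-5):
    """Shift the pre-tanh mean toward higher upper-confidence Q, then squash and scale."""
    sigma = std ** 2                                          # diagonal covariance
    denom = np.sqrt(np.sum((grad_q_ub ** 2) * sigma)) + eps
    mu_c = np.sqrt(2.0 * delta) * sigma * grad_q_ub / denom   # exploration shift
    mu_e = pretanh_mu + mu_c
    return act_limit * np.tanh(mu_e + np.random.randn(*np.shape(mu_e)) * std)

# e.g. optimistic_action_sketch(np.zeros(2), 0.3 * np.ones(2), np.array([0.5, -0.2]),
#                               delta=2.0, act_limit=1.0)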
def shpo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, device='cpu', steps_per_epoch=4000, epochs=50, replay_size=1000000, gamma=0.99, polyak=0.005, polyak_pi=0.0, lr=1e-3, batch_size=100, expand_batch=100, start_steps=10000, update_after=10000, num_test_episodes=10, per_update_steps_for_actor=100, per_update_steps_for_critic=50, cg_iters=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, algo='shpo'): """ Sinkhorn Policy Optimization (SHPO) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) polyak_pi (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). batch_size (int): Minibatch size for Critic. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" # ====== All About Init =============================================================== device = torch.device(device) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) env, test_env = env_fn(), env_fn() torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) np.random.seed(seed) env.seed(seed) test_env.seed(seed) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] print("obs_dim = {}, act_dim = {}".format(obs_dim, act_dim)) # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) q_optimizer = Adam(q_params, lr=lr) pi_optimizer = Adam(ac.pi.parameters(), lr=lr) # Experience buffer replay_buffer = core.ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # ===== End Of Init ========================================================================= # ===== Critic Loss ========================================================================= def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] o = torch.FloatTensor(o).to(device) a = torch.FloatTensor(a).to(device) r = torch.FloatTensor(r).to(device) o2 = torch.FloatTensor(o2).to(device) d = torch.FloatTensor(d).to(device) q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2 = ac_targ.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info def update_critic(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) for p, p_targ in zip(ac.pi.parameters(), ac_targ.pi.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak_pi) p_targ.data.add_((1 - polyak_pi) * p.data) # ===== End Of Critic Loss ============================================================================ # ===== Update Actor ================================================================================== def compute_loss_pi(data): o = data['obs'] o = torch.FloatTensor(o).to(device) o2 = o.repeat(expand_batch, 1) a2 = ac.pi(o2) q1_pi = ac.q1(o2, a2) q2_pi = ac.q2(o2, a2) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = -q_pi.mean() return loss_pi def update_actor(data): for p in q_params: p.requires_grad = False pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() logger.store(LossPi=loss_pi.item()) """ # ??? I am not sure: Do I need zero_grad()? loss_pi = compute_loss_pi(data) grads = torch.autograd.grad(loss_pi, ac.pi.parameters()) grads_vector = torch.cat([grad.view(-1) for grad in grads]).data def get_Hx(x): # Require New Method. invHg = core.cg(get_Hx, loss_grad, cg_iters) # fullstep = ??? with torch.no_grad(): prev_params = core.get_flat_params_from(ac.pi) # new_params = ??? # core.set_flat_params_to(ac.pi, new_params) """ for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item()) # ===== End Of Actor ================================================================================== # ===== Start Training ================================================================================ def get_action(o, deterministic=False): # o = replay_buffer.obs_encoder(o) o = torch.FloatTensor(o.reshape(1, -1)).to(device) a = ac.act(o, deterministic) return a def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t <= start_steps: a = env.action_space.sample() else: a = get_action(o, deterministic=False) # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) ac.obs_std = torch.FloatTensor(replay_buffer.obs_std).to(device) ac.obs_mean = torch.FloatTensor(replay_buffer.obs_mean).to(device) ac_targ.obs_std = ac.obs_std ac_targ.obs_mean = ac.obs_mean # Super critical, easy to overlook step: make sure to update # most recent observation!
o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and (t + 1) % steps_per_epoch == 0: for j in range(per_update_steps_for_critic): data = replay_buffer.sample_batch(batch_size) update_critic(data) for j in range(per_update_steps_for_actor): data = replay_buffer.sample_recently(steps_per_epoch) update_actor(data) # End of epoch handling epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Normalize State print("obs_mean=" + str(replay_buffer.obs_mean)) print("obs_std=" + str(replay_buffer.obs_std))
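# The SHPO loop above assumes the replay buffer exposes obs_mean / obs_std, which are copied
# into the actor-critic for observation normalization. The actual core.ReplayBuffer is not
# shown in this excerpt; a minimal Welford-style sketch of how such running statistics could
# be maintained (the real buffer may compute them differently, e.g. over the stored array):
import numpy as np

class RunningObsStatsSketch:
    """Per-dimension running mean/std of observations pushed into a buffer."""
    def __init__(self, obs_dim, eps=1e-6):
        self.n = 0
        self.mean = np.zeros(obs_dim)
        self.m2 = np.zeros(obs_dim)
        self.eps = eps

    def update(self, obs):
        self.n += 1
        delta = obs - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (obs - self.mean)

    @property
    def std(self):
        return np.sqrt(self.m2 / max(self.n - 1, 1)) + self.eps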
def ddpg_dropout(env_fn, ac_kwargs=dict(), seed=0, new_mlp=True, dropout_rate=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) hidden_sizes = list(ac_kwargs['hidden_sizes']) actor_hidden_activation = tf.keras.activations.relu actor_output_activation = tf.keras.activations.tanh critic_hidden_activation = tf.keras.activations.relu critic_output_activation = tf.keras.activations.linear # Main outputs from computation graph with tf.variable_scope('main'): if not new_mlp: actor = BeroulliDropoutMLP( layer_sizes=hidden_sizes + [act_dim], dropout_rate=dropout_rate, hidden_activation=actor_hidden_activation, output_activation=actor_output_activation) critic = BeroulliDropoutMLP( layer_sizes=hidden_sizes + [1], dropout_rate=dropout_rate, hidden_activation=critic_hidden_activation, output_activation=critic_output_activation) else: actor = MLP(layer_sizes=hidden_sizes + [act_dim], dropout_rate=dropout_rate, hidden_activation=actor_hidden_activation, output_activation=actor_output_activation) critic = MLP(layer_sizes=hidden_sizes + [1], dropout_rate=dropout_rate, hidden_activation=critic_hidden_activation, output_activation=critic_output_activation) # Set training=False to ignore dropout masks pi = act_limit * actor(x_ph, training=False) q = tf.squeeze(critic(tf.concat([x_ph, a_ph], axis=-1), training=False), axis=1) q_pi = tf.squeeze(critic(tf.concat([x_ph, pi], axis=-1), training=False), axis=1) # Set traininig=True to mask input for each hidden layer and output layer pi_drop = act_limit * actor(x_ph, training=True) q_drop = tf.squeeze(critic(tf.concat([x_ph, a_ph], axis=-1), training=True), axis=1) # 1. q_pi_drop = tf.squeeze(critic(tf.concat([x_ph, pi], axis=-1), training=True), axis=1) # 2. q_pi_drop = tf.squeeze(critic(tf.concat([x_ph, pi_drop], axis=-1), training=True), axis=1) # q_drop = tf.squeeze(critic(tf.concat([x_ph, a_ph], axis=-1), training=False), axis=1) # q_pi_drop = tf.squeeze(critic(tf.concat([x_ph, pi], axis=-1), training=False), axis=1) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). 
if not new_mlp: actor_targ = BeroulliDropoutMLP( layer_sizes=hidden_sizes + [act_dim], dropout_rate=dropout_rate, hidden_activation=actor_hidden_activation, output_activation=actor_output_activation) critic_targ = BeroulliDropoutMLP( layer_sizes=hidden_sizes + [1], dropout_rate=dropout_rate, hidden_activation=critic_hidden_activation, output_activation=critic_output_activation) else: actor_targ = MLP(layer_sizes=hidden_sizes + [act_dim], dropout_rate=dropout_rate, hidden_activation=actor_hidden_activation, output_activation=actor_output_activation) critic_targ = MLP(layer_sizes=hidden_sizes + [1], dropout_rate=dropout_rate, hidden_activation=critic_hidden_activation, output_activation=critic_output_activation) # Set training=False to ignore dropout for backup target value pi_targ = act_limit * actor_targ(x2_ph, training=False) q_pi_targ = tf.squeeze(critic_targ(tf.concat([x2_ph, pi_targ], axis=-1), training=False), axis=1) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses # # 1. # pi_loss = -tf.reduce_mean(q_pi) # q_loss = tf.reduce_mean((q-backup)**2) # # 2. # pi_loss = -tf.reduce_mean(q_pi_drop) # q_loss = tf.reduce_mean((q_drop - backup) ** 2) # 3. pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q_drop - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=actor.variables) train_q_op = q_optimizer.minimize(q_loss, var_list=critic.variables) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(actor.variables + critic.variables, actor_targ.variables + critic_targ.variables) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(actor.variables + critic.variables, actor_targ.variables + critic_targ.variables) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # # Setup model saving # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q}) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). 
""" if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) state_noise_scale = 0.01 o2 += state_noise_scale * np.random.randn(obs_dim) # import pdb; pdb.set_trace() ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. """ for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def ppo(workload_file, model_path, ac_kwargs=dict(), seed=0, traj_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, pre_trained=0, trained_model=None, attn=False, shuffle=False, backfil=False, skip=False, score_type=0, batch_job_slice=0, sched_algo=4): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env = HPCEnvSkip(shuffle=shuffle, backfil=backfil, skip=skip, job_score_type=score_type, batch_job_slice=batch_job_slice, build_sjf=False, sched_algo=sched_algo) env.seed(seed) env.my_init(workload_file=workload_file, sched_file=model_path) obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space ac_kwargs['attn'] = attn # Inputs to computation graph buf = PPOBuffer(obs_dim, act_dim, traj_per_epoch * JOB_SEQUENCE_SIZE, gamma, lam) if pre_trained: sess = tf.Session() model = restore_tf_graph(sess, trained_model) logger.log('load pre-trained model') # Count variables var_counts = tuple(count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) x_ph = model['x'] a_ph = model['a'] mask_ph = model['mask'] adv_ph = model['adv'] ret_ph = model['ret'] logp_old_ph = model['logp_old_ph'] pi = model['pi'] v = model['v'] # logits = model['logits'] out = model['out'] logp = model['logp'] logp_pi = model['logp_pi'] pi_loss = model['pi_loss'] v_loss = model['v_loss'] approx_ent = model['approx_ent'] approx_kl = model['approx_kl'] clipfrac = model['clipfrac'] clipped = model['clipped'] # Optimizers # graph = tf.get_default_graph() # op = sess.graph.get_operations() # [print(m.values()) for m in op] # train_pi = graph.get_tensor_by_name('pi/conv2d/kernel/Adam:0') # train_v = graph.get_tensor_by_name('v/conv2d/kernel/Adam:0') train_pi = tf.get_collection("train_pi")[0] train_v = tf.get_collection("train_v")[0] # train_pi_optimizer = MpiAdamOptimizer(learning_rate=pi_lr, name='AdamLoad') # train_pi = train_pi_optimizer.minimize(pi_loss) # train_v_optimizer = MpiAdamOptimizer(learning_rate=vf_lr, name='AdamLoad') # train_v = train_v_optimizer.minimize(v_loss) # sess.run(tf.variables_initializer(train_pi_optimizer.variables())) # sess.run(tf.variables_initializer(train_v_optimizer.variables())) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, out] else: x_ph, a_ph = placeholders_from_spaces(env.observation_space, env.action_space) # y_ph = placeholder(JOB_SEQUENCE_SIZE*3) # 3 is the number of sequence features mask_ph = placeholder(env.action_space.n) adv_ph, ret_ph, logp_old_ph = placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v, out = actor_critic(x_ph, a_ph, mask_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, out] # Experience buffer # Count variables var_counts = tuple(count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # 
pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = tf.train.AdamOptimizer( learning_rate=pi_lr).minimize(pi_loss) train_v = tf.train.AdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) tf.add_to_collection("train_pi", train_pi) tf.add_to_collection("train_v", train_v) # Setup model saving # logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'action_probs': action_probs, 'log_picked_action_prob': log_picked_action_prob, 'v': v}) logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph, 'adv': adv_ph, 'mask': mask_ph, 'ret': ret_ph, 'logp_old_ph': logp_old_ph }, outputs={ 'pi': pi, 'v': v, 'out': out, 'pi_loss': pi_loss, 'logp': logp, 'logp_pi': logp_pi, 'v_loss': v_loss, 'approx_ent': approx_ent, 'approx_kl': approx_kl, 'clipped': clipped, 'clipfrac': clipfrac }) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = env.reset( ), 0, False, 0, 0, 0, 0, 0, 0 # Main loop: collect experience in env and update/log each epoch start_time = time.time() for epoch in range(epochs): t = 0 discard_times = 0 while True: # [no_skip, skip] lst = [1, 1] #for i in range(0, MAX_QUEUE_SIZE * JOB_FEATURES, JOB_FEATURES): # job = o[i:i + JOB_FEATURES] # # the skip time of will_skip job exceeds MAX_SKIP_TIME # if job[-3] == 1.0: # lst = [1,0] a, v_t, logp_t, output = sess.run(get_action_ops, feed_dict={ x_ph: o.reshape(1, -1), mask_ph: np.array(lst).reshape(1, -1) }) # print(a, end=" ") ''' action = np.random.choice(np.arange(MAX_QUEUE_SIZE), p=action_probs) log_action_prob = np.log(action_probs[action]) ''' # save and log if buf.ptr - buf.path_start_idx >= 10 * JOB_SEQUENCE_SIZE: discard_times += 1 buf.ptr = buf.path_start_idx [ o, co ], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = env.reset( ), 0, False, 0, 0, 0, 0, 0, 0 continue buf.store(o, None, a, np.array(lst), r, v_t, logp_t) logger.store(VVals=v_t) if a[0] == 1: skip_count += 1 o, r, d, r2, sjf_t, f1_t = env.step(a[0]) ep_ret += r ep_len += 1 show_ret += r2 sjf += sjf_t f1 += f1_t if d: t += 1 buf.finish_path(r) logger.store(EpRet=ep_ret, EpLen=ep_len, ShowRet=show_ret, SJF=sjf, F1=f1, SkipRatio=skip_count / ep_len) [ o, co ], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = env.reset( 
), 0, False, 0, 0, 0, 0, 0, 0 if t >= traj_per_epoch: # print ("state:", state, "\nlast action in a traj: action_probs:\n", action_probs, "\naction:", action) break # print("Sample time:", (time.time()-start_time)/num_total, num_total) # Save model print(discard_times) if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! # start_time = time.time() update() # print("Train time:", time.time()-start_time) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * traj_per_epoch * JOB_SEQUENCE_SIZE) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('ShowRet', average_only=True) logger.log_tabular('SJF', average_only=True) logger.log_tabular('F1', average_only=True) logger.log_tabular('SkipRatio', with_min_and_max=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
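# The clipped surrogate objective and its diagnostics (approx_kl, clipfrac) built in the TF
# graph above can be stated compactly in numpy; this is just the textbook PPO-clip formula,
# independent of the HPC scheduling environment:
import numpy as np

def ppo_clip_loss_sketch(logp, logp_old, adv, clip_ratio=0.2):
    """Return (policy loss, sample KL estimate, fraction of clipped samples)."""
    ratio = np.exp(logp - logp_old)
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    pi_loss = -np.mean(np.minimum(ratio * adv, min_adv))
    approx_kl = np.mean(logp_old - logp)
    clipfrac = np.mean((ratio > 1 + clip_ratio) | (ratio < 1 - clip_ratio))
    return pi_loss, approx_kl, clipfrac

# e.g. ppo_clip_loss_sketch(np.array([-1.0, -0.5]), np.array([-1.1, -0.4]),
#                           np.array([0.3, -0.2]))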
def sac1_carla(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=3000, epochs=100, replay_size=int(3e5), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=9000, max_ep_len=600, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for policy/value/alpha learning). alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_space = env.observation_space.spaces[0] act_space = env.action_space obs_dim = obs_space.shape act_dim = act_space.shape # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders_from_space(obs_space, act_space, obs_space, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic(False, x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, logp_pi_, _, _,q1_pi_, q2_pi_= actor_critic(False, x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=list(obs_dim), act_dim=list(act_dim), size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/cnn_layer', 'main/pi', 'main/q1', 'main/q2', 'main']) print(('\nNumber of parameters: \t cnn_layer: %d, \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t total: %d\n')%var_counts) ###### if alpha == 'auto': target_entropy = (-np.prod(env.action_space.shape)) log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) alpha = tf.exp(log_alpha) alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy)) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # Min Double-Q: min_q_pi = tf.minimum(q1_pi_, q2_pi_) # Targets for Q and V regression v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) q_backup = r_ph + gamma*(1-d_ph)*v_backup # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - tf.stop_gradient(q1_pi)) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) value_loss = q1_loss + q2_loss cnn_params = get_vars('main/cnn_layer') # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) pi_params = get_vars('main/pi') with tf.control_dependencies(update_ops): train_pi_op = pi_optimizer.minimize(pi_loss, var_list = cnn_params + pi_params) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') with tf.control_dependencies(update_ops): with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list = cnn_params + value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies(update_ops): with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step with tf.control_dependencies(update_ops): if isinstance(alpha, Number): step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha), train_pi_op, train_value_op, target_update] else: step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update, train_alpha_op] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model 
saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o[np.newaxis,...]})[0] def test_agent(n=1): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: if np.random.randn() > 0.1: b = (1 + np.random.random(1)) * 0.5 else: b = -1 + 2 * np.random.random(1) #b = np.array([1]) #c = np.array([0]) c = -1 + 2*np.random.random(1) a = np.stack((b, c)) a = a.flatten() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of episode. Training (ep_len times). if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) is_train = tf.placeholder(tf.bool, name="is_train") feed_dict = {} feed_dict['is_train'] = True feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update] outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5], Alpha=outs[6]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
# test_agent() # logger.log_tabular('TestEpLen', average_only=True) # logger.log_tabular('TestEpRet', with_min_and_max=True) # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) # logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) # logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
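# The function above tunes the entropy coefficient automatically when alpha == 'auto'
# by minimizing -log_alpha * stop_gradient(logp_pi + target_entropy). Below is a
# minimal PyTorch sketch of that temperature update in isolation; names such as
# update_alpha / alpha_opt are illustrative only and are not part of the graph above.
import torch

target_entropy = -2.0                          # e.g. -act_dim for a 2-D action space
log_alpha = torch.zeros(1, requires_grad=True)
alpha_opt = torch.optim.Adam([log_alpha], lr=1e-4)

def update_alpha(logp_pi):
    """logp_pi: log-probs of sampled actions, shape (batch,); treated as constants."""
    alpha_loss = -(log_alpha * (logp_pi + target_entropy).detach()).mean()
    alpha_opt.zero_grad()
    alpha_loss.backward()
    alpha_opt.step()
    return log_alpha.exp().item()              # current alpha to plug into the SAC losses

# Example: the entropy estimate (-logp = 1.0) is above the target (-2.0), so alpha shrinks.
print(update_alpha(torch.full((64,), -1.0)))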
def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): # Special function to avoid certain slowdowns from PyTorch + MPI combo. # setup_pytorch_for_mpi() # Setup logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random Seed seed += 10000 * proc_id torch.manual_seed(seed) np.random.seed(seed) # Instantiate Environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create Actor-Critic Module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params() # Count the number of variables var_counts = tuple(core.count_variables(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experiment buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for calculating VPG policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy Loss pi, logp = ac.pi(obs, act) loss_pi = -(logp * adv).mean() # Useful extra information approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() pi_info = dict(kl=approx_kl, ent=ent) return loss_pi, pi_info # Set up a function for calculating Value Function loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret) ** 2).mean() # Set up optimizers for policy and value functions pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_save(ac) def update(): data = buf.get() # Get loss and info values before update pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with a single step of gradient descent pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) vf_optimizer.step() # Log changes from the update kl, ent = pi_info['kl'], pi_info_old['ent'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with the environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main Loop: Collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical !) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not terminal: print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len, flush=True) if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform vpg update update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
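# vpg() above relies on VPGBuffer.finish_path() to turn a finished (or cut-off)
# trajectory into GAE-Lambda advantages and rewards-to-go. A hedged NumPy/SciPy
# sketch of that computation, assuming the usual discounted-cumsum formulation;
# the buffer's actual internals may differ.
import numpy as np
import scipy.signal

def discount_cumsum(x, discount):
    # [x0 + d*x1 + d^2*x2, x1 + d*x2, x2] for input [x0, x1, x2]
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

def gae_advantages(rews, vals, last_val, gamma=0.99, lam=0.97):
    """rews, vals: per-step arrays for one trajectory; last_val bootstraps a cut-off path."""
    rews = np.append(rews, last_val)
    vals = np.append(vals, last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]   # one-step TD residuals
    adv = discount_cumsum(deltas, gamma * lam)          # GAE-Lambda advantages
    ret = discount_cumsum(rews, gamma)[:-1]             # rewards-to-go, targets for V
    return adv, ret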
def sac1(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-4, alpha=0.2, batch_size=150, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for policy/value/alpha learning). alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information with policy architecture ac_kwargs['action_space'] = env.action_space ac_kwargs['obs_dim'] = obs_dim h_size = ac_kwargs["h_size"] # hidden size of rnn seq_length = ac_kwargs["seq"] # seq length of rnn # Inputs to computation graph seq = None # training and testing doesn't has to have the same seq length x_ph, a_ph, r_ph, d_ph = core.placeholders([seq, obs_dim], [seq, act_dim], [seq, 1], [seq, 1]) s_t_0 = tf.placeholder(shape=[None, h_size], name="pre_state", dtype="float32") # zero state # s_0 = np.zeros([batch_size, h_size]) # zero state for training N H # Main outputs from computation graph # outputs, states = cudnn_rnn_cell(x_ph, s_t_0, h_size=ac_kwargs["h_size"]) outputs, states = rnn_cell(x_ph, s_t_0, h_size=ac_kwargs["h_size"]) # states = outputs[:, -1, :] # outputs = mlp(outputs, [ac_kwargs["h_size"], ac_kwargs["h_size"]], activation=tf.nn.elu) # if use model predict next state (obs) with tf.variable_scope("model"): """hidden size for mlp h_size for RNN """ s_predict = mlp(tf.concat([outputs, a_ph], axis=-1), list(ac_kwargs["hidden_sizes"]) + [ac_kwargs["h_size"]], activation=tf.nn.relu) # s_predict = mlp(tf.concat([outputs, a_ph], axis=-1), # list(ac_kwargs["hidden_sizes"]) + [ac_kwargs["obs_dim"] - act_dim], activation=tf.nn.elu) with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, a_ph, s_t_0, outputs, states, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, q1_pi_, q2_pi_ = actor_critic(x_ph, a_ph, s_t_0, outputs, states, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, h_size=h_size, seq_length=seq_length, flag="seq", normalize=ac_kwargs["norm"]) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', "model"]) print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t model: %d \n' % var_counts) if alpha == 'auto': # target_entropy = (-np.prod(env.action_space.shape)) target_entropy = -np.prod(env.action_space.shape) # log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) # print(ac_kwargs["h0"]) log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=ac_kwargs["h0"]) alpha = tf.exp(log_alpha) alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi[:, :-1, :] + target_entropy)) # Use smaller learning rate to make alpha decay slower alpha_optimizer = tf.train.AdamOptimizer(learning_rate=1e-5, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) # model train op # we can't use s_T to predict s_T+1 # delta_x = tf.stop_gradient(x_ph[:, 1:, :] - x_ph[:, :-1, :]) # predict delta obs instead of obs # TODO: can we use L1 loss delta_x = tf.stop_gradient(outputs[:, 1:, :] - outputs[:, :-1, :]) # predict delta obs instead of obs model_loss = tf.abs((1 - d_ph[:, :-1, :]) * (s_predict[:, :-1, :] - delta_x)) # how about "done" state model_optimizer = tf.train.AdamOptimizer(learning_rate=lr) # print(tf.global_variables()) if "m" in ac_kwargs["opt"]: value_params_1 = get_vars('model') + get_vars('rnn') else: value_params_1 = get_vars('model') # opt for optimize model train_model_op = model_optimizer.minimize(tf.reduce_mean(model_loss), var_list=value_params_1) # Targets for Q and V regression v_backup = tf.stop_gradient(tf.minimum(q1_pi_, q2_pi_) - alpha * logp_pi) # clip curiosity in_r = 
tf.stop_gradient(tf.reduce_mean(tf.clip_by_value(model_loss, 0, 64), axis=-1, keepdims=True)) beta = tf.placeholder(dtype=tf.float32, shape=(), name="beta") # beta = ac_kwargs["beta"] # adjust internal reward # can we prove the optimal value of beta # I think beta should decrease with training going on # beta = alpha # adjust internal reward q_backup = r_ph[:, :-1, :] + beta * in_r + gamma * (1 - d_ph[:, :-1, :]) * v_backup[:, 1:, :] # Soft actor-critic losses # pi_loss = tf.reduce_mean(alpha * logp_pi[:, :-1, :] - q1_pi[:, :-1, :]) pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) # in some case, the last timestep Q function is super important so maybe we can use weight sum of loss # calculate last timestep separately for convince q1_loss = 0.5 * tf.reduce_mean((q1[:, :-1, :] - q_backup) ** 2) q2_loss = 0.5 * tf.reduce_mean((q2[:, :-1, :] - q_backup) ** 2) value_loss = q1_loss + q2_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) # train model first pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) with tf.control_dependencies([train_model_op]): train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) # TODO: maybe we should add parameters in main/rnn to optimizer ---> training is super slow while we adding it # TODO: if use model maybe we shouldn't opt rnn with q??? value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) if "q" in ac_kwargs["opt"]: value_params = get_vars('main/q') + get_vars('rnn') else: value_params = get_vars('main/q') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in non_deterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step if isinstance(alpha, Number): step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha), model_loss, train_model_op, train_pi_op, train_value_op, target_update] else: step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, model_loss, train_model_op, train_pi_op, train_value_op, target_update, train_alpha_op] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2}) def get_action(o, s_t_0_, mu, pi, states, deterministic=False): """s_t_0_ starting step for testing 1 H""" act_op = mu if deterministic else pi action, s_t_1_ = sess.run([act_op, states], feed_dict={x_ph: o.reshape(1, 1, obs_dim), a_ph: np.zeros([1, 1, act_dim]), s_t_0: s_t_0_}) return action.reshape(act_dim), s_t_1_ def test_agent(mu, pi, states, n=5): # global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 s_0 = np.zeros([1, h_size]) while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a, s_1 = get_action(o, s_0, mu, pi, states, deterministic=True) s_0 = s_1 o, r, d, _ = test_env.step(a) # 
test_env.render() ep_ret += r ep_len += 1 # replay_buffer.store(o.reshape([1, obs_dim]), a.reshape([1, act_dim]), r, d) logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() # start = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch s_t_0_ = np.zeros([1, h_size]) episode = 0 for t in range(total_steps + 1): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t == 0: start = time.time() if t > start_steps: # s_t_0_store = s_t_0_ # hidden state stored in buffer a, s_t_1_ = get_action(o, s_t_0_, mu, pi, states, deterministic=False) s_t_0_ = s_t_1_ else: # s_t_0_store = s_t_0_ # print(s_t_0_.shape) _, s_t_1_ = get_action(o, s_t_0_, mu, pi, states, deterministic=False) s_t_0_ = s_t_1_ a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) # give back o_t_1 we need store o_t_0 because that is what cause a_t_0 # print(r) # env.render() ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o.reshape([1, obs_dim]), s_t_0_.reshape([1, h_size]), a.reshape([1, act_dim]), r, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of episode. Training (ep_len times). if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ # fps = (time.time() - start)/200 # print("{} fps".format(200 / (time.time() - start))) print(ep_len) episode += 1 start = time.time() beta_ = ac_kwargs["beta"] * (1 - t / total_steps) # beta_ = ac_kwargs["beta"] * (1 / t ** 0.5) for j in range(int(ep_len)): batch = replay_buffer.sample_batch(batch_size) # maybe we can store starting state feed_dict = {x_ph: batch['obs1'], s_t_0: batch['s_t_0'], # all zero matrix for zero state in training a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], beta: beta_, } for _ in range(ac_kwargs["tm"] - 1): batch = replay_buffer.sample_batch(batch_size) # maybe we can store starting state feed_dict = {x_ph: batch['obs1'], s_t_0: batch['s_t_0'], # stored zero state for training a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], beta: beta_, } _ = sess.run(train_model_op, feed_dict) outs = sess.run(step_ops, feed_dict) # print(outs) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3].flatten(), Q2Vals=outs[4].flatten(), LogPi=outs[5].flatten(), Alpha=outs[6], beta=beta_, model_loss=outs[7].flatten()) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 s_t_0_ = np.zeros([1, h_size]) # reset s_t_0_ when one episode is finished print("one episode duration:", time.time() - start) start = time.time() # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model # if (epoch % save_freq == 0) or (epoch == epochs - 1): # logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent(mu, pi, states) # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('Episode', episode) logger.log_tabular('name', name) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('beta', average_only=True) logger.log_tabular('model_loss', with_min_and_max=True) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
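# sac1() above shapes the critic target with an intrinsic curiosity bonus: the
# clipped model prediction error (model_loss -> in_r) is added to the reward,
# weighted by a beta that is annealed linearly over training (beta_). A small
# NumPy sketch of just that shaping, with illustrative names; it is not the
# TF graph above.
import numpy as np

def intrinsic_bonus(pred_delta, true_delta, clip_max=64.0):
    """Per-step bonus: mean absolute prediction error over features, clipped."""
    err = np.abs(pred_delta - true_delta)
    return np.clip(err, 0.0, clip_max).mean(axis=-1, keepdims=True)

def annealed_beta(beta0, t, total_steps):
    """Linear decay of the intrinsic-reward weight, as in beta_ above."""
    return beta0 * (1.0 - t / total_steps)

# Shaped target reward used in q_backup: r + beta * bonus
r = 1.0
bonus = intrinsic_bonus(np.zeros((1, 8)), 0.1 * np.ones((1, 8)))
print(r + annealed_beta(0.1, t=5000, total_steps=100000) * bonus)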
def sac(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, use_gpu=False, learnable_temperature=False): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed torch.manual_seed(seed) np.random.seed(seed) # env, test_env = env_fn(), env_fn() env = env_fn() action_space = env.action_space action_space.seed(seed) act_dim = action_space.shape[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, action_space, **ac_kwargs) ac_targ = deepcopy(ac) target_entropy = -act_dim print (ac) device = None if use_gpu: device = 'cuda' ac.cuda() ac_targ.cuda() log_alpha = torch.tensor(np.log(alpha), requires_grad=True, device=next(ac.parameters()).device) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_space=env.observation_space, act_dim=act_dim, size=replay_size, device=device) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(helper.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q1 = ac.q1(o,a) q2 = ac.q2(o,a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - log_alpha.exp().detach() * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (log_alpha.exp().detach() * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().cpu().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function log_alpha_optimizer = Adam([log_alpha], lr=lr) pi_optimizer = Adam(ac.pi.parameters(), lr=lr, weight_decay=1e-6) q_optimizer = Adam(q_params, lr=lr, weight_decay=1e-6) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. 
pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() if learnable_temperature: log_alpha_optimizer.zero_grad() log_prob = pi_info["LogPi"] alpha_loss = log_alpha.exp() * ((-log_prob - target_entropy)).mean() logger.store(Alpha=log_alpha.exp(), AlphaLoss=alpha_loss) alpha_loss.backward() log_alpha_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch_ext.as_tensor(o, dtype=torch.float32, device=device), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = action_space.sample() # Step the env o2, r, d, _ = env.step(a) # env.render(mode='human') ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # if t % 100 == 0: print ("t: ", t) # End of epoch handling if (t+1) % steps_per_epoch == 0 and t >= update_after: epoch = (t+1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, itr=epoch) # Test the performance of the deterministic version of the agent. 
# test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) # logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', with_min_and_max=True) # logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) # Alpha / AlphaLoss are only stored in update() when learnable_temperature is True, so only log them in that case if learnable_temperature: logger.log_tabular('Alpha', average_only=True) logger.log_tabular('AlphaLoss', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
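# compute_loss_q() above builds the standard SAC target with clipped double-Q:
# the smaller of the two target critics is used, minus the entropy term. A
# self-contained NumPy sketch of that backup (fixed alpha here for clarity;
# the code above uses log_alpha.exp().detach()).
import numpy as np

def sac_q_target(r, d, q1_pi_targ, q2_pi_targ, logp_a2, gamma=0.99, alpha=0.2):
    q_pi_targ = np.minimum(q1_pi_targ, q2_pi_targ)        # clipped double-Q
    return r + gamma * (1.0 - d) * (q_pi_targ - alpha * logp_a2)

# Example: terminal transitions (d=1) reduce the target to the immediate reward.
print(sac_q_target(r=np.array([1.0, 1.0]), d=np.array([0.0, 1.0]),
                   q1_pi_targ=np.array([5.0, 5.0]), q2_pi_targ=np.array([4.0, 6.0]),
                   logp_a2=np.array([-1.0, -1.0])))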
#!/usr/bin/env python3
import gym
import numpy as np
from spinup.utils.logx import EpochLogger

env = gym.make('gym_lgsvl:lgsvl-v0')
epoch_logger = EpochLogger()
for i_episode in range(20):
    observation = env.reset()
    while True:
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        epoch_logger.store(Reward=reward)
        if done:
            epoch_logger.log_tabular('Reward', with_min_and_max=True)
            epoch_logger.dump_tabular()
            break
def trpo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo='trpo'): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: ============ ================ ======================================== Symbol Shape Description ============ ================ ======================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``info`` N/A | A dict of any intermediate quantities | (from calculating the policy or log | probabilities) which are needed for | analytically computing KL divergence. | (eg sufficient statistics of the | distributions) ``info_phs`` N/A | A dict of placeholders for old values | of the entries in ``info``. ``d_kl`` () | A symbol for computing the mean KL | divergence between the current policy | (``pi``) and the old policy (as | specified by the inputs to | ``info_phs``) over the batch of | states given in ``x_ph``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) ============ ================ ======================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TRPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) delta (float): KL-divergence limit for TRPO / NPG update. (Should be small for stability. Values like 0.01, 0.05.) vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. damping_coeff (float): Artifact for numerical stability, should be smallish. Adjusts Hessian-vector product calculation: .. math:: Hv \\rightarrow (\\alpha I + H)v where :math:`\\alpha` is the damping coefficient. Probably don't play with this hyperparameter. cg_iters (int): Number of iterations of conjugate gradient to perform. Increasing this will lead to a more accurate approximation to :math:`H^{-1} g`, and possibly slightly-improved performance, but at the cost of slowing things down. Also probably don't play with this hyperparameter. backtrack_iters (int): Maximum number of steps allowed in the backtracking line search. Since the line search usually doesn't backtrack, and usually only steps back once when it does, this hyperparameter doesn't often matter. backtrack_coeff (float): How far back to step during backtracking line search. (Always between 0 and 1, usually above 0.5.) lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. 
logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. algo: Either 'trpo' or 'npg': this code supports both, since they are almost the same. """ # initialize logger and save it logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # initialize seed, and set tf and np seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) # get the env function, observation dimensions, and action dimensions env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph, plus placeholders for old pdist (for KL) pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph ] + core.values_as_sorted_list(info_phs) # Every step, get: action, value, logprob, & info for pdist (for computing kl div) get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info) # Experience buffer # calculate the number of steps per epoch per process local_steps_per_epoch = int(steps_per_epoch / num_procs()) # get the info shapes info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()} # initialize the bugger buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # TRPO losses # ratio of pi / pi_old # pi loss # v loss ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) pi_loss = -tf.reduce_mean(ratio * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizer for value function train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # Symbols needed for CG solver # pi params # gradient # v_ph and hvp pi_params = core.get_vars('pi') gradient = core.flat_grad(pi_loss, pi_params) v_ph, hvp = core.hessian_vector_product(d_kl, pi_params) # check if the damping coeff is needed # if so, update hvp (damping_coeff * v_ph) if damping_coeff > 0: hvp += damping_coeff * v_ph # Symbols for getting and setting params # get pi params # set pi params get_pi_params = core.flat_concat(pi_params) set_pi_params = core.assign_params_from_flat(v_ph, pi_params) # create a tf session and initialize it's variables sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def cg(Ax, b): """ Conjugate gradient algorithm (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) """ # initialize x as 0s of shape b x = np.zeros_like(b) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. 
# make a copy of b and r as r and p r = b.copy() p = r.copy() # calculate r dot old (r dot r) r_dot_old = np.dot(r, r) # for cg_iterations for _ in range(cg_iters): # calc z as Ax(p) z = Ax(p) # calculate alpha alpha = r_dot_old / (np.dot(p, z) + EPS) # increment x x += alpha * p # decrement r r -= alpha * z # calculate r dot new (r dot r) r_dot_new = np.dot(r, r) # calculate p p = r + (r_dot_new / r_dot_old) * p # update r dot old with r dot new r_dot_old = r_dot_new return x def update(): # Prepare hessian func, gradient eval # get inputs as a dictionary, all phs and buffer inputs = {k: v for k, v in zip(all_phs, buf.get())} # calculate Hx Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x})) # get g, pi_l_old, v_l_old g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs) # get g and pi_l_old averages g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old) # Core calculations for TRPO or NPG # get x x = cg(Hx, g) # get alpha alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)) # get old paramers old_params = sess.run(get_pi_params) def set_and_eval(step): # set pi params with v_ph # old_params - alpha * x * step sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step}) # return average of d_kl and pi_loss operation return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs)) if algo == 'npg': # npg has no backtracking or hard kl constraint enforcement kl, pi_l_new = set_and_eval(step=1.) elif algo == 'trpo': # trpo augments npg with backtracking line search, hard kl # for backtrack iterations for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff**j) if kl <= delta and pi_l_new <= pi_l_old: logger.log( 'Accepting new params at step %d of line search.' % j) logger.store(BacktrackIters=j) break if j == backtrack_iters - 1: logger.log('Line search failed! Keeping old params.') logger.store(BacktrackIters=j) kl, pi_l_new = set_and_eval(step=0.) # Value function updates # for train_v_iterations for _ in range(train_v_iters): sess.run(train_vf, feed_dict=inputs) # update v_l_new with v_loss operation v_l_new = sess.run(v_loss, feed_dict=inputs) # Log changes from update logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) # Update start time start_time = time.time() # reset variables # o, r, d, ep_ret, ep_len o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): # get agent outputs agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # decontruct the above to a, v_t, logp_t, info_t a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[ 1], agent_outs[2], agent_outs[3:] # save and log buf.store(o, a, r, v_t, logp_t, info_t) logger.store(VVals=v_t) # take an action o, r, d, _ = env.step(a) # update ep rewards and length ep_ret += r ep_len += 1 # check if the episode is done terminal = d or (ep_len == max_ep_len) # check if terminal or at max t for local epoch if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) # add the finish path to buffer buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) # reset environment variables # o, r, d, ep_ret, ep_len o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform TRPO or NPG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('KL', average_only=True) if algo == 'trpo': logger.log_tabular('BacktrackIters', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
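# A quick standalone check of the conjugate-gradient routine used in update():
# with a symmetric positive-definite matrix standing in for the Hessian-vector
# product Hx, cg should recover x = H^{-1} g. EPS mirrors the small constant the
# code above is assumed to define at module level.
import numpy as np

EPS = 1e-8

def cg_demo(Ax, b, cg_iters=10):
    x = np.zeros_like(b)
    r, p = b.copy(), b.copy()
    r_dot_old = np.dot(r, r)
    for _ in range(cg_iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + EPS)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])     # SPD matrix playing the role of H
g = np.array([1.0, 2.0])
x = cg_demo(lambda v: A @ v, g)
print(np.allclose(A @ x, g))               # True: cg solved H x = g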
def bail_learn(algo = 'bail_2_bah', env_set="Hopper-v2", seed=0, buffer_type='FinalSigma0.5_env_0_1000K', gamma=0.99, ue_rollout=1000, augment_mc='gain', C=None, eval_freq=625, max_timesteps=int(25e4), batch_size=1000, lr=1e-3, wd=0, ue_lr=3e-3, ue_wd=2e-2, ue_loss_k=1000, ue_vali_freq=1250, pct_anneal_type='constant', last_pct=0.25, select_type='border', logger_kwargs=dict()): """set up logger""" global logger logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) if not os.path.exists("./plots"): os.makedirs("./plots") if not os.path.exists("./pytorch_models"): os.makedirs("./pytorch_models") file_name = "%s_%s_%s" % (algo, env_set, seed) setting_name = "%s_r%s_g%s" % (buffer_type.replace('env', env_set), ue_rollout, gamma) setting_name += '_noaug' if not (augment_mc) else '' setting_name += '_augNew' if augment_mc == 'new' else '' print("---------------------------------------") print("Algo: " + file_name + "\tData: " + buffer_type) print("Settings: " + setting_name) print("Evaluate Policy every", eval_freq * batch_size * 0.8 / 1e6, 'epoches; Total', max_timesteps * batch_size * 0.8 / 1e6, 'epoches') print("---------------------------------------") env = gym.make(env_set) test_env = gym.make(env_set) # Set seeds env.seed(seed) test_env.seed(seed) env.action_space.np_random.seed(seed) test_env.action_space.np_random.seed(seed) torch.manual_seed(seed) np.random.seed(seed) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) # Load buffer replay_buffer = utils.ReplayBuffer() buffer_name = buffer_type.replace('env', env_set) replay_buffer.load(buffer_name) # Load data for training UE states = np.load('./results/ueMC_%s_S.npy' % buffer_name, allow_pickle=True).squeeze() setting_name += '_Gain' if augment_mc == 'gain' else '_Gt' gts = np.load('./results/ueMC_%s.npy' % setting_name, allow_pickle=True).squeeze() print('Load mc returns type', augment_mc, 'with gamma:', gamma, 'rollout length:', ue_rollout) # Start training print('-- Policy train starts --') # Initialize policy if algo == 'bail_2_bah': policy = bail_training.BAIL_selebah(state_dim, action_dim, max_action, max_iters=max_timesteps, States=states, MCrets=gts, ue_lr=ue_lr, ue_wd=ue_wd, pct_anneal_type=pct_anneal_type, last_pct=last_pct, pct_info_dic=pct_info_dic, select_type=select_type, C=C) elif algo == 'bail_1_buf': policy = bail_training.BAIL_selebuf(state_dim, action_dim, max_action, max_iters=max_timesteps, States=states, MCrets=gts, ue_lr=ue_lr, ue_wd=ue_wd, pct_anneal_type=pct_anneal_type, last_pct=last_pct, pct_info_dic=pct_info_dic, select_type=select_type, C=C) else: raise Exception("! 
undefined BAIL implementation '%s'" % algo) training_iters, epoch = 0, 0 while training_iters < max_timesteps: epoch += eval_freq * batch_size * 0.8 / 1e6 ue = policy.train(replay_buffer, training_iters, iterations=eval_freq, batch_size=batch_size, ue_loss_k=ue_loss_k, ue_vali_freq=ue_vali_freq, logger=logger) if training_iters >= max_timesteps - eval_freq: cur_ue_setting = 'Prog_' + setting_name + '_lossk%s_s%s' % (ue_loss_k, seed) bail_training.plot_envelope(ue, states, gts, cur_ue_setting, seed, [ue_lr, ue_wd, ue_loss_k, max_timesteps/batch_size, 4]) torch.save(ue.state_dict(), '%s/Prog_UE_%s.pth' % ("./pytorch_models", setting_name + \ '_s%s_lok%s' % (seed, ue_loss_k))) avgtest_reward = evaluate_policy(policy, test_env) training_iters += eval_freq # log training info logger.log_tabular('Epoch', epoch) logger.log_tabular('AverageTestEpRet', avgtest_reward) logger.log_tabular('TotalSteps', training_iters) logger.log_tabular('CloneLoss', average_only=True) logger.log_tabular('UELoss', average_only=True) logger.log_tabular('BatchUEtrnSize', average_only=True) logger.log_tabular('SVal', with_min_and_max=True) logger.log_tabular('SelePct', average_only=True) logger.log_tabular('BatchUpSize', with_min_and_max=True) logger.log_tabular('UEValiLossMin', average_only=True) if select_type == 'border': logger.log_tabular('Border', with_min_and_max=True) elif select_type == 'margin': logger.log_tabular('Margin', with_min_and_max=True) else: raise Exception('! undefined selection type') logger.dump_tabular()
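# bail_learn() above calls evaluate_policy(policy, test_env) each evaluation cycle,
# but the helper is defined elsewhere in the repository. A minimal stand-in
# consistent with that call might look like the following (assumed, not the actual
# helper; policy.select_action is an assumed BAIL/BCQ-style API).
def evaluate_policy(policy, eval_env, eval_episodes=10, max_ep_len=1000):
    """Average undiscounted return of the current policy over a few episodes."""
    total = 0.0
    for _ in range(eval_episodes):
        obs, done, steps = eval_env.reset(), False, 0
        while not done and steps < max_ep_len:
            action = policy.select_action(obs)   # assumed API
            obs, reward, done, _ = eval_env.step(action)
            total += reward
            steps += 1
    return total / eval_episodes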
def sqn_rpf(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, ensemble_size=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for policy/value/alpha learning). alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" # print(max_ep_len,type(max_ep_len)) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] obs_space = env.observation_space act_dim = env.action_space.n act_space = env.action_space # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders_from_space( obs_space, act_space, obs_space, None, None) # x_ph, x2_ph: shape(?,128) # a_ph: shape(?,1) # r_ph, d_ph: shape(?,) all_ph = [x_ph, a_ph, x2_ph, r_ph, d_ph] ###### if alpha == 'auto': # target_entropy = (-np.prod(env.action_space.n)) # target_entropy = (np.prod(env.action_space.n))/4/10 target_entropy = 0.15 log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) alpha = tf.exp(log_alpha) ###### # Main outputs from computation graph with tf.variable_scope('random_head'): head_index = tf.get_variable(name='random_int', shape=[], dtype=tf.int32) with tf.variable_scope('main'): mu, pi, _, q1, _, q2, _ = actor_critic(x_ph, a_ph, alpha, ensemble_size=ensemble_size, **ac_kwargs) # _, _, logp_pi, _, _ = actor_critic(x2_ph, a_ph, alpha, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, logp_pi_, _, q1_pi_, _, q2_pi_ = actor_critic( x2_ph, a_ph, alpha, ensemble_size=ensemble_size, **ac_kwargs) # Experience buffer if isinstance(act_space, Box): a_dim = act_dim elif isinstance(act_space, Discrete): a_dim = 1 replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=a_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t total: %d\n')%var_counts) ###### if isinstance(alpha, tf.Tensor): alpha_loss = tf.reduce_mean( -log_alpha * tf.stop_gradient(logp_pi_ + target_entropy)) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # Min Double-Q: min_q_pi = [tf.minimum(q1_pi_[i], q2_pi_[i]) for i in range(ensemble_size)] # Targets for Q and V regression # v_backup = tf.stop_gradient(q1_pi_ - alpha * logp_pi_) ############################## alpha=0 v_backup = [ tf.stop_gradient(min_q_pi[i] - alpha * logp_pi_[i]) for i in range(ensemble_size) ] # q_backup = tf.expand_dims(r_ph, axis=-1) + gamma*(1-tf.expand_dims(d_ph, axis=-1))*v_backup # q_backup = r_ph + gamma * (1 - d_ph) * v_backup q_backup = [ r_ph + gamma * (1 - d_ph) * v_backup[i] for i in range(ensemble_size) ] # Soft actor-critic losses # q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) # q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) # value_loss = q1_loss + q2_loss q1_loss = [ 0.5 * tf.reduce_mean((q_backup[i] - q1[i])**2, axis=0) for i in range(ensemble_size) ] q2_loss = [ 0.5 * tf.reduce_mean((q_backup[i] - q2[i])**2, axis=0) for i in range(ensemble_size) ] value_loss = [q1_loss[i] + q2_loss[i] for i in range(ensemble_size)] # # Policy train op # # (has to be separate from value train op, because q1_pi appears in pi_loss) # pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) # train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = 
tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') #with tf.control_dependencies([train_pi_op]): train_value_op = [ value_optimizer.minimize(value_loss[i], var_list=value_params) for i in range(ensemble_size) ] # train_value_op = [value_optimizer.minimize(value_loss)] # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies(train_value_op): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # zip([1,2,3,4],['a','b']) = [(1,'a'),(2,'b')] # All ops to call during one training step if isinstance(alpha, Number): step_ops = [ q1_loss[0], q1[0], logp_pi_[0], tf.identity(alpha), train_value_op, target_update ] # step_ops = [q1_loss[0], q1[0], logp_pi_[0], tf.identity(alpha), target_update] else: step_ops = [ q1_loss, q1, logp_pi_, alpha, train_value_op, target_update, train_alpha_op ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu[0], 'pi': pi[0], 'q1': q1[0] }) def get_action(o, active_head=0, deterministic=False): act_op = mu[active_head] if deterministic else pi[active_head] return sess.run(act_op, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0] def test_agent(n=3): # number of tests global sess, mu, pi, q1 for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # max_ep_len # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, deterministic=True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Select a head to interact with env. active_head = np.random.randint(ensemble_size) # t0 = time.time() total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ # if t > start_steps and 20*t/total_steps > np.random.random(): # greedy, avoid falling into sub-optimum if t > start_steps: a = get_action(o, active_head=active_head) else: a = env.action_space.sample() np.random.random() # Step the env o2, r, d, _ = env.step(a) #print(a,o2) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of episode. Training (ep_len times). if d or (ep_len == max_ep_len): # t_last = t0 # t0 = time.time() # print('episode_time:', t0-t_last, 'ep_len:', ep_len) """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } # step_ops = [q1_loss, q1, logp_pi_, alpha, train_value_op, target_update, train_alpha_op] # for i in range(ensemble_size): # batch = replay_buffer.sample_batch(batch_size) # feed_dict = {x_ph: batch['obs1'], # x2_ph: batch['obs2'], # a_ph: batch['acts'], # r_ph: batch['rews'], # d_ph: batch['done'], # } # # step_ops = [q1_loss, q1, logp_pi_, alpha, target_update, train_alpha_op] # q_values = sess.make_callable(train_value_op, [o_tm1]) # sess.run(train_value_op[i], feed_dict) # #print(i) outs = sess.run(step_ops, feed_dict) logger.store(LossQ1=outs[0], Q1Vals=outs[1], LogPi=outs[2], Alpha=outs[3]) logger.store(EpRet=ep_ret, EpLen=ep_len) # t_last = t0 # t0 = time.time() # print('training_time:', t0-t_last, 'num_train/ep_len:', ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Select a head to interact with env. active_head = np.random.randint(ensemble_size) # print(active_head) # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: # and ep_len < steps_per_epoch: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('Q1Vals', with_min_and_max=True) # logger.log_tabular('Q2Vals', with_min_and_max=True) # logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) # logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) # logger.log_tabular('LossQ2', average_only=True) # logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()

def sac(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=500,
        epochs=100000,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-4,
        alpha=0.2,
        batch_size=100,
        start_epochs=1000,
        max_ep_len=500,
        policy_path=None,
        logger_kwargs=dict(),
        save_freq=100,
        update_steps=10):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action
            pairs) for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_epochs (int): Number of epochs of uniform-random action
            selection, before running the real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        policy_path (str or None): Path to a pretrained policy to restore via
            ``load_policy``; if None, a fresh graph is built and initialized.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    test_logger_kwargs = dict()
    test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'], "test")
    test_logger_kwargs['exp_name'] = logger_kwargs['exp_name']
    test_logger = EpochLogger(**test_logger_kwargs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping (here the full per-dimension bound vector).
    act_limit = env.action_space.high

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space
    ac_kwargs['output_activation'] = None

    if policy_path is None:
        # Inputs to computation graph
        x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(
            obs_dim, act_dim, obs_dim, None, None)

        # Main outputs from computation graph
        with tf.variable_scope('main'):
            mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(
                x_ph, a_ph, **ac_kwargs)

        # Target value network
        with tf.variable_scope('target'):
            _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

        # A session is needed before the optimizer-variable initializers below
        # can run; when a pretrained policy is loaded, load_policy supplies it.
        sess = tf.Session()
    else:
        # todo
        # load pretrained model
        with tf.variable_scope('main'):
            sess, x_ph, a_ph, mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = load_policy(
                policy_path,
                itr='last',
                deterministic=False,
                act_high=env.action_space.high)
            x2_ph, r_ph, d_ph = core.placeholders(None, None, None)

        # Target value network
        with tf.variable_scope('target'):
            _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' +
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    sess.run(tf.variables_initializer(pi_optimizer.variables()))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)
    sess.run(tf.variables_initializer(value_optimizer.variables()))

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op,
        train_value_op, target_update
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    if policy_path is None:
        # Initialize the freshly built main/target networks; when a pretrained
        # policy was loaded, a global initialization would overwrite the
        # restored weights, so only the optimizer variables are initialized above.
        sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph, 'a': a_ph},
                          outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    def test_agent(n=91, test_num=1):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        env.unwrapped._set_test_mode(True)
        for i in range(n):
            observation = env.reset()
            policy_cumulated_reward = 0
            # episode_steps is expected to be defined at module level.
            for t in range(episode_steps):
                newObservation, reward, done, info = env.step(
                    get_action(np.array(observation), True))
                observation = newObservation
                if (t == episode_steps - 1):
                    print("reached the end")
                    done = True
                policy_cumulated_reward += reward
                if done:
                    test_logger.store(policy_reward=policy_cumulated_reward)
                    test_logger.store(policy_steps=t)
                    test_logger.store(arrive_des=info['arrive_des'])
                    break
                else:
                    pass
        test_logger.log_tabular('epoch', epoch)
        test_logger.log_tabular('policy_reward', average_only=True)
        test_logger.log_tabular('policy_steps', average_only=True)
        test_logger.log_tabular('arrive_des', average_only=True)
        test_logger.dump_tabular()

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    test_num = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        """
        Until start_epochs have elapsed, randomly sample actions from a
        uniform distribution for better exploration. Afterwards, use the
        learned policy.
        """
        # o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        for t in range(steps_per_epoch):
            if epoch > start_epochs:
                a = get_action(np.array(o))
            else:
                a = env.action_space.sample()

            # Step the env
            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            if (t == steps_per_epoch - 1):
                print("reached the end")
                d = True

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len == max_ep_len else d

            # Store experience to replay buffer
            replay_buffer.store(o, a, r, o2, d)

            # Super critical, easy to overlook step: make sure to update
            # most recent observation!
            o = o2

            if d:
                """
                Perform all SAC updates at the end of the trajectory.
                This is a slight difference from the SAC specified in the
                original paper.
                """
                for j in range(ep_len):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict = {
                        x_ph: batch['obs1'],
                        x2_ph: batch['obs2'],
                        a_ph: batch['acts'],
                        r_ph: batch['rews'],
                        d_ph: batch['done'],
                    }
                    outs = sess.run(step_ops, feed_dict)
                    logger.store(LossPi=outs[0],
                                 LossQ1=outs[1],
                                 LossQ2=outs[2],
                                 LossV=outs[3],
                                 Q1Vals=outs[4],
                                 Q2Vals=outs[5],
                                 VVals=outs[6],
                                 LogPi=outs[7])

                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                break

        # End of epoch wrap-up
        if (epoch > 0 and epoch % save_freq == 0) or (epoch == epochs - 1):
            # Save model
            logger.save_state({}, None)

            # Test the performance of the deterministic version of the agent.
            test_num += 1
            test_agent(test_num=test_num)

            # Log info about epoch
            # (TestEpRet / TestEpLen are tracked by test_logger in test_agent,
            # not by this logger, so they are not tabulated here.)
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            # logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            # logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
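
# ---------------------------------------------------------------------------
# Illustration only: what the `target_update` op above does, written as a
# minimal NumPy sketch. Each target parameter tracks its main counterpart via
# polyak averaging: theta_targ <- rho * theta_targ + (1 - rho) * theta. The
# function and variable names below are hypothetical; in the code above this
# runs as a grouped tf.assign inside the same sess.run as the value update.
import numpy as np


def polyak_update(main_params, target_params, polyak=0.995):
    """In-place soft update of a list of NumPy parameter arrays."""
    for theta, theta_targ in zip(main_params, target_params):
        theta_targ *= polyak                 # rho * theta_targ
        theta_targ += (1.0 - polyak) * theta  # + (1 - rho) * theta

# Example:
# main = [np.ones((2, 2))]
# targ = [np.zeros((2, 2))]
# polyak_update(main, targ)   # targ is now 0.005 everywhere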

class AntSoftActorCritic:
    def var_scope(self, var):
        return self.main_scope + '/' + var

    def __init__(self, env_fn, reward_fn=[], actor_critic=core.mlp_actor_critic,
                 xid=0, seed=0, max_ep_len=1000, gamma=.99, alpha=0.2, lr=1e-3,
                 polyak=0.995, replay_size=int(1e6), ac_kwargs=dict(),
                 logger_kwargs=dict(), normalization_factors=[],
                 learn_reduced=False):
        tf.set_random_seed(seed)
        np.random.seed(seed)

        self.xid = xid
        self.main_scope = 'main' + str(xid)
        self.target_scope = 'target' + str(xid)

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(logger_kwargs)

        self.max_ep_len = max_ep_len
        self.reward_fn = reward_fn
        self.normalization_factors = normalization_factors
        self.learn_reduced = learn_reduced

        self.env, self.test_env = env_fn(), env_fn()
        self.obs_dim = len(self.env.env.state_vector())
        if self.learn_reduced:
            self.obs_dim = ant_utils.expected_state_dim
        self.act_dim = self.env.action_space.shape[0]

        # Action limit for clamping: critically, assumes all dimensions share
        # the same bound!
        self.act_limit = self.env.action_space.high[0]

        # Share information about action space with policy architecture
        ac_kwargs['action_space'] = self.env.action_space

        self.graph = tf.Graph()
        with self.graph.as_default():
            # Inputs to computation graph
            self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(
                self.obs_dim, self.act_dim, self.obs_dim, None, None)

            # Main outputs from computation graph
            with tf.variable_scope(self.main_scope):
                (self.mu, self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi,
                 self.q2_pi, self.v, self.std) = actor_critic(
                     self.x_ph, self.a_ph, **ac_kwargs)

            # Target value network
            with tf.variable_scope(self.target_scope):
                _, _, _, _, _, _, _, self.v_targ, _ = actor_critic(
                    self.x2_ph, self.a_ph, **ac_kwargs)

            # Experience buffer
            self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim,
                                              act_dim=self.act_dim,
                                              size=replay_size)

            # Min Double-Q:
            min_q_pi = tf.minimum(self.q1_pi, self.q2_pi)

            # Targets for Q and V regression
            q_backup = tf.stop_gradient(
                self.r_ph + gamma * (1 - self.d_ph) * self.v_targ)
            v_backup = tf.stop_gradient(min_q_pi - alpha * self.logp_pi)

            # Soft actor-critic losses
            pi_loss = tf.reduce_mean(alpha * self.logp_pi - self.q1_pi)
            q1_loss = 0.5 * tf.reduce_mean((q_backup - self.q1)**2)
            q2_loss = 0.5 * tf.reduce_mean((q_backup - self.q2)**2)
            v_loss = 0.5 * tf.reduce_mean((v_backup - self.v)**2)
            value_loss = q1_loss + q2_loss + v_loss

            # Policy train op
            # (has to be separate from value train op, because q1_pi appears in pi_loss)
            pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
            train_pi_op = pi_optimizer.minimize(
                pi_loss, var_list=get_vars(self.var_scope('pi')))

            # Value train op
            # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
            value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
            value_params = get_vars(self.var_scope('q')) + get_vars(self.var_scope('v'))
            with tf.control_dependencies([train_pi_op]):
                train_value_op = value_optimizer.minimize(value_loss,
                                                          var_list=value_params)

            # Polyak averaging for target variables
            # (control flow because sess.run otherwise evaluates in nondeterministic order)
            with tf.control_dependencies([train_value_op]):
                target_update = tf.group([
                    tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                    for v_main, v_targ in zip(get_vars(self.main_scope),
                                              get_vars(self.target_scope))
                ])

            # All ops to call during one training step
            self.step_ops = [
                pi_loss, q1_loss, q2_loss, v_loss, self.q1, self.q2, self.v,
                self.logp_pi, train_pi_op, train_value_op, target_update
            ]

            # Initializing targets to match main variables
            # (a plain loop variable is used so that self.v_targ, the target
            # value tensor, is not overwritten by the comprehension)
            target_init = tf.group([
                tf.assign(v_targ, v_main)
                for v_main, v_targ in zip(get_vars(self.main_scope),
                                          get_vars(self.target_scope))
            ])

            self.sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(target_init)

    def reward(self, env, r, o):
        if len(self.reward_fn) == 0:
            return r
        # use self.normalization_factors to normalize the state.
        tup = tuple(ant_utils.discretize_state(o, self.normalization_factors, env))
        return self.reward_fn[tup]

    def get_action(self, o, deterministic=False):
        if self.learn_reduced:
            o = ant_utils.convert_obs(o)
        with self.graph.as_default():
            act_op = self.mu if deterministic else self.pi
            action = self.sess.run(act_op, feed_dict={self.x_ph: o.reshape(1, -1)})[0]
            return action

    def get_sigma(self, o):
        if self.learn_reduced:
            o = ant_utils.convert_obs(o)
        with self.graph.as_default():
            return self.sess.run(self.std, feed_dict={self.x_ph: o.reshape(1, -1)})[0]

    def test_agent(self, T, n=10, initial_state=[], normalization_factors=[],
                   store_log=True, deterministic=True, reset=False):
        denom = 0
        p = np.zeros(shape=(tuple(ant_utils.num_states)))
        p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))

        for j in range(n):
            o, r, d, ep_ret, ep_len = self.test_env.reset(), 0, False, 0, 0
            if len(initial_state) > 0:
                qpos = initial_state[:len(ant_utils.qpos)]
                qvel = initial_state[len(ant_utils.qpos):]
                self.test_env.env.set_state(qpos, qvel)
                o = self.test_env.env._get_obs()
            o = get_state(self.test_env, o)

            while not (d or (ep_len == T)):
                # Take deterministic actions at test time
                a = self.get_action(o, deterministic)
                o, r, d, _ = self.test_env.step(a)
                o = get_state(self.test_env, o)
                r = self.reward(self.test_env, r, o)

                ep_ret += r
                ep_len += 1
                denom += 1

                p[tuple(ant_utils.discretize_state(o, normalization_factors,
                                                   self.test_env))] += 1
                p_xy[tuple(ant_utils.discretize_state_2d(o, normalization_factors,
                                                         self.test_env))] += 1

                if d and reset:
                    d = False

            if store_log:
                self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

        p /= float(denom)
        p_xy /= float(denom)
        return p, p_xy

    def test_agent_random(self, T, normalization_factors=[], n=10):
        p = np.zeros(shape=(tuple(ant_utils.num_states)))
        p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))

        cumulative_states_visited_baseline = 0
        states_visited_baseline = []
        cumulative_states_visited_xy_baseline = 0
        states_visited_xy_baseline = []

        denom = 0
        for j in range(n):
            o, r, d, ep_ret, ep_len = self.test_env.reset(), 0, False, 0, 0
            o = get_state(self.test_env, o)
            while not (d or (ep_len == T)):
                a = self.test_env.action_space.sample()
                o, r, d, _ = self.test_env.step(a)
                o = get_state(self.test_env, o)
                r = self.reward(self.test_env, r, o)

                # if this is the first time you are seeing this state, increment.
                if p[tuple(ant_utils.discretize_state(o, normalization_factors,
                                                      self.test_env))] == 0:
                    cumulative_states_visited_baseline += 1
                states_visited_baseline.append(cumulative_states_visited_baseline)
                if p_xy[tuple(ant_utils.discretize_state_2d(o, normalization_factors,
                                                            self.test_env))] == 0:
                    cumulative_states_visited_xy_baseline += 1
                states_visited_xy_baseline.append(cumulative_states_visited_xy_baseline)

                p[tuple(ant_utils.discretize_state(o, normalization_factors,
                                                   self.test_env))] += 1
                p_xy[tuple(ant_utils.discretize_state_2d(o, normalization_factors,
                                                         self.test_env))] += 1
                denom += 1
                ep_len += 1

                if d:
                    # CRITICAL: ignore done signal
                    d = False

        p /= float(denom)
        p_xy /= float(denom)
        return p, p_xy, states_visited_baseline, states_visited_xy_baseline

    # record film of policy
    def record(self, T, n, video_dir='', on_policy=False, deterministic=False):
        print("rendering env in record()")
        # TODO: set width and height.
        for i in range(n):
            self.test_env.reset()
            wrapped_env = wrappers.Monitor(self.test_env, video_dir + '_%d' % (i))
            o = wrapped_env.reset()
            t = 0
            d = False
            while t < T and not d:
                o = wrapped_env.unwrapped.state_vector()
                if on_policy:
                    a = self.get_action(o, deterministic)
                else:
                    a = wrapped_env.unwrapped.action_space.sample()
                o2, r, d, _ = wrapped_env.step(a)
                print(t)
                if np.all(np.isclose(o, wrapped_env.unwrapped.state_vector())):
                    print('close!')
                    break
                wrapped_env.unwrapped.render(mode='rgb_array', width=1000, height=1000)
                o = o2
                t = t + 1
            wrapped_env.close()

    def soft_actor_critic(self, initial_state=[], steps_per_epoch=5000,
                          epochs=100, batch_size=100, start_steps=10000,
                          save_freq=1):
        with self.graph.as_default():
            # Count variables
            var_counts = tuple(
                core.count_vars(scope) for scope in [
                    self.var_scope('pi'), self.var_scope('q1'),
                    self.var_scope('q2'), self.var_scope('v'), self.main_scope
                ])
            print(('\nNumber of parameters: \t pi: %d, \t' +
                   'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts)

            # Setup model saving
            self.logger.setup_tf_saver(self.sess,
                                       inputs={'x': self.x_ph, 'a': self.a_ph},
                                       outputs={'mu': self.mu, 'pi': self.pi,
                                                'q1': self.q1, 'q2': self.q2,
                                                'v': self.v})

            start_time = time.time()
            o, r, d, ep_ret, ep_len = self.env.reset(), 0, False, 0, 0
            if len(initial_state) > 0:
                qpos = initial_state[:len(ant_utils.qpos)]
                qvel = initial_state[len(ant_utils.qpos):]
                self.env.env.set_state(qpos, qvel)
                o = self.env.env._get_obs()
            o = get_state(self.env, o)

            total_steps = steps_per_epoch * epochs

            # Main loop: collect experience in env and update/log each epoch
            for t in range(total_steps):
                """
                Until start_steps have elapsed, randomly sample actions from a
                uniform distribution for better exploration. Afterwards, use
                the learned policy.
                """
                if t > start_steps:
                    # if t == start_steps + 1:
                    #     print("!!!! using policy !!!!")
                    a = self.get_action(o)
                else:
                    a = self.env.action_space.sample()

                # Step the env
                o2, r, d, _ = self.env.step(a)
                o2 = get_state(self.env, o2)
                r = self.reward(self.env, r, o2)
                ep_ret += r
                ep_len += 1

                # Ignore the "done" signal if it comes from hitting the time
                # horizon (that is, when it's an artificial terminal signal
                # that isn't based on the agent's state)
                d = False if ep_len == self.max_ep_len else d

                # Store experience to replay buffer
                if self.learn_reduced:
                    self.replay_buffer.store(ant_utils.convert_obs(o), a, r,
                                             ant_utils.convert_obs(o2), d)
                else:
                    self.replay_buffer.store(o, a, r, o2, d)

                # Super critical: update most recent observation.
                o = o2

                if d or (ep_len == self.max_ep_len):
                    """
                    Perform all SAC updates at the end of the trajectory.
                    This is a slight difference from the SAC specified in the
                    original paper.
                    """
                    for j in range(ep_len):
                        batch = self.replay_buffer.sample_batch(batch_size)
                        feed_dict = {
                            self.x_ph: batch['obs1'],
                            self.x2_ph: batch['obs2'],
                            self.a_ph: batch['acts'],
                            self.r_ph: batch['rews'],
                            self.d_ph: batch['done'],
                        }
                        outs = self.sess.run(self.step_ops, feed_dict)
                        self.logger.store(LossPi=outs[0],
                                          LossQ1=outs[1],
                                          LossQ2=outs[2],
                                          LossV=outs[3],
                                          Q1Vals=outs[4],
                                          Q2Vals=outs[5],
                                          VVals=outs[6],
                                          LogPi=outs[7])

                    self.logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, r, d, ep_ret, ep_len = self.env.reset(), 0, False, 0, 0
                    if len(initial_state) > 0:
                        qpos = initial_state[:len(ant_utils.qpos)]
                        qvel = initial_state[len(ant_utils.qpos):]
                        self.env.env.set_state(qpos, qvel)
                        o = self.env.env._get_obs()
                    o = get_state(self.env, o)

                # End of epoch wrap-up
                if t > 0 and t % steps_per_epoch == 0:
                    epoch = t // steps_per_epoch

                    # Save model
                    if (epoch % save_freq == 0) or (epoch == epochs - 1):
                        self.logger.save_state({'env': self.env}, None)

                    # Test the performance of the deterministic version of the agent.
                    self.test_agent(self.max_ep_len)

                    # Log info about epoch
                    self.logger.log_tabular('Epoch', epoch)
                    self.logger.log_tabular('EpRet', with_min_and_max=False)
                    self.logger.log_tabular('TestEpRet', with_min_and_max=False)
                    self.logger.log_tabular('EpLen', average_only=True)
                    self.logger.log_tabular('TestEpLen', average_only=True)
                    self.logger.log_tabular('LossPi', average_only=True)
                    self.logger.log_tabular('LossQ1', average_only=True)
                    self.logger.log_tabular('LossQ2', average_only=True)
                    self.logger.log_tabular('LossV', average_only=True)
                    self.logger.dump_tabular()
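
# ---------------------------------------------------------------------------
# Illustration only: the state-visitation bookkeeping used by test_agent and
# test_agent_random above, reduced to a self-contained NumPy sketch. States
# are discretized into a fixed grid (a hypothetical 2-D grid standing in for
# ant_utils.discretize_state), visits are counted, and the counts are
# normalized into an empirical distribution p. Names and bounds are assumptions.
import numpy as np


def visitation_distribution(states, bins=10, low=-1.0, high=1.0):
    """states: (N, 2) array of continuous 2-D states."""
    p = np.zeros((bins, bins))
    edges = np.linspace(low, high, bins + 1)
    for s in states:
        # Map each coordinate to a grid cell index, clipping to the grid.
        ix = np.clip(np.digitize(s[0], edges) - 1, 0, bins - 1)
        iy = np.clip(np.digitize(s[1], edges) - 1, 0, bins - 1)
        p[ix, iy] += 1
    return p / max(len(states), 1)

# Example:
# rng = np.random.default_rng(0)
# p = visitation_distribution(rng.uniform(-1, 1, size=(500, 2)))
# assert np.isclose(p.sum(), 1.0)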