def __init__(self, agent, env, steps_per_epoch=4000, epochs=50, seed=0,
             output_dir=None, output_fname='progress.txt', exp_name=None,
             max_ep_len=1000, gamma=0.99, lam=0.97):
    self.epoch_len, self.n_epochs = steps_per_epoch, epochs
    self.max_ep_len = max_ep_len
    self.logger = EpochLogger(output_dir=output_dir,
                              output_fname=output_fname,
                              exp_name=exp_name)
    print('locals')
    for key, val in locals().items():
        print('{}: {}'.format(key, len(str(val))))
    # self.logger.save_config(locals())
    self.env, self.agent = env, agent
    self.buffer = OnPolicyBuffer(steps_per_epoch, gamma=gamma, lam=lam)
    saver_kwargs = agent.build_graph(env.observation_space, env.action_space)
    self.logger.setup_tf_saver(**saver_kwargs)
    var_counts = tuple(tf_utils.trainable_count(scope) for scope in ['pi', 'v'])
    self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)
    np.random.seed(seed)
    tf.set_random_seed(seed)
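# The snippet above prints the length of each local instead of calling the
# commented-out save_config, presumably because some locals (agent, env, the
# logger itself) are large or not JSON-serializable. A minimal sketch of one way
# to restore config saving; the helper name `serializable_locals` is hypothetical.
import json

def serializable_locals(local_vars, max_len=200):
    """Keep only locals whose repr is short and JSON-serializable."""
    config = {}
    for key, val in local_vars.items():
        if key == 'self' or len(str(val)) > max_len:
            continue
        try:
            json.dumps(val)
            config[key] = val
        except (TypeError, ValueError):
            config[key] = str(val)
    return config

# e.g. inside __init__:  self.logger.save_config(serializable_locals(locals()))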
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    logger_kwargs = setup_logger_kwargs('MultiTaskDDPG')
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(globals())
    self.start_steps = 10000
def __init__(self, env, EP_MAX=1000, EP_LEN=250, GAMMA=0.99, LR=0.0001, BATCH=32,
             UPDATE_STEP=10, hidden_sizes=(64, 64), activation=tf.tanh,
             output_activation=tf.tanh, act_noise_amount=0.01, logger_kwargs=dict()):
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(locals())
    self.EP_MAX = EP_MAX
    self.EP_LEN = EP_LEN
    self.GAMMA = GAMMA
    self.LR = LR
    self.BATCH = BATCH
    self.UPDATE_STEP = UPDATE_STEP
    self.S_DIM = env.observation_space.shape[-1]
    self.A_DIM = env.action_space.shape[-1]
    self.act_high = env.action_space.high
    self.act_low = env.action_space.low
    self.hidden_sizes = hidden_sizes
    self.activation = activation
    self.output_activation = output_activation
    self.act_noise_amount = act_noise_amount
    self.sess = tf.Session()
    self.tfs = tf.placeholder(tf.float32, [None, self.S_DIM], 'state')
    self.tfa = tf.placeholder(tf.float32, [None, self.A_DIM], 'action')

    # policy
    self.pi, self.pi_params = self._build_net('pi', trainable=True)

    with tf.variable_scope('loss'):
        self.loss = tf.reduce_mean(tf.square(self.pi - self.tfa))
    with tf.variable_scope('train'):
        self.train_op = tf.train.AdamOptimizer(LR).minimize(self.loss)

    tf.summary.FileWriter("log/", self.sess.graph)
    self.sess.run(tf.global_variables_initializer())

    # Setup model saving
    self.logger.setup_tf_saver(self.sess, inputs={'x': self.tfs, 'a': self.tfa},
                               outputs={'pi': self.pi})
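# The setup above trains the policy with a mean-squared error between its output
# and demonstrated actions (a behavior-cloning-style loss). A minimal sketch of
# companion methods that feed (state, action) batches into train_op and act with
# exploration noise; `update` and `choose_action` are hypothetical names not shown
# in the snippet.
import numpy as np

def update(self, states, actions):
    # states: (batch, S_DIM), actions: (batch, A_DIM)
    loss, _ = self.sess.run([self.loss, self.train_op],
                            feed_dict={self.tfs: states, self.tfa: actions})
    return loss

def choose_action(self, state, add_noise=True):
    action = self.sess.run(self.pi, feed_dict={self.tfs: state[None, :]})[0]
    if add_noise:
        action += self.act_noise_amount * (self.act_high - self.act_low) * \
            np.random.randn(self.A_DIM)
    return np.clip(action, self.act_low, self.act_high)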
def __init__(self, config):
    self.log_dir = config["output_dir"]
    logger_kwargs = setup_logger_kwargs(config["exp_name"], config["seed"])
    logger_kwargs["output_dir"] = config["output_dir"]
    self.csv_logger = EpochLogger(**logger_kwargs)
    self.csv_logger.save_config(config)
    self.tf_logger = SummaryWriter(os.path.join(self.log_dir))
def __init__(self, env_maker: Callable, ac_maker=core.MLPActorCritic, ac_kwargs={},
             seed: int = 0, epochs: int = 50, steps_per_epoch: int = 4000,
             gamma: float = 0.99, actor_lr: float = 3e-4, critic_lr: float = 1e-3,
             num_iter_train_critic: int = 80, lam: float = 0.97,
             max_episode_len: int = 1000, logger_kwargs=dict(), save_freq: int = 10):
    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    self.epochs = epochs
    self.steps_per_epoch = steps_per_epoch
    self.num_iter_train_critic = num_iter_train_critic
    self.max_episode_len = max_episode_len
    self.save_freq = save_freq

    # make env
    self.env = env_maker()
    self.obs_dim = self.env.observation_space.shape
    self.act_dim = self.env.action_space.shape

    # make actor-critic
    self.ac = ac_maker(self.env.observation_space, self.env.action_space, **ac_kwargs)

    # make buffer
    self.local_steps_per_epoch = int(steps_per_epoch / num_procs())
    self.buffer = Buffer(self.obs_dim, self.act_dim, self.local_steps_per_epoch,
                         gamma, lam)

    # make optimizers
    self.actor_optimizer = Adam(self.ac.actor.parameters(), lr=actor_lr)
    self.critic_optimizer = Adam(self.ac.critic.parameters(), lr=critic_lr)

    # Sync params across processes
    sync_params(self.ac)

    # Count variables
    var_counts = tuple(core.count_vars(module)
                       for module in [self.ac.actor, self.ac.critic])
    self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up model saving
    self.logger.setup_pytorch_saver(self.ac)
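# A minimal usage sketch, assuming the __init__ above belongs to a trainer class
# (called VPGTrainer here, a hypothetical name) that exposes some train() entry
# point not shown in the snippet.
import gym
from spinup.utils.run_utils import setup_logger_kwargs

logger_kwargs = setup_logger_kwargs('vpg_halfcheetah', seed=0)
trainer = VPGTrainer(env_maker=lambda: gym.make('HalfCheetah-v2'),
                     seed=0, epochs=50, steps_per_epoch=4000,
                     max_episode_len=1000, logger_kwargs=logger_kwargs)
# trainer.train()  # hypothetical entry point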
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    logger_kwargs = setup_logger_kwargs('MultiTaskDDPGAutoQuery')
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(globals())
    self.init_query = False
    self.init_reward = False
    self.query_reward = 0
def __init__(self, env_str, seed=0, logger_kwargs=dict()):
    self.env_str = env_str
    print(self.env_str)
    tf.set_random_seed(seed)
    np.random.seed(seed)
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(locals())
class MultiTaskDDPGAugmentedOracle(MultiTaskDDPG):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger_kwargs = setup_logger_kwargs('MultiTaskDDPGAugmentedOracle')
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(globals())

    def process_state(self, state):
        query = self.reward_function(None, None, None, True)
        return np.append(state, query)
class MultiTaskDDPG(SingleTaskDDPG):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger_kwargs = setup_logger_kwargs('MultiTaskDDPG')
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(globals())
        self.start_steps = 10000

    def reset(self, reward_function):
        self.reward_function = reward_function
def evaluate_agent(env, agent: Agent, deterministic=True, num_episodes=5,
                   render=False, logger=None):
    assert env is not None, \
        "Environment not found!\n\n It looks like the environment wasn't saved, " + \
        "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \
        "page on Experiment Outputs for how to handle this situation."
    assert env.spec.max_episode_steps > 0

    if logger is None:
        logger = EpochLogger()

    episode_info = []
    goal_grid_code = None
    for _ in range(num_episodes):
        obs = env.reset()
        agent.reset()
        reward = 0
        done = False
        episode_return = 0
        t = 0
        while not done:
            if render:
                env.render()
                time.sleep(1e-3)
            with torch.no_grad():
                action = agent.act(obs, reward, goal_grid_code, deterministic)
            obs, reward, done, _ = env.step(action)
            episode_return += reward
            t += 1
            if done:
                goal_grid_code = agent.current_grid_code.detach().cpu().numpy()
        episode_info.append((t, episode_return))
        logger.store(TestEpRet=episode_return, TestEpLen=t)

    logger.log_tabular('EpisodeLimit', env.spec.max_episode_steps)
    logger.log_tabular('TestEpLen', with_min_and_max=True)
    logger.log_tabular('TestEpRet', with_min_and_max=True)
    logger.dump_tabular()
    env.close()
    return episode_info
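# A minimal usage sketch for evaluate_agent above. RandomGridAgent is hypothetical
# and only illustrates the interface the function expects: reset(),
# act(obs, reward, goal_grid_code, deterministic), and a `current_grid_code`
# tensor that is read back when an episode ends.
import gym
import torch

class RandomGridAgent:
    def __init__(self, env):
        self.env = env
        self.current_grid_code = torch.zeros(16)  # placeholder grid code

    def reset(self):
        pass

    def act(self, obs, reward, goal_grid_code, deterministic):
        return self.env.action_space.sample()

env = gym.make('MountainCarContinuous-v0')
agent = RandomGridAgent(env)
episode_info = evaluate_agent(env, agent, deterministic=True,
                              num_episodes=3, render=False)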
def get_mc(env_set="Hopper-v2", seed=1, buffer_type="sacpolicy_env_stopcrt_2_det_bear", cut_buffer_size='1000K', gamma=0.99, rollout=1000, augment_mc=True, logger_kwargs=dict()): print('MClength:', rollout) print('Discount value', gamma) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("running on device:", device) global logger logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) if not os.path.exists("./results"): os.makedirs("./results") setting_name = "%s_r%s_g%s" % (buffer_type.replace('env', env_set), rollout, gamma) setting_name += 'noaug' if not (augment_mc) else '' print("---------------------------------------") print("Settings: " + setting_name) print("---------------------------------------") # Load buffer if 'sac' in buffer_type: replay_buffer = utils.BEAR_ReplayBuffer() desire_stop_dict = {'Hopper-v2': 1000, 'Walker2d-v2': 500, 'HalfCheetah-v2': 4000, 'Ant-v2': 750} buffer_name = buffer_type.replace('env', env_set).replace('crt', str(desire_stop_dict[env_set])) replay_buffer.load(buffer_name) buffer_name += '_1000K' setting_name = setting_name.replace('crt', str(desire_stop_dict[env_set])) elif 'FinalSigma' in buffer_type: replay_buffer = utils.ReplayBuffer() buffer_name = buffer_type.replace('env', env_set) replay_buffer.load(buffer_name) else: raise FileNotFoundError('! Unknown type of dataset %s' % buffer_type) print('Starting MC calculation, type:', augment_mc) if augment_mc == 'gain': states, gains = calculate_mc_gain(replay_buffer, rollout=rollout, gamma=gamma) if not os.path.exists('./results/ueMC_%s_S.npy' % buffer_name): np.save('./results/ueMC_%s_S' % (buffer_name + '_' + cut_buffer_size), states) np.save('./results/ueMC_%s_Gain' % setting_name, gains) else: raise Exception('! undefined mc calculation type') print('Calculation finished ==')
class Logger:
    def __init__(self, config):
        self.log_dir = config["output_dir"]
        logger_kwargs = setup_logger_kwargs(config["exp_name"], config["seed"])
        logger_kwargs["output_dir"] = config["output_dir"]
        self.csv_logger = EpochLogger(**logger_kwargs)
        self.csv_logger.save_config(config)
        self.tf_logger = SummaryWriter(os.path.join(self.log_dir))
        # self.tf_logger.set_as_default()

    def store(self, data_dict, agent):
        for k, v in data_dict.items():
            if len(agent) > 1:
                for i in agent:
                    key = "{}_{}".format(k, i)
                    k_v = {key: v[i]}
                    self.csv_logger.store(**k_v)
            else:
                key = "{}_{}".format(k, agent[0])
                k_v = {key: v}
                self.csv_logger.store(**k_v)

    def dump(self, keys, agents, step, mean_only):
        for k in keys:
            for i in agents:
                key = "{}_{}".format(k, i)
                value = self.csv_logger.epoch_dict[key]
                self.csv_logger.log_tabular(key, average_only=mean_only)
                if mean_only:
                    self.tf_logger.add_scalar(key, np.mean(value), step)
                    # self.tf_logger.add_scalar(scalar)
                else:
                    for p, q in zip(
                        ["min", "mean", "max", "std"],
                        [np.min(value), np.mean(value), np.max(value), np.std(value)],
                    ):
                        # scalar = tf.compat.v1.summary.scalar("{}_{}".format(key, p), q, step)
                        self.tf_logger.add_scalar("{}_{}".format(key, p), q, step)
        self.csv_logger.dump_tabular()
        self.tf_logger.flush()
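# A minimal usage sketch for the Logger wrapper above, assuming a config dict with
# the keys its __init__ reads; the agent ids and metric name are made up for
# illustration. store() keys each metric as "<name>_<agent_id>", and dump()
# expects the same base names plus the list of agent ids.
config = {
    "output_dir": "./runs/demo",
    "exp_name": "multi_agent_demo",
    "seed": 0,
}
logger = Logger(config)
agents = [0, 1]
for step in range(3):
    logger.store({"EpRet": {0: 1.0 * step, 1: 2.0 * step}}, agents)
logger.dump(["EpRet"], agents, step=3, mean_only=True)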
def meil(WORKING_DIR, EXPERT_DIR, args):
    expert_distribution = Gaussian_Density()
    with tf.Session() as sess:
        env = gym.make(args.env)
        expert = load_policy(sess, EXPERT_DIR)
        expert_distribution.train(env, expert, args.trajects, args.distr_gamma,
                                  args.iter_length)
        env.close()
    expert_density = expert_distribution.density()

    env = gym.make(args.env)
    policy_distr = Gaussian_Density()
    policy = lambda s: np.random.uniform(-2.0, 2.0, size=env.action_space.shape)  # random policy
    policy_distr.train(env, policy, args.trajects, args.distr_gamma, args.iter_length)
    density = policy_distr.density()

    logger_kwargs = setup_logger_kwargs("result", data_dir=WORKING_DIR)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    for i in range(args.rounds):
        reward = lambda s: expert_density(s) / (density(s) + args.eps)
        message = "\nRound {} out of {}\n".format(i + 1, args.rounds)
        reuse = (i > 0)
        ppo(logger, reuse, message, lambda: gym.make(args.env), reward,
            actor_critic=core.mlp_actor_critic,
            ac_kwargs=dict(hidden_sizes=[args.hid] * args.l),
            gamma=args.gamma, steps_per_epoch=args.steps, epochs=args.epochs,
            logger_kwargs=logger_kwargs)

        with tf.Session() as sess:
            policy = load_policy(sess, os.path.join(WORKING_DIR, str(i)))
            policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                               args.iter_length)
            density = policy_distr.density()

    env.close()
    opt_dir = reward_validation(WORKING_DIR, args)
    return opt_dir
class MultiTaskDDPGQuery(MultiTaskDDPG):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger_kwargs = setup_logger_kwargs('MultiTaskDDPGQuery')
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(globals())

    def process_state(self, state):
        query_state = np.zeros(17)
        query_state[0] = -.05
        next_state = np.zeros(17)
        next_state[0] = .007
        action = self.rng.rand(6)
        query = self.reward_function(query_state, action, next_state)
        return np.append(state, query)
def get_mc(env_set="Hopper-v2", seed=0, buffer_type='FinalSAC_env_0_1000K', gamma=0.99, rollout=1000, augment_mc='gain', logger_kwargs=dict()): print('MClength:', rollout) print('Discount value', gamma) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("running on device:", device) global logger logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) if not os.path.exists("./results"): os.makedirs("./results") setting_name = "%s_r%s_g%s" % (buffer_type.replace( 'env', env_set), rollout, gamma) setting_name += 'noaug' if not (augment_mc) else '' print("---------------------------------------") print("Settings: " + setting_name) print("---------------------------------------") # Load buffer replay_buffer = utils.ReplayBuffer() buffer_name = buffer_type.replace('env', env_set) replay_buffer.load(buffer_name) print('Starting MC calculation, type:', augment_mc) if augment_mc == 'gain': states, gains = calculate_mc_gain(replay_buffer, rollout=rollout, gamma=gamma) if not os.path.exists('./results/ueMC_%s_S.npy' % buffer_name): np.save('./results/ueMC_%s_S' % buffer_name, states) print(len(gains)) np.save('./results/ueMC_%s_Gain' % setting_name, gains) else: raise Exception('! undefined mc calculation type') print('Calculation finished ==')
def evaluate_agent(env, agent: IAgent, deterministic=True, num_episodes=5,
                   episode_len_limit=None, render=False, logger=None):
    assert env is not None, \
        "Environment not found!\n\n It looks like the environment wasn't saved, " + \
        "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \
        "page on Experiment Outputs for how to handle this situation."

    if episode_len_limit is None:
        if env.unwrapped.spec and env.unwrapped.spec.max_episode_steps:
            episode_len_limit = env.spec.max_episode_steps
        else:
            raise ValueError("Episode length limit must be specified")

    if logger is None:
        logger = EpochLogger()

    episode_info = []
    for _ in range(num_episodes):
        obs = env.reset()
        agent.reset()
        done = False
        episode_return = 0
        t = 0
        while not done and t != episode_len_limit:
            if render:
                env.render()
                time.sleep(1e-3)
            with torch.no_grad():
                action = agent.act(obs, deterministic)
            obs, reward, done, _ = env.step(action)
            episode_return += reward
            t += 1
        episode_info.append((t, episode_return))
        logger.store(TestEpRet=episode_return, TestEpLen=t)

    logger.log_tabular('EpisodeLimit', episode_len_limit)
    logger.log_tabular('TestEpLen', with_min_and_max=True)
    logger.log_tabular('TestEpRet', with_min_and_max=True)
    logger.dump_tabular()
    return episode_info
def __init__(self, action_space, observation_space, rng, eps=0.9,
             discount_factor=0.99, alpha=1e-3):
    self.rng = rng
    logger_kwargs = setup_logger_kwargs('SingleTaskDDPG', self.rng)
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(locals())
    self.actor_critic = MLPActorCritic
    # ac_kwargs=dict() ****?????*****
    # seed=0
    self.replay_size = int(1e6)
    self.polyak = 0.995
    self.gamma = discount_factor
    self.pi_lr = alpha
    self.q_lr = alpha
    self.batch_size = 100
    self.start_steps = 10000
    self.update_after = 1000
    self.update_every = 50
    self.act_noise = 0.1
    self.step_count = 0
    self.action_space = action_space
    self.observation_space = observation_space
    # self.observation_space = spaces.Box(-np.inf, np.inf, shape=(17,), dtype=np.float32)  # fix
    # torch.manual_seed(seed)
    # np.random.seed(seed)
    # self.obs_dim = self.observation_space.shape
    self.act_dim = self.action_space.shape[0]
    # act_dim = self.action_space.n
    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    self.act_limit = self.action_space.high[0]
    self.net = False
def __init__(self, agent, env, steps_per_epoch=5000, epochs=50, seed=0,
             max_ep_len=1000, start_steps=10000, replay_size=int(1e6),
             batch_size=100, n_test_episodes=10, output_dir=None,
             output_fname='progress.txt', exp_name=None):
    self.epoch_len, self.n_epochs = steps_per_epoch, epochs
    self.max_ep_len, self.start_steps = max_ep_len, start_steps
    self.n_test_episodes = n_test_episodes
    self.logger = EpochLogger(output_dir=output_dir,
                              output_fname=output_fname,
                              exp_name=exp_name)
    print('locals')
    for key, val in locals().items():
        print('{}: {}'.format(key, len(str(val))))
    # self.logger.save_config(locals())
    self.env, self.agent = env, agent
    self.buffer = OffPolicyBuffer(buffer_size=replay_size,
                                  epoch_size=steps_per_epoch,
                                  batch_size=batch_size)
    saver_kwargs = agent.build_graph(env.observation_space, env.action_space)
    self.logger.setup_tf_saver(**saver_kwargs)
    var_counts = tuple(tf_utils.trainable_count(scope) for scope in ['pi', 'q'])
    self.logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts)
    np.random.seed(seed)
    tf.set_random_seed(seed)
def worker_test(ps, start_time):
    from spinup.utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
    logger = EpochLogger(**logger_kwargs)
    config = locals()
    del config['ps']
    logger.save_config(config)

    agent = Model(args)
    keys = agent.get_weights()[0]
    weights = ray.get(ps.pull.remote(keys))
    agent.set_weights(keys, weights)

    test_env = gym.make(args.env)

    while True:
        ave_ret = agent.test_agent(test_env, args)
        # print("test Average Ret:", ave_ret, "time:", time.time()-start_time)
        logger.log_tabular('AverageTestEpRet', ave_ret)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)
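# worker_test above pulls weights from a parameter-server handle `ps`. A minimal
# sketch of what such a Ray actor could look like, assuming weights are kept in a
# dict keyed by variable name; the ParameterServer class is hypothetical and only
# mirrors the pull(keys) interface used above.
import ray

@ray.remote
class ParameterServer:
    def __init__(self, keys, values):
        self.weights = dict(zip(keys, values))

    def push(self, keys, values):
        for key, value in zip(keys, values):
            self.weights[key] = value

    def pull(self, keys):
        return [self.weights[key] for key in keys]

# e.g.:
#   ray.init()
#   ps = ParameterServer.remote(keys, values)
#   weights = ray.get(ps.pull.remote(keys))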
def sppo(args, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=200, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) ########### if args.alpha == 'auto': target_entropy = 0.35 log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=tf.log(0.2)) alpha = tf.exp(log_alpha) else: alpha = args.alpha ########### # Main outputs from computation graph mu, pi, logp, logp_pi, v, q, h = actor_critic(alpha, x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, h] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) ###### if args.alpha == 'auto': alpha_loss = tf.reduce_mean( -log_alpha * tf.stop_gradient(-h + target_entropy) ) # tf.clip_by_value(-h + target_entropy, 0.0, 1000.0 ) alpha_optimizer = MpiAdamOptimizer(learning_rate=1e-5) train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) # For PPO # min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # ### Scheme1: SPPO NO.2: add entropy # adv_logp = adv_ph - tf.stop_gradient(alpha) * tf.stop_gradient(logp) # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp) # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv)) # ### Scheme3: SPPO NO.3: add entropy # adv_logp = adv_ph - tf.stop_gradient(alpha) * logp_old_ph # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp) # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv)) ### Scheme2: SPPO NO.2: add entropy min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean( tf.minimum(ratio * adv_ph, min_adv) + tf.stop_gradient(alpha) * h) v_loss = tf.reduce_mean((ret_ph - v)**2) #+(ret_ph - q)**2)/2.0 # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( h) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer( learning_rate=args.pi_lr).minimize(pi_loss + 0.1 * v_loss) # train_v = MpiAdamOptimizer(learning_rate=args.vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = 
sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): if args.alpha == 'auto': sess.run(train_alpha_op, feed_dict=inputs) _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) # for _ in range(train_v_iters): # sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old), Alpha=sess.run(alpha) if args.alpha == 'auto' else alpha) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t, h_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # q_t = sess.run(q, feed_dict={x_ph: o.reshape(1,-1), a_ph: a}) # SPPO NO.1: add entropy # rh = r - args.alpha * logp_t if args.alpha == 'auto': rh = r + sess.run(alpha) * h_t else: rh = r + alpha * h_t # exact entropy # save and log buf.store(o, a, rh, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 # d = False if ep_len == max_ep_len else d terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # # Save model # if (epoch % save_freq == 0) or (epoch == epochs-1): # logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
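# A minimal launcher sketch for the sppo function above. The argparse flags mirror
# the attributes the function reads from `args` (alpha and pi_lr) plus the usual
# spinup experiment knobs; the flag names and defaults are illustrative.
if __name__ == '__main__':
    import argparse
    import gym
    from spinup.utils.mpi_tools import mpi_fork
    from spinup.utils.run_utils import setup_logger_kwargs

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='HalfCheetah-v2')
    parser.add_argument('--hid', type=int, default=64)
    parser.add_argument('--l', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--cpu', type=int, default=4)
    parser.add_argument('--steps', type=int, default=4000)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--exp_name', type=str, default='sppo')
    parser.add_argument('--alpha', default='auto')  # a float, or 'auto' for learned alpha
    parser.add_argument('--pi_lr', type=float, default=3e-4)
    args = parser.parse_args()

    mpi_fork(args.cpu)  # run parallel code with MPI
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
    sppo(args, lambda: gym.make(args.env),
         ac_kwargs=dict(hidden_sizes=[args.hid] * args.l),
         gamma=args.gamma, seed=args.seed, steps_per_epoch=args.steps,
         epochs=args.epochs, logger_kwargs=logger_kwargs)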
def egl(env_fn, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=256, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, eps=0.4, n_explore=32, device='cuda', architecture='mlp', sample='on_policy'): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ if architecture == 'mlp': actor_critic = core.MLPActorCritic elif architecture == 'spline': actor_critic = core.SplineActorCritic else: raise NotImplementedError device = torch.device(device) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2, ac.geps]) logger.log( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t geps: %d\n' % var_counts) n_samples = 100 cmin = 0.25 cmax = 1.75 greed = 0.01 rand = 0.01 def max_reroute(o): b, _ = o.shape o = repeat_and_reshape(o, n_samples) with torch.no_grad(): ai, _ = ac.pi(o) q1 = ac.q1(o, ai) q2 = ac.q2(o, ai) qi = torch.min(q1, q2).unsqueeze(-1) qi = qi.view(n_samples, b, 1) ai = ai.view(n_samples, b, act_dim) rank = torch.argsort(torch.argsort(qi, dim=0, descending=True), dim=0, descending=False) w = cmin * torch.ones_like(ai) m = int((1 - cmin) * n_samples / (cmax - cmin)) w += (cmax - cmin) * (rank < m).float() w += ((1 - cmin) * n_samples - m * (cmax - cmin)) * (rank == m).float() w -= greed w += greed * n_samples * (rank == 0).float() w = w * (1 - rand) + rand w = w / w.sum(dim=0, keepdim=True) prob = torch.distributions.Categorical(probs=w.permute(1, 2, 0)) a = torch.gather(ai.permute(1, 2, 0), 2, prob.sample().unsqueeze(2)).squeeze(2) return a, (ai, w.mean(-1)) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # # Set up function for computing EGL mean-gradient-losses # def compute_loss_g(data): # # o, a1, r, o_tag, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] # # a2 = ball_explore(a1, 
n_explore, eps) # # a2 = a2.view(n_explore * len(r), act_dim) # o_expand = repeat_and_reshape(o, n_explore) # # # Bellman backup for Q functions # with torch.no_grad(): # # q1 = ac.q1(o_expand, a2) # q2 = ac.q2(o_expand, a2) # q_dither = torch.min(q1, q2) # # # Target actions come from *current* policy # a_tag, logp_a_tag = ac.pi(o_tag) # # # Target Q-values # q1_pi_targ = ac_targ.q1(o_tag, a_tag) # q2_pi_targ = ac_targ.q2(o_tag, a_tag) # q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) # q_anchor = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a_tag) # # q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1) # # geps = ac.geps(o, a1) # geps = repeat_and_reshape(geps, n_explore) # a1 = repeat_and_reshape(a1, n_explore) # # geps = (geps * (a2 - a1)).sum(-1) # # l1 loss against Bellman backup # # loss_g = F.smooth_l1_loss(geps, q_dither - q_anchor) # # # Useful info for logging # g_info = dict(GVals=geps.flatten().detach().cpu().numpy()) # # return loss_g, g_info # Set up function for computing EGL mean-gradient-losses def compute_loss_g(data): o, a1, r, o_tag, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] a2 = ball_explore(a1, n_explore, eps) a2 = a2.view(n_explore * len(r), act_dim) o_expand = repeat_and_reshape(o, n_explore) # Bellman backup for Q functions with torch.no_grad(): q1 = ac.q1(o_expand, a2) q2 = ac.q2(o_expand, a2) q_dither = torch.min(q1, q2) # Target actions come from *current* policy # Target Q-values q1 = ac.q1(o, a1) q2 = ac.q2(o, a1) q_anchor = torch.min(q1, q2) q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1) geps = ac.geps(o, a1) geps = repeat_and_reshape(geps, n_explore) a1 = repeat_and_reshape(a1, n_explore) geps = (geps * (a2 - a1)).sum(-1) # l1 loss against Bellman backup loss_g = F.smooth_l1_loss(geps, q_dither - q_anchor) # Useful info for logging g_info = dict(GVals=geps.flatten().detach().cpu().numpy()) return loss_g, g_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) geps_pi = ac.geps(o, pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - (geps_pi * pi).sum(-1)).mean() beta = autograd.Variable(pi.detach().clone(), requires_grad=True) q1_pi = ac.q1(o, beta) q2_pi = ac.q2(o, beta) qa = torch.min(q1_pi, q2_pi).unsqueeze(-1) grad_q = autograd.grad(outputs=qa, inputs=beta, grad_outputs=torch.cuda.FloatTensor( qa.size()).fill_(1.), create_graph=False, retain_graph=False, only_inputs=True)[0] # Useful info for logging pi_info = dict( LogPi=logp_pi.detach().cpu().numpy(), GradGAmp=torch.norm(geps_pi, dim=-1).detach().cpu().numpy(), GradQAmp=torch.norm(grad_q, dim=-1).detach().cpu().numpy(), GradDelta=torch.norm(geps_pi - grad_q, dim=-1).detach().cpu().numpy(), GradSim=F.cosine_similarity(geps_pi, grad_q, dim=-1).detach().cpu().numpy(), ) return loss_pi, pi_info if architecture == 'mlp': # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) g_optimizer = Adam(ac.geps.parameters(), lr=lr) elif architecture == 'spline': # Set up optimizers for policy and q-function pi_optimizer = SparseDenseAdamOptimizer(ac.pi, dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) q_optimizer = SparseDenseAdamOptimizer([ac.q1, ac.q2], dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) g_optimizer = SparseDenseAdamOptimizer(ac.geps, dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) else: raise NotImplementedError # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # 
First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Next run one gradient descent step for the mean-gradient g_optimizer.zero_grad() loss_g, g_info = compute_loss_g(data) loss_g.backward() g_optimizer.step() # Record things logger.store(LossG=loss_g.item(), **g_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in ac.geps.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in ac.geps.parameters(): p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action_on_policy(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device), deterministic) def get_action_rbi(o, deterministic=False): o = torch.as_tensor(o, dtype=torch.float32, device=device) if deterministic: a = ac.act(o, deterministic) else: o = o.unsqueeze(0) a, _ = max_reroute(o) a = a.flatten().cpu().numpy() return a if sample == 'on_policy': get_action = get_action_on_policy elif sample == 'rbi': get_action = get_action_rbi else: raise NotImplementedError def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in tqdm(range(total_steps)): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('GVals', with_min_and_max=True) logger.log_tabular('LossG', with_min_and_max=True) logger.log_tabular('GradGAmp', with_min_and_max=True) logger.log_tabular('GradQAmp', with_min_and_max=True) logger.log_tabular('GradDelta', with_min_and_max=True) logger.log_tabular('GradSim', with_min_and_max=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
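# ball_explore and repeat_and_reshape are helpers imported by the egl function
# above but not shown in these snippets. A minimal sketch of plausible
# implementations, assuming ball_explore perturbs each anchor action uniformly
# inside an eps-ball (returning an (n_explore, batch, act_dim) tensor, which the
# loss above flattens) and that actions are normalized to [-1, 1];
# repeat_and_reshape tiles a batch n times along a new leading dimension and then
# folds it back into the batch axis. Both are assumptions, not the original code.
import torch

def repeat_and_reshape(x, n):
    # (batch, ...) -> (n * batch, ...)
    return x.unsqueeze(0).expand(n, *x.shape).reshape(n * x.shape[0], *x.shape[1:])

def ball_explore(a, n_explore, eps):
    # (batch, act_dim) -> (n_explore, batch, act_dim), perturbed within an eps-ball
    noise = torch.empty(n_explore, *a.shape, device=a.device).uniform_(-eps, eps)
    return (a.unsqueeze(0) + noise).clamp(-1., 1.)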
def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) #=========================================================================# # # # All of your code goes in the space below. # # # #=========================================================================# # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target policy network with tf.variable_scope('target'): pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Target Q networks with tf.variable_scope('target', reuse=True): # Target policy smoothing, by adding clipped noise to target actions pi_noise_targ = get_pi_noise_clipped(pi, noise_scale=target_noise, noise_clip=noise_clip, act_limit=act_limit) # Target Q-values, using action from smoothed target policy _, q1_targ, q2_targ, _ = actor_critic(x2_ph, pi_noise_targ, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Bellman backup for Q functions, using Clipped Double-Q targets q_targ = get_q_target(q1_targ, q2_targ, r_ph, d=d_ph, gamma=0.99) # TD3 losses pi_loss = -tf.reduce_mean(q1_pi) q1_loss = tf.losses.mean_squared_error(q_targ, q1) q2_loss = tf.losses.mean_squared_error(q_targ, q2) q_loss = q1_loss + q2_loss #=========================================================================# # # # All of your code goes in the space above. 
# # # #=========================================================================# # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)}) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, train_q_op] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
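# get_pi_noise_clipped and get_q_target are helpers used by the td3 function above
# but not shown in these snippets. A minimal sketch of plausible TF1
# implementations, assuming they perform the standard TD3 target-policy smoothing
# and clipped double-Q backup; these are assumptions, not the original helpers.
import tensorflow as tf

def get_pi_noise_clipped(pi, noise_scale, noise_clip, act_limit):
    # Add clipped Gaussian noise to target actions, then clip to action bounds.
    epsilon = tf.random_normal(tf.shape(pi), stddev=noise_scale)
    epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
    return tf.clip_by_value(pi + epsilon, -act_limit, act_limit)

def get_q_target(q1_targ, q2_targ, r, d, gamma=0.99):
    # Clipped double-Q Bellman backup; stop_gradient keeps targets fixed.
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    return tf.stop_gradient(r + gamma * (1 - d) * min_q_targ)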
def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Vanilla Policy Gradient (with GAE-Lambda for advantage estimation) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape # obs_dim = env.observation_space.n act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing VPG policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs, act) loss_pi = -(logp * adv).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() pi_info = dict(kl=approx_kl, ent=ent) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() # Get loss and info values before update pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with a single step of gradient descent pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) bayes_kl_loss = 0. if isinstance(ac.v, BayesMLPCritic): bayes_kl_loss = ac.v.compute_kl() total_loss_v = loss_v + bayes_kl_loss / data['obs'].shape[0] total_loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent = pi_info['kl'], pi_info_old['ent'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old), BayesKL=bayes_kl_loss) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 epoch_reward = [] # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) 
o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t==local_steps_per_epoch-1 if terminal or epoch_ended: if epoch_ended and not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished epoch_reward.append(ep_ret) logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Perform VPG update! update() if epoch % 10 == 0: # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('BayesKL', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() return epoch_reward
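# A minimal sketch showing one way to sweep the vpg variant above over several
# seeds with spinup's ExperimentGrid; the experiment name and grid values are
# illustrative.
from spinup.utils.run_utils import ExperimentGrid

if __name__ == '__main__':
    eg = ExperimentGrid(name='vpg-bayes')
    eg.add('env_name', 'CartPole-v0', '', True)
    eg.add('seed', [10 * i for i in range(3)])
    eg.add('epochs', 50)
    eg.add('steps_per_epoch', 4000)
    eg.run(vpg, num_cpu=1)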
def cvi_ad(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, alp = 0.8, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, decay = None, squash = False): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) adv_ph = tf.placeholder(dtype = tf.float32, shape = (None,)) alp_ph = tf.placeholder(dtype = tf.float32) t_step = tf.placeholder(dtype = tf.float32) #adv_ph1 = tf.placeholder(dtype = tf.float32, shape = (None,)) #adv_ph2 = tf.placeholder(dtype = tf.float32, shape = (None,)) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, ad1, ad2, ad1_pi, ad2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, ad1_targ, ad2_targ, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) squash_eps = 1e-2 if squash: print("Squashed") squash_func = lambda x: tf.sign(x) * (tf.sqrt(tf.abs(x) + 1) - 1) + x * squash_eps squash_ifunc = lambda x: tf.sign(x) * ((tf.sqrt(1 + 4 * squash_eps * (tf.abs(x) + 1 + squash_eps)) - 1)** 2 * (1 / (2 * squash_eps))** 2 - 1) else: print ("Not Squashed") squash_func = lambda x: x squash_ifunc = lambda x: x # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) q1 = v + ad1 q2 = v + ad2 q1_pi = v + ad1_pi q2_pi = v + ad2_pi # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(squash_func(r_ph + gamma*(1-d_ph)*squash_ifunc(v_targ) + alp_ph * adv_ph)) #q_backup1 = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ + alp * adv_ph1) #q_backup2 = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ + alp * adv_ph2) v_backup = tf.stop_gradient(squash_func(squash_ifunc(min_q_pi) - alpha * logp_pi)) # Soft actor-critic losses #alp = tf.Variable(0.2,dtype=tf.float32) #q_min = tf.minimum(q1,q2) pi_loss = tf.reduce_mean(alpha * logp_pi - squash_ifunc(min_q_pi)) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main') , get_vars('target'))]) # target_update = tf.group([tf.assign(v_targ, tf.cond(tf.not_equal(t_step%1000,0), lambda: v_targ, lambda: v_main)) # for v_main, v_targ in zip(get_vars('main') , get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 
train_pi_op, train_value_op, target_update] # adv_op = squash_ifunc(tf.minimum(q1_targ, q2_targ))-squash_ifunc(v_targ) adv_op = squash_ifunc(tf.minimum(ad1_targ, ad2_targ)) #adv_op1 = q1_targ-v_targ #adv_op2 = q2_targ-v_targ # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0] def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs if decay: alp_val = 0.2 else: alp_val = alp update_step = 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): update_step+=1 batch = replay_buffer.sample_batch(batch_size) feed_dict = {x2_ph: batch['obs1'], a_ph: batch['acts'] } advantage = sess.run(adv_op , feed_dict) #advantage = sess.run([adv_op1, adv_op2] , feed_dict) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], t_step: update_step, adv_ph : advantage, alp_ph : alp_val #adv_ph1 : advantage[0], #adv_ph2 : advantage[1] } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch if decay: alp_val = eval(decay)(t//steps_per_epoch) # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
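# A small numpy sketch of the invertible "squash" transform applied to the Bellman
# targets in cvi_ad above (same formulas as squash_func / squash_ifunc, restated
# outside TensorFlow so the round-trip property is easy to check; test values are arbitrary).
import numpy as np

def squash(x, eps=1e-2):
    return np.sign(x) * (np.sqrt(np.abs(x) + 1) - 1) + x * eps

def unsquash(y, eps=1e-2):
    return np.sign(y) * ((np.sqrt(1 + 4 * eps * (np.abs(y) + 1 + eps)) - 1) ** 2
                         * (1 / (2 * eps)) ** 2 - 1)

x = np.array([-50.0, -1.0, 0.0, 1.0, 50.0])
assert np.allclose(unsquash(squash(x)), x)  # the transform is exactly invertible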
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 maxRev = float("-inf") #negative infinity in the beginning #maxRevActionSeq=[] maxRevTSTT = 0 maxRevRevenue = 0 maxRevThroughput = 0 maxRevJAH = 0 maxRevRemVeh = 0 maxRevJAH2 = 0 maxRevRMSE_MLvio = 0 maxRevPerTimeVio = 0 maxRevHOTDensity = pd.DataFrame() maxRevGPDensity = pd.DataFrame() maxtdJAHMax = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) #we need to scale the sampled values of action from (-1,1) to our choices of toll coz they were sampled from tanh activation mu numpyFromA = np.array(a[0]) numpyFromA = ((numpyFromA + 1.0) * (env.state.tollMax - env.state.tollMin) / 2.0) + env.state.tollMin a[0] = np.ndarray.tolist(numpyFromA) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) #get other stats and store them too otherStats = env.getAllOtherStats() if np.any(np.isnan(np.array(otherStats))): sys.exit("Nan found in statistics! Error") logger.store(EpTSTT=otherStats[0], EpRevenue=otherStats[1], EpThroughput=otherStats[2], EpJAH=otherStats[3], EpRemVeh=otherStats[4], EpJAH2=otherStats[5], EpMLViolRMSE=otherStats[6], EpPerTimeVio=otherStats[7], EptdJAHMax=otherStats[8]) #determine max rev profile if ep_ret > maxRev: maxRev = ep_ret maxRevActionSeq = env.state.tollProfile maxRevTSTT = otherStats[0] maxRevRevenue = otherStats[1] maxRevThroughput = otherStats[2] maxRevJAH = otherStats[3] maxRevRemVeh = otherStats[4] maxRevJAH2 = otherStats[5] maxRevRMSE_MLvio = otherStats[6] maxRevPerTimeVio = otherStats[7] maxRevHOTDensity = env.getHOTDensityData() maxRevGPDensity = env.getGPDensityData() maxtdJAHMax = otherStats[8] o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpTSTT', average_only=True) logger.log_tabular('EpRevenue', average_only=True) logger.log_tabular('EpThroughput', average_only=True) logger.log_tabular('EpJAH', average_only=True) logger.log_tabular('EpRemVeh', average_only=True) logger.log_tabular('EpJAH2', average_only=True) logger.log_tabular('EpMLViolRMSE', average_only=True) logger.log_tabular('EpPerTimeVio', average_only=True) logger.log_tabular('EptdJAHMax', average_only=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() print("Max cumulative reward obtained= %f " % maxRev) print( "Corresponding revenue($)= %f, TSTT(hrs)= %f, Throughput(veh)=%f, JAHstat= %f, remaining vehicles= %f, JAHstat2=%f, RMSEML_vio=%f, percentTimeViolated(%%)=%f, tdJAHMax= %f" % (maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax)) outputVector = [ maxRev, maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax ] #print("\n===Max rev action sequence is\n",maxRevActionSeq) exportTollProfile(maxRevActionSeq, logger_kwargs, outputVector) exportDensityData(maxRevHOTDensity, maxRevGPDensity, logger_kwargs)
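# Sketch of the action rescaling used in the rollout loop above: the policy's tanh
# outputs in (-1, 1) are mapped affinely onto the toll range [tollMin, tollMax].
# The numeric bounds below are made up for illustration; the real ones live in env.state.
import numpy as np

def scale_to_toll(a, toll_min=0.5, toll_max=8.0):
    a = np.asarray(a, dtype=np.float64)
    return (a + 1.0) * (toll_max - toll_min) / 2.0 + toll_min

print(scale_to_toll([-1.0, 0.0, 1.0]))  # -> [0.5, 4.25, 8.0]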
def td3(env_fn: Callable, actor_critic: torch.nn.Module = core.MLPActorCritic, ac_kwargs: Dict = None, seed: int = 0, steps_per_epoch: int = 4000, epochs: int = 2000, replay_size: int = int(1e6), gamma: float = 0.99, polyak: float = 0.995, pi_lr: Union[Callable, float] = 1e-3, q_lr: Union[Callable, float] = 1e-3, batch_size: int = 100, start_steps: int = 10000, update_after: int = 1000, update_every: int = 100, act_noise: Union[Callable, float] = 0.1, target_noise: float = 0.2, noise_clip: float = 0.5, policy_delay: int = 2, num_test_episodes: int = 3, max_ep_len: int = 1000, logger_kwargs: Dict = None, save_freq: int = 1, random_exploration: Union[Callable, float] = 0.0, save_checkpoint_path: str = None, load_checkpoint_path: str = None, load_model_file: str = None): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float or callable): Learning rate for policy. q_lr (float or callable): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float or callable): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) 
target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. random_exploration (float or callable): Probability to randomly select an action instead of selecting from policy. save_checkpoint_path (str): Path to save the model. If not set, no model will be saved load_checkpoint_path (str): Path to load the model. Cannot be set if save_model_path is set. """ if logger_kwargs is None: logger_kwargs = dict() if ac_kwargs is None: ac_kwargs = dict() if save_checkpoint_path is not None: assert load_checkpoint_path is None, "load_model_path cannot be set when save_model_path is already set" if not os.path.exists(save_checkpoint_path): print(f"Folder {save_checkpoint_path} does not exist, creating...") os.makedirs(save_checkpoint_path) if load_checkpoint_path is not None: assert load_model_file is None, "load_checkpoint_path cannot be set when load_model_file is already set" # ------------ Initialisation begin ------------ loaded_state_dict = None if load_checkpoint_path is not None: logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) loaded_state_dict = load_latest_state_dict(load_checkpoint_path) logger.epoch_dict = loaded_state_dict['logger_epoch_dict'] q_learning_rate_fn = loaded_state_dict['q_learning_rate_fn'] pi_learning_rate_fn = loaded_state_dict['pi_learning_rate_fn'] epsilon_fn = loaded_state_dict['epsilon_fn'] act_noise_fn = loaded_state_dict['act_noise_fn'] replay_buffer = loaded_state_dict['replay_buffer'] env, test_env = loaded_state_dict['env'], loaded_state_dict['test_env'] ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) ac.load_state_dict(loaded_state_dict['ac']) ac_targ.load_state_dict(loaded_state_dict['ac_targ']) obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] env.action_space.np_random.set_state( loaded_state_dict['action_space_state']) # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) t_ori = loaded_state_dict['t'] pi_optimizer = Adam(ac.pi.parameters(), lr=pi_learning_rate_fn(t_ori)) pi_optimizer.load_state_dict(loaded_state_dict['pi_optimizer']) q_optimizer = Adam(q_params, lr=q_learning_rate_fn(t_ori)) q_optimizer.load_state_dict(loaded_state_dict['q_optimizer']) np.random.set_state(loaded_state_dict['np_rng_state']) torch.set_rng_state(loaded_state_dict['torch_rng_state']) else: logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) q_learning_rate_fn = get_schedule_fn(q_lr) pi_learning_rate_fn = get_schedule_fn(pi_lr) act_noise_fn = get_schedule_fn(act_noise) epsilon_fn = get_schedule_fn(random_exploration) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] env.action_space.seed(seed) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Create 
actor-critic module and target networks if load_model_file is not None: assert os.path.exists( load_model_file ), f"Model file path does not exist: {load_model_file}" ac = torch.load(load_model_file) else: ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_learning_rate_fn(0)) q_optimizer = Adam(q_params, lr=q_learning_rate_fn(0)) t_ori = 0 act_limit = 1.0 # ------------ Initialisation end ------------ # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) torch.set_printoptions(profile="default") # Set up function for computing TD3 Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): pi_targ = ac_targ.pi(o2) # Target policy smoothing epsilon = torch.randn_like(pi_targ) * target_noise epsilon = torch.clamp(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = torch.clamp(a2, -act_limit, act_limit) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging loss_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, loss_info # Set up function for computing TD3 pi loss def compute_loss_pi(data): o = data['obs'] q1_pi = ac.q1(o, ac.pi(o)) return -q1_pi.mean() # Set up model saving logger.setup_pytorch_saver(ac) def update(data, timer): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **loss_info) # Possibly update pi and target networks if timer % policy_delay == 0: # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item()) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for _ in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) scaled_action = get_action(o, 0) o, r, d, _ = test_env.step( unscale_action(env.action_space, scaled_action)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() if loaded_state_dict is not None: o = loaded_state_dict['o'] ep_ret = loaded_state_dict['ep_ret'] ep_len = loaded_state_dict['ep_len'] else: o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): t += t_ori # printMemUsage(f"start of step {t}") # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps and np.random.rand() > epsilon_fn(t): a = get_action(o, act_noise_fn(t)) unscaled_action = unscale_action(env.action_space, a) else: unscaled_action = env.action_space.sample() a = scale_action(env.action_space, unscaled_action) # Step the env o2, r, d, _ = env.step(unscaled_action) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, timer=j) # End of epoch handling if (t + 1) % steps_per_epoch == 0: # Perform LR decay update_learning_rate(q_optimizer, q_learning_rate_fn(t)) update_learning_rate(pi_optimizer, pi_learning_rate_fn(t)) epoch = (t + 1) // steps_per_epoch # Test the performance of the deterministic version of the agent. 
test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Save model and checkpoint save_checkpoint = False checkpoint_path = "" if save_checkpoint_path is not None: save_checkpoint = True checkpoint_path = save_checkpoint_path if load_checkpoint_path is not None: save_checkpoint = True checkpoint_path = load_checkpoint_path if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({}, None) if save_checkpoint: checkpoint_file = os.path.join(checkpoint_path, f'save_{epoch}.pt') torch.save( { 'ac': ac.state_dict(), 'ac_targ': ac_targ.state_dict(), 'replay_buffer': replay_buffer, 'pi_optimizer': pi_optimizer.state_dict(), 'q_optimizer': q_optimizer.state_dict(), 'logger_epoch_dict': logger.epoch_dict, 'q_learning_rate_fn': q_learning_rate_fn, 'pi_learning_rate_fn': pi_learning_rate_fn, 'epsilon_fn': epsilon_fn, 'act_noise_fn': act_noise_fn, 'torch_rng_state': torch.get_rng_state(), 'np_rng_state': np.random.get_state(), 'action_space_state': env.action_space.np_random.get_state(), 'env': env, 'test_env': test_env, 'ep_ret': ep_ret, 'ep_len': ep_len, 'o': o, 't': t + 1 }, checkpoint_file) delete_old_files(checkpoint_path, 10)
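# A toy torch sketch of TD3's target-policy smoothing as used in compute_loss_q above:
# clipped Gaussian noise is added to the target action, and the result is clamped to
# the action limit. The tensor values and act_limit=1.0 here are illustrative.
import torch

target_noise, noise_clip, act_limit = 0.2, 0.5, 1.0
pi_targ = torch.tensor([0.9, -0.7, 0.0])            # target policy's action
epsilon = torch.clamp(torch.randn_like(pi_targ) * target_noise, -noise_clip, noise_clip)
a2 = torch.clamp(pi_targ + epsilon, -act_limit, act_limit)
print(a2)  # smoothed action used to form the Bellman backup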
def vpg(env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A reference to ActorCritic class which after instantiation takes an input ``x``, and action, ``a``, and returns: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a`` | in states ``x``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x``. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # https://pytorch.org/docs/master/notes/randomness.html#cudnn torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Actor Critic model instance actor_critic = actor_critic(obs_dim, **ac_kwargs) actor_critic.to(device) # load to cpu/gpu # Count variables var_counts = tuple(core.count_vars(model) for model in [actor_critic.policy, actor_critic.value]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # Optimizers train_pi = optim.Adam(actor_critic.policy.parameters(), lr=pi_lr) train_v = optim.Adam(actor_critic.value.parameters(), lr=vf_lr) # Sync params across processes # sync_all_params() # TODO figure out the way to do use MPI for pytorch def update(): actor_critic.train() obs, act, adv, ret, logp_old = map(lambda x: Tensor(x).to(device), buf.get()) _ , logp, _, val = actor_critic(obs, act) ent = (-logp).mean() # VPG objectives pi_loss = -(logp * adv).mean() v_l_old = ((ret - val)**2).mean() # Policy gradient step train_pi.zero_grad() pi_loss.backward() train_pi.step() # Value function learning for _ in range(train_v_iters): val = actor_critic.value(obs) v_loss = (ret - val).pow(2).mean() train_v.zero_grad() v_loss.backward() train_v.step() actor_critic.eval() # Log changes from update _, logp, _, val = actor_critic(obs, act) pi_l_new = -(logp * adv).mean() v_l_new = ((ret - val)**2).mean() kl = (logp_old - logp).mean() logger.store(LossPi=pi_loss, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_loss), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, logp_t, logp_pi_t, v_t = actor_critic(Tensor(o.reshape(1,-1)).to(device)) # save and log buf.store(o, a.cpu().numpy(), r, v_t.item(), logp_pi_t.cpu().detach().numpy()) logger.store(VVals=v_t) o, r, d, _ = env.step(a.cpu().numpy()) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t==local_steps_per_epoch-1): if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else actor_critic(Tensor(o.reshape(1,-1)).to(device))[-1].item() buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, actor_critic, None) # Perform VPG update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
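# Tiny torch sketch of the vanilla policy-gradient objective optimized in update()
# above, pi_loss = -(logp * adv).mean(); the log-probs and advantages are toy numbers.
import torch

logp = torch.tensor([-0.9, -1.6, -0.4], requires_grad=True)
adv = torch.tensor([0.5, -0.2, 1.3])
pi_loss = -(logp * adv).mean()
pi_loss.backward()
print(pi_loss.item(), logp.grad)  # gradient w.r.t. each logp entry is -adv / N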
def bc_ue_ptb_learn(env_set="Hopper-v2", seed=0, buffer_type="FinalSigma0.5", buffer_seed=0,
                    buffer_size='1000K', cut_buffer_size='1000K', mcue_seed=1, qloss_k=10000,
                    qgt_seed=0, qlearn_type='learn_all_data', border=0.75, clip=0.85,
                    update_type='e', eval_freq=float(1e3), max_timesteps=float(1e6),
                    lr=1e-3, lag_lr=1e-3, search_lr=3e-2, wd=0, epsilon_base=1,
                    logger_kwargs=dict()):
    """Behavioral cloning on an upper-envelope-selected buffer with perturbation-based updates.

    Interpretation of |max_timesteps| and |eval_freq|: for BC_ue_border_perturb_c and
    BC_ue_border_perturb_5, TotalSteps counts minibatch updates (default batch size 100);
    for BC_ue_border_perturb_e, TotalSteps counts passes over the buffer, i.e. one step is
    one optimization step on each datapoint in the buffer."""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("running on device:", device)

    # set up logger
    global logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    file_name = "BCue_per_e_%s_%s" % (env_set, seed)
    buffer_name = "%s_%s_%s_%s" % (buffer_type, env_set, buffer_seed, buffer_size)
    setting_name = "%s_r%s_g%s" % (buffer_name, 1000, 0.99)
    print("---------------------------------------")
    print("Settings: " + setting_name)
    print("---------------------------------------")

    if not os.path.exists("./results"):
        os.makedirs("./results")

    env = gym.make(env_set)
    test_env = gym.make(env_set)

    # Set seeds
    env.seed(seed)
    test_env.seed(seed)
    env.action_space.np_random.seed(seed)
    test_env.action_space.np_random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    action_range = float(env.action_space.high[0]) - float(env.action_space.low[0])
    print('env', env_set, 'action range:', action_range)

    # Print out config used in MC Upper Envelope training
    rollout_list = [None, 1000, 200, 100, 10]
    k_list = [10000, 1000, 100]
    print('testing MClength:', rollout_list[mcue_seed % 10])
    print('Training loss ratio k:', k_list[mcue_seed // 10])

    selection_info = 'ue_border%s' % border
    selection_info += '_clip%s' % clip if clip is not None else ''
    print('selection_info:', selection_info)

    # Load the ue border selected buffer
    selected_buffer = utils.SARSAReplayBuffer()
    if buffer_size != cut_buffer_size:
        buffer_name = buffer_name + '_cutfinal' + cut_buffer_size
    selected_buffer.load(selection_info + '_' + buffer_name)
    buffer_length = selected_buffer.get_length()
    print(buffer_length)
    print('buffer setting:', selection_info + '_' + buffer_name)

    # Load the Q net trained with regression on Gts,
    # and load the corresponding Gts for the selected buffer
    selected_gts = np.load('./results/sele%s_ueMC_%s_Gt.npy' % (selection_info, setting_name),
                           allow_pickle=True)
    if qlearn_type == 'learn_all_data':
        verbose_qnet = 'alldata_qgts%s' % qgt_seed + 'lok=%s' % qloss_k
    elif qlearn_type == 'learn_border_data':
        verbose_qnet = 'uebor%s_qgts%s' % (border, qgt_seed) if clip is None \
            else 'uebor%s_clip%s_qgts%s' % (border, clip, qgt_seed)
        verbose_qnet += 'lok=%s' % qloss_k
    else:
        raise ValueError
    print('verbose_qnet:', verbose_qnet)

    Q_from_gt = QNet(state_dim, action_dim, activation='relu')
    Q_from_gt.load_state_dict(
        torch.load('%s/%s_Qgt.pth' % ("./pytorch_models", setting_name + '_' + verbose_qnet)))
    print('load Qnet from',
          '%s/%s_Qgt.pth' % ("./pytorch_models", setting_name + '_' + verbose_qnet))

    # choose the epsilon plan for the constraints
    if update_type == 'c':
        epsilon = epsilon_plan(epsilon_base, action_range, selected_buffer, selected_gts,
                               Q_from_gt, device, plan='common')
else: epsilon = torch.FloatTensor([epsilon_base]) print('one epsilon:', epsilon) print('policy train starts --') '''Initialize policy of the update type''' print("Updating approach: BC_ue_border_perturb_%s" % update_type) if update_type == "c": policy = BC_ue_border_perturb_c.BC_ue_perturb(state_dim, action_dim, max_action,\ lr=lr, lag_lr=lag_lr, wd=wd, num_lambda=buffer_length, Q_from_gt=Q_from_gt ) elif update_type == "5": policy = BC_ue_border_perturb_5.BC_ue_perturb(state_dim, action_dim, max_action, \ lr=lr, lag_lr=lag_lr, wd=wd, Q_from_gt=Q_from_gt) elif update_type == "e": policy = BC_ue_border_perturb_e.BC_ue_perturb(state_dim, action_dim, max_action, \ lr=lr, wd=wd, Q_from_gt=Q_from_gt) policy.train_a_tilda(selected_buffer, max_updates=50, search_lr=search_lr, epsilon=epsilon) episode_num = 0 done = True training_iters, epoch = 0, 0 while training_iters < max_timesteps: epoch += 1 if update_type == 'e': pol_vals = policy.behavioral_cloning(iterations=int(eval_freq), logger=logger) else: # "5" and "c" pol_vals = policy.train(selected_buffer, iterations=int(eval_freq), epsilon=epsilon, logger=logger) avgtest_reward = evaluate_policy(policy, test_env) training_iters += eval_freq logger.log_tabular('Epoch', epoch) logger.log_tabular('AverageTestEpRet', avgtest_reward) logger.log_tabular('TotalSteps', training_iters) if update_type == 'c': logger.log_tabular('BCLoss', average_only=True) logger.log_tabular('ActorLoss', average_only=True) logger.log_tabular('LambdaMax', average_only=True) logger.log_tabular('LambdaMin', average_only=True) logger.log_tabular('ConstraintViolated', with_min_and_max=True) elif update_type == '5': logger.log_tabular('BCLoss', average_only=True) logger.log_tabular('ActorLoss', average_only=True) logger.log_tabular('Lambda', average_only=True) logger.log_tabular('ConstraintViolatedValue', average_only=True) elif update_type == 'e': logger.log_tabular('BCLoss', average_only=True) logger.dump_tabular()
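# Hedged usage sketch for bc_ue_ptb_learn: the call below mirrors the function's own
# defaults on Hopper-v2; the logger output directory is an assumption, and the
# selected-buffer / Q-network files it loads must already exist under ./results and
# ./pytorch_models for the run to work.
if __name__ == '__main__':
    bc_ue_ptb_learn(env_set='Hopper-v2', seed=0, update_type='e',
                    border=0.75, clip=0.85, lr=1e-3, wd=0,
                    max_timesteps=float(1e6), eval_freq=float(1e3),
                    logger_kwargs=dict(output_dir='./results/bc_ue_hopper'))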
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
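# Stand-alone torch sketch of the PPO-clip objective from compute_loss_pi above;
# the old/new log-probs and advantages are toy values chosen so that both the
# clipped and unclipped branches of torch.min(ratio * adv, clip_adv) are exercised.
import torch

clip_ratio = 0.2
logp_old = torch.tensor([-1.0, -0.5, -2.0])
logp = torch.tensor([-0.6, -0.7, -1.9])
adv = torch.tensor([1.0, -0.5, 2.0])

ratio = torch.exp(logp - logp_old)                       # pi(a|s) / pi_old(a|s)
clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()
print(ratio, loss_pi.item())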
def vpg(env_fn, actor_critic=tabular_actor_critic.TabularVPGActorCritic, n_episodes=100,
        env_kwargs={}, logger_kwargs={}, ac_kwargs={}, n_test_episodes=100,
        gamma=0.99, lam=0.95, bootstrap_n=3):
    """
    Environment has discrete observation and action spaces, both low dimensional, so the
    policy and value functions can be stored in a table.

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.
        actor_critic : The constructor method for an actor critic class
            with an ``act`` method, and attributes ``pi`` and ``v``.
        n_episodes (int): Number of episodes/rollouts of interaction (equivalent to
            number of policy updates) to perform.
        n_test_episodes (int): Number of evaluation episodes to run after each update.
        gamma (float): Discount factor. (Always between 0 and 1.)
        lam (float): Lambda for GAE-lambda advantage estimation (used when
            bootstrap_n is None).
        bootstrap_n (int) : (optional) Number of reward steps to use with a
            bootstrapped approximate value function. If None, use GAE-lambda
            advantage estimation.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    log_wandb = logger_kwargs.get('output_dir', '').startswith('wandb')

    env = env_fn(**env_kwargs)
    test_env = env_fn(**env_kwargs)
    obs_dim = env.observation_space.n
    act_dim = env.action_space.n
    ac = actor_critic(obs_dim, act_dim, **ac_kwargs)

    def test_agent():
        o, test_ep_ret, test_ep_len = test_env.reset(), 0, 0
        episode = 0
        while episode < n_test_episodes:
            a, _ = ac.step(o)
            o2, r, d, _ = test_env.step(a)
            test_ep_ret += r
            test_ep_len += 1
            o = o2
            if d:
                logger.store(TestEpRet=test_ep_ret)
                logger.store(TestEpLen=test_ep_len)
                episode += 1
                o, test_ep_ret, test_ep_len = test_env.reset(), 0, 0

    traj = Trajectory(gamma, lam, bootstrap_n)

    # Run test agent before any training happens
    episode = 0
    test_agent()
    print('Mean test returns from random agent:',
          np.mean(logger.epoch_dict['TestEpRet']), flush=True)
    logger.log_tabular('Epoch', episode)
    logger.log_tabular('TestEpRet', with_min_and_max=True)
    logger.log_tabular('TestEpLen', with_min_and_max=True)
    # Hack logger values for compatibility with main logging header keys
    logger.log_tabular('EpRet', 0)
    logger.log_tabular('EpLen', 0)
    logger.log_tabular('AverageVVals', 0)
    logger.log_tabular('MaxVVals', 0)
    logger.log_tabular('MinVVals', 0)
    logger.log_tabular('StdVVals', 0)
    logger.log_tabular('TotalEnvInteracts', 0)
    if log_wandb:
        wandb.log(logger.log_current_row, step=episode)
    logger.dump_tabular()
    episode += 1

    o, ep_ret, ep_len = env.reset(), 0, 0
    total_env_interacts = 0
    while episode < n_episodes:
        a, v = ac.step(o)
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        total_env_interacts += 1
        traj.store(o, a, r, v)
        logger.store(VVals=v)
        o = o2
        if d:
            traj.finish_path(last_obs=o, last_val=0)
            ac.update(traj)
            test_agent()
            logger.log_tabular('Epoch', episode)
            logger.log_tabular('EpRet', ep_ret)
            logger.log_tabular('EpLen', ep_len)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('TestEpLen', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts', total_env_interacts)
            if log_wandb:
                wandb.log(logger.log_current_row, step=episode)
            logger.dump_tabular()
            traj.reset()
            episode += 1
            o, ep_ret, ep_len = env.reset(), 0, 0

    print('pi', ac.pi, flush=True)
    print('logits_pi', ac.logits_pi, flush=True)
    print('value', ac.V, flush=True)
    if isinstance(ac, tabular_actor_critic.TabularReturnHCA) or isinstance(ac, tabular_actor_critic.TabularStateHCA):
        print('h', ac.h, flush=True)
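# Illustrative numpy sketch of the n-step bootstrapped value target that the
# bootstrap_n option above refers to (the Trajectory class itself is not shown here,
# so this is the textbook formulation rather than its exact code): the return from
# step t uses up to n real rewards and then bootstraps with the stored value estimate.
import numpy as np

def n_step_target(rews, vals, t, n, gamma=0.99):
    # rews[k], vals[k] are the reward and value estimate recorded at step k.
    horizon = min(t + n, len(rews))
    target = sum(gamma ** (k - t) * rews[k] for k in range(t, horizon))
    if horizon < len(rews):                      # bootstrap only if we stopped early
        target += gamma ** (horizon - t) * vals[horizon]
    return target

rews = [1.0, 0.0, 0.0, 1.0]
vals = [0.6, 0.5, 0.4, 0.3]
print(n_step_target(rews, vals, t=0, n=3))  # 1 + 0 + 0 + 0.99**3 * 0.3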