def train(env_id, num_timesteps, seed):
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir())
    set_global_seeds(seed)
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
              timesteps_per_batch=2500, desired_kl=0.002,
              num_timesteps=num_timesteps, animate=False)

        env.close()
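# A minimal sketch of how the train() above might be driven from the command
# line. The flag names and defaults here are assumptions for illustration,
# not part of the original snippet.
import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Reacher-v1')         # assumed default
    parser.add_argument('--seed', type=int, default=0)                   # assumed default
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))   # assumed default
    args = parser.parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

if __name__ == "__main__":
    main()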
def train(env_id, num_timesteps, seed):
    env = Lynx()
    # env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(1)))
    set_global_seeds(seed)
    # env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    with tf.Session(config=tf.ConfigProto()) as sess:
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            if MLP:  # module-level flag selecting the policy class
                policy = MlpPolicy(sess, ob_space=env.observation_space,
                                   ac_space=env.action_space)
            else:
                policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
              timesteps_per_batch=50, desired_kl=0.002,
              num_timesteps=num_timesteps, animate=False)
def train(env_id, num_timesteps, seed, alg, lr, momentum):
    env = make_mujoco_env(env_id, seed)
    if alg == 'sgd':
        from baselines.acktr.acktr_cont import learn
    elif alg == 'mid':
        from baselines.acktr.acktr_cont_midpoint import learn
    elif alg == 'geo':
        from baselines.acktr.acktr_cont_geo import learn
    else:
        raise ValueError('unknown alg: %s' % alg)
    nprocs = 4
    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            intra_op_parallelism_threads=nprocs,
            inter_op_parallelism_threads=nprocs)):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        policy = GaussianMlpPolicy(ob_dim, ac_dim, 'pi')

        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
              timesteps_per_batch=2500, desired_kl=0.002,
              num_timesteps=num_timesteps, animate=False,
              lr=lr, momentum=momentum)

        env.close()
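# Judging by the module names, 'sgd' selects the stock ACKTR update, 'mid' a
# midpoint variant, and 'geo' a geodesic-correction variant. A hypothetical
# single run (environment name and hyperparameters assumed):
train('HalfCheetah-v1', num_timesteps=int(1e6), seed=0,
      alg='geo', lr=0.03, momentum=0.9)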
def run_train_task(vv):
    # Create envs.
    env = vv['env'](log_scale_limit=0.0, max_path_length=vv['path_length'])
    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf, gamma=vv['discount'], lam=0.97,
              timesteps_per_batch=vv['batch_size'], desired_kl=0.002,
              num_timesteps=vv['num_timesteps'],
              max_path_length=vv['path_length'], animate=False)

        env.close()
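# run_train_task() only shows which keys of vv it reads; a hypothetical
# variant dict could look like this (the env class and all numbers are
# assumed placeholders, not from the snippet):
vv = {
    'env': HalfCheetahEnv,   # assumed: an env class accepting these kwargs
    'path_length': 200,
    'discount': 0.99,
    'batch_size': 2500,
    'num_timesteps': int(1e6),
}
run_train_task(vv)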
def train(args, num_timesteps, seed):
    import tensorflow as tf
    from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
    from baselines.acktr.acktr_cont import learn
    from baselines.acktr.policies import GaussianMlpPolicy
    from baselines.acktr.value_functions import NeuralNetValueFunction

    env = common.make_env(args)
    env.reward_scale = 0.01
    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
              timesteps_per_batch=2500, desired_kl=0.002,
              num_timesteps=num_timesteps, animate=False)

        env.close()
def train(env_id, num_timesteps, seed, args):
    # `args` carries the checkpointing/rollout options used below; the
    # original signature omitted it, which would raise a NameError.
    env = gym.make(env_id)
    if logger.get_dir():
        env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
    set_global_seeds(seed)
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    with tf.Session(config=tf.ConfigProto()) as session:
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
              timesteps_per_batch=args.timesteps_per_batch,
              desired_kl=0.002, num_timesteps=num_timesteps,
              save_path=args.save_path, save_after=args.save_after,
              load_path=args.load_path, save_rollouts=args.save_rollouts,
              animate=args.animate)

        env.close()
def train(env, num_timesteps, timesteps_per_batch, seed, num_cpu, resume,
          hid_size, num_hid_layers, logdir, agentName, desired_kl, gamma, lam,
          portnum, num_parallel):
    if num_parallel > 1:
        env = CustomParallelEnv(num_parallel)
    else:
        env = gym.make(env)
        env.seed(seed)  # TODO: add seed to the random env too
    if logger.get_dir():
        env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)
    with tf.Session(config=tf.ConfigProto()) as session:
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim, hid_size=128, num_hid_layers=2)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim, hid_size=128, num_hid_layers=2)

        learn(env, policy=policy, vf=vf, gamma=gamma, lam=0.97,
              timesteps_per_batch=timesteps_per_batch, desired_kl=desired_kl,
              resume=resume, logdir=logdir, agentName=agentName,
              num_timesteps=num_timesteps, animate=True)

        env.close()
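# A hypothetical invocation of the variant above; every value here is an
# assumed placeholder, since the snippet does not show its caller.
train('Humanoid-v1', num_timesteps=int(5e6), timesteps_per_batch=2500,
      seed=0, num_cpu=4, resume=False, hid_size=128, num_hid_layers=2,
      logdir='/tmp/acktr', agentName='acktr_humanoid', desired_kl=0.002,
      gamma=0.99, lam=0.97, portnum=5050, num_parallel=1)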
def train(env_id, num_timesteps, seed):
    env = make_gym_control_env(env_id, seed)
    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
              timesteps_per_batch=2500, desired_kl=0.002,
              num_timesteps=num_timesteps, animate=False)

        env.close()
def train(env_id, num_timesteps, seed, render):
    env = LearningEnvironment(num_particles=PARTICLES, disable_render=not render)
    env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)
    with tf.Session(config=tf.ConfigProto()) as session:
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
              timesteps_per_batch=8000, desired_kl=0.0002,
              num_timesteps=num_timesteps, animate=False)

        env.close()
def train(env_id, num_timesteps, seed): """ train an ACKTR model on atari :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ env = make_mujoco_env(env_id, seed) with tf.Session(config=tf.ConfigProto()): ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] with tf.variable_scope("vf"): value_fn = NeuralNetValueFunction(ob_dim, ac_dim) with tf.variable_scope("pi"): policy = GaussianMlpPolicy(ob_dim, ac_dim) learn(env, policy=policy, value_fn=value_fn, gamma=0.99, lam=0.97, timesteps_per_batch=2500, desired_kl=0.002, num_timesteps=num_timesteps, animate=False) env.close()
def train(env_id, num_timesteps, seed, save, gamma, lam, desired_kl):
    env = make_mujoco_env(env_id, seed)
    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        ret = learn(env, policy=policy, vf=vf, gamma=gamma, lam=lam,
                    desired_kl=desired_kl, timesteps_per_batch=2500,
                    num_timesteps=num_timesteps, animate=False)

        env.close()
        np.savetxt(save, np.array([ret]))
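# The variant above writes learn()'s scalar return to a text file; reading it
# back is a one-liner. The path and hyperparameters below are assumptions.
train('Hopper-v1', int(1e6), seed=0, save='acktr_ret_seed0.txt',
      gamma=0.99, lam=0.97, desired_kl=0.002)
ret = float(np.loadtxt('acktr_ret_seed0.txt'))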
def train(env_id, num_timesteps, seed):
    env = gym.make(env_id)
    rank = MPI.COMM_WORLD.Get_rank()
    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)))
    set_global_seeds(seed)
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
              timesteps_per_batch=2500, desired_kl=0.002,
              num_timesteps=num_timesteps, animate=False)

        env.close()
from baselines.acktr.value_functions import NeuralNetValueFunction
from baselines.common import set_global_seeds

env = gym.make('GazeboModularScara3DOF-v0')
initial_observation = env.reset()
print("Initial observation: ", initial_observation)
env.render()
seed = 0
set_global_seeds(seed)
env.seed(seed)

with tf.Session(config=tf.ConfigProto()) as session:
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    with tf.variable_scope("vf"):
        vf = NeuralNetValueFunction(ob_dim, ac_dim)
    with tf.variable_scope("pi"):
        policy = GaussianMlpPolicy(ob_dim, ac_dim)

    learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
          timesteps_per_batch=2500, desired_kl=0.02, num_timesteps=1e6,
          animate=False, save_model_with_prefix='', restore_model_from_file='')
def train(self, env_fn, num_timesteps, noise_type, layer_norm, folder,
          load_policy, video_width, video_height, plot_rewards,
          save_every=50, seed=1234, episode_length=1000,
          pi_hid_size=150, pi_num_hid_layers=3,
          render_frames=_render_frames, **kwargs):
    num_cpu = self.workers
    if sys.platform == 'darwin':
        num_cpu //= 2

    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=num_cpu,
        inter_op_parallelism_threads=num_cpu)

    if self.gpu_usage is None or self.gpu_usage <= 0.:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    else:
        config.gpu_options.allow_growth = True  # pylint: disable=E1101
        config.gpu_options.per_process_gpu_memory_fraction = self.gpu_usage / self.workers
    tf.Session(config=config).__enter__()

    worker_seed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(worker_seed)
    tf.set_random_seed(worker_seed)
    np.random.seed(worker_seed)

    save_every = max(1, save_every)

    env = env_fn()
    env.seed(worker_seed)

    rank = MPI.COMM_WORLD.Get_rank()
    logger.info('rank {}: seed={}, logdir={}'.format(rank, worker_seed, logger.get_dir()))

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=pi_hid_size, num_hid_layers=pi_num_hid_layers)

    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)),
        allow_early_resets=True)
    gym.logger.setLevel(logging.INFO)

    that = self

    iter_name = 'iters_so_far'
    if self.method == 'sql':
        iter_name = 'epoch'

    # TODO replace with utils.create_callback(...)
    def callback(locals, globals):
        if that.method != "ddpg":
            if load_policy is not None and locals[iter_name] == 0:
                # noinspection PyBroadException
                try:
                    utils.load_state(load_policy)
                    if MPI.COMM_WORLD.Get_rank() == 0:
                        logger.info("Loaded policy network weights from %s." % load_policy)
                except:
                    logger.error("Failed to load policy network weights from %s." % load_policy)
            # save TensorFlow summary (contains at least the graph definition)
            if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] == 0:
                _ = tf.summary.FileWriter(folder, tf.get_default_graph())

        if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] % save_every == 0:
            print('Saving video and checkpoint for policy at iteration %i...' %
                  locals[iter_name])
            ob = env.reset()
            images = []
            rewards = []
            max_reward = 1.  # if any reward > 1, we have to rescale
            lower_part = video_height // 5
            for i in range(episode_length):
                if that.method == "ddpg":
                    ac, _ = locals['agent'].pi(ob, apply_noise=False, compute_Q=False)
                elif that.method == "sql":
                    ac, _ = locals['policy'].get_action(ob)
                elif isinstance(locals['pi'], GaussianMlpPolicy):
                    ac, _, _ = locals['pi'].act(np.concatenate((ob, ob)))
                else:
                    ac, _ = locals['pi'].act(False, ob)
                ob, rew, new, _ = env.step(ac)
                images.append(render_frames(env))
                if plot_rewards:
                    rewards.append(rew)
                    max_reward = max(rew, max_reward)
                if new:
                    break

            orange = np.array([255, 163, 0])
            red = np.array([255, 0, 0])
            video = []
            width_factor = 1. / episode_length * video_width
            for i, imgs in enumerate(images):
                for img in imgs:
                    img[-lower_part, :10] = orange
                    img[-lower_part, -10:] = orange
                    if episode_length < video_width:
                        p_rew_x = 0
                        for j, r in enumerate(rewards[:i]):
                            rew_x = int(j * width_factor)
                            if r < 0:
                                img[-1:, p_rew_x:rew_x] = red
                            else:
                                rew_y = int(r / max_reward * lower_part)
                                img[-rew_y - 1:, p_rew_x:rew_x] = orange
                            p_rew_x = rew_x
                    else:
                        for j, r in enumerate(rewards[:i]):
                            rew_x = int(j * width_factor)
                            if r < 0:
                                img[-1:, rew_x] = red
                            else:
                                rew_y = int(r / max_reward * lower_part)
                                img[-rew_y - 1:, rew_x] = orange
                video.append(np.hstack(imgs))

            imageio.mimsave(
                os.path.join(folder, "videos", "%s_%s_iteration_%i.mp4" %
                             (that.environment, that.method, locals[iter_name])),
                video, fps=60)
            env.reset()

            if that.method != "ddpg":
                utils.save_state(os.path.join(
                    that.folder, "checkpoints",
                    "%s_%i" % (that.environment, locals[iter_name])))

    if self.method == "ppo":
        pposgd_simple.learn(
            env, policy_fn,
            max_timesteps=int(num_timesteps),
            timesteps_per_actorbatch=1024,  # 256
            clip_param=0.2, entcoeff=0.01,
            optim_epochs=4, optim_stepsize=1e-3,  # 1e-3
            optim_batchsize=64, gamma=0.99, lam=0.95,
            schedule='linear',  # 'linear'
            callback=callback)
    elif self.method == "trpo":
        trpo_mpi.learn(
            env, policy_fn,
            max_timesteps=int(num_timesteps),
            timesteps_per_batch=1024,
            max_kl=0.1,  # 0.01
            cg_iters=10, cg_damping=0.1,
            gamma=0.99, lam=0.98,
            vf_iters=5, vf_stepsize=1e-3,
            callback=callback)
    elif self.method == "acktr":
        from algos.acktr import acktr
        with tf.Session(config=tf.ConfigProto()):
            ob_dim = env.observation_space.shape[0]
            ac_dim = env.action_space.shape[0]
            with tf.variable_scope("vf"):
                vf = NeuralNetValueFunction(ob_dim, ac_dim)
            with tf.variable_scope("pi"):
                policy = GaussianMlpPolicy(ob_dim, ac_dim)
            acktr.learn(
                env, pi=policy, vf=vf,
                gamma=0.99, lam=0.97,
                timesteps_per_batch=1024,
                desired_kl=0.01,  # 0.002
                num_timesteps=num_timesteps, animate=False,
                callback=callback)
    elif self.method == "ddpg":
        from algos.ddpg import ddpg
        # Parse noise_type
        action_noise = None
        param_noise = None
        nb_actions = env.action_space.shape[-1]
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                from baselines.ddpg.noise import AdaptiveParamNoiseSpec
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                from baselines.ddpg.noise import NormalActionNoise
                action_noise = NormalActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

        # Configure components.
        memory = Memory(
            limit=int(1e6),
            action_shape=env.action_space.shape,
            observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        ddpg.train(
            env=env, eval_env=None,
            param_noise=param_noise,
            render=False, render_eval=False,
            action_noise=action_noise,
            actor=actor, critic=critic, memory=memory,
            callback=callback, **kwargs)
    elif self.method == "sql":
        from softqlearning.algorithms import SQL
        from softqlearning.misc.kernel import adaptive_isotropic_gaussian_kernel
        from softqlearning.misc.utils import timestamp
        from softqlearning.replay_buffers import SimpleReplayBuffer
        from softqlearning.value_functions import NNQFunction
        from softqlearning.policies import StochasticNNPolicy
        from rllab.envs.gym_env import GymEnv

        env = GymEnv(env)

        variant = {
            'seed': [1, 2, 3],
            'policy_lr': 3E-4,
            'qf_lr': 3E-4,
            'discount': 0.99,
            'layer_size': 128,
            'batch_size': 128,
            'max_pool_size': 1E6,
            'n_train_repeat': 1,
            'epoch_length': 1000,
            'snapshot_mode': 'last',
            'snapshot_gap': 100,
        }

        pool = SimpleReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=variant['max_pool_size'],
        )

        base_kwargs = dict(
            min_pool_size=episode_length,
            epoch_length=episode_length,
            n_epochs=num_timesteps,
            max_path_length=episode_length,
            batch_size=variant['batch_size'],
            n_train_repeat=variant['n_train_repeat'],
            eval_render=False,
            eval_n_episodes=1,
            iter_callback=callback
        )

        qf = NNQFunction(
            env_spec=env.spec,
            hidden_layer_sizes=tuple([pi_hid_size] * pi_num_hid_layers),
        )

        pi_layers = tuple([pi_hid_size] * pi_num_hid_layers)
        policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=pi_layers)

        algorithm = SQL(
            base_kwargs=base_kwargs,
            env=env,
            pool=pool,
            qf=qf,
            policy=policy,
            kernel_fn=adaptive_isotropic_gaussian_kernel,
            kernel_n_particles=32,
            kernel_update_ratio=0.5,
            value_n_particles=16,
            td_target_update_interval=1000,
            qf_lr=variant['qf_lr'],
            policy_lr=variant['policy_lr'],
            discount=variant['discount'],
            reward_scale=1,
            save_full_state=False,
        )
        algorithm.train()
    else:
        print('ERROR: Invalid "method" argument provided.', file=sys.stderr)
    env.close()
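# The DDPG branch above accepts noise specs of the form '<name>_<stddev>',
# comma-separated. A self-contained demo of that format (the stddev values
# are illustrative, not from the snippet):
for spec in ('none', 'normal_0.1', 'ou_0.2', 'adaptive-param_0.2,ou_0.2'):
    for part in spec.split(','):
        name, _, stddev = part.strip().partition('_')
        print(name, float(stddev) if stddev else None)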
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
          animate=False, callback=None, desired_kl=0.002, lr=0.03, momentum=0.9):
    ob_dim, ac_dim = policy.ob_dim, policy.ac_dim
    dbpi = GaussianMlpPolicy(ob_dim, ac_dim, 'dbp')
    oldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'oe')
    dboldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'doi')
    # with tf.variable_scope('dbp'):
    # with tf.variable_scope('oe'):
    # with tf.variable_scope('doi'):

    pi = policy
    do_std = U.function([], [pi.std_1a, pi.logstd_1a])

    kloldnew = oldpi.pd.kl(pi.pd)
    dbkloldnew = dboldpi.pd.kl(dbpi.pd)
    dist = meankl = tf.reduce_mean(kloldnew)
    dbkl = tf.reduce_mean(dbkloldnew)

    obfilter = ZFilter(env.observation_space.shape)
    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(lr)), name='stepsize')
    inputs, loss, loss_sampled = policy.update_info

    var_list = [v for v in tf.global_variables() if "pi" in v.name]
    db_var_list = [v for v in tf.global_variables() if "dbp" in v.name]
    old_var_list = [v for v in tf.global_variables() if "oe" in v.name]
    db_old_var_list = [v for v in tf.global_variables() if "doi" in v.name]
    print(len(var_list), len(db_var_list), len(old_var_list), len(db_old_var_list))

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(old_var_list, var_list)])
    assign_db = U.function(
        [], [],
        updates=[tf.assign(db, o) for (db, o) in zipsame(db_var_list, var_list)] +
                [tf.assign(dbold, dbnew)
                 for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list)])
    assign_old_eq_newr = U.function(
        [], [],
        updates=[tf.assign(newv, oldv)
                 for (oldv, newv) in zipsame(old_var_list, var_list)])
    # assign_dbr = U.function([], [], updates=
    #     [tf.assign(o, db) for (db, o) in zipsame(db_var_list, var_list)] +
    #     [tf.assign(dbnew, dbold) for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list)])

    klgrads = tf.gradients(dist, var_list)
    dbklgrads = tf.gradients(dbkl, db_var_list)
    p_grads = [tf.ones_like(v) for v in dbklgrads]

    get_flat = U.GetFlat(var_list)
    get_old_flat = U.GetFlat(old_var_list)
    set_from_flat = U.SetFromFlat(var_list)

    flat_tangent2 = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan2")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents2 = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents2.append(tf.reshape(flat_tangent2[start:start + sz], shape))
        start += sz

    gvp2 = tf.add_n([tf.reduce_sum(g * tangent2)
                     for (g, tangent2) in zipsame(dbklgrads, tangents2)])
    gvp2_grads = tf.gradients(gvp2, db_var_list)
    neg_term = tf.add_n([tf.reduce_sum(g * tangent2)
                         for (g, tangent2) in zipsame(gvp2_grads, tangents2)]) / 2.
    ng1 = tf.gradients(neg_term, db_var_list)
    ng2 = tf.gradients(neg_term, db_old_var_list)
    neg_term_grads = [a + b for (a, b) in zip(tf.gradients(neg_term, db_var_list),
                                              tf.gradients(neg_term, db_old_var_list))]
    neg_term = neg_term_grads
    # neg_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in neg_term_grads])

    pos_term = tf.add_n([tf.reduce_sum(g * tangent)
                         for (g, tangent) in zipsame(gvp2_grads, p_grads)])
    pos_term_grads = [a + b for (a, b) in zip(tf.gradients(pos_term, db_var_list),
                                              tf.gradients(pos_term, db_old_var_list))]
    pos_term_sum = tf.add_n([tf.reduce_sum(g * tangent)
                             for (g, tangent) in zipsame(pos_term_grads, tangents2)])
    pos_term_grads = tf.gradients(pos_term_sum, p_grads)
    pos_term = pos_term_grads
    # pos_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in pos_term_grads])

    geo_term = [(p - n) * 0.5 for p, n in zip(pos_term, neg_term)]

    optim = kfac.KfacOptimizer(
        learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=momentum,
        kfac_update=2, epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
        weight_decay_dict=policy.wd_dict, max_grad_norm=None)

    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    grads = optim.compute_gradients(loss, var_list=pi_var_list)
    update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)

    geo_term = [g1 + g2[0] for g1, g2 in zip(geo_term, grads)]
    geo_grads = list(zip(geo_term, var_list))
    update_geo_op, q_runner_geo = optim.apply_gradients(geo_grads)

    do_update = U.function(inputs, update_op)
    inputs_tangent = list(inputs) + [flat_tangent2]
    do_update_geo = U.function(inputs_tangent, update_geo_op)
    do_get_geo_term = U.function(inputs_tangent, [ng1, ng2])
    U.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner, q_runner_geo]:
        assert qr is not None
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(), coord=coord, start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env, policy, max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0) and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)

        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        assign_old_eq_new()  # set old parameter values to new parameter values
        assign_db()

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)
        # ft2 = get_flat() - get_old_flat()
        # assign_old_eq_newr()  # assign back
        # gnp = do_get_geo_term(ob_no, action_na, standardized_adv_n, ft2)
        # def check_nan(bs):
        #     return [~np.isnan(b).all() for b in bs]
        # print(gnp[0])
        # print(gnp[1])
        # do_update_geo(ob_no, action_na, standardized_adv_n, ft2)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        # if kl > desired_kl * 2:
        #     logger.log("kl too high")
        #     tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        # elif kl < desired_kl / 2:
        #     logger.log("kl too low")
        #     tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        # else:
        #     logger.log("kl just right!")

        logger.record_tabular("EpRewMean",
                              np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular("EpRewSEM",
                              np.std([path["reward"].sum() / np.sqrt(len(paths))
                                      for path in paths]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        print(do_std())
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
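# The commented-out block above is ACKTR's KL-based stepsize adaptation. A
# minimal standalone sketch of the same rule in plain Python (the helper name
# is ours, not the library's):
def adjust_stepsize(stepsize, kl, desired_kl, min_stepsize=1e-8, max_stepsize=1.0):
    # Shrink the learning rate when the policy moved too far in KL terms;
    # grow it when the update was overly conservative.
    if kl > desired_kl * 2:
        return max(min_stepsize, stepsize / 1.5)
    elif kl < desired_kl / 2:
        return min(max_stepsize, stepsize * 1.5)
    return stepsize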