def __init__(self, *, ac_space, policy_network, value_network=None,
             ent_coef, vf_coef, max_grad_norm, lr):
    super(Model, self).__init__(name='PPO2Model')
    self.train_model = PolicyWithValue(ac_space, policy_network, value_network, estimate_q=False)
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr, epsilon=1e-5)

    self.ent_coef = ent_coef
    self.vf_coef = vf_coef
    self.max_grad_norm = max_grad_norm
    self.step = self.train_model.step
    self.value = self.train_model.value
    self.initial_state = self.train_model.initial_state
    self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
class Model(tf.keras.Model):
    """
    We use this class to:
      __init__:
        - Create the step_model
        - Create the train_model

      train():
        - Run the training step (forward pass and backpropagation of gradients)

      save/load():
        - Save and load the model
    """
    def __init__(self, *, ac_space, policy_network, nupdates,
                 ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
                 alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6)):
        super(Model, self).__init__(name='A2CModel')
        self.train_model = PolicyWithValue(ac_space, policy_network, value_network=None, estimate_q=False)
        lr_schedule = InverseLinearTimeDecay(initial_learning_rate=lr, nupdates=nupdates)
        self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr_schedule, rho=alpha, epsilon=epsilon)

        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state

    @tf.function
    def train(self, obs, states, rewards, masks, actions, values):
        advs = rewards - values
        with tf.GradientTape() as tape:
            policy_latent = self.train_model.policy_network(obs)
            pd, _ = self.train_model.pdtype.pdfromlatent(policy_latent)
            neglogpac = pd.neglogp(actions)
            entropy = tf.reduce_mean(pd.entropy())
            vpred = self.train_model.value(obs)
            vf_loss = tf.reduce_mean(tf.square(vpred - rewards))
            pg_loss = tf.reduce_mean(advs * neglogpac)
            loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef

        var_list = tape.watched_variables()
        grads = tape.gradient(loss, var_list)
        grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
        grads_and_vars = list(zip(grads, var_list))
        self.optimizer.apply_gradients(grads_and_vars)

        return pg_loss, vf_loss, entropy
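# --- Illustrative sketch (not part of the original source) ------------------
# The docstring above says train() runs the forward pass and backpropagation.
# This is a tiny, self-contained NumPy mock-up of the loss terms that
# Model.train computes (policy-gradient, value, entropy) for a categorical
# policy. All names and the toy inputs below are assumptions made for this
# example only; they are not the repo's API.
import numpy as np

def a2c_loss_sketch(logits, actions, rewards, values, vpred):
    # neglogp of the taken actions under a softmax policy
    logits = logits - logits.max(axis=1, keepdims=True)
    probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    neglogpac = -np.log(probs[np.arange(len(actions)), actions])
    advs = rewards - values                          # advantage estimate
    pg_loss = np.mean(advs * neglogpac)              # policy-gradient loss
    vf_loss = np.mean(np.square(vpred - rewards))    # value-function loss
    entropy = -np.mean(np.sum(probs * np.log(probs), axis=1))
    return pg_loss, vf_loss, entropy

# Toy usage: 4 samples, 3 discrete actions
_rng = np.random.default_rng(0)
print(a2c_loss_sketch(_rng.normal(size=(4, 3)), np.array([0, 2, 1, 0]),
                      _rng.normal(size=4), _rng.normal(size=4),
                      _rng.normal(size=4)))
# -----------------------------------------------------------------------------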
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps,
             nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5,
             vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
             kfac_clip=0.001, lrschedule='linear', is_async=True):
    super(Model, self).__init__(name='ACKTRModel')

    nbatch = nenvs * nsteps
    # TODO: Does PolicyWithValue handle this correctly? The original
    # implementation uses 'nbatch' to build separate step/train policies.
    # self.model = step_model = policy(nenvs, 1)
    # self.model2 = train_model = policy(nbatch, nsteps)
    train_model = PolicyWithValue(ac_space, policy, value_network=None, estimate_q=False)

    self.ent_coef = ent_coef
    self.vf_coef = vf_coef
    self.vf_fisher_coef = vf_fisher_coef
    self.kfac_clip = kfac_clip
    self.is_async = is_async
    self.max_grad_norm = max_grad_norm
    self.total_timesteps = total_timesteps

    # TODO: Learning rate schedule and definition of optimizer
    # self.lrschedule = lrschedule
    lrschedule = LinearTimeDecay(initial_learning_rate=lr)  # TODO
    self.optim = kfac.KfacOptimizer(learning_rate=lrschedule, clip_kl=self.kfac_clip,
                                    momentum=0.9, kfac_update=1, epsilon=0.01,
                                    stats_decay=0.99, is_async=self.is_async,
                                    cold_iter=10, max_grad_norm=self.max_grad_norm)

    self.train_model = train_model
    # self.step_model = step_model
    self.step = self.train_model.step
    self.value = self.train_model.value
    self.initial_state = self.train_model.initial_state
class Player(player_base.PlayerBase):
    """An agent loaded from a PPO2 CNN model checkpoint."""

    def __init__(self, checkpoint_path):
        player_base.PlayerBase.__init__(self)
        self._action_set = 'default'
        self._player_prefix = 'player_0'
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self._sess = tf.Session(config=config)
        stacking = 4
        self._stacker = ObservationStacker(stacking)
        with tf.variable_scope(self._player_prefix):
            with tf.variable_scope('ppo2_model'):
                env = DummyEnv(self._action_set, stacking)
                ob_space = env.observation_space
                X = observation_placeholder(ob_space, batch_size=1)
                extra_tensors = {}
                encoded_x = X
                encoded_x = encode_observation(ob_space, encoded_x)
                with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
                    policy_latent = gfootball_impala_cnn_network_fn(encoded_x)
                self._policy = PolicyWithValue(
                    env=env,
                    observations=X,
                    latent=policy_latent,
                    vf_latent=policy_latent,
                    sess=self._sess,
                    estimate_q=False,
                    **extra_tensors)
        _load_variables(checkpoint_path, self._sess, prefix=self._player_prefix + '/')
        saver = tf.train.Saver()
        saver.save(self._sess,
                   "/home/alex/Dropbox/projects/python/kaggle/football/saved_models/"
                   "11_vs_11_easy_stochastic_v2/11_vs_11_easy_stochastic_v2")

    def __del__(self):
        self._sess.close()

    def take_action(self, observation):
        assert len(observation) == 1, 'Multiple player control is not supported'
        observation = observation_preprocessing.generate_smm(observation)
        observation = self._stacker.get(observation)
        action = self._policy.step(observation)[0][0]
        actions = [action]
        # actions = [football_action_set.action_set_dict[self._action_set][action]]
        return actions

    def reset(self):
        self._stacker.reset()
class Model(tf.Module):
    """
    We use this object to:
      __init__:
        - Create the step_model
        - Create the train_model

      train():
        - Run the training step (forward pass and backpropagation of gradients)

      save/load():
        - Save and load the model
    """
    def __init__(self, *, ac_space, policy_network, value_network=None,
                 ent_coef, vf_coef, max_grad_norm):
        super(Model, self).__init__(name='PPO2Model')
        self.train_model = PolicyWithValue(ac_space, policy_network, value_network, estimate_q=False)
        if MPI is not None:
            self.optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.train_model.trainable_variables)
        else:
            self.optimizer = tf.keras.optimizers.Adam()

        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.mode = self.train_model.mode
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'
        ]
        if MPI is not None:
            sync_from_root(self.variables)

    def train(self, lr, cliprange, obs, returns, masks, actions, values,
              neglogpac_old, states=None):
        grads, pg_loss, vf_loss, entropy, approxkl, clipfrac = self.get_grad(
            cliprange, obs, returns, masks, actions, values, neglogpac_old)
        if MPI is not None:
            self.optimizer.apply_gradients(grads, lr)
        else:
            self.optimizer.learning_rate = lr
            grads_and_vars = zip(grads, self.train_model.trainable_variables)
            self.optimizer.apply_gradients(grads_and_vars)
        return pg_loss, vf_loss, entropy, approxkl, clipfrac

    @tf.function
    def get_grad(self, cliprange, obs, returns, masks, actions, values, neglogpac_old):
        # Here we calculate the advantage A(s,a) = R + yV(s') - V(s),
        # where returns = R + yV(s')
        advs = returns - values
        # Normalize the advantages
        advs = (advs - tf.reduce_mean(advs)) / (tf.keras.backend.std(advs) + 1e-8)
        with tf.GradientTape() as tape:
            policy_latent = self.train_model.policy_network(obs)
            pd, _ = self.train_model.pdtype.pdfromlatent(policy_latent)
            neglogpac = pd.neglogp(actions)
            entropy = tf.reduce_mean(pd.entropy())
            vpred = self.train_model.value(obs)
            # Clipped value loss
            vpredclipped = values + tf.clip_by_value(vpred - values, -cliprange, cliprange)
            vf_losses1 = tf.square(vpred - returns)
            vf_losses2 = tf.square(vpredclipped - returns)
            vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
            # Clipped surrogate policy loss
            ratio = tf.exp(neglogpac_old - neglogpac)
            pg_losses1 = -advs * ratio
            pg_losses2 = -advs * tf.clip_by_value(ratio, 1 - cliprange, 1 + cliprange)
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))
            approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - neglogpac_old))
            clipfrac = tf.reduce_mean(tf.cast(tf.greater(tf.abs(ratio - 1.0), cliprange), tf.float32))
            loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef

        var_list = self.train_model.trainable_variables
        grads = tape.gradient(loss, var_list)
        if self.max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
        if MPI is not None:
            grads = tf.concat([tf.reshape(g, (-1,)) for g in grads], axis=0)
        return grads, pg_loss, vf_loss, entropy, approxkl, clipfrac
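# --- Illustrative sketch (not part of the original source) ------------------
# A minimal NumPy version of the clipped PPO objective computed in
# Model.get_grad above: ratio clipping for the policy term and value clipping
# for the critic term. The toy arrays and names below are assumptions made
# for illustration only.
import numpy as np

def ppo_losses_sketch(neglogpac, neglogpac_old, advs, vpred, values, returns, cliprange=0.2):
    advs = (advs - advs.mean()) / (advs.std() + 1e-8)          # normalized advantages
    ratio = np.exp(neglogpac_old - neglogpac)                  # pi_new / pi_old
    pg_loss = np.mean(np.maximum(-advs * ratio,
                                 -advs * np.clip(ratio, 1 - cliprange, 1 + cliprange)))
    vpredclipped = values + np.clip(vpred - values, -cliprange, cliprange)
    vf_loss = .5 * np.mean(np.maximum(np.square(vpred - returns),
                                      np.square(vpredclipped - returns)))
    approxkl = .5 * np.mean(np.square(neglogpac - neglogpac_old))
    clipfrac = np.mean(np.abs(ratio - 1.0) > cliprange)
    return pg_loss, vf_loss, approxkl, clipfrac

# Toy usage on random inputs
_rng = np.random.default_rng(0)
_n = 8
print(ppo_losses_sketch(_rng.normal(size=_n), _rng.normal(size=_n), _rng.normal(size=_n),
                        _rng.normal(size=_n), _rng.normal(size=_n), _rng.normal(size=_n)))
# -----------------------------------------------------------------------------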
def __init__(self, agent, network, nsteps, rho, max_kl, ent_coef,
             vf_stepsize, vf_iters, cg_damping, cg_iters, seed, load_path,
             **network_kwargs):
    super(AgentModel, self).__init__(name='MATRPOModel')
    self.agent = agent
    self.nsteps = nsteps
    self.rho = rho
    self.max_kl = max_kl
    self.ent_coef = ent_coef
    self.cg_damping = cg_damping
    self.cg_iters = cg_iters
    self.vf_stepsize = vf_stepsize
    self.vf_iters = vf_iters

    set_global_seeds(seed)
    np.set_printoptions(precision=3)

    if MPI is not None:
        self.nworkers = MPI.COMM_WORLD.Get_size()
        self.rank = MPI.COMM_WORLD.Get_rank()
    else:
        self.nworkers = 1
        self.rank = 0

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = agent.observation_space
    ac_space = agent.action_space

    if isinstance(network, str):
        network = get_network_builder(network)(**network_kwargs)

    with tf.name_scope(agent.name):
        with tf.name_scope("pi"):
            pi_policy_network = network(ob_space.shape)
            pi_value_network = network(ob_space.shape)
            self.pi = pi = PolicyWithValue(ac_space, pi_policy_network, pi_value_network)
        with tf.name_scope("oldpi"):
            old_pi_policy_network = network(ob_space.shape)
            old_pi_value_network = network(ob_space.shape)
            self.oldpi = oldpi = PolicyWithValue(ac_space, old_pi_policy_network, old_pi_value_network)

    self.comm_matrix = agent.comm_matrix.copy()
    self.estimates = np.ones([agent.nmates, nsteps], dtype=np.float32)
    self.multipliers = np.zeros([self.agent.nmates, self.nsteps]).astype(np.float32)
    for i, comm_i in enumerate(self.comm_matrix):
        self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

    pi_var_list = pi_policy_network.trainable_variables + list(pi.pdtype.trainable_variables)
    old_pi_var_list = old_pi_policy_network.trainable_variables + list(oldpi.pdtype.trainable_variables)
    vf_var_list = pi_value_network.trainable_variables + pi.value_fc.trainable_variables
    old_vf_var_list = old_pi_value_network.trainable_variables + oldpi.value_fc.trainable_variables

    self.pi_var_list = pi_var_list
    self.old_pi_var_list = old_pi_var_list
    self.vf_var_list = vf_var_list
    self.old_vf_var_list = old_vf_var_list

    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=pi)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        ckpt.restore(manager.latest_checkpoint)

    self.vfadam = MpiAdam(vf_var_list)
    self.get_flat = U.GetFlat(pi_var_list)
    self.set_from_flat = U.SetFromFlat(pi_var_list)
    self.loss_names = ["Lagrange", "surrgain", "sync", "meankl", "entloss", "entropy"]
    self.shapes = [var.get_shape().as_list() for var in pi_var_list]
def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None, randomization=True):
    ob_space = env.observation_space
    extra_tensors = {}

    X = observ_placeholder if observ_placeholder is not None else observation_placeholder(
        ob_space, batch_size=None)
    encoded_x = encode_observation(ob_space, X)

    # Randomization
    if randomization:
        encoded_x = tf.layers.conv2d(
            encoded_x / 255., 3, 3,
            padding='same',
            kernel_initializer=tf.initializers.glorot_normal(),
            trainable=False,
            name='randcnn') * 255.
        randcnn_param = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="ppo2_model/randcnn")
        extra_tensors['randcnn_param'] = randcnn_param

    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        policy_latent = policy_network(encoded_x)
        extra_tensors['latent_fts'] = policy_latent
        if isinstance(policy_latent, tuple):
            policy_latent, recurrent_tensors = policy_latent
            if recurrent_tensors is not None:
                # recurrent architecture, need a few more steps
                nenv = nbatch // nsteps
                assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                    nbatch, nsteps)
                policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                extra_tensors.update(recurrent_tensors)

    _v_net = value_network
    if _v_net is None or _v_net == 'shared':
        vf_latent = policy_latent
    else:
        if _v_net == 'copy':
            _v_net = policy_network
        else:
            assert callable(_v_net)
        with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
            # TODO: recurrent architectures are not supported with value_network=copy yet
            vf_latent = _v_net(encoded_x)

    policy = PolicyWithValue(
        env=env,
        observations=X,
        latent=policy_latent,
        vf_latent=vf_latent,
        sess=sess,
        estimate_q=estimate_q,
        **extra_tensors)
    return policy
def learn(*,
          network,
          env,
          eval_env,
          total_timesteps,
          timesteps_per_batch=1024,  # what to train on
          max_kl=0.001,
          cg_iters=10,
          gamma=0.99,
          lam=1.0,  # advantage estimation
          seed=None,
          ent_coef=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          vf_iters=3,
          log_path=None,
          max_episodes=0,
          max_iters=0,  # time constraint
          callback=None,
          load_path=None,
          **network_kwargs):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------
    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for adam optimizer used to optimize value function loss

    vf_iters                number of iterations of value function optimization per each policy optimization step

    total_timesteps         max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                            and arguments to a particular type of network

    Returns:
    -------
    learnt model
    '''
    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    set_global_seeds(seed)
    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    if isinstance(network, str):
        network = get_network_builder(network)(**network_kwargs)

    with tf.name_scope("pi"):
        pi_policy_network = network(ob_space.shape)
        pi_value_network = network(ob_space.shape)
        pi = PolicyWithValue(ac_space, pi_policy_network, pi_value_network)
    with tf.name_scope("oldpi"):
        old_pi_policy_network = network(ob_space.shape)
        old_pi_value_network = network(ob_space.shape)
        oldpi = PolicyWithValue(ac_space, old_pi_policy_network, old_pi_value_network)

    pi_var_list = pi_policy_network.trainable_variables + list(pi.pdtype.trainable_variables)
    old_pi_var_list = old_pi_policy_network.trainable_variables + list(oldpi.pdtype.trainable_variables)
    vf_var_list = pi_value_network.trainable_variables + pi.value_fc.trainable_variables
    old_vf_var_list = old_pi_value_network.trainable_variables + oldpi.value_fc.trainable_variables

    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=pi)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        ckpt.restore(manager.latest_checkpoint)

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(pi_var_list)
    set_from_flat = U.SetFromFlat(pi_var_list)
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]
    shapes = [var.get_shape().as_list() for var in pi_var_list]

    def assign_old_eq_new():
        for pi_var, old_pi_var in zip(pi_var_list, old_pi_var_list):
            old_pi_var.assign(pi_var)
        for vf_var, old_vf_var in zip(vf_var_list, old_vf_var_list):
            old_vf_var.assign(vf_var)

    @tf.function
    def compute_lossandgrad(ob, ac, atarg):
        with tf.GradientTape() as tape:
            old_policy_latent = oldpi.policy_network(ob)
            old_pd, _ = oldpi.pdtype.pdfromlatent(old_policy_latent)
            policy_latent = pi.policy_network(ob)
            pd, _ = pi.pdtype.pdfromlatent(policy_latent)
            kloldnew = old_pd.kl(pd)
            ent = pd.entropy()
            meankl = tf.reduce_mean(kloldnew)
            meanent = tf.reduce_mean(ent)
            entbonus = ent_coef * meanent
            ratio = tf.exp(pd.logp(ac) - old_pd.logp(ac))
            surrgain = tf.reduce_mean(ratio * atarg)
            optimgain = surrgain + entbonus
            losses = [optimgain, meankl, entbonus, surrgain, meanent]
        gradients = tape.gradient(optimgain, pi_var_list)
        return losses + [U.flatgrad(gradients, pi_var_list)]

    @tf.function
    def compute_losses(ob, ac, atarg):
        old_policy_latent = oldpi.policy_network(ob)
        old_pd, _ = oldpi.pdtype.pdfromlatent(old_policy_latent)
        policy_latent = pi.policy_network(ob)
        pd, _ = pi.pdtype.pdfromlatent(policy_latent)
        kloldnew = old_pd.kl(pd)
        ent = pd.entropy()
        meankl = tf.reduce_mean(kloldnew)
        meanent = tf.reduce_mean(ent)
        entbonus = ent_coef * meanent
        ratio = tf.exp(pd.logp(ac) - old_pd.logp(ac))
        surrgain = tf.reduce_mean(ratio * atarg)
        optimgain = surrgain + entbonus
        losses = [optimgain, meankl, entbonus, surrgain, meanent]
        return losses

    # ob shape should be [batch_size, ob_dim], merged nenv
    # ret shape should be [batch_size]
    @tf.function
    def compute_vflossandgrad(ob, ret):
        with tf.GradientTape() as tape:
            pi_vf = pi.value(ob)
            vferr = tf.reduce_mean(tf.square(pi_vf - ret))
        return U.flatgrad(tape.gradient(vferr, vf_var_list), vf_var_list)

    @tf.function
    def compute_fvp(flat_tangent, ob, ac, atarg):
        with tf.GradientTape() as outter_tape:
            with tf.GradientTape() as inner_tape:
                old_policy_latent = oldpi.policy_network(ob)
                old_pd, _ = oldpi.pdtype.pdfromlatent(old_policy_latent)
                policy_latent = pi.policy_network(ob)
                pd, _ = pi.pdtype.pdfromlatent(policy_latent)
                kloldnew = old_pd.kl(pd)
                meankl = tf.reduce_mean(kloldnew)
            klgrads = inner_tape.gradient(meankl, pi_var_list)
            start = 0
            tangents = []
            for shape in shapes:
                sz = U.intprod(shape)
                tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
                start += sz
            gvp = tf.add_n([
                tf.reduce_sum(g * tangent)
                for (g, tangent) in zipsame(klgrads, tangents)
            ])
        hessians_products = outter_tape.gradient(gvp, pi_var_list)
        fvp = U.flatgrad(hessians_products, pi_var_list)
        return fvp

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)
        return out

    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    logdir = log_path + '/evaluator'
    modeldir = log_path + '/models'
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    if not os.path.exists(modeldir):
        os.makedirs(modeldir)
    evaluator = Evaluator(env=eval_env, model=pi, logdir=logdir)

    max_inner_iter = 500000 if env.spec.id == 'InvertedDoublePendulum-v2' else 3000000
    epoch = vf_iters
    batch_size = timesteps_per_batch
    mb_size = 256
    inner_iter_per_iter = epoch * int(batch_size / mb_size)
    max_iter = int(max_inner_iter / inner_iter_per_iter)
    eval_num = 150
    eval_interval = save_interval = int(int(max_inner_iter / eval_num) / inner_iter_per_iter)

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    for update in range(1, max_iter + 1):
        if callback:
            callback(locals(), globals())

        # if total_timesteps and timesteps_so_far >= total_timesteps:
        #     break
        # elif max_episodes and episodes_so_far >= max_episodes:
        #     break
        # elif max_iters and iters_so_far >= max_iters:
        #     break

        logger.log("********** Iteration %i ************" % iters_so_far)

        if (update - 1) % eval_interval == 0:
            evaluator.run_evaluation(update - 1)
        if (update - 1) % save_interval == 0:
            ckpt = tf.train.Checkpoint(model=pi)
            ckpt.save(modeldir + '/ckpt_ite' + str(update - 1))

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        ob = sf01(ob)
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"):
            pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = ob, ac, atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs).numpy()) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values

        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = g.numpy()
        g = allmean(g)

        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)

            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=mb_size):
                    mbob = sf01(mbob)
                    g = allmean(compute_vflossandgrad(mbob, mbret).numpy())
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]

        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if rank == 0:
            logger.dump_tabular()

    return pi
class AgentModel(tf.Module):
    def __init__(self, agent, network, nsteps, rho, ent_coef, vf_coef,
                 max_grad_norm, seed, load_path, **network_kwargs):
        super(AgentModel, self).__init__(name='MAPPO2Model')
        set_global_seeds(seed)

        # Get state_space and action_space
        ob_space = agent.observation_space
        ac_space = agent.action_space

        if isinstance(network, str):
            network_type = network
            policy_network_fn = get_network_builder(network_type)(**network_kwargs)
            network = policy_network_fn(ob_space.shape)

        self.train_model = PolicyWithValue(ac_space, network)
        if MPI is not None:
            self.optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.train_model.trainable_variables)
        else:
            self.optimizer = tf.keras.optimizers.Adam()

        # if isinstance(network, str):
        #     network = get_network_builder(network)(**network_kwargs)
        # policy_network = network(ob_space.shape)
        # value_network = network(ob_space.shape)
        # self.train_model = pi = PolicyWithValue(ac_space, policy_network, value_network)
        # self.pi_var_list = policy_network.trainable_variables + list(pi.pdtype.trainable_variables)
        # self.vf_var_list = value_network.trainable_variables + pi.value_fc.trainable_variables
        # if MPI is not None:
        #     self.pi_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.pi_var_list)
        #     self.vf_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.vf_var_list)
        # else:
        #     self.pi_optimizer = tf.keras.optimizers.Adam()
        #     self.vf_optimizer = tf.keras.optimizers.Adam()

        self.agent = agent
        self.nsteps = nsteps
        self.rho = rho
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
        self.loss_names = [
            'Lagrange_loss', 'sync_loss', 'policy_loss', 'value_loss',
            'policy_entropy', 'approxkl', 'clipfrac'
        ]
        if MPI is not None:
            sync_from_root(self.variables)

        self.comm_matrix = agent.comm_matrix.copy()
        self.estimates = np.ones([agent.nmates, nsteps], dtype=np.float32)
        self.multipliers = np.zeros([agent.nmates, nsteps], dtype=np.float32)
        for i, comm_i in enumerate(self.comm_matrix):
            self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

        if load_path is not None:
            load_path = osp.expanduser(load_path)
            ckpt = tf.train.Checkpoint(model=self.train_model)
            manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
            ckpt.restore(manager.latest_checkpoint)

    def reinitial_estimates(self):
        self.estimates = np.random.normal(0, 0.1, [self.agent.nmates, self.nsteps]).astype(np.float32)
        self.multipliers = np.random.uniform(0, 1, [self.agent.nmates, self.nsteps]).astype(np.float32)
        for i, comm_i in enumerate(self.comm_matrix):
            self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

    def store_oldpi_var(self):
        pi_var_list = self.train_model.policy_network.trainable_variables + \
            list(self.train_model.pdtype.trainable_variables)
        self.oldpi_var_list = [var.numpy() for var in pi_var_list]

    def assign_new_eq_old(self):
        pi_var_list = self.train_model.policy_network.trainable_variables + \
            list(self.train_model.pdtype.trainable_variables)
        for pi_var, old_pi_var in zip(pi_var_list, self.oldpi_var_list):
            pi_var.assign(old_pi_var)

    # @tf.function
    # def get_vf_grad(self, cliprange, obs, returns, actions, values, advs, neglogpac_old):
    #     with tf.GradientTape() as tape:
    #         vpred = self.train_model.value(obs)
    #         vpredclipped = values + tf.clip_by_value(vpred - values, -cliprange, cliprange)
    #         vf_losses1 = tf.square(vpred - returns)
    #         vf_losses2 = tf.square(vpredclipped - returns)
    #         vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
    #     vf_grads = tape.gradient(vf_loss, self.vf_var_list)
    #     if self.max_grad_norm is not None:
    #         vf_grads, _ = tf.clip_by_global_norm(vf_grads, self.max_grad_norm)
    #     if MPI is not None:
    #         vf_grads = tf.concat([tf.reshape(g, (-1,)) for g in vf_grads], axis=0)
    #     return vf_grads, vf_loss

    @tf.function
    def get_pi_grad(self, cliprange, nb, estimates, multipliers, obs, returns,
                    actions, values, advs, neglogpac_old):
        with tf.GradientTape() as tape:
            policy_latent = self.train_model.policy_network(obs)
            pd, logits = self.train_model.pdtype.pdfromlatent(policy_latent)
            neglogpac = pd.neglogp(actions)
            entropy = tf.reduce_mean(pd.entropy())
            vpred = self.train_model.value(obs)
            vpredclipped = values + tf.clip_by_value(vpred - values, -cliprange, cliprange)
            vf_losses1 = tf.square(vpred - returns)
            vf_losses2 = tf.square(vpredclipped - returns)
            vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
            ratio = tf.exp(neglogpac_old - neglogpac)
            clipped_ratio = tf.clip_by_value(ratio, 1 - cliprange, 1 + cliprange)
            pg_losses1 = -advs * ratio
            pg_losses2 = -advs * clipped_ratio
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))

            comm = self.comm_matrix[self.comm_matrix[:, nb] != 0][0, self.agent.id]
            syncerr = comm * ratio - estimates
            sync_loss = tf.reduce_mean(multipliers * syncerr) + \
                0.5 * self.rho * (tf.reduce_mean(tf.square(syncerr)))

            approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - neglogpac_old))
            clipfrac = tf.reduce_mean(tf.cast(tf.greater(tf.abs(ratio - 1.0), cliprange), tf.float32))
            loss = pg_loss + sync_loss - entropy * self.ent_coef + vf_loss * self.vf_coef

        var_list = self.train_model.trainable_variables
        grads = tape.gradient(loss, var_list)
        if self.max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
        if MPI is not None:
            grads = tf.concat([tf.reshape(g, (-1,)) for g in grads], axis=0)
        return grads, loss, pg_loss, sync_loss, vf_loss, entropy, approxkl, clipfrac

        # pi_grads = tape.gradient(pi_loss, self.pi_var_list)
        # if self.max_grad_norm is not None:
        #     pi_grads, _ = tf.clip_by_global_norm(pi_grads, self.max_grad_norm)
        # if MPI is not None:
        #     pi_grads = tf.concat([tf.reshape(g, (-1,)) for g in pi_grads], axis=0)
        # return pi_grads, pi_loss, pg_loss, sync_loss, entropy, approxkl, clipfrac

    def pi_update(self, lr, cliprange, nb, obs, returns, actions, values, advs, neglogpacs_old):
        estimates = self.estimates[nb]
        multipliers = self.multipliers[nb]
        pi_grads, pi_loss, pg_loss, sync_loss, vf_loss, entropy, approxkl, clipfrac = self.get_pi_grad(
            cliprange, nb, estimates, multipliers, obs, returns, actions, values, advs, neglogpacs_old)
        if MPI is not None:
            self.optimizer.apply_gradients(pi_grads, lr)
        else:
            self.optimizer.learning_rate = lr
            grads_and_vars = zip(pi_grads, self.train_model.trainable_variables)
            self.optimizer.apply_gradients(grads_and_vars)
        return pi_loss, pg_loss, sync_loss, vf_loss, entropy, approxkl, clipfrac

        # if MPI is not None:
        #     self.pi_optimizer.apply_gradients(pi_grads, lr)
        # else:
        #     self.pi_optimizer.learning_rate = lr
        #     grads_and_vars = zip(pi_grads, self.pi_var_list)
        #     self.pi_optimizer.apply_gradients(grads_and_vars)
        # return pi_loss, pg_loss, sync_loss, entropy, approxkl, clipfrac

    # def vf_update(self, lr, cliprange, obs, returns, actions, values, advs, neglogpacs_old):
    #     vf_grads, vf_loss = self.get_vf_grad(
    #         cliprange, obs, returns, actions, values, advs, neglogpacs_old)
    #     if MPI is not None:
    #         self.vf_optimizer.apply_gradients(vf_grads, lr)
    #     else:
    #         self.vf_optimizer.learning_rate = lr
    #         grads_and_vars = zip(vf_grads, self.train_model.trainable_variables)
    #         self.vf_optimizer.apply_gradients(grads_and_vars)
    #     return vf_loss

    def info_to_exchange(self, cliprange, ob, ac, neglogpac_old, nb):
        policy_latent = self.train_model.policy_network(ob)
        pd, logits = self.train_model.pdtype.pdfromlatent(policy_latent)
        neglogpac = pd.neglogp(ac)
        ratio = tf.exp(neglogpac_old - neglogpac)
        clipped_ratio = tf.clip_by_value(tf.exp(-neglogpac), 1 - cliprange, 1 + cliprange)
        return ratio, self.multipliers[nb]

    def exchange(self, cliprange, ob, ac, neglogpac_old, nb_ratio, nb_multipliers, nb):
        policy_latent = self.train_model.policy_network(ob)
        pd, logits = self.train_model.pdtype.pdfromlatent(policy_latent)
        neglogpac = pd.neglogp(ac)
        ratio = tf.exp(neglogpac_old - neglogpac)
        clipped_ratio = tf.clip_by_value(ratio, 1 - cliprange, 1 + cliprange)

        comm = self.comm_matrix[self.comm_matrix[:, nb] != 0][0, self.agent.id]
        v = 0.5 * (self.multipliers[nb] + nb_multipliers) + \
            0.5 * self.rho * (comm * ratio + (-comm) * nb_ratio)
        estimate = np.array((1.0 / self.rho) * (self.multipliers[nb] - v) + comm * ratio)

        self.estimates = tf.tensor_scatter_nd_update(self.estimates, [[nb]], estimate[None, :])
        self.multipliers = tf.tensor_scatter_nd_update(self.multipliers, [[nb]], v[None, :])
def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None, mix_mode='nomix'):
    ob_space = env.observation_space
    extra_tensors = {}

    X = observ_placeholder if observ_placeholder is not None \
        else observation_placeholder(ob_space, batch_size=None)

    if mix_mode in ['mixreg', 'mixobs']:
        COEFF = tf.placeholder(tf.float32, [None])
        INDICES = tf.placeholder(tf.int32, [None])
        OTHER_INDICES = tf.placeholder(tf.int32, [None])
        coeff = tf.reshape(COEFF, (-1, 1, 1, 1))
        encoded_x = tf.cast(X, tf.float32)
        encoded_x = coeff * tf.gather(encoded_x, INDICES, axis=0) \
            + (1 - coeff) * tf.gather(encoded_x, OTHER_INDICES, axis=0)
        encoded_x = tf.cast(encoded_x, tf.uint8)
        extra_tensors['coeff'] = COEFF
        extra_tensors['indices'] = INDICES
        extra_tensors['other_indices'] = OTHER_INDICES
    elif mix_mode == 'nomix':
        encoded_x = X
    else:
        raise ValueError(f"Unknown mixing mode: {mix_mode} !")

    encoded_x = encode_observation(ob_space, encoded_x)

    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        policy_latent = policy_network(encoded_x)
        if isinstance(policy_latent, tuple):
            policy_latent, recurrent_tensors = policy_latent
            if recurrent_tensors is not None:
                # recurrent architecture, need a few more steps
                nenv = nbatch // nsteps
                assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                    nbatch, nsteps)
                policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                extra_tensors.update(recurrent_tensors)

    _v_net = value_network
    if _v_net is None or _v_net == 'shared':
        vf_latent = policy_latent
    else:
        if _v_net == 'copy':
            _v_net = policy_network
        else:
            assert callable(_v_net)
        with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
            # TODO: recurrent architectures are not supported with value_network=copy yet
            vf_latent = _v_net(encoded_x)

    policy = PolicyWithValue(
        env=env,
        observations=X,
        latent=policy_latent,
        vf_latent=vf_latent,
        sess=sess,
        estimate_q=estimate_q,
        # JAG: Pass adv_gamma to policy
        adv_gamma=adv_gamma,
        **extra_tensors)
    return policy
def learn(*,
          network,
          env,
          save,
          total_timesteps,
          timesteps_per_batch=1024,  # what to train on
          max_kl=0.001,
          cg_iters=10,
          gamma=0.99,
          lam=1.0,  # advantage estimation
          seed=None,
          ent_coef=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          vf_iters=3,
          max_episodes=0,
          max_iters=0,  # time constraint
          callback=None,
          load_path=None,
          **network_kwargs):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------
    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for adam optimizer used to optimize value function loss

    vf_iters                number of iterations of value function optimization per each policy optimization step

    total_timesteps         max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                            and arguments to a particular type of network

    Returns:
    -------
    learnt model
    '''
    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    set_global_seeds(seed)
    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    if isinstance(network, str):
        network, network_model = get_network_builder(network)(**network_kwargs)

    with tf.name_scope("pi"):
        pi_policy_network = network(ob_space.shape)
        pi_value_network = network(ob_space.shape)
        pi = PolicyWithValue(ac_space, pi_policy_network, pi_value_network)
    with tf.name_scope("oldpi"):
        old_pi_policy_network = network(ob_space.shape)
        old_pi_value_network = network(ob_space.shape)
        oldpi = PolicyWithValue(ac_space, old_pi_policy_network, old_pi_value_network)

    pi_var_list = pi_policy_network.trainable_variables + list(pi.pdtype.trainable_variables)
    old_pi_var_list = old_pi_policy_network.trainable_variables + list(oldpi.pdtype.trainable_variables)
    vf_var_list = pi_value_network.trainable_variables + pi.value_fc.trainable_variables
    old_vf_var_list = old_pi_value_network.trainable_variables + oldpi.value_fc.trainable_variables

    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=pi)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        ckpt.restore(manager.latest_checkpoint)

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(pi_var_list)
    set_from_flat = U.SetFromFlat(pi_var_list)
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]
    shapes = [var.get_shape().as_list() for var in pi_var_list]

    def assign_old_eq_new():
        for pi_var, old_pi_var in zip(pi_var_list, old_pi_var_list):
            old_pi_var.assign(pi_var)
        for vf_var, old_vf_var in zip(vf_var_list, old_vf_var_list):
            old_vf_var.assign(vf_var)

    @tf.function
    def compute_lossandgrad(ob, ac, atarg):
        with tf.GradientTape() as tape:
            old_policy_latent = oldpi.policy_network(ob)
            old_pd, _ = oldpi.pdtype.pdfromlatent(old_policy_latent)
            policy_latent = pi.policy_network(ob)
            pd, _ = pi.pdtype.pdfromlatent(policy_latent)
            kloldnew = old_pd.kl(pd)
            ent = pd.entropy()
            meankl = tf.reduce_mean(kloldnew)
            meanent = tf.reduce_mean(ent)
            entbonus = ent_coef * meanent
            ratio = tf.exp(pd.logp(ac) - old_pd.logp(ac))
            surrgain = tf.reduce_mean(ratio * atarg)
            optimgain = surrgain + entbonus
            losses = [optimgain, meankl, entbonus, surrgain, meanent]
        gradients = tape.gradient(optimgain, pi_var_list)
        return losses + [U.flatgrad(gradients, pi_var_list)]

    @tf.function
    def compute_losses(ob, ac, atarg):
        old_policy_latent = oldpi.policy_network(ob)
        old_pd, _ = oldpi.pdtype.pdfromlatent(old_policy_latent)
        policy_latent = pi.policy_network(ob)
        pd, _ = pi.pdtype.pdfromlatent(policy_latent)
        kloldnew = old_pd.kl(pd)
        ent = pd.entropy()
        meankl = tf.reduce_mean(kloldnew)
        meanent = tf.reduce_mean(ent)
        entbonus = ent_coef * meanent
        ratio = tf.exp(pd.logp(ac) - old_pd.logp(ac))
        surrgain = tf.reduce_mean(ratio * atarg)
        optimgain = surrgain + entbonus
        losses = [optimgain, meankl, entbonus, surrgain, meanent]
        return losses

    # ob shape should be [batch_size, ob_dim], merged nenv
    # ret shape should be [batch_size]
    @tf.function
    def compute_vflossandgrad(ob, ret):
        with tf.GradientTape() as tape:
            pi_vf = pi.value(ob)
            vferr = tf.reduce_mean(tf.square(pi_vf - ret))
        return U.flatgrad(tape.gradient(vferr, vf_var_list), vf_var_list)

    @tf.function
    def compute_fvp(flat_tangent, ob, ac, atarg):
        with tf.GradientTape() as outter_tape:
            with tf.GradientTape() as inner_tape:
                old_policy_latent = oldpi.policy_network(ob)
                old_pd, _ = oldpi.pdtype.pdfromlatent(old_policy_latent)
                policy_latent = pi.policy_network(ob)
                pd, _ = pi.pdtype.pdfromlatent(policy_latent)
                kloldnew = old_pd.kl(pd)
                meankl = tf.reduce_mean(kloldnew)
            klgrads = inner_tape.gradient(meankl, pi_var_list)
            start = 0
            tangents = []
            for shape in shapes:
                sz = U.intprod(shape)
                tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
                start += sz
            gvp = tf.add_n([
                tf.reduce_sum(g * tangent)
                for (g, tangent) in zipsame(klgrads, tangents)
            ])
        hessians_products = outter_tape.gradient(gvp, pi_var_list)
        fvp = U.flatgrad(hessians_products, pi_var_list)
        return fvp

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)
        return out

    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    # ---------------------- New ----------------------
    rewforbuffer = deque(maxlen=40)
    rewctrlbuffer = deque(maxlen=40)
    rewconbuffer = deque(maxlen=40)
    rewsurbuffer = deque(maxlen=40)

    rewformeanbuf = np.array([])
    rewctrlmeanbuf = np.array([])
    rewconmeanbuf = np.array([])
    rewsurmeanbuf = np.array([])
    # -------------------------------------------------

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    x_axis = 0
    x_holder = np.array([])
    rew_holder = np.array([])

    while True:
        if timesteps_so_far > total_timesteps - 1500:
            # Set recording XXXX timesteps before ending
            env = VecVideoRecorder(env,
                                   osp.join(logger.get_dir(), "videos"),
                                   record_video_trigger=lambda x: True,
                                   video_length=200)
            seg_gen = traj_segment_generator(pi, env, timesteps_per_batch)

        if callback:
            callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        ob = sf01(ob)
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"):
            pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = ob, ac, atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs).numpy()) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values

        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = g.numpy()
        g = allmean(g)

        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)

            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    mbob = sf01(mbob)
                    g = allmean(compute_vflossandgrad(mbob, mbret).numpy())
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_rets_for"],
                   seg["ep_rets_ctrl"], seg["ep_rets_con"], seg["ep_rets_sur"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]

        lens, rews, rews_for, rews_ctrl, rews_con, rews_sur = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        # ---------------------- New ----------------------
        rewforbuffer.extend(rews_for)
        rewctrlbuffer.extend(rews_ctrl)
        rewconbuffer.extend(rews_con)
        rewsurbuffer.extend(rews_sur)

        rewformeanbuf = np.append([rewformeanbuf], [np.mean(rewforbuffer)])
        rewctrlmeanbuf = np.append([rewctrlmeanbuf], [np.mean(rewctrlbuffer)])
        rewconmeanbuf = np.append([rewconmeanbuf], [np.mean(rewconbuffer)])
        rewsurmeanbuf = np.append([rewsurmeanbuf], [np.mean(rewsurbuffer)])
        # -------------------------------------------------

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if rank == 0:
            logger.dump_tabular()

        x_axis += 1
        x_holder = np.append([x_holder], [x_axis])
        rew_holder = np.append([rew_holder], [np.mean(rewbuffer)])

    # --------------------------------------- NEW ----------------------------------------------------
    with open("img_rec.txt", "r") as rec:
        cur_gen = rec.read()
    cur_gen = cur_gen.strip()  # remove \n
    dir_of_gens = [
        '1_1', '2_1', '3_1', '1_2', '2_2', '3_2', '1_3', '2_3', '3_3',
        '1_4', '2_4', '3_4', '1_5', '2_5', '3_5', '1_6', '2_6', '3_6',
        '1_7', '2_7', '3_7', '1_8', '2_8', '3_8', '1_9', '2_9', '3_9',
        '1_10', '2_10', '3_10', '1_11', '2_11', '3_11', '1_12', '2_12', '3_12'
    ]
    # -------------------------------------------------------------------------------------------------

    from matplotlib import pyplot as plt

    f = plt.figure(1)
    plt.plot(x_holder, rew_holder)
    plt.title("Rewards for Ant v2")
    plt.grid(True)
    plt.savefig('rewards_for_antv2_{}'.format(cur_gen))

    g = plt.figure(2)
    plt.plot(x_holder, rewformeanbuf, label='Forward Reward')
    plt.plot(x_holder, rewctrlmeanbuf, label='CTRL Cost')
    plt.plot(x_holder, rewconmeanbuf, label='Contact Cost')
    plt.plot(x_holder, rewsurmeanbuf, label='Survive Reward')
    plt.title("Reward Breakdown")
    plt.legend()
    plt.grid(True)
    plt.savefig('rewards_breakdown{}'.format(cur_gen))
    # plt.show()

    # --------------------------------------- NEW ----------------------------------------------------
    elem = int(dir_of_gens.index(cur_gen))
    with open("img_rec.txt", "w") as rec:
        if elem == 35:
            new_elem = 0
        else:
            new_elem = elem + 1
        new_gen = cur_gen.replace(cur_gen, dir_of_gens[new_elem])
        rec.write(new_gen)
    # -------------------------------------------------------------------------------------------------

    # ------------------------------------------ SAVE WEIGHTS ----------------------------------------
    # np.save('val_weights_bias_2_c', val_weights_bias_2_c)
    # save = save.replace(save[0], '..', 2)
    # os.chdir(save)
    # name = 'max_reward'
    # completeName = os.path.join(name + ".txt")
    # file1 = open(completeName, "w")
    # toFile = str(np.mean(rewbuffer))
    # file1.write(toFile)
    # file1.close()
    # os.chdir('../../../baselines-tf2')
    # -------------------------------------------------------------------------------------------------

    return pi