def estimate_advantage_function(vf, paths, parameters):
    advs = []
    for path in paths:
        rew_t = path["reward"]
        return_t = common.discount(rew_t, parameters.gamma)  # discounted return (value targets are built separately)
        vpred_t = vf.predict(path)
        # Bootstrap with the last value prediction unless the episode terminated.
        vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
        # TD residuals, discounted by gamma * lambda, give the GAE advantages.
        delta_t = rew_t + parameters.gamma * vpred_t[1:] - vpred_t[:-1]
        adv_t = common.discount(delta_t, parameters.gamma * parameters.lam)
        advs.append(adv_t)
    return advs
def estimate_v_targets(vf, paths, parameters):
    vtargs = []
    for path in paths:
        rew_t = path["reward"]
        return_t = common.discount(rew_t, parameters.gamma)
        vtargs.append(return_t)
    return vtargs
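
# Both helpers above rely on common.discount. For reference, a minimal sketch of that
# primitive, assuming (as in the baselines implementation) that it computes the reverse
# discounted cumulative sum y[t] = x[t] + gamma * y[t+1]; discount_sketch is an
# illustrative stand-in, not the library function.
import numpy as np

def discount_sketch(x, gamma):
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

# Worked example: rewards [1, 1, 1] with gamma = 0.9 give value targets
# [1 + 0.9 * 1.9, 1 + 0.9 * 1.0, 1.0] = [2.71, 1.9, 1.0], exactly what
# estimate_v_targets appends for such a path.
assert np.allclose(discount_sketch(np.array([1.0, 1.0, 1.0]), 0.9), [2.71, 1.9, 1.0])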
def learn(env, policy, value_fn, gamma, lam, timesteps_per_batch, num_timesteps,
          animate=False, callback=None, desired_kl=0.002):
    """
    Trains an ACKTR model.

    :param env: (Gym environment) The environment to learn from
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param value_fn: (Object) The value function model to use (MLP, CNN, LSTM, ...)
    :param gamma: (float) The discount factor
    :param lam: (float) the GAE(lambda) factor, trading bias against variance in the advantage estimate
    :param timesteps_per_batch: (int) the number of timesteps for each batch
    :param num_timesteps: (int) the total number of timesteps to run
    :param animate: (bool) whether to render the environment
    :param callback: (function) called every iteration, used for logging and saving
    :param desired_kl: (float) the target Kullback-Leibler divergence per update, used to adapt the stepsize
    """
    obfilter = ZFilter(env.observation_space.shape)
    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=0.9,
                               kfac_update=2, epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                               weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
    do_update = tf_util.function(inputs, update_op)
    tf_util.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for queue_runner in [q_runner, value_fn.q_runner]:
        assert queue_runner is not None
        enqueue_threads.extend(
            queue_runner.create_threads(tf.get_default_session(), coord=coord, start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env, policy, max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0) and animate),
                           obfilter=obfilter)
            paths.append(path)
            timesteps_this_batch += path["reward"].shape[0]
            timesteps_so_far += path["reward"].shape[0]
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = value_fn.predict(path)
            vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)

        # Update value function
        value_fn.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)

        # Adjust stepsize
        kl_loss = policy.compute_kl(ob_no, oldac_dist)
        if kl_loss > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        elif kl_loss < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular("EpRewMean",
                              np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular("EpRewSEM",
                              np.std([path["reward"].sum() / np.sqrt(len(paths)) for path in paths]))
        logger.record_tabular("EpLenMean",
                              np.mean([path["reward"].shape[0] for path in paths]))
        logger.record_tabular("KL", kl_loss)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, resume, logdir, agentName, num_timesteps, animate=False, callback=None, desired_kl=0.002): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1.0 - 0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(U.get_session(), coord=coord, start=True)) timesteps_so_far = 0 saver = tf.train.Saver(max_to_keep=10) if resume > 0: saver.restore( tf.get_default_session(), os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume))) ob_filter_path = os.path.join(os.path.abspath(logdir), "{}-{}".format('obfilter', resume)) with open(ob_filter_path, 'rb') as ob_filter_input: obfilter = pickle.load(ob_filter_input) print("Loaded observation filter") iters_so_far = resume print('logdir = ', logdir) logF = open(os.path.join(logdir, 'log.txt'), 'a') logF2 = open(os.path.join(logdir, 'log_it.txt'), 'a') logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a') while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % iters_so_far) save_interval = 5 # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (iters_so_far % save_interval == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2.0: logger.log("kl too high") U.eval( tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5))) elif kl < desired_kl / 2.0: logger.log("kl too low") U.eval( tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5))) else: logger.log("kl just right!") rew_mean = np.mean([path["reward"].sum() for path in paths]) logger.record_tabular("EpRewMean", rew_mean) logger.record_tabular( "EpRewSEM", np.std([ 
path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) logF.write(str(rew_mean) + "\n") logF2.write(str(iters_so_far) + "," + str(rew_mean) + "\n") # json.dump(combined_stats, logStats) logF.flush() logF2.flush() # logStats.flush() if save_interval and (iters_so_far % save_interval == 0 or iters_so_far == 1): saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=iters_so_far) ob_filter_path = os.path.join( os.path.abspath(logdir), "{}-{}".format('obfilter', iters_so_far)) with open(ob_filter_path, 'wb') as ob_filter_output: pickle.dump(obfilter, ob_filter_output, pickle.HIGHEST_PROTOCOL) if callback: callback() logger.dump_tabular() iters_so_far += 1 coord.request_stop() coord.join(enqueue_threads)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, fname=None): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() if fname != None and tf.train.checkpoint_exists(fname): load_result = U.load_state(fname) logger.log("Model loaded from file {}".format(fname)) # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(get_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Save model every 100 iterations if fname != None and (i % 100 == 99): U.save_state(fname) logger.log("Model saved to file {}".format(fname)) env.seed() # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") U.eval( tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5))) elif kl < desired_kl / 2: logger.log("kl too low") U.eval( tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5))) else: logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************"%i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() elif kl < desired_kl / 2: logger.log("kl too low") tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() else: logger.log("kl just right!") logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
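
# All of these learn() functions drive the K-FAC statistics updates through TF1 queue
# runners started under a tf.train.Coordinator and joined after training. The same
# lifecycle on a toy FIFOQueue (illustrative only; the real queue runners come from
# optim.minimize and vf.q_runner):
import tensorflow as tf

toy_queue = tf.FIFOQueue(capacity=10, dtypes=[tf.float32])
toy_qr = tf.train.QueueRunner(toy_queue, [toy_queue.enqueue(tf.random_normal([]))])

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = toy_qr.create_threads(sess, coord=coord, start=True)  # background enqueue threads
    print(sess.run(toy_queue.dequeue()))                            # consume one element
    coord.request_stop()
    coord.join(threads)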
def run(self, update_counters=True):
    ob = self.env.reset()
    prev_ob = np.float32(np.zeros(ob.shape))
    if self.obfilter:
        ob = self.obfilter(ob)
    terminated = False

    obs = []
    acs = []
    ac_dists = []
    logps = []
    rewards = []
    for _ in range(self.max_pathlength):
        if self.animate:
            self.env.render()
        state = np.concatenate([ob, prev_ob], -1)
        obs.append(state)
        ac, ac_dist, logp = self.policy.act(state)
        acs.append(ac)
        ac_dists.append(ac_dist)
        logps.append(logp)
        prev_ob = np.copy(ob)
        scaled_ac = self.env.action_space.low + (ac + 1.) * 0.5 * (
            self.env.action_space.high - self.env.action_space.low)
        scaled_ac = np.clip(scaled_ac, self.env.action_space.low, self.env.action_space.high)
        ob, rew, done, _ = self.env.step(scaled_ac)
        if self.obfilter:
            ob = self.obfilter(ob)
        rewards.append(rew)
        if done:
            terminated = True
            break

    self.rewards.append(sum(rewards))
    self.rewards = self.rewards[-100:]
    if update_counters:
        self._num_rollouts += 1
        self._num_steps += len(rewards)

    path = {
        "observation": np.array(obs),
        "terminated": terminated,
        "reward": np.array(rewards),
        "action": np.array(acs),
        "action_dist": np.array(ac_dists),
        "logp": np.array(logps)
    }

    rew_t = path["reward"]
    value = self.policy.predict(path["observation"], path)
    vtarg = common.discount(
        np.append(rew_t, 0.0 if path["terminated"] else value[-1]), self.gamma)[:-1]
    vpred_t = np.append(value, 0.0 if path["terminated"] else value[-1])
    delta_t = rew_t + self.gamma * vpred_t[1:] - vpred_t[:-1]
    adv_GAE = common.discount(delta_t, self.gamma * self.lam)

    if np.mean(self.rewards) >= self.score and not self.finished:
        self.episodes_till_done = self._num_rollouts
        self.frames_till_done = self._num_steps
        self.finished = True

    return path, vtarg, value, adv_GAE
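
# run() above maps the policy's raw action (assumed to lie roughly in [-1, 1]) onto the
# environment's Box bounds before stepping, then clips. The same affine map in isolation
# (scale_action is an illustrative stand-in):
import numpy as np

def scale_action(ac, low, high):
    scaled = low + (ac + 1.0) * 0.5 * (high - low)
    return np.clip(scaled, low, high)

# e.g. with bounds [-2, 2]: -1 -> -2, 0 -> 0, 1 -> 2, and 1.3 is clipped back to 2.
print(scale_action(np.array([-1.0, 0.0, 1.0, 1.3]), low=-2.0, high=2.0))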
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, lr=0.03, momentum=0.9): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(lr)), name='stepsize') stepsize_mul = tf.placeholder(tf.float32, shape=None) inputs, loss, loss_sampled = policy.update_info inputs = list(inputs) inputs.append(stepsize_mul) optim = kfac.KfacOptimizer(learning_rate=stepsize * stepsize_mul, cold_lr=stepsize * stepsize_mul *(1-0.9), momentum=momentum, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) grads = optim.compute_gradients(loss, pi_var_list) grads = [g[0] for g in grads] old_var = [ tf.Variable(initial_value=tf.zeros_like(v)) for v in pi_var_list ] old_to_new = tf.group( *[tf.assign(v, o) for v, o in zip(pi_var_list, old_var)]) old_from_new = tf.group( *[tf.assign(o, v) for v, o in zip(pi_var_list, old_var)]) do_old_var = U.function([], old_var) do_pi_var = U.function([], pi_var_list) do_old_from_new = U.function([], old_from_new) with tf.control_dependencies(grads): with tf.control_dependencies([old_to_new]): midpoint_op, q_runner_mid = optim.apply_gradients( list(zip(grads, pi_var_list))) do_midpoint = U.function(inputs, midpoint_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_old_from_new() # print(do_old_var()) do_update(ob_no, action_na, standardized_adv_n, 0.5) do_midpoint(ob_no, action_na, standardized_adv_n, 1.0) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) # if kl > desired_kl * 2: # logger.log("kl too high") # tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() # elif kl < desired_kl / 2: # logger.log("kl too low") # 
tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() # else: # logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, lr=0.03, momentum=0.9): ob_dim, ac_dim = policy.ob_dim, policy.ac_dim dbpi = GaussianMlpPolicy(ob_dim, ac_dim, 'dbp') oldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'oe') dboldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'doi') # with tf.variable_scope('dbp'): # with tf.variable_scope('oe'): # with tf.variable_scope('doi'): pi = policy do_std = U.function([], [pi.std_1a, pi.logstd_1a]) kloldnew = oldpi.pd.kl(pi.pd) dbkloldnew = dboldpi.pd.kl(dbpi.pd) dist = meankl = tf.reduce_mean(kloldnew) dbkl = tf.reduce_mean(dbkloldnew) obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(lr)), name='stepsize') inputs, loss, loss_sampled = policy.update_info var_list = [v for v in tf.global_variables() if "pi" in v.name] db_var_list = [v for v in tf.global_variables() if "dbp" in v.name] old_var_list = [v for v in tf.global_variables() if "oe" in v.name] db_old_var_list = [v for v in tf.global_variables() if "doi" in v.name] print(len(var_list), len(db_var_list), len(old_var_list), len(db_old_var_list)) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(old_var_list, var_list) ]) assign_db = U.function( [], [], updates=[ tf.assign(db, o) for (db, o) in zipsame(db_var_list, var_list) ] + [ tf.assign(dbold, dbnew) for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list) ]) assign_old_eq_newr = U.function( [], [], updates=[ tf.assign(newv, oldv) for (oldv, newv) in zipsame(old_var_list, var_list) ]) # assign_dbr = U.function([], [], updates= # [tf.assign(o, db) for (db, o) in zipsame(db_var_list, var_list)] + # [tf.assign(dbnew, dbold) for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list)]) klgrads = tf.gradients(dist, var_list) dbklgrads = tf.gradients(dbkl, db_var_list) p_grads = [tf.ones_like(v) for v in dbklgrads] get_flat = U.GetFlat(var_list) get_old_flat = U.GetFlat(old_var_list) set_from_flat = U.SetFromFlat(var_list) flat_tangent2 = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan2") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents2 = [] for shape in shapes: sz = U.intprod(shape) tangents2.append(tf.reshape(flat_tangent2[start:start + sz], shape)) start += sz gvp2 = tf.add_n([ tf.reduce_sum(g * tangent2) for (g, tangent2) in zipsame(dbklgrads, tangents2) ]) gvp2_grads = tf.gradients(gvp2, db_var_list) neg_term = tf.add_n([ tf.reduce_sum(g * tangent2) for (g, tangent2) in zipsame(gvp2_grads, tangents2) ]) / 2. 
ng1 = tf.gradients(neg_term, db_var_list) ng2 = tf.gradients(neg_term, db_old_var_list) neg_term_grads = [ a + b for (a, b) in zip(tf.gradients(neg_term, db_var_list), tf.gradients(neg_term, db_old_var_list)) ] neg_term = neg_term_grads # neg_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in neg_term_grads]) pos_term = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(gvp2_grads, p_grads) ]) pos_term_grads = [ a + b for (a, b) in zip(tf.gradients(pos_term, db_var_list), tf.gradients(pos_term, db_old_var_list)) ] pos_term_sum = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(pos_term_grads, tangents2) ]) pos_term_grads = tf.gradients(pos_term_sum, p_grads) pos_term = pos_term_grads # pos_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in pos_term_grads]) geo_term = [(p - n) * 0.5 for p, n in zip(pos_term, neg_term)] optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=momentum, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) grads = optim.compute_gradients(loss, var_list=pi_var_list) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) geo_term = [g1 + g2[0] for g1, g2 in zip(geo_term, grads)] geo_grads = list(zip(geo_term, var_list)) update_geo_op, q_runner_geo = optim.apply_gradients(geo_grads) do_update = U.function(inputs, update_op) inputs_tangent = list(inputs) + [flat_tangent2] do_update_geo = U.function(inputs_tangent, update_geo_op) do_get_geo_term = U.function(inputs_tangent, [ng1, ng2]) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner, q_runner_geo]: assert (qr != None) enqueue_threads.extend( qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) assign_old_eq_new() # set old parameter values to new parameter values assign_db() # Policy update do_update(ob_no, action_na, standardized_adv_n) # ft2 = get_flat() - get_old_flat() # assign_old_eq_newr() # assign back # gnp = do_get_geo_term(ob_no, action_na, standardized_adv_n, ft2) # def check_nan(bs): # return 
[~np.isnan(b).all() for b in bs] # print(gnp[0]) # print('.....asdfasdfadslfkadsjfaksdfalsdkfjaldskf') # print(gnp[1]) # do_update_geo(ob_no, action_na, standardized_adv_n, ft2) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) # if kl > desired_kl * 2: # logger.log("kl too high") # tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() # elif kl < desired_kl / 2: # logger.log("kl too low") # tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() # else: # logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) print(do_std()) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
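
# The gvp2 / gvp2_grads construction in the experimental variant above is the standard
# double-backprop trick: differentiating the inner product of a gradient with a fixed
# tangent yields a Hessian-vector product without materializing the Hessian. A toy TF1
# sketch on a quadratic whose Hessian is diag(2, 3) (names here are illustrative):
import tensorflow as tf

x = tf.Variable([1.0, 2.0])
f = 0.5 * (2.0 * x[0] ** 2 + 3.0 * x[1] ** 2)

v = tf.constant([1.0, 1.0])            # tangent vector
grad = tf.gradients(f, x)[0]           # [2 * x0, 3 * x1]
gvp = tf.reduce_sum(grad * v)          # gradient-vector product
hvp = tf.gradients(gvp, x)[0]          # Hessian-vector product = [2, 3]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(hvp))               # [2. 3.]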
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, save_path="./", save_after=200, load_path=None, save_rollouts=False): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(U.get_session(), coord=coord, start=True)) if load_path != None: saver = tf.train.Saver() saver.restore(U.get_session(), os.path.join(load_path, "model.ckpt")) obfilter_path = os.path.join(load_path, "obfilter.pkl") with open(obfilter_path, 'rb') as obfilter_input: obfilter = pickle.load(obfilter_input) print("Loaded Model") else: # create saver saver = tf.train.Saver() i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: #path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) if "jaco" in env.spec.id.lower(): path = rollout(env, policy, max_pathlength, animate=animate, obfilter=obfilter, save_rollouts=save_rollouts) goal_dist = np.linalg.norm(env.env.env.get_body_com("jaco_link_hand") \ - env.env.env.get_body_com("target")) if goal_dist <= 0.12: print("goal_dist {} ; episode added".format(goal_dist)) paths.append(path) else: path = rollout(env, policy, max_pathlength, animate=animate, obfilter=obfilter, save_rollouts=save_rollouts) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break if save_rollouts: # save the rollouts rollouts_path = os.path.join(load_path, "rollouts-v2.pkl") with open(rollouts_path, 'wb') as rollouts_output: pickle.dump(paths, rollouts_output, pickle.HIGHEST_PROTOCOL) sys.exit() # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) logp_n = np.concatenate([path["logp"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, 
oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") U.eval( tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5))) elif kl < desired_kl / 2: logger.log("kl too low") U.eval( tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5))) else: logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() # save model if necessary if i % save_after == 0: save(saver, obfilter, save_path) i += 1
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, save_model_with_prefix=None, restore_model_from_file=None, outdir="/tmp/rosrl/experiments/continuous/acktr/"): obfilter = ZFilter(env.observation_space.shape) # Risto change max_pathlength = env.max_episode_steps stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async_=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() """ Here we add a possibility to resume from a previously saved model if a model file is provided """ if restore_model_from_file: saver = tf.train.Saver() saver.restore(tf.get_default_session(), restore_model_from_file) logger.log("Loaded model from {}".format(restore_model_from_file)) # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 if save_model_with_prefix: # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/' summary_writer = tf.summary.FileWriter(outdir, graph=tf.get_default_graph()) while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() elif kl < desired_kl / 2: logger.log("kl too low") tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() else: logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", 
np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() """ Save the model at every itteration """ if save_model_with_prefix: if np.mean([path["reward"].sum() for path in paths]) > -50.0: # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/' summary = tf.Summary(value=[ tf.Summary.Value(tag="EpRewMean", simple_value=np.mean([ path["reward"].sum() for path in paths ])) ]) summary_writer.add_summary(summary, i) if not os.path.exists(outdir): os.makedirs(outdir) modelF = outdir + '/' + save_model_with_prefix + "_afterIter_" + str( i) + ".model" U.save_state(modelF) logger.log("Saved model to file :{}".format(modelF)) i += 1 coord.request_stop() coord.join(enqueue_threads)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, fname='./training.ckpt'): mean_logger = setup_logger("Mean Logger", "log/episode_mean.txt") # print("Filter shape: ", env.observation_space.shape) space = (env.observation_space.shape[0] * 2, ) obfilter = ZFilter(space) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') #0.03 inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() #changes if fname != None and tf.train.checkpoint_exists(fname): saver = tf.train.Saver() saver.restore(tf.get_default_session(), fname) logger.log("Model loaded from file {}".format(fname)) # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None, "QR is None") enqueue_threads.extend( qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 total_reward = float() while True: print("Timestep Number: %d of %d" % (timesteps_so_far, num_timesteps)) if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) #Save model every 100 iterations if fname != None and (i % 100 == 0): os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) logger.log("Model saved to file {}".format(fname)) env.seed() # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] terminal_rew = [] while True: path, temp_rew = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) terminal_rew.append(np.array(temp_rew)) n = pathlength(path) timesteps_this_batch += n if timesteps_this_batch > timesteps_per_batch: break timesteps_so_far += 1 # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() elif kl < desired_kl / 2: logger.log("kl too low") tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() else: logger.log("kl just right!") terminal_rew = np.array(terminal_rew) 
rew_mean = np.mean([path.sum() for path in terminal_rew]) rew_sem = np.std( [path.sum() / np.sqrt(len(terminal_rew)) for path in terminal_rew]) len_mean = np.mean([path.shape[0] for path in terminal_rew]) # rewList = [] # for path in paths: # trew = [] # rew_i = 0 # while True: # trew.append(path["reward"][rew_i]) # rew_i += 11 # if rew_i > (len(path["reward"])-1): # break # rewList.append( np.array(trew) ) # rewList = np.array(rewList) # rew_mean = np.mean([path.sum() for path in rewList]) # rew_sem = np.std([path.sum()/np.sqrt(len(rewList)) for path in rewList]) # len_mean = np.mean([path.shape[0] for path in rewList]) # rew_mean = np.mean([path["reward"].sum() for path in paths]) # rew_sem = np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths]) # len_mean = np.mean([pathlength(path) for path in paths]) total_reward += rew_mean logger.record_tabular("EpRewMean", rew_mean) logger.record_tabular("EpRewSEM", rew_sem) logger.record_tabular("EpLenMean", len_mean) logger.record_tabular("TotalRewardMean", total_reward) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() mean_logger.info( "Result for episode {} of {}: Sum: {}, Average: {}, Length: {}". format(timesteps_so_far, num_timesteps, rew_mean, rew_sem, len_mean)) i += 1 if fname != None: os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) logger.log("Model saved to file {}".format(fname)) env.seed() coord.request_stop() coord.join(enqueue_threads)
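
# Every learn() in this file passes observations through ZFilter before the policy sees
# them. ZFilter itself is not shown here; the sketch below is a stand-in with the assumed
# behaviour (running z-score using streaming Welford mean/variance). The real filter also
# exposes demean/destd/clip options.
import numpy as np

class RunningZFilterSketch:
    def __init__(self, shape, clip=10.0):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)          # running sum of squared deviations
        self.clip = clip

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n - 1, 1)) + 1e-8
        return np.clip((x - self.mean) / std, -self.clip, self.clip)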