def test_dist(): np.random.seed(0) p1, p2, p3 = (np.random.randn(3, 1), np.random.randn(4, 1), np.random.randn(5, 1)) q1, q2, q3 = (np.random.randn(6, 1), np.random.randn(7, 1), np.random.randn(8, 1)) # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) comm = MPI.COMM_WORLD assert comm.Get_size() == 2 if comm.Get_rank() == 0: x1, x2, x3 = p1, p2, p3 elif comm.Get_rank() == 1: x1, x2, x3 = q1, q2, q3 else: assert False rms = RunningMeanStd(epsilon=0.0, shape=(1, )) U.initialize() rms.update(x1) rms.update(x2) rms.update(x3) bigvec = np.concatenate([p1, p2, p3, q1, q2, q3]) def checkallclose(x, y): print(x, y) return np.allclose(x, y) assert checkallclose(bigvec.mean(axis=0), U.eval(rms.mean)) assert checkallclose(bigvec.std(axis=0), U.eval(rms.std))
def validate_probtype(probtype, pdparam): N = 100000 # Check to see if mean negative log likelihood == differential entropy Mval = np.repeat(pdparam[None, :], N, axis=0) M = probtype.param_placeholder([N]) X = probtype.sample_placeholder([N]) pd = probtype.pdclass()(M) calcloglik = U.function([X, M], pd.logp(X)) calcent = U.function([M], pd.entropy()) Xval = U.eval(pd.sample(), feed_dict={M:Mval}) logliks = calcloglik(Xval, Mval) entval_ll = - logliks.mean() #pylint: disable=E1101 entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 entval = calcent(Mval).mean() #pylint: disable=E1101 assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] M2 = probtype.param_placeholder([N]) pd2 = probtype.pdclass()(M2) q = pdparam + np.random.randn(pdparam.size) * 0.1 Mval2 = np.repeat(q[None, :], N, axis=0) calckl = U.function([M, M2], pd.kl(pd2)) klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 logliks = calcloglik(Xval, Mval2) klval_ll = - entval - logliks.mean() #pylint: disable=E1101 klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
def test_dist(): np.random.seed(0) p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) comm = MPI.COMM_WORLD assert comm.Get_size()==2 if comm.Get_rank()==0: x1,x2,x3 = p1,p2,p3 elif comm.Get_rank()==1: x1,x2,x3 = q1,q2,q3 else: assert False rms = RunningMeanStd(epsilon=0.0, shape=(1,)) U.initialize() rms.update(x1) rms.update(x2) rms.update(x3) bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) def checkallclose(x,y): print(x,y) return np.allclose(x,y) assert checkallclose( bigvec.mean(axis=0), U.eval(rms.mean) ) assert checkallclose( bigvec.std(axis=0), U.eval(rms.std) )
def test_runningmeanstd(): for (x1, x2, x3) in [ (np.random.randn(3), np.random.randn(4), np.random.randn(5)), (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)), ]: rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) U.initialize() x = np.concatenate([x1, x2, x3], axis=0) ms1 = [x.mean(axis=0), x.std(axis=0)] rms.update(x1) rms.update(x2) rms.update(x3) ms2 = U.eval([rms.mean, rms.std]) assert np.allclose(ms1, ms2)
def test_runningmeanstd(): for (x1, x2, x3) in [ (np.random.randn(3), np.random.randn(4), np.random.randn(5)), (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), ]: rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) U.initialize() x = np.concatenate([x1, x2, x3], axis=0) ms1 = [x.mean(axis=0), x.std(axis=0)] rms.update(x1) rms.update(x2) rms.update(x3) ms2 = U.eval([rms.mean, rms.std]) assert np.allclose(ms1, ms2)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, resume, logdir, agentName, num_timesteps, animate=False, callback=None, desired_kl=0.002): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1.0 - 0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(U.get_session(), coord=coord, start=True)) timesteps_so_far = 0 saver = tf.train.Saver(max_to_keep=10) if resume > 0: saver.restore( tf.get_default_session(), os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume))) ob_filter_path = os.path.join(os.path.abspath(logdir), "{}-{}".format('obfilter', resume)) with open(ob_filter_path, 'rb') as ob_filter_input: obfilter = pickle.load(ob_filter_input) print("Loaded observation filter") iters_so_far = resume print('logdir = ', logdir) logF = open(os.path.join(logdir, 'log.txt'), 'a') logF2 = open(os.path.join(logdir, 'log_it.txt'), 'a') logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a') while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % iters_so_far) save_interval = 5 # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (iters_so_far % save_interval == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2.0: logger.log("kl too high") U.eval( tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5))) elif kl < desired_kl / 2.0: logger.log("kl too low") U.eval( tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5))) else: logger.log("kl just right!") rew_mean = np.mean([path["reward"].sum() for path in paths]) logger.record_tabular("EpRewMean", rew_mean) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) logF.write(str(rew_mean) + "\n") logF2.write(str(iters_so_far) + "," + str(rew_mean) + "\n") # json.dump(combined_stats, logStats) logF.flush() logF2.flush() # logStats.flush() if save_interval and (iters_so_far % save_interval == 0 or iters_so_far == 1): saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=iters_so_far) ob_filter_path = os.path.join( os.path.abspath(logdir), "{}-{}".format('obfilter', iters_so_far)) with open(ob_filter_path, 'wb') as ob_filter_output: pickle.dump(obfilter, ob_filter_output, pickle.HIGHEST_PROTOCOL) if callback: callback() logger.dump_tabular() iters_so_far += 1 coord.request_stop() coord.join(enqueue_threads)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, fname=None): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() if fname != None and tf.train.checkpoint_exists(fname): load_result = U.load_state(fname) logger.log("Model loaded from file {}".format(fname)) # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(get_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Save model every 100 iterations if fname != None and (i % 100 == 99): U.save_state(fname) logger.log("Model saved to file {}".format(fname)) env.seed() # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") U.eval( tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5))) elif kl < desired_kl / 2: logger.log("kl too low") U.eval( tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5))) else: logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, optimizer="adam", desired_kl=0.002): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend(qr.create_threads(U.get_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************"%i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) logp_n = np.concatenate([path["logp"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") U.eval(tf.assign(stepsize, stepsize / 1.5)) elif kl < desired_kl / 2: logger.log("kl too low") U.eval(tf.assign(stepsize, stepsize * 1.5)) else: logger.log("kl just right!") logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() i += 1
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, save_path="./", save_after=200, load_path=None, save_rollouts=False): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(U.get_session(), coord=coord, start=True)) if load_path != None: saver = tf.train.Saver() saver.restore(U.get_session(), os.path.join(load_path, "model.ckpt")) obfilter_path = os.path.join(load_path, "obfilter.pkl") with open(obfilter_path, 'rb') as obfilter_input: obfilter = pickle.load(obfilter_input) print("Loaded Model") else: # create saver saver = tf.train.Saver() i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: #path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) if "jaco" in env.spec.id.lower(): path = rollout(env, policy, max_pathlength, animate=animate, obfilter=obfilter, save_rollouts=save_rollouts) goal_dist = np.linalg.norm(env.env.env.get_body_com("jaco_link_hand") \ - env.env.env.get_body_com("target")) if goal_dist <= 0.12: print("goal_dist {} ; episode added".format(goal_dist)) paths.append(path) else: path = rollout(env, policy, max_pathlength, animate=animate, obfilter=obfilter, save_rollouts=save_rollouts) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break if save_rollouts: # save the rollouts rollouts_path = os.path.join(load_path, "rollouts-v2.pkl") with open(rollouts_path, 'wb') as rollouts_output: pickle.dump(paths, rollouts_output, pickle.HIGHEST_PROTOCOL) sys.exit() # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) logp_n = np.concatenate([path["logp"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") U.eval( tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5))) elif kl < desired_kl / 2: logger.log("kl too low") U.eval( tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5))) else: logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() # save model if necessary if i % save_after == 0: save(saver, obfilter, save_path) i += 1