def train(env_id, num_timesteps, seed):
    env = make_mujoco_env(env_id, seed)
    eval_env = make_mujoco_env(env_id, seed)
    sess = tf.InteractiveSession()
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    with tf.variable_scope("vf"):
        vf = NeuralNetValueFunction(ob_dim, ac_dim)
    with tf.variable_scope("pi"):
        policy = GaussianMlpPolicy(ob_dim, ac_dim)
    # log_dir = './result/%s' % (args.alg)
    learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97, timesteps_per_batch=2500,
          desired_kl=0.002, num_timesteps=num_timesteps, animate=False, eval_env=eval_env)
    env.close()
def train(env_id, num_timesteps, seed):
    import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=32, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    logger.log("========observation_space %s action_space %s" %
               (str(env.observation_space), str(env.action_space)))
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=1024,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()
def train(env_id, num_timesteps, seed=0):
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return cartpole_policy.CartPolePolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                              hid_size=6, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                             gamma=0.99, lam=0.95, schedule='linear',
                             )
    env.close()
    return pi
def train(env_id, num_timesteps, seed, model_path=None):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                             gamma=0.99, lam=0.95, schedule='linear',
                             )
    print(pi)
    env.close()
    if model_path:
        U.save_state(model_path)
    return pi
def teacher_replay():
    env = make_mujoco_env("Reacher-v2", 0)
    rets = []
    episode = 0
    ret = 0
    timestep = 1
    with tf.Session() as sess:
        teacher = TeacherAgent(env, sess, True)
        ob = env.reset()
        ob = np.expand_dims(ob, axis=0)
        print(ob)
        while episode < TOTAL_EPISODES:
            timestep += 1
            ac, _ = teacher.pi.act(False, ob)
            ob, reward, new, _ = env.step(ac)
            ob = np.expand_dims(ob, axis=0)
            ret += reward
            if new:
                print("********** Episode {0}, timestep {1} ***********".format(episode, timestep))
                print("return: {0}".format(ret))
                ob = env.reset()
                ob = np.expand_dims(ob, axis=0)
                rets.append(ret)
                ret = 0
                timestep = 1
                episode += 1
            env.render()
        # save results
        np.save(teacher_ret_path, rets)
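# `TeacherAgent` (and `StudentAgent` further below) are not defined in this section.
# The following is a purely hypothetical sketch of the interface the replay loops
# rely on: an object exposing `pi.act(stochastic, ob)` that returns an
# (action, value_prediction) pair, with policy weights restored from a checkpoint.
# The class name exists in the original code, but the constructor body and the
# default checkpoint path below are assumptions for illustration only.
class TeacherAgent(object):
    def __init__(self, env, sess, restore, checkpoint_path='teacher.ckpt'):  # hypothetical default path
        from baselines.ppo1 import mlp_policy
        import baselines.common.tf_util as U
        # build a standard ppo1 MLP policy over the env's observation/action spaces
        self.pi = mlp_policy.MlpPolicy(name='pi',
                                       ob_space=env.observation_space,
                                       ac_space=env.action_space,
                                       hid_size=64, num_hid_layers=2)
        U.initialize()
        if restore:
            U.load_state(checkpoint_path)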
def test_lstm_example():
    import tensorflow as tf
    from baselines.common import policies, models, cmd_util
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    # create vectorized environment
    venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])

    with tf.Session() as sess:
        # build policy based on lstm network with 128 units
        policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1)

        # initialize tensorflow variables
        sess.run(tf.global_variables_initializer())

        # prepare environment variables
        ob = venv.reset()
        state = policy.initial_state
        done = [False]
        step_counter = 0

        # run a single episode until the end (i.e. until done)
        while True:
            action, _, state, _ = policy.step(ob, S=state, M=done)
            ob, reward, done, _ = venv.step(action)
            step_counter += 1
            if done:
                break

        assert step_counter > 5
def train(env_id, num_timesteps, seed, save, clip_param, optim_stepsize, optim_batchsize, gamma, lam):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    ret = pposgd_simple.learn(env, policy_fn,
                              max_timesteps=num_timesteps,
                              timesteps_per_actorbatch=2048,
                              clip_param=clip_param, entcoeff=0.0,
                              optim_epochs=10, optim_stepsize=optim_stepsize,
                              optim_batchsize=optim_batchsize,
                              gamma=gamma, lam=lam, schedule='linear',
                              )
    env.close()
    np.savetxt(save, np.array([ret]))
def train(env_id, num_timesteps, seed, hid_size=64, num_hid_layers=2):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    assert env_id in (_MujocoEnvs + _RoboticsEnvs)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=hid_size, num_hid_layers=num_hid_layers)

    if env_id in _MujocoEnvs:
        env = make_mujoco_env(env_id, seed)
    elif env_id in _RoboticsEnvs:
        env = make_robotics_env(env_id, seed)
    else:
        raise ValueError('Environment `{0}` is not supported.'.format(env_id))

    # Not putting these params in config as we do not plan on changing them.
    optim_epochs = 10 if env_id in _MujocoEnvs else 5
    optim_batchsize = 64 if env_id in _MujocoEnvs else 256

    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=optim_epochs, optim_stepsize=3e-4,
                             optim_batchsize=optim_batchsize,
                             gamma=0.99, lam=0.95, schedule='linear',
                             )
    env.close()
    return pi
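# The `_MujocoEnvs` and `_RoboticsEnvs` lists referenced above are module-level
# globals that are not shown in this section. A minimal sketch of what they
# might contain; the exact contents are an assumption (the env ids below are
# standard Gym MuJoCo / Robotics environments, not taken from the original code):
_MujocoEnvs = ['Hopper-v2', 'Walker2d-v2', 'HalfCheetah-v2', 'Reacher-v2', 'Swimmer-v2']
_RoboticsEnvs = ['FetchReach-v1', 'FetchPush-v1', 'FetchSlide-v1', 'FetchPickAndPlace-v1']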
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
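# A minimal sketch of a driver for the MPI-based TRPO train() above. It assumes
# baselines' mujoco_arg_parser is available; the script would typically be
# launched under mpirun (e.g. `mpirun -np 4 python run_trpo_mujoco.py --env Hopper-v2`,
# a hypothetical script name) so that each rank derives its own workerseed.
def main():
    args = mujoco_arg_parser().parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)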
def train(env_id, num_timesteps, timesteps_per_actor_batch, seed, entropy_coeff, filepath):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=timesteps_per_actor_batch,
                        clip_param=0.2, entcoeff=entropy_coeff,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()

    # Save policy etc.
    saver = tf.train.Saver()
    saver.save(sess, filepath + "_final")
def train(env_id, num_timesteps, seed): """ Train PPO1 model for the Mujoco environment, for testing purposes :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2, sess=sess, placeholders=placeholders) env = make_mujoco_env(env_id, seed) pposgd_simple.learn(env, policy_fn, max_timesteps=num_timesteps, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear') env.close()
def train(env_id, num_timesteps, seed, alg, lr, momentum):
    env = make_mujoco_env(env_id, seed)

    if alg == 'sgd':
        from baselines.acktr.acktr_cont import learn
    elif alg == 'mid':
        from baselines.acktr.acktr_cont_midpoint import learn
    elif alg == 'geo':
        from baselines.acktr.acktr_cont_geo import learn
    else:
        raise ValueError('Unknown alg: {0}'.format(alg))

    nprocs = 4
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          intra_op_parallelism_threads=nprocs,
                                          inter_op_parallelism_threads=nprocs)):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        policy = GaussianMlpPolicy(ob_dim, ac_dim, 'pi')
        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97, timesteps_per_batch=2500,
              desired_kl=0.002, num_timesteps=num_timesteps, animate=False,
              lr=lr, momentum=momentum)
    env.close()
def train(env_id, num_timesteps, seed, save_file, load_file, render, stochastic):
    from baselines.ppo1 import mlp_policy
    import my_pposgd_simple as pposgd_simple
    sess = U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        sess=sess, save_file=save_file, load_file=load_file,
                        render=render, stochastic=stochastic)
    env.close()
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    from baselines.ppo1 import lstm_fc_policy
    from baselines.ppo1 import one_lstm_policy
    U.make_session(num_cpu=4).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default='checkpoints_best/Humanoid-v2-6914')
    parser.set_defaults(num_timesteps=int(2e8))
    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=123)

        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            time.sleep(0.01)
            if done:
                ob = env.reset()
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                             gamma=0.99, lam=0.95, schedule='linear',
                             )
    env.close()
    if model_path:
        U.save_state(model_path)
    return pi
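# `RewScale` is used by several of the Humanoid train() variants in this section
# but is not defined here. A minimal sketch, assuming it is the usual gym
# RewardWrapper that multiplies every reward by a constant factor (as in
# baselines' ppo1 run_humanoid.py):
import gym


class RewScale(gym.RewardWrapper):
    def __init__(self, env, scale):
        gym.RewardWrapper.__init__(self, env)
        self.scale = scale

    def reward(self, r):
        # scale the raw environment reward before it reaches the learner
        return r * self.scale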
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'policy'))
    parser.set_defaults(num_timesteps=int(2e7))
    args = parser.parse_args()

    if not args.play:
        # train the model
        train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(args.env, num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env(args.env, seed=0)

        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            print(ob, action)
            # env.render()
            if done:
                ob = env.reset()
def main(): """ Runs the test """ logger.configure() parser = mujoco_arg_parser() parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) parser.set_defaults(num_timesteps=int(2e7)) args = parser.parse_args() if not args.play: # train the model train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path) else: # construct the model object, load pre-trained model and render policy = train(num_timesteps=1, seed=args.seed) tf_util.load_state(args.model_path) env = make_mujoco_env('Humanoid-v2', seed=0) obs = env.reset() while True: action = policy.act(stochastic=False, obs=obs)[0] obs, _, done, _ = env.step(action) env.render() if done: obs = env.reset()
def main():
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    pi = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

    env = make_mujoco_env('Walker2d-v2', seed=0)
    ob = env.reset()
    while True:
        action = pi.act(stochastic=False, ob=ob)[0]
        ob, _, done, _ = env.step(action)
        env.render()
        time.sleep(0.01)
        if done:
            # restart from the fresh observation when the episode ends
            ob = env.reset()
def createEnv(env_id='CartPole-v1', seed=0):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    return make_mujoco_env(env_id, workerseed)
def train(num_timesteps, seed, model_path=None):
    env_id = 'Odie-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    env = MinotaurMonitor(env_id, env)

    def callback(locals, globals):
        if len(locals['rewbuffer']):
            meanlosses = locals['meanlosses']
            loss_names = locals['loss_names']
            data = {
                'iters_so_far': locals['iters_so_far'],
                'episode_reward_mean': np.mean(locals['rewbuffer']),
            }
            for (lossval, name) in zipsame(meanlosses, loss_names):
                data['loss_' + name] = float(lossval)
            env.post_data(data)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                             gamma=0.99, lam=0.95, schedule='linear',
                             callback=callback)
    env.close()
    if model_path:
        U.save_state(model_path)
    return pi
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
def train(env_id, num_timesteps, seed):
    env = make_mujoco_env(env_id, seed)
    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)
        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97, timesteps_per_batch=2500,
              desired_kl=0.002, num_timesteps=num_timesteps, animate=False)
        env.close()
def train(env_id, parameters, seed):
    env = make_mujoco_env(env_id, seed)
    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope('pi'):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)
        # with tf.variable_scope('policy_prev'):
        #     policy_previous = GaussianMlpPolicy(ob_dim, ac_dim, name='policy_prev')
        learn(env, policy=policy, vf=vf, parameters=parameters)
        env.close()
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy
    sess = U.make_session(num_cpu=4)
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=128, num_hid_layers=8)

    env = make_mujoco_env(env_id, seed)
    learn(sess, env_id, env, policy_fn,
          max_timesteps=num_timesteps,
          timesteps_per_actorbatch=2048,
          clip_param=0.2, entcoeff=0.0,
          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
          gamma=0.99, lam=0.95, schedule='linear'
          )
    env.close()
    tf.train.Saver().save(sess, directory + env_id + '/model.ckpt')
def train(num_timesteps, seed, model_path=None): """ Train PPO1 model for the Humanoid environment, for testing purposes :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training :param model_path: (str) path to the model """ env_id = 'Humanoid-v2' def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2, sess=sess, placeholders=placeholders) env = make_mujoco_env(env_id, seed) # parameters below were the best found in a simple random search # these are good enough to make humanoid walk, but whether those are # an absolute best or not is not certain env = RewScale(env, 0.1) policy = pposgd_simple.learn(env, policy_fn, max_timesteps=num_timesteps, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear') env.close() if model_path: tf_util.save_state(model_path) return policy
def main():
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    pi = train(args.env, num_timesteps=1, seed=args.seed, play=False)

    run = 'run-20180703_034952-1a24a6ik/'
    run_home = '/home/ubuntu/wandb_baselines/wandb/' + run  # run-20180702_220411-4xtopfue/
    model_path = run_home + 'humanoid_policy'
    # model_path = '/home/ubuntu/wandb_baselines/wandb/run-20180702_220411-4xtopfue/humanoid_policy'
    U.load_state(model_path)

    seed = random.randint(1, 1000)
    env = make_mujoco_env('RoboschoolHumanoid-v1', seed=seed)
    tot_r = 0
    ob = env.reset()
    runs = 0
    video = True
    if video:
        video_recorder = gym.wrappers.monitoring.video_recorder.VideoRecorder(
            env=env,
            base_path=os.path.join('/home/ubuntu/wandb_baselines', 'humanoid_run2_%i' % seed),
            enabled=True)

    while True:
        action = pi.act(stochastic=False, ob=ob)[0]
        ob, r, done, _ = env.step(action)
        if video:
            video_recorder.capture_frame()
        tot_r += r
        if done:
            ob = env.reset()
            runs += 1
            # if video:
            #     video_recorder.close()
            #     video_recorder = gym.wrappers.monitoring.video_recorder.VideoRecorder(
            #         env=env, base_path=os.path.join(run_home, 'humanoid_run_%i' % runs), enabled=True)
            print(tot_r)
            tot_r = 0
            print("@@@@@@@@@@@@@@@")
            if runs > 0:
                break
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    # enter a tensorflow session
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2,
                                    gaussian_fixed_var=True)

    env = make_mujoco_env(env_id, seed)
    pi, result, graph_data = pposgd_simple.learn(env, policy_fn,
                                                 max_timesteps=num_timesteps,
                                                 timesteps_per_actorbatch=2048,
                                                 clip_param=0.2, entcoeff=0.0,
                                                 optim_epochs=10, optim_stepsize=3e-4,
                                                 optim_batchsize=64,
                                                 gamma=0.99, lam=0.95, schedule='linear',
                                                 )
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(resultfile, 'a+') as file:
            file.write("{0} result: {1}\n".format(env_id, result))
        with open(graphfile, 'wb') as file:
            pickle.dump([np.array(l) for l in list(zip(*graph_data))], file)
    env.close()
def train(env_id, num_timesteps, seed, logdir):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )

    import os
    fname = os.path.join(logdir, 'final_state')
    saver = tf.train.Saver()
    saver.save(tf.get_default_session(), fname)
    env.close()
def student_replay(klts):
    env = make_mujoco_env("Reacher-v2", 0)
    timestep = 0
    ret = 0
    episode = 0
    with tf.Session() as sess:
        student = StudentAgent(env, sess, True, klts)
        ob = env.reset()
        while episode < TOTAL_EPISODES:
            # pi.act returns (action, value prediction); only the action is needed here
            ac, _ = student.pi.act(False, ob)
            ob, reward, new, _ = env.step(ac)
            print("********** Episode {0}, timestep {1} ***********".format(episode, timestep))
            print("reward: {0}".format(reward))
            ret += reward
            timestep += 1
            if new:
                print(ob)
                ob = env.reset()
                timestep = 0
                episode += 1
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    env = RewScale(env, 0.1)
    logger.log("NOTE: reward will be scaled by a factor of 0.1 in logged stats. "
               "Check the monitor for unscaled reward.")
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.1, entcoeff=0.0,
                             optim_epochs=10, optim_stepsize=1e-4, optim_batchsize=64,
                             gamma=0.99, lam=0.95, schedule='constant',
                             )
    env.close()
    if model_path:
        U.save_state(model_path)
    return pi
def train(env_id, num_timesteps, seed): env = make_mujoco_env(env_id, seed) with tf.Session(config=tf.ConfigProto()): ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] with tf.variable_scope("vf"): vf = NeuralNetValueFunction(ob_dim, ac_dim) with tf.variable_scope("pi"): policy = GaussianMlpPolicy(ob_dim, ac_dim) learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97, timesteps_per_batch=2500, desired_kl=0.002, num_timesteps=num_timesteps, animate=True) env.close()
def train(env_id, num_timesteps, seed): """ train an ACKTR model on atari :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ env = make_mujoco_env(env_id, seed) with tf.Session(config=tf.ConfigProto()): ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] with tf.variable_scope("vf"): value_fn = NeuralNetValueFunction(ob_dim, ac_dim) with tf.variable_scope("pi"): policy = GaussianMlpPolicy(ob_dim, ac_dim) learn(env, policy=policy, value_fn=value_fn, gamma=0.99, lam=0.97, timesteps_per_batch=2500, desired_kl=0.002, num_timesteps=num_timesteps, animate=False) env.close()
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))
    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            if done:
                ob = env.reset()