def render(hid_size, load_path, video_path, env_id, seed, hist_len, block_high, give_state):
    """Roll out a saved CompatibleMlpPolicy deterministically and write the rendered frames to <video_path>result.mp4."""
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space, ob_name):
        return CompatibleMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                   hid_size=hid_size, num_hid_layers=2, ob_name=ob_name)

    env = make_control_env(env_id, seed, hist_len=hist_len, block_high=block_high,
                           version0=True, give_state=give_state)

    # Build the policy and restore its weights from the checkpoint.
    pi = policy_fn("pi", env.observation_space, env.action_space, ob_name="ob")
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess, load_path)

    # Run a single deterministic episode and collect the rendered frames.
    ob = env.reset()
    frames = []
    while True:
        frame = env.unwrapped.render(mode='rgb_array')
        frames.append(frame)
        ac, vpred = pi.act(stochastic=False, ob=ob)
        print(ob)  # debug: log the current observation
        ob, rwd, done, _ = env.step(ac)
        if done:
            imageio.mimsave(video_path + 'result.mp4', frames, fps=20)
            break
    env.close()
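# A minimal sketch of how render() might be invoked. The checkpoint and output
# paths, env id, and hyperparameters below are placeholder assumptions for
# illustration, not values taken from this repository.
if __name__ == "__main__":
    render(hid_size=32,
           load_path="checkpoints/00976.ckpt",      # hypothetical checkpoint path
           video_path="videos/",                     # 'result.mp4' is appended inside render()
           env_id="YourControlEnv-v0",               # placeholder env id
           seed=0, hist_len=4, block_high=0.5, give_state=True)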
def train_copos(env_id, num_timesteps, seed, trial, hist_len, block_high, nsteps, method, hid_size, give_state, vf_iters):
    """Train COPOS on the control environment; the entropy bound beta is set automatically from the initial policy entropy."""
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    workerseed = seed * 10000

    def policy_fn(name, ob_space, ac_space):
        return CompatibleMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                   hid_size=hid_size, num_hid_layers=2)
        # return CompatiblecnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
        #                            hid_size=hid_size, num_hid_layers=2)

    set_global_seeds(workerseed)
    # env = gym.make(env_id)
    env = make_control_env(env_id, seed, hist_len=hist_len, block_high=block_high,
                           version0=True, give_state=give_state)
    env.seed(workerseed)
    timesteps_per_batch = nsteps

    beta = -1
    if beta < 0:
        nr_episodes = num_timesteps // timesteps_per_batch
        # Automatically compute beta based on initial entropy and number of iterations.
        tmp_pi = policy_fn("tmp_pi", env.observation_space, env.action_space)
        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1,) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob})
        beta = 2 * entropy / nr_episodes
        print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes))
        print("Automatically set beta: " + str(beta))

    copos_mpi.learn(env, policy_fn, timesteps_per_batch=timesteps_per_batch, epsilon=0.01,
                    beta=beta, cg_iters=10, cg_damping=0.1,
                    max_timesteps=num_timesteps, gamma=0.99,
                    lam=0.98, vf_iters=vf_iters, vf_stepsize=1e-3, trial=trial, method=method)
    env.close()
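# Standalone sketch of the automatic beta rule used above, without TensorFlow.
# For a diagonal Gaussian policy the entropy is
#   H = 0.5 * ac_dim * log(2 * pi * e) + sum(log_std),
# and beta is set to 2 * H divided by the number of batches
# (num_timesteps // timesteps_per_batch). The default log_std of zero below is
# an assumption about how the policy is initialised, not taken from this code.
import numpy as np

def auto_beta(ac_dim, num_timesteps, timesteps_per_batch, log_std=0.0):
    nr_episodes = num_timesteps // timesteps_per_batch
    entropy = 0.5 * ac_dim * np.log(2.0 * np.pi * np.e) + ac_dim * log_std
    return 2.0 * entropy / nr_episodes

# e.g. auto_beta(ac_dim=1, num_timesteps=1_000_000, timesteps_per_batch=2048)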
def train(env_id, num_timesteps, seed, num_trials=5):
    """Train PPO (pporocksample variant) on the control environment for several trials."""
    from baselines.ppo1 import mlp_policy, ppo_guided, pporocksample, ppo_guided2, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
        # return mlp_policy.MlpBetaPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
        #                                 hid_size=64, num_hid_layers=2)

    for i_trial in range(num_trials):
        env = make_control_env(env_id, seed)

        # normalized history (n_steps=15), field-vision full-position rocksample
        # env = make_rocksample_env(seed, map_name="5x7", observation_type="field_vision_full_pos",
        #                           observation_noise=True, n_steps=15)

        # normalized fully observable rocksample
        # env = make_rocksample_env(seed, map_name="5x7", observation_type="fully_observable",
        #                           observation_noise=False, n_steps=15)

        # guided variant of the normalized fully observable rocksample, history length 15
        # genv = make_control_env(env_id, seed)
        #
        # ppo_guided.learn(env, genv, i_trial, policy_fn,
        #                  max_iters=1000,
        #                  timesteps_per_actorbatch=5000,
        #                  clip_param=0.2, entp=0.5,
        #                  optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=32,
        #                  gamma=0.99, lam=0.95, schedule='linear', useentr=False, retrace=False)

        # pposgd_simple.learn(env, i_trial, policy_fn,
        #                     max_iters=1000,
        #                     timesteps_per_actorbatch=2048,
        #                     clip_param=0.2, entcoeff=0.5,
        #                     optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=32,
        #                     gamma=0.99, lam=0.95, schedule='linear')

        pporocksample.learn(env, i_trial, policy_fn,
                            max_iters=800,
                            timesteps_per_actorbatch=2048,
                            clip_param=0.2, entp=0.3,
                            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=32,
                            gamma=0.99, lam=0.95, schedule='linear', useentr=True, retrace=False)
        env.close()
def train(env_id, num_timesteps, seed):
    """Train plain PPO (pposgd_simple) on the control environment for a single trial."""
    from baselines.ppo1 import mlp_policy, pposgd_simple, ppo_guided
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=32, num_hid_layers=2)

    # env = make_mujoco_env(env_id, seed)
    env = make_control_env(env_id, seed)
    i_trial = 1

    # genv = make_control_env(env_id, seed)
    #
    # ppo_guided.learn(env, genv, i_trial, policy_fn,
    #                  max_iters=100,
    #                  timesteps_per_actorbatch=2048,
    #                  clip_param=0.2, entp=0.5,
    #                  optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
    #                  gamma=0.99, lam=0.95, schedule='linear', useentr=False, retrace=False)

    pposgd_simple.learn(env, i_trial, policy_fn,
                        max_iters=100,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()
def train_trpo(env_id, num_timesteps, seed, hist_len, block_high, nsteps, hid_size, give_state):
    """Train TRPO (trpo_mpi) on the control environment."""
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hid_size, num_hid_layers=2)

    set_global_seeds(workerseed)
    # NOTE: give_state is hard-coded to False here; the give_state argument is not forwarded.
    env = make_control_env(env_id, workerseed, hist_len=hist_len, block_high=block_high,
                           not_guided=True, give_state=False)
    env.seed(workerseed)
    timesteps_per_batch = nsteps

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=timesteps_per_batch,
                   max_kl=0.01, cg_iters=10, cg_damping=0.1,
                   max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
def train(env_id, num_timesteps, seed, num_trials=1):
    """Train guided TRPO (trpo_guided) for several trials, seeding each MPI worker separately."""
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    # if rank == 0:
    #     logger.configure()
    # else:
    #     logger.configure(format_strs=[])
    #     logger.set_level(logger.DISABLED)

    def policy_fn(name, ob_name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_name=ob_name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    for i_trial in range(num_trials):
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        # env = make_mujoco_env(env_id, workerseed)
        env = make_control_env(env_id, workerseed)
        trpo_guided.learn(env, policy_fn, timesteps_per_batch=1024,
                          max_kl=0.01, cg_iters=20, cg_damping=0.1,
                          max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                          vf_iters=5, vf_stepsize=1e-3, i_trial=i_trial)
        env.close()
def train(env_id, num_timesteps, seed, trial, hist_len):
    """Train a GaussianMlpPolicy with a NeuralNetValueFunction baseline (ACKTR-style continuous-control setup)."""
    env = make_control_env(env_id, seed, hist_len=hist_len)
    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
              timesteps_per_batch=2048, desired_kl=0.002, trial=trial,
              num_timesteps=num_timesteps, animate=False)
        env.close()
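# Illustrative sketch (an assumption, not the exact baselines implementation) of
# how a desired_kl target like the one passed to learn() above can drive an
# adaptive step size: shrink the step when the observed KL between consecutive
# policies overshoots the target, grow it when the KL stays well below it.
def adapt_stepsize(stepsize, observed_kl, desired_kl=0.002,
                   factor=1.5, min_stepsize=1e-8, max_stepsize=1e-2):
    if observed_kl > desired_kl * 2:
        return max(min_stepsize, stepsize / factor)   # policy moved too far: slow down
    elif observed_kl < desired_kl / 2:
        return min(max_stepsize, stepsize * factor)   # policy barely moved: speed up
    return stepsize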
def train_copos(env_id, num_timesteps, seed, hist_len, block_high, nsteps, hid_size, give_state):
    """Train COPOS (copos_mpi) on the control environment with automatically computed beta; a checkpoint-evaluation block is kept below, commented out."""
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return CompatibleMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                   hid_size=hid_size, num_hid_layers=2)

    set_global_seeds(workerseed)
    # NOTE: give_state is hard-coded to True here; the give_state argument is not forwarded.
    env = make_control_env(env_id, workerseed, hist_len=hist_len, block_high=block_high,
                           not_guided=True, give_state=True)
    env.seed(workerseed)
    timesteps_per_batch = nsteps

    # TODO: The following lines are used for evaluation of a saved checkpoint.
    # pi = policy_fn('pi', env.observation_space, env.action_space)
    # sess.run(tf.global_variables_initializer())
    # saver = tf.train.Saver()
    # saver.restore(sess, '/work/scratch/rz97hoku/ReinforcementLearning/tmp/hist4/copos-ratio/copos-ratio-1-11-05-20-11/checkpoints/00976.ckpt')
    # for m in range(100):
    #     ob = env.reset()
    #     ep_rwd = []
    #     while True:
    #         ac, _ = pi.act(stochastic=False, ob=ob)
    #         ob, rew, new, _ = env.step(ac)
    #         ep_rwd.append(rew)
    #         if new:
    #             break
    #     logger.record_tabular("Reward", np.sum(ep_rwd))
    #     logger.record_tabular("Episode", m)
    #     logger.dump_tabular()

    beta = -1
    if beta < 0:
        nr_episodes = num_timesteps // timesteps_per_batch
        # Automatically compute beta based on initial entropy and number of iterations.
        tmp_pi = policy_fn("tmp_pi", env.observation_space, env.action_space)
        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1,) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob})
        beta = 2 * entropy / nr_episodes
        print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes))
        print("Automatically set beta: " + str(beta))

    # copos_mpi.learn(env, policy_fn, timesteps_per_batch=timesteps_per_batch, epsilon=0.01,
    #                 beta=beta, cg_iters=10, cg_damping=0.1, sess=sess,
    #                 max_timesteps=num_timesteps, gamma=0.99,
    #                 lam=0.98, vf_iters=vf_iters, vf_stepsize=1e-3, trial=trial, method=method)
    copos_mpi.learn(env, policy_fn, timesteps_per_batch=timesteps_per_batch, epsilon=0.01,
                    beta=beta, cg_iters=10, cg_damping=0.1,
                    max_timesteps=num_timesteps, gamma=0.99,
                    lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
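# Sketch of the evaluation loop that is commented out inside train_copos(),
# factored into a reusable helper. It assumes an already-entered TF session and
# the same CompatibleMlpPolicy / make_control_env setup as above; the checkpoint
# path and episode count are caller-supplied rather than hard-coded.
def evaluate_checkpoint(sess, env, policy_fn, ckpt_path, n_episodes=100):
    pi = policy_fn('pi', env.observation_space, env.action_space)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess, ckpt_path)
    for m in range(n_episodes):
        ob = env.reset()
        ep_rwd = []
        while True:
            # Deterministic rollout of the restored policy.
            ac, _ = pi.act(stochastic=False, ob=ob)
            ob, rew, new, _ = env.step(ac)
            ep_rwd.append(rew)
            if new:
                break
        logger.record_tabular("Reward", np.sum(ep_rwd))
        logger.record_tabular("Episode", m)
        logger.dump_tabular()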