def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                     hid_size=32, num_hid_layers=2)
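# A minimal usage sketch (not part of the snippets above) showing how such a
# factory is consumed: baselines' ppo1/trpo learners call
# policy_fn(name, ob_space, ac_space) themselves. The "Hopper-v2" env id and
# every hyperparameter value below are illustrative assumptions.
import gym
from baselines.common import tf_util as U
from baselines.ppo1 import pposgd_simple

with U.make_session(num_cpu=1):
    env = gym.make("Hopper-v2")
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=1_000_000,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99, lam=0.95)
    env.close()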
def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name,
                     ob_space=env.observation_space,
                     ac_space=env.action_space,
                     hid_sizes=config["hid_sizes"],
                     gaussian_fixed_var=config["gaussian_fixed_var"],
                     use_obfilter=config["use_obfilter"])
def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                     hid_size=64,       # params["hid_size"]
                     num_hid_layers=1)  # params["num_hid_layers"]
def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):
    return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                     hid_size=32, num_hid_layers=2,
                     sess=sess, placeholders=placeholders)
def create_policy(name, problem):
    ob_space = problem.observation_space
    ac_space = problem.action_space
    return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                     hid_size=64, num_hid_layers=2)
def train(env_id, num_timesteps, hidden_size, num_hidden_layers, seed, rank):
    with U.make_session(num_cpu=3) as sess:
        worker_seed = seed + 10000 * rank
        set_global_seeds(worker_seed)
        # env = bench.Monitor(env, logger.get_dir() and
        #                     osp.join(logger.get_dir(), str(rank)))
        env = gym.make(env_id)
        try:
            env.seed(worker_seed)

            # Rendering and saving callback
            episode = 0

            def episode_callback(locals_, globals_):
                nonlocal episode
                episode += 1
                print("----- Episode {} -----".format(episode))
                env.render()
                if episode % 20 == 0:
                    save(sess)

            # Policy function
            policy_fn = lambda name, ob_space, ac_space: MlpPolicy(
                name=name,
                ob_space=env.observation_space,
                ac_space=env.action_space,
                hid_size=hidden_size,
                num_hid_layers=num_hidden_layers)

            # Learning
            trpo_mpi.learn(env, policy_fn,
                           timesteps_per_batch=1024,
                           max_kl=0.01,
                           cg_iters=10,
                           cg_damping=0.1,
                           max_timesteps=num_timesteps,
                           gamma=0.99,
                           lam=0.98,
                           vf_iters=5,
                           vf_stepsize=1e-3,
                           callback=episode_callback)
        finally:
            env.close()
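# `save(sess)` above is not defined in this excerpt; a plausible helper
# (an assumption, not the original implementation) would checkpoint the
# TensorFlow graph with tf.train.Saver:
import os
import tensorflow as tf

def save(sess, checkpoint_dir="checkpoints"):
    # Hypothetical checkpoint helper: writes all graph variables so they
    # can later be restored, e.g. by the load() function below.
    os.makedirs(checkpoint_dir, exist_ok=True)
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(checkpoint_dir, "model.ckpt"))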
def policy_fn(name, ob_space, ac_space): """Create policy for baselines. Args: name (str): Policy name. ob_space (gym.spaces.Box) : Observation space. ac_space (gym.spaces.Box) : Action space. Returns: baselines.ppo1.mlp_policy: MLP policy for baselines. """ return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=hyper_parameters['hidden_sizes'][0], num_hid_layers=len(hyper_parameters['hidden_sizes']))
def load(checkpoint, env_id, hidden_size, num_hidden_layers):
    with U.single_threaded_session() as sess:
        env = gym.make(env_id)
        try:
            policy = MlpPolicy(name="pi",
                               ob_space=env.observation_space,
                               ac_space=env.action_space,
                               hid_size=hidden_size,
                               num_hid_layers=num_hidden_layers)
            saver = tf.train.Saver()
            saver.restore(sess, checkpoint)
            seg_gen = traj_segment_generator(policy, env, 1024,
                                             True,  # stochastic
                                             human_render=True)
            # Generate trajectory segments until stopped
            for _ in seg_gen:
                pass
        finally:
            env.close()
def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name,
                     ob_space=env.observation_space,
                     ac_space=env.action_space,
                     hid_size=32, num_hid_layers=2)
def __init__(self, name, ob_space, ac_space, hid_size, num_hid_layers):
    MlpPolicy.__init__(self, name, ob_space, ac_space,
                       hid_size, num_hid_layers)
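# The subclass above fixes the MlpPolicy constructor signature. A typical
# use (assumed here, not shown in the original) is to bake the network size
# into the class so it can be handed to a learner as the policy factory:
class SmallMlpPolicy(MlpPolicy):
    def __init__(self, name, ob_space, ac_space):
        MlpPolicy.__init__(self, name, ob_space, ac_space,
                           hid_size=32, num_hid_layers=2)

# trpo_mpi.learn(env, SmallMlpPolicy, ...) then calls
# SmallMlpPolicy(name, ob_space, ac_space) internally.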
def policy_fn(name, ob_space, ac_space):
    # MlpPolicy supports only uniform layer widths, so all entries of
    # hidden_sizes are assumed equal to hidden_sizes[0]
    return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                     hid_size=hidden_sizes[0],
                     num_hid_layers=len(hidden_sizes))
def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name,
                     ob_space=env.observation_space,
                     ac_space=env.action_space,
                     hid_size=rl_params['pi_hidden_size'],
                     num_hid_layers=rl_params['pi_hidden_layer'])
def __init__(self, name, *args, session=None, **kwargs):
    MlpPolicy.__init__(self, name, *args, **kwargs)
    NetworkSaverMLP.__init__(self, network_id=name)
import numpy as np
from baselines.common import tf_util as U
from baselines.ppo1.mlp_policy import MlpPolicy

def animate(i):
    # Matplotlib animation update: redraw the arm and the elapsed-time label
    # print(x)
    # print(y)
    line.set_data(x, y)
    time = float(i) * 0.0005
    time_text.set_text('time = %.2f' % time)
    return line, time_text

U.make_session(num_cpu=1).__enter__()
env = TwoDofArmEnv(ActiveMuscles='agonist',
                   actionParameterization=True,
                   sim_length=0.2)
pol = MlpPolicy("pi", env.observation_space, env.action_space,
                hid_size=64, num_hid_layers=2)
U.initialize()
U.load_state('reacher')

o = env.reset()
time = 0.
data = np.empty((1, 8))
while time < 5.0:
    print(time)
    ac, vpred = pol.act(False, o)  # stochastic=False: take the mean action
    o, r, d, look = env.step(ac)
    data = np.append(data, look['data'], axis=0)
def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name,
                     ob_space=env.observation_space,
                     ac_space=env.action_space,
                     hid_sizes=[32, 32])