def train(env, policy, seed, njobs=1, **alg_args):

    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab\.(\w+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env(seed=0):
            def _thunk():
                env_rllab = Rllab2GymWrapper(env_rllab_class())
                env_rllab.seed(seed)
                return env_rllab
            return _thunk

        parallel_env = SubprocVecEnv([make_env(i + seed) for i in range(njobs)])
        # Used later
        env_type = 'rllab'
    else:
        # Normal gym, get if Atari or not.
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."

        # Define the correct env maker
        if env_type == 'atari':
            # Atari, custom env creation
            def make_env(seed=0):
                def _thunk():
                    _env = make_atari(env)
                    _env.seed(seed)
                    return wrap_deepmind(_env)
                return _thunk

            parallel_env = VecFrameStack(
                SubprocVecEnv([make_env(i + seed) for i in range(njobs)]), 4)
        else:
            # Not atari, standard env creation
            def make_env(seed=0):
                def _thunk():
                    _env = gym.make(env)
                    _env.seed(seed)
                    return _env
                return _thunk

            parallel_env = SubprocVecEnv([make_env(i + seed) for i in range(njobs)])

    # Create the policy
    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    if policy == 'linear' or policy == 'nn':
        def make_policy(name, ob_space, ac_space):
            return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             hid_size=hid_size, num_hid_layers=num_hid_layers,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=tf.contrib.layers.xavier_initializer(),
                             output_W_init=tf.contrib.layers.xavier_initializer())
    elif policy == 'cnn':
        def make_policy(name, ob_space, ac_space):
            return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=tf.contrib.layers.xavier_initializer(),
                             output_W_init=tf.contrib.layers.xavier_initializer())
    else:
        raise Exception('Unrecognized policy type.')

    # Use all available cores when possible, otherwise fall back to njobs
    try:
        affinity = len(os.sched_getaffinity(0))
    except Exception:
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    pois2.learn(parallel_env, make_policy, **alg_args)
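
# Illustrative only: one possible way to invoke the train() above. The environment id,
# policy type, and hyperparameter values below are assumptions, not values taken from
# the original scripts; any extra keyword arguments are forwarded to pois2.learn
# through **alg_args.
#
#   train(env='rllab.cartpole', policy='linear', seed=0, njobs=4,
#         n_episodes=100, horizon=500, max_iters=500, gamma=1.0, delta=0.99)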
def train(env, max_iters, num_episodes, horizon, iw_method, iw_norm, natural,
          bound, delta, gamma, seed, policy, max_offline_iters, njobs=1):

    # Declare the env and create the vectorized env
    def make_env(seed=0):
        def _thunk():
            _env = gym.make(env)
            _env.seed(seed)
            return _env
        return _thunk

    parallel_env = SubprocVecEnv([make_env(i + seed) for i in range(njobs)], terminating=True)

    # Create the policy
    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    def make_policy(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hid_size, num_hid_layers=num_hid_layers,
                         gaussian_fixed_var=True, use_bias=False, use_critic=False,
                         hidden_W_init=tf.contrib.layers.xavier_initializer(),
                         output_W_init=tf.contrib.layers.xavier_initializer())

    # Use all available cores when possible, otherwise fall back to njobs
    try:
        affinity = len(os.sched_getaffinity(0))
    except Exception:
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    pois2.learn(parallel_env, make_policy,
                n_episodes=num_episodes,
                max_iters=max_iters,
                horizon=horizon,
                gamma=gamma,
                delta=delta,
                use_natural_gradient=natural,
                iw_method=iw_method,
                iw_norm=iw_norm,
                bound=bound,
                save_weights=True,
                center_return=True,
                render_after=None,
                max_offline_iters=max_offline_iters)
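
# Illustrative only: a minimal command-line entry point that could drive the train()
# defined above. The flag names mirror its parameters, but the default values are
# assumptions for demonstration, not defaults taken from the original script.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--policy', type=str, default='linear', choices=['linear', 'nn'])
    parser.add_argument('--max_iters', type=int, default=500)
    parser.add_argument('--num_episodes', type=int, default=100)
    parser.add_argument('--horizon', type=int, default=500)
    parser.add_argument('--iw_method', type=str, default='is')        # assumed value
    parser.add_argument('--iw_norm', type=str, default='none')        # assumed value
    parser.add_argument('--natural', action='store_true')
    parser.add_argument('--bound', type=str, default='max-d2')        # assumed value
    parser.add_argument('--delta', type=float, default=0.99)
    parser.add_argument('--gamma', type=float, default=1.0)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--max_offline_iters', type=int, default=10)
    parser.add_argument('--njobs', type=int, default=1)
    args = parser.parse_args()
    train(**vars(args))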