Example 1
def train(env, policy, policy_init, n_episodes, horizon, seed, njobs=1, save_weights=False, **alg_args):

    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match('rllab.(\S+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)
        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            _env = Rllab2GymWrapper(env_rllab)
            return _env
        # Used later
        env_type = 'rllab'
    else:
        # Normal gym, get if Atari or not.
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari, custom env creation
            def make_env():
                _env = make_atari(env)
                return wrap_deepmind(_env)
        else:
            # Not atari, standard env creation
            def make_env():
                # Plain gym environment, no rllab wrapper needed
                _env = gym.make(env)
                return _env

    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    if policy_init == 'xavier':
        policy_initializer = tf.contrib.layers.xavier_initializer()
    elif policy_init == 'zeros':
        policy_initializer = U.normc_initializer(0.0)
    else:
        raise Exception('Unrecognized policy initializer.')

    if policy == 'linear' or policy == 'nn':
        def make_policy(name, ob_space, ac_space):
            return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             hid_size=hid_size, num_hid_layers=num_hid_layers, gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=policy_initializer, output_W_init=policy_initializer)
    elif policy == 'cnn':
        def make_policy(name, ob_space, ac_space):
            return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         gaussian_fixed_var=True, use_bias=False, use_critic=False,
                         hidden_W_init=policy_initializer,
                         output_W_init=policy_initializer)
    else:
        raise Exception('Unrecognized policy type.')

    sampler = ParallelSampler(make_policy, make_env, n_episodes, horizon, True, n_workers=njobs, seed=seed)

    try:
        affinity = len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is not available on all platforms; fall back to njobs
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)

    gym.logger.setLevel(logging.WARN)

    pois.learn(make_env, make_policy, n_episodes=n_episodes, horizon=horizon,
                sampler=sampler, save_weights=save_weights, **alg_args)

    sampler.close()
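A minimal invocation sketch for the function above. The module name and all argument values are illustrative assumptions, not taken from the original script; extra keyword arguments such as gamma or max_iters are simply forwarded to pois.learn through **alg_args, which also defines their valid values.

# Hypothetical usage of train(); module name and argument values are placeholders.
from run_script import train  # hypothetical module name

train(env='rllab.swimmer',    # any 'rllab.<name>' env or a regular gym id
      policy='linear',        # 'linear', 'nn' or 'cnn'
      policy_init='xavier',   # 'xavier' or 'zeros'
      n_episodes=100,
      horizon=500,
      seed=42,
      njobs=4,
      save_weights=False,
      gamma=1.0,              # forwarded to pois.learn via **alg_args
      max_iters=500)          # forwarded to pois.learn via **alg_args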
Example 2
def train(env,
          num_episodes,
          horizon,
          iw_method,
          iw_norm,
          natural,
          bound,
          delta,
          seed,
          policy,
          max_offline_iters,
          gamma,
          center_return,
          clipping=False,
          njobs=1,
          entropy='none',
          max_iters=500):

    if env == 'swimmer':
        make_env_rllab = SwimmerEnv
    elif env == 'ant':
        make_env_rllab = AntEnv
    elif env == 'half-cheetah':
        make_env_rllab = HalfCheetahEnv
    elif env == 'hopper':
        make_env_rllab = HopperEnv
    elif env == 'simple-humanoid':
        make_env_rllab = SimpleHumanoidEnv
    elif env == 'full-humanoid':
        make_env_rllab = HumanoidEnv
    elif env == 'walker':
        make_env_rllab = Walker2DEnv
    elif env == 'cartpole':
        make_env_rllab = CartpoleEnv
    elif env == 'mountain-car':
        make_env_rllab = MountainCarEnv
    elif env == 'inverted-pendulum':
        make_env_rllab = InvertedPendulumEnv
    elif env == 'acrobot':
        make_env_rllab = AcrobotEnv
    elif env == 'inverted-double-pendulum':
        make_env_rllab = InvertedDoublePendulumEnv
    else:
        raise ValueError('Unrecognized rllab environment: %s' % env)

    def make_env():
        env_rllab = make_env_rllab()
        _env = Rllab2GymWrapper(env_rllab)
        return _env

    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3
    else:
        raise Exception('Unrecognized policy type.')

    def make_policy(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=hid_size,
                         num_hid_layers=num_hid_layers,
                         gaussian_fixed_var=True,
                         use_bias=False,
                         use_critic=False,
                         hidden_W_init=tf.contrib.layers.xavier_initializer(),
                         output_W_init=tf.contrib.layers.xavier_initializer())

    sampler = ParallelSampler(make_policy,
                              make_env,
                              num_episodes,
                              horizon,
                              True,
                              n_workers=njobs,
                              seed=seed)

    affinity = len(os.sched_getaffinity(0))
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)

    gym.logger.setLevel(logging.WARN)

    pois.learn(make_env,
               make_policy,
               n_episodes=num_episodes,
               max_iters=max_iters,
               horizon=horizon,
               gamma=gamma,
               delta=delta,
               use_natural_gradient=natural,
               iw_method=iw_method,
               iw_norm=iw_norm,
               bound=bound,
               save_weights=True,
               sampler=sampler,
               center_return=center_return,
               render_after=None,
               max_offline_iters=max_offline_iters,
               clipping=clipping,
               entropy=entropy)

    sampler.close()
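The long if/elif chain that maps command-line names to rllab environment classes can equivalently be written as a table lookup. A sketch, assuming the same rllab classes are already imported as in the example above (the helper name is illustrative); an unknown name then fails with an explicit error instead of a NameError later on.

# Table-driven equivalent of the env-name dispatch above.
RLLAB_ENVS = {
    'swimmer': SwimmerEnv,
    'ant': AntEnv,
    'half-cheetah': HalfCheetahEnv,
    'hopper': HopperEnv,
    'simple-humanoid': SimpleHumanoidEnv,
    'full-humanoid': HumanoidEnv,
    'walker': Walker2DEnv,
    'cartpole': CartpoleEnv,
    'mountain-car': MountainCarEnv,
    'inverted-pendulum': InvertedPendulumEnv,
    'acrobot': AcrobotEnv,
    'inverted-double-pendulum': InvertedDoublePendulumEnv,
}

def lookup_rllab_env(name):
    # Return the rllab environment class registered under `name`.
    try:
        return RLLAB_ENVS[name]
    except KeyError:
        raise ValueError('Unrecognized rllab environment: %s' % name)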
Example 3
def train(env,
          num_episodes,
          horizon,
          iw_method,
          iw_norm,
          natural,
          bound,
          delta,
          seed,
          policy,
          max_offline_iters,
          gamma,
          center_return,
          clipping=False,
          njobs=1,
          entropy='none',
          max_iters=500,
          positive_return=False):
    def make_env():
        if env == 'lunarlander-sparse':
            _env = gym.make('LunarLanderContinuous-v2')
            _env = SparseReward(_env)
        else:
            _env = gym.make(env)
        return _env

    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3
    else:
        raise Exception('Unrecognized policy type.')

    def make_policy(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=hid_size,
                         num_hid_layers=num_hid_layers,
                         gaussian_fixed_var=True,
                         use_bias=False,
                         use_critic=False,
                         hidden_W_init=tf.contrib.layers.xavier_initializer(),
                         output_W_init=tf.contrib.layers.xavier_initializer())

    sampler = ParallelSampler(make_policy,
                              make_env,
                              num_episodes,
                              horizon,
                              True,
                              n_workers=njobs,
                              seed=seed)

    affinity = len(os.sched_getaffinity(0))
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)

    gym.logger.setLevel(logging.WARN)

    pois.learn(make_env,
               make_policy,
               n_episodes=num_episodes,
               max_iters=max_iters,
               horizon=horizon,
               gamma=gamma,
               delta=delta,
               use_natural_gradient=natural,
               iw_method=iw_method,
               iw_norm=iw_norm,
               bound=bound,
               save_weights=True,
               sampler=sampler,
               center_return=center_return,
               render_after=None,
               max_offline_iters=max_offline_iters,
               clipping=clipping,
               entropy=entropy,
               positive_return=positive_return)

    sampler.close()
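A minimal invocation sketch for this gym-only variant. The module name is hypothetical, and the values of iw_method, iw_norm and bound are placeholders whose valid choices are defined by pois.learn.

# Hypothetical usage of train(); module name and argument values are placeholders.
from run_script import train  # hypothetical module name

train(env='lunarlander-sparse',   # wrapped in SparseReward inside make_env
      num_episodes=100,
      horizon=500,
      iw_method='is',             # placeholder; valid values defined by pois.learn
      iw_norm='none',             # placeholder; valid values defined by pois.learn
      natural=False,
      bound='max-d2',             # placeholder; valid values defined by pois.learn
      delta=0.99,
      seed=42,
      policy='nn',
      max_offline_iters=10,
      gamma=1.0,
      center_return=False,
      njobs=4)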