def run_task(variant):
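    # Train a Gaussian MLP policy with vanilla policy gradient (VPG) on the
    # environment named in the variant dict.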
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.algos.vpg import VPG
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from sandbox.rocky.tf.envs.base import TfEnv

    env_name = variant['Environment']
    if env_name == 'Cartpole':
        env = TfEnv(CartpoleEnv())
    else:
        raise NotImplementedError("Unsupported environment: {}".format(env_name))
    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(100, 100))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algorithm = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=100,
        start_itr=0,
        batch_size=1000,
        max_path_length=1000,
        discount=0.99,
    )
    algorithm.train()
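
# A minimal usage sketch (hypothetical variant dict; in practice the variant is
# usually supplied by rllab's run_experiment_lite launcher):
# run_task({'Environment': 'Cartpole'})
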
def run_train_task(vv):
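    # vv is the experiment's variant dict; the keys used below are
    # 'env_class', 'fix_goal', 'hidden_sizes', 'hidden_nonlinearity',
    # 'adaptive_policy_std', 'batch_size', 'path_length', 'n_itr',
    # 'discount', and 'step_size'.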

    env = TfEnv(normalize(vv['env_class'](
        fix_goal=vv['fix_goal'],
    )))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        adaptive_std=vv['adaptive_policy_std']
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['batch_size'],
        max_path_length=vv['path_length'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["step_size"],
        force_batch_sampler=True
    )
    algo.train()
Example #3
def run_task(args, *_):
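    # `benchmark` is assumed to be a module-level Metaworld benchmark instance;
    # its train and test tasks are wrapped so TRPO can train on one and
    # evaluate on the other.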
    metaworld_train_env = benchmark.get_train_tasks()
    wrapped_train_env = MetaworldWrapper(metaworld_train_env)
    env = TfEnv(wrapped_train_env)

    metaworld_test_env = benchmark.get_test_tasks()
    wrapped_test_env = MetaworldWrapper(metaworld_test_env)
    test_env = TfEnv(wrapped_test_env)

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        test_env=test_env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        # batch_size=100,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )

    algo.train()
def generate_expert_dp():
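    # Train a TRPO expert on InvertedPendulum, pickle the trained policy as
    # 'expert_dp.pickle', then keep rolling it out.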
    env = TfEnv(normalize(InvertedPendulumEnv()))
    policy = GaussianMLPPolicy(
        name="expert_policy",
        env_spec=env.spec,
        # The policy network has two hidden layers, each with 64 hidden units.
        hidden_sizes=(64, 64),
        std_hidden_sizes=(64, 64),
        adaptive_std=True,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=64,
        discount=0.995,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)),
        gae_lambda=0.97,
    )

    with tf.Session() as sess:
        algo.train(sess=sess)
        t = rollout(env=env, agent=policy, max_path_length=100, animated=False)
        print(sum(t['rewards']))
        with open('expert_dp.pickle', 'wb') as handle:
            pickle.dump(policy, handle)
        while True:
            rollout(env=env, agent=policy, max_path_length=100, animated=False)
def run_task(args, *_):

    #env = TfEnv(normalize(dnc_envs.create_stochastic('pick'))) # Cannot be solved easily by TRPO
    #env = TfEnv(normalize(CartpoleEnv()))
    env = TfEnv(CartpoleEnv())
    #metaworld_env = ML1.get_train_tasks("pick-place-v1")
    #tasks = metaworld_env.sample_tasks(1)
    #metaworld_env.set_task(tasks[0])
    #metaworld_env._observation_space = convert_gym_space(metaworld_env.observation_space)
    #metaworld_env._action_space = convert_gym_space(metaworld_env.action_space)
    #env = TfEnv(normalize(metaworld_env))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        # batch_size=100,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )

    algo.train()
Example #6
def main(exp_name=None, fusion=False):
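    # AIRL on CustomAnt: learn a state-only reward from the forward-RL demos,
    # then optimize a policy against it with IRLTRPO.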
    env = TfEnv(
        CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    irl_model = AIRL(env=env,
                     expert_trajs=experts,
                     state_only=True,
                     fusion=fusion,
                     max_itrs=10)

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session():
            algo.train()
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, gpu_options=gpu_options)

    experts = load_latest_experts('data/swimmer', n=5, visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts, state_only=False, fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=1000,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/swimmer_airl_state_action'):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
Example #8
def main(exp_name, params_folder=None):
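    # Transfer experiment: load a reward learned on CustomAnt (IRL params from
    # itr_100) and re-optimize a policy on DisabledAnt with the IRL model
    # frozen (train_irl=False).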
    env = TfEnv(CustomGymEnv('DisabledAnt-v0', record_video=False, record_log=False))

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well.
    params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file)

    irl_model = AIRL(env=env, expert_trajs=None, state_only=True)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    with rllab_logdir(algo=algo, dirname='data/ant_transfer/%s'%exp_name):
        with tf.Session():
            algo.train()
Example #9
def get_algo(env_name, use_eval, init_path, horizon, batch_size, n_itr,
             discount, step_size, gae):
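    # Assemble a TRPO instance; evaluation and warm-start kwargs are only added
    # when use_eval / init_path are supplied.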
    env = get_env(env_name)
    policy = GaussianMLPPolicy(
        name='policy',
        env_spec=env.spec,
        hidden_sizes=(32, 32),
        # output_nonlinearity=tf.nn.tanh
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    kwargs = dict(env=env,
                  policy=policy,
                  baseline=baseline,
                  batch_size=batch_size,
                  max_path_length=horizon,
                  n_itr=n_itr,
                  discount=discount,
                  step_size=step_size,
                  gae_lambda=gae)
    if use_eval:
        kwargs["reset_init_path"] = os.path.join(config.PROJECT_PATH,
                                                 get_eval_data_path[env_name])
        kwargs["horizon"] = horizon
    if init_path is not None:
        kwargs["initialized_path"] = init_path
    return TRPO(**kwargs)
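
# A minimal usage sketch (argument values are illustrative; get_env and
# get_eval_data_path come from the surrounding project):
# algo = get_algo('cartpole', use_eval=False, init_path=None, horizon=100,
#                 batch_size=4000, n_itr=100, discount=0.99, step_size=0.01, gae=0.97)
# algo.train()
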
Example #10
def run_task(args, *_):

    env = TfEnv(normalize(
        dnc_envs.create_stochastic('pick')))  # Cannot be solved easily by TRPO

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=50000,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )

    algo.train()
Example #11
def run_task(*_):

    env = TfEnv(
        normalize(GymEnv("Reacher-v1", force_reset=True, record_video=True)))
    #env = TfEnv(normalize(PusherEnv()))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The policy network has two hidden layers, each with 128 hidden units.
        hidden_sizes=(128, 128))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100 * 500,
        max_path_length=100,
        n_itr=200,
        discount=0.99,
        step_size=0.01,
        force_batch_sampler=True,
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
    )
    algo.train()
Example #12
    def test_training(self):
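        # Smoke test: fit an MLP dynamics model on PointEnv and run a few
        # iterations of model-based TRPO.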
        env = TfEnv(normalize(PointEnv()))

        tf.set_random_seed(22)
        np.random.seed(22)

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(16, 16),
                                   hidden_nonlinearity=tf.nn.tanh)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        dynamics_model = MLPDynamicsModel("dyn_model",
                                          env,
                                          hidden_sizes=(16, 16))

        # fit dynamics model

        algo = ModelTRPO(
            env=env,
            policy=policy,
            dynamics_model=dynamics_model,
            baseline=baseline,
            batch_size_env_samples=5000,
            initial_random_samples=10000,
            batch_size_dynamics_samples=40000,
            max_path_length=100,
            dynamic_model_epochs=(30, 10),
            num_gradient_steps_per_iter=2,
            n_itr=20,
            discount=0.99,
            step_size=0.001,
        )
        algo.train()
Example #13
def run_task(vv):

    env = TfEnv(
        normalize(
            GymEnv('HalfCheetah-v1', record_video=False, record_log=False)))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
        name="policy")

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=vv["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_train_task(vv):

    env = TfEnv(
        normalize(vv['env_class'](
            fix_goal=vv['fix_goal'],
            reward_type=vv['reward_type'],
            init_puck_low=INIT_PUCK_TARGET - vv['init_slack'],
            init_puck_high=INIT_PUCK_TARGET + vv['init_slack'],
            puck_goal_low=PUCK_GOAL_TARGET - vv['goal_slack'],
            puck_goal_high=PUCK_GOAL_TARGET + vv['goal_slack'],
        )))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=vv['hidden_sizes'],
                               hidden_nonlinearity=vv['hidden_nonlinearity'],
                               adaptive_std=vv['adaptive_policy_std'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=vv['batch_size'],
                max_path_length=vv['path_length'],
                n_itr=vv['n_itr'],
                discount=vv['discount'],
                step_size=vv["step_size"],
                force_batch_sampler=True)
    algo.train()
def run_expt(config):
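    # Train the IRL algorithm named in config['algo'] against expert demos for
    # config['environment'] and return the final OriginalTaskAverageReturn.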
    env_name = config['environment']
    env = get_env(env_name)
    experts = get_demos(env_name)
    irl_model = algo_string_to_model[config['algo']](env_spec=env.spec,
                                                     expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    # use params for each env
    algo = IRLTRPO(env=env,
                   policy=policy,
                   irl_model=irl_model,
                   n_itr=200,
                   batch_size=2000 if env_name == 'pendulum' else 10000,
                   max_path_length=100,
                   discount=0.99,
                   store_paths=True,
                   discrim_train_itrs=50,
                   irl_model_wt=1.0,
                   entropy_weight=1.0 if env_name == 'pointmass' else 0.1,
                   zero_environment_reward=True,
                   baseline=LinearFeatureBaseline(env_spec=env.spec))
    dirname = DATA_DIR + "/" + "___".join(
        [str(k) + "=" + str(v) for k, v in config.items()])
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()
    # A little clumsy, but it's the easiest way: the rllab logger doesn't keep
    # data around after it's been written to disk.
    train_results = pd.read_csv(dirname + '/progress.csv')
    # return OriginalTaskAverageReturn for the last iteration
    output = config.copy()
    output['return'] = train_results.iloc[-1]['OriginalTaskAverageReturn']
    return output
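
# A minimal usage sketch (hypothetical config; the 'algo' value must be a key
# of algo_string_to_model):
# result = run_expt({'environment': 'pendulum', 'algo': 'airl'})
# print(result['return'])
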
Example #16
def main():
    env = TfEnv(GymEnv('Ant-v1', record_video=False, record_log=False))
    
    experts = load_latest_experts('data/ant', n=50)

    irl_model = GCLDiscrim(
        env_spec=env.spec,
        expert_trajs=experts,
        discrim_arch=disentangled_net)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=1000,
        discount=0.995,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/ant_airl'):
        with tf.Session():
            algo.train()
Example #17
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))

    ### VGG 11/29/18: Added support for CSV files
    ## this method loads expert data saved as a pickle file:
    # experts = load_latest_experts('data/airsim_final', n=1)
    # this one uses csv:
    experts = load_experts('data/airsim_human_data/log.csv',
                           pickle_format=False)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=5000,
        batch_size=60,
        max_path_length=60,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=100,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use an entropy bonus unless it is needed for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        n_parallel=0)

    with rllab_logdir(algo=algo, dirname='data/airsim_gail'):
        with tf.Session():
            algo.train()
def main(exp_name, ent_wt=1.0):
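    # Forward-RL data collection: train TRPO with an entropy bonus on CustomAnt
    # and store the sampled paths for later IRL use.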
    tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('airl/CustomAnt-v0', record_video=False,
                     record_log=False))
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            sess=sess,
            policy=policy,
            n_itr=1500,
            batch_size=20000,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            exp_name=exp_name,
        )
        with rllab_logdir(algo=algo,
                          dirname='data/ant_data_collect/%s' % exp_name):
            algo.train()
Example #19
def main(num_examples=50, discount=0.99):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/pendulum', n=num_examples)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=2000,
        max_path_length=100,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_traj'):
        with tf.Session():
            algo.train()
Example #20
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, gpu_options=gpu_options)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2, visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts, state_only=True, fusion=fusion, max_itrs=10)

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
def run_train_task(vv):

    env = TfEnv(
        normalize(
            CassieEnv(fixed_gains=vv['fixed_gains'],
                      stability_cost_coef=vv['stability_cost_coef'],
                      ctrl_cost_coef=vv['ctrl_cost_coef'],
                      alive_bonus=vv['alive_bonus'])))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=vv['hidden_sizes'],
                               hidden_nonlinearity=vv['hidden_nonlinearity'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=vv['batch_size'],
                max_path_length=vv['path_length'],
                n_itr=vv['n_itr'],
                discount=vv['discount'],
                step_size=vv["step_size"],
                force_batch_sampler=True)
    algo.train()
Example #22
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use an entropy bonus unless it is needed for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
Example #23
    def _build_policy_from_rllab(self, env, n_actions):
        """ Return both rllab policy and policy model function. """

        sess = self.tf_sess
        scope_name = self.scope_name

        # Initialize training_policy to copy from policy

        training_policy = GaussianMLPPolicy(
            name=scope_name,
            env_spec=env.spec,
            hidden_sizes=self.policy_params["hidden_layers"],
            init_std=self.policy_opt_params["trpo"]["init_std"],
            output_nonlinearity=eval(
                self.policy_params["output_nonlinearity"]))
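        # Collect and initialize only the variables created for the training
        # policy (this assumes scope_name is 'training_policy', matching the
        # scope filter below).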
        training_policy_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope='training_policy')
        sess.run([tf.variables_initializer(training_policy_vars)])

        def policy_model(x, stochastic=1.0, collect_summary=False):
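            # Action = policy mean plus Gaussian noise scaled by the learned
            # std; pass stochastic=0.0 to get the deterministic mean action.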
            dist_info_sym = training_policy.dist_info_sym(x, dict())

            mean_var = dist_info_sym["mean"]
            log_std_var = dist_info_sym["log_std"]

            mean_var += stochastic * tf.random_normal(
                shape=(tf.shape(x)[0], n_actions)) * tf.exp(log_std_var)

            return mean_var

        return training_policy, policy_model
Example #24
def main(eval_reward=False):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    
    n_experts = 10
    experts = load_latest_experts('plotting/pendulum_final', n=n_experts)
    dirname='data/pendulum' # dir to save logs and images

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        eval_reward=eval_reward,
        fig_dir=dirname
    )

    # with rllab_logdir(algo=algo, dirname='data/pendulum_gcl{}'.format(n_experts)):
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.fig_dirname = dirname
            algo.train()
Example #25
def main():
    env = TfEnv(CustomGymEnv('PointMazeLeft-v0'))
    
    experts = load_latest_experts('data/point', n=50)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/point_traj'):
        with tf.Session() as sess:
            algo.train()
            # pass the trained policy object directly; it is not a tensor that sess.run() can fetch
            test_pointmaze(policy)
def main(exp_name, ent_wt=0.1, visible_gpus='0', discount=0.99):
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    with tf.Session(config=tf_config) as sess:
        algo = TRPO(
            env=env,
            policy=policy,
            n_itr=3000,
            batch_size=20000,
            max_path_length=1000,
            discount=discount,
            store_paths=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            step_size=0.01,
            entropy_weight=ent_wt,
            sess=sess,
            exp_name=exp_name,
        )

        with rllab_logdir(algo=algo, dirname='data/swimmer'):
            algo.train(sess)
Example #27
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/airsim', n=5)

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=10,
        batch_size=100,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/airsim_gcl'):
        with tf.Session():
            algo.train()
    def test_multiagent_ngsim_env(self):
        basedir = os.path.expanduser('~/.julia/packages/NGSIM/9OYUa/data')
        filename = 'trajdata_i101_trajectories-0750am-0805am.txt'
        filepaths = [os.path.join(basedir, filename)]
        n_veh = 5
        env = JuliaEnv(env_id='MultiagentNGSIMEnv',
                       env_params=dict(n_veh=n_veh,
                                       trajectory_filepaths=filepaths,
                                       H=200,
                                       primesteps=50),
                       using='AutoEnvs')
        low, high = env.action_space.low, env.action_space.high
        env = TfEnv(env)
        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32),
                                   std_hidden_sizes=(32, 32),
                                   adaptive_std=True,
                                   output_nonlinearity=None,
                                   learn_std=True)
        baseline = ZeroBaseline(env_spec=env.spec)
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    n_itr=1,
                    batch_size=1000,
                    sampler_args=dict(n_envs=n_veh))
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            try:
                algo.train(sess=sess)
            except Exception as e:
                self.fail('exception incorrectly raised: {}'.format(e))
Example #29
def run_svrg(*_):
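    # Build an SVRG policy-gradient learner; env_name and the batch/optimizer
    # hyperparameters are read from module-level globals.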
    envir = env_name_map[env_name]
    env = TfEnv(normalize(GymEnv(envir, record_video=False, force_reset=True)))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
    )

    policy_tilde = GaussianMLPPolicy(
        name="policy_tilde",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = SVRGPG(env=env,
                  policy=policy,
                  policy_tilde=policy_tilde,
                  baseline=baseline,
                  batch_size=batch_size,
                  max_path_length=max_path_length,
                  n_itr=n_itr,
                  discount=0.995,
                  delta=delta,
                  optimizer_args=dict(
                      batch_size=mini_batch_size,
                      max_epochs=1,
                      epsilon=1e-8,
                      use_SGD=False,
                      cg_iters=cg_iters,
                      subsample_factor=subsample_factor,
                      max_batch=max_batch,
                  ))

    print("run svrg cg for env {:}".format(env_name))
    print("max_epochs {:}".format(max_epochs))
    print("cg_iters {:}".format(cg_iters))
    print("step size: {:}".format(delta))
    print("max_path_length : {:}".format(max_path_length))
    print("Num of Iterations: {:}".format(n_itr))
    print("Num of Examples: {:}".format(batch_size))
    print("sub sample rate: {:}".format(subsample_factor))
    print("batch size: {:}".format(mini_batch_size))
    print("max num batches: {:}".format(max_batch))
    return algo
Example #30
def main(exp_name=None, fusion=True):
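    # Empowerment-based AIRL (EAIRL) on CustomAnt: jointly trains an inverse
    # model q(a|s,s'), empowerment networks, the reward discriminator, and the
    # policy.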
    # env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))
    env = TfEnv(
        CustomGymEnv('CustomAnt-v0', record_video=False, record_log=True))
    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)
    #experts = load_latest_experts('data/ant_data_collect', n=5)

    #qvar: inverse model q(a|s,s')
    qvar = GaussianMLPInversePolicy(name='qvar_model',
                                    env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env,
                      qvar=qvar,
                      expert_trajs=experts,
                      fusion=True,
                      max_itrs=10)
    #Empowerment-based Adversarial Inverse Reinforcement Learning, set score_discrim=True
    irl_model = EAIRL(env=env,
                      expert_trajs=experts,
                      state_only=False,
                      fusion=fusion,
                      max_itrs=10,
                      score_discrim=True)

    #Empowerment-based potential functions gamma* Phi(s')-Phi(s)
    empw_model = Empowerment(env=env, fusion=True, max_itrs=4)
    t_empw_model = Empowerment(env=env,
                               scope='t_efn',
                               fusion=True,
                               max_itrs=2,
                               name='empowerment2')

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=3000,  #130,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        target_empw_update=5,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        lambda_i=1.0,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        plot=False)
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl'):
        #with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name): # if you use multiple runs, use this line instead of above
        with tf.Session():
            algo.train()