Example #1
def main(args):
    logger.set_snapshot_dir(args.snapshot_dir)
    logger.set_snapshot_mode("none")
    logger.add_tabular_output(os.path.join(args.snapshot_dir, "tabular.csv"))
    env = GymEnv(args.env_id)

    # Load the AI policy.
    with open(args.ai_policy, "rb") as f:
        env.env.unwrapped.ai_policy = pickle.load(f)

    # If the user provided a starting policy, use it. Otherwise, we start with
    # a fresh policy.
    if args.input_policy is not None:
        with open(args.input_policy, "rb") as f:
            policy = pickle.load(f)
    else:
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=args.hidden_sizes)

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=env.horizon,
        n_itr=args.n_itr,
        discount=args.discount,
        step_size=args.step_size,
        gae_lambda=args.gae_lambda,
    )
    algo.train()
    with open(args.output_policy, "wb") as f:
        pickle.dump(policy, f)
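Example #1's main expects an args namespace carrying snapshot_dir, env_id, ai_policy, input_policy, output_policy, hidden_sizes, batch_size, n_itr, discount, step_size, and gae_lambda. A minimal argparse driver along these lines could supply it; the flag names and defaults below are assumptions, not part of the original script.

# Hypothetical launcher for Example #1; flag names and defaults are assumptions.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--snapshot-dir", required=True)
    parser.add_argument("--env-id", required=True)
    parser.add_argument("--ai-policy", required=True)
    parser.add_argument("--input-policy", default=None)
    parser.add_argument("--output-policy", required=True)
    parser.add_argument("--hidden-sizes", type=int, nargs="+", default=[32, 32])
    parser.add_argument("--batch-size", type=int, default=4000)
    parser.add_argument("--n-itr", type=int, default=50)
    parser.add_argument("--discount", type=float, default=0.99)
    parser.add_argument("--step-size", type=float, default=0.01)
    parser.add_argument("--gae-lambda", type=float, default=0.97)
    main(parser.parse_args())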
Example #2
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianGRUPolicy(env_spec=env.spec, hidden_sizes=(64, ))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=horizon * 32 * 2,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=400,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
Example #3
def main(num_examples=50, discount=0.99):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/pendulum', n=num_examples)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=2000,
        max_path_length=100,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_traj'):
        with tf.Session():
            algo.train()
Example #4
def run_task(*_):
    # Please note that different environments with different action spaces may
    # require different policies. For example, with a Discrete action space a
    # CategoricalMLPPolicy works, but for a Box action space you may need to use
    # a GaussianMLPPolicy instead (see the trpo_gym_pendulum.py example).
    env = normalize(GymEnv("CartPole-v0"))

    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 8 hidden units.
        hidden_sizes=(8, 8))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
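This snippet matches rllab's stock CartPole TRPO example, and like the other run_task functions in this listing it omits its imports and launcher. A sketch of the likely surrounding boilerplate, assuming the standard rllab package layout, would be:

# Probable imports and launcher for Example #4 (standard rllab layout assumed).
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy
from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    n_parallel=1,          # number of parallel sampling workers
    snapshot_mode="last",  # keep only the final iteration's snapshot
    seed=1,
)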
Example #5
def run_task(*_):
    env_name = "BottleneckEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=400,
        discount=0.995,
        # step_size=0.01,
    )
    algo.train()
Example #6
def run_task(*_):
    env = normalize(
        GymEnv("DartWalker3d-v1", record_log=False, record_video=False))

    policy = GaussianMLPAuxPolicy(
        env_spec=env.spec,
        # The neural network policy has three hidden layers of 100, 50, and 25 units.
        hidden_sizes=(100, 50, 25),
        aux_pred_step=3,
        aux_pred_dim=7,
    )

    #policy = joblib.load('data/local/experiment/walker_aux/policy.pkl')

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPOAux(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=50000,
        max_path_length=env.horizon,
        n_itr=500,
        discount=0.995,
        step_size=0.01,
        epopt_epsilon=1.0,
        epopt_after_iter=0,
        gae_lambda=0.97,
        aux_pred_step=3,
        aux_pred_dim=7,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #7
def main(exp_name, ent_wt=0.1, visible_gpus='0', discount=0.99):
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    with tf.Session(config=tf_config) as sess:
        algo = TRPO(
            env=env,
            policy=policy,
            n_itr=3000,
            batch_size=20000,
            max_path_length=1000,
            discount=discount,
            store_paths=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            step_size=0.01,
            entropy_weight=ent_wt,
            sess=sess,
            exp_name=exp_name,
        )

        with rllab_logdir(algo=algo, dirname='data/swimmer'):
            algo.train(sess)
Example #8
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/airsim', n=5)

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=10,
        batch_size=100,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/airsim_gcl'):
        with tf.Session():
            algo.train()
Example #9
def run_task(vv):

    env = TfEnv(
        normalize(
            GymEnv('HalfCheetah-v1', record_video=False, record_log=False)))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
        name="policy")

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=vv["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
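run_task(vv) above pulls its step size out of a variant dictionary, which is the shape rllab's VariantGenerator / run_experiment_lite sweep machinery produces. A plausible sweep driver is sketched below; the step-size grid and the run_experiment_lite settings are assumptions.

# Hypothetical hyperparameter sweep feeding Example #9's run_task(vv).
from rllab.misc.instrument import VariantGenerator, run_experiment_lite

vg = VariantGenerator()
vg.add("step_size", [0.01, 0.05, 0.1])  # assumed grid

for variant in vg.variants():
    run_experiment_lite(
        run_task,
        variant=variant,  # arrives in run_task as the vv argument
        n_parallel=1,
        snapshot_mode="last",
        seed=1,
    )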
Example #10
def run_task(*_):
    env = normalize(
        GymEnv("DartHopper-v1", record_log=False, record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers of 128 and 64 units.
        hidden_sizes=(128, 64),
        net_mode=0,
    )

    print('trainable parameter size: ',
          policy.get_param_values(trainable=True).shape)

    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)

    algo = PPO_Clip_Sym(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        max_path_length=env.horizon,
        n_itr=200,
        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
        whole_paths=False,
        observation_permutation=np.array(
            [0.0001, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        action_permutation=np.array([0.0001, 1, 2]),
        sym_loss_weight=0.0,
    )
    algo.train()
Example #11
def test_fric_rob(test_type,
                  file_name,
                  env_name,
                  fric_fractions=np.linspace(0.5, 1.5, 11),
                  fric_bodies=[b'ffoot', b'bfoot'],
                  adv_fraction=1.0,
                  n_traj=5):
    fric_vals = []
    test_rew_summary = []
    test_rew_std_summary = []
    print(file_name)
    with open(file_name, 'rb') as f:
        res_D = pickle.load(f)
    P = res_D['pro_policy']
    for ff in fric_fractions:
        env = normalize(GymEnv(env_name, 1.0))
        e = np.array(env.wrapped_env.env.model.geom_friction)
        e = e * ff
        env.wrapped_env.env.model.geom_friction = e
        fric_vals.append(e[0, 0])
        N = np.zeros(n_traj)
        for i in range(n_traj):
            N[i] = test_type(env, P, 1000, 1)
        M = N.mean()
        V = N.std()
        test_rew_summary.append(M)
        test_rew_std_summary.append(V)

    return test_rew_summary, test_rew_std_summary, fric_vals
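test_fric_rob delegates the actual rollout to the test_type callable it is given; only the call shape test_type(env, policy, 1000, 1) is visible here. Below is a minimal rollout function with that signature, written as a sketch against the generic rllab policy/env interface; the meaning of the fourth positional argument is not recoverable from this listing, so it is accepted and ignored.

# Hypothetical test_type implementation returning the undiscounted episode return.
def rollout_return(env, policy, max_path_length, _unknown=1):
    obs = env.reset()
    total_reward = 0.0
    for _ in range(max_path_length):
        action, _agent_info = policy.get_action(obs)  # rllab policies return (action, info)
        obs, reward, done, _env_info = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward

Such a function would then be passed as the test_type argument, e.g. test_fric_rob(rollout_return, file_name, env_name).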
Example #12
def run_task(*_):
    # Please note that different environments with different action spaces may require different
    # policies. For example, with a Box action space a GaussianMLPPolicy works, but for a Discrete
    # action space you may need to use a CategoricalMLPPolicy instead (see the trpo_gym_cartpole.py example).
    env = normalize(GymEnv("Pendulum-v0", record_video=False,
                           force_reset=True))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5, symmetric=False))
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #13
def run_task(_):
    env_name = "PlatooningEnv"

    register(
        id=env_name+'-v0',
        entry_point='platooning_env:{}'.format(env_name),
        max_episode_steps=HORIZON,
        kwargs={"env_params": ENV_PARAMS}
    )

    env = GymEnv(env_name, record_video=False)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(16, 16, 16),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=15000,
        max_path_length=horizon,
        n_itr=1000,
        # whole_paths=True,
        discount=0.999,
    )
    algo.train()
Example #14
def run_task(*_):
    # env = normalize(HalfCheetahEnv())

    env = GymEnv(env_name="MountainCarContinuous-v0", force_reset=True)

    # baseline = LinearFeatureBaseline(env_spec=env.spec)
    baseline = ZeroBaseline(env_spec=env.spec)
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100,
        max_path_length=100,
        n_itr=10000,
        discount=0.99,
        optimizer_args=dict(
            learning_rate=0.01,
        )
    )
    algo.train()
Example #15
def run_task(*_):
    env = normalize(GymEnv("DartWalker2d-v1"))

    policy = GaussianHMLPPropPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers of 64 and 16 units.
        hidden_sizes=(64, 16),
        #subnet_split1=[5, 6, 7, 8, 9, 21, 22, 23, 24, 25],
        #subnet_split2=[10, 11, 12, 13, 14, 26, 27, 28, 29, 30],
        #sub_out_dim=6,
        #option_dim=4,
        sub_out_dim=3,
        option_dim=2,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=50000,
        max_path_length=env.horizon,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        epopt_epsilon=1.0,
        epopt_after_iter=0,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #16
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=350,
        epoch_length=350,
        min_pool_size=350,
        n_epochs=600,
        discount=0.99,
        scale_reward=1.0/140.0,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    data_path = 'data/%s_data_rllab_%s/%s/'%(env_name.replace('-', '_'), 
                                             str(algo.__class__.__name__), 
                                             exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
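Because Example #16 points the snapshot logger at data_path, rllab's default snapshot mode writes an itr_<N>.pkl file per epoch there; Examples #23 and #24 reload exactly this kind of pickle with joblib and pull the trained policy out of it. A sketch of doing the same here, where the concrete path, the itr_599.pkl filename, and the 'policy'/'env' keys in the snapshot dict are assumptions:

# Sketch: reload a DDPG snapshot written by Example #16 and roll the policy out.
import joblib

snapshot = joblib.load(
    'data/LunarLanderContinuous_v3_data_rllab_DDPG/exp_1/itr_599.pkl')  # assumed path
policy = snapshot['policy']
env = snapshot['env']

obs = env.reset()
done = False
while not done:
    action, _ = policy.get_action(obs)
    obs, reward, done, _ = env.step(action)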
Example #17
def run_task(*_):
    
    n_itr = 1000
    env = VaryMassEnv(GymEnv("MyPendulum-v0", record_video=False),
                      m0=0.2,
                      mf=0.3,
                      iters=n_itr)
    #
    policy = GaussianMLP2Policy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 4 hidden units.
        hidden_sizes=(4, 4),  # alternative: (128, 128, 128, 128, 128, 128)
        hidden_nonlinearity=tf.nn.relu,  # alternatives: linearized_tanh, relu_tanh
        # output_nonlinearity=tf.nn.sigmoid
        # idea: define a new tf nonlinearity that is a cap, made up of two ReLUs
    )
    #
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    #
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=n_itr,
        discount=0.99,
        step_size=0.0075, # 0.01
        sampler_cls=VectorizedVaryingSampler
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #18
def run_task(*_):

    env = TfEnv(
        normalize(GymEnv("Reacher-v1", force_reset=True, record_video=True)))
    #env = TfEnv(normalize(PusherEnv()))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 128 hidden units.
        hidden_sizes=(128, 128))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100 * 500,
        max_path_length=100,
        n_itr=200,
        discount=0.99,
        step_size=0.01,
        force_batch_sampler=True,
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
    )
    algo.train()
Example #19
def run_task(*_):
    env = normalize(GymEnv("DartWalker3dRestricted-v1")
                    )  #, record_log=False, record_video=False))

    policy = GaussianHMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
        subnet_split1=[8, 9, 10, 11, 12, 13, 29, 30, 31, 32, 33, 34],
        subnet_split2=[14, 15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 40],
        sub_out_dim=6,
        option_dim=2,
        hlc_output_dim=3,
    )

    #policy = joblib.load('data/local/experiment/Walker3d_waist_onlyconcatoption3/policy.pkl')

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=50000,
        max_path_length=env.horizon,
        n_itr=500,
        discount=0.995,
        step_size=0.01,
        epopt_epsilon=1.0,
        epopt_after_iter=0,
        gae_lambda=0.97,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #20
def run_task(*_):
    # Please note that different environments with different action spaces may require different
    # policies. For example, with a Box action space a GaussianMLPPolicy works, but for a Discrete
    # action space you may need to use a CategoricalMLPPolicy instead (see the trpo_gym_cartpole.py example)
    env = TfEnv(GymEnv("MyPendulum-v1", record_video=False))
    #
    policy = GaussianConvPolicy(
        name="policy",
        env_spec=env.spec,
        # conv_filters lists the number of filters per conv layer, so [3] declares a
        # single layer with 3 filters; the size/stride/pad lists below carry three entries.
        conv_filters=[3],
        conv_filter_sizes=[5, 5, 5],
        conv_strides=[3, 3, 3],
        conv_pads=['SAME', 'SAME', 'SAME'],
        # The fully connected head has two hidden layers of 16 and 4 units.
        hidden_sizes=(16, 4),  # alternative: (128, 128, 128, 128, 128, 128)
        hidden_nonlinearity=tf.nn.relu,  # alternative: linearized_tanh
        output_nonlinearity=None,
    )
    #
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    #
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=5, #4000,
        max_path_length=env.horizon,
        n_itr=2, #1000,
        discount=0.99,
        step_size=0.0075, # 0.01
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #21
def main(eval_reward=False):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    
    n_experts = 10
    experts = load_latest_experts('plotting/pendulum_final', n=n_experts)
    dirname = 'data/pendulum'  # directory to save logs and images

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        eval_reward=True,
        fig_dir=dirname
    )

    # with rllab_logdir(algo=algo, dirname='data/pendulum_gcl{}'.format(n_experts)):
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.fig_dirname = dirname
            algo.train()
Example #22
def run_task(*_):
    # Please note that different environments with different action spaces may
    # require different policies. For example, with a Discrete action space a
    # CategoricalMLPPolicy works, but for a Box action space you may need to use
    # a GaussianMLPPolicy instead (see the trpo_gym_pendulum.py example)
    env = normalize(
        GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # policy = CategoricalMLPPolicy(
    #     env_spec=env.spec,
    #     # The neural network policy should have two hidden layers, each with 32 hidden units.
    #     hidden_sizes=(32, 32)
    # )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 64 hidden units.
        hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    # max_path_length = env.horizon
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=300,
        n_itr=10000,
        discount=0.99,
        # step_size=0.02,
        truncate_local_is_ratio=0.2
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #23
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'Pendulum-v0'
    pickle_path = '../gpirl/notebooks/plots/rllab_trpo_trainig/itr_112.pkl'
    # pickle_path = 'data/acrobat_data_rllab_trpo/exp_1/itr_800.pkl'
    iter_data = joblib.load(pickle_path)
    env = GymEnv(env_name)
    max_r = 1
    while True:
        o = env.reset()
        disc_r = 0
        r_sum = 0
        done = False
        i = 0
        while not done:
            env.render()
            a, _ = iter_data['policy'].get_action(o)
            o, r, done, _ = env.step(a)
            # s = [np.arccos(o[0]), np.arccos(o[1])]
            # r = -np.cos(s[0]) - np.cos(s[1] + s[0])
            disc_r += r * 0.99**(500 - i)
            r_sum += r
            i += 1
        # max_r = r
        print("disc_r : {} , sum_r : {}".format(disc_r, r_sum))
        print("last x : {}".format(o[0]))
        print("-------------------------")
Example #24
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'Cartpole-v3'
    pickle_path = 'data/Cartpole_v3_data_rllab_TRPO/exp_1/itr_1200.pkl'
    # pickle_path = 'data/acrobat_data_rllab_trpo/exp_1/itr_800.pkl'
    iter_data = joblib.load(pickle_path)
    env = GymEnv(env_name)
    max_r = 1
    while True:
        o = env.reset()
        disc_r = 0
        r_sum = 0
        done = False
        i = 0
        print("stable point : {}".format(env.env._stable_x))
        while not done:
            env.render()
            a, _ = iter_data['policy'].get_action(o)
            o, r, done, _ = env.step(a)
            # s = [np.arccos(o[0]), np.arccos(o[1])]
            # r = -np.cos(s[0]) - np.cos(s[1] + s[0])
            disc_r += r * 0.99**(500 - i)
            r_sum += r
            i += 1
        # max_r = r
        print("disc_r : {} , sum_r : {}".format(disc_r, r_sum))
        print("last x : {}".format(o[0]))
        print("-------------------------")
Example #25
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
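The load_latest_experts('data/pendulum', n=5) call above consumes trajectory snapshots that an earlier RL run wrote with store_paths=True, the same data-collection pattern Example #7 applies to Swimmer. Below is a sketch of producing the pendulum expert data, assuming the same TRPO, GaussianMLPPolicy, LinearFeatureBaseline, and rllab_logdir utilities imported for the neighbouring examples; the iteration and batch settings are assumptions.

# Sketch: collect the 'data/pendulum' expert snapshots used by the GAIL run above.
def collect_pendulum_experts():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='expert_policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        n_itr=200,
        batch_size=2000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,  # stored paths are what load_latest_experts picks up
    )
    with rllab_logdir(algo=algo, dirname='data/pendulum'):
        with tf.Session():
            algo.train()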
Example #26
def run_task(*_):

    env = normalize(
        GymEnv(env_name="MountainCarContinuous-v0", force_reset=True))
    max_path_length = 300

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=100,
        n_updates_per_sample=1,
        max_path_length=max_path_length,
        epoch_length=900,
        min_pool_size=800,
        replay_pool_size=5000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
    )
    algo.train()
Example #27
def gym_env(name):
    from rllab.envs.gym_env import GymEnv
    return GymEnv(
        name,
        record_video=False,
        log_dir='/tmp/gym-test',  # Ignore gym log.
        record_log=False)
Example #28
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    experts = load_latest_experts('data/swimmer', n=5, visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts,
                     state_only=False, fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=1000,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/swimmer_airl_state_action'):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
Example #29
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))

    ### VGG 11/29/18: Added support for CSV files.
    ## load_latest_experts reads expert data saved as a pickle file:
    # experts = load_latest_experts('data/airsim_final', n=1)
    # This call reads the CSV log instead:
    experts = load_experts('data/airsim_human_data/log.csv',
                           pickle_format=False)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=5000,
        batch_size=60,
        max_path_length=60,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=100,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        n_parallel=0)

    with rllab_logdir(algo=algo, dirname='data/airsim_gail'):
        with tf.Session():
            algo.train()
Example #30
def main():
    env = TfEnv(GymEnv('Ant-v1', record_video=False, record_log=False))
    
    experts = load_latest_experts('data/ant', n=50)

    irl_model = GCLDiscrim(
        env_spec=env.spec,
        expert_trajs=experts,
        discrim_arch=disentangled_net)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=1000,
        discount=0.995,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/ant_airl'):
        with tf.Session():
            algo.train()