Example no. 1
def run_vpg_baseline_large_batch_size_no_critic(*_):
    env = normalize(env_name())
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(
            100,
            50,
            25,
        ),
        adaptive_std=False,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    print("Iteration Number: {:}".format(n_itr))
    print("Learning Rate : {:}".format(learning_rate))
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size * num_of_agents,
        max_path_length=500,
        n_itr=n_itr,
        discount=0.99,
        optimizer_args={'learning_rate': learning_rate},
        sampler_cls=BatchSampler_no_critic,
    )
    algo.train()
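
Example no. 1 refers to several module-level names (env_name, batch_size, num_of_agents, n_itr, learning_rate, BatchSampler_no_critic) that are defined outside the snippet. A minimal sketch of that surrounding configuration, with assumed values and an rllab environment class standing in for env_name:

# Configuration sketch only: the values below are assumptions, and
# BatchSampler_no_critic is project-specific code not shown on this page.
from rllab.envs.box2d.cartpole_env import CartpoleEnv

env_name = CartpoleEnv   # environment class; instantiated inside the function
batch_size = 5000        # per-agent batch size
num_of_agents = 4        # effective batch is batch_size * num_of_agents
n_itr = 100              # number of training iterations
learning_rate = 0.01     # forwarded through optimizer_args
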
Example no. 2
def run_task(*_):
    env = normalize(Cassie2dEnv())

    if load_policy:
        filename = "123"
        data = joblib.load(filename)
        policy = data['policy']
        print("Loading Pretrained Policy ...............................")
    else:
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            # The neural network policy should have two hidden layers, each with 32 hidden units.
            hidden_sizes=(32, 32),
            init_std=1.0,
            #adaptive_std=True,
        )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10000,
        max_path_length=1000,  # dt = (1/2000)*n, where n is Step(n)
        n_itr=400,
        discount=0.99,
        step_size=0.005,  # default was 0.01
        # Set plot=True to enable plotting
        plot=False,
    )
    algo.train()
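
The run_task functions on this page are written for rllab's run_experiment_lite launcher (Example no. 13 shows the stubbed variant in full). A minimal local launch, with placeholder seed and snapshot settings, might look like this sketch:

from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    n_parallel=1,          # number of parallel sampling workers
    snapshot_mode="last",  # keep only the final iteration's snapshot
    seed=1,
    plot=False,
)
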
Example no. 3
def run_task(*_):
    # env = normalize(HalfCheetahEnv())

    env = GymEnv(env_name="MountainCarContinuous-v0", force_reset=True)

    # baseline = LinearFeatureBaseline(env_spec=env.spec)
    baseline = ZeroBaseline(env_spec=env.spec)
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100,
        max_path_length=100,
        n_itr=10000,
        discount=0.99,
        optimizer_args=dict(
            learning_rate=0.01,
        )
    )
    algo.train()
Example no. 4
def test_baseline(baseline_cls):
    env = CartpoleEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(6,))
    baseline = baseline_cls(env_spec=env.spec)
    algo = VPG(
        env=env, policy=policy, baseline=baseline,
        n_itr=1, batch_size=1000, max_path_length=100
    )
    algo.train()
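
A possible way to drive test_baseline, using the two baseline classes that appear elsewhere on this page; this is a sketch of the parametrization, not the original test harness:

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.baselines.zero_baseline import ZeroBaseline

# Run the one-iteration smoke test once per baseline implementation.
for baseline_cls in (LinearFeatureBaseline, ZeroBaseline):
    test_baseline(baseline_cls)
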
Example no. 5
def run_task(*_):
    import gym_driving
    env = normalize(GymEnv('DrivingEnv-v0'))
    # env = normalize(GymEnv('CartPole-v0'))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=env.horizon,
        n_itr=250,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example no. 6
        def run_task(*_):
            env = normalize(GymEnv('HovorkaInterval-v0'))
            # env.wrapped_env.env.env.env.reward_flag = 'absolute'
            env.wrapped_env.env.env.reward_flag = reward_functions[k]

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            learn_std = True
            init_std = 1

            hidden_sizes = NN_sizes[i]
            # hidden_sizes=(8,)
            # hidden_sizes=(32, 32)
            # hidden_sizes=(100, 50, 25)

            policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=hidden_sizes,
                                       learn_std=learn_std,
                                       init_std=init_std)

            # =======================
            # Defining the algorithm
            # =======================
            batch_size = 5000
            n_itr = 200
            gamma = .99
            step_size = 0.01
            # max_path_length = 96,

            algo = VPG(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=batch_size,
                # max_path_length=max_path_length,
                n_itr=n_itr,
                discount=gamma,
                step_size=step_size)
            algo.train()
Example no. 7
        def run_task(*_):
            env = normalize(GymEnv(models[k]))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            learn_std = True
            init_std = 1

            # hidden_sizes = NN_sizes[i]
            # hidden_sizes=(8,)
            # hidden_sizes=(32, 32)
            hidden_sizes = (100, 50, 25)

            policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=hidden_sizes,
                                       learn_std=learn_std,
                                       init_std=init_std)

            # =======================
            # Defining the algorithm
            # =======================
            batch_size = 5000
            n_itr = 200
            gamma = .99
            step_size = 0.01
            # max_path_length = 96,

            algo = VPG(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=batch_size,
                # max_path_length=max_path_length,
                n_itr=n_itr,
                discount=gamma,
                step_size=step_size)
            algo.train()
Example no. 8
def run_task(*_):
    # env = normalize(HalfCheetahEnv())

    env = normalize(
        GymEnv(env_name="Acrobot-v1", force_reset=True, record_video=True))

    max_path_length = env.horizon
    print(max_path_length)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64))
    # optimizer = FirstOrderOptimizer(update_method=lasagne.updates.adam, learning_rate=1e-1)

    algo = VPG(env=env,
               policy=policy,
               baseline=baseline,
               batch_size=800,
               max_path_length=500,
               n_itr=10000,
               discount=0.99,
               optimizer_args=dict(learning_rate=0.01, ))
    algo.train()
Example no. 9
def main(args):
    env = GymEnv(args.env_id)

    # If the user provided a starting policy, use it. Otherwise, we start with
    # a fresh policy.
    if args.input_policy is not None:
        with open(args.input_policy, "rb") as f:
            policy = pickle.load(f)
    else:
        policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
#        n_itr=2000,
#        max_path_length=env.horizon,
#        discount=0.99,
#        batch_size=4000,
    )
    algo.train()
    with open(args.output_policy, "wb") as f:
        pickle.dump(policy, f)
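
main expects an argparse namespace exposing env_id, input_policy and output_policy. The original parser is not shown; the flag names below are assumptions chosen to match those attribute names:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--env_id", required=True)         # Gym environment id, e.g. "CartPole-v0"
parser.add_argument("--input_policy", default=None)    # optional pickled policy to warm-start from
parser.add_argument("--output_policy", required=True)  # path where the trained policy is pickled
main(parser.parse_args())
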
Example no. 10
 rand_testing_rews.append(
     test_rand_adv(env, pro_policy, path_length=path_length))
 step_testing_rews = []
 step_testing_rews.append(
     test_step_adv(env, pro_policy, path_length=path_length))
 rand_step_testing_rews = []
 rand_step_testing_rews.append(
     test_rand_step_adv(env, pro_policy, path_length=path_length))
 adv_testing_rews = []
 adv_testing_rews.append(
     test_rand_adv(env, pro_policy, path_length=path_length))
 #embed()
 for ni in range(n_itr):
     logger.log('\n\n\n####expNO{}_{} global itr# {}####\n\n\n'.format(
         ne, adv_name, ni))
     pro_algo.train()
     pro_rews += pro_algo.rews
     all_rews += pro_algo.rews
     logger.log('Protag Reward: {}'.format(np.array(pro_algo.rews).mean()))
     const_testing_rews.append(
         test_const_adv(env, pro_policy, path_length=path_length))
     rand_testing_rews.append(
         test_rand_adv(env, pro_policy, path_length=path_length))
     step_testing_rews.append(
         test_step_adv(env, pro_policy, path_length=path_length))
     rand_step_testing_rews.append(
         test_rand_step_adv(env, pro_policy, path_length=path_length))
     adv_testing_rews.append(
         test_rand_adv(env, pro_policy, path_length=path_length))
     if ni != 0 and ni % save_every == 0:
         ## SAVING INFO ##
Example no. 11
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=5000,
    max_path_length=env.horizon,
    n_itr=200,
    # discount=0.80,
    discount=.9,
    step_size=0.01
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)
algo.train()

## Testing the policy

reward = []
actions = []

s = env.reset()

# done = False

# Testing the algorithm
# while not done:
for i in range(48):

    # Get action recommended by policy
Example no. 12
if args.extra is not None:
    exp_prefix += "_" + args.extra
if args.get_exp_paths:
    print('data/s3/' + exp_prefix.replace('_', '-'))
else:
    if (not args.eval) and (not args.pchange) and (not args.savecomvel) and (
            not args.savejointangles):
        if len(algo_lst) > 0:
            # algo_lst = list(filter(lambda x: x[0] != 20 and x[0] != 30, algo_lst))
            # algo_lst = list(filter(lambda x: x[0] != 10, algo_lst))
            print("algo list", algo_lst)
            for seed, algo in algo_lst:
                print(seed)
                exp_name = '{0}_{1}_{2}'.format(
                    exp_prefix, str(seed), time.strftime("%d-%m-%Y_%H-%M-%S"))
                run_experiment_lite(stub_method_call=algo.train(),
                                    mode=mode,
                                    use_gpu=use_gpu,
                                    use_cloudpickle=False,
                                    pre_commands=['pip install --upgrade pip'],
                                    n_parallel=n_parallel,
                                    snapshot_mode=snapshot_mode,
                                    snapshot_gap=snapshot_gap,
                                    seed=seed,
                                    confirm_remote=False,
                                    exp_prefix=exp_prefix,
                                    exp_name=exp_name)
        else:
            print("seeds", seeds)
            for seed in seeds:
                exp_name = '{0}_{1}_{2}'.format(
Example no. 13
hidden_arc = [str(i) for i in hidden_sizes]
hidden_arc = '_'.join(hidden_arc)

data_dir = 'Reinforce_batchSize_{}_nIters_{}_stepSize_{}_gamma_{}_initStd_{}{}_policyPar_{}_reward_{}'.format(
    batch_size, n_itr, step_size, ''.join(str(gamma).split('.')),
    init_std, learn_std, hidden_arc, reward_fun)

now = datetime.datetime.now(dateutil.tz.tzlocal())
timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

DROPBOX_DIR = '/home/jonas/Dropbox/results/jonas_experiments/'
# log_dir = PROJECT_PATH + '/data/local/' + data_dir + timestamp
log_dir = DROPBOX_DIR + data_dir + timestamp

# Running and saving the experiment
run_experiment_lite(
    algo.train(),
    log_dir=log_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    # exp_prefix="Reinforce_" + env_name,
    # exp_prefix=data_dir
    seed=1,
    mode="local",
    plot=False,
    # terminate_machine=args.dont_terminate_machine,
    added_project_directories=[
        osp.abspath(osp.join(osp.dirname(__file__), '.'))
Example no. 14
from contrib.alexbeloi.is_sampler import ISSampler

# Core rllab imports this example relies on (not shown in the original snippet).
from rllab.algos.vpg import VPG
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

"""
Example using VPG with ISSampler; iterations alternate between live and
importance-sampled iterations.
"""

env = normalize(CartpoleEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
    sampler_cls=ISSampler,
    sampler_args=dict(n_backtrack=1),
)
algo.train()
Example no. 15
def run_task(*_):
    env = normalize(GymEnv(args.env))
    # env.wrapped_env.env.env.env.reward_flag = 'absolute'
    env.wrapped_env.env.env.reward_flag = args.reward

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = True
    init_std = 2

    # hidden_sizes=(8,)
    hidden_sizes = (32, 32)
    # hidden_sizes=(100, 50, 25)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=hidden_sizes,
                               learn_std=learn_std,
                               init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = args.n_itr
    gamma = .9
    step_size = 0.01

    if args.algorithm == 0:
        algo = VPG(env=env,
                   policy=policy,
                   baseline=baseline,
                   batch_size=batch_size,
                   n_itr=n_itr,
                   discount=gamma,
                   step_size=step_size)
    if args.algorithm == 1:
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    if args.algorithm == 2:
        algo = TNPG(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    # if args.algorithm == 4:
    # algo = DDPG(
    # env=env,
    # policy=policy,
    # baseline=baseline,
    # batch_size=batch_size,
    # n_itr=n_itr,
    # discount=gamma,
    # step_size=step_size
    # )
    algo.train()

    return algo
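
Example no. 15 reads args.env, args.reward, args.n_itr and args.algorithm from a parser defined outside the snippet. A sketch of what that parser might look like (flag names and defaults are assumptions):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--env", default="Pendulum-v0")        # Gym environment id (assumed default)
parser.add_argument("--reward", default="absolute")        # forwarded to the wrapped env's reward_flag
parser.add_argument("--n_itr", type=int, default=200)      # number of training iterations
parser.add_argument("--algorithm", type=int, default=0,
                    choices=[0, 1, 2])                     # 0 = VPG, 1 = TRPO, 2 = TNPG
args = parser.parse_args()

algo = run_task()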