Example 1: NPG with a linear policy on Walker2d-v2
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_linear import LinearPolicy
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import time as timer

SEED = 500

e = GymEnv('Walker2d-v2')                 # MuJoCo Walker2d environment
policy = LinearPolicy(e.spec, seed=SEED)  # linear Gaussian policy
baseline = MLPBaseline(e.spec,            # MLP value-function baseline
                       reg_coef=1e-3,
                       batch_size=64,
                       epochs=2,
                       learn_rate=1e-3)
agent = NPG(e,                            # natural policy gradient (CG variant)
            policy,
            baseline,
            normalized_step_size=0.1,
            seed=SEED,
            save_logs=True)

ts = timer.time()
# further keyword arguments (sampling mode, save frequency, etc.) are left
# at train_agent's defaults
train_agent(job_name='walker_nominal',
            agent=agent,
            seed=SEED,
            niter=500,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=4)
print("time taken = %f" % (timer.time() - ts))
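
As a quick sanity check after training, one can roll out the learned mean action. Below is a minimal sketch, assuming mjrl's usual policy interface (get_action returns an info dict whose 'evaluation' entry holds the mean action) and GymEnv's horizon attribute; it is illustrative, not part of the original script.

# Minimal deterministic-rollout sketch (assumes mjrl's standard policy API).
score = 0.0
o = e.reset()
for _ in range(e.horizon):
    a = policy.get_action(o)[1]['evaluation']  # mean action, no exploration noise
    o, r, done, _ = e.step(a)
    score += r
    if done:
        break
print("deterministic rollout return = %f" % score)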
Example 2: per-seed single-task (STL) training over a permuted task order
# (excerpt: imports and the outer setup -- num_seeds, job_name_stl,
#  e_unshuffled, num_tasks, num_cpu -- are defined earlier in the source script)
for i in range(num_seeds):
    # note: the same fixed SEED is reused for every run in this excerpt
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    job_name_stl_seed = job_name_stl + '/seed_{}'.format(i)

    # one environment, baseline, policy, and agent per task
    e = {}
    baseline_stl = {}
    policy_stl = {}
    agent_stl = {}
    task_order = np.random.permutation(num_tasks)
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]
        baseline_stl[task_id] = MLPBaseline(e[task_id].spec,
                                            reg_coef=1e-3,
                                            batch_size=64,
                                            epochs=2,
                                            learn_rate=1e-3,
                                            use_gpu=True)
        policy_stl[task_id] = LinearPolicy(e[task_id].spec, seed=SEED)
        agent_stl[task_id] = NPG(e[task_id],
                                 policy_stl[task_id],
                                 baseline_stl[task_id],
                                 normalized_step_size=0.1,
                                 seed=SEED,
                                 save_logs=True)

    loggers_stl = {}
    grads_stl = {}
    hess_stl = {}
    for task_id in range(num_tasks):
        ts = timer.time()

        # further keyword arguments are left at train_agent's defaults
        train_agent(job_name=job_name_stl_seed,
                    agent=agent_stl[task_id],
                    seed=SEED,
                    niter=200,
                    gamma=0.995,
                    gae_lambda=0.97,
                    num_cpu=num_cpu)
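
Note that because the loop reseeds NumPy and torch with the same constant, every seed_{} run draws an identical task_order. A hedged variant (a suggestion, not part of the original) offsets the seed by the run index so runs actually differ:

# Hypothetical per-run seeding (illustrative; not in the original excerpt).
run_seed = SEED + i
np.random.seed(run_seed)
torch.manual_seed(run_seed)
task_order = np.random.permutation(num_tasks)  # now differs across runs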
Example 3: Elastic Weight Consolidation (EWC) lambda sweep with NPGEWC
# (this excerpt starts inside the per-seed loop; the enclosing
#  `for i in range(num_seeds):` is assumed here, as in Example 2)
for i in range(num_seeds):
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    job_name_ewc_seed = job_name_ewc + '/seed_{}'.format(i)

    # shuffle the task order for this run
    e = {}
    task_order = np.random.permutation(num_tasks)
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]

    for ewc_lambda in lambda_range:
        # fresh baselines per lambda; a single policy is shared across tasks
        baseline_ewc = {}
        for task_id in range(num_tasks):
            baseline_ewc[task_id] = MLPBaseline(e[task_id].spec,
                                                reg_coef=1e-3,
                                                batch_size=64,
                                                epochs=2,
                                                learn_rate=1e-3,
                                                use_gpu=True)

        policy_ewc = LinearPolicy(e[0].spec, seed=SEED)
        agent_ewc = NPGEWC(e,
                           policy_ewc,
                           baseline_ewc,
                           ewc_lambda=ewc_lambda,
                           scaled_lambda=False,
                           normalized_step_size=0.1,
                           seed=SEED,
                           save_logs=True)

        # agent = BatchREINFORCE(e, policy, baseline, learn_rate=0.0001, seed=SEED, save_logs=True)
        job_name_ewc_seed_lambda = job_name_ewc_seed + '/lambda{}'.format(ewc_lambda)
        for task_id in range(num_tasks):
            ts = timer.time()
            # further keyword arguments are left at train_agent's defaults
            train_agent(job_name=job_name_ewc_seed_lambda,
                        agent=agent_ewc,
                        seed=SEED,
                        niter=50,
                        gamma=0.995,
                        gae_lambda=0.97,
                        num_cpu=num_cpu,
                        sample_mode='trajectories')
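
For context, EWC penalizes movement of the shared policy parameters away from values that mattered for earlier tasks, weighting squared deviations by an estimate of the Fisher information; ewc_lambda scales that penalty. Below is a minimal sketch of the regularizer, assuming diagonal Fisher estimates (illustrative only, not NPGEWC's actual implementation):

import torch

def ewc_penalty(params, prev_params, fisher_diag, ewc_lambda):
    # quadratic EWC penalty: (lambda / 2) * sum_j F_j * (theta_j - theta*_j)^2
    penalty = 0.0
    for theta, theta_star, fisher in zip(params, prev_params, fisher_diag):
        penalty = penalty + (fisher * (theta - theta_star) ** 2).sum()
    return 0.5 * ewc_lambda * penalty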
Example 4: launching a training job from a packed variant tuple
def launch_job(tag, variant):
    # (excerpt: os, GymEnv, MLP, LinearPolicy, MLPBaseline, fisher_optim,
    #  train_agent, and build_log_dir are imported earlier in the source script)
    print(len(variant))  # sanity check: the variant tuple packs 21 fields
    seed, env, algo, optim, curv_type, lr, batch_size, cg_iters, cg_residual_tol, cg_prev_init_coef, \
        cg_precondition_empirical, cg_precondition_regu_coef, cg_precondition_exp,  \
        shrinkage_method, lanczos_amortization, lanczos_iters, approx_adaptive, betas, use_nn_policy, gn_vfn_opt, total_samples = variant
    beta1, beta2 = betas

    iters = int(total_samples / batch_size)

    # NN policy
    # ==================================
    e = GymEnv(env)
    if use_nn_policy:
        policy = MLP(e.spec, hidden_sizes=(64, ), seed=seed)
    else:
        policy = LinearPolicy(e.spec, seed=seed)
    vfn_batch_size = 256 if gn_vfn_opt else 64
    vfn_epochs = 2  # epochs do not vary with gn_vfn_opt
    # baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
    baseline = MLPBaseline(e.spec,
                           reg_coef=1e-3,
                           batch_size=vfn_batch_size,
                           epochs=vfn_epochs,
                           learn_rate=1e-3,
                           use_gauss_newton=gn_vfn_opt)
    # agent = NPG(e, policy, baseline, normalized_step_size=0.005, seed=SEED, save_logs=True)

    common_kwargs = dict(lr=lr,
                         curv_type=curv_type,
                         cg_iters=cg_iters,
                         cg_residual_tol=cg_residual_tol,
                         cg_prev_init_coef=cg_prev_init_coef,
                         cg_precondition_empirical=cg_precondition_empirical,
                         cg_precondition_regu_coef=cg_precondition_regu_coef,
                         cg_precondition_exp=cg_precondition_exp,
                         shrinkage_method=shrinkage_method,
                         lanczos_amortization=lanczos_amortization,
                         lanczos_iters=lanczos_iters,
                         batch_size=batch_size)

    if optim == 'ngd':
        optimizer = fisher_optim.NGD(policy.trainable_params, **common_kwargs)
    elif optim == 'natural_adam':
        optimizer = fisher_optim.NaturalAdam(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    elif optim == 'natural_adagrad':
        optimizer = fisher_optim.NaturalAdagrad(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    elif optim == 'natural_amsgrad':
        optimizer = fisher_optim.NaturalAmsgrad(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    else:
        raise ValueError('unknown optimizer: {}'.format(optim))

    if algo == 'trpo':
        from mjrl.algos.trpo_delta import TRPO
        agent = TRPO(e, policy, baseline, optimizer, seed=seed, save_logs=True)
        # agent = TRPO(e, policy, baseline, seed=seed, save_logs=True)
    else:
        from mjrl.algos.npg_cg_delta import NPG
        agent = NPG(e, policy, baseline, optimizer, seed=seed, save_logs=True)

    save_dir = build_log_dir(tag, variant)
    os.makedirs(save_dir, exist_ok=True)

    # print ("Iters:", iters, ", num_traj: ", str(batch_size//1000))
    train_agent(job_name=save_dir,
                agent=agent,
                seed=seed,
                niter=iters,
                gamma=0.995,
                gae_lambda=0.97,
                num_cpu=1,
                sample_mode='samples',
                num_samples=batch_size,
                save_freq=5,
                evaluation_rollouts=5,
                verbose=False)
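
For reference, a hypothetical invocation; every value below is illustrative, and the tuple ordering simply mirrors the unpacking at the top of launch_job:

variant = (0,                 # seed
           'Walker2d-v2',     # env
           'trpo',            # algo
           'ngd',             # optim
           'fisher',          # curv_type
           0.01,              # lr
           5000,              # batch_size
           10, 1e-10, 0.0,    # cg_iters, cg_residual_tol, cg_prev_init_coef
           False, 0.0, 0.0,   # cg preconditioning options
           None, 0, 0,        # shrinkage_method, lanczos_amortization, lanczos_iters
           False,             # approx_adaptive
           (0.9, 0.99),       # betas
           True,              # use_nn_policy
           False,             # gn_vfn_opt
           1000000)           # total_samples
launch_job('demo', variant)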