Code example #1
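# Load a policy from a pickle (or fall back to a randomly initialized MLP),
# then visualize it either on screen or into an offscreen buffer written to a file.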
def main():
    # get args
    args = get_args()

    # load env
    if args.include != "":
        exec("import " + args.include)
    e = GymEnv(args.env_name)

    # load policy
    policy = args.policy
    if args.policy == "":
        pol = MLP(e.spec, init_log_std=-1)
        mode = "exploration"
    else:
        pol = pickle.load(open(policy, 'rb'))
        mode = "evaluation"

    # Visualize the policy
    if args.render == "onscreen":
        # On screen
        e.env.env.visualize_policy(pol,
                                   horizon=e.horizon,
                                   num_episodes=args.num_episodes,
                                   mode=mode)
    else:
        # Offscreen buffer
        e.env.env.visualize_policy_offscreen(pol,
                                             horizon=100,
                                             num_episodes=args.num_episodes,
                                             mode=mode,
                                             filename=args.filename)

    # Close envs
    e.env.env.close_env()
Code example #2
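# Seed the environment, load a pickled policy (or fall back to a freshly
# initialized MLP), and render it with GymEnv.visualize_policy.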
def main(env_name, policy, mode, seed, episodes):
    e = GymEnv(env_name)
    e.set_seed(seed)
    if policy is not None:
        pi = pickle.load(open(policy, 'rb'))
    else:
        pi = MLP(e.spec, hidden_sizes=(32, 32), seed=seed, init_log_std=-1.0)
    # render policy
    e.visualize_policy(pi, num_episodes=episodes, horizon=e.horizon, mode=mode)
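A hypothetical invocation of this entry point (the argument values are placeholders; 'relocate-v0' is borrowed from example #15):

# Render a freshly initialized policy for a few episodes (placeholder values).
main(env_name='relocate-v0', policy=None, mode='exploration', seed=123, episodes=3)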
Code example #3
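# Manual evaluation loop: roll the policy out step by step, using stochastic
# actions in 'exploration' mode and the mean action in 'evaluation' mode,
# rendering each step and printing per-episode scores.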
def main(env_name, policy, mode, seed, episodes, log_std, terminate,
         device_path):
    render = True

    # TODO(Aravind): Map to hardware if device_path is specified

    e = GymEnv(env_name)
    e.set_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if policy is not None:
        policy = pickle.load(open(policy, 'rb'))
    else:
        policy = MLP(e.spec,
                     hidden_sizes=(32, 32),
                     seed=seed,
                     init_log_std=log_std)

    for ep in range(episodes):
        o = e.reset()
        rew = 0.0
        t = 0
        done = False
        while t < e.horizon and not done:
            o = e.get_obs()
            if mode == 'exploration':
                a = policy.get_action(o)[0]
            else:
                a = policy.get_action(o)[1]['evaluation']
            next_o, r, done, info = e.step(a)
            if not terminate:
                done = False
            rew = rew + r
            t = t + 1
            if render:
                e.render()
            if done and t < e.horizon - 1:
                print("Episode terminated early")
        print("episode score = %f " % rew)

    e.reset()
Code example #4
File: run_utils.py Project: Divye02/hand_vil
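# Train an expert policy with NPG inside GEN_DATA_DIR, then move the resulting
# best_policy.pickle into the expert-policies directory.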
def train_expert_policy(config):
    print('-' * 80)
    previous_dir = os.getcwd()
    ensure_dir(GEN_DATA_DIR)
    os.chdir(GEN_DATA_DIR)

    print('Training Expert')
    e = make_gym_env(config['env_id'], config)
    policy = MLP(e.spec, hidden_sizes=(32, 32), seed=config['seed'])
    baseline = MLPBaseline(e.spec,
                           reg_coef=1e-3,
                           batch_size=64,
                           epochs=2,
                           learn_rate=1e-3)
    agent = NPG(e,
                policy,
                baseline,
                normalized_step_size=0.1,
                seed=config['seed'],
                save_logs=True)

    job_name = '%s_expert' % config['env_name']
    # Need to change where it dumps the policy
    train_agent(job_name=job_name,
                agent=agent,
                seed=config['seed'],
                niter=30,
                gamma=0.995,
                gae_lambda=0.97,
                num_cpu=1,
                sample_mode='trajectories',
                num_traj=200,
                save_freq=5,
                evaluation_rollouts=5)
    os.chdir(previous_dir)
    os.rename(
        os.path.join(GEN_DATA_DIR, job_name, 'iterations/best_policy.pickle'),
        os.path.join(EXPERT_POLICIES_DIR, EXPERT_POLICIES[config['env_name']]))
    print('-' * 80)
Code example #5
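# Single-job training runner: create the output directory, record env timing
# metadata, build the MLP policy (optionally warm-started from a pickled
# policy), and train an NPG agent.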
def single_process(job):
    job_start_time = timer.time()

    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    os.chdir(cwd)
    dirpath = os.path.join(job['save_dir'], job['job_name'])
    os.makedirs(dirpath, exist_ok=True)

    # start job
    os.chdir(cwd)
    job_start_time = timer.time()
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    e = GymEnv(job['env_name'])

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip
    job_data_file = open(dirpath + '/job_data.txt', 'w')
    pprint.pprint(job, stream=job_data_file)

    job_data_file.close()

    # Make policy (???vik: sizes are hard coded)
    if 'init_policy' in job:
        policy = MLP(e.spec,
                     init_log_std=job['init_std'],
                     hidden_sizes=(32, 32),
                     seed=job['seed'])
        loaded_policy = pickle.load(open(job['init_policy'], 'rb'))
        loaded_params = loaded_policy.get_param_values()
        print('log std values in loaded policy = ')
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        # params[-policy.m:] += 1.0
        policy.set_param_values(loaded_params)
        del job['init_policy']

    else:
        policy = MLP(e.spec,
                     init_log_std=job['init_std'],
                     hidden_sizes=(32, 32),
                     seed=job['seed'])
    # Agent
    agent = NPG(e,
                policy,
                baseline,
                seed=job['seed'],
                normalized_step_size=job['normalized_step_size'],
                save_logs=job['save_logs'],
                FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=dirpath,
        agent=agent,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job['num_traj'],
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
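A hedged usage sketch of the job dictionary this runner expects. The keys mirror those read above; all values are placeholders, and it assumes the imports and module-level cwd from the original script are in place:

example_job = {
    'job_name': 'npg_demo',                  # placeholder job name
    'save_dir': '/tmp/mjrl_jobs',            # placeholder output root
    'env_name': 'relocate-v0',               # env name borrowed from example #15
    'seed': 123,
    'init_std': -0.5,                        # used as init_log_std for the MLP policy
    'normalized_step_size': 0.1,
    'save_logs': True,
    'FIM_invert_args': {'iters': 10, 'damping': 1e-4},  # placeholder CG settings
    'niter': 10,
    'gamma': 0.995,
    'gae_lambda': 0.97,
    'num_cpu': 1,
    'sample_mode': 'trajectories',
    'num_traj': 50,
    'evaluation_rollouts': 5,
    'save_freq': 5,
}
single_process(example_job)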
Code example #6
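# Standalone PPO training setup for the half-cheetah-joint-v0 environment.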
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.batch_reinforce import BatchREINFORCE
from mjrl.algos.ppo_clip import PPO
from mjrl.utils.train_agent import train_agent
import os

import gym
import argparse
import time as timer

SEED = 500
#
e = GymEnv("half-cheetah-joint-v0")
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=64,
                       epochs=2,
                       learn_rate=1e-3)
agent = PPO(e, policy, baseline, save_logs=True)

print("========================================")
print("Starting policy learning")
print("========================================")

ts = timer.time()
train_agent(job_name='beta_test',
            agent=agent,
            seed=SEED,
Code example #7
File: job_script.py Project: yangdeai/hand_dapg
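# DAPG-style job script: fill in defaults for lam_0/lam_1, dump the job config,
# build env/policy/baseline, and behavior-clone from demonstrations when the
# algorithm is not plain NPG.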
assert any([job_data['algorithm'] == a for a in ['NPG', 'BCRL', 'DAPG']])
job_data['lam_0'] = 0.0 if 'lam_0' not in job_data.keys() else job_data['lam_0']
job_data['lam_1'] = 0.0 if 'lam_1' not in job_data.keys() else job_data['lam_1']
EXP_FILE = JOB_DIR + '/job_config.json'
with open(EXP_FILE, 'w') as f:
    json.dump(job_data, f, indent=4)

# ===============================================================================
# Train Loop
# ===============================================================================

e = GymEnv(job_data['env'])
policy = MLP(e.spec,
             hidden_sizes=job_data['policy_size'],
             seed=job_data['seed'])
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=job_data['vf_batch_size'],
                       epochs=job_data['vf_epochs'],
                       learn_rate=job_data['vf_learn_rate'])

# Get demonstration data if necessary and behavior clone
if job_data['algorithm'] != 'NPG':
    print("========================================")
    print("Collecting expert demonstrations")
    print("========================================")
    demo_paths = pickle.load(open(job_data['demo_file'], 'rb'))

    bc_agent = BC(demo_paths,
Code example #8
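# Variant of the single-process job runner that writes results into
# job['output_dir'] and supports either trajectory- or sample-based sampling.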
def single_process(job):
    job_start_time = timer.time()

    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    # Create a directory for the job results.
    job_dir = os.path.join(job['output_dir'])
    if not os.path.isdir(job_dir):
        os.mkdir(job_dir)

    # start job
    job_start_time = timer.time()
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    env_name = job['env_name']
    # adept_envs.global_config.set_config(env_name, {
    #     'robot_params': job['robot'],
    #     **job.get('env_params', {}),
    # })
    e = GymEnv(env_name)

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip

    with open(os.path.join(job_dir, 'job_data.txt'), 'w') as job_data_file:
        pprint.pprint(job, stream=job_data_file)

    if 'init_policy' in job:
        policy = MLP(e.spec, init_log_std=job['init_std'], hidden_sizes=(32,32), seed=job['seed'])
        loaded_policy = pickle.load(open(job['init_policy'], 'rb'))
        loaded_params = loaded_policy.get_param_values()
        print("log std values in loaded policy = ")
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small 
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        loaded_params[-policy.m:] += job['init_std']
        policy.set_param_values(loaded_params)
        del job['init_policy']

    else:
        policy = MLP(
            e.spec,
            init_log_std=job['init_std'],
            hidden_sizes=job['hidden_sizes'],
            # hidden_sizes=(32, 32),
            seed=job['seed'])

    # Agent
    agent = NPG(
        e,
        policy,
        baseline,
        seed=job['seed'],
        normalized_step_size=job['normalized_step_size'],
        save_logs=job['save_logs'],
        FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=job['job_name'],
        agent=agent,
        # save_dir=job_dir,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job.get('num_traj'),
        num_samples=job.get('num_samples'),
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
Code example #9
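# Validate the sampling mode (trajectories vs. samples), then build the env,
# policy, and baseline and construct the agent for the chosen algorithm.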
    assert 'rl_num_traj' in job_data.keys()
    job_data['rl_num_samples'] = 0  # will be ignored
elif job_data['sample_mode'] == 'samples':
    assert 'rl_num_samples' in job_data.keys()
    job_data['rl_num_traj'] = 0  # will be ignored
else:
    print("Unknown sampling mode. Choose either trajectories or samples")
    exit()

# ===============================================================================
# Train Loop
# ===============================================================================

e = GymEnv(job_data['env'])
policy = MLP(e.spec,
             hidden_sizes=job_data['policy_size'],
             seed=job_data['seed'],
             init_log_std=job_data['init_log_std'])
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=job_data['vf_batch_size'],
                       hidden_sizes=job_data['vf_hidden_size'],
                       epochs=job_data['vf_epochs'],
                       learn_rate=job_data['vf_learn_rate'])

# Construct the algorithm
if job_data['algorithm'] == 'NPG':
    # Other hyperparameters (like number of CG steps) can be specified in config for pass through
    # or default hyperparameters will be used
    agent = NPG(e,
                policy,
                baseline,
Code example #10
File: run_model_accel_npg.py Project: zivzone/mjrl
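# Model-accelerated NPG setup: seed numpy/torch, build an ensemble of dynamics
# models plus an MLP policy and baseline, and wire them into ModelAccelNPG.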
np.random.seed(SEED)
torch.random.manual_seed(SEED)

# TODO(Aravind): Map to hardware if device_path is specified

e = GymEnv(ENV_NAME)
e.set_seed(SEED)
models = [
    DynamicsModel(state_dim=e.observation_dim,
                  act_dim=e.action_dim,
                  seed=SEED + i,
                  **job_data) for i in range(job_data['num_models'])
]
policy = MLP(e.spec,
             seed=SEED,
             hidden_sizes=job_data['policy_size'],
             init_log_std=job_data['init_log_std'],
             min_log_std=-2.5)
baseline = MLPBaseline(
    e.spec,
    reg_coef=1e-3,
    batch_size=256,
    epochs=2,
    learn_rate=1e-3,
    use_gpu=(True if job_data['device'] == 'cuda' else False))
# baseline = QuadraticBaseline(e.spec)
agent = ModelAccelNPG(
    fitted_model=models,
    env=e,
    policy=policy,
    baseline=baseline,
Code example #11
File: examine_policy.py Project: vikashplus/mjrl_dev
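# Policy examination utility: load a policy (random, saved, or from a path);
# either collect serial rollouts and dump statistics, plots, and paths, or
# visualize the policy on screen / offscreen.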
def main():
    # See evaluate_args.py for the list of args.
    args = evaluate_args.get_args()

    if args.include != "":
        exec("import " + args.include)

    if args.env_name == "":
        print(
            "Unknown env. Use 'python examine_policy --help' for instructions")
        return

    # load envs
    # adept_envs.global_config.set_config(
    #     args.env_name, {
    #         'robot_params': {
    #             'is_hardware': args.hardware,
    #             'legacy': args.legacy,
    #             'device_name': args.device,
    #             'overlay': args.overlay,
    #             'calibration_mode': args.calibration_mode,
    #         },
    #     })
    e = GymEnv(args.env_name)
    # e.env.env._seed(args.seed)

    # load policy
    policy = args.policy
    mode = args.mode
    if args.policy == "":
        pol = MLP(e.spec, init_log_std=0.0)
        mode = "exploration"
        policy = "random_policy.pickle"

    elif args.policy == "saved":
        curr_dir = os.path.dirname(os.path.abspath(__file__))
        policy = curr_dir + "/" + args.env_name + "/best_policy.pickle"
        pol = pickle.load(open(policy, 'rb'))

    else:
        # do this on the remote machine ============
        # weights = pol.get_param_values()
        # pickle.dump(weights, open("weights.pickle", 'wb'))
        # on local machine ============
        # pol = MLP(e.spec, init_log_std=-3.50)
        # loaded_params = pickle.load(open("weights.pickle", 'rb'))
        # pol.set_param_values(loaded_params)
        # pickle.dump(pol, open(policy, 'wb')) # save the policy
        pol = pickle.load(open(policy, 'rb'))

    # dump rollouts
    if (args.num_samples > 0):
        # if (mode == "evaluation"):
        # pol.log_std = pol.log_std - 10  # since there is no other way of specifying that we want mean policy sampling

        # parallel sampling
        # paths = trajectory_sampler.sample_paths_parallel(num_samples, pol, e.horizon, env_name, 0, 1)

        # Serial sampling
        paths = do_rollout(num_traj=args.num_samples,
                           env=e,
                           policy=pol,
                           eval_mode=True,
                           horizon=e.horizon,
                           base_seed=args.seed)

        # Policy stats
        eval_success = e.env.env.evaluate_success(paths)
        eval_rewards = np.mean(
            [np.sum(p['env_infos']['rwd_dict']['total'])
             for p in paths]) / e.horizon
        eval_score = np.mean([
            np.sum(p['env_infos']['score']) / len(p['env_infos']['score'])
            for p in paths
        ])
        # evaluate_success = np.mean([np.sum(p['env_infos']['rwd_dict']['total']) for p in paths])

        stats = "Policy stats:: <mean reward/step: %+.3f>, <mean score/step: %+.3f>, <mean success: %2.1f%%>\n" % (
            eval_rewards, eval_score, eval_success)
        for ipath, path in enumerate(paths):
            stats = stats + "path%d:: <reward[-1]: %+.3f>, <score[-1]: %+.3f>\n" % (
                ipath, path['env_infos']['rwd_dict']['total'][-1],
                path['env_infos']['score'][-1])
        print(stats)

        # save to a file
        file_name = policy[:-7] + '_stats.txt'
        print(stats, file=open(file_name, 'w'))
        print("saved ", file_name)

        # plot_horizon_distribution(paths, e, fileName_prefix=policy[:-7])
        plot_paths(paths, e, fileName_prefix=policy[:-7])
        file_name = policy[:-7] + '_paths.pickle'
        pickle.dump(paths, open(file_name, 'wb'))
        print("saved ", file_name)

    else:
        # Visualize the policy
        if args.render == "onscreen":
            # On screen
            e.env.env.visualize_policy(pol,
                                       horizon=e.horizon,
                                       num_episodes=args.num_episodes,
                                       mode=mode)
        else:
            # Offscreen buffer
            e.env.env.visualize_policy_offscreen(
                pol,
                horizon=100,
                num_episodes=args.num_episodes,
                mode=mode,
                filename=args.filename)

    # Close envs
    e.env.env.close_env()
Code example #12
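# Hyperparameter-sweep launcher: unpack a variant tuple, build an MLP or linear
# policy and an MLP baseline, construct a Fisher-geometry optimizer
# (NGD / NaturalAdam / NaturalAdagrad / NaturalAmsgrad), and train with TRPO or NPG.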
def launch_job(tag, variant):

    print(len(variant))
    (seed, env, algo, optim, curv_type, lr, batch_size, cg_iters,
     cg_residual_tol, cg_prev_init_coef, cg_precondition_empirical,
     cg_precondition_regu_coef, cg_precondition_exp, shrinkage_method,
     lanczos_amortization, lanczos_iters, approx_adaptive, betas,
     use_nn_policy, gn_vfn_opt, total_samples) = variant
    beta1, beta2 = betas

    iters = int(total_samples / batch_size)

    # NN policy
    # ==================================
    e = GymEnv(env)
    if use_nn_policy:
        policy = MLP(e.spec, hidden_sizes=(64, ), seed=seed)
    else:
        policy = LinearPolicy(e.spec, seed=seed)
    vfn_batch_size = 256 if gn_vfn_opt else 64
    vfn_epochs = 2 if gn_vfn_opt else 2
    # baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
    baseline = MLPBaseline(e.spec,
                           reg_coef=1e-3,
                           batch_size=vfn_batch_size,
                           epochs=2,
                           learn_rate=1e-3,
                           use_gauss_newton=gn_vfn_opt)
    # agent = NPG(e, policy, baseline, normalized_step_size=0.005, seed=SEED, save_logs=True)

    common_kwargs = dict(lr=lr,
                         curv_type=curv_type,
                         cg_iters=cg_iters,
                         cg_residual_tol=cg_residual_tol,
                         cg_prev_init_coef=cg_prev_init_coef,
                         cg_precondition_empirical=cg_precondition_empirical,
                         cg_precondition_regu_coef=cg_precondition_regu_coef,
                         cg_precondition_exp=cg_precondition_exp,
                         shrinkage_method=shrinkage_method,
                         lanczos_amortization=lanczos_amortization,
                         lanczos_iters=lanczos_iters,
                         batch_size=batch_size)

    if optim == 'ngd':
        optimizer = fisher_optim.NGD(policy.trainable_params, **common_kwargs)
    elif optim == 'natural_adam':
        optimizer = fisher_optim.NaturalAdam(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    elif optim == 'natural_adagrad':
        optimizer = fisher_optim.NaturalAdagrad(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    elif optim == 'natural_amsgrad':
        optimizer = fisher_optim.NaturalAmsgrad(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)

    if algo == 'trpo':
        from mjrl.algos.trpo_delta import TRPO
        agent = TRPO(e, policy, baseline, optimizer, seed=seed, save_logs=True)
        # agent = TRPO(e, policy, baseline, seed=seed, save_logs=True)
    else:
        from mjrl.algos.npg_cg_delta import NPG
        agent = NPG(e, policy, baseline, optimizer, seed=seed, save_logs=True)

    save_dir = build_log_dir(tag, variant)
    os.makedirs(save_dir, exist_ok=True)

    # print ("Iters:", iters, ", num_traj: ", str(batch_size//1000))
    train_agent(job_name=save_dir,
                agent=agent,
                seed=seed,
                niter=iters,
                gamma=0.995,
                gae_lambda=0.97,
                num_cpu=1,
                sample_mode='samples',
                num_samples=batch_size,
                save_freq=5,
                evaluation_rollouts=5,
                verbose=False)  #True)
Code example #13
File: metaworld_stl.py Project: Lifelong-ML/LPG-FTW
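    # Single-task-learning baseline: build one env/policy/baseline/NPG agent per
    # task in a random task order, then train each task's agent separately.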
    e = {}
    baseline_stl = {}
    policy_stl = {}
    agent_stl = {}
    task_order = np.random.permutation(num_tasks)
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]
        baseline_stl[task_id] = MLPBaseline(e[task_id].spec,
                                            reg_coef=1e-3,
                                            batch_size=64,
                                            epochs=10,
                                            learn_rate=1e-3,
                                            use_gpu=True)
        policy_stl[task_id] = MLP(e[task_id].spec,
                                  hidden_sizes=(32, 32),
                                  seed=SEED)
        agent_stl[task_id] = NPG(e[task_id],
                                 policy_stl[task_id],
                                 baseline_stl[task_id],
                                 normalized_step_size=0.01,
                                 seed=SEED,
                                 save_logs=True)

    loggers_stl = {}
    grads_stl = {}
    hess_stl = {}
    for task_id in range(num_tasks):
        ts = timer.time()

        train_agent(job_name=job_name_stl_seed,
Code example #14
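# Config-driven training entry point: optional behavior cloning from
# demonstrations, optional IRL reward-model loading, then an RL phase with
# DAPG, PPO, or an IRL-wrapped generator.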
def train(cfg, run_no, multiple_runs, seed):
    # ===============================================================================
    # Train Loop
    # ===============================================================================

    gpus_available = setup_gpus()
    env_name, job_name = parse_task(cfg)
    env = GymEnv(env_name, **cfg['env_kwargs'])
    policy = MLP(env.spec, hidden_sizes=tuple(cfg['policy_size']), seed=seed)
    baseline = MLPBaseline(env.spec,
                           reg_coef=1e-3,
                           batch_size=cfg['value_function']['batch_size'],
                           epochs=cfg['value_function']['epochs'],
                           learn_rate=cfg['value_function']['lr'],
                           use_gpu=False)

    # Get demonstration data if necessary and behavior clone
    print("========================================")
    print("Collecting expert demonstrations")
    print("========================================")
    demo_filename = cfg['demo_file']
    if cfg['demo_file'] is not None:
        demo_paths = pickle.load(open(demo_filename, 'rb'))
    else:
        demo_paths = None

    if 'demo_file' in cfg['BC'] and cfg['BC']['demo_file'] != 'default':
        bc_demo_file_path = cfg['BC']['demo_file']
        if cfg['train']['use_timestamp']:
            bc_demo_file_path = bc_demo_file_path.replace(
                'v0', 'v0_timestamp_inserted')
        bc_demo_paths = pickle.load(open(bc_demo_file_path, 'rb'))
    else:
        bc_demo_paths = demo_paths
    if 'num_demo' in cfg and cfg['num_demo']:
        demo_paths = demo_paths[:cfg['num_demo']]
    if cfg['algorithm'] == 'DAPG_based_IRL':
        if 'get_paths_for_initialisation' in cfg['based_IRL']:
            if cfg['based_IRL']['get_paths_for_initialisation']:
                bc_demo_paths = add_dumped_paths_for_BC(demo_paths, cfg)

    ts = timer.time()
    if bc_demo_paths is not None and cfg['BC']['epochs'] > 0:
        print("========================================")
        print("Running BC with expert demonstrations")
        print("========================================")
        bc_agent = BC(bc_demo_paths[:25],
                      policy=policy,
                      epochs=cfg['BC']['epochs'],
                      batch_size=cfg['BC']['batch_size'],
                      lr=cfg['BC']['lr'],
                      loss_type='MSE',
                      set_transforms=True)

        bc_agent.train()
        print("========================================")
        print("BC training complete !!!")
        print("time taken = %f" % (timer.time() - ts))
        print("========================================")

    if cfg['algorithm'] == 'IRL' or cfg['algorithm'] == 'DAPG_based_IRL':
        IRL_cfg = cfg
        if cfg['algorithm'] == 'DAPG_based_IRL':
            IRL_job_cfg_path = os.path.join("Runs",
                                            cfg['based_IRL']['IRL_job'],
                                            "config.yaml")
            IRL_cfg = yamlreader.yaml_load(IRL_job_cfg_path)

        irl_model = get_irl_model(env, demo_paths, IRL_cfg, seed)
        if cfg['algorithm'] == 'DAPG_based_IRL':
            full_irl_model_checkpoint_path = os.path.join(
                'Runs', cfg['based_IRL']['IRL_job'])
            if cfg['based_IRL']['IRL_run_no'] is not None:
                full_irl_model_checkpoint_path = os.path.join(
                    full_irl_model_checkpoint_path,
                    'run_' + str(cfg['based_IRL']['IRL_run_no']))
            if cfg['based_IRL']['IRL_step'] is not None:
                irl_model.load_iteration(
                    path=full_irl_model_checkpoint_path,
                    iteration=cfg['based_IRL']['IRL_step'])
            else:
                irl_model.load_last(path=full_irl_model_checkpoint_path)
            # Required to fully load the model from the given path before
            # switching to a different path during training.
            irl_model.eval(demo_paths)

    if cfg['eval_rollouts'] > 0:
        score = env.evaluate_policy(policy,
                                    num_episodes=cfg['eval_rollouts'],
                                    mean_action=True)
        print("Score with behavior cloning = %f" % score[0][0])

    if not cfg['use_DAPG']:
        # We throw away the demo data when training from scratch or fine-tuning with RL without explicit augmentation
        demo_paths = None

    # ===============================================================================
    # RL Loop
    # ===============================================================================

    irl_kwargs = None
    if cfg['algorithm'] == 'IRL' or cfg['algorithm'] == 'DAPG_based_IRL':
        if (cfg['algorithm'] == 'DAPG_based_IRL'
                or cfg['IRL']['generator_alg'] == 'DAPG'):
            generator_algorithm = DAPG
            generator_args = dict(
                demo_paths=demo_paths,
                normalized_step_size=cfg['RL']['step_size'],
                seed=seed,
                lam_0=cfg['RL']['lam_0'],
                lam_1=cfg['RL']['lam_1'],
                save_logs=cfg['save_logs'],
                augmentation=cfg['train']['augmentation'],
                entropy_weight=cfg['train']['entropy_weight'])
        elif cfg['IRL']['generator_alg'] == 'PPO':
            generator_algorithm = PPO
            generator_args = dict(
                demo_paths=demo_paths,
                epochs=cfg['PPO']['epochs'],
                mb_size=cfg['PPO']['batch_size'],
                target_kl_dist=cfg['PPO']['target_kl_dist'],
                seed=seed,
                lam_0=cfg['RL']['lam_0'],
                lam_1=cfg['RL']['lam_1'],
                save_logs=cfg['save_logs'],
                clip_coef=cfg['PPO']['clip_coef'],
                learn_rate=cfg['PPO']['lr'],
                augmentation=cfg['train']['augmentation'],
                entropy_weight=cfg['train']['entropy_weight'])
        else:
            raise ValueError("Generator algorithm name",
                             cfg['IRL']['generator_alg'], "not supported")
        irl_class = irl_training_class(generator_algorithm)
        rl_agent = irl_class(
            env,
            policy,
            baseline,
            train_irl=cfg['algorithm'] != 'DAPG_based_IRL',
            discr_lr=IRL_cfg['IRL']['discr']['lr'],
            irl_batch_size=IRL_cfg['IRL']['discr']['batch_size'],
            lower_lr_on_main_loop_percentage=IRL_cfg['IRL']['discr']
            ['lower_lr_on_main_loop_percentage'],
            irl_model=irl_model,
            **generator_args)
        irl_kwargs = dict(policy=dict(
            min_updates=1,
            max_updates=IRL_cfg['IRL']['max_gen_updates']
            if cfg['algorithm'] != 'DAPG_based_IRL' else 0,
            steps_till_max=IRL_cfg['IRL']['steps_till_max_gen_updates']))
    elif cfg['algorithm'] == 'DAPG':
        rl_agent = DAPG(env,
                        policy,
                        baseline,
                        demo_paths=demo_paths,
                        normalized_step_size=cfg['RL']['step_size'],
                        lam_0=cfg['RL']['lam_0'],
                        lam_1=cfg['RL']['lam_1'],
                        seed=seed,
                        save_logs=cfg['save_logs'],
                        augmentation=cfg['train']['augmentation'],
                        entropy_weight=cfg['train']['entropy_weight'])
    elif cfg['algorithm'] == 'PPO':
        rl_agent = PPO(env,
                       policy,
                       baseline,
                       demo_paths=demo_paths,
                       epochs=cfg['PPO']['epochs'],
                       mb_size=cfg['PPO']['batch_size'],
                       target_kl_dist=cfg['PPO']['target_kl_dist'],
                       seed=seed,
                       lam_0=cfg['RL']['lam_0'],
                       lam_1=cfg['RL']['lam_1'],
                       save_logs=cfg['save_logs'],
                       clip_coef=cfg['PPO']['clip_coef'],
                       learn_rate=cfg['PPO']['lr'],
                       augmentation=cfg['train']['augmentation'],
                       entropy_weight=cfg['train']['entropy_weight'])
    else:
        raise ValueError("Algorithm name", cfg['algorithm'], "not supported")

    # get IRL model kwargs if doing DAPG based on IRL
    env_kwargs = cfg['env_kwargs']
    if cfg['algorithm'] == 'DAPG_based_IRL':
        rl_agent.irl_model = irl_model

    # dump YAML config file
    job_path = os.path.join("Runs", job_name)
    if not os.path.isdir(job_path):
        os.makedirs(job_path)
    with open(os.path.join(job_path, 'config.yaml'), 'w') as f:
        dump(cfg, f)

    print("========================================")
    print("Starting reinforcement learning phase")
    print("========================================")

    ts = timer.time()
    train_agent(
        job_name=job_name,
        agent=rl_agent,
        seed=seed,
        niter=cfg['train']['steps'],
        gamma=cfg['train']['gamma'],
        gae_lambda=cfg['train']['gae_lambda'],
        num_cpu=cfg['num_cpu'],
        sample_mode='trajectories',
        num_traj=cfg['train']['num_traj'],
        save_freq=cfg['train']['save_freq'],
        evaluation_rollouts=cfg['eval_rollouts'],
        should_fresh_start=bool(cfg['IRL']['initialization_job'])
        if cfg['algorithm'] == 'IRL' else False,
        irl_kwargs=irl_kwargs,
        temperature_max=cfg['IRL']['temperature_max']
        if cfg['algorithm'] == 'IRL' else 0,
        temperature_min=cfg['IRL']['temperature_min']
        if cfg['algorithm'] == 'IRL' else 0,
        plot_keys=cfg['plot_keys'],
        run_no=run_no if multiple_runs else None,
        env_kwargs=env_kwargs,
        fixed_evaluation_init_states=cfg['fixed_evaluation_init_states'])
    print("time taken = %f" % (timer.time() - ts))
Code example #15
"""
This script illustrates training from scratch
using NPG on the relocate-v0 task.
"""

from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import mj_envs
import time as timer

SEED = 100

e = GymEnv('relocate-v0')
policy = MLP(e.spec, hidden_sizes=(64, 64), seed=SEED, init_log_std=-0.5)
baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True)

print("========================================")
print("Training with RL")
ts = timer.time()
train_agent(job_name='relocate_scratch',
            agent=agent,
            seed=SEED,
            niter=100,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=5,
            sample_mode='trajectories',
            num_traj=200,
Code example #16
def experiment(variant):
    """
    This is a job script for running NPG/DAPG on hand tasks and other gym envs.
    Note that DAPG generalizes PG and BC init + PG finetuning.
    With appropriate settings of parameters, we can recover the full family.
    """
    import mj_envs

    job_data = default_job_data.copy()
    job_data.update(variant)

    env_params = ENV_PARAMS[variant['env_class']]
    job_data.update(env_params)

    assert 'algorithm' in job_data.keys()
    assert any([job_data['algorithm'] == a for a in ['NPG', 'BCRL', 'DAPG']])

    JOB_DIR = logger.get_snapshot_dir()

    # ===============================================================================
    # Train Loop
    # ===============================================================================

    seed = int(job_data['seedid'])

    e = GymEnv(job_data['env_id'])
    policy = MLP(e.spec, hidden_sizes=job_data['policy_size'], seed=seed)
    baseline = MLPBaseline(e.spec,
                           reg_coef=1e-3,
                           batch_size=job_data['vf_batch_size'],
                           epochs=job_data['vf_epochs'],
                           learn_rate=job_data['vf_learn_rate'])

    # Get demonstration data if necessary and behavior clone
    if job_data['algorithm'] != 'NPG':
        print("========================================")
        print("Collecting expert demonstrations")
        print("========================================")
        demo_paths = load_local_or_remote_file(job_data['demo_file'], 'rb')

        bc_agent = BC(demo_paths,
                      policy=policy,
                      epochs=job_data['bc_epochs'],
                      batch_size=job_data['bc_batch_size'],
                      lr=job_data['bc_learn_rate'],
                      loss_type='MSE',
                      set_transforms=False)
        in_shift, in_scale, out_shift, out_scale = bc_agent.compute_transformations()
        bc_agent.set_transformations(in_shift, in_scale, out_shift, out_scale)
        bc_agent.set_variance_with_data(out_scale)

        ts = timer.time()
        print("========================================")
        print("Running BC with expert demonstrations")
        print("========================================")
        bc_agent.train()
        print("========================================")
        print("BC training complete !!!")
        print("time taken = %f" % (timer.time() - ts))
        print("========================================")

        if job_data['eval_rollouts'] >= 1:
            score = e.evaluate_policy(policy,
                                      num_episodes=job_data['eval_rollouts'],
                                      mean_action=True)
            print("Score with behavior cloning = %f" % score[0][0])

    if job_data['algorithm'] != 'DAPG':
        # We throw away the demo data when training from scratch or fine-tuning with RL without explicit augmentation
        demo_paths = None

    # ===============================================================================
    # RL Loop
    # ===============================================================================

    rl_agent = DAPG(e,
                    policy,
                    baseline,
                    demo_paths,
                    normalized_step_size=job_data['rl_step_size'],
                    lam_0=job_data['lam_0'],
                    lam_1=job_data['lam_1'],
                    seed=seed,
                    save_logs=True)

    print("========================================")
    print("Starting reinforcement learning phase")
    print("========================================")

    ts = timer.time()
    train_agent(job_name=JOB_DIR,
                agent=rl_agent,
                seed=seed,
                niter=job_data['rl_num_iter'],
                gamma=job_data['rl_gamma'],
                gae_lambda=job_data['rl_gae'],
                num_cpu=job_data['num_cpu'],
                sample_mode='trajectories',
                num_traj=job_data['rl_num_traj'],
                save_freq=job_data['save_freq'],
                evaluation_rollouts=job_data['eval_rollouts'])
    print("time taken = %f" % (timer.time() - ts))