def main():
    """Visualize a policy in a Gym environment, on screen or off screen.

    Reads its configuration from ``get_args()``:

    * ``include``: optional module name, imported only for its side effects
      (presumably environment registration -- confirm against the caller).
    * ``env_name``: name handed to ``GymEnv``.
    * ``policy``: path of a pickled policy; an empty string selects a freshly
      initialised ``MLP`` run in "exploration" mode.
    * ``render``: ``"onscreen"`` for a live viewer, anything else renders to
      an offscreen buffer written to ``filename``.
    * ``num_episodes``: number of episodes to visualize.
    """
    import importlib

    # get args
    args = get_args()

    # load env
    # Compare by value: identity checks against a string literal are
    # implementation-dependent and raise SyntaxWarning on Python >= 3.8.
    if args.include != "":
        # importlib avoids running an arbitrary CLI string through exec().
        importlib.import_module(args.include)
    e = GymEnv(args.env_name)

    # load policy
    if args.policy == "":
        pol = MLP(e.spec, init_log_std=-1)
        mode = "exploration"
    else:
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(args.policy, 'rb') as policy_file:
            pol = pickle.load(policy_file)
        mode = "evaluation"

    # Visualized policy
    if args.render == "onscreen":
        # On screen
        e.env.env.visualize_policy(pol,
                                   horizon=e.horizon,
                                   num_episodes=args.num_episodes,
                                   mode=mode)
    else:
        # Offscreen buffer (horizon is fixed at 100 steps here)
        e.env.env.visualize_policy_offscreen(pol,
                                             horizon=100,
                                             num_episodes=args.num_episodes,
                                             mode=mode,
                                             filename=args.filename)

    # Close envs
    e.env.env.close_env()
def main(env_name, policy, mode, seed, episodes):
    """Render a policy in the named environment.

    Args:
        env_name: environment name handed to ``GymEnv``.
        policy: path of a pickled policy, or ``None`` to use a freshly
            initialised ``MLP`` with (32, 32) hidden units.
        mode: forwarded unchanged to ``visualize_policy``.
        seed: seed applied to the environment and to a fresh policy.
        episodes: number of episodes to render.
    """
    e = GymEnv(env_name)
    e.set_seed(seed)

    if policy is not None:
        # Close the file deterministically instead of leaking the handle.
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(policy, 'rb') as policy_file:
            pi = pickle.load(policy_file)
    else:
        pi = MLP(e.spec, hidden_sizes=(32, 32), seed=seed, init_log_std=-1.0)

    # render policy
    e.visualize_policy(pi, num_episodes=episodes, horizon=e.horizon, mode=mode)
def main(env_name, policy, mode, seed, episodes, log_std, terminate,
         device_path):
    """Roll out a policy step by step, rendering every step.

    Args:
        env_name: environment name handed to ``GymEnv``.
        policy: path of a pickled policy, or ``None`` to use a freshly
            initialised ``MLP`` with (32, 32) hidden units.
        mode: ``'exploration'`` uses the first element returned by
            ``policy.get_action``; any other value uses the ``'evaluation'``
            entry of the second element.
        seed: seed for the environment, NumPy, torch and a fresh policy.
        episodes: number of episodes to run.
        log_std: ``init_log_std`` of a freshly created policy.
        terminate: when exactly ``False``, the ``done`` flag reported by the
            environment is ignored and episodes run for the full horizon.
        device_path: currently unused (see TODO below).
    """
    render = True

    # TODO(Aravind): Map to hardware if device_path is specified

    e = GymEnv(env_name)
    e.set_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if policy is not None:
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(policy, 'rb') as policy_file:
            policy = pickle.load(policy_file)
    else:
        policy = MLP(e.spec, hidden_sizes=(32, 32), seed=seed,
                     init_log_std=log_std)

    for ep in range(episodes):
        e.reset()
        rew = 0.0
        t = 0
        done = False
        # Use truthiness rather than `done is False`: environments commonly
        # return a NumPy bool, for which the identity test is never true and
        # the episode would stop after a single step.
        while t < e.horizon and not done:
            o = e.get_obs()
            action, action_info = policy.get_action(o)
            a = action if mode == 'exploration' else action_info['evaluation']
            next_o, r, done, ifo = e.step(a)
            # Identity check kept on purpose: only an explicit False disables
            # early termination, exactly as before.
            if terminate is False:
                done = False
            rew = rew + r
            t = t + 1
            if render:
                e.render()
            if done and t < e.horizon - 1:
                print("Episode terminated early")
        print("episode score = %f " % rew)

    e.reset()
def train_expert_policy(config):
    """Train an NPG expert and move its best policy to the expert directory.

    Training runs with ``GEN_DATA_DIR`` as the working directory; the previous
    working directory is restored afterwards, even if training fails.

    Args:
        config: mapping providing ``'env_id'`` (passed to ``make_gym_env``),
            ``'env_name'`` (used for the job name and as the key into
            ``EXPERT_POLICIES``) and ``'seed'``.
    """
    print('-' * 80)
    previous_dir = os.getcwd()
    ensure_dir(GEN_DATA_DIR)
    os.chdir(GEN_DATA_DIR)
    try:
        print('Training Expert')
        e = make_gym_env(config['env_id'], config)
        policy = MLP(e.spec, hidden_sizes=(32, 32), seed=config['seed'])
        baseline = MLPBaseline(e.spec,
                               reg_coef=1e-3,
                               batch_size=64,
                               epochs=2,
                               learn_rate=1e-3)
        agent = NPG(e,
                    policy,
                    baseline,
                    normalized_step_size=0.1,
                    seed=config['seed'],
                    save_logs=True)

        job_name = '%s_expert' % config['env_name']
        # Need to change where it dumps the policy
        train_agent(job_name=job_name,
                    agent=agent,
                    seed=config['seed'],
                    niter=30,
                    gamma=0.995,
                    gae_lambda=0.97,
                    num_cpu=1,
                    sample_mode='trajectories',
                    num_traj=200,
                    save_freq=5,
                    evaluation_rollouts=5)
    finally:
        # Always undo the chdir so a failed run does not leave the process
        # stranded inside GEN_DATA_DIR.
        os.chdir(previous_dir)

    os.rename(
        os.path.join(GEN_DATA_DIR, job_name, 'iterations/best_policy.pickle'),
        os.path.join(EXPERT_POLICIES_DIR,
                     EXPERT_POLICIES[config['env_name']]))
    print('-' * 80)
def single_process(job):
    """Run one NPG training job described by the ``job`` mapping.

    The job's results directory is ``job['save_dir']/job['job_name']``; the
    job description (augmented with horizon and timestep information) is
    written there as ``job_data.txt`` before training starts.

    Args:
        job: mutable mapping of job settings. Keys read here: ``save_dir``,
            ``job_name``, ``env_name``, ``init_std``, ``seed``,
            ``normalized_step_size``, ``save_logs``, ``FIM_invert_args``,
            ``niter``, ``gamma``, ``gae_lambda``, ``num_cpu``,
            ``sample_mode``, ``num_traj``, ``evaluation_rollouts``,
            ``save_freq`` and optionally ``init_policy`` (path of a pickled
            policy whose parameters seed the new one). ``horizon``,
            ``ctrl_timestep`` and ``sim_timestep`` are added, and
            ``init_policy`` is removed after use.

    Returns:
        Wall-clock duration of the job in seconds.
    """
    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    os.chdir(cwd)
    dirpath = os.path.join(job['save_dir'], job['job_name'])
    os.makedirs(dirpath, exist_ok=True)

    # start job
    job_start_time = timer.time()
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    e = GymEnv(job['env_name'])

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip
    with open(dirpath + '/job_data.txt', 'w') as job_data_file:
        pprint.pprint(job, stream=job_data_file)

    # Make policy (???vik: sizes are hard coded)
    policy = MLP(e.spec,
                 init_log_std=job['init_std'],
                 hidden_sizes=(32, 32),
                 seed=job['seed'])
    if 'init_policy' in job:
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(job['init_policy'], 'rb') as policy_file:
            loaded_policy = pickle.load(policy_file)
        loaded_params = loaded_policy.get_param_values()
        print('log std values in loaded policy = ')
        # Previously printed the undefined name `params` (NameError).
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        # loaded_params[-policy.m:] += 1.0
        policy.set_param_values(loaded_params)
        del job['init_policy']

    # Agent
    agent = NPG(e,
                policy,
                baseline,
                seed=job['seed'],
                normalized_step_size=job['normalized_step_size'],
                save_logs=job['save_logs'],
                FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=dirpath,
        agent=agent,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job['num_traj'],
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
from mjrl.policies.gaussian_mlp import MLP from mjrl.baselines.mlp_baseline import MLPBaseline from mjrl.algos.batch_reinforce import BatchREINFORCE from mjrl.algos.ppo_clip import PPO from mjrl.algos.ppo_clip import PPO from mjrl.utils.train_agent import train_agent import os import gym import argparse import time as timer SEED = 500 # e = GymEnv("half-cheetah-joint-v0") policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED) baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3) agent = PPO(e, policy, baseline, save_logs=True) print("========================================") print("Starting policy learning") print("========================================") ts = timer.time() train_agent(job_name='beta_test', agent=agent, seed=SEED,
assert any([job_data['algorithm'] == a for a in ['NPG', 'BCRL', 'DAPG']]) job_data['lam_0'] = 0.0 if 'lam_0' not in job_data.keys( ) else job_data['lam_0'] job_data['lam_1'] = 0.0 if 'lam_1' not in job_data.keys( ) else job_data['lam_1'] EXP_FILE = JOB_DIR + '/job_config.json' with open(EXP_FILE, 'w') as f: json.dump(job_data, f, indent=4) # =============================================================================== # Train Loop # =============================================================================== e = GymEnv(job_data['env']) policy = MLP(e.spec, hidden_sizes=job_data['policy_size'], seed=job_data['seed']) baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=job_data['vf_batch_size'], epochs=job_data['vf_epochs'], learn_rate=job_data['vf_learn_rate']) # Get demonstration data if necessary and behavior clone if job_data['algorithm'] != 'NPG': print("========================================") print("Collecting expert demonstrations") print("========================================") demo_paths = pickle.load(open(job_data['demo_file'], 'rb')) bc_agent = BC(demo_paths,
def single_process(job):
    """Run one NPG training job described by the ``job`` mapping.

    The job description (augmented with horizon and timestep information) is
    written to ``job['output_dir']/job_data.txt`` before training starts.

    Args:
        job: mutable mapping of job settings. Keys read here: ``output_dir``,
            ``job_name``, ``env_name``, ``init_std``, ``seed``,
            ``hidden_sizes`` (only when no ``init_policy`` is given),
            ``normalized_step_size``, ``save_logs``, ``FIM_invert_args``,
            ``niter``, ``gamma``, ``gae_lambda``, ``num_cpu``,
            ``sample_mode``, ``evaluation_rollouts``, ``save_freq`` and the
            optional ``num_traj``, ``num_samples`` and ``init_policy``.
            ``horizon``, ``ctrl_timestep`` and ``sim_timestep`` are added,
            and ``init_policy`` is removed after use.

    Returns:
        Wall-clock duration of the job in seconds.
    """
    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    # Create a directory for the job results. makedirs(exist_ok=True) also
    # creates missing parents and does not race with sibling jobs.
    job_dir = job['output_dir']
    os.makedirs(job_dir, exist_ok=True)

    # start job
    job_start_time = timer.time()
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    env_name = job['env_name']
    # adept_envs.global_config.set_config(env_name, {
    #     'robot_params': job['robot'],
    #     **job.get('env_params', {}),
    # })
    e = GymEnv(env_name)

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip

    with open(os.path.join(job_dir, 'job_data.txt'), 'w') as job_data_file:
        pprint.pprint(job, stream=job_data_file)

    if 'init_policy' in job:
        # Hidden sizes are fixed at (32, 32) on this path; presumably they
        # must match the pickled policy -- confirm before changing.
        policy = MLP(e.spec,
                     init_log_std=job['init_std'],
                     hidden_sizes=(32, 32),
                     seed=job['seed'])
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(job['init_policy'], 'rb') as policy_file:
            loaded_policy = pickle.load(policy_file)
        loaded_params = loaded_policy.get_param_values()
        print("log std values in loaded policy = ")
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        loaded_params[-policy.m:] += job['init_std']
        policy.set_param_values(loaded_params)
        del job['init_policy']
    else:
        policy = MLP(
            e.spec,
            init_log_std=job['init_std'],
            hidden_sizes=job['hidden_sizes'],
            # hidden_sizes=(32, 32),
            seed=job['seed'])

    # Agent
    agent = NPG(
        e,
        policy,
        baseline,
        seed=job['seed'],
        normalized_step_size=job['normalized_step_size'],
        save_logs=job['save_logs'],
        FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=job['job_name'],
        agent=agent,
        # save_dir=job_dir,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job.get('num_traj'),
        num_samples=job.get('num_samples'),
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
assert 'rl_num_traj' in job_data.keys() job_data['rl_num_samples'] = 0 # will be ignored elif job_data['sample_mode'] == 'samples': assert 'rl_num_samples' in job_data.keys() job_data['rl_num_traj'] = 0 # will be ignored else: print("Unknown sampling mode. Choose either trajectories or samples") exit() # =============================================================================== # Train Loop # =============================================================================== e = GymEnv(job_data['env']) policy = MLP(e.spec, hidden_sizes=job_data['policy_size'], seed=job_data['seed'], init_log_std=job_data['init_log_std']) baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=job_data['vf_batch_size'], hidden_sizes=job_data['vf_hidden_size'], epochs=job_data['vf_epochs'], learn_rate=job_data['vf_learn_rate']) # Construct the algorithm if job_data['algorithm'] == 'NPG': # Other hyperparameters (like number of CG steps) can be specified in config for pass through # or default hyperparameters will be used agent = NPG(e, policy, baseline,
np.random.seed(SEED) torch.random.manual_seed(SEED) # TODO(Aravind): Map to hardware if device_path is specified e = GymEnv(ENV_NAME) e.set_seed(SEED) models = [ DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED + i, **job_data) for i in range(job_data['num_models']) ] policy = MLP(e.spec, seed=SEED, hidden_sizes=job_data['policy_size'], init_log_std=job_data['init_log_std'], min_log_std=-2.5) baseline = MLPBaseline( e.spec, reg_coef=1e-3, batch_size=256, epochs=2, learn_rate=1e-3, use_gpu=(True if job_data['device'] == 'cuda' else False)) # baseline = QuadraticBaseline(e.spec) agent = ModelAccelNPG( fitted_model=models, env=e, policy=policy, baseline=baseline,
def main():
    """Evaluate or visualize a policy in a Gym environment.

    Arguments come from ``evaluate_args.get_args()``. When
    ``args.num_samples > 0`` the policy is rolled out, summary statistics are
    printed and written to ``<policy>_stats.txt``, the paths are plotted and
    dumped to ``<policy>_paths.pickle`` (``<policy>`` is the policy file name
    minus its last seven characters, i.e. a ``.pickle`` suffix). Otherwise
    the policy is visualized on screen or into an offscreen buffer.

    ``args.policy`` selects the policy: an empty string creates a random
    ``MLP``, ``"saved"`` loads ``<this dir>/<env_name>/best_policy.pickle``
    and anything else is treated as the path of a pickled policy.
    """
    import importlib

    # See evaluate_args.py for the list of args.
    args = evaluate_args.get_args()

    # Compare strings by value: identity checks against a literal are
    # implementation-dependent and raise SyntaxWarning on Python >= 3.8.
    if args.include != "":
        # importlib avoids running an arbitrary CLI string through exec().
        importlib.import_module(args.include)

    if args.env_name == "":
        print(
            "Unknown env. Use 'python examine_policy --help' for instructions")
        return

    # load envs
    # adept_envs.global_config.set_config(
    #     args.env_name, {
    #         'robot_params': {
    #             'is_hardware': args.hardware,
    #             'legacy': args.legacy,
    #             'device_name': args.device,
    #             'overlay': args.overlay,
    #             'calibration_mode': args.calibration_mode,
    #         },
    #     })
    e = GymEnv(args.env_name)
    # e.env.env._seed(args.seed)

    # load policy
    policy = args.policy
    mode = args.mode
    if args.policy == "":
        pol = MLP(e.spec, init_log_std=0.0)
        mode = "exploration"
        # Only used below as the prefix of the output file names.
        policy = "random_policy.pickle"
    else:
        if args.policy == "saved":
            curr_dir = os.path.dirname(os.path.abspath(__file__))
            policy = curr_dir + "/" + args.env_name + "/best_policy.pickle"
        # To move a policy between machines:
        # do this on the remote machine ============
        # weights = pol.get_param_values()
        # pickle.dump(weights, open("weights.pickle", 'wb'))
        # on local machine ============
        # pol = MLP(e.spec, init_log_std=-3.50)
        # loaded_params = pickle.load(open("weights.pickle", 'rb'))
        # pol.set_param_values(loaded_params)
        # pickle.dump(pol, open(policy, 'wb')) # save the policy
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(policy, 'rb') as policy_file:
            pol = pickle.load(policy_file)

    # dump rollouts
    if args.num_samples > 0:
        # if (mode == "evaluation"):
        #     pol.log_std = pol.log_std - 10  # since there is no other way of
        #     specifying that we want mean policy sampling

        # parallel sampling
        # paths = trajectory_sampler.sample_paths_parallel(
        #     num_samples, pol, e.horizon, env_name, 0, 1)

        # Serial sampling
        paths = do_rollout(num_traj=args.num_samples,
                           env=e,
                           policy=pol,
                           eval_mode=True,
                           horizon=e.horizon,
                           base_seed=args.seed)

        # Policy stats
        eval_success = e.env.env.evaluate_success(paths)
        eval_rewards = np.mean(
            [np.sum(p['env_infos']['rwd_dict']['total'])
             for p in paths]) / e.horizon
        eval_score = np.mean([
            np.sum(p['env_infos']['score']) / len(p['env_infos']['score'])
            for p in paths
        ])
        # evaluate_success = np.mean([np.sum(p['env_infos']['rwd_dict']['total']) for p in paths])
        stats = "Policy stats:: <mean reward/step: %+.3f>, <mean score/step: %+.3f>, <mean success: %2.1f%%>\n" % (
            eval_rewards, eval_score, eval_success)
        for ipath, path in enumerate(paths):
            stats = stats + "path%d:: <reward[-1]: %+.3f>, <score[-1]: %+.3f>\n" % (
                ipath, path['env_infos']['rwd_dict']['total'][-1],
                path['env_infos']['score'][-1])
        print(stats)

        # save to a file; [:-7] strips the ".pickle" suffix
        prefix = policy[:-7]
        file_name = prefix + '_stats.txt'
        with open(file_name, 'w') as stats_file:
            print(stats, file=stats_file)
        print("saved ", file_name)

        # plot_horizon_distribution(paths, e, fileName_prefix=policy[:-7])
        plot_paths(paths, e, fileName_prefix=prefix)
        file_name = prefix + '_paths.pickle'
        with open(file_name, 'wb') as paths_file:
            pickle.dump(paths, paths_file)
        print("saved ", file_name)

    else:
        # Visualized policy
        if args.render == "onscreen":
            # On screen
            e.env.env.visualize_policy(pol,
                                       horizon=e.horizon,
                                       num_episodes=args.num_episodes,
                                       mode=mode)
        else:
            # Offscreen buffer (horizon is fixed at 100 steps here)
            e.env.env.visualize_policy_offscreen(
                pol,
                horizon=100,
                num_episodes=args.num_episodes,
                mode=mode,
                filename=args.filename)

    # Close envs
    e.env.env.close_env()
def launch_job(tag, variant):
    """Build and train one agent for a single hyper-parameter variant.

    Args:
        tag: experiment tag, forwarded to ``build_log_dir``.
        variant: 21-element sequence unpacked (in order) into seed, env,
            algo, optim, curv_type, lr, batch_size, the conjugate-gradient
            and preconditioner settings, shrinkage/Lanczos settings,
            approx_adaptive, betas (a pair), use_nn_policy, gn_vfn_opt and
            total_samples.

    Raises:
        ValueError: if ``optim`` is not one of ``'ngd'``, ``'natural_adam'``,
            ``'natural_adagrad'`` or ``'natural_amsgrad'``.
    """
    print(len(variant))
    seed, env, algo, optim, curv_type, lr, batch_size, cg_iters, cg_residual_tol, cg_prev_init_coef, \
        cg_precondition_empirical, cg_precondition_regu_coef, cg_precondition_exp, \
        shrinkage_method, lanczos_amortization, lanczos_iters, approx_adaptive, betas, use_nn_policy, gn_vfn_opt, total_samples = variant
    beta1, beta2 = betas

    iters = int(total_samples / batch_size)

    # NN policy
    # ==================================
    e = GymEnv(env)
    if use_nn_policy:
        policy = MLP(e.spec, hidden_sizes=(64, ), seed=seed)
    else:
        policy = LinearPolicy(e.spec, seed=seed)
    # Larger value-function batches when Gauss-Newton fitting is enabled.
    vfn_batch_size = 256 if gn_vfn_opt else 64
    # baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
    baseline = MLPBaseline(e.spec,
                           reg_coef=1e-3,
                           batch_size=vfn_batch_size,
                           epochs=2,
                           learn_rate=1e-3,
                           use_gauss_newton=gn_vfn_opt)
    # agent = NPG(e, policy, baseline, normalized_step_size=0.005, seed=SEED, save_logs=True)

    common_kwargs = dict(lr=lr,
                         curv_type=curv_type,
                         cg_iters=cg_iters,
                         cg_residual_tol=cg_residual_tol,
                         cg_prev_init_coef=cg_prev_init_coef,
                         cg_precondition_empirical=cg_precondition_empirical,
                         cg_precondition_regu_coef=cg_precondition_regu_coef,
                         cg_precondition_exp=cg_precondition_exp,
                         shrinkage_method=shrinkage_method,
                         lanczos_amortization=lanczos_amortization,
                         lanczos_iters=lanczos_iters,
                         batch_size=batch_size)

    if optim == 'ngd':
        optimizer = fisher_optim.NGD(policy.trainable_params, **common_kwargs)
    elif optim == 'natural_adam':
        optimizer = fisher_optim.NaturalAdam(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    elif optim == 'natural_adagrad':
        optimizer = fisher_optim.NaturalAdagrad(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    elif optim == 'natural_amsgrad':
        optimizer = fisher_optim.NaturalAmsgrad(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    else:
        # Fail with a clear message instead of an unbound `optimizer` below.
        raise ValueError("Unknown optimizer: %r" % (optim,))

    if algo == 'trpo':
        from mjrl.algos.trpo_delta import TRPO
        agent = TRPO(e, policy, baseline, optimizer, seed=seed, save_logs=True)
        # agent = TRPO(e, policy, baseline, seed=seed, save_logs=True)
    else:
        from mjrl.algos.npg_cg_delta import NPG
        agent = NPG(e, policy, baseline, optimizer, seed=seed, save_logs=True)

    save_dir = build_log_dir(tag, variant)
    # Tolerate an existing directory without hiding unrelated errors the way
    # the previous bare `except: pass` did.
    os.makedirs(save_dir, exist_ok=True)

    # print ("Iters:", iters, ", num_traj: ", str(batch_size//1000))
    train_agent(job_name=save_dir,
                agent=agent,
                seed=seed,
                niter=iters,
                gamma=0.995,
                gae_lambda=0.97,
                num_cpu=1,
                sample_mode='samples',
                num_samples=batch_size,
                save_freq=5,
                evaluation_rollouts=5,
                verbose=False)  # True)
e = {} baseline_stl = {} policy_stl = {} agent_stl = {} task_order = np.random.permutation(num_tasks) for task_id in range(num_tasks): e[task_id] = e_unshuffled[task_order[task_id]] baseline_stl[task_id] = MLPBaseline(e[task_id].spec, reg_coef=1e-3, batch_size=64, epochs=10, learn_rate=1e-3, use_gpu=True) policy_stl[task_id] = MLP(e[task_id].spec, hidden_sizes=(32, 32), seed=SEED) agent_stl[task_id] = NPG(e[task_id], policy_stl[task_id], baseline_stl[task_id], normalized_step_size=0.01, seed=SEED, save_logs=True) loggers_stl = {} grads_stl = {} hess_stl = {} for task_id in range(num_tasks): ts = timer.time() train_agent(job_name=job_name_stl_seed,
def train(cfg, run_no, multiple_runs, seed):
    """Run optional behavior cloning followed by RL / IRL training.

    The flow is: build the environment, policy and baseline; load the
    demonstration files named in ``cfg``; optionally pre-train the policy
    with behavior cloning; optionally build (and, for ``'DAPG_based_IRL'``,
    restore) an IRL model; construct the agent selected by
    ``cfg['algorithm']``; dump ``cfg`` to ``Runs/<job_name>/config.yaml``;
    and finally call ``train_agent``.

    Args:
        cfg: nested configuration mapping. ``cfg['algorithm']`` must be one
            of ``'IRL'``, ``'DAPG_based_IRL'``, ``'DAPG'`` or ``'PPO'``.
        run_no: run index, forwarded to ``train_agent`` only when
            ``multiple_runs`` is truthy.
        multiple_runs: whether this is one of several runs of the same job.
        seed: random seed for the policy, the IRL model and the agent.

    Raises:
        ValueError: for an unsupported ``cfg['algorithm']`` or
            ``cfg['IRL']['generator_alg']``.
    """
    # ===============================================================================
    # Train Loop
    # ===============================================================================
    # Return value is unused here; the call is kept for its side effects.
    setup_gpus()
    env_name, job_name = parse_task(cfg)
    env = GymEnv(env_name, **cfg['env_kwargs'])
    policy = MLP(env.spec, hidden_sizes=tuple(cfg['policy_size']), seed=seed)
    baseline = MLPBaseline(env.spec,
                           reg_coef=1e-3,
                           batch_size=cfg['value_function']['batch_size'],
                           epochs=cfg['value_function']['epochs'],
                           learn_rate=cfg['value_function']['lr'],
                           use_gpu=False)

    # Get demonstration data if necessary and behavior clone
    print("========================================")
    print("Collecting expert demonstrations")
    print("========================================")
    demo_filename = cfg['demo_file']
    if demo_filename is not None:
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(demo_filename, 'rb') as demo_file:
            demo_paths = pickle.load(demo_file)
    else:
        demo_paths = None

    if 'demo_file' in cfg['BC'] and cfg['BC']['demo_file'] != 'default':
        bc_demo_file_path = cfg['BC']['demo_file']
        if cfg['train']['use_timestamp']:
            bc_demo_file_path = bc_demo_file_path.replace(
                'v0', 'v0_timestamp_inserted')
        with open(bc_demo_file_path, 'rb') as bc_demo_file:
            bc_demo_paths = pickle.load(bc_demo_file)
    else:
        bc_demo_paths = demo_paths

    # Truncate only when demonstrations were actually loaded; slicing None
    # would raise TypeError.
    if demo_paths is not None and cfg.get('num_demo'):
        demo_paths = demo_paths[:cfg['num_demo']]

    if (cfg['algorithm'] == 'DAPG_based_IRL'
            and cfg['based_IRL'].get('get_paths_for_initialisation')):
        bc_demo_paths = add_dumped_paths_for_BC(demo_paths, cfg)

    ts = timer.time()
    if bc_demo_paths is not None and cfg['BC']['epochs'] > 0:
        print("========================================")
        print("Running BC with expert demonstrations")
        print("========================================")
        # Only the first 25 demonstration paths are used for cloning.
        bc_agent = BC(bc_demo_paths[:25],
                      policy=policy,
                      epochs=cfg['BC']['epochs'],
                      batch_size=cfg['BC']['batch_size'],
                      lr=cfg['BC']['lr'],
                      loss_type='MSE',
                      set_transforms=True)
        bc_agent.train()
        print("========================================")
        print("BC training complete !!!")
        print("time taken = %f" % (timer.time() - ts))
        print("========================================")

    is_dapg_based_irl = cfg['algorithm'] == 'DAPG_based_IRL'
    uses_irl = cfg['algorithm'] == 'IRL' or is_dapg_based_irl

    if uses_irl:
        IRL_cfg = cfg
        if is_dapg_based_irl:
            # The IRL model settings come from the job it was trained in.
            IRL_job_cfg_path = os.path.join("Runs",
                                            cfg['based_IRL']['IRL_job'],
                                            "config.yaml")
            IRL_cfg = yamlreader.yaml_load(IRL_job_cfg_path)

        irl_model = get_irl_model(env, demo_paths, IRL_cfg, seed)
        if is_dapg_based_irl:
            full_irl_model_checkpoint_path = os.path.join(
                'Runs', cfg['based_IRL']['IRL_job'])
            if cfg['based_IRL']['IRL_run_no'] is not None:
                full_irl_model_checkpoint_path = os.path.join(
                    full_irl_model_checkpoint_path,
                    'run_' + str(cfg['based_IRL']['IRL_run_no']))
            if cfg['based_IRL']['IRL_step'] is not None:
                irl_model.load_iteration(
                    path=full_irl_model_checkpoint_path,
                    iteration=cfg['based_IRL']['IRL_step'])
            else:
                irl_model.load_last(path=full_irl_model_checkpoint_path)
            # required to load model completely from the given path before
            # changing to a different path during training
            irl_model.eval(demo_paths)

    if cfg['eval_rollouts'] > 0:
        score = env.evaluate_policy(policy,
                                    num_episodes=cfg['eval_rollouts'],
                                    mean_action=True)
        print("Score with behavior cloning = %f" % score[0][0])

    if not cfg['use_DAPG']:
        # We throw away the demo data when training from scratch or fine-tuning with RL without explicit augmentation
        demo_paths = None

    # ===============================================================================
    # RL Loop
    # ===============================================================================

    irl_kwargs = None
    if uses_irl:
        if is_dapg_based_irl or cfg['IRL']['generator_alg'] == 'DAPG':
            generator_algorithm = DAPG
            generator_args = dict(
                demo_paths=demo_paths,
                normalized_step_size=cfg['RL']['step_size'],
                seed=seed,
                lam_0=cfg['RL']['lam_0'],
                lam_1=cfg['RL']['lam_1'],
                save_logs=cfg['save_logs'],
                augmentation=cfg['train']['augmentation'],
                entropy_weight=cfg['train']['entropy_weight'])
        elif cfg['IRL']['generator_alg'] == 'PPO':
            generator_algorithm = PPO
            generator_args = dict(
                demo_paths=demo_paths,
                epochs=cfg['PPO']['epochs'],
                mb_size=cfg['PPO']['batch_size'],
                target_kl_dist=cfg['PPO']['target_kl_dist'],
                seed=seed,
                lam_0=cfg['RL']['lam_0'],
                lam_1=cfg['RL']['lam_1'],
                save_logs=cfg['save_logs'],
                clip_coef=cfg['PPO']['clip_coef'],
                learn_rate=cfg['PPO']['lr'],
                augmentation=cfg['train']['augmentation'],
                entropy_weight=cfg['train']['entropy_weight'])
        else:
            raise ValueError("Generator algorithm name",
                             cfg['IRL']['generator_alg'], "not supported")
        irl_class = irl_training_class(generator_algorithm)
        rl_agent = irl_class(
            env,
            policy,
            baseline,
            # A restored IRL model is not trained further.
            train_irl=not is_dapg_based_irl,
            discr_lr=IRL_cfg['IRL']['discr']['lr'],
            irl_batch_size=IRL_cfg['IRL']['discr']['batch_size'],
            lower_lr_on_main_loop_percentage=IRL_cfg['IRL']['discr']
            ['lower_lr_on_main_loop_percentage'],
            irl_model=irl_model,
            **generator_args)
        irl_kwargs = dict(policy=dict(
            min_updates=1,
            max_updates=IRL_cfg['IRL']['max_gen_updates']
            if not is_dapg_based_irl else 0,
            steps_till_max=IRL_cfg['IRL']['steps_till_max_gen_updates']))
    elif cfg['algorithm'] == 'DAPG':
        rl_agent = DAPG(env,
                        policy,
                        baseline,
                        demo_paths=demo_paths,
                        normalized_step_size=cfg['RL']['step_size'],
                        lam_0=cfg['RL']['lam_0'],
                        lam_1=cfg['RL']['lam_1'],
                        seed=seed,
                        save_logs=cfg['save_logs'],
                        augmentation=cfg['train']['augmentation'],
                        entropy_weight=cfg['train']['entropy_weight'])
    elif cfg['algorithm'] == 'PPO':
        rl_agent = PPO(env,
                       policy,
                       baseline,
                       demo_paths=demo_paths,
                       epochs=cfg['PPO']['epochs'],
                       mb_size=cfg['PPO']['batch_size'],
                       target_kl_dist=cfg['PPO']['target_kl_dist'],
                       seed=seed,
                       lam_0=cfg['RL']['lam_0'],
                       lam_1=cfg['RL']['lam_1'],
                       save_logs=cfg['save_logs'],
                       clip_coef=cfg['PPO']['clip_coef'],
                       learn_rate=cfg['PPO']['lr'],
                       augmentation=cfg['train']['augmentation'],
                       entropy_weight=cfg['train']['entropy_weight'])
    else:
        raise ValueError("Algorithm name", cfg['algorithm'], "not supported")

    # get IRL model kwargs if doing DAPG based on IRL
    env_kwargs = cfg['env_kwargs']
    if is_dapg_based_irl:
        rl_agent.irl_model = irl_model

    # dump YAML config file
    job_path = os.path.join("Runs", job_name)
    # exist_ok avoids the check-then-create race between concurrent runs.
    os.makedirs(job_path, exist_ok=True)
    with open(os.path.join(job_path, 'config.yaml'), 'w') as f:
        dump(cfg, f)

    print("========================================")
    print("Starting reinforcement learning phase")
    print("========================================")

    is_irl = cfg['algorithm'] == 'IRL'
    ts = timer.time()
    train_agent(
        job_name=job_name,
        agent=rl_agent,
        seed=seed,
        niter=cfg['train']['steps'],
        gamma=cfg['train']['gamma'],
        gae_lambda=cfg['train']['gae_lambda'],
        num_cpu=cfg['num_cpu'],
        sample_mode='trajectories',
        num_traj=cfg['train']['num_traj'],
        save_freq=cfg['train']['save_freq'],
        evaluation_rollouts=cfg['eval_rollouts'],
        should_fresh_start=bool(cfg['IRL']['initialization_job'])
        if is_irl else False,
        irl_kwargs=irl_kwargs,
        temperature_max=cfg['IRL']['temperature_max'] if is_irl else 0,
        temperature_min=cfg['IRL']['temperature_min'] if is_irl else 0,
        plot_keys=cfg['plot_keys'],
        run_no=run_no if multiple_runs else None,
        env_kwargs=env_kwargs,
        fixed_evaluation_init_states=cfg['fixed_evaluation_init_states'])
    print("time taken = %f" % (timer.time() - ts))
This script illustrates training from scratch using NPG on the relocate-v0 task. """ from mjrl.utils.gym_env import GymEnv from mjrl.policies.gaussian_mlp import MLP from mjrl.baselines.mlp_baseline import MLPBaseline from mjrl.algos.npg_cg import NPG from mjrl.utils.train_agent import train_agent import mj_envs import time as timer SEED = 100 e = GymEnv('relocate-v0') policy = MLP(e.spec, hidden_sizes=(64, 64), seed=SEED, init_log_std=-0.5) baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3) agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) print("========================================") print("Training with RL") ts = timer.time() train_agent(job_name='relocate_scratch', agent=agent, seed=SEED, niter=100, gamma=0.995, gae_lambda=0.97, num_cpu=5, sample_mode='trajectories', num_traj=200,
def experiment(variant):
    """
    Job script for running NPG/DAPG on hand tasks and other gym envs.

    DAPG generalizes PG and BC init + PG finetuning, so with appropriate
    parameter settings the full family can be recovered. The effective job
    settings are ``default_job_data`` overlaid with ``variant`` and then with
    the ``ENV_PARAMS`` entry selected by ``variant['env_class']``.
    """
    import mj_envs

    # Later updates win: defaults < variant < per-environment parameters.
    job_data = default_job_data.copy()
    job_data.update(variant)
    job_data.update(ENV_PARAMS[variant['env_class']])

    assert 'algorithm' in job_data
    algorithm = job_data['algorithm']
    assert any(algorithm == name for name in ('NPG', 'BCRL', 'DAPG'))

    snapshot_dir = logger.get_snapshot_dir()

    # ===============================================================================
    # Train Loop
    # ===============================================================================

    seed = int(job_data['seedid'])

    env = GymEnv(job_data['env_id'])
    policy = MLP(env.spec, hidden_sizes=job_data['policy_size'], seed=seed)
    baseline = MLPBaseline(
        env.spec,
        reg_coef=1e-3,
        batch_size=job_data['vf_batch_size'],
        epochs=job_data['vf_epochs'],
        learn_rate=job_data['vf_learn_rate'],
    )

    # Get demonstration data if necessary and behavior clone
    if algorithm != 'NPG':
        banner = "========================================"
        print(banner)
        print("Collecting expert demonstrations")
        print(banner)
        demo_paths = load_local_or_remote_file(job_data['demo_file'], 'rb')

        cloner = BC(
            demo_paths,
            policy=policy,
            epochs=job_data['bc_epochs'],
            batch_size=job_data['bc_batch_size'],
            lr=job_data['bc_learn_rate'],
            loss_type='MSE',
            set_transforms=False,
        )
        # Transforms are computed and installed explicitly because the agent
        # was built with set_transforms=False.
        transforms = cloner.compute_transformations()
        in_shift, in_scale, out_shift, out_scale = transforms
        cloner.set_transformations(in_shift, in_scale, out_shift, out_scale)
        cloner.set_variance_with_data(out_scale)

        bc_start = timer.time()
        print(banner)
        print("Running BC with expert demonstrations")
        print(banner)
        cloner.train()
        print(banner)
        print("BC training complete !!!")
        print("time taken = %f" % (timer.time() - bc_start))
        print(banner)

        if job_data['eval_rollouts'] >= 1:
            score = env.evaluate_policy(
                policy,
                num_episodes=job_data['eval_rollouts'],
                mean_action=True,
            )
            print("Score with behavior cloning = %f" % score[0][0])

    if algorithm != 'DAPG':
        # We throw away the demo data when training from scratch or
        # fine-tuning with RL without explicit augmentation
        demo_paths = None

    # ===============================================================================
    # RL Loop
    # ===============================================================================

    rl_agent = DAPG(
        env,
        policy,
        baseline,
        demo_paths,
        normalized_step_size=job_data['rl_step_size'],
        lam_0=job_data['lam_0'],
        lam_1=job_data['lam_1'],
        seed=seed,
        save_logs=True,
    )

    print("========================================")
    print("Starting reinforcement learning phase")
    print("========================================")

    rl_start = timer.time()
    train_agent(
        job_name=snapshot_dir,
        agent=rl_agent,
        seed=seed,
        niter=job_data['rl_num_iter'],
        gamma=job_data['rl_gamma'],
        gae_lambda=job_data['rl_gae'],
        num_cpu=job_data['num_cpu'],
        sample_mode='trajectories',
        num_traj=job_data['rl_num_traj'],
        save_freq=job_data['save_freq'],
        evaluation_rollouts=job_data['eval_rollouts'],
    )
    print("time taken = %f" % (timer.time() - rl_start))