import os
import copy
import random
import pickle
from datetime import datetime

import numpy as np
from gym import wrappers

# Project-local helpers (init_gym, set_global_seeds, Policy, NNValueFunction,
# Scaler, Dataset, logger, run_policy, add_value, add_disc_sum_rew, add_gae,
# build_train_set) are assumed importable from this repository's own modules;
# the exact import paths are not shown in this file.


def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
                phi_obj, load_model):
    """Evaluate Monte Carlo vs. Stein control-variate gradients.

    Trajectories are split in half: one half trains phi (and optionally
    refits the value baseline); the held-out half is used to compute and
    dump both gradient estimates.
    """
    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)

    # scaler = Scaler(obs_dim)
    logger.log("loading scaler")
    with open('models/scaler/scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ, epochs, phi_epochs,
                    policy_size=policy_size, phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale, lr_phi=phi_lr, phi_obj=phi_obj)
    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)

    # Whether to load the value function baseline or train it from scratch;
    # no big impact on the Stein estimator either way.
    load_v = False
    if load_v:
        val_func.load_val_model(load_dir)

    episode = 0
    trajectories, traj_len_list = run_policy(env, policy, scaler, num_episodes,
                                             max_timesteps=max_timesteps,
                                             mode=load_model)
    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d" % (
        np.mean(traj_len_list),
        np.sum(traj_len_list)))
    episode += len(trajectories)

    # Split data into training and validation halves.
    random.shuffle(trajectories)
    t_trajectories = trajectories[:int(len(trajectories) / 2)]
    v_trajectories = trajectories[int(len(trajectories) / 2):]

    # Whether to refit the value function baseline before evaluating;
    # no big impact on the Stein estimator either way.
    refit_v = True
    if refit_v:
        tt_trajectories = copy.deepcopy(t_trajectories)
        add_value(tt_trajectories, val_func)
        add_disc_sum_rew(tt_trajectories, gamma)
        add_gae(tt_trajectories, gamma, lam)
        tt_observes, tt_actions, tt_advantages, tt_disc_sum_rew = \
            build_train_set(tt_trajectories)
        logger.log("refit value function baseline")
        val_func.fit(tt_observes, tt_disc_sum_rew)  # update value function
        logger.log("done")

    # Build training data after refitting the value baseline.
    add_value(t_trajectories, val_func)
    add_disc_sum_rew(t_trajectories, gamma)
    add_gae(t_trajectories, gamma, lam)
    t_observes, t_actions, t_advantages, t_disc_sum_rew = \
        build_train_set(t_trajectories)

    # Build validation data after refitting the value baseline.
    add_value(v_trajectories, val_func)
    add_disc_sum_rew(v_trajectories, gamma)
    add_gae(v_trajectories, gamma, lam)
    v_observes, v_actions, v_advantages, v_disc_sum_rew = \
        build_train_set(v_trajectories)

    sub_folder = "max_timesteps=%s_eval_data/%s_%s_data_seed=%d_max-steps=%d" % (
        max_timesteps, env_name, phi_obj, seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.makedirs(sub_folder)  # makedirs, not mkdir: sub_folder is a nested path

    # Save the original (Monte Carlo) gradient, computed on the held-out half.
    mc_grad_info = policy.get_batch_gradient(v_observes, v_actions,
                                             v_advantages, c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % num_episodes, 'wb') as fp:
        pickle.dump(mc_grad_info, fp)

    # Train phi on minibatches of the training half.
    d = Dataset(dict(ob=t_observes, ac=t_actions, atarg=t_advantages,
                     vtarg=t_disc_sum_rew), shuffle=True)
    for _ in range(phi_epochs):  # optim_epochs
        for batch in d.iterate_once(128):  # optim_batchsize
            policy.update(load_model, batch['ob'], batch['ac'],
                          batch['atarg'], use_lr_adjust, ada_kl_penalty,
                          c=1)  # update policy

    # Save the Stein control-variate gradient on the same held-out data.
    stein_grad_info = policy.get_batch_gradient(v_observes, v_actions,
                                                v_advantages, c=1.)
    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % num_episodes, 'wb') as fp:
        pickle.dump(stein_grad_info, fp)
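
# Usage sketch (assumption, not part of the repository): a minimal driver for
# the routine above. Every argument value here is illustrative; in particular
# phi_obj='MinVar' and load_model='stein' are guesses at plausible modes, and
# the hyperparameters are typical PPO-style settings, not values fixed by this
# code.
if __name__ == '__main__':
    eval_models(env_name='HalfCheetah-v1', num_episodes=100,
                gamma=0.995, lam=0.98, kl_targ=0.003, coef=1.0,
                use_lr_adjust=False, ada_kl_penalty=True, seed=0,
                epochs=20, phi_epochs=500, max_timesteps=50,
                reg_scale=0.0, phi_lr=5e-4, phi_hs='100x100',
                policy_size='large', phi_obj='MinVar', load_model='stein')
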
def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
                phi_obj, load_model):
    """Simpler evaluation variant: no train/validation split.

    phi is updated on the full batch, then both the Monte Carlo and the
    Stein gradients are computed and dumped for the same data.
    """
    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ, epochs, phi_epochs,
                    policy_size=policy_size, phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale, lr_phi=phi_lr, phi_obj=phi_obj)
    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    val_func.load_val_model(load_dir)

    # Warm-up rollout: run_policy updates the freshly created scaler's
    # running statistics; the trajectories themselves are discarded.
    run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps)

    episode = 0
    trajectories, traj_len_list = run_policy(env, policy, scaler, num_episodes,
                                             max_timesteps=max_timesteps)
    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d" % (
        np.mean(traj_len_list),
        np.sum(traj_len_list)))
    episode += len(trajectories)

    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma)
    add_gae(trajectories, gamma, lam)
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

    sub_folder = "eval_data/%s_%s_data_seed=%d_max-steps=%d" % (
        env_name, phi_obj, seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.makedirs(sub_folder)  # makedirs, not mkdir: sub_folder is a nested path

    # Save the original (Monte Carlo) gradient.
    mc_grad_info = policy.get_batch_gradient(observes, actions, advantages, c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % num_episodes, 'wb') as fp:
        pickle.dump(mc_grad_info, fp)

    policy.update(load_model, observes, actions, advantages,
                  use_lr_adjust, ada_kl_penalty, c=1)  # update policy

    # Save the Stein control-variate gradient on the same data.
    stein_grad_info = policy.get_batch_gradient(observes, actions,
                                                advantages, c=1.)
    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % num_episodes, 'wb') as fp:
        pickle.dump(stein_grad_info, fp)
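
# The two pickles written above are what downstream analysis consumes. Below
# is a minimal inspection sketch, assuming only what the code above
# guarantees: each file holds the dict returned by policy.get_batch_gradient
# plus a 'traj_lens' key; the dict's remaining keys are whatever
# get_batch_gradient stores, so inspect them before relying on any.
def inspect_saved_gradients(sub_folder, num_episodes):
    """Load and summarize the MC and Stein gradient dumps (hypothetical helper)."""
    results = {}
    for name in ('mc', 'stein'):
        with open('%s/%s_num_episode=%d.pkl' % (sub_folder, name, num_episodes),
                  'rb') as fp:
            info = pickle.load(fp)
        results[name] = info
        logger.log('%s: total steps %d, keys %s' % (
            name, int(np.sum(info['traj_lens'])), sorted(info.keys())))
    return results

# Example (path is illustrative and must match the sub_folder pattern above):
# inspect_saved_gradients('eval_data/HalfCheetah-v1_MinVar_data_seed=0_max-steps=50', 100)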