import copy
import os
import pickle
import random
from datetime import datetime

import numpy as np
from gym import wrappers

# NOTE: the project-specific helpers used below (init_gym, set_global_seeds,
# Scaler, NNValueFunction, Policy, run_policy, add_value, add_disc_sum_rew,
# add_gae, build_train_set, Dataset, logger) are assumed to come from the
# surrounding repository's own modules.


def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj,
                load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
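    # add 1 to the observation dimension for the time-step feature that the
    # rollout code appends to each observation (assumption based on the
    # Scaler/NNValueFunction setup this snippet follows)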
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    # Load the scaler saved at training time so observations are normalized
    # with the same running statistics (rather than re-initializing Scaler).
    logger.log("loading scaler")
    with open('models/scaler/scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    load_v = False  # whether to load the value function baseline or train it from scratch; no big impact on Stein
    if load_v:
        val_func.load_val_model(load_dir)

    episode = 0

    trajectories, traj_len_list = run_policy(env,
                                             policy,
                                             scaler,
                                             num_episodes,
                                             max_timesteps=max_timesteps,
                                             mode=load_model)

    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d"%( \
            np.mean(traj_len_list), \
            np.sum(traj_len_list)))

    episode += len(trajectories)

    # Split trajectories into a training half (t_) and a validation half (v_)
    random.shuffle(trajectories)
    t_trajectories = trajectories[:int(len(trajectories) / 2)]
    v_trajectories = trajectories[int(len(trajectories) / 2):]

    refit_v = True  # whether to refit the value function baseline before evaluating; no big impact on Stein
    if refit_v:
        tt_trajectories = copy.deepcopy(t_trajectories)
        add_value(tt_trajectories, val_func)
        add_disc_sum_rew(tt_trajectories, gamma)
        add_gae(tt_trajectories, gamma, lam)
        tt_observes, tt_actions, tt_advantages, tt_disc_sum_rew = build_train_set(
            tt_trajectories)
        logger.log("refit value function baseline")
        val_func.fit(tt_observes, tt_disc_sum_rew)  # update value function
        logger.log("done")

    # build training data after refit v
    add_value(t_trajectories, val_func)
    add_disc_sum_rew(t_trajectories, gamma)
    add_gae(t_trajectories, gamma, lam)
    t_observes, t_actions, t_advantages, t_disc_sum_rew = build_train_set(
        t_trajectories)

    # build validation data after refit v
    add_value(v_trajectories, val_func)
    add_disc_sum_rew(v_trajectories, gamma)
    add_gae(v_trajectories, gamma, lam)
    v_observes, v_actions, v_advantages, v_disc_sum_rew = build_train_set(
        v_trajectories)

    sub_folder = "max_timesteps=%s_eval_data/%s_%s_data_seed=%d_max-steps=%d"%(\
                        max_timesteps, env_name, phi_obj,
                        seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.mkdir(sub_folder)

    # Save the vanilla Monte-Carlo policy-gradient estimate on the validation
    # split (c=0. turns off the Stein control variate).
    mc_grad_info = policy.get_batch_gradient(v_observes,
                                             v_actions,
                                             v_advantages,
                                             c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(mc_grad_info, fp)

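    # Fit the Stein control variate (phi) on the training split via mini-batch
    # updates (phi_epochs passes, batch size 128) before re-estimating the
    # gradient on the validation split.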
    d = Dataset(dict(ob=t_observes,
                     ac=t_actions,
                     atarg=t_advantages,
                     vtarg=t_disc_sum_rew),
                shuffle=True)
    for _ in range(phi_epochs):  # optim_epochs
        for batch in d.iterate_once(128):  # optim_batchsize
            policy.update(load_model,
                          batch['ob'],
                          batch['ac'],
                          batch['atarg'],
                          use_lr_adjust,
                          ada_kl_penalty,
                          c=1)  # update policy

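    # Gradient estimate on the validation split with the fitted Stein control
    # variate enabled (c=1.).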
    stein_grad_info = policy.get_batch_gradient(v_observes,
                                                v_actions,
                                                v_advantages,
                                                c=1.)

    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(stein_grad_info, fp)
Example 2
def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj,
                load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    val_func.load_val_model(load_dir)

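    # Warm-up rollouts: the returned trajectories are discarded, but this pass
    # lets run_policy update the freshly created scaler's running statistics
    # before the evaluation rollouts below.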
    run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps)

    episode = 0

    trajectories, traj_len_list = run_policy(env,
                                             policy,
                                             scaler,
                                             num_episodes,
                                             max_timesteps=max_timesteps)

    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d"%( \
            np.mean(traj_len_list), \
            np.sum(traj_len_list)))

    episode += len(trajectories)
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma)
    add_gae(trajectories, gamma, lam)

    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

    sub_folder = "eval_data/%s_%s_data_seed=%d_max-steps=%d"%(\
                        env_name, phi_obj,
                        seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.mkdir(sub_folder)

    # Save the vanilla Monte-Carlo policy-gradient estimate
    # (c=0. turns off the Stein control variate).
    mc_grad_info = policy.get_batch_gradient(observes,
                                             actions,
                                             advantages,
                                             c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(mc_grad_info, fp)

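    # Fit the Stein control variate (phi) with a single full-batch update
    # before re-estimating the gradient below.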
    policy.update(load_model,
                  observes,
                  actions,
                  advantages,
                  use_lr_adjust,
                  ada_kl_penalty,
                  c=1)  # update policy

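    # Gradient estimate with the fitted Stein control variate enabled (c=1.).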
    stein_grad_info = policy.get_batch_gradient(observes,
                                                actions,
                                                advantages,
                                                c=1.)

    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(stein_grad_info, fp)