def load_metabo_policy(logpath, load_iter, env, device, deterministic):
    with open(os.path.join(logpath, "params_" + str(load_iter)), "rb") as f:
        train_params = pkl.load(f)
    pi = NeuralAF(observation_space=env.observation_space,
                  action_space=env.action_space,
                  deterministic=deterministic,
                  options=train_params["policy_options"]).to(device)
    with open(os.path.join(logpath, "weights_" + str(load_iter)), "rb") as f:
        pi.load_state_dict(torch.load(f))
    with open(os.path.join(logpath, "stats_" + str(load_iter)), "rb") as f:
        stats = pkl.load(f)

    return pi, train_params, stats
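# Usage sketch (not part of the original scripts): how load_metabo_policy is typically
# called before evaluation. The environment id, log directory, and iteration number
# below are hypothetical placeholders, assuming a MetaBO environment has already been
# registered with gym.
def _example_load_policy():
    import gym

    eval_env = gym.make("MetaBO-Branin-v0")  # hypothetical environment id
    pi, train_params, stats = load_metabo_policy(logpath="log/MetaBO-Branin-v0/2020-01-01-00-00-00",
                                                 load_iter=1000,
                                                 env=eval_env,
                                                 device="cpu",
                                                 deterministic=True)
    eval_env.close()
    return pi, train_params, stats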
# register environment
register(id=env_spec["env_id"],
         entry_point="metabo.environment.metabo_gym:MetaBO",
         max_episode_steps=env_spec["T"],
         reward_threshold=None,
         kwargs=env_spec)

# log data and weights go here, use this folder for evaluation afterwards
logpath = os.path.join(rootdir, "log", env_spec["env_id"],
                       datetime.strftime(datetime.now(), "%Y-%m-%d-%H-%M-%S"))

# set up policy
policy_fn = lambda observation_space, action_space, deterministic: NeuralAF(observation_space=observation_space,
                                                                            action_space=action_space,
                                                                            deterministic=deterministic,
                                                                            options=ppo_spec["policy_options"])

# do training
print("Training on {}.\nFind logs, weights, and learning curve at {}\n\n".format(env_spec["env_id"], logpath))
ppo = PPO(policy_fn=policy_fn, params=ppo_spec, logpath=logpath, save_interval=1)

# learning curve is plotted online in separate process
p = mp.Process(target=plot_learning_curve_online, kwargs={"logpath": logpath,
def eval_experiment(eval_spec):
    env_id = eval_spec["env_id"]
    env_seed_offset = eval_spec["env_seed_offset"]
    policy = eval_spec["policy"]
    logpath = eval_spec["logpath"]
    policy_specs = eval_spec["policy_specs"]
    savepath = eval_spec["savepath"]
    n_workers = eval_spec["n_workers"]
    n_episodes = eval_spec["n_episodes"]
    assert n_episodes % n_workers == 0
    T = eval_spec["T"]

    if policy != "MetaBO":
        pi = None
        deterministic = None
        load_iter = None

    os.makedirs(savepath, exist_ok=True)
    env_seeds = env_seed_offset + np.arange(n_workers)

    dummy_env = gym.make(env_id)

    timestamp = datetime.strftime(datetime.now(), "%Y-%m-%d-%H-%M-%S")
    taf_datafile = policy_specs["TAF_datafile"] if "TAF_datafile" in policy_specs else None
    write_overview_logfile(savepath=savepath, timestamp=timestamp, env=dummy_env, policy=policy,
                           env_seeds=env_seeds, taf_datafile=taf_datafile, policy_specs=policy_specs)
    env_specs = dummy_env.spec._kwargs

    # prepare the policies
    if policy == "GP-UCB":
        feature_order = dummy_env.unwrapped.feature_order_eval_envs
        D = dummy_env.unwrapped.D
        policy_fn = lambda *_: UCB(feature_order=feature_order, kappa=policy_specs["kappa"], D=D,
                                   delta=policy_specs["delta"])
    elif policy == "EI":
        feature_order = dummy_env.unwrapped.feature_order_eval_envs
        policy_fn = lambda *_: EI(feature_order=feature_order)
    elif policy == "TAF-ME":
        policy_fn = lambda *_: TAF(datafile=policy_specs["TAF_datafile"], mode="me")
    elif policy == "TAF-RANKING":
        policy_fn = lambda *_: TAF(datafile=policy_specs["TAF_datafile"], mode="ranking", rho=1.0)
    elif policy == "PI":
        feature_order = dummy_env.unwrapped.feature_order_eval_envs
        policy_fn = lambda *_: PI(feature_order=feature_order, xi=policy_specs["xi"])
    elif policy == "EPS-GREEDY":
        feature_order = dummy_env.unwrapped.feature_order_eps_greedy
        policy_fn = lambda *_: EpsGreedy(datafile=policy_specs["datafile"], feature_order=feature_order,
                                         eps=policy_specs["eps"])
    elif policy == "GMM-UCB":
        feature_order = dummy_env.unwrapped.feature_order_gmm_ucb
        policy_fn = lambda *_: GMM_UCB(datafile=policy_specs["datafile"], feature_order=feature_order,
                                       ucb_kappa=policy_specs["ucb_kappa"], w=policy_specs["w"],
                                       n_components=policy_specs["n_components"])
    elif policy == "MetaBO":
        load_iter = eval_spec["load_iter"]
        deterministic = eval_spec["deterministic"]
        pi, policy_specs, _ = load_metabo_policy(logpath=logpath, load_iter=load_iter, env=dummy_env,
                                                 device="cpu", deterministic=deterministic)
        policy_fn = lambda osp, asp, det: NeuralAF(observation_space=osp, action_space=asp, deterministic=det,
                                                   options=policy_specs["policy_options"])
    elif policy == "Random":
        pass  # will be dealt with separately below
    else:
        raise ValueError("Unknown policy!")
    dummy_env.close()

    # evaluate the experiment
    if policy != "Random":
        br = BatchRecorder(size=T * n_episodes, env_id=env_id, env_seeds=env_seeds, policy_fn=policy_fn,
                           n_workers=n_workers, deterministic=deterministic)
        if policy == "MetaBO":
            br.set_worker_weights(pi=pi)
        br.record_batch(gamma=1.0, lam=1.0)  # gamma, lam do not matter for evaluation
        transitions = Transition(*zip(*br.memory.copy()))
        rewards = transitions.reward
        br.cleanup()
    else:
        env = gym.make(env_id)
        env.seed(env_seed_offset)
        rewards = []
        for _ in range(n_episodes):
            rewards = rewards + env.unwrapped.get_random_sampling_reward()
        env.close()

    # save result
    result = Result(logpath=logpath, env_id=env_id, env_specs=env_specs, policy=policy, policy_specs=policy_specs,
                    deterministic=deterministic, load_iter=load_iter, T=T, n_episodes=n_episodes, rewards=rewards)
    fn = "result_metabo_iter_{:04d}".format(load_iter) if policy == "MetaBO" else "result_{}".format(policy)
    with open(os.path.join(savepath, fn), "wb") as f:
        pkl.dump(result, f)