import torch


def load_envs_and_config(model_file):
    # restore the saved config and run the reward env on the GPU
    save_dict = torch.load(model_file)
    config = save_dict['config']
    config['device'] = 'cuda'

    # EnvFactory is a repo-internal class; it rebuilds both the learned
    # reward env and the underlying real env from the saved config
    env_factory = EnvFactory(config=config)
    reward_env = env_factory.generate_reward_env()
    reward_env.load_state_dict(save_dict['model'])
    real_env = env_factory.generate_real_env()

    return reward_env, real_env, config
def load_envs_and_config(model_file):
    # CartPole-v0 variant: run on the CPU and raise the solved-reward
    # threshold so the early-out never triggers during evaluation
    save_dict = torch.load(model_file)
    config = save_dict['config']
    config['device'] = 'cpu'
    config['envs']['CartPole-v0']['solved_reward'] = 100000  # big enough to prevent the early-out from triggering

    env_factory = EnvFactory(config=config)
    reward_env = env_factory.generate_reward_env()
    reward_env.load_state_dict(save_dict['model'])
    real_env = env_factory.generate_real_env()

    return reward_env, real_env, config
def load_envs_and_config(model_file):
    # Cliff variant: depending on BREAK, either use a reachable solved-reward
    # threshold (training stops early once the task is solved) or raise the
    # threshold so the early-out never triggers
    save_dict = torch.load(model_file)
    config = save_dict['config']
    if BREAK == 'solved':
        config['envs']['Cliff']['solved_reward'] = -20  # reachable threshold, early-out fires once solved
    else:
        config['envs']['Cliff']['solved_reward'] = 100000  # big enough to prevent the early-out from triggering

    env_factory = EnvFactory(config=config)
    reward_env = env_factory.generate_reward_env()
    reward_env.load_state_dict(save_dict['model'])
    real_env = env_factory.generate_real_env()

    return reward_env, real_env, config
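# A minimal usage sketch for the loaders above. The checkpoint path
# 'trained_reward_env.pt' is a placeholder, not a file from the repo;
# get_solved_reward() is the same env accessor used in the BOHB worker below.
if __name__ == "__main__":
    reward_env, real_env, config = load_envs_and_config('trained_reward_env.pt')
    print('device:', config['device'])
    print('solved reward threshold:', real_env.get_solved_reward())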
def compute(self, working_dir, bohb_id, config_id, cso, budget, *args, **kwargs):
    with open(CONFIG_FILE, 'r') as stream:
        default_config = yaml.safe_load(stream)
    config = self.get_specific_config(cso, default_config, budget)

    print('----------------------------')
    print('START BOHB ITERATION')
    print('CONFIG: ' + str(config))
    print('CSO: ' + str(cso))
    print('BUDGET: ' + str(budget))
    print('----------------------------')

    info = {}

    # generate environments
    env_fac = EnvFactory(config)
    real_env = env_fac.generate_real_env()
    reward_env = env_fac.generate_reward_env()

    # load the pre-trained reward network; the BOHB config is kept,
    # only the model weights are restored
    save_dict = torch.load(SAVE_FILE)
    # config = save_dict['config']
    reward_env.load_state_dict(save_dict['model'])

    score = 0
    for i in range(NUM_EVALS):
        # train TD3 on the learned reward env, then test on the real env
        td3 = TD3(env=reward_env, max_action=reward_env.get_max_action(), config=config)
        reward_list_train, _, _ = td3.train(reward_env, test_env=real_env)
        reward_list_test, _, _ = td3.test(real_env)
        avg_reward_test = statistics.mean(reward_list_test)

        # fewer training episodes and a smaller gap to the solved threshold
        # both lower the score; BOHB minimizes it as the loss
        unsolved_weight = config['agents']['gtn']['unsolved_weight']
        score += len(reward_list_train) + max(0, real_env.get_solved_reward() - avg_reward_test) * unsolved_weight

    score = score / NUM_EVALS
    info['config'] = str(config)

    print('----------------------------')
    print('FINAL SCORE: ' + str(score))
    print('END BOHB ITERATION')
    print('----------------------------')

    return {"loss": score, "info": info}
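# For reference, a self-contained sketch of the score computed in compute()
# above. This is an assumed reading of that loop, and bohb_score is a name
# introduced here, not a function from the repo: each evaluation contributes
# the number of training episodes plus a weighted penalty for finishing below
# the solved threshold, and BOHB minimizes the mean over evaluations.
import statistics


def bohb_score(train_episode_counts, avg_test_rewards, solved_reward, unsolved_weight):
    per_run = [
        n_train + max(0, solved_reward - avg_test) * unsolved_weight
        for n_train, avg_test in zip(train_episode_counts, avg_test_rewards)
    ]
    return statistics.mean(per_run)


# e.g. bohb_score([120, 95], [180.0, 210.0], solved_reward=195.0, unsolved_weight=2.0)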
        # fragment of QL's epsilon-greedy decay (enclosing method not shown):
        else:
            self.eps *= self.eps_decay
            self.eps = max(self.eps, self.eps_min)


import statistics

if __name__ == "__main__":
    with open("../default_config_gridworld.yaml", "r") as stream:
        config = yaml.safe_load(stream)
    print(config)

    torch.set_num_threads(1)

    # generate environments
    env_fac = EnvFactory(config)
    real_env = env_fac.generate_real_env()
    # virtual_env = env_fac.generate_virtual_env()
    reward_env = env_fac.generate_reward_env()

    reward_list_len = []
    for i in range(20):
        # train and evaluate tabular Q-learning directly on the real env
        ql = QL(env=real_env, config=config, count_based=False)
        reward_list_train, episode_length_list_train, _ = ql.train(
            env=real_env, test_env=real_env, time_remaining=5000)
        reward_list_test, episode_length_list_test, _ = ql.test(
            env=real_env, time_remaining=500)
        reward_list_len.append(len(reward_list_train))
        print(len(reward_list_train))
        print(sum(episode_length_list_train))

    print(statistics.mean(reward_list_len))
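# A hypothetical standalone reconstruction of the epsilon schedule the
# fragment above belongs to. Only eps, eps_decay and eps_min are taken from
# the original; the class and method names here are assumptions.
class EpsSchedule:
    def __init__(self, eps=1.0, eps_decay=0.99, eps_min=0.01):
        self.eps = eps
        self.eps_decay = eps_decay
        self.eps_min = eps_min

    def step(self):
        # multiplicative decay, clipped from below at eps_min
        self.eps *= self.eps_decay
        self.eps = max(self.eps, self.eps_min)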