def train_test_agents(mode, env, real_env, config):
    rewards = []
    episode_lengths = []

    # settings for comparability
    config['agents']['td3']['test_episodes'] = 1
    config['agents']['td3']['train_episodes'] = 500
    config['agents']['td3']['activation_fn'] = "relu"
    config['agents']['td3']['lr'] = 0.001
    config['agents']['td3']['tau'] = 0.005
    config['agents']['td3']['same_action_num'] = 2
    config['agents']['td3']['policy_delay'] = 2
    config['agents']['td3']['rb_size'] = 10000000
    # config['agents']['td3']['policy_std_clip'] = 0.9
    config['agents']['td3']['print_rate'] = 100

    for i in range(MODEL_AGENTS):
        if mode == '-1':
            agent = select_agent(config=config, agent_name='td3_icm')
        else:
            agent = select_agent(config=config, agent_name='td3')
        reward, episode_length, _ = agent.train(env=env, test_env=real_env)
        print('reward: ' + str(reward))
        rewards.append(reward)
        episode_lengths.append(episode_length)

    return rewards, episode_lengths
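# Hypothetical invocation of the function above; MODEL_AGENTS and select_agent
# come from the surrounding script, and the env variables are placeholders.
# mode == '-1' selects the ICM-augmented TD3 agent, anything else plain TD3.
# rewards, episode_lengths = train_test_agents(mode='-1', env=syn_env,
#                                              real_env=real_env, config=config)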
def train_test_agents(mode, env, real_env, config):
    rewards = []
    episode_lengths = []

    # settings for comparability
    config['agents']['duelingddqn'] = {}
    config['agents']['duelingddqn']['test_episodes'] = 1
    config['agents']['duelingddqn']['train_episodes'] = 1000
    config['agents']['duelingddqn']['print_rate'] = 100
    config['agents']['duelingddqn']['lr'] = 0.00025
    config['agents']['duelingddqn']['eps_init'] = 1.0
    config['agents']['duelingddqn']['eps_min'] = 0.1
    config['agents']['duelingddqn']['eps_decay'] = 0.9  # original DDQN paper uses linear decay over 1M steps
    config['agents']['duelingddqn']['gamma'] = 0.99
    config['agents']['duelingddqn']['batch_size'] = 32
    config['agents']['duelingddqn']['same_action_num'] = 1
    config['agents']['duelingddqn']['activation_fn'] = "relu"
    config['agents']['duelingddqn']['tau'] = 0.01  # original DDQN paper uses a hard update every N steps
    config['agents']['duelingddqn']['hidden_size'] = 64
    config['agents']['duelingddqn']['hidden_layer'] = 1
    config['agents']['duelingddqn']['rb_size'] = 1000000
    config['agents']['duelingddqn']['init_episodes'] = 1
    config['agents']['duelingddqn']['feature_dim'] = 128
    config['agents']['duelingddqn']['early_out_num'] = 10
    config['agents']['duelingddqn']['early_out_virtual_diff'] = 0.02

    # optimized ICM HPs:
    config['agents']['icm'] = {}
    config['agents']['icm']['beta'] = 0.05
    config['agents']['icm']['eta'] = 0.03
    config['agents']['icm']['feature_dim'] = 32
    config['agents']['icm']['hidden_size'] = 128
    config['agents']['icm']['lr'] = 1e-5

    # default ICM HPs:
    # config['agents']['icm'] = {}
    # config['agents']['icm']['beta'] = 0.2
    # config['agents']['icm']['eta'] = 0.5
    # config['agents']['icm']['feature_dim'] = 64
    # config['agents']['icm']['hidden_size'] = 128
    # config['agents']['icm']['lr'] = 1e-4

    for i in range(MODEL_AGENTS):
        if mode == '-1':
            agent = select_agent(config=config, agent_name='duelingddqn_icm')
        else:
            agent = select_agent(config=config, agent_name='duelingddqn')
        reward, episode_length, _ = agent.train(env=env, test_env=real_env)
        rewards.append(reward)
        episode_lengths.append(episode_length)

    return rewards, episode_lengths
def train_test_agents(mode, env, real_env, config):
    rewards = []
    episode_lengths = []

    # settings for comparability
    config['agents']['ppo'] = {}
    config['agents']['ppo']['test_episodes'] = 1
    config['agents']['ppo']['train_episodes'] = 5000
    config['agents']['ppo']['print_rate'] = 100
    config['agents']['ppo']['init_episodes'] = 0
    config['agents']['ppo']['update_episodes'] = 1
    config['agents']['ppo']['ppo_epochs'] = 10
    config['agents']['ppo']['gamma'] = 0.99
    # config['agents']['ppo']['lr'] = 3e-4
    config['agents']['ppo']['lr'] = 1e-5
    config['agents']['ppo']['vf_coef'] = 1
    config['agents']['ppo']['ent_coef'] = 0.001
    config['agents']['ppo']['eps_clip'] = 0.2
    config['agents']['ppo']['rb_size'] = 1000000
    config['agents']['ppo']['same_action_num'] = 1
    config['agents']['ppo']['activation_fn'] = 'tanh'
    config['agents']['ppo']['hidden_size'] = 128
    config['agents']['ppo']['hidden_layer'] = 2
    config['agents']['ppo']['action_std'] = 0.1
    config['agents']['ppo']['early_out_num'] = 50
    config['agents']['ppo']['early_out_virtual_diff'] = 0.02

    # BOHB optimized HPs
    config['agents']['icm'] = {}
    config['agents']['icm']['beta'] = 0.05
    config['agents']['icm']['eta'] = 0.03
    config['agents']['icm']['feature_dim'] = 32
    config['agents']['icm']['hidden_size'] = 128
    config['agents']['icm']['lr'] = 1e-4

    # default ICM HPs:
    # config['agents']['icm'] = {}
    # config['agents']['icm']['beta'] = 0.2
    # config['agents']['icm']['eta'] = 0.5
    # config['agents']['icm']['feature_dim'] = 64
    # config['agents']['icm']['hidden_size'] = 128
    # config['agents']['icm']['lr'] = 1e-4

    for i in range(MODEL_AGENTS):
        if mode == '-1':
            agent = select_agent(config=config, agent_name='ppo_icm')
        else:
            agent = select_agent(config=config, agent_name='ppo')
        reward, episode_length, _ = agent.train(env=env, test_env=real_env)
        print('reward: ' + str(reward))
        rewards.append(reward)
        episode_lengths.append(episode_length)

    return rewards, episode_lengths
def train_test_agents(mode, env, real_env, config):
    rewards = []
    episode_lengths = []

    # settings for comparability
    config['agents']['td3']['test_episodes'] = 1
    config['agents']['td3']['train_episodes'] = 3000
    config['agents']['td3']['print_rate'] = 100
    config['agents']['td3']['lr'] = 3e-4
    config['agents']['td3']['tau'] = 0.005
    config['agents']['td3']['activation_fn'] = 'relu'
    config['agents']['td3']['same_action_num'] = 2
    config['agents']['td3']['policy_delay'] = 2
    config['agents']['td3']['policy_std_clip'] = 0.5
    config['agents']['td3']['policy_std'] = 0.2
    config['agents']['td3']['action_std'] = 0.1
    config['agents']['td3']['batch_size'] = 256
    config['agents']['td3']['gamma'] = 0.99
    config['agents']['td3']['rb_size'] = 1000000
    config['agents']['td3']['init_episodes'] = 50  # set via time steps in original TD3 implementation
    config['agents']['td3']['early_out_num'] = 10
    config['agents']['td3']['early_out_virtual_diff'] = 1e-2

    # optimized ICM HPs:
    config['agents']['icm'] = {}
    config['agents']['icm']['beta'] = 0.1
    config['agents']['icm']['eta'] = 0.01
    config['agents']['icm']['feature_dim'] = 32
    config['agents']['icm']['hidden_size'] = 128
    config['agents']['icm']['lr'] = 5e-4

    # default ICM HPs:
    # config['agents']['icm'] = {}
    # config['agents']['icm']['beta'] = 0.2
    # config['agents']['icm']['eta'] = 0.5
    # config['agents']['icm']['feature_dim'] = 64
    # config['agents']['icm']['hidden_size'] = 128
    # config['agents']['icm']['lr'] = 1e-4

    for i in range(MODEL_AGENTS):
        config_mod = vary_hp(config)
        if mode == '-1':
            agent = select_agent(config=config_mod, agent_name='td3_icm')
        else:
            agent = select_agent(config=config_mod, agent_name='td3')
        reward, episode_length, _ = agent.train(env=env, test_env=real_env)
        print('reward: ' + str(reward))
        rewards.append(reward)
        episode_lengths.append(episode_length)

    return rewards, episode_lengths
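# The loop above assumes a vary_hp helper that perturbs hyperparameters so
# each trained agent differs slightly. A minimal sketch of such a helper
# (hypothetical; the repo's actual vary_hp may sample other keys or ranges):
import copy
import random

def vary_hp(config):
    # deep-copy so the shared base config stays untouched
    config_mod = copy.deepcopy(config)
    td3 = config_mod['agents']['td3']
    td3['lr'] *= random.uniform(0.5, 2.0)               # jitter learning rate
    td3['batch_size'] = random.choice([128, 256, 512])  # resample batch size
    td3['gamma'] = random.uniform(0.95, 0.999)          # resample discount factor
    return config_mod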
def train_test_agents(env, real_env, config):
    states = []

    # settings for comparability
    config['agents']['sarsa'] = {}
    config['agents']['sarsa']['test_episodes'] = 1
    config['agents']['sarsa']['train_episodes'] = 200
    config['agents']['sarsa']['print_rate'] = 100
    config['agents']['sarsa']['init_episodes'] = config['agents']['ql']['init_episodes']
    config['agents']['sarsa']['batch_size'] = config['agents']['ql']['batch_size']
    config['agents']['sarsa']['alpha'] = config['agents']['ql']['alpha']
    config['agents']['sarsa']['gamma'] = config['agents']['ql']['gamma']
    config['agents']['sarsa']['eps_init'] = config['agents']['ql']['eps_init']
    config['agents']['sarsa']['eps_min'] = config['agents']['ql']['eps_min']
    config['agents']['sarsa']['eps_decay'] = config['agents']['ql']['eps_decay']
    config['agents']['sarsa']['rb_size'] = config['agents']['ql']['rb_size']
    config['agents']['sarsa']['same_action_num'] = config['agents']['ql']['same_action_num']
    config['agents']['sarsa']['early_out_num'] = config['agents']['ql']['early_out_num']
    config['agents']['sarsa']['early_out_virtual_diff'] = config['agents']['ql']['early_out_virtual_diff']

    for i in range(MODEL_AGENTS):
        agent = select_agent(config=config, agent_name='sarsa')
        reward, _, _ = agent.train(env=env, test_env=real_env)
        _, _, replay_buffer = agent.test(env=real_env)
        state, _, next_state, _, _ = replay_buffer.get_all()
        state = state.tolist()
        next_state = next_state.tolist()

        # skip if we could not solve env
        if len(reward) == config['agents']['sarsa']['train_episodes'] and BREAK == 'solved':
            continue

        state = [int(elem[0]) for elem in state]
        state.append(int(next_state[-1][0]))
        states.append(state)

    return states
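# Hypothetical post-processing of the state sequences returned above: count
# how often each state was visited. The dummy trajectories are purely
# illustrative; real values come from train_test_agents.
from collections import Counter

states = [[0, 1, 2, 2, 3], [0, 2, 2, 4]]  # dummy per-agent state trajectories
visit_counts = Counter(s for trajectory in states for s in trajectory)
print(visit_counts.most_common(3))  # -> [(2, 4), (0, 2), (1, 1)]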
def calc_score(self, env, time_remaining):
    time_start = time.time()

    agent = select_agent(config=self.config, agent_name=self.agent_name)
    real_env = self.env_factory.generate_real_env()

    reward_list_train, episode_length_train, _ = agent.train(
        env=env,
        test_env=real_env,
        time_remaining=time_remaining - (time.time() - time_start))
    reward_list_test, _, _ = agent.test(
        env=real_env,
        time_remaining=time_remaining - (time.time() - time_start))
    avg_reward_test = statistics.mean(reward_list_test)

    if env.is_virtual_env():
        return avg_reward_test
    else:
        # when timeout occurs, reward_list_train is padded (with min. reward values) and episode_length_train is not
        # if len(episode_length_train) < len(reward_list_train):
        #     print("due to timeout, reward_list_train has been padded")
        #     print(f"shape rewards: {np.shape(reward_list_train)}, shape episode lengths: {np.shape(episode_length_train)}")
        #     reward_list_train = reward_list_train[:len(episode_length_train)]
        print("AVG REWARD: ", avg_reward_test)
        return avg_reward_test
def train_test_agents(mode, env, real_env, config):
    rewards = []

    # settings for comparability
    config['agents']['td3']['test_episodes'] = 1
    config['agents']['td3']['train_episodes'] = 50
    config['agents']['td3']['print_rate'] = 1

    for i in range(MODEL_AGENTS):
        if mode == '-1':
            agent = select_agent(config=config, agent_name='td3_icm')
        else:
            agent = select_agent(config=config, agent_name='td3')
        reward, _, _ = agent.train(env=env, test_env=real_env)
        print('reward: ' + str(reward))
        rewards.append(reward)

    return rewards
def train_test_agents(mode, env, real_env, config):
    rewards = []
    episode_lengths = []

    # settings for comparability
    config['agents']['sarsa'] = {}
    config['agents']['sarsa']['test_episodes'] = 1
    config['agents']['sarsa']['train_episodes'] = 500
    config['agents']['sarsa']['print_rate'] = 100
    config['agents']['sarsa']['alpha'] = 1.0
    config['agents']['sarsa']['eps_decay'] = 0.0
    config['agents']['sarsa']['eps_init'] = 0.01
    config['agents']['sarsa']['eps_min'] = 0.01
    config['agents']['sarsa']['gamma'] = 0.8
    config['agents']['sarsa']['same_action_num'] = 1
    config['agents']['sarsa']['rb_size'] = 1  # custom to reward env and gridworld
    config['agents']['sarsa']['init_episodes'] = 0
    config['agents']['sarsa']['batch_size'] = 1
    config['agents']['sarsa']['early_out_num'] = 10
    config['agents']['sarsa']['early_out_virtual_diff'] = 0.02

    # for count-based q-learning (tuned)
    config['agents']['sarsa']['beta'] = 0.1
    # config['agents']['sarsa']['beta'] = 0.005  # 0.01 also works fine

    for i in range(MODEL_AGENTS):
        if mode == '-1':
            agent = select_agent(config=config, agent_name='sarsa_cb')
        else:
            agent = select_agent(config=config, agent_name='sarsa')
        reward, episode_length, _ = agent.train(env=env, test_env=real_env)
        rewards.append(reward)
        episode_lengths.append(episode_length)

    return rewards, episode_lengths
def train_test_agents(train_env, test_env, config):
    reward_list = []
    train_steps_needed = []
    episodes_needed = []

    config["agents"]["ddqn"]["print_rate"] = 10
    config["agents"]["ddqn"]["test_episodes"] = 10
    config["render_env"] = True

    agent = select_agent(config=config, agent_name='DDQN')
    reward_train, episode_length, _ = agent.train(env=train_env)
    reward, _, _ = agent.test(env=test_env)
    print('reward: ' + str(reward))

    reward_list.append(reward)
    train_steps_needed.append([sum(episode_length)])
    episodes_needed.append([len(reward_train)])

    return reward_list, train_steps_needed, episodes_needed
def __init__(self, config):
    super().__init__()

    self.config = config
    reptile_config = config["agents"]["reptile"]
    self.max_iterations = reptile_config["max_iterations"]
    self.step_size = reptile_config["step_size"]
    self.parallel_update = reptile_config["parallel_update"]
    self.env_num = reptile_config["env_num"]
    agent_name = reptile_config["agent_name"]

    self.env_factory = EnvFactory(config)
    self.agent = select_agent(config, agent_name)

    self.envs = []
    for i in range(self.env_num):
        self.envs.append(self.env_factory.generate_random_real_env())
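# A minimal config sketch with the keys read by __init__ above; the values
# are illustrative placeholders, not the project's defaults, and EnvFactory
# and select_agent will require further entries:
config = {
    "agents": {
        "reptile": {
            "max_iterations": 100,     # number of Reptile meta-iterations
            "step_size": 0.1,          # interpolation factor for the meta-update
            "parallel_update": False,  # update across envs in parallel or sequentially
            "env_num": 5,              # number of sampled real envs
            "agent_name": "td3",       # inner-loop agent to meta-train
        }
    }
}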
def train_test_agents(mode, env, real_env, config):
    rewards = []
    episode_lengths = []

    config['device'] = 'cuda'
    config['agents']["td3_discrete_vary"]["print_rate"] = 100
    config['agents']["td3_discrete_vary"]["train_episodes"] = 500
    config['agents']["td3_discrete_vary"]["test_episodes"] = 1
    config['envs']['CartPole-v0']['solved_reward'] = 100000  # large enough to prevent the early-out from triggering

    for i in range(MODEL_AGENTS):
        agent = select_agent(config=config, agent_name='td3_discrete_vary')
        reward, episode_length, _ = agent.train(env=env, test_env=real_env)
        print('reward: ' + str(reward))
        rewards.append(sum(reward))
        episode_lengths.append(episode_length)

    return rewards, episode_lengths
def train_test_agents(train_env, test_env, config, agents_num):
    reward_list = []
    train_steps_needed = []
    episodes_needed = []

    # settings for comparability
    config['agents']['duelingddqn_vary']['vary_hp'] = True
    config['agents']['duelingddqn']['print_rate'] = 10
    config['agents']['duelingddqn']['early_out_num'] = 10
    config['agents']['duelingddqn']['train_episodes'] = 1000
    config['agents']['duelingddqn']['init_episodes'] = 10
    config['agents']['duelingddqn']['test_episodes'] = 10
    config['agents']['duelingddqn']['early_out_virtual_diff'] = 0.01

    for i in range(agents_num):
        agent = select_agent(config=config, agent_name='DuelingDDQN_vary')
        reward_train, episode_length, _ = agent.train(env=train_env)
        reward, _, _ = agent.test(env=test_env)
        print('reward: ' + str(reward))
        reward_list.append(reward)
        train_steps_needed.append([sum(episode_length)])
        episodes_needed.append([len(reward_train)])

    return reward_list, train_steps_needed, episodes_needed
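# Hypothetical aggregation of the statistics returned above; numpy is
# assumed, and the dummy values only mirror the shape of the real output.
import numpy as np

reward_list = [[195.0, 200.0], [188.0, 199.0]]  # per-agent test rewards
train_steps_needed = [[12000], [9500]]
episodes_needed = [[140], [110]]
print('mean test reward:', np.mean([np.mean(r) for r in reward_list]))
print('mean train steps:', np.mean(train_steps_needed))
print('mean episodes:', np.mean(episodes_needed))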
                                                 device=device)

step_times_per_episode_real_env = {}
step_times_per_episode_syn_env = {}

for i, file_name in enumerate(file_list):
    syn_env, real_env, config = load_envs_and_config(file_name=file_name,
                                                     model_dir=model_dir,
                                                     device=device)
    config["agents"][agent_name]["init_episodes"] = init_episodes
    config["agents"][agent_name]["train_episodes"] = train_episodes
    config['agents'][agent_name]['print_rate'] = 10

    print('train agents on ' + str(file_name))
    agent = select_agent(config=config, agent_name=agent_name)

    print('train on real env')
    _, _, _, step_times_real_env = agent.train(
        env=real_env, time_remaining=time_remaining_real)
    step_times_per_episode_real_env[real_env.env.env_name + "_" + str(i)] = {
        "step_times_per_episode_real_env": step_times_real_env,
        "step_times_mean": np.mean(np.concatenate(step_times_real_env)),
        "step_times_std": np.std(np.concatenate(step_times_real_env))
    }

    print('train on syn env')
    _, _, _, step_times_syn_env = agent.train(
        env=syn_env, time_remaining=time_remaining_syn)