import numpy as np

from DQN.DQNAgent import DQNAgent


class HierarchicalDQNAgent(object):
    INTRINSIC_STEP_COST = 0.
    INTRINSIC_REWARD = 1.

    def __init__(self,
                 original_states_n: tuple,
                 meta_controller_states_n: tuple,
                 actions_n: int,
                 controller_hidden_layers=[32, 32, 32],
                 meta_controller_hidden_layers=[32, 32, 32],
                 discount=0.99,
                 controller_lr=0.1,
                 meta_controller_lr=0.0001,
                 subgoals_num=None,
                 epsilon_decay_step=10000,
                 epsilon_end=0.02):
        """
        :param original_states_n: tuple, shape of the raw environment state
        :param meta_controller_states_n: tuple, shape of the meta-controller's state
        :param actions_n: int, number of primitive actions available to the controller
        :param controller_lr: learning rate of the controller
        :param meta_controller_lr: learning rate of the meta-controller
        :param subgoals_num: int, number of subgoals the meta-controller chooses from
        """
        self._num_subgoals = subgoals_num
        # The meta-controller picks one of the subgoals given its own view of the state.
        self.meta_controller = DQNAgent(states_n=meta_controller_states_n,
                                        actions_n=self._num_subgoals,
                                        hidden_layers=meta_controller_hidden_layers,
                                        scope_name='meta_controller',
                                        learning_rate=meta_controller_lr,
                                        epsilon_decay_step=epsilon_decay_step,
                                        epsilon_end=epsilon_end,
                                        discount=discount)
        # The controller acts on the raw state concatenated with the goal encoding.
        self.controller = DQNAgent(states_n=(original_states_n[0] + self._num_subgoals, ),
                                   actions_n=actions_n,
                                   hidden_layers=controller_hidden_layers,
                                   scope_name='controller',
                                   learning_rate=controller_lr,
                                   epsilon_decay_step=epsilon_decay_step,
                                   epsilon_end=epsilon_end,
                                   discount=discount)

    def choose_goal(self, state, epsilon=None):
        return self.meta_controller.choose_action(state, epsilon=epsilon)

    def choose_action(self, state, goal, epsilon=None):
        # `goal` is expected to be a vector (e.g. a one-hot of length subgoals_num)
        # so that the concatenated input matches the controller's input size.
        return self.controller.choose_action(np.concatenate((state, goal), axis=0),
                                             epsilon=epsilon)
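

# Illustrative usage sketch, not taken from the original repository: it shows how
# the two levels are meant to interact, assuming a 6-dimensional raw state and
# 6 subgoals. The subgoal index returned by the meta-controller is one-hot
# encoded so that the concatenated controller input has size
# original_states_n[0] + subgoals_num.
if __name__ == '__main__':
    h_agent = HierarchicalDQNAgent(original_states_n=(6, ),
                                   meta_controller_states_n=(6, ),
                                   actions_n=2,
                                   subgoals_num=6)
    state = np.zeros(6, dtype=np.float32)      # placeholder raw state
    goal_idx = h_agent.choose_goal(state)      # meta-controller picks a subgoal index
    goal_one_hot = np.eye(6)[goal_idx]         # encode the subgoal for the controller
    action = h_agent.choose_action(state, goal_one_hot)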
def _init_single_agent(self, agent_kwargs: Dict[str, Any]):
    """Create and return an agent.

    The type of agent depends on the self.type parameter.

    Args:
        agent_kwargs (dict): keyword arguments forwarded to the agent constructor.

    Returns:
        Agent: the initialized agent.
    """
    if self.type == "DQN":
        agent = DQNAgent(**agent_kwargs)
    elif self.type == "tile coder test":
        agent = self._init_tc_agent(**agent_kwargs)
    elif self.type == "REINFORCE":
        agent = REINFORCEAgent(**agent_kwargs)
    elif self.type == "REINFORCE with baseline":
        agent = REINFORCEAgentWithBaseline(**agent_kwargs)
    elif self.type == "actor-critic":
        agent = ActorCriticAgent(**agent_kwargs)
    elif self.type == "Abaddon test":
        agent = AbaddonAgent(**agent_kwargs)
    elif self.type == "PPO":
        agent = PPOAgent(**agent_kwargs)
    elif self.type == "DDPG":
        agent = DDPGAgent(**agent_kwargs)
    else:
        raise ValueError(
            f"agent not initialized because {self.type} is not recognised")
    return agent
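

# Hypothetical usage sketch (names and kwargs keys below are placeholders, not
# taken from the original file): the factory is assumed to be a method of a
# runner/manager class that stores the requested algorithm name in `self.type`
# and forwards the kwargs dict untouched to the chosen agent's constructor, e.g.
#
#     runner.type = "DQN"
#     agent = runner._init_single_agent({"states_n": (4, ),
#                                        "actions_n": 2,
#                                        "hidden_layers": [32, 32],
#                                        "scope_name": "cartpole"})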
DEBUG = False
TS_GREEDY_COEFF = 1.0
TRAIN = False
TEST_NUM = 10

if __name__ == '__main__':
    env = CMOTP()
    agent = DQNAgent(env.observation_space.shape,
                     env.action_space.n,
                     [512, 512],
                     'cmotp',
                     epsilon_decay_step=10000,
                     epsilon_end=0.05,
                     replay_memory_size=100000,
                     learning_rate=1e-4,
                     targetnet_update_freq=5000,
                     tau=1.)
    if TRAIN:
        temp_record = Temp_record(shape=tuple(env.observation_space.high + 1) + (env.action_space.n, ),
                                  beta_len=1500)
        for i in range(5000):
            state = env.reset()
            episode_len = 0
            episode_reward = 0
            episode = []
            while True:
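                # The loop body is truncated in this excerpt. An assumed (not original)
                # single-step body, mirroring the multi-environment CMOTP script further
                # below, would look like:
                #
                #     action = agent.choose_action(
                #         state,
                #         epsilon=pow(temp_record.get_state_temp(state), TS_GREEDY_COEFF))
                #     action_n = [int(action % 5), int(action // 5)]   # decode joint action
                #     next_state, reward, done, _ = env.step(action_n)
                #     agent.store(state, action, reward, next_state, float(done))
                #     agent.train()
                #     episode.append((state, action))
                #     episode_len += 1
                #     episode_reward += reward
                #     state = next_state
                #     if done:
                #         temp_record.decay_temp(episode)
                #         break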
def get_reward_by_goal(st: np.ndarray, gl: np.ndarray) -> int:
    """Sparse goal-conditioned reward: 0 when the state matches the goal, -1 otherwise."""
    if np.all(st == gl):
        return 0
    return -1


if __name__ == '__main__':
    env = BitsGame(15)
    agent = DQNAgent(states_n=(env.size * 2, ),
                     actions_n=env.action_space.n,
                     hidden_layers=[256],
                     scope_name='BitsGame',
                     learning_rate=1e-4,
                     replay_memory_size=10000,
                     batch_size=32,
                     targetnet_update_freq=1000,
                     epsilon_end=0.05,
                     epsilon_decay_step=10000)
    if TRAIN:
        max_episode_len = env.observation_space.shape[0]
        rewards_record = []
        for episode_iter in range(EPISODES_NUM):
            state, goal = env.reset()
            reward_of_this_episode = 0
            len_of_this_episode = 0
            episode_record = []
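

# Sketch (an assumption, not code from the original file) of how `get_reward_by_goal`
# could be used to relabel an episode in the hindsight-experience-replay style that the
# goal-conditioned BitsGame setup suggests. It assumes `episode` is a list of
# (state, action, next_state) tuples and uses the same
# `agent.store(state, action, reward, next_state, done)` signature as the other scripts.
def store_with_hindsight(agent, episode, goal):
    achieved = episode[-1][2]  # treat the final reached state as an extra goal
    for st, act, n_st in episode:
        for g in (goal, achieved):
            agent.store(np.concatenate((st, g)), act,
                        get_reward_by_goal(n_st, g),
                        np.concatenate((n_st, g)),
                        float(np.all(n_st == g)))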
import time

import numpy as np

from DQN.DQNAgent import DQNAgent
from Env.Stochastic_MDP import StochasticMDPEnv

if __name__ == '__main__':
    env = StochasticMDPEnv()
    agent = DQNAgent(env.observation_space.shape,
                     env.action_space.n,
                     [32, 32, 32],
                     'smdp',
                     epsilon_decay_step=10000,
                     epsilon_end=0.02,
                     replay_memory_size=50000,
                     learning_rate=5e-4)
    episode_lens = []
    episode_rewards = []
    for i in range(100000):
        state = env.reset()
        episode_len = 0
        episode_reward = 0
        while True:
            action = agent.choose_action(state=state)
            next_state, reward, done, _ = env.step(action)
            agent.store(state, action, reward, next_state, float(done))
            agent.train()
            episode_len += 1
            episode_reward += reward
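            # The rest of the loop is truncated in this excerpt. An assumed (not
            # original) continuation would advance the state and record episode
            # statistics once the environment terminates, e.g.
            #
            #     state = next_state
            #     if done:
            #         episode_lens.append(episode_len)
            #         episode_rewards.append(episode_reward)
            #         break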
def main():
    env = MultiEnvRunnerWrapper(ENV_NUM, CMOTP)
    agent = DQNAgent(env.envs[0].observation_space.shape,
                     env.envs[0].action_space.n,
                     [512, 512],
                     'cmotp',
                     discount=GAMMA,
                     epsilon_decay_step=10000,
                     epsilon_end=0.05,
                     replay_memory_size=100000,
                     learning_rate=1e-4,
                     targetnet_update_freq=5000,
                     tau=1.)
    if TRAIN:
        temp_records = [Temp_record(shape=tuple(env.envs[0].observation_space.high + 1) + (env.envs[0].action_space.n,),
                                    beta_len=1500)
                        for _ in range(ENV_NUM)]
        train_input_shape = (ENV_NUM * STEP_N, ) + env.envs[0].observation_space.shape
        print(train_input_shape)
        episodes = [[] for _ in range(ENV_NUM)]
        states = env.reset()
        ep_cnt = 0
        for i in range(TRAIN_NUM):
            sts = [[] for _ in range(ENV_NUM)]
            acts = [[] for _ in range(ENV_NUM)]
            rwds = [[] for _ in range(ENV_NUM)]
            n_sts = [[] for _ in range(ENV_NUM)]
            dns = [[] for _ in range(ENV_NUM)]
            # get a batch of train data
            for j in range(ENV_NUM):
                for k in range(STEP_N):
                    # temperature-scaled epsilon: frequently visited states explore less
                    action = agent.choose_action(
                        states[j],
                        epsilon=pow(temp_records[j].get_state_temp(states[j]), TS_GREEDY_COEFF))
                    # decode the joint action into the two agents' moves
                    action_n = [int(action % 5), int(action // 5)]
                    n_st, rwd, dn, _ = env.envs[j].step(action_n)
                    # print(states[j], action, rwd, n_st, dn)
                    # record episodes
                    episodes[j].append((states[j], action))
                    # record train data
                    sts[j].append(states[j])
                    acts[j].append(action)
                    rwds[j].append(rwd)
                    n_sts[j].append(n_st)
                    dns[j].append(dn)
                    states[j] = n_st
                    if dn:
                        states[j] = env.envs[j].reset()
                        ep_cnt += 1
                        print('train_num: {}, episode_cnt: {}, len: {} '.format(i, ep_cnt, len(episodes[j])))
                        temp_records[j].decay_temp(episodes[j])
                        episodes[j] = []
            # discount reward
            last_values = agent.get_max_target_Q_s_a(states)
            for j, (rwd_j, dn_j, l_v_j) in enumerate(zip(rwds, dns, last_values)):
                if type(rwd_j) is np.ndarray:
                    rwd_j = rwd_j.tolist()
                if type(dn_j) is np.ndarray:
                    dn_j = dn_j.tolist()
                if dn_j[-1] == 0:
                    # bootstrap from the target network when the rollout did not terminate
                    rwd_j = discount_with_dones(rwd_j + [l_v_j], dn_j + [0], GAMMA)[:-1]
                else:
                    rwd_j = discount_with_dones(rwd_j, dn_j, GAMMA)
                rwds[j] = rwd_j
            # flatten
            sts = np.asarray(sts, dtype=np.float32).reshape(train_input_shape)
            acts = np.asarray(acts, dtype=np.int32).flatten()
            rwds = np.asarray(rwds, dtype=np.float32).flatten()
            n_sts = np.asarray(n_sts, dtype=np.float32).reshape(train_input_shape)
            dns = np.asarray(dns, dtype=bool).flatten()
            # train
            agent.train_without_replaybuffer(sts, acts, rwds)
    else:
        # test
        test_env = CMOTP()
        agent.load_model()
        for i in range(TEST_NUM):
            state = test_env.reset()
            while True:
                test_env.render()
                time.sleep(1)
                action = agent.choose_action(state, epsilon=0.05)
                action_n = [int(action % 5), int(action // 5)]
                next_state, reward, done, _ = test_env.step(action_n)
                state = next_state
                if done:
                    break
        test_env.close()
    env.close()
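

# `discount_with_dones` is used above but not defined in this excerpt. A typical
# implementation (an assumption modelled on the common baselines-style helper, not
# the project's own code) accumulates discounted returns backwards and resets the
# running return at episode boundaries:
def discount_with_dones(rewards, dones, gamma):
    discounted, running_return = [], 0.
    for reward, done in zip(rewards[::-1], dones[::-1]):
        running_return = reward + gamma * running_return * (1. - done)
        discounted.append(running_return)
    return discounted[::-1]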