Example #1
class HierarchicalDQNAgent(object):

    INTRINSIC_STEP_COST = 0.
    INTRINSIC_REWARD = 1.

    def __init__(self,
                 original_states_n: tuple,
                 meta_controller_states_n: tuple,
                 actions_n: int,
                 controller_hidden_layers=[32, 32, 32],
                 meta_controller_hidden_layers=[32, 32, 32],
                 discount=0.99,
                 controller_lr=0.1,
                 meta_controller_lr=0.0001,
                 subgoals_num=None,
                 epsilon_decay_step=10000,
                 epsilon_end=0.02):
        """

        :param original_states_n: tuple
        :param meta_controller_states_n: tuple
        :param actions_n: int
        :param controller_lr:
        :param meta_controller_lr:
        :param subgoals: np.ndarray
        :param meta_controller_state_fn: lambda controller_state: meta_controller_state
        (a function that maps state for controller to another state for meta_controller)
        :param check_subgoal_fn: lambda state(np.ndarray), goal_num(int): bool
        (a function that checks whether the state achieves subgoals[goal_num])
        """

        self._num_subgoals = subgoals_num

        self.meta_controller = DQNAgent(
            states_n=meta_controller_states_n,
            actions_n=self._num_subgoals,
            hidden_layers=meta_controller_hidden_layers,
            scope_name='meta_controller',
            learning_rate=meta_controller_lr,
            epsilon_decay_step=epsilon_decay_step,
            epsilon_end=epsilon_end,
            discount=discount)
        self.controller = DQNAgent(states_n=(original_states_n[0] +
                                             self._num_subgoals, ),
                                   actions_n=actions_n,
                                   hidden_layers=controller_hidden_layers,
                                   scope_name='controller',
                                   learning_rate=controller_lr,
                                   epsilon_decay_step=epsilon_decay_step,
                                   epsilon_end=epsilon_end,
                                   discount=discount)

    def choose_goal(self, state, epsilon=None):
        return self.meta_controller.choose_action(state, epsilon=epsilon)

    def choose_action(self, state, goal, epsilon=None):
        return self.controller.choose_action(np.concatenate((state, goal),
                                                            axis=0),
                                             epsilon=epsilon)
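
The class never shows how a goal reaches the controller; from the `np.concatenate((state, goal))` call and the controller's input width (`original_states_n[0] + subgoals_num`), the goal is evidently appended to the state as a vector of length `subgoals_num`, most naturally one-hot. Below is a hypothetical interaction loop built only on the methods shown above plus the two intrinsic-reward constants; the environment, the `check_subgoal` predicate, and the shared observation for controller and meta-controller are assumptions, not part of the original class.

import numpy as np

# Hypothetical h-DQN rollout sketch using only choose_goal/choose_action and the class constants.
def one_hot(index, size):
    vec = np.zeros(size, dtype=np.float32)
    vec[index] = 1.0
    return vec

def run_episode(agent, env, check_subgoal, num_subgoals):
    state = env.reset()
    done = False
    while not done:
        # assume the meta-controller sees the same observation as the controller
        goal_idx = agent.choose_goal(state)
        goal = one_hot(goal_idx, num_subgoals)
        reached = False
        while not (done or reached):
            action = agent.choose_action(state, goal)
            next_state, ext_reward, done, _ = env.step(action)
            reached = check_subgoal(next_state, goal_idx)
            # intrinsic reward the controller would learn from, per the class constants
            # (the replay-buffer storage and train calls are omitted here)
            intrinsic = (HierarchicalDQNAgent.INTRINSIC_REWARD if reached
                         else HierarchicalDQNAgent.INTRINSIC_STEP_COST)
            state = next_state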
Example #2
    def _init_single_agent(self, agent_kwargs: Dict[str, Any]):
        """Create and return an agent. The type of agent depends on the 
        self.type parameter
        Args:
            agent_params (dict)

        Returns:
            Agent: the agent initialized
        """
        agent = None
        if self.type == "DQN":
            agent = DQNAgent(**agent_kwargs)
        elif self.type == "tile coder test":
            agent = self._init_tc_agent(**agent_kwargs)
        elif self.type == "REINFORCE":
            agent = REINFORCEAgent(**agent_kwargs)
        elif self.type == "REINFORCE with baseline":
            agent = REINFORCEAgentWithBaseline(**agent_kwargs)
        elif self.type == "actor-critic":
            agent = ActorCriticAgent(**agent_kwargs)
        elif self.type == "Abaddon test":
            agent = AbaddonAgent(**agent_kwargs)
        elif self.type == "PPO":
            agent = PPOAgent(**agent_kwargs)
        elif self.type == "DDPG":
            agent = DDPGAgent(**agent_kwargs)
        else:
            raise ValueError(
                f"agent not initialized because {self.type} is not recognised")
        return agent
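
The if/elif chain above works, but the same dispatch can be written as a lookup table so every recognised agent type lives in one place. A sketch, assuming the same agent classes are importable and keeping the "tile coder test" special case:

# Registry-based dispatch (sketch; assumes the same agent classes as above).
AGENT_REGISTRY = {
    "DQN": DQNAgent,
    "REINFORCE": REINFORCEAgent,
    "REINFORCE with baseline": REINFORCEAgentWithBaseline,
    "actor-critic": ActorCriticAgent,
    "Abaddon test": AbaddonAgent,
    "PPO": PPOAgent,
    "DDPG": DDPGAgent,
}

def _init_single_agent(self, agent_kwargs):
    if self.type == "tile coder test":
        return self._init_tc_agent(**agent_kwargs)
    try:
        return AGENT_REGISTRY[self.type](**agent_kwargs)
    except KeyError:
        raise ValueError(
            f"agent not initialized because {self.type} is not recognised")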
Example #3
DEBUG = False

TS_GREEDY_COEFF = 1.0

TRAIN = False
TEST_NUM = 10

if __name__ == '__main__':

    env = CMOTP()
    agent = DQNAgent(env.observation_space.shape,
                     env.action_space.n, [512, 512],
                     'cmotp',
                     epsilon_decay_step=10000,
                     epsilon_end=0.05,
                     replay_memory_size=100000,
                     learning_rate=1e-4,
                     targetnet_update_freq=5000,
                     tau=1.)
    if TRAIN:
        temp_record = Temp_record(shape=tuple(env.observation_space.high + 1) +
                                  (env.action_space.n, ),
                                  beta_len=1500)

        for i in range(5000):
            state = env.reset()
            episode_len = 0
            episode_reward = 0
            episode = []
            while True:
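
The excerpt stops at the inner `while True:` loop. Based on the agent API used in Example #6 (`choose_action`, `store`, `train`), the action decoding used for CMOTP in Example #7, and the `temp_record` created above, the omitted body presumably looks roughly like the sketch below; this is an assumption, not the original code.

# Sketch of the omitted training-loop body (assumed).
while True:
    epsilon = pow(temp_record.get_state_temp(state), TS_GREEDY_COEFF)
    action = agent.choose_action(state, epsilon=epsilon)
    action_n = [int(action % 5), int(action / 5)]   # one action per agent, as in Example #7
    next_state, reward, done, _ = env.step(action_n)
    agent.store(state, action, reward, next_state, float(done))
    agent.train()

    episode.append((state, action))
    episode_len += 1
    episode_reward += reward
    state = next_state

    if done:
        temp_record.decay_temp(episode)
        break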
Example #5

def get_reward_by_goal(st: np.ndarray, gl: np.ndarray) -> int:
    if np.all(st == gl):
        return 0
    return -1


if __name__ == '__main__':

    env = BitsGame(15)
    agent = DQNAgent(states_n=(env.size * 2, ),
                     actions_n=env.action_space.n,
                     hidden_layers=[256],
                     scope_name='BitsGame',
                     learning_rate=1e-4,
                     replay_memory_size=10000,
                     batch_size=32,
                     targetnet_update_freq=1000,
                     epsilon_end=0.05,
                     epsilon_decay_step=10000)

    if TRAIN:
        max_episode_len = env.observation_space.shape[0]
        rewards_record = []

        for episode_iter in range(EPISODES_NUM):
            state, goal = env.reset()
            reward_of_this_episode = 0
            len_of_this_episode = 0
            episode_record = []
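
The controller input width `env.size * 2` implies that the observation and the goal bit-vector are concatenated before being passed to the network, with `get_reward_by_goal` supplying the sparse reward. A hypothetical single step under that assumption (the `env.step` return signature is assumed to follow the usual Gym convention):

# Hypothetical single step, assuming [state, goal] concatenation and a Gym-style env.step.
agent_input = np.concatenate((state, goal))
action = agent.choose_action(state=agent_input)
next_state, _, done, _ = env.step(action)
reward = get_reward_by_goal(next_state, goal)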
Example #6
import time
import numpy as np
from DQN.DQNAgent import DQNAgent
from Env.Stochastic_MDP import StochasticMDPEnv

if __name__ == '__main__':

    env = StochasticMDPEnv()
    agent = DQNAgent(env.observation_space.shape,
                     env.action_space.n, [32, 32, 32],
                     'smdp',
                     epsilon_decay_step=10000,
                     epsilon_end=0.02,
                     replay_memory_size=50000,
                     learning_rate=5e-4)

    episode_lens = []
    episode_rewards = []

    for i in range(100000):
        state = env.reset()
        episode_len = 0
        episode_reward = 0
        while True:
            action = agent.choose_action(state=state)
            next_state, reward, done, _ = env.step(action)
            agent.store(state, action, reward, next_state, float(done))
            agent.train()

            episode_len += 1
            episode_reward += reward
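
The excerpt ends before the termination check, but the so-far-unused `episode_lens` and `episode_rewards` lists above suggest how the loop presumably closes; the following is an assumed completion, not the original code.

            # Assumed end of the loop body: advance the state and record episode statistics.
            state = next_state
            if done:
                episode_lens.append(episode_len)
                episode_rewards.append(episode_reward)
                break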
Example #7
def main():
    env = MultiEnvRunnerWrapper(ENV_NUM, CMOTP)
    agent = DQNAgent(env.envs[0].observation_space.shape, env.envs[0].action_space.n, [512, 512], 'cmotp',
                     discount=GAMMA,
                     epsilon_decay_step=10000, epsilon_end=0.05, replay_memory_size=100000,
                     learning_rate=1e-4, targetnet_update_freq=5000, tau=1.)
    if TRAIN:
        temp_records = [Temp_record(shape=tuple(env.envs[0].observation_space.high + 1) + (env.envs[0].action_space.n,), beta_len=1500)
                        for _ in range(ENV_NUM)]

        train_input_shape = (ENV_NUM * STEP_N, ) + env.envs[0].observation_space.shape
        print(train_input_shape)

        episodes = [[] for _ in range(ENV_NUM)]
        states = env.reset()

        ep_cnt = 0

        for i in range(TRAIN_NUM):

            sts = [[] for _ in range(ENV_NUM)]
            acts = [[] for _ in range(ENV_NUM)]
            rwds = [[] for _ in range(ENV_NUM)]
            n_sts = [[] for _ in range(ENV_NUM)]
            dns = [[] for _ in range(ENV_NUM)]

            # get a batch of train data
            for j in range(ENV_NUM):
                for k in range(STEP_N):
                    action = agent.choose_action(states[j],
                                                 epsilon=pow(temp_records[j].get_state_temp(states[j]), TS_GREEDY_COEFF))
                    action_n = [int(action % 5), int(action / 5)]
                    n_st, rwd, dn, _ = env.envs[j].step(action_n)
                    # print(states[j], action, rwd, n_st, dn)
                    # record episodes
                    episodes[j].append((states[j], action))

                    # record train data
                    sts[j].append(states[j])
                    acts[j].append(action)
                    rwds[j].append(rwd)
                    n_sts[j].append(n_st)
                    dns[j].append(dn)

                    states[j] = n_st

                    if dn:
                        states[j] = env.envs[j].reset()
                        ep_cnt += 1
                        print('train_num: {}, episode_cnt: {}, len: {} '.format(i, ep_cnt, len(episodes[j])))
                        temp_records[j].decay_temp(episodes[j])
                        episodes[j] = []

            # discount reward
            last_values = agent.get_max_target_Q_s_a(states)
            for j, (rwd_j, dn_j, l_v_j) in enumerate(zip(rwds, dns, last_values)):
                if type(rwd_j) is np.ndarray:
                    rwd_j = rwd_j.tolist()
                if type(dn_j) is np.ndarray:
                    dn_j = dn_j.tolist()

                if dn_j[-1] == 0:
                    rwd_j = discount_with_dones(rwd_j + [l_v_j], dn_j + [0], GAMMA)[:-1]
                else:
                    rwd_j = discount_with_dones(rwd_j, dn_j, GAMMA)

                rwds[j] = rwd_j

            # flatten
            sts = np.asarray(sts, dtype=np.float32).reshape(train_input_shape)
            acts = np.asarray(acts, dtype=np.int32).flatten()
            rwds = np.asarray(rwds, dtype=np.float32).flatten()
            n_sts = np.asarray(n_sts, dtype=np.float32).reshape(train_input_shape)
            dns = np.asarray(dns, dtype=bool).flatten()  # np.bool is removed in recent NumPy; plain bool is equivalent

            # train
            agent.train_without_replaybuffer(sts, acts, rwds)

    else:
        # test
        test_env = CMOTP()
        agent.load_model()
        for i in range(TEST_NUM):
            state = test_env.reset()
            while True:
                test_env.render()
                time.sleep(1)
                action = agent.choose_action(state, epsilon=0.05)
                action_n = [int(action % 5), int(action / 5)]
                next_state, reward, done, _ = test_env.step(action_n)
                state = next_state
                if done:
                    break
        test_env.close()

    env.close()
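
Example #7 calls a `discount_with_dones` helper that is not shown in the excerpt. In A2C-style code this is normally the backwards n-step return computation sketched below; this is an assumption about the helper's behaviour, not the repository's actual implementation.

def discount_with_dones(rewards, dones, gamma):
    """Presumed behaviour: discounted returns computed backwards, reset at episode boundaries."""
    discounted = []
    running_return = 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        running_return = reward + gamma * running_return * (1.0 - done)
        discounted.append(running_return)
    return discounted[::-1]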