Example #1
class Agent(BaseAgent):

    def __init__(self, env):
        super(Agent, self).__init__(env)

        self.model = DQN(self.obs_dim, self.action_dim)
        self.replay_buffer = ReplayBuffer(minibatch_size=minibatch_size)

        self.set_gui_flag(False, False)

    def get_action(self, obs, train=True):

        # Linearly anneal epsilon from eps_max down to eps_min over eps_decay_steps
        eps_min = 0.1
        eps_max = 1.0
        eps_decay_steps = self.train_step
        epsilon = max(eps_min, eps_max - (eps_max - eps_min)*self.global_step/eps_decay_steps)

        if train and np.random.rand(1) < epsilon:
            action = self.env.action_space.sample()
        else:
            action = self.model.get_action(obs)

        return action

    def train_model(self, obs, action, reward, obs_next, done):

        # Store the transition in the replay buffer
        self.replay_buffer.add_to_memory((obs, action, reward, obs_next, done))

        # Do not start learning until enough transitions have been collected
        if len(self.replay_buffer.replay_memory) < minibatch_size * pre_train_step:
            return None

        # Sample a minibatch and unpack it into per-field arrays
        minibatch = self.replay_buffer.sample_from_memory()
        s, a, r, s_, done = map(np.array, zip(*minibatch))
        self.model.train_network(s, a, r, s_, done)

        # Periodically copy the online network weights to the target network
        if self.global_step % target_update_period == 0:
            self.model.update_target()

        return
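
This example reads minibatch_size, pre_train_step, and target_update_period as module-level names, and the later examples additionally use train_step, test_step, training_interval, and max_step_per_episode in the same way, so a shared hyperparameter module is assumed to be imported into each agent file. A minimal sketch of such a module; the names match the code above, while the values are placeholders chosen only for illustration:

# config.py -- hypothetical hyperparameter module assumed by the examples
minibatch_size = 32            # transitions sampled per gradient update
pre_train_step = 10            # start learning after minibatch_size * pre_train_step transitions
target_update_period = 1000    # copy online weights to the target network every N steps
training_interval = 4          # run a gradient update every N environment steps
train_step = 100000            # total environment steps used for training
test_step = 10000              # total environment steps used for testing
max_step_per_episode = 200     # episode length cap used in the DDPG example

The agent files would then pull these in with something like "from config import *", which is why they appear as bare globals inside the class methods.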
Example #2
class Agent(AbstractAgent):
    def __init__(self, env):
        super(Agent, self).__init__(env)
        print("Q-network Agent is created")

        self.action_dim = env.action_space.n
        # Grid world: the (x, y) position is one-hot encoded, so obs_dim = (grid_size)^2
        self.obs_dim = np.power(int(env.observation_space.high[0] + 1), 2)

        self.model = Q_Network(self.obs_dim, self.action_dim, train_step)

        self.replay_buffer = ReplayBuffer(minibatch_size=minibatch_size)

    def learn(self):
        print("Start train for {} steps".format(train_step))
        global_step = 0
        episode_num = 0

        while global_step < train_step:
            episode_num += 1
            step_in_ep = 0

            obs_v = self.env.reset()
            obs = self.one_hot(obs_v)

            total_reward = 0
            done = False

            while (not done and global_step < train_step):

                global_step += 1
                step_in_ep += 1

                action = self.get_action(obs, global_step)

                # For debugging
                if global_step % 1000 == 0:
                    self.draw_current_optimal_actions(global_step)

                obs_v_next, reward, done, _ = self.env.step(action)
                obs_next = self.one_hot(obs_v_next)

                self.train_agent(obs, action, reward, obs_next, done,
                                 global_step)

                # GUI
                # self.env.render()

                obs = obs_next
                total_reward += reward

    def test(self, global_step=0):
        print("Start test for {} steps".format(test_step))

        global_step = 0
        episode_num = 0

        self.draw_current_optimal_actions(0)

        while global_step < test_step:
            episode_num += 1
            step_in_ep = 0
            total_reward = 0
            done = False

            obs_v = self.env.reset()  # Reset environment
            obs = self.one_hot(obs_v)

            while (not done and global_step < test_step):

                global_step += 1
                step_in_ep += 1

                action = self.get_action(obs, global_step, False)

                obs_v_next, reward, done, _ = self.env.step(action)
                obs_next = self.one_hot(obs_v_next)

                # GUI
                time.sleep(0.05)
                self.env.render()

                obs = obs_next
                total_reward += reward

            print("[ test_ep: {}, total reward: {} ]".format(
                episode_num, total_reward))

    def get_action(self, obs, global_step, train=True):

        eps_min = 0.1
        eps_max = 1.0
        eps_decay_steps = train_step
        epsilon = max(
            eps_min,
            eps_max - (eps_max - eps_min) * global_step / eps_decay_steps)

        if train and np.random.rand(1) < epsilon:
            action = self.env.action_space.sample()
        else:
            action = self.model.get_action(obs)

        return action

    def train_agent(self, obs, action, reward, obs_next, done, global_step):

        self.replay_buffer.add_to_memory((obs, action, reward, obs_next, done))

        if len(self.replay_buffer.replay_memory
               ) < minibatch_size * pre_train_step:
            return None

        minibatch = self.replay_buffer.sample_from_memory()
        s, a, r, s_, done = map(np.array, zip(*minibatch))
        self.model.train_network(s, a, r, s_, done)

        if global_step % target_update_period == 0:  # periodically sync the target network, as in Examples #1 and #3
            self.model.update_target()

        return

    def one_hot(self, obs):
        # Flatten the grid coordinate (x, y) into a single index and return the matching one-hot vector
        idx = int(obs[1] * (self.env.observation_space.high[0] + 1) + obs[0])
        return np.eye(int(pow(self.env.observation_space.high[0] + 1, 2)))[idx]

    def draw_current_optimal_actions(self, step):
        # Print the current greedy action for every grid cell as a small ASCII board
        idx = int(np.sqrt(self.obs_dim))
        directions = ["U", "D", "R", "L"]
        print("optimal actions at step {}".format(step))
        for i in range(idx):
            print("----" * idx + "-")
            row = ""
            for j in range(idx):
                row = row + "| {} ".format(directions[self.model.get_action(
                    np.eye(self.obs_dim)[int(idx * i + j)])])  # one-hot
            row = row + "|"
            print(row)
        print("----" * idx + "-")
        return
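
The one_hot helper above is what turns the environment's (x, y) grid position into the vector the Q-network actually sees. A standalone sketch of the same encoding, assuming a hypothetical 5x5 grid (the grid size is an assumption for illustration only):

import numpy as np

grid_size = 5                            # hypothetical 5x5 grid world
obs = np.array([2, 3])                   # (x, y) position reported by the environment

idx = int(obs[1] * grid_size + obs[0])   # flatten (x, y): 3 * 5 + 2 = 17
one_hot = np.eye(grid_size ** 2)[idx]    # length-25 vector with a single 1 at index 17

print(idx)            # 17
print(one_hot.sum())  # 1.0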
Example #3
class Agent(AbstractAgent):

    def __init__(self, env):
        super(Agent, self).__init__(env)
        print("DQN Agent")

        self.action_dim = env.action_space.n
        self.obs_dim = observation_dim(env.observation_space)
            
        self.model = DQN(self.obs_dim, self.action_dim)

        self.replay_buffer = ReplayBuffer(minibatch_size=minibatch_size)
        
    def learn(self):
        print("Start train for {} steps".format(train_step))
        global_step = 0
        episode_num = 0

        while global_step < train_step:
            episode_num += 1

            obs = self.env.reset()  # Reset environment

            total_reward = 0
            done = False

            while (not done and global_step < train_step):

                global_step += 1

                action = self.get_action(obs, global_step)

                obs_next, reward, done, _ = self.env.step(action)

                self.train_agent(obs, action, reward, obs_next, done, global_step)

                # GUI
                self.env.render()

                obs = obs_next
                total_reward += reward

                if global_step % 10000 == 0:
                    print(global_step)

        self.model.save_network()

    def test(self, global_step=0):
        print("Start test for {} steps".format(test_step))

        global_step = 0
        episode_num = 0
        total_reward = 0

        while global_step < test_step:

            episode_num += 1

            obs = self.env.reset()  # Reset environment
            done = False

            while (not done and global_step < test_step):

                global_step += 1

                action = self.get_action(obs, global_step, False)

                obs_next, reward, done, _ = self.env.step(action)

                # GUI
                self.env.render()

                obs = obs_next
                total_reward += reward

            print("[ train_ep: {}, total reward: {} ]".format(episode_num, total_reward))
            total_reward = 0

    def get_action(self, obs, global_step, train=True):

        eps_min = 0.1
        eps_max = 1.0
        eps_decay_steps = train_step
        epsilon = max(eps_min, eps_max - (eps_max - eps_min)*global_step/eps_decay_steps)

        if train and np.random.rand(1) < epsilon:
            action = self.env.action_space.sample()
        else:
            action = self.model.get_action(obs)

        return action

    def train_agent(self, obs, action, reward, obs_next, done, global_step):

        state = self.model.preprocess_observation(obs)
        state_next = self.model.preprocess_observation(obs_next)

        self.replay_buffer.add_to_memory((state, action, reward, state_next, done))

        if len(self.replay_buffer.replay_memory) < minibatch_size * pre_train_step:
            return None

        if global_step % training_interval == 0:
            minibatch = self.replay_buffer.sample_from_memory()
            s, a, r, s_, done = map(np.array, zip(*minibatch))
            self.model.train_network(s, a, r, s_, done, global_step)

        if global_step % target_update_period == 0:
            self.model.update_target()
            
        return
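
Every train_agent/train_model method above unpacks the sampled minibatch with map(np.array, zip(*minibatch)). The idiom transposes a list of transition tuples into one stacked array per field; a small sketch with dummy transitions (the shapes are chosen only for illustration):

import numpy as np

# Three dummy transitions of the form (obs, action, reward, obs_next, done)
minibatch = [
    (np.zeros(4), 0, 1.0, np.ones(4), False),
    (np.ones(4), 1, 0.0, np.zeros(4), True),
    (np.ones(4), 2, 0.5, np.ones(4), False),
]

# zip(*minibatch) groups the i-th element of every tuple together;
# map(np.array, ...) stacks each group into a single array per field.
s, a, r, s_, done = map(np.array, zip(*minibatch))

print(s.shape, a.shape, r.shape, s_.shape, done.shape)
# (3, 4) (3,) (3,) (3, 4) (3,)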
Example #4
class Agent(AbstractAgent):
    def __init__(self, env):
        super(Agent, self).__init__(env)
        print("DDPG Agent")

        self.action_dim = action_dim(
            env.action_space)  ### KH: for continuous action task
        self.obs_dim = observation_dim(env.observation_space)
        self.action_max = env.action_space.high  ### KH: DDPG action bound
        self.action_min = env.action_space.low  ### KH: DDPG action bound
        self.model = self.set_model()
        self.replay_buffer = ReplayBuffer(minibatch_size=minibatch_size)

    def set_model(self):
        # model can be q-table or q-network

        model = DDPG(self.obs_dim, self.action_dim, self.action_max,
                     self.action_min)

        return model

    def learn(self):
        print("Start Learn")

        global_step = 0
        episode_num = 0

        while global_step < train_step:

            episode_num += 1
            step_in_ep = 0

            obs = self.env.reset()  # Reset environment
            total_reward = 0
            done = False
            self.noise = np.zeros(self.action_dim)  # reset OU exploration noise at the start of each episode

            while (not done and step_in_ep < max_step_per_episode and
                   global_step < train_step):  ### KH: reset every 200 steps

                global_step += 1
                step_in_ep += 1

                action = self.get_action(obs, global_step)

                obs_next, reward, done, _ = self.env.step(action)

                self.train_agent(obs, action, reward, obs_next, done,
                                 global_step)

                # GUI
                self.env.render()

                obs = obs_next
                total_reward += reward

            print("[ train_ep: {}, total reward: {} ]".format(
                episode_num, total_reward))  ### KH: train result

    def test(self, global_step=0):
        print("Test step: {}".format(global_step))

        global_step = 0
        episode_num = 0
        total_reward = 0

        while global_step < test_step:

            episode_num += 1
            step_in_ep = 0

            obs = self.env.reset()  # Reset environment
            total_reward = 0  ### KH: Added missing
            done = False

            while (not done and step_in_ep < max_step_per_episode
                   and global_step < test_step):  ### KH: reset every 200 steps

                global_step += 1
                step_in_ep += 1

                action = self.get_action(obs, global_step, False)

                obs_next, reward, done, _ = self.env.step(action)

                # GUI
                self.env.render()

                obs = obs_next
                total_reward += reward

            print("[ test_ep: {}, total reward: {} ]".format(
                episode_num, total_reward))  ### KH: test result

    def get_action(self, obs, global_step, train=True):
        # Select the action from the actor network, then add exploration noise (OU process) during training

        action = self.model.choose_action(obs)

        if train:
            scale = 1 - global_step / train_step
            self.noise = self.ou_noise(self.noise)
            action = action + self.noise * (self.action_max -
                                            self.action_min) / 2 * scale
            action = np.maximum(action, self.action_min)
            action = np.minimum(action, self.action_max)

        return action

    def train_agent(self, obs, action, reward, obs_next, done, step):

        self.replay_buffer.add_to_memory((obs, action, reward, obs_next, done))

        if len(self.replay_buffer.replay_memory
               ) < minibatch_size * pre_train_step:
            return None

        minibatch = self.replay_buffer.sample_from_memory()
        s, a, r, ns, d = map(np.array, zip(*minibatch))

        self.model.train_network(s, a, r, ns, d, step)

        return None

    def ou_noise(self, x):
        # One step of an Ornstein-Uhlenbeck process: mean-reverting pull toward mu plus Gaussian perturbation
        return x + theta * (mu - x) + sigma * np.random.randn(self.action_dim)
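
theta, mu, and sigma above are module-level hyperparameters that are not shown in this snippet. A standalone sketch with placeholder values (the parameter values are assumptions, not taken from the source) shows how the mean-reverting, temporally correlated noise evolves before get_action scales it to the action range:

import numpy as np

theta, mu, sigma = 0.15, 0.0, 0.2   # placeholder OU parameters (assumed values)
action_dim = 1

def ou_step(x):
    # Same update as Agent.ou_noise: pull toward mu plus Gaussian perturbation
    return x + theta * (mu - x) + sigma * np.random.randn(action_dim)

noise = np.zeros(action_dim)        # reset at the start of every episode, as in learn()
trace = []
for _ in range(5):
    noise = ou_step(noise)
    trace.append(float(noise[0]))

print(trace)  # successive samples are correlated and hover around mu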