Code example #1
File: sl.py  Project: NTAWolf/pyagents
class SLAgent(Agent):
    """Agent using keras NN
    """

    def __init__(self, n_frames_per_action=4):
        super(SLAgent, self).__init__(name="SL", version="1")
        self.experience = CircularList(1000)
        self.epsilon = LinearInterpolationManager([(0, 1.0), (1e4, 0.1)])
        self.action_repeat_manager = RepeatManager(n_frames_per_action - 1)

    def select_action(self):
        # Repeat last chosen action?
        action = self.action_repeat_manager.next()
        if action is not None:
            return action

        state = self.preprocessor.process()
        try:
            # Vector-valued state: reshape to one feature per row
            s = np.array(state).reshape(len(state), 1)
        except TypeError:
            # Scalar state has no len(); wrap it as a 1x1 array
            s = np.array(state).reshape(1, 1)

        # Only flush once a non-zero reward has been received
        if self._sars[2]:
            self._sars[3] = s
            self.flush_experience()

        # Consider postponing the first training until we have 32 samples
        if len(self.experience) > 0:
            self.nn.train(self.experience)

        if np.random.random() < self.epsilon.next():
            action = self.get_random_action()
        else:
            action_index = self.nn.predict(s)
            action = self.available_actions[action_index]

        self.action_repeat_manager.set(action)

        self._sars[0] = s
        self._sars[1] = self.available_actions.index(action)

        return action

    def set_available_actions(self, actions):
        super(SLAgent, self).set_available_actions(actions)
        # possible state values
        state_n = len(self.preprocessor.enumerate_states())

        self.nn = MLP(config="simple",
                      input_ranges=[[0, state_n]],
                      n_outputs=len(actions),
                      batch_size=4)

    def set_raw_state_callbacks(self, state_functions):
        self.preprocessor = StateIndex(RelativeBall(state_functions, trinary=True))

    def receive_reward(self, reward):
        self._sars[2] = reward

    def on_episode_start(self):
        self._reset_sars()

    def on_episode_end(self):
        # Terminal transition: reuse the last state as the "next state"
        # and clear the newstate_not_terminal flag
        self._sars[3] = self._sars[0]
        self._sars[4] = 0
        self.flush_experience()

    def flush_experience(self):
        self.experience.append(tuple(self._sars))
        self._reset_sars()

    def _reset_sars(self):
        # state, action, reward, newstate, newstate_not_terminal
        self._sars = [None, None, None, None, 1]

    def get_settings(self):
        settings = {
            "name": self.name,
            "version": self.version,
            "experience_replay": self.experience.capacity(),
            "preprocessor": self.preprocessor.get_settings(),
            "epsilon": self.epsilon.get_settings(),
            "nn": self.nn.get_settings(),
        }

        settings.update(super(SLAgent, self).get_settings())

        return settings
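
A minimal sketch of how an agent like this might be driven frame by frame. The env object, its available_actions()/game_over()/act() methods, and the make_state_callbacks() helper are hypothetical placeholders; only the agent-facing calls (set_raw_state_callbacks, set_available_actions, on_episode_start, select_action, receive_reward, on_episode_end) come from the class above.

# Hypothetical driver loop; env and make_state_callbacks are stand-ins,
# not part of the pyagents project.
agent = SLAgent(n_frames_per_action=4)
agent.set_raw_state_callbacks(make_state_callbacks(env))  # hypothetical helper
agent.set_available_actions(env.available_actions())      # hypothetical API

for episode in range(10):
    agent.on_episode_start()
    while not env.game_over():
        action = agent.select_action()
        reward = env.act(action)
        agent.receive_reward(reward)
    agent.on_episode_end()
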
Code example #2
File: action_chain.py  Project: NTAWolf/pyagents
class ActionChainAgent(Agent):
    """docstring for RandomAgent"""

    def __init__(self, chain_length):
        super(ActionChainAgent, self).__init__(
            name='ActionChainAgent', version='1.2')
        self.q = dict()  # state-action values: q[state][action]
        self.chain = CircularList(chain_length)
        # e=1 until frame 5k, then interpolate down to e=0.05 in frame 10k,
        # and keep it there for the remaining time
        self.e_params = (5000, 10000, 1.0, 0.05)
        self.e = 0.5
        self.nframes = 0
        self.learning_rate = 0.1
        self.discount = 0.9
        self.last_action = None

    def update_e(self):
        self.e = linear_latch(self.nframes, *self.e_params)

    def select_action(self):
        # Always take random action first
        action = self.get_random_action()

        # Greedy action
        if random() > self.e and self.chain.full:
            res = self.get_greedy_action(self.available_actions)
            if res is not None:
                action = res

        self.chain.append(action)
        return action

    def receive_reward(self, reward):
        for chain in sublists(self.chain):
            # Consider the previous moves to be the current state
            state = chain[1:]
            action = chain[0]
            self.update_chain(state, action, reward)
        self.on_frame_end()

    def on_frame_end(self):
        self.nframes += 1
        self.update_e()

    def on_episode_start(self):
        pass

    def on_episode_end(self):
        pass

    def update_chain(self, state, action, reward):
        lhstate = listhash(state)
        if lhstate not in self.q:
            self.q[lhstate] = dict()
        if action not in self.q[lhstate]:
            self.q[lhstate][action] = reward
        else:
            val = self.q[lhstate][action]
            self.q[lhstate][action] = val + self.learning_rate * \
                (reward - self.discount * val)

    def get_greedy_action(self, available_actions):
        # Look up previously seen states matching sub-chains of the
        # current action chain and pick the highest-valued action
        best_action = None
        best_value = None
        for state in sublists(self.chain):
            lhstate = listhash(state)
            if lhstate in self.q:
                s = self.q[lhstate]
                for a in available_actions:
                    if a in s:
                        val = s[a]
                        if best_value is None or val > best_value:
                            best_action = a
                            best_value = val
        return best_action

    def reset(self):
        self.e = 0.5
        self.nframes = 0
        self.last_action = None
        self.q = dict()
        self.chain.clear()

    def get_settings(self):
        settings = {
            'chain_length': self.chain.capacity(),
            'e_params': self.e_params,
            'learning_rate': self.learning_rate,
            'discount': self.discount,
        }

        settings.update(super(ActionChainAgent, self).get_settings())

        return settings
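
The e_params comment in __init__ describes epsilon held at 1.0 until frame 5000, interpolated linearly down to 0.05 by frame 10000, and held there afterwards. A minimal sketch of what linear_latch might look like under that reading, assuming its arguments follow the e_params order (start_frame, end_frame, start_value, end_value); the actual helper in pyagents may differ.

def linear_latch(x, x_start, x_end, y_start, y_end):
    # Hold y_start before x_start, hold y_end after x_end,
    # and interpolate linearly in between.
    if x <= x_start:
        return y_start
    if x >= x_end:
        return y_end
    frac = float(x - x_start) / (x_end - x_start)
    return y_start + frac * (y_end - y_start)

# With e_params = (5000, 10000, 1.0, 0.05):
# linear_latch(0, *e_params)      -> 1.0
# linear_latch(7500, *e_params)   -> 0.525
# linear_latch(20000, *e_params)  -> 0.05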