class SLAgent(Agent):
    """Epsilon-greedy agent that learns action values with a keras MLP.

    Records (state, action, reward, next_state, non_terminal) transitions in
    a circular experience buffer and trains the network every frame once at
    least one sample has been collected.
    """

    def __init__(self, n_frames_per_action=4):
        super(SLAgent, self).__init__(name="SL", version="1")
        # Replay buffer of SARS' tuples (capacity 1000).
        self.experience = CircularList(1000)
        # Exploration rate: 1.0 at frame 0, annealed linearly to 0.1 by 1e4.
        self.epsilon = LinearInterpolationManager([(0, 1.0), (1e4, 0.1)])
        # Repeat each chosen action for n_frames_per_action - 1 extra frames.
        self.action_repeat_manager = RepeatManager(n_frames_per_action - 1)

    def select_action(self):
        """Return the next action: repeated, epsilon-random, or greedy."""
        # Repeat last chosen action?
        action = self.action_repeat_manager.next()
        if action is not None:  # fixed: was `!= None`; identity test intended
            return action

        state = self.preprocessor.process()
        try:
            s = np.array(state).reshape(len(state), 1)
        except (TypeError, ValueError):
            # Scalar states have no len() (TypeError) or cannot be reshaped
            # (ValueError); fall back to a 1x1 array. Was a bare `except:`.
            s = np.array(state).reshape(1, 1)

        # NOTE(review): a reward of exactly 0 (or None) skips the flush here;
        # if zero rewards should also be stored, test `is not None` instead.
        if self._sars[2]:
            self._sars[3] = s
            self.flush_experience()

        # Consider postponing the first training until we have 32 samples
        if len(self.experience) > 0:
            self.nn.train(self.experience)

        if np.random.random() < self.epsilon.next():
            action = self.get_random_action()
        else:
            action_index = self.nn.predict(s)
            action = self.available_actions[action_index]

        self.action_repeat_manager.set(action)
        self._sars[0] = s
        self._sars[1] = self.available_actions.index(action)
        return action

    def set_available_actions(self, actions):
        """Build the network once the action set (and state count) is known."""
        super(SLAgent, self).set_available_actions(actions)
        # possible state values
        state_n = len(self.preprocessor.enumerate_states())
        self.nn = MLP(config="simple", input_ranges=[[0, state_n]],
                      n_outputs=len(actions), batch_size=4)

    def set_raw_state_callbacks(self, state_functions):
        self.preprocessor = StateIndex(RelativeBall(state_functions,
                                                    trinary=True))

    def receive_reward(self, reward):
        self._sars[2] = reward

    def on_episode_start(self):
        self._reset_sars()

    def on_episode_end(self):
        # Terminal transition: reuse the last state and clear the
        # non-terminal flag before flushing.
        self._sars[3] = self._sars[0]
        self._sars[4] = 0
        self.flush_experience()

    def flush_experience(self):
        self.experience.append(tuple(self._sars))
        self._reset_sars()

    def _reset_sars(self):
        # state, action, reward, newstate, newstate_not_terminal
        self._sars = [None, None, None, None, 1]

    def get_settings(self):
        settings = {
            "name": self.name,
            "version": self.version,
            "experience_replay": self.experience.capacity(),
            "preprocessor": self.preprocessor.get_settings(),
            "epsilon": self.epsilon.get_settings(),
            "nn": self.nn.get_settings(),
        }
        settings.update(super(SLAgent, self).get_settings())
        return settings
class SLAgent(Agent):
    """Agent using a keras neural network.

    Keeps a 1000-entry circular replay buffer of SARS' tuples, anneals the
    exploration rate from 1.0 to 0.1 over the first 1e4 frames, and repeats
    each selected action across several frames.
    """

    def __init__(self, n_frames_per_action=4):
        super(SLAgent, self).__init__(name='SL', version='1')
        self.experience = CircularList(1000)
        self.epsilon = LinearInterpolationManager([(0, 1.0), (1e4, 0.1)])
        self.action_repeat_manager = RepeatManager(n_frames_per_action - 1)

    def select_action(self):
        """Pick the next action (repeat > explore > exploit)."""
        # Repeat last chosen action?
        action = self.action_repeat_manager.next()
        if action is not None:  # fixed: `!= None` replaced by identity test
            return action

        state = self.preprocessor.process()
        try:
            s = np.array(state).reshape(len(state), 1)
        except (TypeError, ValueError):
            # Was a bare `except:`; only len()/reshape failures on scalar
            # states are expected here.
            s = np.array(state).reshape(1, 1)

        # NOTE(review): reward 0 is falsy and will not be flushed — confirm
        # this is intentional, otherwise compare against None explicitly.
        if self._sars[2]:
            self._sars[3] = s
            self.flush_experience()

        # Consider postponing the first training until we have 32 samples
        if len(self.experience) > 0:
            self.nn.train(self.experience)

        if np.random.random() < self.epsilon.next():
            action = self.get_random_action()
        else:
            action_index = self.nn.predict(s)
            action = self.available_actions[action_index]

        self.action_repeat_manager.set(action)
        self._sars[0] = s
        self._sars[1] = self.available_actions.index(action)
        return action

    def set_available_actions(self, actions):
        """Size the MLP from the enumerated state space and action count."""
        super(SLAgent, self).set_available_actions(actions)
        # possible state values
        state_n = len(self.preprocessor.enumerate_states())
        self.nn = MLP(config='simple', input_ranges=[[0, state_n]],
                      n_outputs=len(actions), batch_size=4)

    def set_raw_state_callbacks(self, state_functions):
        self.preprocessor = StateIndex(
            RelativeBall(state_functions, trinary=True))

    def receive_reward(self, reward):
        self._sars[2] = reward

    def on_episode_start(self):
        self._reset_sars()

    def on_episode_end(self):
        # Mark the final transition terminal and store it.
        self._sars[3] = self._sars[0]
        self._sars[4] = 0
        self.flush_experience()

    def flush_experience(self):
        self.experience.append(tuple(self._sars))
        self._reset_sars()

    def _reset_sars(self):
        # state, action, reward, newstate, newstate_not_terminal
        self._sars = [None, None, None, None, 1]

    def get_settings(self):
        settings = {
            "name": self.name,
            "version": self.version,
            "experience_replay": self.experience.capacity(),
            "preprocessor": self.preprocessor.get_settings(),
            "epsilon": self.epsilon.get_settings(),
            "nn": self.nn.get_settings(),
        }
        settings.update(super(SLAgent, self).get_settings())
        return settings
class ActionChainAgent(Agent):
    """Tabular agent whose 'state' is the chain of recently taken actions.

    Maintains q[state][action] values keyed by hashed action-chain suffixes
    and follows an epsilon-greedy policy with a latched, linearly annealed
    epsilon.
    """

    def __init__(self, chain_length):
        super(ActionChainAgent, self).__init__(name='ActionChainAgent',
                                               version='1.2')
        self.q = dict()  # state-action values: q[state][action]
        self.chain = CircularList(chain_length)
        # e=1 until frame 5k, then interpolate down to e=0.05 in frame 10k,
        # and keep it there for the remaining time
        self.e_params = (5000, 10000, 1.0, 0.05)
        self.e = 0.5
        self.nframes = 0
        self.learning_rate = 0.1
        self.discount = 0.9
        self.last_action = None

    def update_e(self):
        """Recompute epsilon from the frame counter."""
        self.e = linear_latch(self.nframes, *self.e_params)

    def select_action(self):
        """Return a random action, or the greedy one with prob. 1 - e."""
        # Always take random action first
        action = self.get_random_action()
        # Greedy action
        if random() > self.e and self.chain.full:
            res = self.get_greedy_action(self.available_actions)
            if res is not None:
                action = res
        self.chain.append(action)
        return action

    def receive_reward(self, reward):
        """Credit the reward to every suffix of the current action chain."""
        for chain in sublists(self.chain):
            # Consider the previous moves to be the current state
            state = chain[1:]
            action = chain[0]
            self.update_chain(state, action, reward)
        self.on_frame_end()

    def on_frame_end(self):
        self.nframes += 1
        self.update_e()

    def on_episode_start(self):
        pass

    def on_episode_end(self):
        pass

    def update_chain(self, state, action, reward):
        """Blend `reward` into q[state][action] (first visit stores it raw)."""
        lhstate = listhash(state)
        actions = self.q.setdefault(lhstate, dict())
        if action not in actions:
            actions[action] = reward
        else:
            val = actions[action]
            actions[action] = val + self.learning_rate * \
                (reward - self.discount * val)

    def get_greedy_action(self, available_actions):
        """Return the highest-valued known action, or None if none is known.

        Searches every stored state matching a suffix of the current chain.
        """
        best_action = None
        best_value = None
        for state in sublists(self.chain):
            lhstate = listhash(state)
            if lhstate in self.q:
                s = self.q[lhstate]
                for a in available_actions:
                    if a in s:
                        val = s[a]
                        # Fixed: `val > None` raises TypeError on Python 3,
                        # so guard the first comparison explicitly.
                        if best_value is None or val > best_value:
                            best_action = a
                            best_value = val
        return best_action

    def reset(self):
        """Restore the agent to its freshly constructed state."""
        self.e = 0.5
        self.nframes = 0
        self.last_action = None
        self.q = dict()
        self.chain.clear()

    def get_settings(self):
        settings = {
            'chain_length': self.chain.capacity(),
            'e_params': self.e_params,
            'learning_rate': self.learning_rate,
            'discount': self.discount
        }
        settings.update(super(ActionChainAgent, self).get_settings())
        return settings
class ActionChainAgent(Agent):
    """Epsilon-greedy agent over hashed chains of its own recent actions.

    q[listhash(chain_suffix)][action] holds the learned value table; epsilon
    is annealed per frame via `linear_latch`.
    """

    def __init__(self, chain_length):
        super(ActionChainAgent, self).__init__(
            name='ActionChainAgent', version='1.2')
        self.q = dict()  # state-action values: q[state][action]
        self.chain = CircularList(chain_length)
        # e=1 until frame 5k, then interpolate down to e=0.05 in frame 10k,
        # and keep it there for the remaining time
        self.e_params = (5000, 10000, 1.0, 0.05)
        self.e = 0.5
        self.nframes = 0
        self.learning_rate = 0.1
        self.discount = 0.9
        self.last_action = None

    def update_e(self):
        """Refresh epsilon from the current frame count."""
        self.e = linear_latch(self.nframes, *self.e_params)

    def select_action(self):
        """Pick an action: random by default, greedy when exploiting."""
        # Always take random action first
        action = self.get_random_action()
        # Greedy action
        if random() > self.e and self.chain.full:
            res = self.get_greedy_action(self.available_actions)
            if res is not None:
                action = res
        self.chain.append(action)
        return action

    def receive_reward(self, reward):
        """Apply the reward to all chain suffixes, then advance the frame."""
        for chain in sublists(self.chain):
            # Consider the previous moves to be the current state
            state = chain[1:]
            action = chain[0]
            self.update_chain(state, action, reward)
        self.on_frame_end()

    def on_frame_end(self):
        self.nframes += 1
        self.update_e()

    def on_episode_start(self):
        pass

    def on_episode_end(self):
        pass

    def update_chain(self, state, action, reward):
        """Update the value estimate for (state, action) with `reward`."""
        lhstate = listhash(state)
        actions = self.q.setdefault(lhstate, dict())
        if action not in actions:
            actions[action] = reward
        else:
            val = actions[action]
            actions[action] = val + self.learning_rate * \
                (reward - self.discount * val)

    def get_greedy_action(self, available_actions):
        """Best known action across all stored matching states (or None)."""
        # Do a tree search in the previously seen states
        # that match the current state
        best_action = None
        best_value = None
        for state in sublists(self.chain):
            lhstate = listhash(state)
            if lhstate in self.q:
                s = self.q[lhstate]
                for a in available_actions:
                    if a in s:
                        val = s[a]
                        # Fixed Python 3 TypeError: never compare a number
                        # against the initial None sentinel directly.
                        if best_value is None or val > best_value:
                            best_action = a
                            best_value = val
        return best_action

    def reset(self):
        """Reset learned values, epsilon, and the action chain."""
        self.e = 0.5
        self.nframes = 0
        self.last_action = None
        self.q = dict()
        self.chain.clear()

    def get_settings(self):
        settings = {'chain_length': self.chain.capacity(),
                    'e_params': self.e_params,
                    'learning_rate': self.learning_rate,
                    'discount': self.discount
                    }
        settings.update(super(ActionChainAgent, self).get_settings())
        return settings