class Agent(BaseAgent):
    def __init__(self, name, gamma=0.95, epsilon=1, decay=1 - 1e-4):
        super().__init__(name)
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay
        self.model = Model(((16,),), ((4,),))
        self.memory = Transitions(['positions', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])

    def react(self, position, reward=0, done=None):
        # One-hot encode the discrete position over the 16 grid cells.
        zeros = np.zeros(16)
        zeros[position] = 1
        position = zeros
        action = self.respond(position)
        self.memory.store(position.copy(), action, reward, bool(done),
                          position.copy())
        if done:
            # Learn from the whole episode, then reset for the next one.
            self.learn()
            self.memory.forget()
            self.epsilon *= self.decay
        super().react(reward, done)
        return {'action': action}

    def respond(self, position):
        # Epsilon-greedy exploration: act randomly with probability epsilon,
        # otherwise sample an action from the policy's predicted distribution.
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, *self.model.output_shapes[0])
        else:
            prediction = self.model.predict([[position]])[0][0]
            choice = np.random.choice(prediction, p=prediction)
            return np.argmax(prediction == choice)

    def learn(self):
        positions, actions, rewards, dones, outcomes = self.memory[:-1]
        if len(positions) >= 1:
            # Monte Carlo returns act as the advantages for the policy update.
            advantages = discount(rewards, self.gamma).reshape(-1, 1)
            # advantages = (advantages - np.mean(advantages)) / (np.std(advantages) or 1e-9)
            self.model.fit([positions], [actions.reshape(-1, 1), advantages])

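# The discount() helper called by learn() is not defined in this section; the
# sketch below shows what it is assumed to compute (discounted cumulative
# returns, accumulated backwards over the rewards). This is an assumption,
# not necessarily the project's actual implementation.
import numpy as np

def discount(rewards, gamma):
    # returns[t] = rewards[t] + gamma * returns[t + 1]
    returns = np.zeros(len(rewards), dtype=float)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns
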
class Agent(BaseAgent):
    def __init__(self, name, batch=10, gamma=0.95, epsilon=1, decay=1 - 1e-4):
        super().__init__(name)
        self.batch = batch
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay
        self.actor_model = ActorModel(((16,),), ((4,),))
        self.critic_model = CriticModel(((16,),), ((1,),))
        self.memory = Transitions(['positions', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])

    def react(self, position, reward=0, done=None):
        # One-hot encode the discrete position over the 16 grid cells.
        zeros = np.zeros(16)
        zeros[position] = 1
        position = zeros
        action = self.respond(position)
        self.memory.store(position.copy(), action, reward, bool(done),
                          position.copy())
        # Update every `batch` steps, and always at the end of an episode.
        if self.age % self.batch == (self.batch - 1) or done:
            self.learn(self.batch)
        if done:
            self.memory.forget()
            self.epsilon *= self.decay
        super().react(reward, done)
        return {'action': action}

    def respond(self, position):
        # Epsilon-greedy exploration: act randomly with probability epsilon,
        # otherwise sample an action from the actor's predicted distribution.
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, *self.actor_model.output_shapes[0])
        else:
            prediction = self.actor_model.predict([[position]])[0][0]
            choice = np.random.choice(prediction, p=prediction)
            return np.argmax(prediction == choice)

    def learn(self, number=1):
        positions, actions, rewards, dones, outcomes = self.memory[-(number + 1):-1]
        if len(positions) >= 1:
            past_value_predictions = self.critic_model.predict([positions])[0]
            # Bootstrap from the critic's estimate of the latest outcome,
            # unless the episode ended there.
            future_value_prediction = ([0] if dones[-1] else
                                       self.critic_model.predict([outcomes[-1:]])[0][0])
            targets = discount(
                np.concatenate((rewards, future_value_prediction)),
                self.gamma)[:-1]
            targets = targets.reshape(-1, 1)
            # The advantage is the bootstrapped return minus the critic's estimate.
            advantages = targets - past_value_predictions
            self.actor_model.fit([positions], [actions.reshape(-1, 1), advantages])
            self.critic_model.fit([positions], [targets])

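# A minimal sketch of how one of these agents might be driven, assuming a
# Gym-style FrozenLake environment with the classic reset()/step() API. The
# environment id, the agent name, the episode count and the 4-tuple return of
# step() are assumptions, not part of the code above.
import gym

env = gym.make('FrozenLake-v0')
agent = Agent('actor-critic')
for episode in range(1000):
    position, reward, done = env.reset(), 0, False
    while not done:
        action = agent.react(position, reward, done)['action']
        position, reward, done, info = env.step(action)
    # The final call lets the agent learn from the terminal transition.
    agent.react(position, reward, done)
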
class Agent(BaseAgent):
    def __init__(self, name, dimensions, batch=10, gamma=0.95, epsilon=1,
                 decay=1 - 1e-4):
        super().__init__(name)
        self.batch = batch
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay
        self.model = Model([[dimensions]], [[dimensions * 2]], [7])
        self.memory = Transitions(['positions', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])

    def react(self, position, time, reward=0, done=None):
        action = self.respond(position)
        self.memory.store(position.copy(), action, reward, bool(done),
                          position.copy())
        # Update every `batch` steps, and always at the end of an episode.
        if self.age % self.batch == (self.batch - 1) or done:
            self.learn(self.batch)
        if done:
            self.memory.forget()
            self.epsilon *= self.decay
        super().react(reward, done)
        return {'action': action}

    def respond(self, position):
        # Epsilon-greedy: explore randomly, otherwise take the action with
        # the highest predicted value.
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, *self.model.output_shapes[0])
        else:
            prediction = self.model.predict([[position]])[0][0]
            return np.argmax(prediction)

    def learn(self, number=1):
        positions, actions, rewards, dones, outcomes = self.memory[-(number + 1):-1]
        if len(positions) >= 1:
            past_value_predictions = self.model.predict([positions])[0]
            future_value_prediction = ([0] if dones[-1] else
                                       self.model.predict([outcomes[-1:]])[0][0])
            # Bootstrap from the best predicted value in the latest outcome state.
            future_value_prediction = [np.max(future_value_prediction)]
            targets = past_value_predictions
            discounted = discount(
                np.concatenate((rewards, future_value_prediction)), self.gamma)
            # Only the values of the actions actually taken are updated.
            targets[range(len(targets)), actions] = discounted[:-1]
            self.model.fit([positions], [targets])

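# A small worked example of the target construction in learn() above, assuming
# discount() accumulates returns[t] = rewards[t] + gamma * returns[t + 1]
# (the concrete numbers are purely illustrative):
#
#   rewards                 = [0, 0, 1]
#   future_value_prediction = [0.5]       # max predicted value of the last outcome
#   gamma                   = 0.95
#   discount([0, 0, 1, 0.5], 0.95)  ->  [1.331, 1.401, 1.475, 0.5]   (rounded)
#   discounted[:-1]         = [1.331, 1.401, 1.475]
#
# Each stored (position, action) pair is then regressed towards its
# bootstrapped discounted return, while the predictions for actions that were
# not taken are left unchanged.
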
class Agent(BaseAgent):
    def __init__(self, name):
        super().__init__(name)
        self.memory = Transitions(['chosen'], extra_keys=['perf'])

    def react(self, numbers, reward=0, done=None):
        action = self.respond(numbers, reward, done)
        self.memory.store(numbers[action], perf=reward)
        if done:
            self.memory.forget()
        self.age += 1
        return {'action': action}

    def respond(self, numbers, reward, done):
        # With no history (or at the end of an episode) choose at random;
        # otherwise stay with the previously chosen number if it paid off
        # and switch to the other one if it did not.
        if done or not self.age:
            return randrange(2)
        chosen, = self.memory[-1]
        if reward > 0:
            return numbers.index(chosen)
        return 1 - numbers.index(chosen)

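# The respond() logic above amounts to a win-stay / lose-shift rule over the
# two presented numbers: repeat the previously chosen number after a positive
# reward, switch to the other one otherwise. A hypothetical call sequence:
#
#   agent = Agent('copycat')
#   agent.react([3, 7])            # no history yet: random choice, say action 1 (picks 7)
#   agent.react([7, 3], reward=1)  # reward > 0: choose 7 again, now at index 0 -> action 0
#   agent.react([2, 7], reward=0)  # reward <= 0: switch away from 7 -> action 0 (picks 2)
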
class Agent(BaseAgent):
    def __init__(self, name, dimensions=5, length=10):
        super().__init__(name)
        seed(0)
        # A fixed, reproducibly seeded action sequence replayed each episode.
        self.sequence = choices(range(dimensions), k=length)
        self.index = None
        self.memory = Transitions(['dummy'], extra_keys=['perf'])

    def react(self, reward=0, done=None):
        action = self.respond(done)
        self.memory.store(perf=reward)
        if done:
            self.memory.forget()
        self.age += 1
        return {'action': action}

    def respond(self, done):
        # Restart the sequence at the beginning of each episode.
        if done or not self.age:
            self.index = -1
        self.index += 1
        return self.sequence[self.index]