class Agent(BaseAgent):
    def __init__(self, name, gamma=0.95, epsilon=1, decay=1 - 1e-4):
        super().__init__(name)
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay
        self.model = Model(((16,),), ((4,),))
        self.memory = Transitions(['positions', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])

    def react(self, position, reward=0, done=None):
        # One-hot encode the discrete position into a length-16 vector.
        zeros = np.zeros(16)
        zeros[position] = 1
        position = zeros
        action = self.respond(position)
        self.memory.store(position.copy(), action, reward, bool(done), position.copy())
        if done:
            self.learn()
            self.memory.forget()
            self.epsilon *= self.decay
        super().react(reward, done)
        return {'action': action}

    def respond(self, position):
        # Explore with probability epsilon, otherwise sample from the predicted policy.
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, *self.model.output_shapes[0])
        else:
            prediction = self.model.predict([[position]])[0][0]
            choice = np.random.choice(prediction, p=prediction)
            return np.argmax(prediction == choice)

    def learn(self):
        positions, actions, rewards, dones, outcomes = self.memory[:-1]
        if len(positions) >= 1:
            # Discounted returns are used as the weights for the policy update.
            advantages = discount(rewards, self.gamma).reshape(-1, 1)
            # advantages = (advantages - np.mean(advantages)) / (np.std(advantages) or 1e-9)
            self.model.fit([positions], [actions.reshape(-1, 1), advantages])
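The `discount` helper called in `learn` is not defined in this listing. A minimal sketch of what it is assumed to do (a reverse discounted cumulative sum over a reward sequence; the actual helper may differ):

import numpy as np

def discount(rewards, gamma):
    # Assumed behaviour: out[t] = rewards[t] + gamma * out[t + 1],
    # i.e. the return accumulated backwards from the end of the sequence.
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out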
def __init__(self, name, gamma=0.95, epsilon=1, decay=1 - 1e-4):
    super().__init__(name)
    self.gamma = gamma
    self.epsilon = epsilon
    self.decay = decay
    self.model = Model(((16,),), ((4,),))
    self.memory = Transitions(['positions', 'actions'],
                              ['rewards', 'dones', 'outcomes'])
class Agent(BaseAgent):
    def __init__(self, name, batch=10, gamma=0.95, epsilon=1, decay=1 - 1e-4, frequency=1000):
        super().__init__(name)
        self.batch = batch
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay
        self.frequency = frequency
        self.model = Model([[16]], [[4]], [15])
        self.target = Model([[16]], [[4]], [15])
        self.memory = Transitions(['positions', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])

    def react(self, position, reward=0, done=None):
        # One-hot encode the discrete position into a length-16 vector.
        zeros = np.zeros(16)
        zeros[position] = 1
        position = zeros
        action = self.respond(position)
        self.memory.store(position.copy(), action, reward, bool(done), position.copy())
        if self.age % self.batch == (self.batch - 1) or done:
            self.learn(self.batch)
        if self.age % self.frequency == (self.frequency - 1):
            # Periodically copy the online network's weights into the target network.
            self.target.set_parameters(self.model.get_parameters())
        if done:
            self.epsilon *= self.decay
        super().react(reward, done)
        return {'action': action}

    def respond(self, position):
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, *self.model.output_shapes[0])
        else:
            # Greedy action from the online network once exploration has decayed.
            prediction = self.model.predict([[position]])[0][0]
            return np.argmax(prediction)

    def learn(self, number=1):
        positions, actions, rewards, dones, outcomes = self.memory.shuffled(number)
        if len(positions) >= 1:
            past_value_predictions = self.model.predict([positions])[0]
            # Target-network values of the outcomes, masked to zero for terminal transitions.
            future_value_predictions = np.select(
                [~dones.reshape(-1, 1)], [self.target.predict([outcomes])[0]])
            future_value_predictions = np.max(future_value_predictions, axis=1)
            # Only the Q values of the actions actually taken get new targets.
            targets = past_value_predictions
            targets[range(len(targets)), actions] = rewards + self.gamma * future_value_predictions
            self.model.fit([positions], [targets])
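The `np.select` call in `learn` above masks out the bootstrapped values of terminal transitions before the Bellman targets are formed. A small standalone illustration with made-up numbers (not part of the agent code):

import numpy as np

dones = np.array([False, True, False])
future = np.array([[0.2, 0.5],          # hypothetical target-network predictions
                   [0.9, 0.1],
                   [0.3, 0.4]])

masked = np.select([~dones.reshape(-1, 1)], [future])  # rows with done=True fall back to 0
print(np.max(masked, axis=1))                          # [0.5 0.  0.4]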
def __init__(self, name, batch=10, gamma=0.95, epsilon=1, decay=1 - 1e-4):
    super().__init__(name)
    self.batch = batch
    self.gamma = gamma
    self.epsilon = epsilon
    self.decay = decay
    self.model = Model(((2,),), ((3,),), [7])
    self.memory = Transitions(['states', 'actions'],
                              ['rewards', 'dones', 'outcomes'])
def __init__(self, name, dimensions, batch=10, gamma=0.95, epsilon=1, decay=1 - 1e-4):
    super().__init__(name)
    self.batch = batch
    self.gamma = gamma
    self.epsilon = epsilon
    self.decay = decay
    self.actor_model = ActorModel(((dimensions,),), ((dimensions * 2,),))
    self.critic_model = CriticModel(((dimensions,),), ((1,),), [7])
    self.memory = Transitions(['positions', 'actions'],
                              ['rewards', 'dones', 'outcomes'])
class Agent(BaseAgent):
    def __init__(self, name, batch=10, gamma=0.95, epsilon=1, decay=1 - 1e-4):
        super().__init__(name)
        self.batch = batch
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay
        self.actor_model = ActorModel(((16,),), ((4,),))
        self.critic_model = CriticModel(((16,),), ((1,),))
        self.memory = Transitions(['positions', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])

    def react(self, position, reward=0, done=None):
        # One-hot encode the discrete position into a length-16 vector.
        zeros = np.zeros(16)
        zeros[position] = 1
        position = zeros
        action = self.respond(position)
        self.memory.store(position.copy(), action, reward, bool(done), position.copy())
        if self.age % self.batch == (self.batch - 1) or done:
            self.learn(self.batch)
        if done:
            self.memory.forget()
            self.epsilon *= self.decay
        super().react(reward, done)
        return {'action': action}

    def respond(self, position):
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, *self.actor_model.output_shapes[0])
        else:
            # Sample an action index according to the actor's predicted probabilities.
            prediction = self.actor_model.predict([[position]])[0][0]
            choice = np.random.choice(prediction, p=prediction)
            return np.argmax(prediction == choice)

    def learn(self, number=1):
        positions, actions, rewards, dones, outcomes = self.memory[-(number + 1):-1]
        if len(positions) >= 1:
            past_value_predictions = self.critic_model.predict([positions])[0]
            # Bootstrap the last transition with the critic's estimate of its outcome.
            future_value_prediction = [0] if dones[-1] else self.critic_model.predict([outcomes[-1:]])[0][0]
            targets = discount(np.concatenate((rewards, future_value_prediction)), self.gamma)[:-1]
            targets = targets.reshape(-1, 1)
            # The actor is trained on advantages, the critic on the discounted targets.
            advantages = targets - past_value_predictions
            self.actor_model.fit([positions], [actions.reshape(-1, 1), advantages])
            self.critic_model.fit([positions], [targets])
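To make the bootstrapping in `learn` concrete: the critic's estimate of the state reached after the batch is appended to the rewards, discounted together with them, and dropped again, so every target carries the value of what follows it. A toy example with made-up numbers (the arrays below are hypothetical, not taken from a run):

import numpy as np

gamma = 0.95
rewards = np.array([0.0, 0.0, 1.0])
bootstrap = 0.5                        # critic's value estimate for the state after the batch

# discount(np.concatenate((rewards, [bootstrap])), gamma)[:-1], unrolled by hand:
targets = np.zeros(3)
running = bootstrap
for t in reversed(range(3)):
    running = rewards[t] + gamma * running
    targets[t] = running               # [1.331..., 1.401..., 1.475]

baseline = np.array([0.1, 0.2, 0.3])   # hypothetical critic predictions for the three positions
advantages = targets - baseline        # what the actor is fit against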
class Agent(BaseAgent):
    def __init__(self, name, dimensions, batch=10, gamma=0.95, epsilon=1, decay=1 - 1e-4):
        super().__init__(name)
        self.batch = batch
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay
        self.model = Model([[dimensions]], [[dimensions * 2]], [7])
        self.memory = Transitions(['positions', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])

    def react(self, position, time, reward=0, done=None):
        action = self.respond(position)
        self.memory.store(position.copy(), action, reward, bool(done), position.copy())
        if self.age % self.batch == (self.batch - 1) or done:
            self.learn(self.batch)
        if done:
            self.memory.forget()
            self.epsilon *= self.decay
        super().react(reward, done)
        return {'action': action}

    def respond(self, position):
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, *self.model.output_shapes[0])
        else:
            prediction = self.model.predict([[position]])[0][0]
            return np.argmax(prediction)

    def learn(self, number=1):
        positions, actions, rewards, dones, outcomes = self.memory[-(number + 1):-1]
        if len(positions) >= 1:
            past_value_predictions = self.model.predict([positions])[0]
            # Bootstrap with the maximum predicted value of the last outcome,
            # then discount the batch rewards back from it.
            future_value_prediction = [0] if dones[-1] else self.model.predict([outcomes[-1:]])[0][0]
            future_value_prediction = [np.max(future_value_prediction)]
            targets = past_value_predictions
            discounted = discount(np.concatenate((rewards, future_value_prediction)), self.gamma)
            targets[range(len(targets)), actions] = discounted[:-1]
            self.model.fit([positions], [targets])
def __init__(self, name, dimensions, batch=10, gamma=0.95, epsilon=1, decay=1 - 1e-4):
    super().__init__(name)
    self.batch = batch
    self.gamma = gamma
    self.epsilon = epsilon
    self.decay = decay
    self.model = Model([[dimensions]], [[dimensions * 2]], [7])
    self.memory = Transitions(['positions', 'actions'],
                              ['rewards', 'dones', 'outcomes'])
def __init__(self, name, batch=10, gamma=0.95, epsilon=1, decay=1 - 1e-4, frequency=1000):
    super().__init__(name)
    self.batch = batch
    self.gamma = gamma
    self.epsilon = epsilon
    self.decay = decay
    self.frequency = frequency
    self.model = Model([[4]], [[2]], [15, 15])
    self.target = Model([[4]], [[2]], [15, 15])
    self.memory = Transitions(['states', 'actions'],
                              ['rewards', 'dones', 'outcomes'])
class Agent(BaseAgent):
    def __init__(self, name):
        super().__init__(name)
        self.memory = Transitions(['chosen'], extra_keys=['perf'])

    def react(self, numbers, reward=0, done=None):
        action = self.respond(numbers, reward, done)
        self.memory.store(numbers[action], perf=reward)
        if done:
            self.memory.forget()
        self.age += 1
        return {'action': action}

    def respond(self, numbers, reward, done):
        if done or not self.age:
            return randrange(2)
        chosen, = self.memory[-1]
        if reward > 0:
            return numbers.index(chosen)
        return 1 - numbers.index(chosen)
class Agent(BaseAgent):
    def __init__(self, name, dimensions=5, length=10):
        super().__init__(name)
        seed(0)
        self.sequence = choices(range(dimensions), k=length)
        self.index = None
        self.memory = Transitions(['dummy'], extra_keys=['perf'])

    def react(self, reward=0, done=None):
        action = self.respond(done)
        self.memory.store(perf=reward)
        if done:
            self.memory.forget()
        self.age += 1
        return {'action': action}

    def respond(self, done):
        if done or not self.age:
            self.index = -1
        self.index += 1
        return self.sequence[self.index]
def __init__(self, name):
    super().__init__(name)
    self.memory = Transitions(['chosen'], extra_keys=['perf'])
def __init__(self, name, dimensions=5, length=10):
    super().__init__(name)
    seed(0)
    self.sequence = choices(range(dimensions), k=length)
    self.index = None
    self.memory = Transitions(['dummy'], extra_keys=['perf'])