Example #1
import numpy as np
# BaseAgent, Model, Transitions and discount come from the surrounding
# project; their imports are not shown in these excerpts.

class Agent(BaseAgent):
	def __init__(self, name, gamma=0.95, epsilon=1, decay=1-1e-4):
		super().__init__(name)
		self.gamma = gamma
		self.epsilon = epsilon
		self.decay = decay
		
		self.model = Model(((16,),), ((4,),))
		self.memory = Transitions(['positions', 'actions'], ['rewards', 'dones', 'outcomes'])
	
	def react(self, position, reward=0, done=None):
		# One-hot encode the integer position over the 16 grid cells.
		zeros = np.zeros(16)
		zeros[position] = 1
		position = zeros
		action = self.respond(position)
		# The same one-hot vector is stored as this step's position and as the
		# outcome slot; the buffer appears to pair reward/done/outcome with the
		# previous step's action.
		self.memory.store(position.copy(), action, reward, bool(done), position.copy())
		if done:
			self.learn()
			self.memory.forget()
			self.epsilon *= self.decay
		super().react(reward, done)
		return {'action': action}
	
	def respond(self, position):
		# Epsilon-greedy: explore uniformly with probability epsilon, otherwise
		# sample from the policy's predicted action distribution.
		if np.random.rand() < self.epsilon:
			return np.random.randint(0, *self.model.output_shapes[0])
		else:
			prediction = self.model.predict([[position]])[0][0]
			# Sample the action index directly from the predicted distribution;
			# matching a sampled probability back to its index via argmax picks
			# the wrong action when two actions share a probability value.
			return np.random.choice(len(prediction), p=prediction)
	
	def learn(self):
		# Every stored transition except the newest one.
		positions, actions, rewards, dones, outcomes = self.memory[:-1]
		if len(positions) >= 1:
			# REINFORCE-style update: weight each taken action by its
			# discounted return (optionally normalized, see the commented line).
			advantages = discount(rewards, self.gamma).reshape(-1, 1)
			# advantages = (advantages - np.mean(advantages)) / (np.std(advantages) or 1e-9)
			self.model.fit([positions], [actions.reshape(-1, 1), advantages])
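
The discount helper used above (and again in Examples #6 and #7) is not
defined in these excerpts; a minimal sketch of the discounted-return
computation it presumably performs:

def discount(rewards, gamma):
	# Walk backwards, accumulating gamma-discounted future rewards:
	# returns[t] = rewards[t] + gamma * returns[t + 1]
	returns = np.zeros(len(rewards))
	running = 0.0
	for t in reversed(range(len(rewards))):
		running = rewards[t] + gamma * running
		returns[t] = running
	return returns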
Example #2
	def __init__(self, name, gamma=0.95, epsilon=1, decay=1-1e-4):
		super().__init__(name)
		self.gamma = gamma
		self.epsilon = epsilon
		self.decay = decay
		
		self.model = Model(((16,),), ((4,),))
		self.memory = Transitions(['positions', 'actions'], ['rewards', 'dones', 'outcomes'])
Example #3
import numpy as np

class Agent(BaseAgent):
    def __init__(self,
                 name,
                 batch=10,
                 gamma=0.95,
                 epsilon=1,
                 decay=1 - 1e-4,
                 frequency=1000):
        super().__init__(name)
        self.batch = batch
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay
        self.frequency = frequency

        # Online network plus a periodically synced target network; the third
        # Model argument presumably lists hidden-layer sizes.
        self.model = Model([[16]], [[4]], [15])
        self.target = Model([[16]], [[4]], [15])
        self.memory = Transitions(['positions', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])

    def react(self, position, reward=0, done=None):
        # One-hot encode the integer position over the 16 grid cells.
        zeros = np.zeros(16)
        zeros[position] = 1
        position = zeros
        action = self.respond(position)
        self.memory.store(position.copy(), action, reward, bool(done),
                          position.copy())
        # Train every `batch` steps and at episode ends.
        if self.age % self.batch == (self.batch - 1) or done:
            self.learn(self.batch)
        # Every `frequency` steps, copy the online weights into the target net.
        if self.age % self.frequency == (self.frequency - 1):
            self.target.set_parameters(self.model.get_parameters())
        if done:
            self.epsilon *= self.decay
        super().react(reward, done)
        return {'action': action}

    def respond(self, position):
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, *self.model.output_shapes[0])
        else:
            prediction = self.model.predict([[position]])[0][0]
            return np.argmax(prediction)

    def learn(self, number=1):
        # Sample a shuffled minibatch of stored transitions.
        positions, actions, rewards, dones, outcomes = self.memory.shuffled(number)
        if len(positions) >= 1:
            past_value_predictions = self.model.predict([positions])[0]
            # Bootstrap from the target network; np.select's default of zero
            # handles terminal transitions.
            future_value_predictions = np.select(
                [~dones.reshape(-1, 1)], [self.target.predict([outcomes])[0]])
            future_value_predictions = np.max(future_value_predictions, axis=1)

            # Q-learning target: only the taken action's slot is replaced by
            # reward + gamma * max_a' Q_target(outcome, a').
            targets = past_value_predictions
            targets[range(len(targets)),
                    actions] = rewards + self.gamma * future_value_predictions

            self.model.fit([positions], [targets])
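
The 16-cell one-hot state and 4 actions suggest a FrozenLake-style gridworld.
A minimal driver sketch, assuming a classic Gym-style environment (pre-0.26
step API) with integer observations and a BaseAgent that tracks `age`; the
environment name is illustrative:

import gym

env = gym.make('FrozenLake-v1')
agent = Agent('dqn-demo')
for episode in range(500):
    position = env.reset()
    reward, done = 0, False
    while not done:
        action = agent.react(position, reward, done)['action']
        position, reward, done, _ = env.step(action)
    agent.react(position, reward, done)  # deliver the terminal transition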
Example #4
    def __init__(self, name, batch=10, gamma=0.95, epsilon=1, decay=1 - 1e-4):
        super().__init__(name)
        self.batch = batch
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay

        self.model = Model(((2, ), ), ((3, ), ), [7])
        self.memory = Transitions(['states', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])
Example #5
	def __init__(self, name, dimensions, batch=10, gamma=0.95, epsilon=1, decay=1-1e-4):
		super().__init__(name)
		self.batch = batch
		self.gamma = gamma
		self.epsilon = epsilon
		self.decay = decay
		
		self.actor_model = ActorModel(((dimensions,),), ((dimensions*2,),))
		self.critic_model = CriticModel(((dimensions,),), ((1,),), [7])
		self.memory = Transitions(['positions', 'actions'], ['rewards', 'dones', 'outcomes'])
Example #6
import numpy as np

class Agent(BaseAgent):
    def __init__(self, name, batch=10, gamma=0.95, epsilon=1, decay=1 - 1e-4):
        super().__init__(name)
        self.batch = batch
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay

        # Separate policy (actor) and value (critic) networks over the 16 cells.
        self.actor_model = ActorModel(((16, ), ), ((4, ), ))
        self.critic_model = CriticModel(((16, ), ), ((1, ), ))
        self.memory = Transitions(['positions', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])

    def react(self, position, reward=0, done=None):
        # One-hot encode the integer position over the 16 grid cells.
        zeros = np.zeros(16)
        zeros[position] = 1
        position = zeros
        action = self.respond(position)
        self.memory.store(position.copy(), action, reward, bool(done),
                          position.copy())
        if self.age % self.batch == (self.batch - 1) or done:
            self.learn(self.batch)
        if done:
            self.memory.forget()
            self.epsilon *= self.decay
        super().react(reward, done)
        return {'action': action}

    def respond(self, position):
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, *self.actor_model.output_shapes[0])
        else:
            prediction = self.actor_model.predict([[position]])[0][0]
            # Sample the action index directly from the policy distribution;
            # matching a sampled probability back via argmax picks the wrong
            # action when two actions share a probability value.
            return np.random.choice(len(prediction), p=prediction)

    def learn(self, number=1):
        # The most recent `number` transitions, excluding the newest one.
        positions, actions, rewards, dones, outcomes = self.memory[-(number + 1):-1]
        if len(positions) >= 1:
            past_value_predictions = self.critic_model.predict([positions])[0]
            # Bootstrap the window's tail with the critic's value of the last
            # outcome, or zero if the episode ended there.
            if dones[-1]:
                future_value_prediction = [0]
            else:
                future_value_prediction = self.critic_model.predict([outcomes[-1:]])[0][0]

            # n-step targets: discounted rewards plus the bootstrapped tail;
            # advantages are targets minus the critic's current estimates.
            targets = discount(
                np.concatenate((rewards, future_value_prediction)),
                self.gamma)[:-1]
            targets = targets.reshape(-1, 1)
            advantages = targets - past_value_predictions

            self.actor_model.fit([positions],
                                 [actions.reshape(-1, 1), advantages])
            self.critic_model.fit([positions], [targets])
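
For intuition, a tiny worked instance of the target computation above, with
arbitrary values (gamma = 0.95, a three-step window, critic bootstrap 0.5):

# rewards = [0, 0, 1], bootstrap = [0.5]
# discount([0, 0, 1, 0.5], 0.95) walks backwards:
#   t = 3: 0.5
#   t = 2: 1 + 0.95 * 0.5     = 1.475
#   t = 1: 0 + 0.95 * 1.475   = 1.40125
#   t = 0: 0 + 0.95 * 1.40125 = 1.3311875
# Dropping the bootstrap entry leaves the three targets; subtracting the
# critic's current predictions gives the advantages fed to the actor.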
Example #7
import numpy as np

class Agent(BaseAgent):
    def __init__(self,
                 name,
                 dimensions,
                 batch=10,
                 gamma=0.95,
                 epsilon=1,
                 decay=1 - 1e-4):
        super().__init__(name)
        self.batch = batch
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay

        self.model = Model([[dimensions]], [[dimensions * 2]], [7])
        self.memory = Transitions(['positions', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])

    def react(self, position, time, reward=0, done=None):
        action = self.respond(position)
        self.memory.store(position.copy(), action, reward, bool(done),
                          position.copy())
        if self.age % self.batch == (self.batch - 1) or done:
            self.learn(self.batch)
        if done:
            self.memory.forget()
            self.epsilon *= self.decay
        super().react(reward, done)
        return {'action': action}

    def respond(self, position):
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, *self.model.output_shapes[0])
        else:
            prediction = self.model.predict([[position]])[0][0]
            return np.argmax(prediction)

    def learn(self, number=1):
        # The most recent `number` transitions, excluding the newest one.
        positions, actions, rewards, dones, outcomes = self.memory[-(number + 1):-1]
        if len(positions) >= 1:
            past_value_predictions = self.model.predict([positions])[0]
            # Bootstrap the window's tail with max_a Q(outcome, a), or zero if
            # the episode ended there.
            if dones[-1]:
                future_value_prediction = [0]
            else:
                future_value_prediction = self.model.predict([outcomes[-1:]])[0][0]
            future_value_prediction = [np.max(future_value_prediction)]

            # n-step Q targets: discounted rewards plus the bootstrapped tail,
            # written into the taken-action slots only.
            targets = past_value_predictions
            discounted = discount(
                np.concatenate((rewards, future_value_prediction)), self.gamma)
            targets[range(len(targets)), actions] = discounted[:-1]

            self.model.fit([positions], [targets])
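
In Bellman form, each entry of the window above receives an n-step return
(notation assumed, not from the source):

# Q(s_t, a_t) <- r_t + gamma * r_{t+1} + ... + gamma^(n-1) * r_{t+n-1}
#                    + gamma^n * max_a Q(s_{t+n}, a)

Unlike Example #3, which samples shuffled transitions and bootstraps one step
ahead through a target network, this variant discounts the most recent window
of rewards in place and bootstraps only at its far end.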
Example #8
    def __init__(self,
                 name,
                 dimensions,
                 batch=10,
                 gamma=0.95,
                 epsilon=1,
                 decay=1 - 1e-4):
        super().__init__(name)
        self.batch = batch
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay

        self.model = Model([[dimensions]], [[dimensions * 2]], [7])
        self.memory = Transitions(['positions', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])
Example #9
    def __init__(self,
                 name,
                 batch=10,
                 gamma=0.95,
                 epsilon=1,
                 decay=1 - 1e-4,
                 frequency=1000):
        super().__init__(name)
        self.batch = batch
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay = decay
        self.frequency = frequency

        self.model = Model([[4]], [[2]], [15, 15])
        self.target = Model([[4]], [[2]], [15, 15])
        self.memory = Transitions(['states', 'actions'],
                                  ['rewards', 'dones', 'outcomes'])
Example #10
from random import randrange

class Agent(BaseAgent):
	def __init__(self, name):
		super().__init__(name)
		self.memory = Transitions(['chosen'], extra_keys=['perf'])
	
	def react(self, numbers, reward=0, done=None):
		action = self.respond(numbers, reward, done)
		self.memory.store(numbers[action], perf=reward)
		if done:
			self.memory.forget()
		self.age += 1
		return {'action': action}
	
	def respond(self, numbers, reward, done):
		# First step of an episode: pick one of the two numbers at random.
		if done or not self.age:
			return randrange(2)
		# Win-stay / lose-shift: after a positive reward, pick the previously
		# chosen number again; otherwise pick the other one.
		chosen, = self.memory[-1]
		if reward > 0:
			return numbers.index(chosen)
		return 1 - numbers.index(chosen)
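
A hypothetical session showing the policy mechanics (rewards are illustrative,
and both numbers reappear each step so .index() always succeeds):

agent = Agent('demo')
first = agent.react([3, 7])['action']             # first step: random pick
second = agent.react([7, 3], reward=1)['action']  # win-stay: same number again
third = agent.react([3, 7], reward=-1)['action']  # lose-shift: the other number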
Example #11
from random import choices, seed

class Agent(BaseAgent):
	def __init__(self, name, dimensions=5, length=10):
		super().__init__(name)
		# With a fixed seed, build one pseudo-random action sequence that is
		# replayed from the start of every episode.
		seed(0)
		self.sequence = choices(range(dimensions), k=length)
		self.index = None
		self.memory = Transitions(['dummy'], extra_keys=['perf'])
	
	def react(self, reward=0, done=None):
		action = self.respond(done)
		self.memory.store(perf=reward)
		if done:
			self.memory.forget()
		self.age += 1
		return {'action': action}
	
	def respond(self, done):
		# Rewind to the start of the sequence at every episode boundary.
		if done or not self.age:
			self.index = -1
		self.index += 1
		return self.sequence[self.index]
Example #12
	def __init__(self, name):
		super().__init__(name)
		self.memory = Transitions(['chosen'], extra_keys=['perf'])
Example #13
	def __init__(self, name, dimensions=5, length=10):
		super().__init__(name)
		seed(0)
		self.sequence = choices(range(dimensions), k=length)
		self.index = None
		self.memory = Transitions(['dummy'], extra_keys=['perf'])