def test_can_fit_model(self):
    """Check that the model used by PER can be fitted to a fixed target vector.

    The loss reported by the replay buffer should decrease while the model is
    repeatedly fitted to the same sample.
    """
    state_shape = (4,)
    action_space = 2
    model = PrioritizedExperienceReplayTests._create_model(state_shape, action_space)
    PER = PrioritizedExperienceReplay(maxlen=1, model=model, key_scaling=10, gamma=1)
    model_wrapper = ModelWrapper(model=model,
                                 optimizer=K.optimizers.Adam(learning_rate=0.01))
    model_wrapper.compile()

    sample = Sample(action=np.random.randint(0, action_space),
                    state=np.random.rand(state_shape[0]),
                    reward=10,
                    next_state=None)
    PER.add(samples=[sample])

    # Target vector: the reward placed at the index of the taken action.
    fit_vector = np.zeros((action_space,))
    fit_vector[sample.action] = sample.reward

    history_of_loss = []
    for _ in range(100):
        model_wrapper.fit(sample.state, fit_vector)
        history_of_loss.append(PER._loss_calculate(sample=sample))

    # The loss should strictly decrease between consecutive fit steps.
    for idx, loss in enumerate(history_of_loss[:-1]):
        self.assertGreater(loss, history_of_loss[idx + 1])
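# The test above relies on a _create_model helper that is not shown here. Below
# is a minimal sketch of what such a helper could look like, assuming K refers
# to tensorflow.keras as elsewhere in this code and that a small dense network
# is sufficient; the layer sizes and activations are illustrative assumptions,
# not the original implementation.
@staticmethod
def _create_model(state_shape, action_space):
    model = K.Sequential([
        K.layers.Input(shape=state_shape),
        K.layers.Dense(16, activation='relu'),
        # Linear output head: one Q-value estimate per action.
        K.layers.Dense(action_space, activation='linear')
    ])
    return model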
class DQN(AbstractAgent):
    def __init__(self, environment: Env, memory: AbstractMemory,
                 policy: AbstractPolicy, model: K.Model, logger: Logger,
                 gamma: float, optimizer: K.optimizers.Optimizer,
                 n_step: int = 1):
        self.model = ModelWrapper(model, optimizer)
        self.current_model = None
        self.gamma = gamma
        self.n_step = n_step
        super(DQN, self).__init__(environment=environment, memory=memory,
                                  policy=policy, model=model,
                                  optimizer=optimizer, logger=logger)

    def _bellman_equation(self, batch: List[Sample]) -> np.ndarray:
        """Build training targets: Q(s, a) = r + gamma * max_a' Q_target(s', a')."""
        state = np.array([sample.state for sample in batch])
        q_values = self.current_model.predict(state)
        for idx in range(q_values.shape[0]):
            q_values[idx][batch[idx].action] = batch[idx].reward
            if not batch[idx].is_done():
                # Bootstrap from the target network (self.model) on the next state.
                q_next_prediction = self.model.predict(batch[idx].next_state)
                best_action_for_q_next = np.argmax(q_next_prediction)
                q_next = q_next_prediction[0][best_action_for_q_next]
                q_values[idx][batch[idx].action] += self.gamma * q_next
        return q_values

    def learn(self, epochs: int, batch_size_in_step: int,
              min_n_game_in_exploration: int, batch_size_in_exploration: int,
              change_model_delay: int):
        self.model.compile()
        self.current_model = self.model.clone()
        self.current_model.compile()

        # Fill the replay memory with an initial batch of experience.
        eval_score, starting_experience = self._explore_env(self.memory.maxlen)
        self.memory.add(starting_experience)

        for epoch in tqdm(range(epochs), desc='Learning in progress: '):
            # Periodically synchronise the target network with the online one.
            if epoch % change_model_delay == 0:
                self.model = self.current_model.clone()
                self.model.compile()
                if isinstance(self.memory, PrioritizedExperienceReplay):
                    self.memory.update_model(self.model)

            eval_score, batch = self._explore_env(batch_size_in_exploration,
                                                  min_n_game_in_exploration)
            self.memory.add(batch)

            batch = self.memory.sample(batch_size_in_step)
            q_values = self._bellman_equation(batch)
            state = np.array([sample.state for sample in batch])
            loss = self.current_model.fit(state, q_values)

            self.policy.update()
            self.logger.add_event({
                'loss_value': loss,
                'mean_gain': eval_score,
                'epoch': epoch
            })

    def __str__(self):
        return "Agent: " + self.__class__.__name__ + "\n\n" + \
               "Discount value: " + str(self.gamma) + "\n" + \
               "N-step: " + str(self.n_step) + "\n\n" + \
               "Environment:\n" + str(self.environment) + "\n\n" + \
               "Memory:\n" + str(self.memory) + "\n" + \
               "Policy:\n" + str(self.policy)
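# A hedged sketch of how this agent might be wired together and trained. The
# gym environment name, network layout, policy, and logger setup are
# assumptions for illustration only; the DQN, PrioritizedExperienceReplay, and
# learn() signatures come from the code above.
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v1')            # assumed environment
    model = K.Sequential([                   # assumed network layout
        K.layers.Input(shape=env.observation_space.shape),
        K.layers.Dense(32, activation='relu'),
        K.layers.Dense(env.action_space.n, activation='linear')
    ])
    optimizer = K.optimizers.Adam(learning_rate=0.001)

    memory = PrioritizedExperienceReplay(maxlen=10000, model=model,
                                         key_scaling=10, gamma=0.99)
    policy = EpsilonGreedyPolicy()           # hypothetical policy class
    logger = Logger()                        # hypothetical logger setup

    agent = DQN(environment=env, memory=memory, policy=policy, model=model,
                logger=logger, gamma=0.99, optimizer=optimizer)
    agent.learn(epochs=500, batch_size_in_step=32,
                min_n_game_in_exploration=1, batch_size_in_exploration=64,
                change_model_delay=50)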