Example #1
    def update_on_batch(self, batch_memory):
        """

        """
        # Unzip the batch
        batch_state, batch_action, batch_reward, batch_state_next = self.memory.unzip_batch(
            batch_memory)
        # Predict Q-values for the current states (these become the fit
        # targets) and for the next states with both networks.
        q_target = self.estimator.model.predict(batch_state)
        q_next1 = self.estimator.model.predict(batch_state_next)
        q_next2 = self.target.model.predict(batch_state_next)

        # Double DQN: the online network picks the greedy next action...
        _, q_next_max_idx = greedy_batch(q_next1)

        # ...and the target network evaluates the action it picked.
        batch_index11 = np.arange(self.batch_size, dtype=np.int32)
        q_next_max = q_next2[batch_index11, q_next_max_idx]
        # Overwrite the Q-value of the action actually taken with the
        # Bellman target r + gamma * Q_target(s', a*).
        q_target[batch_index11,
                 batch_action] = batch_reward + self.gamma * q_next_max

        # Fit the online network to the updated targets for a single epoch.
        self.estimator.model.fit(batch_state,
                                 q_target,
                                 batch_size=self.batch_size,
                                 epochs=1,
                                 verbose=0)
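
The snippet above computes the standard Double DQN target: the online network (self.estimator) chooses the next action and the separate target network (self.target) evaluates it. With r the batch reward and gamma (self.gamma) the discount factor, the value written into q_target for the action taken in each transition is

    y = r + \gamma \, Q_{\text{target}}\bigl(s',\ \operatorname{arg\,max}_{a'} Q_{\text{online}}(s', a')\bigr)

Note that the target is applied to every transition in the batch; terminal next states are not masked out in this snippet.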
Example #2
    def _estimate_policy(self, points, direction, estimator):
        """Build state vectors [sin(angle), cos(angle), velocity] from the
        given points and return the estimator's greedy output for each."""
        sine = np.sin(points[:, 1])
        cosine = np.cos(points[:, 1])
        velocity = points[:, 0]
        states = []
        for idx in range(len(sine)):
            if direction == 'positive':
                states.append([sine[idx], cosine[idx], velocity[idx]])
            elif direction == 'negative':
                # Mirror the velocity when sweeping the negative direction.
                states.append([sine[idx], cosine[idx], -velocity[idx]])

        # Estimate the q-values and the index of the best action.
        states = np.array(states)
        q_values = estimator.model.predict(states)
        return greedy_batch(q_values)
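
Both examples call a greedy_batch helper that is not shown on this page. A minimal sketch of what it presumably does, assuming it takes a (batch_size, n_actions) array of Q-values and returns the per-row maximum together with its index (the return order the callers above rely on):

    import numpy as np

    def greedy_batch(q_values):
        """Return (greedy Q-value, greedy action index) for each row.

        Sketch only; the actual helper used by these examples may differ.
        """
        q_values = np.asarray(q_values)
        # Index of the best action for every state in the batch.
        greedy_idx = np.argmax(q_values, axis=1)
        # Q-value of that best action, one per row.
        greedy_q = q_values[np.arange(q_values.shape[0]), greedy_idx]
        return greedy_q, greedy_idx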