Example #1
    def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
        # The previous state could never be terminal
        # (otherwise the episode would have already terminated)
        prevStateTerminal = False

        self.representation.pre_discover(s, prevStateTerminal, a, ns, terminal)
        discount_factor = self.discount_factor
        weight_vec = self.representation.weight_vec
        phi_s = self.representation.phi(s, prevStateTerminal)
        phi = self.representation.phi_sa(s, prevStateTerminal, a, phi_s)
        phi_prime_s = self.representation.phi(ns, terminal)
        na = self._future_action(
            ns, terminal, np_actions, phi_prime_s,
            na)  # here comes the difference between SARSA and Q-Learning
        phi_prime = self.representation.phi_sa(ns, terminal, na, phi_prime_s)
        nnz = count_nonzero(phi_s)  # Number of non-zero elements

        # Set eligibility traces:
        if self.lambda_:
            expanded = (len(phi) - len(self.eligibility_trace)) // \
                self.representation.actions_num
            if expanded > 0:
                # Correct the size of eligibility traces (pad with zeros for
                # new features)
                self.eligibility_trace = addNewElementForAllActions(
                    self.eligibility_trace, self.representation.actions_num,
                    np.zeros((self.representation.actions_num, expanded)))

            self.eligibility_trace *= discount_factor * self.lambda_
            self.eligibility_trace += phi

            # Set max to 1
            self.eligibility_trace[self.eligibility_trace > 1] = 1
        else:
            self.eligibility_trace = phi

        td_error = r + np.dot(discount_factor * phi_prime - phi, weight_vec)
        if nnz > 0:
            self.updateLearnRate(phi, phi_prime, self.eligibility_trace,
                                 discount_factor, nnz, terminal)
            weight_vec_old = weight_vec.copy()
            weight_vec += self.learn_rate * \
                self.representation.featureLearningRate() * \
                td_error * self.eligibility_trace
            if not np.all(np.isfinite(weight_vec)):
                weight_vec = weight_vec_old
                print(
                    "WARNING: TD-Learning diverged, weight_vec reached infinity!"
                )
        # Discover features if the representation has the discover method
        expanded = self.representation.post_discover(s, prevStateTerminal, a,
                                                     td_error, phi_s)

        if terminal:
            # If THIS state is terminal:
            self.episodeTerminated()
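
The heart of Example #1 is the linear SARSA(λ)/Q(λ) update: the TD error r + np.dot(discount_factor * phi_prime - phi, weight_vec) is spread over the eligibility trace. Below is a minimal, self-contained sketch of just that update; the function and argument names are illustrative, and the feature discovery, adaptive learning rate, and divergence guard of the class above are deliberately omitted.

    import numpy as np

    def td_lambda_step(weights, trace, phi, phi_prime, r, gamma, lam, alpha):
        # Decay the trace, add the current state-action features, and cap the
        # entries at 1, mirroring the clipping done in the example above.
        trace = gamma * lam * trace + phi
        np.clip(trace, None, 1.0, out=trace)
        # Linear TD error: r + gamma * w.phi' - w.phi
        td_error = r + np.dot(gamma * phi_prime - phi, weights)
        # Spread the error over all traced features.
        weights = weights + alpha * td_error * trace
        return weights, trace, td_error
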
Example #2
    def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
        self.representation.pre_discover(s, False, a, ns, terminal)
        discount_factor = self.discount_factor
        weight_vec = self.representation.weight_vec
        phi_s = self.representation.phi(s, False)
        phi = self.representation.phi_sa(s, False, a, phi_s)
        phi_prime_s = self.representation.phi(ns, terminal)
        na = self.representation.bestAction(
            ns, terminal, np_actions,
            phi_prime_s)  # Switch na to the best possible action
        phi_prime = self.representation.phi_sa(ns, terminal, na, phi_prime_s)
        nnz = count_nonzero(phi_s)  # Number of non-zero elements

        expanded = (len(phi) - len(self.GQWeight)) // \
            self.representation.actions_num
        if expanded:
            self._expand_vectors(expanded)
        # Set eligibility traces:
        if self.lambda_:
            self.eligibility_trace *= discount_factor * self.lambda_
            self.eligibility_trace += phi

            self.eligibility_trace_s *= discount_factor * self.lambda_
            self.eligibility_trace_s += phi_s

            # Set max to 1
            self.eligibility_trace[self.eligibility_trace > 1] = 1
            self.eligibility_trace_s[self.eligibility_trace_s > 1] = 1
        else:
            self.eligibility_trace = phi
            self.eligibility_trace_s = phi_s

        td_error = r + \
            np.dot(discount_factor * phi_prime - phi, weight_vec)
        self.updateLearnRate(phi_s, phi_prime_s, self.eligibility_trace_s,
                             discount_factor, nnz, terminal)

        if nnz > 0:  # Phi has some nonzero elements, proceed with update
            td_error_estimate_now = np.dot(phi, self.GQWeight)
            Delta_weight_vec = td_error * self.eligibility_trace - \
                discount_factor * td_error_estimate_now * phi_prime
            weight_vec += self.learn_rate * Delta_weight_vec
            Delta_GQWeight = (td_error - td_error_estimate_now) * phi
            self.GQWeight += self.learn_rate * \
                self.secondLearningRateCoef * Delta_GQWeight

        expanded = self.representation.post_discover(s, False, a, td_error,
                                                     phi_s)
        if expanded:
            self._expand_vectors(expanded)
        if terminal:
            self.episodeTerminated()
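
Examples #2 and #5 are Greedy-GQ: a second weight vector, GQWeight, tracks an estimate of the TD error in feature space, and that estimate supplies the gradient-correction term subtracted from the main update. A stand-alone sketch of the two coupled updates follows; the names are illustrative (beta plays the role of secondLearningRateCoef) and the trace is simply passed in rather than maintained.

    import numpy as np

    def greedy_gq_step(w, gq_w, trace, phi, phi_prime, r, gamma, alpha, beta):
        # Ordinary TD error under the current value weights w.
        td_error = r + np.dot(gamma * phi_prime - phi, w)
        # Secondary estimate of that TD error, maintained by gq_w.
        td_error_estimate = np.dot(phi, gq_w)
        # Main update: traced TD error minus the gradient-correction term.
        w = w + alpha * (td_error * trace
                         - gamma * td_error_estimate * phi_prime)
        # Secondary update: move gq_w toward the observed TD error.
        gq_w = gq_w + alpha * beta * (td_error - td_error_estimate) * phi
        return w, gq_w, td_error
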
Example #3
    def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
        # The previous state could never be terminal
        # (otherwise the episode would have already terminated)

        prevStateTerminal = False

        # MUST call this at start of learn()
        self.representation.pre_discover(s, prevStateTerminal, a, ns, terminal)

        # Compute feature function values and next action to be taken

        discount_factor = self.discount_factor  # 'gamma' in literature
        feat_weights = self.representation.weight_vec  # Value function, expressed as feature weights
        features_s = self.representation.phi(
            s, prevStateTerminal)  # active feats in state
        features = self.representation.phi_sa(
            s, prevStateTerminal, a,
            features_s)  # active features or an (s,a) pair
        features_prime_s = self.representation.phi(ns, terminal)
        features_prime = self.representation.phi_sa(ns, terminal, na,
                                                    features_prime_s)
        nnz = count_nonzero(features_s)  # Number of non-zero elements

        # Compute td-error
        td_error = r + np.dot(discount_factor * features_prime - features,
                              feat_weights)

        # Update value function (or if TD-learning diverges, take no action)
        if nnz > 0:
            feat_weights_old = feat_weights.copy()
            # Spread the TD error over the active features of the (s, a) pair
            feat_weights += self.learn_rate * td_error * features
            if not np.all(np.isfinite(feat_weights)):
                feat_weights = feat_weights_old
                print "WARNING: TD-Learning diverged, theta reached infinity!"

        # MUST call this at end of learn() - add new features to representation as required.
        expanded = self.representation.post_discover(s, False, a, td_error,
                                                     features_s)

        # MUST call this at end of learn() - handle episode termination cleanup as required.
        if terminal:
            self.episodeTerminated()
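
All five examples share the same learn(s, p_actions, a, r, ns, np_actions, na, terminal) signature, so they are driven the same way from an episode loop. The sketch below shows one plausible loop; env and agent and their methods (s0, possible_actions, step, policy.pi) are assumptions made for illustration and are not part of the code above.

    def run_episode(env, agent):
        # `env` and `agent` are hypothetical objects; only agent.learn(...)
        # corresponds to the methods shown in the examples.
        s, terminal = env.s0(), False
        p_actions = env.possible_actions(s)
        a = agent.policy.pi(s, terminal, p_actions)
        while not terminal:
            r, ns, terminal = env.step(a)
            np_actions = env.possible_actions(ns)
            na = agent.policy.pi(ns, terminal, np_actions)
            agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
            s, p_actions, a = ns, np_actions, na
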
Example #4
    def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):

        # The previous state could never be terminal
        # (otherwise the episode would have already terminated)
        prevStateTerminal = False

        self.representation.pre_discover(s, prevStateTerminal, a, ns, terminal)
        discount_factor = self.discount_factor
        weight_vec = self.representation.weight_vec
        phi_s = self.representation.phi(s, prevStateTerminal)
        phi = self.representation.phi_sa(s, prevStateTerminal, a, phi_s)
        phi_prime_s = self.representation.phi(ns, terminal)
        na = self._future_action(
            ns,
            terminal,
            np_actions,
            phi_prime_s,
            na)  # here comes the difference between SARSA and Q-Learning
        phi_prime = self.representation.phi_sa(
            ns,
            terminal,
            na,
            phi_prime_s)
        nnz = count_nonzero(phi_s)    # Number of non-zero elements

        # Set eligibility traces:
        if self.lambda_:
            expanded = (len(phi) - len(self.eligibility_trace)) // \
                self.representation.actions_num
            if expanded > 0:
                # Correct the size of eligibility traces (pad with zeros for
                # new features)
                self.eligibility_trace = addNewElementForAllActions(
                    self.eligibility_trace,
                    self.representation.actions_num,
                    np.zeros((self.representation.actions_num,
                              expanded)))
                self.eligibility_trace_s = addNewElementForAllActions(
                    self.eligibility_trace_s, 1, np.zeros((1, expanded)))

            self.eligibility_trace *= discount_factor * self.lambda_
            self.eligibility_trace += phi

            self.eligibility_trace_s *= discount_factor * self.lambda_
            self.eligibility_trace_s += phi_s

            # Set max to 1
            self.eligibility_trace[self.eligibility_trace > 1] = 1
            self.eligibility_trace_s[self.eligibility_trace_s > 1] = 1
        else:
            self.eligibility_trace = phi
            self.eligibility_trace_s = phi_s

        td_error = r + np.dot(discount_factor * phi_prime - phi, weight_vec)
        if nnz > 0:
            self.updateLearnRate(
                phi_s,
                phi_prime_s,
                self.eligibility_trace_s,
                discount_factor,
                nnz,
                terminal)
            weight_vec_old = weight_vec.copy()
            weight_vec += self.learn_rate * \
                td_error * self.eligibility_trace
            if not np.all(np.isfinite(weight_vec)):
                weight_vec = weight_vec_old
                print "WARNING: TD-Learning diverged, weight_vec reached infinity!"
        # Discover features if the representation has the discover method
        expanded = self.representation.post_discover(
            s,
            prevStateTerminal,
            a,
            td_error,
            phi_s)

        if terminal:
            # If THIS state is terminal:
            self.episodeTerminated()
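
Examples #1 and #4 grow their eligibility traces with addNewElementForAllActions whenever the representation discovers features, so that the trace length keeps matching len(phi). The helper below is a simplified stand-in for that padding step (an assumption for illustration, not the library's implementation):

    import numpy as np

    def pad_per_action_vector(vec, actions_num, num_new_features):
        # Reshape the flat per-action vector into one row per action, append
        # zero columns for the newly discovered features, then flatten again.
        per_action = vec.reshape(actions_num, -1)
        padding = np.zeros((actions_num, num_new_features))
        return np.hstack((per_action, padding)).ravel()
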
Example #5
    def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
        self.representation.pre_discover(s, False, a, ns, terminal)
        discount_factor = self.discount_factor
        weight_vec = self.representation.weight_vec
        phi_s = self.representation.phi(s, False)
        phi = self.representation.phi_sa(s, False, a, phi_s)
        phi_prime_s = self.representation.phi(ns, terminal)
        na = self.representation.bestAction(
            ns,
            terminal,
            np_actions,
            phi_prime_s)  # Switch na to the best possible action
        phi_prime = self.representation.phi_sa(
            ns,
            terminal,
            na,
            phi_prime_s)
        nnz = count_nonzero(phi_s)    # Number of non-zero elements

        expanded = (len(phi) - len(self.GQWeight)) // \
            self.representation.actions_num
        if expanded:
            self._expand_vectors(expanded)
        # Set eligibility traces:
        if self.lambda_:
            self.eligibility_trace *= discount_factor * self.lambda_
            self.eligibility_trace += phi

            self.eligibility_trace_s *= discount_factor * self.lambda_
            self.eligibility_trace_s += phi_s

            # Set max to 1
            self.eligibility_trace[self.eligibility_trace > 1] = 1
            self.eligibility_trace_s[self.eligibility_trace_s > 1] = 1
        else:
            self.eligibility_trace = phi
            self.eligibility_trace_s = phi_s

        td_error = r + \
            np.dot(discount_factor * phi_prime - phi, weight_vec)
        self.updateLearnRate(
            phi_s,
            phi_prime_s,
            self.eligibility_trace_s,
            discount_factor,
            nnz,
            terminal)

        if nnz > 0:  # Phi has some nonzero elements, proceed with update
            td_error_estimate_now = np.dot(phi, self.GQWeight)
            Delta_weight_vec = td_error * self.eligibility_trace - \
                discount_factor * td_error_estimate_now * phi_prime
            weight_vec += self.learn_rate * Delta_weight_vec
            Delta_GQWeight = (
                td_error - td_error_estimate_now) * phi
            self.GQWeight += self.learn_rate * \
                self.secondLearningRateCoef * Delta_GQWeight

        expanded = self.representation.post_discover(
            s,
            False,
            a,
            td_error,
            phi_s)
        if expanded:
            self._expand_vectors(expanded)
        if terminal:
            self.episodeTerminated()
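
Examples #2 and #5 delegate the same resizing to self._expand_vectors, whose body is not shown here. One plausible shape for such a helper (an assumption about what it might do, not the actual implementation) is to zero-pad GQWeight and both traces in one place:

    def _expand_vectors(self, num_expansions):
        # Assumed sketch: zero-pad every per-action vector kept by the agent
        # so its length matches the grown feature vector phi.
        new_elem = np.zeros((self.representation.actions_num, num_expansions))
        self.GQWeight = addNewElementForAllActions(
            self.GQWeight, self.representation.actions_num, new_elem)
        self.eligibility_trace = addNewElementForAllActions(
            self.eligibility_trace, self.representation.actions_num, new_elem)
        self.eligibility_trace_s = addNewElementForAllActions(
            self.eligibility_trace_s, 1, np.zeros((1, num_expansions)))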