Beispiel #1
0
    def add_refined_feature(self, index1, index2, Q):
        """
        Combine two existing features into a new one and register it.

        The center of the new feature takes each parent's center on that
        parent's dimensions; a coordinate covered by both parents is
        averaged. Its dimensions are the sorted union of the parents'
        dimensions and its base ids are the union of the parents' base ids.
        The merged pair is removed from ``self.candidates`` and new candidate
        pairs between the new feature and compatible existing features are
        added.

        :param index1: index of the first parent in ``self.features``
        :param index2: index of the second parent in ``self.features``;
            ``(index1, index2)`` must be a key of ``self.candidates``
        :param Q: forwarded to ``addNewElementForAllActions`` as the new
            element when ``self.normalization`` is truthy, unused otherwise
        :return: index of the newly added feature
        """
        parent_a = self.features[index1]
        parent_b = self.features[index2]
        new_index = self.features_num

        # Per coordinate: how many parents contribute to it. Coordinates that
        # no parent touches get a divisor of 1 so they simply stay zero.
        contributors = np.zeros_like(parent_a.center)
        contributors[parent_a.dim] += 1
        contributors[parent_b.dim] += 1
        contributors[contributors == 0] = 1.

        merged_center = np.zeros_like(parent_a.center)
        merged_center[parent_a.dim] += parent_a.center[parent_a.dim]
        merged_center[parent_b.dim] += parent_b.center[parent_b.dim]
        merged_center /= contributors

        new_dim = sorted(frozenset(parent_a.dim) | frozenset(parent_b.dim))
        new_base_ids = parent_a.base_ids | parent_b.base_ids

        new_f = KernelizedFeature(
            center=merged_center, dim=new_dim, kernel_args=self.kernel_args,
            kernel=self.kernel, index=new_index, base_ids=new_base_ids)
        self.features.append(new_f)
        # Priority is the negative number of base ids
        self.sorted_ids.push(-len(new_f.base_ids), new_index)
        self.base_id_sets.add(new_f.base_ids)
        # The pair that was just merged is no longer a candidate.
        del self.candidates[(index1, index2)]

        # Collect new candidates: existing features that share no dimension
        # with the new one and whose combined base ids are not yet known.
        new_dim_set = frozenset(new_dim)
        new_cand = {}
        for f in xrange(new_index):
            other = self.features[f]
            if (other.base_ids | new_base_ids) in self.base_id_sets:
                continue
            if frozenset(other.dim) & new_dim_set:
                continue
            new_cand[(f, new_index)] = Candidate(f, new_index)

        # Register the combined base ids only after filtering is complete so
        # that the filter above is evaluated against the unchanged set.
        for f, _ in new_cand:
            self.base_id_sets.add(new_base_ids | self.features[f].base_ids)
        self.candidates.update(new_cand)

        self.logger.debug(
            "Added refined feature {} {}".format(new_index, new_f))
        self.logger.debug("{} candidates".format(len(self.candidates)))
        self.features_num += 1

        # Grow the weight vector by one entry per action; Q is only handed
        # over when normalization is enabled.
        extra_args = (Q,) if self.normalization else ()
        self.weight_vec = addNewElementForAllActions(
            self.weight_vec, self.domain.actions_num, *extra_args)

        return new_index
Beispiel #2
0
 def _expand_vectors(self, num_expansions):
     """
     Resize the GQ weights and eligibility traces after feature expansion.

     ``self.GQWeight`` is always extended with a zero block of shape
     ``(actions_num, num_expansions)``. When ``self.lambda_`` is truthy the
     state-action trace gets the same zero block and the state trace gets a
     ``(1, num_expansions)`` zero block.

     :param num_expansions: number of features that were newly added
     """
     n_actions = self.representation.actions_num
     zero_block = np.zeros((n_actions, num_expansions))

     self.GQWeight = addNewElementForAllActions(
         self.GQWeight, n_actions, zero_block)

     if not self.lambda_:
         # Without eligibility traces there is nothing more to resize.
         return

     # Pad both traces with zeros for the new features.
     self.eligibility_trace = addNewElementForAllActions(
         self.eligibility_trace, n_actions, zero_block)
     state_block = np.zeros((1, num_expansions))
     self.eligibility_trace_s = addNewElementForAllActions(
         self.eligibility_trace_s, 1, state_block)
Beispiel #3
0
 def addNewWeight(self):
     """
     Extend ``self.weight_vec`` for a newly added feature.

     ``addNewElementForAllActions`` is called without an explicit new
     element, so its default is used for every one of the
     ``self.actions_num`` actions (a zero weight, according to the original
     documentation of this method).
     """
     grown_weights = addNewElementForAllActions(
         self.weight_vec, self.actions_num)
     self.weight_vec = grown_weights
Beispiel #4
0
 def addNewWeight(self):
     """
     Extend ``self.weight_vec`` for a newly added feature.

     ``addNewElementForAllActions`` is called without an explicit new
     element, so its default is used for every one of the
     ``self.actions_num`` actions (a zero weight, according to the original
     documentation of this method).
     """
     grown_weights = addNewElementForAllActions(
         self.weight_vec, self.actions_num)
     self.weight_vec = grown_weights
Beispiel #5
0
 def _expand_vectors(self, num_expansions):
     """
     Resize the GQ weights and eligibility traces after feature expansion.

     ``self.GQWeight`` is always extended with a zero block of shape
     ``(actions_num, num_expansions)``. When ``self.lambda_`` is truthy the
     state-action trace gets the same zero block and the state trace gets a
     ``(1, num_expansions)`` zero block.

     :param num_expansions: number of features that were newly added
     """
     n_actions = self.representation.actions_num
     zero_block = np.zeros((n_actions, num_expansions))

     self.GQWeight = addNewElementForAllActions(
         self.GQWeight, n_actions, zero_block)

     if not self.lambda_:
         # Without eligibility traces there is nothing more to resize.
         return

     # Pad both traces with zeros for the new features.
     self.eligibility_trace = addNewElementForAllActions(
         self.eligibility_trace, n_actions, zero_block)
     state_block = np.zeros((1, num_expansions))
     self.eligibility_trace_s = addNewElementForAllActions(
         self.eligibility_trace_s, 1, state_block)
Beispiel #6
0
    def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
        """
        Update the representation's weights from one observed transition.

        The TD error is ``r + dot(discount_factor * phi(ns, na) - phi(s, a),
        weight_vec)``. The eligibility trace is decayed by
        ``discount_factor * lambda_``, incremented by ``phi(s, a)`` and
        clipped at 1 (or simply set to ``phi(s, a)`` when ``lambda_`` is
        falsy). If the state features have any non-zero entry the weights are
        changed in place by ``learn_rate * featureLearningRate() * td_error *
        eligibility_trace``; if that makes any weight non-finite, the
        previous weights are restored and a warning is printed.

        :param s: state in which ``a`` was taken
        :param p_actions: not used by this method
        :param a: action taken in ``s``
        :param r: observed reward
        :param ns: resulting next state
        :param np_actions: forwarded to ``self._future_action``
        :param na: next action; replaced by the result of
            ``self._future_action``
        :param terminal: whether ``ns`` is terminal; if truthy
            ``self.episodeTerminated()`` is called at the end
        """
        # The previous state could never be terminal
        # (otherwise the episode would have already terminated)
        prevStateTerminal = False

        self.representation.pre_discover(s, prevStateTerminal, a, ns, terminal)
        discount_factor = self.discount_factor
        # NOTE: this is a reference to the representation's own array, so the
        # in-place operations below change the representation directly.
        weight_vec = self.representation.weight_vec
        phi_s = self.representation.phi(s, prevStateTerminal)
        phi = self.representation.phi_sa(s, prevStateTerminal, a, phi_s)
        phi_prime_s = self.representation.phi(ns, terminal)
        na = self._future_action(
            ns, terminal, np_actions, phi_prime_s,
            na)  # here comes the difference between SARSA and Q-Learning
        phi_prime = self.representation.phi_sa(ns, terminal, na, phi_prime_s)
        nnz = count_nonzero(phi_s)  # Number of non-zero elements

        # Set eligibility traces:
        if self.lambda_:
            # Number of features (per action) the trace is missing compared
            # to the current feature vector.
            expanded = old_div((- len(self.eligibility_trace) + len(phi)), \
                self.representation.actions_num)
            if expanded > 0:
                # Correct the size of eligibility traces (pad with zeros for
                # new features)
                self.eligibility_trace = addNewElementForAllActions(
                    self.eligibility_trace, self.representation.actions_num,
                    np.zeros((self.representation.actions_num, expanded)))

            self.eligibility_trace *= discount_factor * self.lambda_
            self.eligibility_trace += phi

            # Set max to 1
            self.eligibility_trace[self.eligibility_trace > 1] = 1
        else:
            self.eligibility_trace = phi

        td_error = r + np.dot(discount_factor * phi_prime - phi, weight_vec)
        if nnz > 0:
            self.updateLearnRate(phi, phi_prime, self.eligibility_trace,
                                 discount_factor, nnz, terminal)
            weight_vec_old = weight_vec.copy()
            weight_vec += self.learn_rate * \
                self.representation.featureLearningRate() * \
                td_error * self.eligibility_trace
            if not np.all(np.isfinite(weight_vec)):
                # The update above modified the representation's array in
                # place, so the old values have to be copied back into that
                # same array. Rebinding the local name would leave the
                # representation holding the diverged weights.
                weight_vec[:] = weight_vec_old
                print(
                    "WARNING: TD-Learning diverged, weight_vec reached infinity!"
                )
        # Discover features if the representation has the discover method
        self.representation.post_discover(s, prevStateTerminal, a,
                                          td_error, phi_s)

        if terminal:
            # If THIS state is terminal:
            self.episodeTerminated()
Beispiel #7
0
    def add_base_feature(self, center, dim, Q):
        """
        Add a new feature that is active on a single dimension.

        The feature is appended to ``self.features``, pushed onto
        ``self.sorted_ids`` with priority -1, and paired as a candidate with
        every existing feature that does not already cover ``dim``.

        :param center: center passed to ``KernelizedFeature``
        :param dim: the single dimension the feature is defined on
        :param Q: forwarded to ``addNewElementForAllActions`` as the new
            element when ``self.normalization`` is truthy, unused otherwise
        :return: index of the newly added feature
        """
        new_index = self.features_num
        new_f = KernelizedFeature(
            center=center, dim=[dim], kernel_args=self.kernel_args,
            kernel=self.kernel, index=new_index)
        self.features.append(new_f)

        self.base_id_sets.add(new_f.base_ids)
        self.sorted_ids.push(-1, new_index)
        self.logger.debug("Added Feature {} {}".format(new_index, new_f))

        # Pair the new feature with every existing feature that does not
        # already include this dimension.
        new_cand = {}
        for f in xrange(new_index):
            if dim in self.features[f].dim:
                continue
            new_cand[(f, new_index)] = Candidate(f, new_index)
        self.candidates.update(new_cand)

        # Remember the combined base ids of every new candidate pair.
        for f, _ in new_cand:
            self.base_id_sets.add(new_f.base_ids | self.features[f].base_ids)
        self.features_num += 1

        # Add the parameter dimension; Q is only handed over when
        # normalization is enabled.
        extra_args = (Q,) if self.normalization else ()
        self.weight_vec = addNewElementForAllActions(
            self.weight_vec, self.domain.actions_num, *extra_args)
        return new_index
Beispiel #8
0
 def updateWeight(self, p1_index, p2_index):
     """
     Add a weight for the newly added feature for all actions.

     With ``self.sparsify`` truthy the new weight of each action is the sum
     of the two parents' weights for that action; otherwise ``None`` is
     passed and ``addNewElementForAllActions`` applies its default.

     :param p1_index: feature index of the first parent
     :param p2_index: feature index of the second parent
     """
     n_actions = self.domain.actions_num
     # Feature count before the new feature was added; used as the stride
     # that picks one parent's weight out of every action's slice.
     stride = self.features_num - 1

     newElem = None
     if self.sparsify:
         weights = self.weight_vec
         parent_sum = weights[p1_index::stride] + weights[p2_index::stride]
         # One row per action, single column for the new feature.
         newElem = parent_sum.reshape((-1, 1))

     self.weight_vec = addNewElementForAllActions(
         self.weight_vec, n_actions, newElem)
     # The phi function changed, so a cached hashed phi must not be reused.
     self.hashed_s = None
Beispiel #9
0
 def updateWeight(self, p1_index, p2_index):
     """
     Add a weight for the newly added feature for all actions.

     With ``self.sparsify`` truthy the new weight of each action is the sum
     of the two parents' weights for that action; otherwise ``None`` is
     passed and ``addNewElementForAllActions`` applies its default.

     :param p1_index: feature index of the first parent
     :param p2_index: feature index of the second parent
     """
     n_actions = self.domain.actions_num
     # Feature count before the new feature was added; used as the stride
     # that picks one parent's weight out of every action's slice.
     stride = self.features_num - 1

     newElem = None
     if self.sparsify:
         weights = self.weight_vec
         parent_sum = weights[p1_index::stride] + weights[p2_index::stride]
         # One row per action, single column for the new feature.
         newElem = parent_sum.reshape((-1, 1))

     self.weight_vec = addNewElementForAllActions(
         self.weight_vec, n_actions, newElem)
     # The phi function changed, so a cached hashed phi must not be reused.
     self.hashed_s = None
Beispiel #10
0
    def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
        """
        Update the representation's weights from one observed transition.

        The TD error is ``r + dot(discount_factor * phi(ns, na) - phi(s, a),
        weight_vec)``. Two eligibility traces are maintained: one over
        state-action features (``eligibility_trace``) and one over state
        features (``eligibility_trace_s``). Both are decayed by
        ``discount_factor * lambda_``, incremented by the current features
        and clipped at 1 (or simply set to the current features when
        ``lambda_`` is falsy). If the state features have any non-zero entry
        the weights are changed in place by ``learn_rate * td_error *
        eligibility_trace``; if that makes any weight non-finite, the
        previous weights are restored and a warning is printed.

        :param s: state in which ``a`` was taken
        :param p_actions: not used by this method
        :param a: action taken in ``s``
        :param r: observed reward
        :param ns: resulting next state
        :param np_actions: forwarded to ``self._future_action``
        :param na: next action; replaced by the result of
            ``self._future_action``
        :param terminal: whether ``ns`` is terminal; if truthy
            ``self.episodeTerminated()`` is called at the end
        """
        # The previous state could never be terminal
        # (otherwise the episode would have already terminated)
        prevStateTerminal = False

        self.representation.pre_discover(s, prevStateTerminal, a, ns, terminal)
        discount_factor = self.discount_factor
        # NOTE: this is a reference to the representation's own array, so the
        # in-place operations below change the representation directly.
        weight_vec = self.representation.weight_vec
        phi_s = self.representation.phi(s, prevStateTerminal)
        phi = self.representation.phi_sa(s, prevStateTerminal, a, phi_s)
        phi_prime_s = self.representation.phi(ns, terminal)
        na = self._future_action(
            ns,
            terminal,
            np_actions,
            phi_prime_s,
            na)  # here comes the difference between SARSA and Q-Learning
        phi_prime = self.representation.phi_sa(
            ns,
            terminal,
            na,
            phi_prime_s)
        nnz = count_nonzero(phi_s)    # Number of non-zero elements

        # Set eligibility traces:
        if self.lambda_:
            # Number of features (per action) the trace is missing compared
            # to the current feature vector. Floor division keeps this an
            # integer on both Python 2 and 3, as required for an array shape.
            expanded = (len(phi) - len(self.eligibility_trace)) // \
                self.representation.actions_num
            if expanded > 0:
                # Correct the size of eligibility traces (pad with zeros for
                # new features)
                self.eligibility_trace = addNewElementForAllActions(
                    self.eligibility_trace,
                    self.representation.actions_num,
                    np.zeros((self.representation.actions_num,
                              expanded)))
                self.eligibility_trace_s = addNewElementForAllActions(
                    self.eligibility_trace_s, 1, np.zeros((1, expanded)))

            self.eligibility_trace *= discount_factor * self.lambda_
            self.eligibility_trace += phi

            self.eligibility_trace_s *= discount_factor * self.lambda_
            self.eligibility_trace_s += phi_s

            # Set max to 1
            self.eligibility_trace[self.eligibility_trace > 1] = 1
            self.eligibility_trace_s[self.eligibility_trace_s > 1] = 1
        else:
            self.eligibility_trace = phi
            self.eligibility_trace_s = phi_s

        td_error = r + np.dot(discount_factor * phi_prime - phi, weight_vec)
        if nnz > 0:
            # The learn rate is adapted from the state-only features/trace.
            self.updateLearnRate(
                phi_s,
                phi_prime_s,
                self.eligibility_trace_s,
                discount_factor,
                nnz,
                terminal)
            weight_vec_old = weight_vec.copy()
            weight_vec += self.learn_rate * \
                td_error * self.eligibility_trace
            if not np.all(np.isfinite(weight_vec)):
                # The update above modified the representation's array in
                # place, so the old values have to be copied back into that
                # same array. Rebinding the local name would leave the
                # representation holding the diverged weights.
                weight_vec[:] = weight_vec_old
                # Single-argument call form: valid both as a Python 2 print
                # statement and as a Python 3 print function call.
                print("WARNING: TD-Learning diverged, weight_vec reached infinity!")
        # Discover features if the representation has the discover method
        self.representation.post_discover(
            s,
            prevStateTerminal,
            a,
            td_error,
            phi_s)

        if terminal:
            # If THIS state is terminal:
            self.episodeTerminated()