def add_refined_feature(self, index1, index2, Q):
    """
    Adds the combination of two existing features to the representation
    and returns the new feature's index.
    """
    f1 = self.features[index1]
    f2 = self.features[index2]
    # Combine the parents' centers; the candidate construction below
    # guarantees that the parents cover disjoint dimensions.
    new_center = np.zeros_like(f1.center)
    cnt = np.zeros_like(f1.center)
    cnt[f1.dim] += 1
    cnt[f2.dim] += 1
    cnt[cnt == 0] = 1.
    new_center[f1.dim] += f1.center[f1.dim]
    new_center[f2.dim] += f2.center[f2.dim]
    new_center /= cnt
    new_dim = sorted(frozenset(f1.dim) | frozenset(f2.dim))
    new_base_ids = f1.base_ids | f2.base_ids
    new_f = KernelizedFeature(
        center=new_center, dim=new_dim, kernel_args=self.kernel_args,
        kernel=self.kernel, index=self.features_num, base_ids=new_base_ids)
    self.features.append(new_f)
    # Priority is the negative number of base ids
    self.sorted_ids.push(-len(new_f.base_ids), self.features_num)
    self.base_id_sets.add(new_f.base_ids)
    del self.candidates[(index1, index2)]

    # Add combinations of the new feature with all compatible existing
    # features (disjoint dimensions, conjunction not yet seen) as candidates.
    new_cand = {(f, self.features_num): Candidate(f, self.features_num)
                for f in range(self.features_num)
                if (self.features[f].base_ids | new_base_ids) not in self.base_id_sets
                and len(frozenset(self.features[f].dim) & frozenset(new_dim)) == 0}
    for c, _ in new_cand:
        self.base_id_sets.add(new_base_ids | self.features[c].base_ids)
    self.candidates.update(new_cand)
    self.logger.debug(
        "Added refined feature {} {}".format(self.features_num, new_f))
    self.logger.debug("{} candidates".format(len(self.candidates)))
    self.features_num += 1

    # Grow the weight vector by one entry per action; with normalization
    # the new weight is initialized from Q, otherwise with zeros.
    if self.normalization:
        self.weight_vec = addNewElementForAllActions(
            self.weight_vec, self.domain.actions_num, Q)
    else:
        self.weight_vec = addNewElementForAllActions(
            self.weight_vec, self.domain.actions_num)
    return self.features_num - 1
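# A minimal standalone sketch (hypothetical centers and dimensions) of the
# center-combination arithmetic above: because candidate parents always cover
# disjoint dimensions, each entry of the merged center comes from exactly one
# parent, and cnt only guards against division by zero on uncovered dims.
import numpy as np

c1, dims1 = np.array([0.5, 0.0, 0.0]), [0]       # parent 1 covers dim 0
c2, dims2 = np.array([0.0, 0.3, 0.7]), [1, 2]    # parent 2 covers dims 1, 2
new_center = np.zeros_like(c1)
cnt = np.zeros_like(c1)
cnt[dims1] += 1
cnt[dims2] += 1
cnt[cnt == 0] = 1.                               # avoid division by zero
new_center[dims1] += c1[dims1]
new_center[dims2] += c2[dims2]
new_center /= cnt
print(new_center)                                   # [0.5 0.3 0.7]
print(sorted(frozenset(dims1) | frozenset(dims2)))  # merged dims: [0, 1, 2]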
def _expand_vectors(self, num_expansions):
    """
    Corrects the sizes of the GQ weight vector and the eligibility traces
    after new features have been expanded, padding new entries with zeros.
    """
    new_elem = np.zeros((self.representation.actions_num, num_expansions))
    self.GQWeight = addNewElementForAllActions(
        self.GQWeight, self.representation.actions_num, new_elem)
    if self.lambda_:
        # Correct the size of eligibility traces (pad with zeros for new
        # features)
        self.eligibility_trace = addNewElementForAllActions(
            self.eligibility_trace, self.representation.actions_num, new_elem)
        self.eligibility_trace_s = addNewElementForAllActions(
            self.eligibility_trace_s, 1, np.zeros((1, num_expansions)))
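# A minimal sketch of the padding semantics this code relies on. The real
# addNewElementForAllActions comes from the library's tools module; this
# stand-in assumes it views the flat weight vector as one row per action,
# appends the given columns (zeros if None), and flattens back.
import numpy as np

def add_new_element_for_all_actions(vec, actions_num, new_elem=None):
    per_action = vec.reshape(actions_num, -1)
    if new_elem is None:
        new_elem = np.zeros((actions_num, 1))
    return np.hstack((per_action, new_elem)).flatten()

w = np.arange(6.0)                        # 2 actions x 3 features
w = add_new_element_for_all_actions(w, 2, np.zeros((2, 2)))
print(w.reshape(2, -1))                   # each action padded with 2 zero weights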
def addNewWeight(self):
    """
    Adds a new zero weight, corresponding to a newly added feature,
    to all actions.
    """
    self.weight_vec = addNewElementForAllActions(
        self.weight_vec, self.actions_num)
def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
    # The previous state could never be terminal
    # (otherwise the episode would have already terminated)
    prevStateTerminal = False
    self.representation.pre_discover(s, prevStateTerminal, a, ns, terminal)
    discount_factor = self.discount_factor
    weight_vec = self.representation.weight_vec
    phi_s = self.representation.phi(s, prevStateTerminal)
    phi = self.representation.phi_sa(s, prevStateTerminal, a, phi_s)
    phi_prime_s = self.representation.phi(ns, terminal)
    na = self._future_action(ns, terminal, np_actions, phi_prime_s, na)
    # here comes the difference between SARSA and Q-Learning
    phi_prime = self.representation.phi_sa(ns, terminal, na, phi_prime_s)
    nnz = count_nonzero(phi_s)  # number of non-zero elements

    # Set eligibility traces:
    if self.lambda_:
        # Number of features added per action since the trace was last resized
        expanded = (len(phi) - len(self.eligibility_trace)) // \
            self.representation.actions_num
        if expanded > 0:
            # Correct the size of eligibility traces (pad with zeros for
            # new features)
            self.eligibility_trace = addNewElementForAllActions(
                self.eligibility_trace,
                self.representation.actions_num,
                np.zeros((self.representation.actions_num, expanded)))
        self.eligibility_trace *= discount_factor * self.lambda_
        self.eligibility_trace += phi
        # Set max to 1
        self.eligibility_trace[self.eligibility_trace > 1] = 1
    else:
        self.eligibility_trace = phi

    td_error = r + np.dot(discount_factor * phi_prime - phi, weight_vec)
    if nnz > 0:
        self.updateLearnRate(phi, phi_prime, self.eligibility_trace,
                             discount_factor, nnz, terminal)
        weight_vec_old = weight_vec.copy()
        weight_vec += self.learn_rate * \
            self.representation.featureLearningRate() * \
            td_error * self.eligibility_trace
        if not np.all(np.isfinite(weight_vec)):
            # Restore in place so the representation's weights roll back too
            weight_vec[:] = weight_vec_old
            print("WARNING: TD-Learning diverged, weight_vec reached infinity!")

    # Discover features if the representation has the discover method
    expanded = self.representation.post_discover(
        s, prevStateTerminal, a, td_error, phi_s)
    if terminal:
        # If THIS state is terminal:
        self.episodeTerminated()
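# A minimal numeric sketch (hypothetical values) of the accumulate-then-cap
# trace update used above: decay by discount_factor * lambda_, add the active
# features, then clip at 1 so repeatedly visited features saturate.
import numpy as np

e = np.array([0.0, 1.0, 0.4])
phi = np.array([1.0, 1.0, 0.0])
discount_factor, lambda_ = 0.9, 0.8
e = e * discount_factor * lambda_ + phi
e[e > 1] = 1.0
print(e)                                  # [1.    1.    0.288]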
def add_base_feature(self, center, dim, Q):
    """
    Adds a new one-dimensional feature and returns its index.
    """
    new_f = KernelizedFeature(
        center=center, dim=[dim], kernel_args=self.kernel_args,
        kernel=self.kernel, index=self.features_num)
    self.features.append(new_f)
    self.base_id_sets.add(new_f.base_ids)
    self.sorted_ids.push(-1, self.features_num)
    self.logger.debug(
        "Added Feature {} {}".format(self.features_num, new_f))

    # Add combinations with all existing features that cover other
    # dimensions as candidates.
    new_cand = {(f, self.features_num): Candidate(f, self.features_num)
                for f in range(self.features_num)
                if dim not in self.features[f].dim}
    self.candidates.update(new_cand)
    for f, _ in new_cand:
        self.base_id_sets.add(new_f.base_ids | self.features[f].base_ids)
    self.features_num += 1

    # Add a parameter dimension: one new weight per action, initialized
    # from Q when normalization is enabled, otherwise with zeros.
    if self.normalization:
        self.weight_vec = addNewElementForAllActions(
            self.weight_vec, self.domain.actions_num, Q)
    else:
        self.weight_vec = addNewElementForAllActions(
            self.weight_vec, self.domain.actions_num)
    return self.features_num - 1
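# A minimal sketch (hypothetical ids) of the base_id_sets bookkeeping above:
# each base feature carries a frozenset of base ids, and the union for every
# proposed candidate is registered so the same conjunction is never added twice.
f0_ids, f1_ids = frozenset([0]), frozenset([1])
base_id_sets = {f0_ids, f1_ids}
base_id_sets.add(f0_ids | f1_ids)         # register the candidate {0, 1}
print((f0_ids | f1_ids) in base_id_sets)  # True: duplicate proposals filtered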
def updateWeight(self, p1_index, p2_index):
    """
    Adds a new weight, corresponding to the newly added feature, to all
    actions. The new weight is zero if sparsify is False; otherwise it
    equals the sum of the weights of the two parent features.
    """
    a = self.domain.actions_num
    # Number of features before adding the new one
    f = self.features_num - 1
    if self.sparsify:
        # weight_vec is laid out action-major, so a stride of f picks a
        # parent's weight for every action.
        newElem = (self.weight_vec[p1_index::f] +
                   self.weight_vec[p2_index::f]).reshape((-1, 1))
    else:
        newElem = None
    self.weight_vec = addNewElementForAllActions(self.weight_vec, a, newElem)
    # Don't reuse the cached (hashed) phi, because the phi function has changed!
    self.hashed_s = None
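# A minimal sketch (hypothetical values) of the strided indexing above: with
# weight_vec laid out action-major over f features, weight_vec[p::f] picks
# feature p's weight for every action, so the parents' per-action weights can
# be summed to initialize the new feature's weights.
import numpy as np

f, a = 3, 2                                 # features before the split, actions
weight_vec = np.arange(f * a, dtype=float)  # [0. 1. 2. 3. 4. 5.]
p1_index, p2_index = 0, 2
newElem = (weight_vec[p1_index::f] +
           weight_vec[p2_index::f]).reshape((-1, 1))
print(newElem)                              # [[2.] [8.]]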
def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
    # The previous state could never be terminal
    # (otherwise the episode would have already terminated)
    prevStateTerminal = False
    self.representation.pre_discover(s, prevStateTerminal, a, ns, terminal)
    discount_factor = self.discount_factor
    weight_vec = self.representation.weight_vec
    phi_s = self.representation.phi(s, prevStateTerminal)
    phi = self.representation.phi_sa(s, prevStateTerminal, a, phi_s)
    phi_prime_s = self.representation.phi(ns, terminal)
    na = self._future_action(ns, terminal, np_actions, phi_prime_s, na)
    # here comes the difference between SARSA and Q-Learning
    phi_prime = self.representation.phi_sa(ns, terminal, na, phi_prime_s)
    nnz = count_nonzero(phi_s)  # number of non-zero elements

    # Set eligibility traces:
    if self.lambda_:
        # Number of features added per action since the traces were last resized
        expanded = (len(phi) - len(self.eligibility_trace)) // \
            self.representation.actions_num
        if expanded > 0:
            # Correct the size of eligibility traces (pad with zeros for
            # new features)
            self.eligibility_trace = addNewElementForAllActions(
                self.eligibility_trace,
                self.representation.actions_num,
                np.zeros((self.representation.actions_num, expanded)))
            self.eligibility_trace_s = addNewElementForAllActions(
                self.eligibility_trace_s, 1, np.zeros((1, expanded)))
        self.eligibility_trace *= discount_factor * self.lambda_
        self.eligibility_trace += phi
        self.eligibility_trace_s *= discount_factor * self.lambda_
        self.eligibility_trace_s += phi_s
        # Set max to 1
        self.eligibility_trace[self.eligibility_trace > 1] = 1
        self.eligibility_trace_s[self.eligibility_trace_s > 1] = 1
    else:
        self.eligibility_trace = phi
        self.eligibility_trace_s = phi_s

    td_error = r + np.dot(discount_factor * phi_prime - phi, weight_vec)
    if nnz > 0:
        self.updateLearnRate(phi_s, phi_prime_s, self.eligibility_trace_s,
                             discount_factor, nnz, terminal)
        weight_vec_old = weight_vec.copy()
        weight_vec += self.learn_rate * td_error * self.eligibility_trace
        if not np.all(np.isfinite(weight_vec)):
            # Restore in place so the representation's weights roll back too
            weight_vec[:] = weight_vec_old
            print("WARNING: TD-Learning diverged, weight_vec reached infinity!")

    # Discover features if the representation has the discover method
    expanded = self.representation.post_discover(
        s, prevStateTerminal, a, td_error, phi_s)
    if terminal:
        # If THIS state is terminal:
        self.episodeTerminated()
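# A minimal sketch of the on-policy / off-policy split flagged by the
# "difference between SARSA and Q-Learning" comment in both learn() variants.
# The method names and the bestAction call are assumptions for illustration,
# not a verified API.
def _future_action_sarsa(self, ns, terminal, np_actions, phi_prime_s, na):
    # SARSA is on-policy: bootstrap with the action the agent actually takes.
    return na

def _future_action_q_learning(self, ns, terminal, np_actions, phi_prime_s, na):
    # Q-Learning is off-policy: bootstrap with the greedy action instead.
    return self.representation.bestAction(ns, terminal, np_actions, phi_prime_s)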