def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
    # The previous state could never be terminal
    # (otherwise the episode would have already terminated)
    prevStateTerminal = False

    self.representation.pre_discover(s, prevStateTerminal, a, ns, terminal)
    discount_factor = self.discount_factor
    weight_vec = self.representation.weight_vec
    phi_s = self.representation.phi(s, prevStateTerminal)
    phi = self.representation.phi_sa(s, prevStateTerminal, a, phi_s)
    phi_prime_s = self.representation.phi(ns, terminal)
    na = self._future_action(
        ns, terminal, np_actions, phi_prime_s,
        na)  # here comes the difference between SARSA and Q-Learning
    phi_prime = self.representation.phi_sa(ns, terminal, na, phi_prime_s)
    nnz = count_nonzero(phi_s)  # Number of non-zero elements

    # Set eligibility traces:
    if self.lambda_:
        expanded = (len(phi) - len(self.eligibility_trace)) // \
            self.representation.actions_num
        if expanded > 0:
            # Correct the size of eligibility traces (pad with zeros for
            # new features)
            self.eligibility_trace = addNewElementForAllActions(
                self.eligibility_trace,
                self.representation.actions_num,
                np.zeros((self.representation.actions_num, expanded)))

        self.eligibility_trace *= discount_factor * self.lambda_
        self.eligibility_trace += phi

        # Set max to 1
        self.eligibility_trace[self.eligibility_trace > 1] = 1
    else:
        self.eligibility_trace = phi

    td_error = r + np.dot(discount_factor * phi_prime - phi, weight_vec)
    if nnz > 0:
        self.updateLearnRate(phi, phi_prime, self.eligibility_trace,
                             discount_factor, nnz, terminal)
        weight_vec_old = weight_vec.copy()
        weight_vec += self.learn_rate * \
            self.representation.featureLearningRate() * \
            td_error * self.eligibility_trace
        if not np.all(np.isfinite(weight_vec)):
            weight_vec = weight_vec_old
            print(
                "WARNING: TD-Learning diverged, weight_vec reached infinity!")

    # Discover features if the representation has the discover method
    expanded = self.representation.post_discover(
        s, prevStateTerminal, a, td_error, phi_s)

    if terminal:
        # If THIS state is terminal:
        self.episodeTerminated()
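
# The _future_action hook above is the single point where SARSA and
# Q-Learning diverge: on-policy SARSA bootstraps from the action the policy
# actually chose, while off-policy Q-Learning bootstraps from the greedy
# action. Below is a minimal sketch of the two overrides, assuming the
# bestAction interface already used elsewhere in this file; it is an
# illustration of the hook's contract, not the library's exact source.

class SARSAFutureAction(object):
    def _future_action(self, ns, terminal, np_actions, ns_phi, na):
        # On-policy: keep the already-selected next action.
        return na


class QLearningFutureAction(object):
    def _future_action(self, ns, terminal, np_actions, ns_phi, na):
        # Off-policy: replace na with the greedy action in the next state.
        return self.representation.bestAction(ns, terminal, np_actions, ns_phi)
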
def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
    self.representation.pre_discover(s, False, a, ns, terminal)
    discount_factor = self.discount_factor
    weight_vec = self.representation.weight_vec
    phi_s = self.representation.phi(s, False)
    phi = self.representation.phi_sa(s, False, a, phi_s)
    phi_prime_s = self.representation.phi(ns, terminal)
    na = self.representation.bestAction(
        ns, terminal, np_actions,
        phi_prime_s)  # Switch na to the best possible action
    phi_prime = self.representation.phi_sa(ns, terminal, na, phi_prime_s)
    nnz = count_nonzero(phi_s)  # Number of non-zero elements

    # Use integer division: expanded counts whole new features per action
    expanded = (len(phi) - len(self.GQWeight)) // \
        self.representation.actions_num
    if expanded:
        self._expand_vectors(expanded)

    # Set eligibility traces:
    if self.lambda_:
        self.eligibility_trace *= discount_factor * self.lambda_
        self.eligibility_trace += phi
        self.eligibility_trace_s *= discount_factor * self.lambda_
        self.eligibility_trace_s += phi_s

        # Set max to 1
        self.eligibility_trace[self.eligibility_trace > 1] = 1
        self.eligibility_trace_s[self.eligibility_trace_s > 1] = 1
    else:
        self.eligibility_trace = phi
        self.eligibility_trace_s = phi_s

    td_error = r + np.dot(discount_factor * phi_prime - phi, weight_vec)
    self.updateLearnRate(phi_s, phi_prime_s, self.eligibility_trace_s,
                         discount_factor, nnz, terminal)

    if nnz > 0:  # Phi has some nonzero elements, proceed with update
        td_error_estimate_now = np.dot(phi, self.GQWeight)
        Delta_weight_vec = td_error * self.eligibility_trace - \
            discount_factor * td_error_estimate_now * phi_prime
        weight_vec += self.learn_rate * Delta_weight_vec
        Delta_GQWeight = (td_error - td_error_estimate_now) * phi
        self.GQWeight += self.learn_rate * \
            self.secondLearningRateCoef * Delta_GQWeight

    expanded = self.representation.post_discover(s, False, a, td_error, phi_s)
    if expanded:
        self._expand_vectors(expanded)
    if terminal:
        self.episodeTerminated()
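
# The weight_vec update above is the gradient-TD correction of GQ(lambda)
# (Maei & Sutton, 2010): a second weight vector (GQWeight) tracks an estimate
# of the expected TD error, and its projection is subtracted from the
# bootstrap direction. A self-contained numpy sketch of one such two-timescale
# step; alpha, eta (standing in for secondLearningRateCoef), gamma, and the
# feature vectors are illustrative values, not the agent's defaults.

import numpy as np

alpha, eta, gamma = 0.1, 0.5, 0.9
phi = np.array([1.0, 0.0, 1.0])        # phi(s, a)
phi_prime = np.array([0.0, 1.0, 1.0])  # phi(s', a*) for the greedy a*
w = np.zeros(3)                        # main weights (weight_vec)
w2 = np.zeros(3)                       # correction weights (GQWeight)
r = 1.0

e = phi.copy()                         # eligibility trace with lambda = 0
td_error = r + np.dot(gamma * phi_prime - phi, w)
td_error_hat = np.dot(phi, w2)         # current estimate of the TD error

# Main update: ordinary TD step minus the gradient-correction term.
w += alpha * (td_error * e - gamma * td_error_hat * phi_prime)
# Secondary update: move the TD-error estimate toward the sampled error.
w2 += alpha * eta * (td_error - td_error_hat) * phi
print(w, w2)
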
def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
    # The previous state could never be terminal
    # (otherwise the episode would have already terminated)
    prevStateTerminal = False

    # MUST call this at the start of learn()
    self.representation.pre_discover(s, prevStateTerminal, a, ns, terminal)

    # Compute feature function values and the next action to be taken
    discount_factor = self.discount_factor  # 'gamma' in the literature
    feat_weights = self.representation.weight_vec  # Value function, expressed as feature weights
    features_s = self.representation.phi(
        s, prevStateTerminal)  # active features in state s
    features = self.representation.phi_sa(
        s, prevStateTerminal, a, features_s)  # active features of an (s, a) pair
    features_prime_s = self.representation.phi(ns, terminal)
    features_prime = self.representation.phi_sa(
        ns, terminal, na, features_prime_s)
    nnz = count_nonzero(features_s)  # Number of non-zero elements

    # Compute the td-error
    td_error = r + np.dot(discount_factor * features_prime - features,
                          feat_weights)

    # Update the value function (or, if TD-learning diverges, take no action).
    # The step is scaled by the active features, i.e. the gradient of the
    # linear Q estimate with respect to the weights.
    if nnz > 0:
        feat_weights_old = feat_weights.copy()
        feat_weights += self.learn_rate * td_error * features
        if not np.all(np.isfinite(feat_weights)):
            feat_weights = feat_weights_old
            print("WARNING: TD-Learning diverged, theta reached infinity!")

    # MUST call this at the end of learn() - adds new features to the
    # representation as required.
    expanded = self.representation.post_discover(
        s, prevStateTerminal, a, td_error, features_s)

    # MUST call this at the end of learn() - handles episode termination
    # cleanup as required.
    if terminal:
        self.episodeTerminated()
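
# Sanity check for the update above: with one-hot (tabular) features, the
# linear rule w += alpha * td_error * phi reduces to the textbook update
# Q(s, a) += alpha * (r + gamma * Q(s', a') - Q(s, a)). The sizes, indices,
# and constants below are illustrative only.

import numpy as np

n_sa = 4                     # number of (state, action) pairs
w = np.zeros(n_sa)           # one weight per (s, a): w IS the Q-table
phi = np.eye(n_sa)[1]        # one-hot feature of the current (s, a)
phi_prime = np.eye(n_sa)[2]  # one-hot feature of the next (s', a')
alpha, gamma, r = 0.5, 0.9, 1.0

td_error = r + np.dot(gamma * phi_prime - phi, w)
w += alpha * td_error * phi  # only the visited entry changes
assert w[1] == alpha * r and np.all(w[[0, 2, 3]] == 0)
print(w)
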
def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
    # The previous state could never be terminal
    # (otherwise the episode would have already terminated)
    prevStateTerminal = False

    self.representation.pre_discover(s, prevStateTerminal, a, ns, terminal)
    discount_factor = self.discount_factor
    weight_vec = self.representation.weight_vec
    phi_s = self.representation.phi(s, prevStateTerminal)
    phi = self.representation.phi_sa(s, prevStateTerminal, a, phi_s)
    phi_prime_s = self.representation.phi(ns, terminal)
    na = self._future_action(
        ns, terminal, np_actions, phi_prime_s,
        na)  # here comes the difference between SARSA and Q-Learning
    phi_prime = self.representation.phi_sa(ns, terminal, na, phi_prime_s)
    nnz = count_nonzero(phi_s)  # Number of non-zero elements

    # Set eligibility traces:
    if self.lambda_:
        expanded = (len(phi) - len(self.eligibility_trace)) // \
            self.representation.actions_num
        if expanded > 0:
            # Correct the size of eligibility traces (pad with zeros for
            # new features)
            self.eligibility_trace = addNewElementForAllActions(
                self.eligibility_trace,
                self.representation.actions_num,
                np.zeros((self.representation.actions_num, expanded)))
            self.eligibility_trace_s = addNewElementForAllActions(
                self.eligibility_trace_s, 1, np.zeros((1, expanded)))

        self.eligibility_trace *= discount_factor * self.lambda_
        self.eligibility_trace += phi
        self.eligibility_trace_s *= discount_factor * self.lambda_
        self.eligibility_trace_s += phi_s

        # Set max to 1
        self.eligibility_trace[self.eligibility_trace > 1] = 1
        self.eligibility_trace_s[self.eligibility_trace_s > 1] = 1
    else:
        self.eligibility_trace = phi
        self.eligibility_trace_s = phi_s

    td_error = r + np.dot(discount_factor * phi_prime - phi, weight_vec)
    if nnz > 0:
        self.updateLearnRate(phi_s, phi_prime_s, self.eligibility_trace_s,
                             discount_factor, nnz, terminal)
        weight_vec_old = weight_vec.copy()
        weight_vec += self.learn_rate * td_error * self.eligibility_trace
        if not np.all(np.isfinite(weight_vec)):
            weight_vec = weight_vec_old
            print(
                "WARNING: TD-Learning diverged, weight_vec reached infinity!")

    # Discover features if the representation has the discover method
    expanded = self.representation.post_discover(
        s, prevStateTerminal, a, td_error, phi_s)

    if terminal:
        # If THIS state is terminal:
        self.episodeTerminated()
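
# The trace-expansion branch above relies on the addNewElementForAllActions
# helper. A hedged sketch of its assumed behavior: the flat vector is laid
# out as actions_num equal blocks (one per action), and each block gets the
# corresponding row of the new zero matrix appended, so newly discovered
# features start with weight/trace 0. This is a plausible reading for
# illustration, not the library's exact source.

import numpy as np

def add_new_element_for_all_actions(x, actions_num, new_elem):
    old_per_action = len(x) // actions_num
    blocks = x.reshape(actions_num, old_per_action)
    return np.hstack((blocks, new_elem)).ravel()

# Example: 2 actions x 3 features each, grown by 2 zero features per action.
x = np.arange(6, dtype=float)
grown = add_new_element_for_all_actions(x, 2, np.zeros((2, 2)))
print(grown)  # [0. 1. 2. 0. 0. 3. 4. 5. 0. 0.]
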