def fit(self, dataset):
    # Extract features of states, actions, rewards, next states and
    # absorbing flags from the dataset.
    phi_state, action, reward, phi_next_state, absorbing, _ = parse_dataset(
        dataset, self.phi)
    phi_state_action = get_action_features(phi_state, action,
                                           self.mdp_info.action_space.n)

    norm = np.inf
    while norm > self._epsilon:
        # Evaluate the current approximation on the next states; absorbing
        # states contribute no future value.
        q = self.approximator.predict(phi_next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        # Greedy policy improvement: pick the best next action under the
        # current weights.
        next_action = np.argmax(q, axis=1).reshape(-1, 1)
        phi_next_state_next_action = get_action_features(
            phi_next_state, next_action, self.mdp_info.action_space.n)

        # Accumulate the LSTD-Q linear system A w = b.
        tmp = phi_state_action - self.mdp_info.gamma * \
            phi_next_state_next_action
        self._A += phi_state_action.T.dot(tmp)
        self._b += (phi_state_action.T.dot(reward)).reshape(-1, 1)

        old_w = self.approximator.get_weights()
        # Solve exactly when A has full column rank, otherwise fall back to
        # the pseudo-inverse.
        if np.linalg.matrix_rank(self._A) == self._A.shape[1]:
            w = np.linalg.solve(self._A, self._b).ravel()
        else:
            w = np.linalg.pinv(self._A).dot(self._b).ravel()
        self.approximator.set_weights(w)

        # Iterate until the weight change falls below the tolerance.
        norm = np.linalg.norm(w - old_w)
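
# --- Usage sketch -----------------------------------------------------------
# A minimal, self-contained illustration of the least-squares solve at the
# core of the fit loop above, assuming dense NumPy feature matrices. The
# names below (n_samples, phi_sa, phi_nsna) are hypothetical stand-ins, not
# the library's API; random features replace a real dataset.
import numpy as np

rng = np.random.default_rng(0)
n_samples, n_features, gamma = 100, 8, 0.99

phi_sa = rng.normal(size=(n_samples, n_features))    # features of (s, a)
phi_nsna = rng.normal(size=(n_samples, n_features))  # features of (s', a')
reward = rng.normal(size=(n_samples, 1))

# Accumulate the LSTD-Q system A w = b, as in the loop body above.
A = phi_sa.T.dot(phi_sa - gamma * phi_nsna)
b = phi_sa.T.dot(reward)

# Solve exactly when A has full column rank, otherwise fall back to the
# pseudo-inverse, mirroring the branch in fit.
if np.linalg.matrix_rank(A) == A.shape[1]:
    w = np.linalg.solve(A, b).ravel()
else:
    w = np.linalg.pinv(A).dot(b).ravel()
print(w.shape)  # (8,)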
def _update(self, state, action, reward, next_state, absorbing):
    phi_state = self.phi(state)
    phi_state_action = get_action_features(phi_state, action,
                                           self.mdp_info.action_space.n)

    q_current = self.Q.predict(phi_state, action)

    # On the first step of an episode there is no previous estimate yet.
    if self._q_old is None:
        self._q_old = q_current

    alpha = self.alpha(state, action)

    # Dutch-trace update: decay the trace, then add the current features
    # with a correction term that keeps the online update equivalent to the
    # offline forward view.
    e_phi = self.e.dot(phi_state_action)
    self.e = self.mdp_info.gamma * self._lambda * self.e + alpha * (
        1. - self.mdp_info.gamma * self._lambda * e_phi) * phi_state_action

    # SARSA is on-policy: draw the next action now and bootstrap on it,
    # unless the next state is absorbing.
    self.next_action = self.draw_action(next_state)
    phi_next_state = self.phi(next_state)
    q_next = self.Q.predict(phi_next_state,
                            self.next_action) if not absorbing else 0.

    delta = reward + self.mdp_info.gamma * q_next - self._q_old

    theta = self.Q.get_weights()
    theta += delta * self.e + alpha * (
        self._q_old - q_current) * phi_state_action
    self.Q.set_weights(theta)

    self._q_old = q_next
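
# --- Usage sketch -----------------------------------------------------------
# A minimal, self-contained illustration of one true-online TD step with a
# dutch eligibility trace, mirroring the trace and weight updates in _update
# above. All names below (theta, e, phi_sa, phi_nsna, q_old) are hypothetical
# stand-ins for the agent's internal state, not the class itself.
import numpy as np

rng = np.random.default_rng(0)
n_features, gamma, lam, alpha = 8, 0.99, 0.9, 0.1

theta = np.zeros(n_features)  # weight vector
e = np.zeros(n_features)      # dutch eligibility trace
q_old = 0.                    # value estimate carried over from the last step

phi_sa = rng.normal(size=n_features)    # features of (s, a)
phi_nsna = rng.normal(size=n_features)  # features of (s', a'); zeros if absorbing
reward = 1.

q_current = theta.dot(phi_sa)
q_next = theta.dot(phi_nsna)

# Dutch-trace update: decay the trace, then add the current features with a
# correction that keeps the online update equivalent to the forward view.
e = gamma * lam * e + alpha * (1. - gamma * lam * e.dot(phi_sa)) * phi_sa

delta = reward + gamma * q_next - q_old
theta = theta + delta * e + alpha * (q_old - q_current) * phi_sa
q_old = q_next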