def find_closest(x_mat: ndarray, centroids_mat: ndarray):
    """
    Assign every sample to its nearest centroid (the k-means assignment step).

    Closeness is measured by squared Euclidean distance. When several
    centroids are equally close, the one with the lowest row index is kept.

    :param x_mat: feature matrix; the m rows are samples, the n columns are features
    :param centroids_mat: centroid matrix, one centroid per row
    :return: integer vector of length m holding, for each sample, the row
        index of its closest centroid
    """
    x_mat = __t.r2m(x_mat)
    centroids_mat = __t.r2m(centroids_mat)

    centroid_count = centroids_mat.shape[0]
    sample_count = x_mat.shape[0]
    idx = np.empty((sample_count, ), dtype=int)

    for row in range(sample_count):
        sample = x_mat[row, :]
        # Centroid 0 is the incumbent; every other centroid challenges it.
        best = 0
        best_dist = np.sum((sample - centroids_mat[0, :])**2)
        for candidate in range(1, centroid_count):
            candidate_dist = np.sum((sample - centroids_mat[candidate, :])**2)
            # Strict comparison keeps the earliest centroid on ties.
            if candidate_dist < best_dist:
                best, best_dist = candidate, candidate_dist
        idx[row] = best
    return idx
def compute_centroids(x_mat: ndarray, idx_vec: ndarray):
    """
    Recompute each centroid as the mean of the samples assigned to it
    (the k-means update step).

    The number of centroids is inferred as ``max(idx_vec) + 1``. A centroid
    that received no samples is left as an all-zero row.

    :param x_mat: feature matrix; the m rows are samples, the n columns are features
    :param idx_vec: for each sample, the index of the centroid it belongs to
    :return: array of shape (k, n) with the updated centroids
    """
    x_mat = __t.r2m(x_mat)
    idx_vec = __t.c2r(idx_vec)
    m, n = x_mat.shape
    k = np.max(idx_vec) + 1

    sums = np.zeros((k, n))
    counts = np.zeros((k, ))

    # First pass: accumulate per-cluster feature sums and member counts.
    for row in range(m):
        cluster = idx_vec[row]
        sums[cluster, :] = sums[cluster, :] + x_mat[row, :]
        counts[cluster] = counts[cluster] + 1

    # Second pass: turn sums into means, skipping empty clusters to avoid 0/0.
    for cluster, members in enumerate(counts):
        if members > 0:
            sums[cluster, :] = sums[cluster, :] / members
    return sums
def predict(self, x_mat: ndarray) -> Union[ndarray, int]:
    """
    Return the predicted label for every sample in ``x_mat``.

    With the ``'ovr'`` strategy the result of the internal ``__predict`` is
    returned directly. For any other strategy each sample's label is chosen
    by majority vote over column ``i`` of that result; when several labels
    tie on vote count, the largest label wins.

    :param x_mat: feature matrix; the m rows are samples, the n columns are features
    :return: predicted labels, passed through ``_t.ret``
    :raises StateError: if the learner has not been trained yet
    """
    if not self._trained:
        raise StateError('not trained yet')

    x_mat = _t.r2m(x_mat)
    pred = self.__predict(x_mat)
    if self.strategy == 'ovr':
        return _t.ret(pred)

    sample_count = x_mat.shape[0]
    result = np.empty((sample_count, ))
    for col in range(sample_count):
        votes = pred[:, col]
        # (count, label) pairs sort by count first, then by label, so the
        # last element is the most-voted label (largest label on a tie).
        tally = sorted([(np.sum(votes == label), label) for label in set(votes)])
        result[col] = tally[-1][1]
    return _t.ret(result)
def map_features(x_mat: ndarray, *, degrees: int = 3, variables: int = 2) -> ndarray:
    """
    Expand first-order features into polynomial features.

    Useful for linear and logistic regression. ``x_mat`` must NOT contain an
    intercept column; a column of ones is prepended to the output here.

    :param x_mat: feature matrix; the m rows are samples, the n columns are features
    :param degrees: maximum total degree of a generated term; expected to be >= 2
    :param variables: maximum number of features multiplied together in one
        term. It is clamped to ``degrees`` and to the number of features.
        When it equals 1, only pure powers of single features are generated
    :return: the mapped feature matrix: ones column, original features, then
        the generated higher-degree terms
    """
    x_mat = __t.r2m(x_mat)
    m = x_mat.shape[0]
    n = x_mat.shape[1]
    if variables > degrees:
        variables = degrees
    if variables > n:
        variables = n

    # Both branches start with the bias column followed by the raw features.
    columns = [np.ones((m, 1)), x_mat]

    if variables == 1:
        # Pure powers only: x_i ** d for every feature and every degree >= 2.
        for power in range(2, degrees + 1):
            for col in range(n):
                columns.append(x_mat[:, col:col + 1]**power)
        return np.hstack(columns)

    feature_indices = list(range(n))
    degree_coms = []
    for degree in range(2, degrees + 1):
        # Pick every group of `variables` features out of the n available.
        for chosen in combinations(feature_indices, variables):
            # Transposed so that iterating yields one feature column at a time
            # (iterating an ndarray walks its rows).
            chosen_features = x_mat[:, chosen].T
            # NOTE(review): __degree_combinations is defined elsewhere; it
            # presumably fills degree_coms with exponent tuples for this
            # degree — confirm against the helper.
            exponent_seed = list(repeat(0, variables))
            __degree_combinations(degree, variables, exponent_seed, degree_coms)
            for exponents in degree_coms:
                term = 1
                for feature, exponent in zip(chosen_features, exponents):
                    term *= 1 if exponent == 0 else feature**exponent
                columns.append(term.reshape((term.shape[0], 1)))
            degree_coms.clear()
    return np.hstack(columns)
def predict(self, x_mat: ndarray) -> Union[ndarray, int]:
    """
    Predict for new data.

    Per the declared contract, a single number is returned when ``x_mat``
    holds one sample and a row vector otherwise; the actual shaping is done
    by the internal ``__predict``.

    :param x_mat: feature matrix; the m rows are samples, the n columns are features
    :return: predicted value or vector of predicted values
    :raises StateError: if no tree has been built yet (``_root`` is None)
    :raises DataNotMatchError: if the number of feature columns differs
        from ``len(self.pvs)``
    """
    if self._root is None:
        raise StateError('not trained yet')

    x_mat = _t.r2m(x_mat)
    feature_count = x_mat.shape[1]
    if feature_count != len(self.pvs):
        raise DataNotMatchError('feature quantity mismatch')

    return self.__predict(self._root, x_mat)
def __match_theta_x(self, x_mat: ndarray) -> ndarray:
    """
    Check that the input is compatible with the trained parameters.

    :param x_mat: feature matrix; the m rows are samples, the n columns are features
    :return: ``x_mat`` after normalisation by ``_t.r2m``
    :raises StateError: if ``_theta`` is None (the model has not been trained)
    :raises DataNotMatchError: if the number of feature columns differs
        from the length of ``_theta``
    """
    if self._theta is None:
        raise StateError('not trained yet')

    x_mat = _t.r2m(x_mat)
    feature_count = x_mat.shape[1]
    if feature_count != self._theta.shape[0]:
        raise DataNotMatchError('feature quantity mismatch')

    return x_mat
def train(x_mat: ndarray, k: int, *, max_iters: int = 10, initial_centroids: Iterable = None, history: bool = False):
    """
    Run k-means clustering.

    Exactly ``max_iters`` assignment/update rounds are performed; there is
    no early stop on convergence.

    :param x_mat: feature matrix; the m rows are samples, the n columns are features
    :param k: number of clusters
    :param max_iters: number of iterations to run
    :param initial_centroids: starting centroids; when omitted, ``k`` rows
        of ``x_mat`` are picked at random
    :param history: whether to also return the centroids of every iteration
    :return: the final centroids and the per-sample centroid index vector;
        when ``history`` is true, additionally the list of centroids
        starting with the initial ones
    """
    x_mat = __t.r2m(x_mat)
    m, _ = x_mat.shape

    centroids = initial_centroids
    if centroids is None:
        # Shuffle the row indices and take the first k rows as seeds.
        order = np.arange(0, m)
        np.random.shuffle(order)
        centroids = x_mat[order[:k], :]
    if not isinstance(centroids, ndarray):
        centroids = np.asarray(centroids)

    idx = None
    centroids_history = [centroids] if history else None

    for _ in range(max_iters):
        idx = find_closest(x_mat, centroids)
        centroids = compute_centroids(x_mat, idx)
        if history:
            centroids_history.append(centroids)

    if history:
        return centroids, idx, centroids_history
    return centroids, idx
def __evaluation(self, x_mat: ndarray, eval: Callable[[IProbabilityLearner], ndarray]):
    """
    Build a per-sample, per-label score matrix from the underlying learners.

    With the ``'ovr'`` strategy, column ``i`` is the result of ``eval`` on
    the i-th learner. Otherwise, column ``i`` is the average of ``eval``
    over the learners keyed ``(label_i, other_label)`` for every other label.

    :param x_mat: feature matrix; only its number of rows m is used here
    :param eval: callable applied to each learner, expected to return one
        value per sample
    :return: float array of shape (m, number of labels)
    :raises StateError: if the learner has not been trained yet
    """
    if not self._trained:
        raise StateError('not trained yet')

    x_mat = _t.r2m(x_mat)
    sample_count = x_mat.shape[0]
    label_count = len(self._labels)
    evaluation = np.zeros((sample_count, label_count), dtype=float)

    if self.strategy == 'ovr':
        for col, learner in enumerate(self._learners.values()):
            evaluation[:, col] = eval(learner)
        return evaluation

    for col, positive in enumerate(self.labels):
        for negative in self.labels:
            if positive == negative:
                continue
            pair_learner = self._learners[(positive, negative)]
            evaluation[:, col] = evaluation[:, col] + eval(pair_learner)
        # Each label meets every other label once, hence label_count - 1 terms.
        evaluation[:, col] = evaluation[:, col] / (label_count - 1)
    return evaluation
def predict(self, x_mat: ndarray):
    """
    Return the predicted label for every sample in ``x_mat``.

    For the ``'linear'`` kernel the decision value is
    ``x_mat @ theta + b``. For ``'gauss'`` or a user-supplied callable
    kernel ``K`` it is ``sum_j alphas[j] * y[j] * K(x, x_train[j]) + b``
    over all stored training samples. A sample is marked positive when its
    decision value is >= 0.

    :param x_mat: feature matrix; the m rows are samples, the n columns are features
    :return: predicted labels, converted with ``_t.convert_y`` and ``_t.ret``
    :raises StateError: if ``_theta`` is None (the model has not been trained)
    :raises DataNotMatchError: if the number of feature columns differs
        from the length of ``_theta``
    """
    if self._theta is None:
        raise StateError('not trained yet')

    x_mat = _t.r2m(x_mat)
    if x_mat.shape[1] != self._theta.shape[0]:
        raise DataNotMatchError('feature quantity mismatch')

    if self.kernel == 'linear':
        pred = x_mat @ self._theta + self._b
    else:
        if self.kernel == 'gauss':
            def kernel(a, b):
                return gaussian_kernel(a, b, self.gamma)
        else:
            # Any other value is treated as a user-supplied kernel callable.
            kernel = self.kernel

        m = x_mat.shape[0]
        # The accumulator must start at zero. np.empty returns uninitialised
        # memory, which would be added into every decision value.
        pred = np.zeros((m, ))
        for i in range(m):
            for j in range(self._x_mat.shape[0]):
                pred[i] = pred[i] + self._alphas[j] * self._y_row[j] * kernel(
                    x_mat[i], self._x_mat[j])
            pred[i] = pred[i] + self._b

    return _t.ret(
        _t.convert_y(self.labels, (pred >= 0).astype(dtype=np.int16), to=False))
def x_cv(self, value: Optional[ndarray]):
    """
    Set the ``_x_cv`` attribute (presumably the cross-validation feature
    matrix, judging by the name).

    :param value: the new matrix, normalised with ``_t.r2m`` before being
        stored; pass None to clear the attribute
    """
    self._x_cv = None if value is None else _t.r2m(value)