예제 #1
0
def find_closest(x_mat: ndarray, centroids_mat: ndarray):
    """
    Assign every sample to its nearest centroid (k-means assignment step).

    Nearness is measured by the squared Euclidean distance. When several
    centroids are equally close, the one with the smallest index wins.

    :param x_mat: feature matrix; the m rows are samples, the n columns are
                  features
    :param centroids_mat: cluster centroids, one centroid per row
    :return: integer vector of length m holding, for each sample, the row
             index of its nearest centroid
    """

    x_mat = __t.r2m(x_mat)
    centroids_mat = __t.r2m(centroids_mat)

    sample_count = x_mat.shape[0]
    centroid_count = centroids_mat.shape[0]
    nearest = np.empty((sample_count, ), dtype=int)

    for row in range(sample_count):
        sample = x_mat[row, :]
        # Start from centroid 0 and only switch on a strictly smaller
        # distance, so ties resolve to the lowest index.
        best = 0
        best_sq_dist = np.sum((sample - centroids_mat[0, :])**2)
        for candidate in range(1, centroid_count):
            sq_dist = np.sum((sample - centroids_mat[candidate, :])**2)
            if sq_dist < best_sq_dist:
                best, best_sq_dist = candidate, sq_dist
        nearest[row] = best

    return nearest
예제 #2
0
def compute_centroids(x_mat: ndarray, idx_vec: ndarray, k: int = None):
    """
    Recompute every centroid as the mean of the samples assigned to it
    (k-means update step).

    :param x_mat: feature matrix; the m rows are samples, the n columns are
                  features
    :param idx_vec: for each sample, the index of the centroid it is
                    currently assigned to
    :param k: number of clusters. When omitted it is inferred as
              ``max(idx_vec) + 1``, which silently drops any empty clusters
              whose index is above the largest index in use; pass ``k``
              explicitly to always get ``k`` rows back
    :return: the new centroids, one per row. A cluster without any assigned
             sample is returned as a row of zeros
    :raises ValueError: if ``k`` is not larger than every index in idx_vec
    """

    x_mat = __t.r2m(x_mat)
    idx_vec = __t.c2r(idx_vec)

    m, n = x_mat.shape
    # np.max raises on an empty array, so treat "no assignments" as needing
    # zero clusters instead of crashing.
    inferred_k = np.max(idx_vec) + 1 if np.size(idx_vec) > 0 else 0
    if k is None:
        k = inferred_k
    elif k < inferred_k:
        raise ValueError('k must be greater than every index in idx_vec')

    centroids = np.zeros((k, n))
    cen_num = np.zeros((k, ))

    # Accumulate per-cluster sums and sample counts.
    for i in range(m):
        label = idx_vec[i]
        centroids[label, :] = centroids[label, :] + x_mat[i, :]
        cen_num[label] = cen_num[label] + 1

    # Turn sums into means; empty clusters are skipped to avoid 0 / 0.
    occupied = cen_num > 0
    centroids[occupied] = centroids[occupied] / cen_num[occupied][:, np.newaxis]

    return centroids
예제 #3
0
    def predict(self, x_mat: ndarray) -> Union[ndarray, int]:
        """
        Predict the label of every sample in x_mat.

        With the 'ovr' strategy the result of the internal prediction is
        returned directly. With any other strategy the internal prediction
        is treated as one row of votes per learner and one column per
        sample, and each sample gets the label with the most votes (on a
        tie, the largest label).

        :param x_mat: feature matrix; the m rows are samples, the n columns
                      are features
        :return: predicted labels
        :raises StateError: if the model has not been trained yet
        """

        if not self._trained:
            raise StateError('not trained yet')

        x_mat = _t.r2m(x_mat)
        pred = self.__predict(x_mat)

        if self.strategy == 'ovr':
            return _t.ret(pred)

        sample_count = x_mat.shape[0]
        winners = np.empty((sample_count, ))
        for col in range(sample_count):
            votes = pred[:, col]
            # (vote count, label) pairs; after sorting, the last pair is the
            # most voted label.
            tally = [(np.sum(votes == label), label) for label in set(votes)]
            tally.sort()
            winners[col] = tally[-1][1]

        return _t.ret(winners)
예제 #4
0
def map_features(x_mat: ndarray,
                 *,
                 degrees: int = 3,
                 variables: int = 2) -> ndarray:
    """
    Expand first-order features into polynomial features, e.g. for linear
    or logistic regression.

    The result starts with a column of ones followed by the original
    features, so x_mat must not already contain an intercept column.

    :param x_mat: feature matrix; the m rows are samples, the n columns are
                  features
    :param degrees: highest total degree of a generated term; expected to
                    be at least 2
    :param variables: maximum number of features multiplied together in one
                      term. It is clamped to ``degrees`` and to the number
                      of features. With 1, only pure powers of each single
                      feature are generated
    :return: the mapped feature matrix
    """

    x_mat = __t.r2m(x_mat)
    sample_count, feature_count = x_mat.shape
    variables = min(variables, degrees, feature_count)

    # Column blocks in output order: intercept, then the degree-1 features.
    blocks = [np.ones((sample_count, 1)), x_mat]

    if variables == 1:
        # One block per power, each holding that power of every feature.
        blocks.extend(x_mat**power for power in range(2, degrees + 1))
    else:
        feature_ids = list(range(feature_count))

        for total_degree in range(2, degrees + 1):
            # Pick `variables` of the n features.
            for chosen in combinations(feature_ids, variables):
                # Transposed because iterating an ndarray walks its rows.
                columns = x_mat[:, chosen].T
                # Filled in by the helper; presumably every way to
                # distribute total_degree over the chosen features
                # (helper is defined elsewhere - confirm).
                exponent_sets = []
                __degree_combinations(total_degree, variables,
                                      [0] * variables, exponent_sets)
                for exponents in exponent_sets:
                    term = 1
                    for column, exponent in zip(columns, exponents):
                        if exponent == 0:
                            continue
                        term *= column**exponent
                    blocks.append(term.reshape((term.shape[0], 1)))

    return np.hstack(blocks)
예제 #5
0
    def predict(self, x_mat: ndarray) -> Union[ndarray, int]:
        """
        Predict for new data. A single sample yields a single number, more
        samples yield a row vector.

        :param x_mat: feature matrix; the m rows are samples, the n columns
                      are features
        :return: predicted value or vector of predicted values
        :raises StateError: if the model has not been trained yet
        :raises DataNotMatchError: if the number of features differs from
                                   ``len(self.pvs)``
        """

        root = self._root
        if root is None:
            raise StateError('not trained yet')

        x_mat = _t.r2m(x_mat)
        feature_count = x_mat.shape[1]
        if feature_count != len(self.pvs):
            raise DataNotMatchError('feature quantity mismatch')

        return self.__predict(root, x_mat)
예제 #6
0
파일: linear.py 프로젝트: taowu750/wtml
    def __match_theta_x(self, x_mat: ndarray) -> ndarray:
        """
        Check that the input is compatible with the learned parameters.

        :param x_mat: feature matrix; the m rows are samples, the n columns
                      are features
        :return: x_mat after conversion by ``_t.r2m``
        :raises StateError: if the model has not been trained yet
        :raises DataNotMatchError: if the number of columns differs from
                                   the number of rows of the parameters
        """

        theta = self._theta
        if theta is None:
            raise StateError('not trained yet')

        converted = _t.r2m(x_mat)
        feature_count = converted.shape[1]
        if feature_count != theta.shape[0]:
            raise DataNotMatchError('feature quantity mismatch')

        return converted
예제 #7
0
def train(x_mat: ndarray,
          k: int,
          *,
          max_iters: int = 10,
          initial_centroids: Iterable = None,
          history: bool = False):
    """
    Run k-means clustering for a fixed number of iterations.

    Every iteration assigns each sample to its nearest centroid and then
    moves each centroid to the mean of its samples. A centroid that ends up
    without any sample keeps its previous position, so the number of
    centroids never changes between iterations.

    :param x_mat: feature matrix; the m rows are samples, the n columns are
                  features
    :param k: number of clusters; only used to draw the random starting
              centroids when initial_centroids is not given
    :param max_iters: number of iterations to run
    :param initial_centroids: starting centroids, one per row; when omitted,
                              k distinct samples are picked at random
    :param history: whether to also return the centroids of every iteration
    :return: ``(centroids, idx)``, or ``(centroids, idx, centroids_history)``
             when history is True. ``idx`` holds the centroid index of each
             sample and is None when max_iters is 0; the history list starts
             with the initial centroids
    """

    x_mat = __t.r2m(x_mat)
    m = x_mat.shape[0]

    if initial_centroids is None:
        rand_indices = np.arange(0, m)
        np.random.shuffle(rand_indices)
        initial_centroids = x_mat[rand_indices[:k], :]
    if not isinstance(initial_centroids, ndarray):
        initial_centroids = np.asarray(initial_centroids)

    centroids = initial_centroids
    idx = None
    centroids_history = [centroids] if history else None

    for _ in range(max_iters):
        idx = find_closest(x_mat, centroids)
        moved = compute_centroids(x_mat, idx)

        # compute_centroids sizes its result from the largest index in use
        # and leaves zeros for clusters without samples. Start from a float
        # copy of the centroids find_closest just used and overwrite only
        # the clusters that actually received samples.
        occupied = np.unique(idx)
        centroids = np.array(__t.r2m(centroids), dtype=float)
        centroids[occupied] = moved[occupied]

        if history:
            centroids_history.append(centroids)

    if history:
        return centroids, idx, centroids_history
    return centroids, idx
예제 #8
0
    def __evaluation(self, x_mat: ndarray,
                     eval: Callable[[IProbabilityLearner], ndarray]):
        """
        Build a score matrix with one row per sample and one column per
        label.

        With the 'ovr' strategy, column i is what ``eval`` returns for the
        i-th learner in ``self._learners``. With any other strategy, column
        i is the mean of ``eval`` over the learners keyed
        ``(label_i, other_label)`` for every other label.

        :param x_mat: feature matrix; only its number of rows m is used here
        :param eval: callable applied to each learner, returning that
                     learner's scores for the samples
        :return: score matrix with m rows and one column per label
        :raises StateError: if the model has not been trained yet
        """
        if not self._trained:
            raise StateError('not trained yet')

        sample_count = _t.r2m(x_mat).shape[0]
        label_count = len(self._labels)
        scores = np.zeros((sample_count, label_count), dtype=float)

        if self.strategy == 'ovr':
            for col, learner in enumerate(self._learners.values()):
                scores[:, col] = eval(learner)
            return scores

        for col, positive in enumerate(self.labels):
            for negative in self.labels:
                if positive != negative:
                    pair_scores = eval(self._learners[(positive, negative)])
                    scores[:, col] = scores[:, col] + pair_scores
            # Average over the (label_count - 1) opponents of this label.
            scores[:, col] = scores[:, col] / (label_count - 1)

        return scores
예제 #9
0
    def predict(self, x_mat: ndarray):
        """
        Predict the label of every sample in x_mat.

        For the 'linear' kernel the decision value is
        ``x_mat @ theta + b``. For 'gauss' or a callable kernel it is the
        sum over all training samples j of
        ``alpha_j * y_j * kernel(x, x_j)`` plus ``b``. Samples with a
        decision value >= 0 are mapped to 1, the others to 0, and the result
        is converted back to labels via ``_t.convert_y``.

        :param x_mat: feature matrix; the m rows are samples, the n columns
                      are features
        :return: predicted labels
        :raises StateError: if the model has not been trained yet
        :raises DataNotMatchError: if the number of features does not match
                                   the learned parameters
        """

        if self._theta is None:
            raise StateError('not trained yet')
        x_mat = _t.r2m(x_mat)
        if x_mat.shape[1] != self._theta.shape[0]:
            raise DataNotMatchError('feature quantity mismatch')

        if self.kernel == 'linear':
            pred = x_mat @ self._theta + self._b
        else:
            if self.kernel == 'gauss':

                def kernel(u, v):
                    return gaussian_kernel(u, v, self.gamma)
            else:
                # Anything else is used as a user-supplied kernel callable.
                kernel = self.kernel

            m = x_mat.shape[0]
            # Must start at zero: the decision values are accumulated below,
            # and np.empty would leave arbitrary memory contents in them.
            pred = np.zeros((m, ))
            for i in range(m):
                for j in range(self._x_mat.shape[0]):
                    pred[i] = pred[i] + self._alphas[j] * self._y_row[
                        j] * kernel(x_mat[i], self._x_mat[j])
                pred[i] = pred[i] + self._b

        return _t.ret(
            _t.convert_y(self.labels, (pred >= 0).astype(dtype=np.int16),
                         to=False))
예제 #10
0
 def x_cv(self, value: Optional[ndarray]):
     """
     Store ``value`` after conversion by ``_t.r2m``; ``None`` clears the
     stored data.

     :param value: the data to store, or None
     """
     self._x_cv = None if value is None else _t.r2m(value)