def fit(self, X, y):
        """
        Learn the per-class idf matrix (global term weights).

        :param X: sparse matrix, [n_samples, n_features]
                  X must be a matrix of term counts
        :param y: class_label, [n_samples]
        :return: self; ``self._idf`` holds [n_classes + 1, n_features] weights
        """
        if self.use_idf:
            labelbin = LabelBinarizer()
            # One-hot class membership per sample: [n_samples, n_classes].
            Y = labelbin.fit_transform(y)
            # LabelBinarizer returns a single column for binary targets;
            # expand it to two columns so class_count_/class_size below are
            # correct (consistent with the other fit() variants in this file).
            if labelbin.y_type_ == "binary":
                Y = np.hstack((1 - Y, Y))
            self.classes_ = labelbin.classes_

            # Number of documents per class: [n_classes].
            class_count_ = np.sum(Y, axis=0)
            class_size = class_count_.shape[0]

            # Documents containing each term, per class: [n_classes, n_features].
            class_df_ = vectorize.class_df(X, Y)

            # Total term occurrences per class: [n_classes].
            self.class_freq_ = np.sum(safe_sparse_dot(Y.T, X), axis=1)

            # Number of classes each term appears in: [n_features].
            feature_count_ = np.sum(vectorize.tobool(class_df_), axis=0)

            # Append an "unknow" pseudo-class that aggregates all classes; it
            # is used when a term's class is uncertain at transform time.
            unknow_class_count_ = np.array([np.sum(class_count_, axis=0)])
            class_count_ = np.concatenate((class_count_, unknow_class_count_))

            unknow_class_df_ = np.sum(class_df_, axis=0).reshape(1, -1)
            class_df_ = np.concatenate((class_df_, unknow_class_df_), axis=0)

            unknow_class_freq_ = np.array([np.sum(self.class_freq_, axis=0)])
            self.class_freq_ = np.concatenate((self.class_freq_, unknow_class_freq_))

            self.classes_ = np.concatenate((self.classes_, np.array(["unknow"])), axis=0)

            # Smooth counts so the logs/divisions below never see zero.
            class_count_ += int(self.smooth_idf)
            class_df_ += int(self.smooth_idf)
            feature_count_ += int(self.smooth_idf)

            _, n_features = X.shape

            # Within-class idf term: [n_classes + 1, n_features].
            first_part = np.log(np.divide(class_count_.reshape(-1, 1), class_df_)) + 1.0
            # Cross-class rarity term: [n_features].
            second_part = np.log(class_size / feature_count_) + 1.0
            second_part_diag = sp.spdiags(second_part, diags=0, m=n_features, n=n_features)

            self._idf = safe_sparse_dot(first_part, second_part_diag)

        return self
# Example #2
    def fit(self, X, y):
        """
        Learn the per-class idf matrix (global term weights).

        Only term presence matters here; the specific count values in X
        are folded through the standard document-frequency idf.

        :param X: sparse matrix, [n_samples, n_features]
                  a matrix of term counts
        :param y: class_label, [n_samples]
        :return: self; ``self._idf`` is [n_classes + 1, n_features]
        """
        if not self.use_idf:
            return self

        binarizer = LabelBinarizer()
        # One-hot class membership per sample: [n_samples, n_classes].
        membership = binarizer.fit_transform(y)
        # For binary targets LabelBinarizer yields a single column, unlike
        # the multiclass case, so widen it to the two-column form.
        if binarizer.y_type_ == "binary":
            membership = np.hstack((1 - membership, membership))
        self.classes_ = binarizer.classes_

        # Documents containing each term, per class: [n_classes, n_features].
        class_df_ = vectorize.class_df(X, membership)

        # Append an "unknow" pseudo-class (sum over all classes), used when
        # a term's class is uncertain at transform time.
        total_row = np.sum(class_df_, axis=0).reshape(1, -1)
        class_df_ = np.concatenate((class_df_, total_row), axis=0)
        self.classes_ = np.concatenate(
            (self.classes_, np.array(["unknow"])), axis=0)

        # Optional smoothing keeps the log/division away from zero counts.
        smooth = int(self.smooth_idf)
        class_df_ += smooth

        n_samples, n_features = X.shape
        df = _document_frequency(X)
        df += smooth
        n_samples += smooth

        # log(x) + 1 instead of log(x) makes sure terms with zero idf
        # don't get suppressed entirely.
        idf = float(n_samples) / df
        idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)

        # [n_classes + 1, n_features]
        self._idf = np.log(safe_sparse_dot(class_df_, idf_diag)) + 1.0

        return self
    def fit(self, X, y):
        """
        Learn the idf vector (global term weights).

        Only term presence matters; the specific count values in X are
        folded through the standard document-frequency idf.

        :param X: sparse matrix, [n_samples, n_features]
                  a matrix of term counts
        :param y: class_label, [n_samples]
        :return: self; ``self._idf`` is [n_classes + 1, n_features]
        """
        if self.use_idf:
            labelbin = LabelBinarizer()
            # One-hot class membership per sample: [n_samples, n_classes].
            Y = labelbin.fit_transform(y)
            # LabelBinarizer returns a single column for binary targets,
            # unlike the multiclass case, so expand it to two columns.
            if labelbin.y_type_ == "binary":
                Y = np.hstack((1 - Y, Y))
            self.classes_ = labelbin.classes_

            # Documents containing each term, per class: [n_classes, n_features].
            class_df_ = vectorize.class_df(X, Y)

            # Append an "unknow" pseudo-class (sum over all classes), used
            # when a term's class is uncertain at transform time.
            unknow_class_df_ = np.sum(class_df_, axis=0).reshape(1, -1)
            class_df_ = np.concatenate((class_df_, unknow_class_df_), axis=0)
            self.classes_ = np.concatenate((self.classes_, np.array(["unknow"])), axis=0)

            # smooth class_df_
            class_df_ += int(self.smooth_idf)

            n_samples, n_features = X.shape
            df = _document_frequency(X)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = float(n_samples) / df
            idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)

            # [n_classes, n_features]
            self._idf = np.log(safe_sparse_dot(class_df_, idf_diag)) + 1.0

        return self
    def fit(self, X, y):
        """
        Learn the per-class idf matrix (global term weights).

        :param X: sparse matrix, [n_samples, n_features]
                  X must be a matrix of term counts
        :param y: class_label, [n_samples]
        :return: self; ``self._idf`` holds [n_classes + 1, n_features] weights
        """
        if self.use_idf:
            labelbin = LabelBinarizer()
            # One-hot class membership per sample: [n_samples, n_classes].
            Y = labelbin.fit_transform(y)
            # LabelBinarizer returns a single column for binary targets;
            # expand it to two columns so class_count_/class_size below are
            # correct (consistent with the other fit() variants in this file).
            if labelbin.y_type_ == "binary":
                Y = np.hstack((1 - Y, Y))
            self.classes_ = labelbin.classes_

            # Number of documents per class: [n_classes].
            class_count_ = np.sum(Y, axis=0)
            class_size = class_count_.shape[0]

            # Documents containing each term, per class: [n_classes, n_features].
            class_df_ = vectorize.class_df(X, Y)

            # Total term occurrences per class: [n_classes].
            self.class_freq_ = np.sum(safe_sparse_dot(Y.T, X), axis=1)

            # Number of classes each term appears in: [n_features].
            feature_count_ = np.sum(vectorize.tobool(class_df_), axis=0)

            # Append an "unknow" pseudo-class that aggregates all classes; it
            # is used when a term's class is uncertain at transform time.
            unknow_class_count_ = np.array([np.sum(class_count_, axis=0)])
            class_count_ = np.concatenate((class_count_, unknow_class_count_))

            unknow_class_df_ = np.sum(class_df_, axis=0).reshape(1, -1)
            class_df_ = np.concatenate((class_df_, unknow_class_df_), axis=0)

            unknow_class_freq_ = np.array([np.sum(self.class_freq_, axis=0)])
            self.class_freq_ = np.concatenate(
                (self.class_freq_, unknow_class_freq_))

            self.classes_ = np.concatenate(
                (self.classes_, np.array(["unknow"])), axis=0)

            # Smooth counts so the logs/divisions below never see zero.
            class_count_ += int(self.smooth_idf)
            class_df_ += int(self.smooth_idf)
            feature_count_ += int(self.smooth_idf)

            _, n_features = X.shape

            # Within-class idf term: [n_classes + 1, n_features].
            first_part = np.log(
                np.divide(class_count_.reshape(-1, 1), class_df_)) + 1.0
            # Cross-class rarity term: [n_features].
            second_part = np.log(class_size / feature_count_) + 1.0
            second_part_diag = sp.spdiags(second_part,
                                          diags=0,
                                          m=n_features,
                                          n=n_features)

            self._idf = safe_sparse_dot(first_part, second_part_diag)

        return self