import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils.extmath import safe_sparse_dot
# _document_frequency is sklearn's private per-term document-frequency
# helper (used by the second variant below); vectorize is a
# project-local module providing class_df() and tobool().
from sklearn.feature_extraction.text import _document_frequency
import vectorize


def fit(self, X, y):
    """Learn the idf matrix (global term weights).

    :param X: sparse matrix, [n_samples, n_features]
        X must be a matrix of term counts.
    :param y: class labels, [n_samples]
    :return: self; the fitted ``_idf`` matrix has shape
        [n_classes + 1, n_features] (the last row is the "unknow" fallback).
    """
    if self.use_idf:
        labelbin = LabelBinarizer()
        # Which class each sample belongs to: [n_samples, n_classes]
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        # Number of documents per class: [n_classes]
        class_count_ = np.sum(Y, axis=0)
        class_size = class_count_.shape[0]
        # Number of samples in each class containing each feature term:
        # [n_classes, n_features]
        class_df_ = vectorize.class_df(X, Y)
        # Total term count per class: [n_classes]
        self.class_freq_ = np.sum(safe_sparse_dot(Y.T, X), axis=1)
        # Number of classes in which each feature term occurs: [n_features]
        feature_count_ = np.sum(vectorize.tobool(class_df_), axis=0)
        # When the class a feature term belongs to is uncertain or unknown,
        # fall back to the totals over all classes (an extra "unknow" row).
        unknow_class_count_ = np.array([np.sum(class_count_, axis=0)])
        class_count_ = np.concatenate((class_count_, unknow_class_count_))
        unknow_class_df_ = np.sum(class_df_, axis=0).reshape(1, -1)
        class_df_ = np.concatenate((class_df_, unknow_class_df_), axis=0)
        unknow_class_freq_ = np.array([np.sum(self.class_freq_, axis=0)])
        self.class_freq_ = np.concatenate(
            (self.class_freq_, unknow_class_freq_))
        self.classes_ = np.concatenate(
            (self.classes_, np.array(["unknow"])), axis=0)
        # Smooth class_count_, class_df_ and feature_count_.
        class_count_ += int(self.smooth_idf)
        class_df_ += int(self.smooth_idf)
        feature_count_ += int(self.smooth_idf)

        _, n_features = X.shape
        # Within-class idf: [n_classes + 1, n_features]
        first_part = np.log(
            np.divide(class_count_.reshape(-1, 1), class_df_)) + 1.0
        # Cross-class idf: [n_features]
        second_part = np.log(class_size / feature_count_) + 1.0
        second_part_diag = sp.spdiags(second_part, diags=0,
                                      m=n_features, n=n_features)
        self._idf = safe_sparse_dot(first_part, second_part_diag)
    return self
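Reading the computation off the code above, the weight this variant learns for class c and term t is the product of a within-class idf and a cross-class idf (when smooth_idf is set, 1 is added to every count; the extra "unknow" row reuses the totals over all classes):

\[
\mathrm{idf}(c, t) = \left(\log\frac{n_c}{\mathrm{df}_{c,t}} + 1\right)
                     \left(\log\frac{C}{\mathrm{cf}_t} + 1\right)
\]

where \(n_c\) is the number of documents in class c, \(\mathrm{df}_{c,t}\) is the number of class-c documents containing t, \(C\) is the number of classes, and \(\mathrm{cf}_t\) is the number of classes in which t occurs.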
def fit(self, X, y):
    """Learn the idf matrix (global term weights).

    Only whether the entries of X are non-zero matters here, not their
    specific values.

    :param X: sparse matrix, [n_samples, n_features]
        A matrix of term counts.
    :param y: class labels, [n_samples]
    :return: self; the fitted ``_idf`` matrix has shape
        [n_classes + 1, n_features] (the last row is the "unknow" fallback).
    """
    if self.use_idf:
        labelbin = LabelBinarizer()
        # Which class each sample belongs to: [n_samples, n_classes]
        Y = labelbin.fit_transform(y)
        # LabelBinarizer returns a single column for binary problems,
        # unlike the multiclass case, so expand it to one column per class.
        if labelbin.y_type_ == "binary":
            Y = np.hstack((1 - Y, Y))
        self.classes_ = labelbin.classes_
        # Number of samples in each class containing each feature term:
        # [n_classes, n_features]
        class_df_ = vectorize.class_df(X, Y)
        # When the class a feature term belongs to is uncertain or unknown,
        # fall back to the totals over all classes (an extra "unknow" row).
        unknow_class_df_ = np.sum(class_df_, axis=0).reshape(1, -1)
        class_df_ = np.concatenate((class_df_, unknow_class_df_), axis=0)
        self.classes_ = np.concatenate(
            (self.classes_, np.array(["unknow"])), axis=0)
        # Smooth class_df_.
        class_df_ += int(self.smooth_idf)

        n_samples, n_features = X.shape
        df = _document_frequency(X)
        # Perform idf smoothing if required.
        df += int(self.smooth_idf)
        n_samples += int(self.smooth_idf)
        # log + 1 instead of log makes sure terms with zero idf don't get
        # suppressed entirely.
        idf = float(n_samples) / df
        idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
        # [n_classes + 1, n_features]
        self._idf = np.log(safe_sparse_dot(class_df_, idf_diag)) + 1.0
    return self
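The variant above replaces the cross-class factor of the first variant with the standard corpus-level idf (via _document_frequency over all samples), combined with the per-class document frequencies. For context, a minimal usage sketch of either variant follows. The enclosing class name ClassTfidfTransformer and its constructor arguments are assumptions for illustration only; CountVectorizer and the fit(X, y) contract come from the code itself.

from sklearn.feature_extraction.text import CountVectorizer

docs = ["cat sat on the mat",
        "dog ate my homework",
        "cat chased the dog"]
labels = ["pets", "school", "pets"]

# Term-count matrix, shape [n_samples, n_features]
X = CountVectorizer().fit_transform(docs)

# ClassTfidfTransformer is a hypothetical name for the class these
# fit() methods belong to; use_idf / smooth_idf mirror the attributes
# that fit() reads.
transformer = ClassTfidfTransformer(use_idf=True, smooth_idf=True)
transformer.fit(X, labels)
# transformer._idf now has shape [n_classes + 1, n_features]; the last
# row is the "unknow" fallback built from the totals over all classes.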