"""Cluster 20newsgroups posts with K-Means over stemmed TF-IDF features,
then predict the cluster of a new post about disk-drive problems."""
from sklearn.datasets import load_mlcomp
from aiml.learnig.my_stemmer import StemmedTfidfVectorizer
from sklearn.cluster import KMeans
from aiml.learnig.utils import dis_raw

MLCOMP_ROOT = '/home/ring/datasets/mlcomp'

# Newsgroup categories used for training.
# BUG FIX: the original list contained 'comp.sys.ma c.hardware' (stray
# space), which matches no real category and would silently be ignored.
groups = ['comp.graphics', 'comp.os.ms-windows.misc',
          'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
          'comp.windows.x', 'sci.space']

train_data = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_ROOT,
                         set_='train', categories=groups)
#test_data = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_ROOT, set_='test')

# min_df/max_df prune terms that are too rare/too common; decode_error
# tolerates the broken byte sequences present in the raw news posts.
vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
                                    stop_words='english',
                                    decode_error='ignore')
vectorized = vectorizer.fit_transform(train_data.data)
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

new_posts = """Disk drive problems. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now. I tried to format it, but now it doesn't boot any more. Any ideas? Thanks."""
new_posts_vec = vectorizer.transform([new_posts])

# K-Means clustering: one cluster per known category.
num_clusters = train_data.target.max() + 1
km = KMeans(n_clusters=num_clusters, n_init=1, init='random', verbose=1)
km.fit(vectorized)
# BUG FIX: len(bool_array) is always the total sample count; .sum() counts
# the True entries. NOTE(review): K-Means cluster ids are arbitrary, so this
# only measures agreement if cluster ids happen to align with target ids.
print('#K-Means total matched %i' % (train_data.target == km.labels_).sum())

# Predict the cluster of the new post.
new_post_label = km.predict(new_posts_vec)
# Indices of posts assigned to the same cluster would be gathered next.
"""Find the toy-corpus document closest (by raw distance) to a test query
using stemmed TF-IDF vectors."""
import numpy as np
from aiml.learnig.my_stemmer import StemmedCountVectorizer
from aiml.learnig.my_stemmer import StemmedTfidfVectorizer
from aiml.learnig.utils import dis_raw
import os

vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')

# Training corpus: one document per file in the toy data directory.
# FIX: renamed 'dir' (shadowed the builtin) and use 'with' so every file
# handle is closed (the original leaked open file objects).
data_dir = '/home/ring/bmls-2nd/ch03/data/toy'


def _read_text(path):
    # Read one document; the context manager guarantees the handle closes.
    with open(path) as fh:
        return fh.read()


content = np.array([(_read_text(os.path.join(data_dir, name)), name)
                    for name in os.listdir(data_dir)])
training = vectorizer.fit_transform(content[:, 0])
filename = content[:, 1]

# Query text to classify against the corpus.
test_content = 'imaging databases'
test_vec = vectorizer.transform([test_content])

# Inspect the fitted vocabulary and the IDF weight of the term 'toy'.
# NOTE(review): _tfidf is a private sklearn attribute and may change
# between versions.
print(vectorizer.vocabulary_)
print(vectorizer._tfidf.idf_[vectorizer.vocabulary_['toy']])

# Linear scan for the nearest training document.
# FIX: np.infty was removed in NumPy 2.0; np.inf is the supported spelling.
best_distance = np.inf
best_text = ''
num_samples, num_features = training.shape
for i in range(num_samples):
    distance = dis_raw(test_vec, training[i])
    if distance < best_distance:
        best_distance = distance
        best_text = filename[i]