# Cluster the vectorized newsgroup posts with K-Means, then find the posts
# most similar to a new, unseen post within its predicted cluster.
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

# New (unseen) post to classify against the trained clusters.
new_posts = """Disk drive problems. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now. I tried to format it, but now it doesn't boot any more. Any ideas? Thanks."""
new_posts_vec = vectorizer.transform([new_posts])

# K-Means clustering: one cluster per newsgroup label (labels are 0..max).
num_clusters = train_data.target.max() + 1
km = KMeans(n_clusters=num_clusters, n_init=1, init='random', verbose=1)
km.fit(vectorized)

# BUG FIX: len(train_data.target == km.labels_) is the length of the boolean
# array — always num_samples, regardless of agreement. Summing the boolean
# array counts the entries where the cluster id equals the target label.
print('#K-Means total matched %i' % (train_data.target == km.labels_).sum())

# Predict the cluster of the new post.
new_post_label = km.predict(new_posts_vec)

# Indices of all training posts assigned to the same cluster.
similar_indices = (km.labels_ == new_post_label[0]).nonzero()[0]

# Rank the same-cluster posts by raw distance to the new post
# (ascending: smallest distance first).
similar = []
for i in similar_indices:
    dist = dis_raw(new_posts_vec, vectorized[i])
    similar.append((dist, train_data.data[i]))
similar = sorted(similar)
print('#Predict matched %i' % len(similar))

# Closest post: first element after the ascending sort.
show_at_1 = similar[0]
print('#Best match is {0}'.format(show_at_1))
import os

# Vectorize a toy corpus with stemmed TF-IDF, then find the training document
# closest (by dis_raw distance) to a short test query.
vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')

# Training set: read every file in the toy data directory.
# Renamed 'dir' -> 'data_dir' so the builtin dir() is not shadowed.
data_dir = '/home/ring/bmls-2nd/ch03/data/toy'


def _read_text(path):
    """Return the full text of *path*, closing the handle promptly."""
    # The original open(...).read() inside the comprehension leaked the
    # file handle; a context manager guarantees it is closed.
    with open(path) as fh:
        return fh.read()


content = np.array([(_read_text(os.path.join(data_dir, fname)), fname)
                    for fname in os.listdir(data_dir)])
training = vectorizer.fit_transform(content[:, 0])
filename = content[:, 1]

# Test text to match against the corpus.
test_content = 'imaging databases'
test_vec = vectorizer.transform([test_content])

# Dump the learned vocabulary and the IDF value of the term 'toy'.
# NOTE(review): _tfidf is a private sklearn attribute; recent versions expose
# the public 'idf_' property instead — confirm the sklearn version in use.
print(vectorizer.vocabulary_)
print(vectorizer._tfidf.idf_[vectorizer.vocabulary_['toy']])

# Linear scan for the nearest training document.
# BUG FIX: np.infty was removed in NumPy 2.0; np.inf is the supported name.
best_distance = np.inf
best_text = ''
num_samples, num_features = training.shape
for i in range(num_samples):
    distance = dis_raw(test_vec, training[i])
    if distance < best_distance:
        best_distance = distance
        best_text = filename[i]
    # NOTE(review): the collapsed original is ambiguous about whether this
    # print sits inside the 'if'; printing every candidate's distance matches
    # the surrounding tutorial's convention — confirm against the book source.
    print('{0:.2f}, {1}'.format(distance, filename[i]))
print('Best match is {0}, {1:.2f}'.format(best_text, best_distance))