Example #1
0
# Report the dimensions of the already-vectorized training corpus.
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

# The new post we want to find related training posts for.
new_posts = """Disk drive problems. Hi, I have a problem with my hard disk.
After 1 year it is working only sporadically now.
I tried to format it, but now it doesn't boot any more.
Any ideas? Thanks."""
new_posts_vec = vectorizer.transform([new_posts])

# K-Means clustering: use (largest target label + 1) clusters.
num_clusters = train_data.target.max() + 1
km = KMeans(n_clusters=num_clusters, n_init=1, init='random', verbose=1)
km.fit(vectorized)
# Count the samples whose cluster id equals their target label. The
# comparison yields a boolean array, so it must be summed; len() would
# always report the total number of samples.
# NOTE(review): K-Means cluster ids are arbitrary, so this count is only a
# rough indicator unless the ids are aligned with the target labels.
print('#K-Means total matched %i' % (train_data.target == km.labels_).sum())

# Predict the cluster of the new post.
new_post_label = km.predict(new_posts_vec)
# Indices of the training posts that fall into the same cluster.
similar_indices = (km.labels_ == new_post_label[0]).nonzero()[0]

# Compute the distance from the new post to every post in that cluster.
similar = [
    (dis_raw(new_posts_vec, vectorized[i]), train_data.data[i])
    for i in similar_indices
]
# Ascending order: the smallest distance is treated as the best match.
similar.sort()
print('#Predict matched %i' % len(similar))

# The most similar post (smallest distance).
show_at_1 = similar[0]
print('#Best match is {0}'.format(show_at_1))
Example #2
0
import os

vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')

# Training set: every file in the toy data directory, stored as
# (file contents, file name) pairs.
data_dir = '/home/ring/bmls-2nd/ch03/data/toy'
documents = []
for name in os.listdir(data_dir):
    # Use a context manager so each file handle is closed right after reading.
    with open(os.path.join(data_dir, name)) as handle:
        documents.append((handle.read(), name))
content = np.array(documents)
training = vectorizer.fit_transform(content[:, 0])
filename = content[:, 1]

# Text to test against the training set.
test_content = 'imaging databases'
test_vec = vectorizer.transform([test_content])
# Print the learned vocabulary and the IDF weight of the term 'toy'.
# NOTE(review): this reads the private `_tfidf` attribute; confirm whether
# the vectorizer exposes a public equivalent before relying on it.
print(vectorizer.vocabulary_)
print(vectorizer._tfidf.idf_[vectorizer.vocabulary_['toy']])

# Linear scan for the training document with the smallest distance.
# np.inf is used because the np.infty alias was removed in NumPy 2.0.
best_distance = np.inf
best_text = ''
num_samples, num_features = training.shape
for i in range(num_samples):
    distance = dis_raw(test_vec, training[i])
    if distance < best_distance:
        best_distance = distance
        best_text = filename[i]

    # Show the distance for every candidate, not only the best one.
    print('{0:.2f}, {1}'.format(distance, filename[i]))

print('Best match is {0}, {1:.2f}'.format(best_text, best_distance))