Example #1
from sklearn.datasets import load_mlcomp
from aiml.learnig.my_stemmer import StemmedTfidfVectorizer
from sklearn.cluster import KMeans
from aiml.learnig.utils import dis_raw

MLCOMP_ROOT = '/home/ring/datasets/mlcomp'
groups = ['comp.graphics', 'comp.os.ms-windows.misc',
          'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_ROOT, set_='train', categories=groups)
#test_data = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_ROOT, set_='test')

vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, stop_words='english', decode_error='ignore')
vectorized = vectorizer.fit_transform(train_data.data)

num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

new_posts = """Disk drive problems. Hi, I have a problem with my hard disk.
After 1 year it is working only sporadically now.
I tried to format it, but now it doesn't boot any more.
Any ideas? Thanks."""
new_posts_vec = vectorizer.transform([new_posts])

# K-Means clustering
num_clusters = train_data.target.max() + 1
km = KMeans(n_clusters=num_clusters, n_init=1, init='random', verbose=1)
km.fit(vectorized)
print('#K-Means total matched: %i' % (train_data.target == km.labels_).sum())
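# Optional sanity check, not in the original snippet: K-Means cluster IDs are
# arbitrary, so a permutation-invariant score such as the adjusted Rand index
# is more informative than raw agreement with the newsgroup labels.
from sklearn.metrics import adjusted_rand_score
print('Adjusted Rand index: %.3f' % adjusted_rand_score(train_data.target, km.labels_))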
# Predict the cluster of the new post
new_post_label = km.predict(new_posts_vec)
# Get the indices of posts assigned to the same cluster
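# A minimal sketch of that step (not part of the original snippet):
similar_indices = (km.labels_ == new_post_label[0]).nonzero()[0]
print('#posts in the same cluster: %i' % len(similar_indices))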
Example #2
import numpy as np
from aiml.learnig.my_stemmer import StemmedCountVectorizer
from aiml.learnig.my_stemmer import StemmedTfidfVectorizer
from aiml.learnig.utils import dis_raw
import os

vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')

# Training set: read every file in the toy corpus directory
data_dir = '/home/ring/bmls-2nd/ch03/data/toy'
content = np.array([(open(os.path.join(data_dir, name)).read(), name)
                    for name in os.listdir(data_dir)])
training = vectorizer.fit_transform(content[:, 0])
filename = content[:, 1]

# Test text
test_content = 'imaging databases'
test_vec = vectorizer.transform([test_content])
# Print the learned vocabulary and the IDF weight of the term 'toy'
print(vectorizer.vocabulary_)
print(vectorizer.idf_[vectorizer.vocabulary_['toy']])

# Find the training document closest to the test text
best_distance = np.inf
best_text = ''
num_samples, num_features = training.shape
for i in range(num_samples):
    distance = dis_raw(test_vec, training[i])
    if distance < best_distance:
        best_distance = distance
        best_text = filename[i]
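# Possible completion, not in the original listing: report the closest document.
print('Best match: %s (distance = %.2f)' % (best_text, best_distance))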