/
clustering.py
49 lines (40 loc) · 1.84 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans
from numpy import linalg
from nltk.stem import SnowballStemmer
import numpy as np
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer whose analyzer stems every token with NLTK's
    English SnowballStemmer, so inflected forms map to one feature."""

    def build_analyzer(self):
        """Return an analyzer that stems each token produced by the base one.

        The stemmer is created once here (not per document) and captured by
        the closure; the returned callable lazily yields stemmed tokens.
        """
        # Idiomatic super() instead of the explicit base-class call.
        analyzer = super().build_analyzer()
        english_stemmer = SnowballStemmer('english')
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
class DocumentClusterizer():
    """Clusters raw text documents with KMeans over stemmed TF-IDF vectors
    and looks up the most similar training document for a query."""

    def __init__(self, n_clusters=50):
        """Build the vectorizer and the KMeans model.

        n_clusters: number of clusters (default 50, the original setting).
        n_init=1 with random init keeps fitting cheap; verbose=1 logs progress.
        """
        self.vectorizer = StemmedTfidfVectorizer()
        self.km = KMeans(n_clusters=n_clusters, init='random', n_init=1, verbose=1)

    def train(self, data):
        """Vectorize *data* (sequence of raw documents) and fit KMeans.

        Keeps the documents and their TF-IDF matrix for later lookups.
        """
        self.data = data
        self.vectorized = self.vectorizer.fit_transform(data)
        self.km.fit(self.vectorized)

    def find_most_similar(self, example):
        """Print and return the ``(distance, document)`` pair of the training
        document closest to *example*.

        Only the cluster *example* is assigned to is searched; its members
        are ranked by Euclidean distance in TF-IDF space. Returns the best
        pair (previously the result was only printed and discarded).
        """
        # Fixed: Python 2 `print` statements -> Python 3 print() calls.
        print('EXAMPLE:', example)
        vec = self.vectorizer.transform([example])
        pred = self.km.predict(vec)[0]
        similar_indices = (self.km.labels_ == pred).nonzero()[0]
        # min() over a generator avoids sorting the whole cluster just to
        # take its head; key on the distance only, so documents are never
        # compared when distances tie.
        best = min(
            ((linalg.norm((self.vectorized[i] - vec).toarray()), self.data[i])
             for i in similar_indices),
            key=lambda pair: pair[0],
        )
        print('SIMILAR:', best)
        return best
def exercise(n_queries=10):
    """Train a DocumentClusterizer on six comp.*/sci.space newsgroups and
    run *n_queries* random similarity lookups against the test split.

    n_queries: how many random test documents to query (default 10,
    matching the original hard-coded behavior).
    """
    groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
              'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
    train_data = fetch_20newsgroups(subset='train', categories=groups)
    clusterizer = DocumentClusterizer()
    clusterizer.train(train_data.data)
    test_data = fetch_20newsgroups(subset='test', categories=groups)
    for _ in range(n_queries):
        # Pick a uniformly random test document each iteration.
        sample = test_data.data[np.random.randint(len(test_data.data))]
        clusterizer.find_most_similar(sample)