-
Notifications
You must be signed in to change notification settings - Fork 0
/
lessons_clustering.py
98 lines (81 loc) · 2.57 KB
/
lessons_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# -*- coding: utf-8 -*-
import csv
import codecs
import numpy as np
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
# --- Configuration -------------------------------------------------
FILENAME = 'lessons_view.csv'  # input CSV; tweet text is read from column 3
NUM_CLUSTERS = 10              # number of KMeans clusters to produce
LSA_DIM = 500                  # dimensions kept by TruncatedSVD (LSA)
MAX_DF = 0.8                   # drop terms appearing in > 80% of documents
MAX_FEATURES = 100000          # cap on the TF-IDF vocabulary size
MINIBATCH = True               # True: MiniBatchKMeans, False: plain KMeans
def get_tweets_from_csv(filename):
    """Read tweet texts from column 3 of ``filename`` (a CSV file).

    Tweets containing '@' (mentions) or that are 3 characters or
    shorter are discarded.  Returns a list of unicode strings.
    """
    # Use a context manager so the file handle is always closed
    # (the original leaked it via csv.reader(open(...))).
    with open(filename) as f:
        tweets = []
        for row in csv.reader(f):
            cell = row[2]
            # Python 2 csv yields byte strings; decode to unicode.
            # Under Python 3 the cell is already text, so skip.
            if isinstance(cell, bytes):
                cell = cell.decode('utf-8')
            tweets.append(cell)
    # Single-pass filter.  The original called tweets.remove() while
    # looping: a tweet matching both conditions was removed twice,
    # which deletes an unrelated duplicate or raises ValueError.
    return [t for t in tweets if u'@' not in t and len(t) > 3]
def analyzer(text):
    """Tokenize Japanese ``text`` with MeCab and return a token list.

    Used as the TfidfVectorizer analyzer.  For each morpheme the
    third-from-last ChaSen feature field is taken — presumably the
    base/dictionary form, but this depends on the installed MeCab
    dictionary; confirm against its feature layout.
    """
    ret = []
    tagger = MeCab.Tagger('-Ochasen')
    # Python 2: MeCab wants a byte string, so encode before parsing.
    node = tagger.parseToNode(text.encode('utf-8'))
    # Skip the BOS (beginning-of-sentence) node.
    node = node.next
    # Stop before the EOS node (its .next is None), so only real
    # morphemes are collected.
    while node.next:
        ret.append(node.feature.split(',')[-3].decode('utf-8'))
        node = node.next
    return ret
def main(filename):
    """Cluster the tweets found in ``filename`` into NUM_CLUSTERS groups.

    Pipeline: TF-IDF (MeCab tokens) -> LSA dimensionality reduction ->
    L2 normalization -> (MiniBatch)KMeans.

    Returns a list of NUM_CLUSTERS lists; each inner list holds that
    cluster's tweets sorted by distance to the cluster centroid,
    closest first.
    """
    # Load tweets.
    tweets = get_tweets_from_csv(filename)

    # Feature extraction.  max_features is a constructor parameter;
    # the original patched it onto the instance after construction.
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF,
                                 max_features=MAX_FEATURES)
    X = vectorizer.fit_transform(tweets)

    # Dimensionality reduction by LSA.  TruncatedSVD output is not
    # normalized, so re-normalize for cosine-like KMeans behaviour.
    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    X = Normalizer(copy=False).fit_transform(X)

    # Clustering by KMeans.
    if MINIBATCH:
        km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, init='k-means++',
                             batch_size=1000, n_init=10,
                             max_no_improvement=10, verbose=True)
    else:
        km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1,
                    verbose=True)
    km.fit(X)
    labels = km.labels_

    # Distance of each sample to its own cluster centre, taken from
    # the (n_samples, n_clusters) transform matrix in one vectorized
    # fancy-index instead of the original per-element Python loop.
    transformed = km.transform(X)
    dists = transformed[np.arange(len(labels)), labels]

    # Within each cluster, order tweets by ascending centroid distance.
    clusters = []
    for c in range(NUM_CLUSTERS):
        members = np.where(labels == c)[0]
        order = members[np.argsort(dists[members])]
        clusters.append([tweets[int(j)] for j in order])
    return clusters
if __name__ == '__main__':
    clusters = main(FILENAME)
    # Write one line per tweet, formatted "<cluster id>: <text>".
    # Strip embedded newlines so a tweet cannot span output lines;
    # the original replaced '/n' — a typo that matched nothing.
    # The context manager guarantees the file is flushed and closed.
    with codecs.open('%s.txt' % FILENAME, 'w', 'utf-8') as out:
        for i, tweets in enumerate(clusters):
            for tweet in tweets:
                out.write('%d: %s\n' % (i, tweet.replace('\n', '')))