#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 2 18:08:50 2018
Technical University of Crete
Course : Pattern Recognition
Spring 2017-2018
Project : 15.2
Call Routing System
@authors : Apostolidou Maria (2012030114)
Kyparissas Nikolas (2012030112)
"""
import sys
import numpy as np
from dictionary_sort import *
from tfidf import *
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import GaussianMixture
import csv
'''
Parameters passed in from the console/bash
'''
# Downsize the dictionary before the SVD:
# the dictionary (and the TF-IDF tables) are sorted by word importance,
# and the #discard least important words are deleted as a first level
# of dimensionality reduction.
discard = int(sys.argv[1])
# number of components for the SVD reduction;
# svd_components must be < #features
svd_components = int(sys.argv[2])
# number of Gaussian components for the GMMs;
# gmm_components must be <= #samples of each class
gmm_components = int(sys.argv[3])
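# Example invocation (the parameter values are illustrative only):
#   python3 code.py 1000 50 3
# i.e. discard the 1000 least important words, keep 50 SVD components,
# and use 3 Gaussian components per class GMM.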
"""
Create the dictionary
"""
dictionary = []
labels = []
with open('data/final.train') as train_file:
    num_of_lines = 0  # count the rows / the number of documents
    seen = set()  # fast membership test; `dictionary` keeps insertion order
    training_data = train_file.readlines()
    for line in training_data:
        num_of_lines += 1
        words = line.split()
        words_no_class = words[1:]  # discard the label of the sample
        labels.append(words[0])
        for word in words_no_class:
            if word not in seen:
                seen.add(word)
                dictionary.append(word)
# the with statement closes the file automatically
'''
Find the term-document matrix from the training data
'''
term_document = term_doc('data/final.train', num_of_lines, dictionary)
"""
Find the term frequencies from the training data
"""
TF = tf(num_of_lines, dictionary, term_document)
"""
Inverse document Frequency IDF from the training data
"""
IDF = idf(num_of_lines,dictionary,term_document)
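# For reference, the classic definitions (the helpers in tfidf.py may
# implement a variant of these):
#   tf(t, d)    = count of term t in document d / number of terms in d
#   idf(t)      = log(N / df(t)), N = number of documents,
#                 df(t) = number of documents containing t
#   tfidf(t, d) = tf(t, d) * idf(t)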
"""
Sort the dictionary
"""
# Sort according to the TF-IDF values,
# sorting over the TF values of all our classes (documents).
# This may not be the best approach to the problem.
TF, IDF, dictionary = dictionary_sort(TF, IDF, dictionary)
"""
Downsize features
Discart the least important
"""
# discard that many elements from the end of TF IDF and dictionary
if discard > 0 :
TF = TF[:,:-discard]
IDF = IDF[:-discard]
dictionary = dictionary[:-discard]
'''
Create the TF-IDF matrix from the training data,
to be used with the SVD and the GMMs
'''
TFIDF = tfidf(TF, IDF)
'''
Singular Value Decomposition
'''
svd = TruncatedSVD(n_components=svd_components)
TFIDFsvd = svd.fit_transform(TFIDF)
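# Hint: `svd.explained_variance_ratio_.sum()` (a standard TruncatedSVD
# attribute) reports how much of the data's variance the kept components
# retain, which can help when tuning svd_components.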
'''
Extract the data of each class separately:
a GMM will be trained on each class's TF-IDF samples
'''
TFIDF_class = []
for class_num in range(1, 16):
    TFIDF_class.append(samples_from_class(TFIDFsvd, class_num, labels))
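# `samples_from_class` comes from the wildcard imports above; it is assumed
# to return the rows of TFIDFsvd whose label equals class_num, i.e. roughly:
#   np.array([row for row, lab in zip(TFIDFsvd, labels) if int(lab) == class_num])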
'''
GMM training
We train #classes = 15 GMMs to estimate the distribution of the features.
Each row of a class's TFIDF_class matrix is a feature vector on which
the corresponding GMM is trained.
'''
GMMS = []
for class_num in range(1, 16):
    # ATTENTION: the indexes of TFIDF_class go from 0 to 14,
    # whereas the class numbers go from 1 to 15
    GMMS.append(GaussianMixture(n_components=gmm_components).fit(TFIDF_class[class_num - 1]))
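# GaussianMixture is fitted with EM and defaults to full covariance
# matrices; if a class has few samples, passing covariance_type='diag'
# or a larger reg_covar (both standard sklearn options, not used here)
# can help avoid singular covariances.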
'''
Testing
'''
test_labels = []
with open('data/final.test') as test_file:
    testsamples = test_file.readlines()
    num_of_test_data = 0  # count the rows
    for line in testsamples:
        num_of_test_data += 1
        testwords = line.split()
        test_labels.append(testwords[0])
# the with statement closes the file automatically
'''
Find the term-document matrix from the test data
'''
test_term_document = term_doc('data/final.test', num_of_test_data, dictionary)
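# Note: the *training* dictionary is reused here, so the test vectors live
# in the same feature space as the vectors the SVD and the GMMs were fit on.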
"""
Find the term frequencies from the test data
"""
test_TF = tf(num_of_test_data, dictionary, test_term_document)
"""
Inverse document Frequency IDF from the test data
"""
test_IDF = idf(num_of_test_data,dictionary,test_term_document)
'''
Create the TF-IDF matrix from the test data
'''
test_TFIDF = tfidf(test_TF, test_IDF)
'''
Dimensionality Reduction for the test samples
'''
test_TFIDFsvd = svd.transform(test_TFIDF)
'''
For each test sample, find the log-likelihood under
each class's GMM
'''
# classloglike[class_idx] holds the log-likelihoods of all
# test samples under the GMM of class class_idx + 1
classloglike = []
# ATTENTION: the indexes of GMMS go from 0 to 14,
# whereas the class numbers go from 1 to 15
for class_idx in range(15):
    classloglike.append(GMMS[class_idx].score_samples(test_TFIDFsvd))
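# score_samples returns log p(x | class) for every test sample, so the
# argmax over the 15 classes below is a maximum-likelihood decision
# (equivalent to MAP classification under equal class priors).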
'''
ML classification
Classify each test sample to the class whose GMM gives it
the maximum log-likelihood
'''
# contains the decision indexes:
# np.argmax reduces the 15 x #samples array to one index per sample
classified_idx = np.argmax(classloglike, axis=0)
# convert the decision indexes to labels
# ATTENTION: the decision indexes go from 0 to 14,
# whereas the class numbers go from 1 to 15
classified_labels = [x + 1 for x in classified_idx]
'''
Accuracy
Compare the actual test labels with the labels decided during
classification and find the accuracy of the classification task
'''
# per-class counts of correct classifications and of class instances
class_accuracy = np.zeros(15)
total_class_instances = np.zeros(15)
accurate = 0
# ATTENTION: test_labels is a list of str, while classified_labels
# is a list of int; convert the ints to str before comparing
for classified, original in zip(map(str, classified_labels), test_labels):
    total_class_instances[int(original) - 1] += 1
    if classified == original:
        accurate += 1
        class_accuracy[int(original) - 1] += 1
percentage = accurate / len(classified_labels) * 100
print(f"Number of accurate classifications: {accurate}")
print(f"Accuracy level: {percentage} %")
print("Accuracy results for each class:")
for i in range(15):
    # i runs from 0 to 14, the class numbers from 1 to 15
    print(f"class {i + 1}: {class_accuracy[i] / total_class_instances[i] * 100} %")
# append the accuracy results to a csv file for later analysis
# (columns: discard; svd_components; gmm_components; accuracy %)
with open('results.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=';')
    writer.writerow([discard, svd_components, gmm_components, percentage])