/
dev_testing.py
191 lines (152 loc) · 6.69 KB
/
dev_testing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import nltk
from nltk.metrics.scores import accuracy
import xml_parser
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
def dice_distance_m(label1, label2):
    """Return the Sorensen-Dice distance between two sets.

    The distance is ``1 - 2 * |A & B| / (|A| + |B|)``, which equals
    ``(|A| + |B| - 2 * |A & B|) / (|A| + |B|)``.

    Args:
        label1: First set of items.
        label2: Second set of items.

    Returns:
        A float in ``[0.0, 1.0]``: 0.0 for identical sets, 1.0 for disjoint
        sets. Two empty sets are treated as identical and yield 0.0.
    """
    total = len(label1) + len(label2)
    if total == 0:
        # Both sets are empty: nothing differs, and dividing would fail.
        return 0.0
    # |A| + |B| - 2|A & B| is the size of the symmetric difference; it must
    # not be doubled, otherwise the result spans [0, 2] instead of [0, 1].
    return (total - 2 * len(label1.intersection(label2))) / total
def dice_distance(data):
    """Score nearest-neighbour answer retrieval using dice_distance_m.

    Each test question is split on whitespace into a token set and compared
    with the token set of every training question. The answer of the closest
    training question is predicted, provided its distance is strictly below 1.

    Args:
        data: A 4-item sequence ``(q_train, a_train, q_test, a_test)``.
            ``q_train`` and ``a_train`` are used as parallel sequences
            (question text and its answer id), as are ``q_test`` and
            ``a_test``.

    Returns:
        The value of ``accuracy(a_test, predictions)`` from
        ``nltk.metrics.scores``.
    """
    q_train, a_train, q_test, a_test = data

    # Tokenise the training questions once instead of once per test question.
    train_tokens = [set(question.split()) for question in q_train]

    best_answers = []
    for test_q in q_test:
        best = 1
        # Fallback prediction when no training question is close enough.
        # NOTE(review): tfidf_cosine uses the string '0' as its fallback;
        # confirm which type the reference labels in a_test actually use.
        a_id = 0
        test_tokens = set(test_q.split())
        # An empty test question shares no token with anything, so it can
        # never beat the initial distance; skip the comparisons entirely.
        if test_tokens:
            for tokens, answer in zip(train_tokens, a_train):
                if not tokens:
                    continue
                res_aux = dice_distance_m(test_tokens, tokens)
                if res_aux < best:
                    best = res_aux
                    a_id = answer
        best_answers.append(a_id)
    return accuracy(a_test, best_answers)
def jaccard_distance(data):
    """Score nearest-neighbour answer retrieval using the Jaccard distance.

    Each test question is split on whitespace into a token set and compared,
    via ``nltk.jaccard_distance``, with the token set of every training
    question. The answer of the closest training question is predicted,
    provided its distance is strictly below 1.

    Args:
        data: A 4-item sequence ``(q_train, a_train, q_test, a_test)``.
            ``q_train`` and ``a_train`` are used as parallel sequences
            (question text and its answer id), as are ``q_test`` and
            ``a_test``.

    Returns:
        The value of ``accuracy(a_test, predictions)`` from
        ``nltk.metrics.scores``.
    """
    q_train, a_train, q_test, a_test = data

    # Tokenise the training questions once instead of once per test question.
    train_tokens = [set(question.split()) for question in q_train]

    best_answers = []
    for test_q in q_test:
        best = 1
        # Fallback prediction when no training question is close enough.
        # NOTE(review): tfidf_cosine uses the string '0' as its fallback;
        # confirm which type the reference labels in a_test actually use.
        a_id = 0
        test_tokens = set(test_q.split())
        # An empty test question has distance 1 to every non-empty question,
        # which never beats the initial value; skip the comparisons.
        if test_tokens:
            for tokens, answer in zip(train_tokens, a_train):
                if not tokens:
                    continue
                res_aux = nltk.jaccard_distance(test_tokens, tokens)
                if res_aux < best:
                    best = res_aux
                    a_id = answer
        best_answers.append(a_id)
    return accuracy(a_test, best_answers)
def tfidf_cosine(data):
    """Score answer retrieval using TF-IDF vectors and cosine similarity.

    For every test question a TF-IDF matrix is fitted on the training
    questions plus that single test question. The training question with the
    highest cosine similarity to the query supplies the predicted answer,
    unless that similarity is below 0.25, in which case '0' is predicted.

    Args:
        data: A 4-item sequence ``(q_train, a_train, q_test, a_test)``.
            ``q_train`` must support slicing and ``a_train`` is indexed by
            the position of the best-matching training question.

    Returns:
        The value of ``accuracy(a_test, predictions)`` from
        ``nltk.metrics.scores``.
    """
    q_train, a_train, q_test, a_test = data

    min_similarity = 0.25   # below this the query counts as not recognised
    unknown_answer = '0'    # prediction recorded for unrecognised queries

    best_answers = []
    # Working copy: the current query is appended as the last row and removed
    # again after vectorising, so the caller's list is never modified.
    sentences = q_train[:]
    for test_q in q_test:
        sentences.append(test_q)
        # TF-IDF weighted document-term matrix:
        # one sentence per row, one term per column.
        tfidf = TfidfVectorizer().fit_transform(sentences)
        sentences.pop()

        # Similarity between the query (last row) and every training row.
        vals = cosine_similarity(tfidf[-1], tfidf[:-1])[0]
        nearest = vals.argmax()

        if vals[nearest] < min_similarity:
            best_answers.append(unknown_answer)
        else:
            best_answers.append(a_train[nearest])
    return accuracy(a_test, best_answers)
def user_op():
    """Prompt on standard input until a menu option from 0 to 12 is entered.

    Input that is not an integer, or that lies outside 0..12, is rejected and
    the prompt is shown again.

    Returns:
        The selected option as an int between 0 and 12 inclusive.
    """
    while True:
        try:
            opcao = int(input("Introduza um opcao : "))
        except ValueError:
            # Non-numeric input: ask again instead of aborting the menu.
            continue
        if 0 <= opcao <= 12:
            return opcao
def test_algoritm():
    """Run the interactive menu for evaluating one metric at a time.

    Loads the documents from 'KB.xml', prepares four train/test data sets with
    xml_parser.get_train_test, then repeatedly prints the menu, reads an
    option through user_op and prints the accuracy of the chosen metric on the
    chosen data set.

    Returns:
        0 once the user selects option 0.
    """
    documents = xml_parser.get_documents_xml_file('KB.xml')
    faqs = xml_parser.get_all_documents_content(documents)

    # The two flags are passed positionally; going by the parameter names
    # used in av_acc they are (noStop, Stem) -- confirm against xml_parser.
    # stop words removed, no stemming
    data_w = xml_parser.get_train_test(faqs, True, False)
    # stop words removed, with stemming
    data_s_w = xml_parser.get_train_test(faqs, True, True)
    # stop words kept, no stemming
    data = xml_parser.get_train_test(faqs, False, False)
    # stop words kept, with stemming
    data_s = xml_parser.get_train_test(faqs, False, True)

    menu_lines = (
        "========== Lista de op algoritmos para testar ================ ",
        " Jaccard Distance : Op 1 ",
        " Jaccard Distance (with Stemming) : Op 2",
        " Dice Distance : Op 3",
        " Dice Distance (with Stemming) : Op 4",
        " TF-IDF and Cosine Similarity : Op 5",
        " TF-IDF and Cosine Similarity (with Stemming) : Op 6",
        " Jaccard Distance (with no Stopwords): Op 7 ",
        " Jaccard Distance (with Stemming and no Stopwords) : Op 8",
        " Dice Distance (with no Stopwords): Op 9",
        " Dice Distance (with Stemming and no Stopwords) : Op 10",
        " TF-IDF and Cosine Similarity (with no Stopwords): Op 11",
        " TF-IDF and Cosine Similarity (with Stemming and no Stopwords) : Op 12",
        " Sair : 0",
    )

    # Menu option -> (metric function, data set it is evaluated on).
    actions = {
        1: (jaccard_distance, data),
        2: (jaccard_distance, data_s),
        3: (dice_distance, data),
        4: (dice_distance, data_s),
        5: (tfidf_cosine, data),
        6: (tfidf_cosine, data_s),
        7: (jaccard_distance, data_w),
        8: (jaccard_distance, data_s_w),
        9: (dice_distance, data_w),
        10: (dice_distance, data_s_w),
        11: (tfidf_cosine, data_w),
        12: (tfidf_cosine, data_s_w),
    }

    while True:
        for line in menu_lines:
            print(line)

        selected = user_op()
        if selected == 0:
            return 0

        if selected in actions:
            metric, dataset = actions[selected]
            print("Accuracy : ", metric(dataset))
#test_algoritm()
def get_average_accuracy():
    """Print the accuracy of jaccard_distance over ten evaluation runs.

    Loads the documents from 'KB.xml', calls
    xml_parser.get_train_test(faqs, False, False) once per run, evaluates
    jaccard_distance on each result, and prints the list of accuracies
    followed by their mean. Nothing is returned.
    """
    runs = 10  # single source of truth for both the loop and the average

    documents = xml_parser.get_documents_xml_file('KB.xml')
    faqs = xml_parser.get_all_documents_content(documents)

    print("\nJaccard Distance\n")
    # presumably get_train_test produces a different split on every call,
    # which is what makes averaging meaningful -- verify in xml_parser.
    results = [
        jaccard_distance(xml_parser.get_train_test(faqs, False, False))
        for _ in range(runs)
    ]
    print(results)
    print("\nAverage Accuracy : ", sum(results) / runs, "\n")
def av_acc(metric, faqs, noStop, Stem, num_acc=10):
    """Evaluate a metric several times and return its mean accuracy.

    For each run a data set is obtained from
    xml_parser.get_train_test(faqs, noStop, Stem), passed to ``metric``, and
    the resulting accuracy is printed. The mean is printed after the last run.

    Args:
        metric: Callable taking the data returned by get_train_test and
            returning a numeric accuracy.
        faqs: Document content forwarded unchanged to get_train_test.
        noStop: Forwarded as the second argument of get_train_test.
        Stem: Forwarded as the third argument of get_train_test.
        num_acc: Number of evaluation runs to average over (default 10).

    Returns:
        The mean of the accuracies over ``num_acc`` runs.

    Raises:
        ValueError: If ``num_acc`` is smaller than 1.
    """
    if num_acc < 1:
        # A mean over zero runs is undefined; fail before doing any work.
        raise ValueError("num_acc must be at least 1")

    results = []
    for run in range(1, num_acc + 1):
        data = xml_parser.get_train_test(faqs, noStop, Stem)
        acc = metric(data)
        print(run, "\t Accuracy : {:.4f}".format(acc))
        results.append(acc)

    average = sum(results) / num_acc
    print("\nAverage Accuracy : {:.4f} \n".format(average))
    return average
# Metrics under evaluation and the labels printed for them (parallel lists).
funcs = [jaccard_distance, dice_distance, tfidf_cosine]
names = ['jaccard_distance', 'dice_distance', 'tfidf_cosine']

result = xml_parser.get_documents_xml_file('KB.xml')
faqs = xml_parser.get_all_documents_content(result)

# Report the averaged accuracy of every metric for each combination of the
# two pre-processing flags.
# NOTE(review): this runs whenever the module is imported; there is no
# `if __name__ == "__main__":` guard -- confirm that this is intended.
for noStop in (True, False):
    for Stem in (True, False):
        for name, metric in zip(names, funcs):
            print("\n{}\tnoStop = {} \tStem = {} \n".format(name, noStop, Stem))
            av_acc(metric, faqs, noStop, Stem)