# forked from mattyws/tcc_deep
# TestLSTMMongoDBSection.py
'''
Script used to test a model for the IPC Section level.
The script loads a Keras trained model.
'''
import os
import pickle

import numpy
import numpy as np
import pandas as pd
# BUG FIX: sklearn.metrics.classification is a private module that was
# removed in scikit-learn 0.24 — import the public sklearn.metrics path.
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

import DeepLearning as dl
from DeepLearning.database import MongoLoadDocumentMeta, MongoDBMetaEmbeddingGenerator
from DeepLearning.helper import TimerCounter, classMap
'''
Configurations
'''
# Shape of the embedding matrix fed to the network: maxWords time steps,
# each an embeddingSize-dimensional word vector.
maxWords = 150
embeddingSize = 200
timer = TimerCounter() # Timer to count how long it takes to perform each process
# MongoDB collection names holding the pre-computed document embeddings.
training_documents_collection = 'shuffled_training_embedding_old'
testing_documents_collection = 'testing_embedding_old'
# Path of the previously trained Keras model and where results are written.
model_saved_name = "../TrainedLSTM/keras_rnn_shuffled_old.model"
result_directory = "../TrainedLSTM/results/keras_rnn_shuffled_old/"
result_file_name = "result_rnn_word2vec_shuffled"
epochs = 12
layers = 2
# Training accuracy recorded over training, used for the plot below.
# NOTE(review): 20 values are listed but epochs = 12 — confirm which
# training run these numbers belong to.
training_acc_overtime = [0.3379, 0.3939, 0.4105, 0.4222, 0.4398, 0.4620, 0.4766, 0.4856, 0.4945, 0.5007, 0.5097, 0.5177,
                    0.5269, 0.5274, 0.5321, 0.5390, 0.5403, 0.5436, 0.5465, 0.5484]
# BUG FIX: os.mkdir fails when an intermediate directory is missing, and
# the exists()/mkdir() pair races with concurrent runs. makedirs with
# exist_ok=True handles both cases atomically.
os.makedirs(result_directory, exist_ok=True)
mongodb = MongoLoadDocumentMeta('patents')
# NOTE(review): the collection name is hard-coded here while
# training_documents_collection is defined above and never used to load
# documents — confirm which collection is intended.
documents = mongodb.get_all_meta('training_docs100')
result_string = ""
print("=============================== Filtering data and performing operations ===============================")
# Gets the first letter of the first IPC class and adds it to the
# ipc_sections set: the set of IPC sections present in the training data.
ipc_sections = set()
for doc in documents:
    if len(doc['ipc_classes']) > 0:
        ipc_sections.add(doc['ipc_classes'][0][0])
    else:
        # Documents with no IPC class cannot be labeled; log their filenames.
        print(doc['filename'])
print(ipc_sections)
ipc_sections = list(ipc_sections)
# Map each IPC section letter to a numeric index (classes are ordered inside
# the classMap helper). The mapping matters because Keras works with integer
# class identifiers.
class_map = classMap(ipc_sections)
ipc_sections.sort()
embedding_generator = MongoDBMetaEmbeddingGenerator(documents, "section", class_map, len(ipc_sections), serve_forever=True)
print("=============================== Create training classes ===============================")
# Build a factory for a model adapter: a multilayer recurrent network whose
# input is a (maxWords, embeddingSize) embedding matrix and whose output is
# one neuron per IPC section.
model_factory = dl.factory.factory.create('MultilayerKerasRecurrentNN', input_shape=(maxWords, embeddingSize),
                                          numNeurouns=len(ipc_sections), numOutputNeurons=len(ipc_sections), layers=layers)
model = model_factory.create()
# Replace the freshly created model with the weights trained previously.
model = model.load(model_saved_name)
# Getting the test documents collection.
test_documents = mongodb.get_all_meta(testing_documents_collection)
# NOTE(review): test_embedding_generator is created but not used below (the
# loop that consumed it is gone) — confirm it can be removed.
test_embedding_generator = MongoDBMetaEmbeddingGenerator(test_documents, "section", class_map, len(ipc_sections))
print("=============================== Predicting test data ===============================")
# Predict the IPC section for every document in the test collection.
real = []       # ground-truth section letters
all_class = []  # full IPC class lists, kept for potential per-class analysis
pred = []       # predicted section letters
for doc in test_documents:
    # SECURITY NOTE(review): pickle.loads on database content executes
    # arbitrary code if the collection can contain untrusted data; this
    # assumes the embeddings were written by the training pipeline itself.
    result = model.predict_one(pickle.loads(doc['embedding']))
    pred.append(class_map[result])  # map the numeric prediction back to a section letter
    real.append(doc['ipc_classes'][0][0])  # the section is the first letter of the first IPC class
    all_class.append(doc['ipc_classes'])
print(pred)
print(real)
# Calculating the metrics: F1, Precision, Accuracy and Recall — both the
# weighted averages and the per-class values.
accuracy = accuracy_score(real, pred)
recall = recall_score(real, pred, average='weighted')
recall_per_class = recall_score(real, pred, average=None)
precision = precision_score(real, pred, average='weighted')
precision_per_class = precision_score(real, pred, average=None)
f1 = f1_score(real, pred, average='weighted')
f1_per_class = f1_score(real, pred, average=None)
# Collect [recall, precision, f1] per class, keyed by the section letter
# (class_map maps the per-class metric index back to its section).
results_per_class = dict()
for i, (rec, prec, fsc) in enumerate(zip(recall_per_class, precision_per_class, f1_per_class)):
    results_per_class.setdefault(class_map[i], []).extend([rec, prec, fsc])
# BUG FIX: list.sort() sorts in place and returns None, so the original
# `labels=ipc_sections.sort()` passed labels=None. ipc_sections was already
# sorted above — pass the list itself.
matrix = confusion_matrix(real, pred, labels=ipc_sections)
# Plotting: training accuracy over time, and per-class metrics as a bar chart.
ts = pd.Series(training_acc_overtime, index=range(len(training_acc_overtime)))
# BUG FIX: Series.plot() does not take axis labels through x=/y= (those
# keywords select data and are ignored for a Series) — set the labels on
# the returned Axes instead.
plot = ts.plot(ylim=(0, 1.0))
plot.set_xlabel('Iteração')
plot.set_ylabel('Acurácia')
fig = plot.get_figure()
fig.savefig(result_directory + "training_acc_overtime.png")
# One row per IPC section, columns recall / precision / f-score.
df2 = pd.DataFrame([results_per_class[x] for x in ipc_sections], index=ipc_sections,
                   columns=['Recall', 'Precisão', 'F-Score'])
plot = df2.plot.bar()
fig = plot.get_figure()
fig.savefig(result_directory + "result_per_class.png")
print("Accuracy " + str(accuracy), "Recall " + str(recall), "Precision " + str(precision), "F1 " + str(f1))
result_string += "Accuracy " + str(accuracy) + " Recall " + str(recall) + " Precision " + str(precision) + " F1 " + str(f1) + "\n"
# Write the run configuration and metrics to the result file. Use a context
# manager so the handle is closed even if one of the writes fails (the
# original open()/close() pair leaked the handle on error).
with open(result_directory + result_file_name, "w") as f:
    f.write("Database: " + training_documents_collection + "\n")
    f.write("embedding matrix: " + str(maxWords) + "x" + str(embeddingSize) + "\n")
    f.write("epochs: " + str(epochs) + "\n")
    f.write("layers : " + str(layers) + "\n")
    f.write(result_string)