/
gerador_lista_invertida.py
148 lines (112 loc) · 4.89 KB
/
gerador_lista_invertida.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import configparser
from collections import OrderedDict
from lxml import etree as ET
from xml.dom import minidom
import codecs
import nltk
import logging
import time
import unidecode
import re
import csv
# Dictionary type that lets the configuration file repeat a key
# (for example several "LEIA" lines) without losing the earlier values.
class MultiOrderedDict(OrderedDict):
    """OrderedDict variant that accumulates repeated list-valued keys."""

    def __setitem__(self, key, value):
        """Store *value* under *key*.

        When *value* is a list and *key* is already present, the existing
        entry is extended in place; in every other case the value is stored
        (or overwritten) as usual.
        """
        if isinstance(value, list) and key in self:
            self[key].extend(value)
            return
        # super(OrderedDict, self) starts the method lookup *after*
        # OrderedDict in the MRO, so OrderedDict's own __setitem__ is skipped.
        # NOTE(review): ler_arquivo_clg indexes the values returned by
        # config.get() as lists, which may depend on this bypass -- confirm
        # before "fixing" it to super().__setitem__.
        super(OrderedDict, self).__setitem__(key, value)
class word_document:
    """One inverted-index entry: a word and the documents recorded for it."""

    def __init__(self, word):
        """Create an entry for *word* with an empty document list."""
        # Each instance owns its own list, so entries never share documents.
        self.documents = []
        self.word = word

    def add_document(self, document):
        """Append *document* to this entry's list; duplicates are kept."""
        self.documents.append(document)
def remover_acentos(txt):
    """Return *txt* after passing it through ``unidecode.unidecode``.

    The name is Portuguese for "remove accents".
    """
    transliterated = unidecode.unidecode(txt)
    return transliterated
# Anything that is not an upper-case ASCII letter, a space or an apostrophe
# separates words (same character class as the original inline pattern).
_NON_WORD_CHARS = re.compile(r"[^A-Z ']+")


def _config_values(config, option):
    """Return the values of *option* in the DEFAULT section as a list.

    ``config.get`` may hand back either a list of values or a single string
    with one value per line; both shapes are normalised to a list so callers
    can always index and iterate the result.
    """
    raw = config.get("DEFAULT", option)
    if isinstance(raw, str):
        return raw.splitlines()
    return list(raw)


def _make_normalizer(use_stemmer):
    """Return the function that maps a token to the term stored in the index."""
    if not use_stemmer:
        def normalize(word):
            return word
        return normalize

    # nltk.stem is a module, not a callable: a stemmer object is required.
    porter = nltk.stem.PorterStemmer()

    def normalize(word):
        # The Porter stemmer lower-cases its output; terms are kept upper-case
        # so stemmed and unstemmed indexes use the same casing.
        return porter.stem(word).upper()
    return normalize


def _record_text(record):
    """Return the ABSTRACT text of *record*, else its EXTRACT text, else None."""
    for tag in ("ABSTRACT", "EXTRACT"):
        nodes = record.getElementsByTagName(tag)
        first = nodes[0].firstChild if nodes else None
        text = first.nodeValue if first is not None else None
        if text:
            return text
    return None


def _per_second(count, seconds):
    """Return count / seconds, or 0.0 when no measurable time elapsed."""
    return count / seconds if seconds > 0 else 0.0


def ler_arquivo_clg():
    """Build the inverted list described by GLI.CFG and write it as CSV.

    Reads the LEIA (input XML files), ESCREVE (output CSV) and STEMMER options
    from the DEFAULT section of GLI.CFG, validates every input file against
    db/cfc-2.dtd, and indexes the words of each RECORD's ABSTRACT (or EXTRACT
    when there is no usable ABSTRACT) by RECORDNUM.  Each CSV row holds a word
    and the list of record numbers where it occurs, one entry per occurrence.

    Raises:
        configparser.NoOptionError: if LEIA, ESCREVE or STEMMER is missing.
        OSError: if the DTD, an input file or the output file cannot be opened.
    """
    logging.info("Program started!")

    # Read the input ("LEIA") and output ("ESCREVE") settings.
    config = configparser.RawConfigParser(strict=False, dict_type=MultiOrderedDict)
    logging.info("Reading GLI.CFG")
    config.read(['GLI.CFG'])
    entradas = _config_values(config, "LEIA")
    saida = _config_values(config, "ESCREVE")
    stemmer_config = _config_values(config, "STEMMER")
    use_stemmer = bool(stemmer_config) and stemmer_config[0] == 'true'
    normalize = _make_normalizer(use_stemmer)
    logging.info("GLI.CFG has been read")

    # Load the DTD used to validate every input file.  A forward slash works
    # on Windows as well as POSIX, and the handle is closed once parsed.
    logging.info("Reading cfc-2.dtd")
    with codecs.open('db/cfc-2.dtd') as dtd_file:
        dtd = ET.DTD(dtd_file)
    logging.info("cfc-2.dtd read")

    logging.info("Starting reading xml")
    # term -> word_document; a dict keeps first-occurrence order and replaces
    # the per-token linear scan over all entries.
    index = {}
    record_count = 0
    begin_time = time.perf_counter()
    for entrada in entradas:
        logging.info("Reading %s xml file", entrada)
        # lxml is used only for DTD validation; extraction uses minidom.
        root = ET.parse(entrada)
        if not dtd.validate(root):
            logging.info("%s xml file didn't pass on dtd validation", entrada)
            continue

        xmldoc = minidom.parse(entrada)
        for record in xmldoc.getElementsByTagName('RECORD'):
            recordnum = int(
                record.getElementsByTagName('RECORDNUM')[0].firstChild.nodeValue)
            text = _record_text(record)
            if text is None:
                # Neither ABSTRACT nor EXTRACT has text: nothing to index.
                continue
            record_count += 1
            for word in _NON_WORD_CHARS.sub(" ", text.upper()).split():
                term = normalize(word)
                entry = index.get(term)
                if entry is None:
                    entry = index[term] = word_document(term)
                entry.documents.append(recordnum)
    total_time = time.perf_counter() - begin_time

    words_documents = list(index.values())
    logging.info("Inverted list created a list with %s words", len(words_documents))
    logging.info("Inverted list made %s words per second",
                 _per_second(len(words_documents), total_time))
    # "Documents" are the indexed records, not the number of input files.
    logging.info("Inverted list made %s documents per second",
                 _per_second(record_count, total_time))

    logging.info("Writing on csv")
    with open(saida[0], 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=';',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerows([wd.word, wd.documents] for wd in words_documents)
    logging.info("Finished!")
# Send INFO-level records to the log file as tab-separated
# "time<TAB>level<TAB>message" lines, then run the generator.
# The forward slash avoids the invalid "\g" escape sequence and is accepted
# on Windows as well as POSIX; the "log" directory must already exist.
logging.basicConfig(
    filename='log/gerador_lista_invertida.log',
    level=logging.INFO,
    format='%(asctime)s\t%(levelname)s\t%(message)s',
)
ler_arquivo_clg()