-
Notifications
You must be signed in to change notification settings - Fork 0
/
Main.py
256 lines (215 loc) · 8.63 KB
/
Main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
from GUI import View
from operator import itemgetter
from ReadFile import ReadFile
from Parse import Parse
from Indexer import Indexer
import shutil
import Merge
import os
import ParallelMain
import Parse
from Searcher import Searcher
from gensim.models import Word2Vec
"""
~~~~~~~~~~~~~~~~~~~~~~~~ Module Description ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This module runs the whole program. It contains the Main class, which is
the controller of the MVC model for event-driven programming, and a
script that starts the project.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""
class Main:
    """
    Controller of the MVC model: wires the GUI (View) to the indexing
    components (ReadFile / Parse / Indexer / Merge, run in parallel by
    ParallelMain) and to the retrieval component (Searcher + Word2Vec).
    """

    def __init__(self):
        """Initialize all controller state with empty/default values."""
        self.main_path = ''             # corpus root directory (set by the GUI)
        self.posting_path = ''          # directory for posting files (set by the GUI)
        self.to_stem = False            # stemming flag (set by the GUI)
        self.indexer = None             # built by start() or load()
        self.reader = ReadFile()
        self.languages = set()          # languages found in the corpus
        self.searcher = None            # created by load()
        self.queries_docs_results = []  # accumulated (query_id, query, docs) tuples
        self.avg_doc_length = 0
        self.with_semantics = False     # semantic (Word2Vec) query-expansion flag
        self.save_path = ''             # directory for saving query results

    def start(self):
        """
        Build the index for the corpus under ``main_path``: run the parsing /
        indexing processes, merge their partial postings, and fill the Indexer
        with the final term / document / city dictionaries.
        """
        self.indexer = Indexer(self.posting_path)
        if self.to_stem:
            self.indexer.to_stem = True
        # NOTE(review): Windows-style '\\' separators are assumed throughout.
        dirs_list = os.listdir(self.main_path + '\\corpus')
        # Create temp postings with multiprocessing.  Each entry of dirs_dict
        # maps a postings dir to (post_files_lines, terms_dict, docs_dict, languages).
        dirs_dict = ParallelMain.start(self.main_path, self.posting_path, self.to_stem, dirs_list)
        # Merge the per-process document dictionaries and language sets.
        docs = {}
        files_names = []
        post_files_lines = []
        total_length = 0
        for dir in dirs_dict.keys():
            tmp_docs_dict = dirs_dict[dir][2]
            for doc_id in tmp_docs_dict:
                docs[doc_id] = tmp_docs_dict[doc_id]
                total_length += docs[doc_id].length
            for lang in dirs_dict[dir][3]:
                self.languages.add(lang)
            old_post_files_lines = dirs_dict[dir][0]
            for i in range(0, len(old_post_files_lines)):
                # Stemmed postings carry an 's' prefix in their file names.
                files_names.append(dir + "\\Posting" + str(i) if not self.to_stem else dir + "\\sPosting" + str(i))
                post_files_lines.append(old_post_files_lines[i])
        self.avg_doc_length = total_length / len(docs)
        # Gather the cities that appear in the corpus.
        i = 0
        while i < len(dirs_list):
            self.reader.read_cities(self.main_path + '\\corpus', dirs_list[i])
            i += 1
        # NOTE(review): assumes ParallelMain always produced exactly the four
        # directories '\\Postings1'..'\\Postings4' — confirm against ParallelMain.
        terms_dicts = [dirs_dict["\\Postings1"][1], dirs_dict["\\Postings2"][1], dirs_dict["\\Postings3"][1],
                       dirs_dict["\\Postings4"][1]]
        terms_dict = Merge.start_merge(files_names, post_files_lines, terms_dicts, self.posting_path, self.to_stem)
        self.indexer.docs_avg_length = self.avg_doc_length
        self.indexer.terms_dict = terms_dict
        self.indexer.docs_dict = docs
        self.indexer.index_cities(self.reader.cities)
        self.indexer.post_pointers(self.languages)

    def load(self):
        """
        Load a previously built index from the posting path into memory and
        construct the Searcher (including its Word2Vec model).
        """
        self.indexer = Indexer(self.posting_path)
        if self.to_stem:
            self.indexer.to_stem = True
        self.languages = self.indexer.load()
        self.avg_doc_length = self.indexer.docs_avg_length
        self.searcher = Searcher(self.main_path, self.posting_path, self.indexer.terms_dict, self.indexer.cities_dict,
                                 self.indexer.docs_dict, self.avg_doc_length, self.to_stem, self.with_semantics)
        self.searcher.model = Word2Vec.load(self.posting_path + '//model.bin')

    def reset(self):
        """Erase all files in the posting path and recreate it empty."""
        # Guard against a missing directory so reset() is idempotent
        # (the original crashed if the path did not exist yet).
        if os.path.exists(self.posting_path):
            shutil.rmtree(self.posting_path)
        if not os.path.exists(self.posting_path):
            os.makedirs(self.posting_path)
        self.indexer = None

    def get_terms_dict(self):
        """Return the terms dictionary (used by the GUI IndexView)."""
        return self.indexer.terms_dict

    def get_languages(self):
        """Return the set of corpus languages (used by the GUI IndexView)."""
        # NOTE(review): callers reportedly expect a '\n'-separated string,
        # but a set is returned — confirm against the GUI before changing.
        return self.languages

    def set_corpus_path(self, path):
        """Store the corpus path chosen in the GUI."""
        self.main_path = path

    def set_posting_path(self, path):
        """Store the posting path chosen in the GUI."""
        self.posting_path = path

    def set_stemming_bool(self, to_stem):
        """Store the stemming flag chosen in the GUI."""
        self.to_stem = to_stem

    def set_with_semantics(self, with_semantics):
        """Store the semantics flag; propagate it to the Searcher if one exists."""
        self.with_semantics = with_semantics
        # Guard: before load() there is no Searcher yet (fixes AttributeError).
        if self.searcher is not None:
            self.searcher.with_semantics = with_semantics

    def report(self):
        """Print corpus statistics and dump per-term frequencies to 'frequency.txt'."""
        num_count = 0
        freq = {}
        for term in self.indexer.terms_dict.keys():
            if Parse.isFloat(term):
                num_count += 1
            freq[term] = self.indexer.terms_dict[term][1]
        freq_list = sorted(freq.items(), key=itemgetter(1))
        # Text mode ('w'), not binary: str lines are written (fixes a Py3 TypeError).
        with open('frequency.txt', 'w') as f:
            for term, count in freq_list:
                f.write("{}: {}\n".format(term, count))
        # print() calls replace the Python-2-only print statements.
        print("Num of terms which are nums: " + str(num_count))
        print("Num of countries: " + str(len(self.indexer.countries)))
        print("Num of capitals: " + str(self.indexer.num_of_capitals))

    def set_save_path(self, dir_path):
        """Store the directory where query results are saved."""
        self.save_path = dir_path

    def save(self):
        """Append the accumulated query results to '[s][s]results.txt' in save_path."""
        file_name = ''
        if self.to_stem:
            file_name += 's'
        if self.with_semantics:
            file_name += 's'
        file_name = '\\' + file_name + 'results.txt'
        with open(self.save_path + file_name, 'a+') as f:
            for query_result in self.queries_docs_results:
                for doc in query_result[2]:
                    # TREC run format: qid, iteration, docno, rank, score, run tag.
                    line = " {} 0 {} 1 42.38 {}\n".format(query_result[0], doc[0], 'rg')
                    f.write(line)

    def get_cities_list(self):
        """Return the indexed city names, or None if no index is loaded."""
        if self.indexer is None:
            return None
        return self.indexer.cities_dict.keys()

    def start_query_search(self, query, chosen_cities):
        """Run a single free-text query through the Searcher."""
        return self.searcher.search(query, chosen_cities)

    def start_file_search(self, queries_path_entry, chosen_cities):
        """
        Parse a TREC-style topics file (<num>/<title>/<desc>/<narr> tags),
        run every query, and return the accumulated results list.
        """
        queries_list = []
        current_queries_results = []
        # Text mode so the tag comparisons below work on str values
        # (fixes a bytes-vs-str TypeError under Python 3).
        with open(queries_path_entry, 'r') as f:
            lines = f.readlines()
        query_id = 0  # renamed from 'id' to avoid shadowing the builtin
        i = 0
        query = ''
        while i < len(lines):
            if '<num>' in lines[i]:
                query_id = lines[i].split(':')[1].replace('\n', '')
            elif '<title>' in lines[i]:
                query = lines[i].replace('<title>', '').replace('\n', '')
            elif '<desc>' in lines[i]:
                i += 1
                # Concatenate description lines up to the <narr> tag;
                # the bounds check prevents running past end-of-file.
                while i < len(lines) and '<narr>' not in lines[i]:
                    query = '{} {}'.format(query, lines[i].replace('<title>', '').replace('\n', ''))
                    i += 1
                queries_list.append((query_id, query))
            i += 1
        for query_tuple in queries_list:
            docs_result = self.start_query_search(query_tuple[1], chosen_cities)
            tmp = (query_tuple[0], query_tuple[1], docs_result)
            current_queries_results.append(tmp)
            self.queries_docs_results.append(tmp)
        return self.queries_docs_results

    def get_doc_five_entities(self, doc_id):
        """Return the five dominant entities of the given document."""
        return self.searcher.docs_dict[doc_id].five_entities
"""
Script Description:
This script starts the program by initializing Main object, GUI IndexView object
and calling a function to open the index window.
"""
if __name__ == "__main__":
controller = Main()
view = View(controller)
view.start_index_view()