# main file for IR course project
import ReadFile
import time
import traceback
import os
import Indexer
from Tkinter import *
import tkFileDialog
import tkMessageBox
from sys import stdout
from datetime import datetime
import CacheTerm
import TermDict
import shared_variables as globs
from DocnoParser import DocnoParser
import cPickle as Pickle
from Searcher import Searcher
import re
import string
import wikipedia
from numpy import random
STOP_WORDS_FILE_NAME = 'stop_words.txt'
TEMP_POSTING_PREFIX = 'temp_posting_'
POSTING_PREFIX = 'posting_'
STEMMING_PREFIX = 'stemmed_'
DICTIONARY_PREFIX = 'terms_dict'
CACHE_PREFIX = 'cache'
DOCUMENTS_PREFIX = 'documents_dict'
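# The prefixes above are concatenated when artifacts are written to disk (see the
# write_*_to_disk helpers and load_dict_cache below): a stemmed run produces files
# such as 'stemmed_terms_dict', 'stemmed_cache' and 'stemmed_documents_dict' in the
# chosen directory, while an unstemmed run drops the 'stemmed_' prefix.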
def gui():
# run the main gui of the program
global docs_path
global postings_path
global is_stemming
global dict_cache_path
global is_docno
global is_expansion
global manual_query_path
global query_guies
root = Tk()
root.title('Information Retrieval BGU')
query_guies = []
# create all buttons and features of the program
run_full_index_button = Button(root, text="Run Entire Index", command=run_index)
run_full_index_button.grid(row=0, sticky='E')
docs_label = Label(text='Corpus + Stop Words Directory')
docs_label.grid(row=1, column=0)
docs_path = StringVar(value='C:/')
docs_entry = Entry(textvariable=docs_path, width=50)
docs_entry.grid(row=1, column=1)
doc_path_button = Button(text="Browse", command=browse_docs)
doc_path_button.grid(row=1, column=2)
postings_label = Label(text='Postings + Dictionary + Cache Directory')
postings_label.grid(row=2, column=0)
postings_path = StringVar(value='C:/')
postings_entry = Entry(textvariable=postings_path, width=50)
postings_entry.grid(row=2, column=1)
postings_path_button = Button(text="Browse", command=browse_postings)
postings_path_button.grid(row=2, column=2)
dict_cache_path = StringVar(value='C:/')
is_stemming = BooleanVar()
stem_button = Checkbutton(root, text="Stem?", variable=is_stemming)
stem_button.grid(row=3, columnspan=2)
reset_button = Button(text="Reset", command=reset, fg='red')
reset_button.grid(row=4, column=0)
show_cache_button = Button(text="Show Cache", command=show_cache)
show_cache_button.grid(row=5, column=0)
show_dict_button = Button(text="Show Dictionary", command=show_dict)
show_dict_button.grid(row=6, column=0)
save_dict_cache_button = Button(text="Save Dictionary & Cache", command=save_dict_cache)
save_dict_cache_button.grid(row=7, column=0)
load_dict_cache_button = Button(text="Load Dictionary & Cache", command=load_dict_cache)
load_dict_cache_button.grid(row=8, column=0)
manual_query_label = Label(text='Manual Query')
manual_query_label.grid(row=9, column=0)
manual_query_path = StringVar(value='')
manual_query_entry = Entry(textvariable=manual_query_path, width=50)
manual_query_entry.grid(row=9, column=1)
manual_query_button = Button(text="Run", command=run_manual_query)
manual_query_button.grid(row=9, column=2)
file_query_label = Label(text='File Query')
file_query_label.grid(row=10, column=0)
file_query_button = Button(text="Browse", command=run_file_query)
file_query_button.grid(row=10, column=1)
is_expansion = BooleanVar()
is_expansion_button = Checkbutton(root, text="Expansion?", variable=is_expansion)
is_expansion_button.grid(row=11, column=0)
is_docno = BooleanVar()
is_docno_button = Checkbutton(root, text="Docno?", variable=is_docno)
is_docno_button.grid(row=11, column=1)
reset_part_two_button = Button(text="Reset Part Two", fg='red', command=reset_part_two)
reset_part_two_button.grid(row=12, column=0)
quit_button = Button(text='QUIT', fg='red', command=root.destroy)
quit_button.grid(row=13, column=0)
root.mainloop()
def run_index():
# run an entire index build
global docs_path
global postings_path
global is_stemming
global indexer
global dict_cache_path
try:
# check validation conditions
if (not check_corpus_directory(docs_path.get())) or (not check_postings_directory(postings_path.get())):
return
result = tkMessageBox.askquestion("Run Index",
"Are you sure?\n dont worry if the GUI"
" is stuck or not responding - it is working", icon='warning')
if result != 'yes':
return
print ('START TIME - ' + time.strftime("%H:%M:%S"))
start_time = datetime.now()
# reset the current memory of the project
if (globs.main_dictionary is not None) and (bool(globs.main_dictionary)):
globs.main_dictionary.clear()
if (globs.cache is not None) and (bool(globs.cache)):
globs.cache.clear()
if (globs.documents_dict is not None) and (bool(globs.documents_dict)):
globs.documents_dict.clear()
# start indexing
globs.stop_words = load_stop_words(docs_path.get())
indexer = Indexer.Indexer(postings_path.get(), is_stemming.get())
read_file = ReadFile.ReadFile(get_corpus_dir(docs_path.get()),
indexer, globs.constants, globs.stop_words, is_stemming.get())
read_file.index_folder()
globs.num_of_documents = len(read_file.documents_dict)
globs.documents_dict = read_file.documents_dict
del read_file
indexer.unite_temp_postings()
globs.main_dictionary = indexer.main_dict
indexer.build_document_weight(globs.documents_dict)
        # to print term statistics (tf/df per term), uncomment this block
# with open('{}{}'.format('stats', 'stem' if is_stemming.get() else ''),'w') as my_stats_file:
# my_stats_file.write('term,tf,df\n'.format())
# for key,val in main_dictionary.iteritems():
# my_stats_file.write('{},{},{}\n'.format(key,val.tf,val.df))
globs.cache = indexer.cache_dict
globs.average_doc_size = globs.average_doc_size/globs.num_of_documents
dict_cache_path = postings_path
print ('END TIME - ' + time.strftime("%H:%M:%S"))
end_time = datetime.now()
print_stats_at_end_of_indexing(end_time - start_time)
except Exception as err:
tkMessageBox.showinfo('ERROR', err)
traceback.print_exc(file=stdout)
def print_stats_at_end_of_indexing(time_delta):
# print the stats of the indexing session
global indexer
index_size = indexer.total_byte_size
cache_size = len(str(globs.cache))
tkMessageBox.showinfo('NICE',
'{} docs were indexed.\n{} is the size of index in bytes.\n{} is the cache size in bytes.'
'\n{} is the number of seconds it took for the indexing'
.format(globs.num_of_documents, index_size, cache_size, time_delta.seconds))
def get_corpus_dir(path):
# return the directory of the corpus
files = os.listdir(path)
for file_name in files:
corpus_dir = os.path.join(path, file_name)
if os.path.isdir(corpus_dir):
return corpus_dir
def check_corpus_directory(path):
# check validation conditions for the corpus directory
if (not check_if_directory(path)) or \
(not check_if_stop_words_file_exists(path))\
or (not check_if_one_dir(path)) or (not check_if_more_than_two_files(path)):
return False
return True
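# The corpus directory check above assumes this layout: a stop_words.txt file plus
# a single corpus sub-directory, and no more than two entries in total (enforced by
# the helper checks below).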
def check_postings_directory(path):
# check validation conditions for the postings directory
if not check_if_directory(path):
return False
return True
def check_if_directory(dir_path):
# check if a path is a valid directory
is_dir = os.path.isdir(dir_path)
if not is_dir:
tkMessageBox.showinfo('ERROR', 'stop_words & corpus directory - {} is not a valid directory'.format(dir_path))
return is_dir
def check_if_stop_words_file_exists(stop_words_path):
# check if the stop_words file exists in a given directory
is_stop_words = os.path.isfile(os.path.join(stop_words_path, STOP_WORDS_FILE_NAME))
if not is_stop_words:
tkMessageBox.showinfo(
'ERROR', '{} does not exist in {} directory'.format(STOP_WORDS_FILE_NAME, stop_words_path))
return is_stop_words
def check_if_one_dir(path):
# check if a directory exists in some path
files = os.listdir(path)
for one_file in files:
if os.path.isdir(os.path.join(path, one_file)):
return True
tkMessageBox.showinfo(
'ERROR', 'There is no directory in {}'.format(path))
return False
def check_if_more_than_two_files(path):
# check if there are more than two files
files = os.listdir(path)
count = 0
for one_file in files:
count += 1
if count > 2:
            tkMessageBox.showinfo('ERROR', 'There are more than two files in {}'.format(path))
return False
return True
def browse_docs():
# assign the documents/corpus path
global docs_path
docs_dir = tkFileDialog.askdirectory()
if docs_dir is None or docs_dir == '':
return
docs_path.set(docs_dir)
def browse_postings():
    # assign the postings path
global postings_path
postings_dir = tkFileDialog.askdirectory()
if postings_dir is None or postings_dir == '':
return
postings_path.set(postings_dir)
def reset():
    # reset the in-memory data and delete the contents of the postings directory
global postings_path
try:
result = tkMessageBox.askquestion("Reset",
"Are you sure you want to reset?\n", icon='warning')
if result != 'yes':
return
if (globs.main_dictionary is not None) and (bool(globs.main_dictionary)):
globs.main_dictionary.clear()
if (globs.cache is not None) and (bool(globs.cache)):
globs.cache.clear()
if (globs.documents_dict is not None) and (bool(globs.documents_dict)):
globs.documents_dict.clear()
if check_postings_directory(postings_path.get()):
for del_file in os.listdir(postings_path.get()):
del_file_path = os.path.join(postings_path.get(), del_file)
os.unlink(del_file_path)
tkMessageBox.showinfo('OK', 'folder was deleted and memory has been reset')
except Exception as err:
tkMessageBox.showinfo('ERROR', err)
traceback.print_exc(file=stdout)
def show_dict():
# show main dictionary
global term_name_listbox
global tf_listbox
try:
if (globs.main_dictionary is None) or (not bool(globs.main_dictionary)):
tkMessageBox.showinfo('ERROR', 'currently, no dictionary is loaded')
return
main_dict_gui = Toplevel()
main_dict_gui.title("Main Dictionary")
scrollbar = Scrollbar(main_dict_gui)
scrollbar.pack(side=RIGHT, fill=Y)
term_name_listbox = Listbox(main_dict_gui, yscrollcommand=scrollbar.set)
term_name_listbox.insert(END, 'Term Name')
tf_listbox = Listbox(main_dict_gui, yscrollcommand=scrollbar.set)
tf_listbox.insert(END, 'Term Frequency')
for term_name, term_dict_item in sorted(globs.main_dictionary.iteritems()):
term_name_listbox.insert(END, str(term_name))
tf_listbox.insert(END, str(term_dict_item.tf))
term_name_listbox.pack(side=LEFT, fill=BOTH, expand=True)
term_name_listbox.bind("<MouseWheel>", main_dictionary_mutual_mouse_wheel)
tf_listbox.pack(side=LEFT, fill=BOTH, expand=True)
tf_listbox.bind("<MouseWheel>", main_dictionary_mutual_mouse_wheel)
scrollbar.config(command=main_dictionary_mutual_scroll)
except Exception as err:
tkMessageBox.showinfo('ERROR', err)
traceback.print_exc(file=stdout)
def main_dictionary_mutual_scroll(*args):
# force both listboxes to be scrolled together
global term_name_listbox
global tf_listbox
term_name_listbox.yview(*args)
tf_listbox.yview(*args)
def main_dictionary_mutual_mouse_wheel(event):
# a function that will make the two term & tf lists scroll together by the mouse
global term_name_listbox
global tf_listbox
term_name_listbox.yview("scroll", event.delta, "units")
tf_listbox.yview("scroll", event.delta, "units")
return "break"
def show_cache():
# show the cache
global cache_name_listbox
global cache_info_listbox
try:
if (globs.cache is None) or (not bool(globs.cache)):
tkMessageBox.showinfo('ERROR', 'currently, no cache is loaded')
return
cache_gui = Toplevel()
cache_gui.title("Cache")
scrollbar = Scrollbar(cache_gui)
scrollbar.pack(side=RIGHT, fill=Y)
cache_name_listbox = Listbox(cache_gui, yscrollcommand=scrollbar.set)
cache_name_listbox.insert(END, 'Term Name')
cache_info_listbox = Listbox(cache_gui, yscrollcommand=scrollbar.set)
cache_info_listbox.insert(END, 'tf|is_header|first_location|docno')
for term_name, term_cache_item in sorted(globs.cache.iteritems()):
cache_name_listbox.insert(END, str(term_name))
cache_info_listbox.insert(END, str(term_cache_item.doc_list[0:30]) + '...')
cache_name_listbox.pack(side=LEFT, fill=BOTH, expand=True)
cache_name_listbox.bind("<MouseWheel>", cache_mutual_mouse_wheel)
cache_info_listbox.pack(side=LEFT, fill=BOTH, expand=True)
cache_info_listbox.bind("<MouseWheel>", cache_mutual_mouse_wheel)
scrollbar.config(command=cache_mutual_scroll)
except Exception as err:
tkMessageBox.showinfo('ERROR', err)
traceback.print_exc(file=stdout)
def cache_mutual_scroll(*args):
# force both list boxes to be scrolled together
global cache_name_listbox
global cache_info_listbox
cache_name_listbox.yview(*args)
cache_info_listbox.yview(*args)
def cache_mutual_mouse_wheel(event):
# a function that will make the two term & cache_info lists scroll together
global cache_name_listbox
global cache_info_listbox
cache_name_listbox.yview("scroll", event.delta, "units")
cache_info_listbox.yview("scroll", event.delta, "units")
return "break"
def save_dict_cache():
# save the cache and terms dictionary to disk
global indexer
global dict_cache_path
try:
if not check_dictionaries_loaded():
return
dict_cache_path.set(tkFileDialog.askdirectory())
if (dict_cache_path.get() is None) or (dict_cache_path.get() == ''):
return
write_cache_dict_to_disk()
write_main_dict_to_disk()
write_documents_dict_to_disk()
tkMessageBox.showinfo('NICE', 'write finished successfully')
except Exception as err:
tkMessageBox.showinfo('ERROR', err)
traceback.print_exc(file=stdout)
def check_dictionaries_loaded():
if (globs.cache is None) or (globs.main_dictionary is None) or (globs.documents_dict is None)\
or (not bool(globs.cache)) or (not bool(globs.main_dictionary)) or (not bool(globs.documents_dict)):
tkMessageBox.showinfo('ERROR', 'cache is None - {}.\nmain dictionary is None - {}'
'.\ndocuments_dict is None - {}'
.format(globs.cache is None,
globs.main_dictionary is None, globs.documents_dict is None))
return False
return True
def write_documents_dict_to_disk():
global dict_cache_path
global is_stemming
dict_path = os.path.join(dict_cache_path.get(),
STEMMING_PREFIX if is_stemming.get() is True else '') + DOCUMENTS_PREFIX
with open(dict_path, 'wb') as dict_file:
Pickle.dump(globs.documents_dict, dict_file)
Pickle.dump(len(globs.documents_dict), dict_file)
Pickle.dump(globs.average_doc_size, dict_file)
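# The three pickle dumps above (documents dict, document count, average document
# size) must stay in this order - load_dict_cache reads them back with three
# matching Pickle.load calls.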
def write_main_dict_to_disk():
# write main terms dictionary to disk
global dict_cache_path
global is_stemming
dict_path = os.path.join(dict_cache_path.get(),
STEMMING_PREFIX if is_stemming.get() is True else '') + DICTIONARY_PREFIX
with open(dict_path, 'w') as dict_file:
for key, val in globs.main_dictionary.iteritems():
dict_file.write('{}:{}\n'.format(key, val))
def write_cache_dict_to_disk():
# write the cache dictionary to disk
global dict_cache_path
global is_stemming
dict_path = os.path.join(dict_cache_path.get(),
STEMMING_PREFIX if is_stemming.get() is True else '') + CACHE_PREFIX
with open(dict_path, 'w') as dict_file:
for key, val in globs.cache.iteritems():
dict_file.write('{}:{}\n'.format(key, val))
def load_dict_cache():
global dict_cache_path
try:
temp_path = tkFileDialog.askdirectory()
if (temp_path is None) or (temp_path == ''):
return
dict_cache_path.set(temp_path)
if (globs.main_dictionary is not None) and (bool(globs.main_dictionary)):
globs.main_dictionary.clear()
if (globs.cache is not None) and (bool(globs.cache)):
globs.cache.clear()
if (globs.documents_dict is not None) and (bool(globs.documents_dict)):
globs.documents_dict.clear()
        # the 'utils' path is hardcoded due to a clarification stated in the course forum
print ('START LOAD STOP WORDS - ' + time.strftime("%H:%M:%S"))
globs.stop_words = load_stop_words('utils')
cache_path = os.path.join(dict_cache_path.get(),
STEMMING_PREFIX if is_stemming.get() is True else '') + CACHE_PREFIX
print ('START LOAD CACHE - ' + time.strftime("%H:%M:%S"))
globs.cache = CacheTerm.create_cache_from_disk(cache_path)
dict_path = os.path.join(dict_cache_path.get(),
STEMMING_PREFIX if is_stemming.get() is True else '') + DICTIONARY_PREFIX
print ('START LOAD MAIN DICTIONARY - ' + time.strftime("%H:%M:%S"))
globs.main_dictionary = TermDict.create_main_dictionary_from_disk(dict_path, globs.cache.keys())
documents_dict_path = os.path.join(dict_cache_path.get(),
STEMMING_PREFIX if is_stemming.get() is True else '') + DOCUMENTS_PREFIX
print ('START LOAD DOCUMENTS DICTIONARY - ' + time.strftime("%H:%M:%S"))
with open(documents_dict_path, 'rb') as fp:
globs.documents_dict = Pickle.load(fp)
globs.num_of_documents = Pickle.load(fp)
globs.average_doc_size = Pickle.load(fp)
print ('FINISH LOADING - ' + time.strftime("%H:%M:%S"))
tkMessageBox.showinfo('OK', 'cache & dictionary loaded')
except Exception as err:
tkMessageBox.showinfo('ERROR', err)
traceback.print_exc(file=stdout)
def load_stop_words(path):
# input a path and create a stop words set
path = os.path.join(path, STOP_WORDS_FILE_NAME)
stop_words_temp = set()
with open(path, 'r') as stop_words_file:
for word in stop_words_file:
stop_words_temp.add(''.join(char for char in word if char.isalpha()))
return stop_words_temp
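# Every non-alphabetic character is stripped from each line above, so the
# stop-words file may contain one word per line with stray whitespace or
# punctuation around it.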
def run_manual_query():
# parse a manual query - could be a docno or a query
# TODO verify that main dictionary is loaded
global is_docno
global manual_query_path
global docs_path
global is_expansion
query = manual_query_path.get().strip()
if not bool(query):
tkMessageBox.showinfo('ERROR', 'the manual query field is empty')
return
if not check_dictionaries_loaded():
return
if is_docno.get() and is_expansion.get():
tkMessageBox.showinfo('ERROR', 'both expansion and docno are ticked')
return
if is_docno.get():
run_docno(query)
return
if is_expansion.get():
run_expansion(query)
return
queries_gui([run_one_query(query)])
def run_one_query(query, num_of_docs=50, query_id=None, query_desc=None):
    # run one query; return its ranked documents and the time it took to evaluate
global is_stemming
global dict_cache_path
print ('{} - RUN query {} - '.format(time.strftime("%H:%M:%S"), query))
query_id = 999 if query_id is None else query_id
start_time = datetime.now()
searcher = Searcher(query, is_stemming.get(), dict_cache_path.get(), query_desc)
ranked_documents = searcher.search_query()
elapsed_time = (datetime.now() - start_time).seconds
return ranked_documents[:num_of_docs], query, elapsed_time, query_id
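# run_one_query returns a 4-tuple: (top ranked documents, raw query text, elapsed
# seconds, query id); queries_gui and save_results below unpack it in exactly that
# order.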
def queries_gui(queries_results):
    # create a GUI that displays the query results and allows saving them to a file
global query_guies
query_gui = Toplevel()
query_guies.append(query_gui)
query_gui.title("Query GUI")
scrollbar_docno = Scrollbar(query_gui)
scrollbar_docno.pack(side=RIGHT, fill=Y)
results_text = Text(query_gui, yscrollcommand=scrollbar_docno.set, height=20, width=140)
scrollbar_docno.config(command=results_text.yview)
for i, (ranked_documents, query, elapsed_time, query_id) in enumerate(queries_results):
results_text.insert(END,
'\n{}.Query - \"{}\". {} documents returned in {} seconds\nDocnos:'
.format(i+1, query, len(ranked_documents), elapsed_time))
for j, document in enumerate(ranked_documents):
if j % 10 == 0:
results_text.insert(END, '\n')
results_text.insert(END, '{} '.format(document.docno))
results_text.pack(side=LEFT, fill=BOTH, expand=True)
save_results_button = Button(query_gui, text="Save", command=lambda: save_results(queries_results), fg='blue')
save_results_button.pack()
def save_results(queries_results):
# save query results to a file
options = {'defaultextension': '.txt'}
file_name = tkFileDialog.asksaveasfilename(**options)
if file_name is None or file_name == "":
return
with open(file_name, 'w') as results_file:
for i, (ranked_documents, query, elapsed_time, query_id) in enumerate(queries_results):
for j, document in enumerate(ranked_documents):
written_row = " ".join([str(query_id), '0', str(document.docno), str(j), '42.38', 'mt', '\n'])
results_file.write(written_row)
# save the file name in case we want to reset
globs.results_files.append(file_name)
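# Each written row loosely follows the TREC run-file layout
# (query id, iteration, docno, rank, score, run tag); note that the score '42.38'
# and the tag 'mt' are written as fixed constants here rather than the actual
# retrieval score.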
def run_docno(docno):
    # given a docno, find the top 5 sentences in it
global is_stemming
global dict_cache_path
global query_guies
document = globs.documents_dict.get(docno.upper(), False)
if document is False:
tkMessageBox.showinfo('ERROR', 'no such docno - {}'.format(docno))
return
docno_parser = DocnoParser(docno, document.file_name, is_stemming.get(), document.max_tf)
if not (docno_parser.load_docno(dict_cache_path.get())):
        tkMessageBox.showinfo('ERROR', 'something went wrong while loading docno {}'.format(docno))
return
top_5_sentences = docno_parser.find_top_5_sentences()
docno_gui = Toplevel()
query_guies.append(docno_gui)
docno_gui.title("Top 5 Sentences in Docno {}".format(docno))
scrollbar_docno = Scrollbar(docno_gui)
scrollbar_docno.pack(side=RIGHT, fill=Y)
docno_text = Text(docno_gui, yscrollcommand=scrollbar_docno.set, height=20, width=100)
docno_text.insert(END, 'Sentences Ranking for docno - {}\n'.format(docno))
scrollbar_docno.config(command=docno_text.yview)
for i, sentence_with_rank in enumerate(sorted(top_5_sentences, key=lambda tup: tup[2])):
sentence = re.sub(' +', ' ', sentence_with_rank[0])
docno_text.insert(END, "{}. Score:{}\n {}\n".format(i+1, sentence_with_rank[1], sentence))
docno_text.pack(side=LEFT, fill=BOTH, expand=True)
def run_file_query():
    # given a file that contains TREC-format queries, evaluate each of them
filename = tkFileDialog.askopenfilename()
if filename is None or filename == '':
return
queries_results = []
with open(filename, 'r') as queries_file:
queries = re.findall(r'<top>(.*?)</top>', queries_file.read(), re.DOTALL)
for query_file in queries:
query_id, query, query_desc = extract_query_params_from_query(query_file)
queries_results.append(run_one_query(query=query, query_id=query_id, query_desc=query_desc))
queries_gui(queries_results)
def extract_query_params_from_query(query_file):
# given a query in the trec format, extract the query id, query title and query description
num_line = re.findall(r'<num>(.*?)\n', query_file)[0]
query_id = re.findall('\d+', num_line)[0]
query = re.findall(r'<title>(.*?)\n', query_file)[0].strip()
desc = re.findall(r'<desc>(.*?)<narr>', query_file, re.DOTALL)[0].strip()
# remove meaningless words from the desc
irrelevant_words = {'description', 'find', 'documents', 'discuss', 'called', 'identify'}
desc = ' '.join([word for word in re.split('\n|\s', desc)
if word.lower().strip().translate(None, string.punctuation) not in irrelevant_words])
return query_id, query, desc
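# Illustrative sketch (not taken from the project data) of the TREC topic layout
# the regexes above assume; the tag names matter, the content is only an example:
#
#   <top>
#   <num> Number: 351
#   <title> some short query title
#   <desc> Description:
#   a longer natural-language description of the information need
#   <narr> Narrative:
#   ...
#   </top>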
def run_expansion(query):
    # look the word up on Wikipedia, pick up to 4 random categories from it and run each as a query
if len(query.split()) > 1:
tkMessageBox.showinfo('ERROR', 'more than one word')
return
try:
wiki_page = wikipedia.page(query)
not_start_with = ('cs1', 'articles', 'all', 'wikipedia', 'use', 'webarchive', 'coordinates', 'disambiguation',
'pages using', 'spoken articles', 'citation overkill')
wiki_categories = [category for category in wiki_page.categories
if not category.lower().startswith(not_start_with)]
if len(wiki_categories) > 4:
wiki_categories = random.choice(wiki_categories, size=4, replace=False).tolist()
queries_results = []
num_of_docs = 70/len(wiki_categories)
for i, category in enumerate(wiki_categories):
queries_results.append(run_one_query(query=category, num_of_docs=num_of_docs, query_id=i+100))
queries_gui(queries_results)
except wikipedia.WikipediaException as err:
        print(err)
tkMessageBox.showinfo('Wiki Error',
'there was a wikipedia error - {}.\n loading from regular database'.format(err))
queries_gui([run_one_query(query)])
        # fallback path: no wiki value was found, so the query ran against the regular index
def reset_part_two():
    # reset cache, documents dict & main dictionary; close GUIs and delete results files
global query_guies
if not check_dictionaries_loaded():
return
result = tkMessageBox.askquestion("Reset Part 2",
"Are you sure you want to reset?\n", icon='warning')
if result != 'yes':
return
if (globs.main_dictionary is not None) and (bool(globs.main_dictionary)):
globs.main_dictionary.clear()
if (globs.cache is not None) and (bool(globs.cache)):
globs.cache.clear()
if (globs.documents_dict is not None) and (bool(globs.documents_dict)):
globs.documents_dict.clear()
for one_gui in query_guies:
one_gui.destroy()
query_guies = []
for results_file in globs.results_files:
os.unlink(results_file)
globs.results_files = []
    tkMessageBox.showinfo('OK', 'dict & cache were reset, results files were deleted')
if __name__ == '__main__':
globs.init_globs()
gui()
# TODO - in report & labs, download nltk punkt like this - import nltk, nltk.download('punkt')
# TODO - install wikipedia open source, numpy random
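# One possible way to satisfy the TODOs above (assuming pip is available in this
# Python 2 environment):
#   pip install wikipedia numpy nltk
#   python -c "import nltk; nltk.download('punkt')"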