/
query.py
162 lines (123 loc) · 4.85 KB
/
query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# TODO: an optimization - use "skip pointers" in the index searched
from Inf141_tokenizer import PartA as tk
from stopwatch.stopwatch import Stopwatch
from lexicon import readLexicon
import serializer as sz
from posting import Posting
import os
import sys
import indexer as ind
import lexicon as lex
# TODO: multithreading to search indices in different lexical ranges at the same time?
# Returns a list of postings where all terms appeared
# The score of each positing is the sum of the scores of the postings for each term
def searchQuery(query, index, lexicon):
query = tk.tokenize(query)
indexItems = []
for token in query:
# (token, doc_freq, posting_list)
indexItems.append(searchToken(token, index, lexicon))
# AND the postings lists
intersectIndexItems(indexItems)
# Compute the tf-idf
return sorted(indexItems[0][2], key=lambda t: t.score, reverse=True)
def searchToken(token, index, lexicon):
# Check if the token searched for is in the lexicon
if token in lexicon:
# Seek to the place in the file that the lexicon specifies
index.seek(lexicon[token])
# Read the line and split it on spaces
line = index.readline()
return sz.deserializeIndexItem(line)
else:
return token, 0, []
def intersectIndexItems(indexItems):
if len(indexItems) > 1:
# Intersect the first two posting lists
intersectTwoPostingsLists([indexItems[0][2]], [indexItems[1][2]])
# If there are three or more posting lists, intersect those as well
if len(indexItems) > 2:
prevItems = [indexItems[0][2], indexItems[1][2]]
for item in indexItems[2:]:
intersectTwoPostingsLists(prevItems, [item[2]])
prevItems.append(item[2])
def intersectTwoPostingsLists(list1, list2):
index = 0
# Make sure list1 has the smaller list
if len(list1[0]) > len(list2[0]):
temp = list1
list1 = list2
list2 = temp
while index < len(list1[0]):
# Loop through the second list until a posting is found that is greater than or equal to the current post
while index < len(list2[0]) and list2[0][index].document < list1[0][index].document:
for li in list2:
li.pop(index)
# Check to make sure there are enough elements in the second list
if index < len(list2[0]):
# If the documents are equal, advance the index
if list1[0][index].document == list2[0][index].document:
index += 1
# If the documents are unequal, remove those items
else:
for li in list1:
li.pop(index)
for li in list2:
li.pop(index)
# For each list in the second list, slice off the excess
if index < len(list2[0]):
for i in range(len(list2)):
del list2[i][index:]
def resultString(query, results, t):
string = f"Results for query: '{query}'\n"
string += f"Completed in {t} secs\n"
string += f"Total results: {len(results)}\n"
size = min(5, len(results))
if size > 0:
string += f"Top results:\n"
for i in range(size):
string += f"{results[i].document}\n"
else:
string += "No results found\n"
return string
def checkBuildData():
print("Checking if index and lexicon data exist...")
if not os.path.exists("index.txt"):
print("The index has not been built yet! Building index and lexicon...")
ind.main()
lex.main()
if not os.path.exists("lexicon.txt"):
print("The lexicon has not been built yet! Building the lexicon...")
lex.main()
if __name__ == "__main__":
print("Welcome to our searching engine!")
checkBuildData()
userInput = "not x"
fout = open("searchReport.txt", "w")
indexFile = open("index.txt", "r")
watch = Stopwatch()
print("Getting lexicon for faster searching...")
watch.restart()
theLex = readLexicon("lexicon.txt")
print(f"Finished reading lexicon in {watch.read()} secs")
currentArg = 1
while userInput != "x":
if currentArg < len(sys.argv):
print("Retrieving query from command line")
userInput = sys.argv[currentArg]
currentArg += 1
else:
userInput = input("Enter the terms to search for (type 'x' to exit): ")
if userInput != "x":
print(f"Searching query '{userInput}'...")
watch.restart()
output = searchQuery(userInput, indexFile, theLex)
output = resultString(userInput, output, watch.read())
print()
print(output)
print("Results output to file: searchReport.txt")
print()
fout.write(f"{output}\n")
else:
print("Thanks for using our search application!")
fout.close()