/
searcher_Thesaurus.py
142 lines (119 loc) · 5.16 KB
/
searcher_Thesaurus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from ranker import Ranker
from nltk.corpus import wordnet
import utils
from indexer import Indexer
from nltk.corpus import lin_thesaurus as thes
# DO NOT MODIFY CLASS NAME
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).

        :param query: free-text query string.
        :param k: number of top results to return; default (None) returns everything.
        :return: tuple (n_relevant, ranked_doc_ids) where ranked_doc_ids is
            ordered from the most relevant result to the least relevant one.
        """
        query_as_list = self._parser.parse_sentence(query)
        # Expand the query with Lin-thesaurus synonyms.
        q_thesaurus = self.do_thesaurus(query_as_list)
        # Normalize both original and expanded terms to whichever casing
        # actually exists in the inverted index.
        self.upper_lower_case(query_as_list, self._indexer)
        self.upper_lower_case(q_thesaurus, self._indexer)
        # Find relevant docs using the original terms plus the expansion terms.
        relevant_docs = self._relevant_docs_from_posting(query_as_list + q_thesaurus)
        n_relevant = len(relevant_docs)
        # Send all to ranking.
        ranked_doc_ids = Ranker.rank_relevant_docs(query_as_list, q_thesaurus, relevant_docs, self._indexer, k)
        return n_relevant, ranked_doc_ids

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        Loads the posting list of every query term and counts, per document,
        how many of the query terms appear in it.

        :param query_as_list: parsed query tokens.
        :return: dict mapping doc_id -> number of query terms found in that doc.
        """
        relevant_docs = {}
        # Go over every term in the query.
        for term in query_as_list:
            posting_list = self._indexer.get_term_posting_list(term)
            # Skip terms that do not exist in the corpus.
            if posting_list is None:
                continue
            # Each posting entry is a tuple whose first element is the doc id.
            for doc in posting_list:
                doc_id = doc[0]
                relevant_docs[doc_id] = relevant_docs.get(doc_id, 0) + 1
        return relevant_docs

    @staticmethod
    def do_thesaurus(query):
        """
        Expands a query with Lin-thesaurus synonyms.

        For every (lower-cased) query word, up to 4 synonyms whose similarity
        score exceeds 0.21 are collected; a multi-word synonym is split and
        only its individual parts are kept. Terms already present in the
        query are never added. Words unknown to the thesaurus are skipped.

        :param query: list of query tokens.
        :return: list of lower-cased expansion terms (duplicates removed).
        """
        # Lower every word in the query so membership tests are case-insensitive.
        lowered = [word.lower() for word in query]
        to_add = set()
        for word in lowered:
            added_for_word = 0
            try:
                # Similar expressions and their similarity scores.
                # Index [1][1] selects the noun ('simN.lsp') synonym list
                # — same selection the original lookup used.
                scored = dict(thes.scored_synonyms(word)[1][1])
            except (KeyError, IndexError):
                # Word is unknown to the thesaurus — best-effort skip.
                continue
            # Go over the scored synonyms (dict preserves insertion order).
            for key, score in scored.items():
                # No more than 4 expansion terms per query word; once the cap
                # is reached there is nothing left to do for this word.
                if added_for_word >= 4:
                    break
                # Keep only sufficiently-similar terms not already in the query.
                if score > 0.21 and key not in lowered:
                    added_for_word += 1
                    if ' ' in key:
                        # Multi-word synonym: add only its novel parts.
                        for part in key.split():
                            if part not in lowered:
                                to_add.add(part)
                    else:
                        to_add.add(key)
        # Return the expansion terms, lower-cased.
        return [term.lower() for term in to_add]

    @staticmethod
    def upper_lower_case(list_of_words, indexer):
        """
        Normalizes, in place, each word to the casing under which it appears
        in the inverted index (lower-case preferred, then upper-case).
        Words found in neither form are left unchanged.
        # @Todo what with words that are not in the inverted index

        :param list_of_words: list of tokens; mutated in place.
        :param indexer: indexer exposing an ``inverted_idx`` mapping.
        """
        for i, w in enumerate(list_of_words):
            if w.lower() in indexer.inverted_idx:
                list_of_words[i] = w.lower()
            elif w.upper() in indexer.inverted_idx:
                list_of_words[i] = w.upper()