-
Notifications
You must be signed in to change notification settings - Fork 0
/
search_engine.py
110 lines (95 loc) · 4.66 KB
/
search_engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import time
import sys
from reader import ReadFile
from configuration import ConfigClass
from parser_module import Parse
from indexer import Indexer
from searcher import Searcher
import utils
from glove import GloveStrategy
import glob
def run_engine(corpus_path="testData", output_path="posting", stemming=True, glove_dict=None):
    """
    Build the inverted index over the corpus.

    Every ``*.snappy.parquet`` file located exactly one directory level below the
    corpus root is read; each document in it is parsed and handed to the indexer.
    Once all files were processed the indexer is told to finish indexing.

    :param corpus_path: root folder containing the raw tweet files
    :param output_path: handed to the configuration as the output location
        (for the inverted index, posting files and tweets dictionary)
    :param stemming: handed to the parser; if True use stemmer on terms
    :param glove_dict: Glove file including all word vectors, forwarded unchanged
        to the indexer together with every parsed document
    """
    import os  # local import so this block does not depend on a file-level change

    config = ConfigClass(corpus_path, number_of_term_buckets=26, number_of_entities_buckets=2, output_path=output_path)
    corpus_root = config.get_corpusPath()
    r = ReadFile(corpus_path=corpus_root)
    p = Parse(stemming)
    indexer = Indexer(config)

    # Build the pattern with the platform separator instead of a hard-coded "\\",
    # so the corpus is also discovered on non-Windows systems.
    all_files_paths = glob.glob(os.path.join(corpus_root, "*", "*.snappy.parquet"))

    # The reader receives names relative to the corpus root. relpath removes the
    # whole root prefix, even when the root itself has several path components.
    all_files_names = [os.path.relpath(file_path, corpus_root) for file_path in all_files_paths]

    for file_name in all_files_names:
        # Parse and index every document of the current file.
        for document in r.read_file(file_name=file_name):
            parsed_document = p.parse_doc(document)
            indexer.add_new_doc(parsed_document, glove_dict)

    indexer.finish_indexing()
def load_index():
    """
    Load the stored inverted index.
    :return: whatever utils.load_obj returns for the name "inverted_index"
    """
    return utils.load_obj("inverted_index")
def load_tweet_dict():
    """
    Load the tweet dictionary and replace every stored vector address with the vector itself.

    The dictionary is loaded under the name "docDictionary". Its "metadata" entry
    states how many "avgVector<i>" bucket objects exist; all of them are loaded.
    For every other entry, position 5 holds a two-part address (bucket number,
    key inside that bucket) which is overwritten with the value found there.
    :return tweet_Dictionary including the Glove vector data
    """
    tweet_dict = utils.load_obj("docDictionary")
    bucket_count = tweet_dict["metadata"]["tweet_vector_buckets"]
    buckets = [utils.load_obj("avgVector" + str(bucket_no)) for bucket_no in range(bucket_count)]

    for tweet_id, tweet_entry in tweet_dict.items():
        # The metadata entry describes the buckets and is not a tweet.
        if tweet_id != "metadata":
            address = tweet_entry[5]
            tweet_entry[5] = buckets[address[0]][address[1]]

    return tweet_dict
def search_and_rank_query(query, inverted_index, k, glove_dict="", tweet_dict=None):
    """
    Return the top k tweets for a query.

    The query text is parsed into terms and entities, both are looked up together
    through the searcher, and the searcher's ranker orders the matches and cuts
    the list down to k.
    :param query: query to search
    :param inverted_index: inverted index file including all of the terms in the dictionary
    :param k: how many results we'd like to return
    :param glove_dict: the official Glove file including all word vectors
    :param tweet_dict: extra information about each tweet that exists in the engine
    :return: top k results
    """
    terms, entities = Parse().parse_text(query)
    searcher = Searcher(inverted_index, tweet_dict)

    # Terms and entities are searched as one combined query.
    candidates = searcher.relevant_docs_from_posting(terms + entities, glove_dict=glove_dict)
    ranked = searcher.ranker.rank_relevant_doc(candidates)
    return searcher.ranker.retrieve_top_k(ranked, k)
def main(corpus_path="testData", output_path="posting", stemming=True, queries='', num_docs_to_retrieve=10,
         index_corpus=True, query_engine=False):
    """
    Load the Glove vectors, optionally build the index and optionally run an interactive query loop.

    :param corpus_path: forwarded to run_engine
    :param output_path: forwarded to run_engine
    :param stemming: forwarded to run_engine
    :param queries: not used by this function
    :param num_docs_to_retrieve: number of results used when the answer to the
        "number of docs" prompt is not a valid integer
    :param index_corpus: if True, run_engine is called to build the index
    :param query_engine: if True, the stored index and tweet dictionary are loaded
        and the user is prompted for queries until answering anything but 'y'
    """
    # *LOCAL RUN* path. For the *SUBMISSION SYS RUN* the file is expected at
    # "../../../../glove.twitter.27B.25d.txt" instead.
    glove_dict = GloveStrategy("glove.twitter.27B.25d.txt").embeddings_dict

    if index_corpus:
        run_engine(corpus_path=corpus_path, output_path=output_path, stemming=stemming, glove_dict=glove_dict)

    if not query_engine:
        return

    tweet_dict = load_tweet_dict()
    inverted_index = load_index()
    while True:
        query = input("Please enter a query: ")
        try:
            k = int(input("Please enter number of docs to retrieve: "))
        except ValueError:
            # A non-numeric answer must not abort the whole interactive session.
            k = num_docs_to_retrieve
            print('Invalid number, retrieving {} docs instead'.format(k))

        results = search_and_rank_query(query, inverted_index, k, glove_dict, tweet_dict)
        for position, doc_tuple in enumerate(results, start=1):
            print('{}. tweet id: {}'.format(position, doc_tuple[0]))

        if input("Run another query? Y/N: ").lower() != 'y':
            break