def search(self, query):
    """
    Search the corpus for the terms of a query and report the matching files.

    The query is parsed into words with parse_words(), duplicate words are
    dropped, the remaining terms are scored with get_scores() and the scored
    files are ordered with rank(). Every resulting file name is printed, one
    per line, and the same file names are returned to the caller.

    Args:
        query (str): raw query text entered by the user
    Returns:
        list: file names in the order produced by rank()
            (intended to be descending order of relevancy)
    """
    filtered_query = self.parse_words([query])

    # De-duplicate with the project's HashTable: each word is stored under
    # itself, so a repeated word simply overwrites its earlier entry.
    unique_words = HashTable()
    for word in filtered_query:
        unique_words.put(word, word)

    # NOTE(review): indexing the table appears to yield a (key, value) pair,
    # so element 0 is the word itself -- confirm against HashTable.
    parsed_query_terms = [unique_words[key][0] for key in unique_words.keys()]

    # Score every file for the unique terms, then order the scored files.
    results = self.rank(self.get_scores(parsed_query_terms))

    # Element 0 of each ranked entry is the file name.
    ranked_files = [entry[0] for entry in results]
    for filename in ranked_files:
        print(filename)

    # Return what was printed so callers can use the result programmatically,
    # as the documented contract promises.
    return ranked_files
def get_scores(self, terms):
    """
    Build the relevancy score of every file that matches at least one term.

    For each term, the weighted frequency (get_wf() applied to the stored
    term frequency) is summed per file. Each sum is then divided by that
    file's entry in self.doc_length:
        score = (summed weighted frequency / total word count in file)

    Terms that are not present in self.term_freqs are skipped, so a query
    containing a word that never occurs in the corpus still scores the
    remaining terms instead of failing on the lookup.

    Args:
        terms (list): query terms as strings (already parsed from the
            user's query)
    Returns:
        list: the score table's entries, one per matching file -- each is
            expected to be a (filename, relevancy score) tuple. Files whose
            weighted frequency is 0 for every term are left out.
    """
    # Maps filename -> running sum of weighted frequencies.
    score_table = HashTable()

    for query_term in terms:
        # A term the corpus never saw contributes nothing; skip it rather
        # than index a missing key.
        # NOTE(review): assumes self.term_freqs is a HashTable exposing
        # contains(), like score_table -- confirm.
        if not self.term_freqs.contains(query_term):
            continue

        # Element 1 of the entry is the per-file frequency table of the term.
        query_term_table = self.term_freqs[query_term][1]

        for filename in query_term_table.keys():
            weighted_frequency = self.get_wf(query_term_table[filename][1])
            if weighted_frequency == 0:
                continue
            # Accumulate on top of what earlier terms contributed to this file.
            if score_table.contains(filename):
                weighted_frequency += score_table[filename][1]
            score_table.put(filename, weighted_frequency)

    # Normalize each summed frequency by the file's length and collect the
    # updated table entries.
    score_list = []
    for filename in score_table.keys():
        score_table[filename] = (
            score_table[filename][1] / self.doc_length[filename][1]
        )
        score_list.append(score_table[filename])
    return score_list