Ejemplo n.º 1
0
 def get_scores(self, terms):
     """Compute a length-normalised relevancy score per matching file.

     For every term present in ``self.term_freqs``, each file's term
     frequency is passed through ``self.get_wf`` and the result is added
     to that file's running total. Each total is then divided by the
     file's entry in ``self.doc_length``.

     Arguments:
         terms (list): A list of str
     Returns:
         list: a list of (file_path_name, score) tuples, one for each file
               listed under at least one of the given terms
     """
     totals = HashTable()
     for word in terms:
         # Terms that never occur in the corpus contribute nothing.
         if word not in self.term_freqs:
             continue
         per_file = self.term_freqs[word]
         for path in per_file.keys():
             contribution = self.get_wf(per_file[path])
             if path in totals:
                 totals[path] += contribution
             else:
                 totals[path] = contribution

     scored = []
     for path in totals.keys():
         # Normalise in place, then report the stored (normalised) value.
         totals[path] /= self.doc_length[path]
         scored.append((path, totals[path]))
     return scored
Ejemplo n.º 2
0
 def search(self, query):
     """Search for the query items in files.

     The query is lower-cased and split on runs of whitespace, duplicate
     terms are collapsed, and the unique terms are scored with
     ``self.get_scores`` and then passed to ``self.rank``.

     Arguments:
         query (str): query input: e.g. "Computer Science"
     Returns:
          list: the value returned by ``self.rank``; per this method's
                contract, a list of tuples (file_path_name, score) sorted
                in descending order of relevancy, excluding files whose
                relevancy score is 0 (the ordering and filtering are
                performed by ``self.rank``, which is defined elsewhere)
     """
     unique_terms = HashTable()
     # split() with no separator splits on any run of whitespace and never
     # yields empty strings, so repeated spaces, tabs, or an empty query do
     # not produce a bogus '' term (which split(' ') would).
     for term in query.lower().split():
         # Only the keys are used below, so a per-term count is unnecessary.
         if term not in unique_terms:
             unique_terms[term] = 1
     return self.rank(self.get_scores(unique_terms.keys()))