Beispiel #1
0
 def get_n_idf_ranked_list(self, doc_ids):
     """Rank the terms that occur in ``doc_ids`` by ``n * idf``.

     For every term in ``self.posting_list``, ``n`` is the number of entries
     of ``doc_ids`` whose id appears in that term's postings (an id repeated
     in ``doc_ids`` is counted each time). Terms with no matching document
     are omitted.

     Args:
         doc_ids: Iterable of hashable document ids. Each posting is indexed
             as ``posting[0]`` to obtain the id compared against these.

     Returns:
         A list of ``[score, term_id]`` pairs sorted in descending order
         (equal scores fall back to comparing ``term_id``), where ``score``
         is ``n * qp.calculate_idf(self.get_df_by_term_id(term_id),
         self.get_collection_size())``.
     """
     # Materialize once so a one-shot iterable is not exhausted after the
     # first term.
     doc_ids = list(doc_ids)

     # scan posting list to get all relevant terms
     rel_terms = []  # (term_id, n) in posting-list iteration order
     for term_id in self.posting_list:
         # Build the membership set once per term rather than once per
         # (term, doc_id) pair.
         posting_doc_ids = {doc[0] for doc in self.posting_list[term_id]}
         hits = sum(1 for doc_id in doc_ids if doc_id in posting_doc_ids)
         if hits:
             rel_terms.append((term_id, hits))

     n_idf_ranking = []
     for term_id, hits in rel_terms:
         idf = qp.calculate_idf(self.get_df_by_term_id(term_id), self.get_collection_size())
         n_idf_ranking.append([hits * idf, term_id])
     n_idf_ranking.sort(reverse=True)
     return n_idf_ranking
Beispiel #2
0
 def get_f_idf_ranked_list(self, doc_ids):
     """Rank the terms that occur in ``doc_ids`` by ``f * idf``.

     For every term in ``self.posting_list``, ``f`` is the sum of
     ``posting[1]`` over the entries of ``doc_ids`` whose id appears as
     ``posting[0]`` in that term's postings (an id repeated in ``doc_ids``
     contributes each time; if an id occurs in several postings of one term,
     the first posting is used). Terms with no matching document are
     omitted; a term that matches is kept even if its summed value is 0.

     Args:
         doc_ids: Iterable of hashable document ids.

     Returns:
         A list of ``[score, term_id]`` pairs sorted in descending order
         (equal scores fall back to comparing ``term_id``), where ``score``
         is ``f * qp.calculate_idf(self.get_df_by_term_id(term_id),
         self.get_collection_size())``.
     """
     # Materialize once so a one-shot iterable is not exhausted after the
     # first term.
     doc_ids = list(doc_ids)

     # scan posting list to get all relevant terms
     rel_terms = []  # (term_id, f) in posting-list iteration order
     for term_id in self.posting_list:
         # Map doc id -> posting[1] once per term; setdefault keeps the first
         # posting for a duplicated doc id, matching a left-to-right search.
         value_by_doc = {}
         for doc in self.posting_list[term_id]:
             value_by_doc.setdefault(doc[0], doc[1])
         matched = [value_by_doc[doc_id] for doc_id in doc_ids if doc_id in value_by_doc]
         # Test for "any match", not for a non-zero sum, so zero-valued
         # matches still yield an entry.
         if matched:
             rel_terms.append((term_id, sum(matched)))

     f_idf_ranking = []
     for term_id, freq in rel_terms:
         idf = qp.calculate_idf(self.get_df_by_term_id(term_id), self.get_collection_size())
         f_idf_ranking.append([freq * idf, term_id])
     f_idf_ranking.sort(reverse=True)
     return f_idf_ranking