def rank_cosine_sim(self, doc_ids, q_terms):
    """Rank documents by cosine similarity of query and document TF-IDF weights.

    For each document, every query term found in the index for that
    document contributes the product of its document-side and query-side
    TF-IDF weights (both computed with ``self.calc_tf_idf``) to the
    numerator; the squared weights are accumulated separately for the
    denominator.

    Args:
        doc_ids: Iterable of hashable document identifiers to score.
        q_terms: Sequence of query terms (iterated once per document).

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by descending score,
        with scores rounded to 3 decimals. Documents in which no query
        term is found, and all documents when the query is empty, score
        ``0.0``.
    """
    if not doc_ids:
        return []
    if not q_terms:
        # An empty query used to raise IndexError while looking up the
        # maximum term frequency; there is nothing to match, so every
        # candidate gets a zero score (duplicates collapse, as they do
        # in the score dictionary below).
        return [(doc_id, 0.0) for doc_id in dict.fromkeys(doc_ids)]

    db_manager = DbManager()

    # Frequency of each term inside the query itself.
    q_freqs = {}
    for q_term in q_terms:
        q_freqs[q_term] = q_freqs.get(q_term, 0) + 1
    max_q_freq = max(q_freqs.values())

    docs_relevant_scores = {}
    for doc_id in doc_ids:
        tf_idf_sum = 0
        denom_di_sum = 0
        denom_qi_sum = 0

        # The document's maximum term frequency does not depend on the
        # query term, so it is fetched at most once per document, and only
        # when at least one term is actually found for it.
        max_freq_doc = None
        max_freq_loaded = False

        # NOTE(review): a term repeated in the query is visited once per
        # occurrence, so its contribution is added repeatedly even though
        # q_freqs already encodes the repetition - confirm this is intended.
        for q_term in q_terms:
            q_doc_freq = self.get_q_doc_freq(q_term, doc_id)
            if q_doc_freq is None:
                continue  # not found on index

            if not max_freq_loaded:
                max_freq_doc = db_manager.get_max_freq_doc(doc_id)
                max_freq_loaded = True
                if max_freq_doc is not None:
                    self.max_freq_docs[doc_id] = max_freq_doc

            # number of documents in DC in which q_term appears at least once.
            n_docs_q_term = len(self.q_terms_freqs[q_term])

            tf_idf_doc = self.calc_tf_idf(
                q_doc_freq, max_freq_doc, self.docs_count, n_docs_q_term)
            tf_idf_q = self.calc_tf_idf(
                q_freqs[q_term], max_q_freq, self.docs_count, n_docs_q_term)

            tf_idf_sum += tf_idf_doc * tf_idf_q
            denom_di_sum += tf_idf_doc ** 2
            denom_qi_sum += tf_idf_q ** 2

        denom = math.sqrt(denom_di_sum) * math.sqrt(denom_qi_sum)
        # A zero denominator means no term matched (or every weight was 0);
        # dividing would raise ZeroDivisionError, so score the document 0.
        score = tf_idf_sum / denom if denom else 0.0
        docs_relevant_scores[doc_id] = round(score, 3)

    return sorted(docs_relevant_scores.items(),
                  key=operator.itemgetter(1), reverse=True)
def rank(self, doc_ids, q_terms):
    """Rank documents by the sum of the TF-IDF weights of the query terms.

    For each document, every query term found in the index for that
    document contributes ``self.calc_tf_idf(...)`` to the document's
    relevance score. The inputs passed to that calculation are:

    * TF side: the frequency of the term in the document and the
      frequency of the most frequently occurring term of that document
      (constant per document).
    * IDF side: the number of documents in the collection
      (``self.docs_count``, constant) and the number of documents in
      which the term appears at least once.

    Args:
        doc_ids: Iterable of hashable document identifiers to score.
        q_terms: Sequence of query terms (iterated once per document).

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by descending score,
        with scores rounded to 3 decimals. Documents in which no query
        term is found score 0.
    """
    if not doc_ids:
        # Nothing to rank; skip creating the database manager altogether.
        return []

    db_manager = DbManager()
    docs_relevant_scores = {}

    for doc_id in doc_ids:
        tf_idf_sum = 0

        # The document's maximum term frequency does not depend on the
        # query term, so it is fetched at most once per document, and only
        # when at least one term is actually found for it.
        max_freq_doc = None
        max_freq_loaded = False

        for q_term in q_terms:
            q_doc_freq = self.get_q_doc_freq(q_term, doc_id)
            if q_doc_freq is None:
                continue  # not found on index

            if not max_freq_loaded:
                max_freq_doc = db_manager.get_max_freq_doc(doc_id)
                max_freq_loaded = True
                if max_freq_doc is not None:
                    self.max_freq_docs[doc_id] = max_freq_doc

            # number of documents in DC in which q_term appears at least once.
            n_docs_q_term = len(self.q_terms_freqs[q_term])

            tf_idf_sum += self.calc_tf_idf(
                q_doc_freq, max_freq_doc, self.docs_count, n_docs_q_term)

        docs_relevant_scores[doc_id] = round(tf_idf_sum, 3)

    return sorted(docs_relevant_scores.items(),
                  key=operator.itemgetter(1), reverse=True)