def get_document(self, doc_id):
    """Fetch a single document by its id through a fresh DbManager instance.

    Returns whatever DbManager.get_document yields (None when not found,
    judging by the callers' checks).
    """
    return DbManager().get_document(doc_id)
def add_snippets(self, ranked_docs, query):
    """Attach the two most query-relevant sentences of each ranked document.

    For every (doc_id, score) pair in ``ranked_docs``, scores each sentence of
    the document by cosine similarity against ``query`` using TF-IDF term
    weights, and returns a list of dicts::

        {"docId": ..., "score": ..., "title": ..., "snippets": [up to 2 str]}

    Documents missing from the DB are skipped.  The first sentence of a
    document is treated as its title and excluded from snippet scoring.
    """
    dbManager = DbManager()
    builder = StructureBuilder()
    docs_with_snippets = []

    # --- Pre-compute the TF-IDF weight of each query term, treating the
    # query itself as a small document. ---
    tf_idf_q_terms = {}
    q_terms = builder.get_stemmed_tems(query)
    # Loop-invariant: max term frequency within the query (hoisted out of loop).
    max_q_freq = self.get_local_max_freq(q_terms)
    for q_term in q_terms:
        # Number of documents in the collection in which q_term appears
        # at least once; 0 when the term is absent from the index.
        n_docs_q_term = len(self.q_terms_freqs.get(q_term, ()))
        if n_docs_q_term != 0:
            freq_q = q_terms.count(q_term)
            tf_idf_q_terms[q_term] = self.calc_tf_idf(
                freq_q, max_q_freq, self.docs_count, n_docs_q_term)
        else:
            tf_idf_q_terms[q_term] = 0

    for ranked_doc in ranked_docs:
        doc_id = ranked_doc[0]
        doc = dbManager.get_document(doc_id)
        if doc is None:
            continue
        sentences = self.get_doc_sentences(doc)
        # BUG FIX: pop(0) on an empty list raised IndexError for docs
        # with no sentences at all.
        if not sentences:
            continue
        title = sentences.pop(0)['content']

        sentence_scores = {}
        for sentence in sentences:
            sentence_content = sentence['content']
            # A "sentence" of 2 characters or fewer is probably noise.
            if len(sentence_content) <= 2:
                continue
            sentence_id = sentence['id']
            tf_idf_sum = 0
            denom_di_sum = 0
            denom_qi_sum = 0
            index_sentence = builder.get_stemmed_terms_frequencies_from_doc(
                sentence)
            for q_term in q_terms:
                # Only terms that actually occur in this sentence contribute.
                if q_term not in index_sentence.Terms:
                    continue
                q_sentence_freq = index_sentence.get_term_freq(q_term)
                max_freq = index_sentence.get_max_freq()
                # No frequency info at all -> this term cannot contribute.
                if q_sentence_freq == 0 and max_freq == 0:
                    continue
                # BUG FIX: original used self.q_terms_freqs[q_term] here,
                # raising KeyError for a query term absent from the index
                # (the outer loop guarded this lookup; this one did not).
                n_docs_q_term = len(self.q_terms_freqs.get(q_term, ()))
                if n_docs_q_term == 0:
                    # tf_idf_q_terms[q_term] is 0 here, so the product
                    # would be 0 anyway; skip safely.
                    continue
                tf_idf_doc = self.calc_tf_idf(
                    q_sentence_freq, max_freq, self.docs_count,
                    n_docs_q_term)
                tf_idf_q = tf_idf_q_terms[q_term]
                # Accumulate cosine-similarity components (TF-IDF weighting).
                tf_idf_sum += tf_idf_doc * tf_idf_q
                denom_di_sum += tf_idf_doc ** 2
                denom_qi_sum += tf_idf_q ** 2
            denom = math.sqrt(denom_di_sum) * math.sqrt(denom_qi_sum)
            score = tf_idf_sum / denom if denom != 0 else 0
            sentence_scores[sentence_id] = round(score, 3)

        # The (up to) two sentences with the highest cosine similarity.
        # BUG FIX: original indexed top_sentences[1] unconditionally and
        # crashed with IndexError when fewer than 2 sentences were scored.
        top_sentences = sorted(sentence_scores.items(),
                               key=operator.itemgetter(1),
                               reverse=True)[:2]
        top_ids = {sid for sid, _ in top_sentences}
        # Iterate `sentences` so snippets keep document order, as before.
        top_snippets = [s['content'] for s in sentences
                        if s['id'] in top_ids]
        docs_with_snippets.append({
            "docId": doc_id,
            "score": ranked_doc[1],
            "title": title,
            "snippets": top_snippets,
        })
    return docs_with_snippets