Beispiel #1
0
 def get_document(self, doc_id):
     """Return whatever ``DbManager.get_document`` yields for ``doc_id``.

     A new ``DbManager`` is created on every call.
     """
     return DbManager().get_document(doc_id)
Beispiel #2
0
    def add_snippets(self, ranked_docs, query):
        """Attach a title and up to two best-matching snippets to each ranked document.

        The first sentence of a document is used as its title. Every other
        sentence longer than two characters is scored by the cosine
        similarity between its TF-IDF weights and the query's TF-IDF
        weights, computed over the query terms that occur in the sentence.
        The (at most) two highest-scoring sentences are returned as
        snippets, in the order they appear in the document.

        Args:
            ranked_docs: Iterable of indexable items where index 0 is the
                document id and index 1 is the ranking score.
            query: Query passed to ``StructureBuilder.get_stemmed_tems`` to
                obtain the query terms.

        Returns:
            A list of dicts with the keys ``"docId"``, ``"score"``,
            ``"title"`` and ``"snippets"``. Documents that cannot be loaded
            or that contain no sentences are skipped.
        """
        dbManager = DbManager()
        builder = StructureBuilder()
        docs_with_snippets = []
        q_terms = builder.get_stemmed_tems(query)

        # Frequency of each distinct term inside the query itself.
        q_term_counts = {}
        for q_term in q_terms:
            q_term_counts[q_term] = q_term_counts.get(q_term, 0) + 1

        # TF-IDF weight of each distinct query term. The query's max term
        # frequency is the same for every term, so it is fetched at most
        # once and only if some term actually occurs in the collection.
        tf_idf_q_terms = {}
        max_q_freq = None
        for q_term, freq_d in q_term_counts.items():
            # Number of documents in which q_term appears at least once.
            n_docs_q_term = (len(self.q_terms_freqs[q_term])
                             if q_term in self.q_terms_freqs else 0)
            if n_docs_q_term == 0:
                tf_idf_q_terms[q_term] = 0
                continue
            if max_q_freq is None:
                max_q_freq = self.get_local_max_freq(q_terms)
            tf_idf_q_terms[q_term] = self.calc_tf_idf(
                freq_d, max_q_freq, self.docs_count, n_docs_q_term)

        for ranked_doc in ranked_docs:
            doc_id = ranked_doc[0]
            doc = dbManager.get_document(doc_id)
            if doc is None:
                continue
            sentences = self.get_doc_sentences(doc)
            # A document without sentences has neither a title nor
            # snippets; skip it like a missing document instead of crashing.
            if not sentences:
                continue
            title = sentences.pop(0)['content']

            docs_relevant_scores = {}
            for sentence in sentences:
                sentence_content = sentence['content']
                # A sentence of two characters or fewer is probably not an
                # actual sentence.
                if len(sentence_content) <= 2:
                    continue
                sentence_id = sentence['id']
                tf_idf_sum = 0
                denom_di_sum = 0
                denom_qi_sum = 0
                index_sentence = builder.get_stemmed_terms_frequencies_from_doc(
                    sentence)
                for q_term in q_terms:
                    if q_term not in index_sentence.Terms:
                        continue
                    # Without a document frequency no sentence weight can be
                    # computed for this term (its query weight is 0 anyway).
                    if q_term not in self.q_terms_freqs:
                        continue
                    q_sentence_freq = index_sentence.get_term_freq(q_term)
                    max_freq = index_sentence.get_max_freq()
                    # Disregard the term when it has neither a frequency in
                    # the sentence nor a max frequency to normalise by.
                    if q_sentence_freq == 0 and max_freq == 0:
                        continue

                    tf_idf_doc = self.calc_tf_idf(
                        q_sentence_freq, max_freq, self.docs_count,
                        len(self.q_terms_freqs[q_term]))
                    tf_idf_q = tf_idf_q_terms[q_term]

                    # Accumulate the dot product and both squared norms for
                    # the cosine similarity.
                    tf_idf_sum += tf_idf_doc * tf_idf_q
                    denom_di_sum += tf_idf_doc**2
                    denom_qi_sum += tf_idf_q**2

                denom = math.sqrt(denom_di_sum) * math.sqrt(denom_qi_sum)
                score = tf_idf_sum / denom if denom != 0 else 0
                docs_relevant_scores[sentence_id] = round(score, 3)

            # Keep the two best sentences; there may be fewer than two when
            # the document is very short.
            top_sentences = sorted(docs_relevant_scores.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)[0:2]
            top_ids = {top_id for top_id, _ in top_sentences}
            top_snippets = [
                s['content'] for s in sentences if s['id'] in top_ids
            ]

            docs_with_snippets.append({
                "docId": doc_id,
                "score": ranked_doc[1],
                "title": title,
                "snippets": top_snippets
            })
        return docs_with_snippets