def __get_doc_scores(self, posting_list, query: Query):
     """Score every document of ``posting_list`` against ``query``.

     The query is turned into a tf-idf vector (``query.get_tf`` times
     ``self.collection.get_idf``).  Each document receives the dot product
     of that vector with its per-term weights, divided by the product of
     the document norm (``self.collection.documents_norms``) and the query
     vector norm.

     The per-term document weight depends on ``self.weighting_model``:
     ``"tw-idf"`` uses ``get_tw_idf``, ``"tf-idf"`` uses ``get_piv_plus``
     and any other value falls back to ``get_bm25_plus``.

     Args:
         posting_list: iterable of document ids to score.
         query: the query whose vocabulary drives the scoring.

     Returns:
         A dict mapping each document id to its score.  A document whose
         normalisation factor is zero gets a score of ``0.0`` instead of
         raising ``ZeroDivisionError``.
     """
     click.secho("[Search Engine] Computing search scores ...",
                 fg="bright_blue")

     # Query side: tf-idf weight of each query term and the Euclidean norm
     # of the resulting vector.
     query_vocabulary = query.get_vocabulary()
     query_tf_idf = {}
     squared_sum = 0
     for token in query_vocabulary:
         tf_idf = query.get_tf(token) * self.collection.get_idf(token)
         query_tf_idf[token] = tf_idf
         squared_sum += tf_idf**2
     norm_query_vector = sqrt(squared_sum)

     # The weighting model does not change while scoring, so pick the
     # weighting function and its tuning constants once instead of
     # re-testing the model name for every (document, token) pair.
     if self.weighting_model == "tw-idf":
         weight_of, params = self.collection.get_tw_idf, {"b": 0.003}
     elif self.weighting_model == "tf-idf":
         weight_of, params = self.collection.get_piv_plus, {"b": 0.2}
     else:
         weight_of, params = (self.collection.get_bm25_plus,
                              {"b": 0.75, "k1": 1.2})

     doc_scores = {}
     for doc_id in posting_list:
         score = 0
         for token in query_vocabulary:
             weight = weight_of(target_term=token,
                                target_doc_id=doc_id,
                                **params)
             score += query_tf_idf[token] * weight
         denominator = (self.collection.documents_norms[doc_id]
                        * norm_query_vector)
         # A zero norm on either side (e.g. every query term has idf 0)
         # would make the division blow up; such a document scores 0.
         doc_scores[doc_id] = score / denominator if denominator else 0.0
     return doc_scores
 def __get_posting_list(self, query: Query):
     """Build the posting list of the query from its tokens.

     The posting list of every token in the query vocabulary is fetched
     from ``self.collection``.  While the accumulated list is still empty
     the fetched list is taken as is; afterwards each new list is combined
     with the accumulated one through ``merge_or_postings_list``.

     Returns an empty list when the query vocabulary yields no postings.
     """
     merged = []
     for term in query.get_vocabulary():
         postings = self.collection.get_posting_list(term)
         if merged:
             merged = merge_or_postings_list(merged, postings)
         else:
             # Nothing accumulated yet: start from this token's postings.
             merged = postings
     return merged
Ejemplo n.º 3
0
    def get_list_of_documents(self, query: Query):
        """Collect the documents in which the words of the query appear.

        For every word of the query vocabulary the matching documents are
        fetched from ``self.collection`` and their count is printed.  While
        nothing has been collected yet the fetched list is kept unchanged;
        afterwards each new list is merged into the collected one as a
        sorted, duplicate-free union.
        """
        matches = []
        for word in query.get_vocabulary():
            containing = self.collection.get_documents_containing_term(word)
            print(
                f"[Search Engine] the word {word} is present in {len(containing)} items"
            )
            if not matches:
                # Nothing collected so far: start from this word's documents.
                matches = containing
                continue
            print("Merge ...")
            # Union of both lists, returned in sorted order.
            matches = sorted(set(matches).union(containing))
        return matches
Ejemplo n.º 4
0
 def compute_scores(self, list_of_docs, query: Query):
     """Score each document according to the query words it contains.

     The score of a document is the sum, over the words of the query
     vocabulary, of the query tf-idf of the word (query term frequency
     times idf) multiplied by the document tf-idf of the word
     (``self.collection.log_normalization`` times idf).

     Args:
         list_of_docs: iterable of document ids to score.
         query: the query whose vocabulary drives the scoring.

     Returns:
         A dict mapping each document id to its score.
     """
     print("Search Engine is computing search scores ...")
     vocab_query = query.get_vocabulary()

     # compute_idf is called with the word alone, so its value is the same
     # for every document: compute it once per query word and reuse it in
     # the document loop below.
     # NOTE(review): assumes compute_idf has no side effects - confirm.
     idf_by_word = {}
     query_tf_idf = {}
     for word in vocab_query:
         term_frequency = query.get_term_frequency(word)
         idf = self.collection.compute_idf(word)
         idf_by_word[word] = idf
         query_tf_idf[word] = term_frequency * idf

     # Score the documents which contain the words.
     doc_scores = {}
     for doc_id in list_of_docs:
         score = 0
         for word in vocab_query:
             normalized_tf = self.collection.log_normalization(
                 term=word, id_document=doc_id
             )
             if normalized_tf == 0:  # the word is not in the document
                 doc_tf_idf = 0
             else:
                 # tf-idf for the word in the document
                 doc_tf_idf = normalized_tf * idf_by_word[word]
             score += query_tf_idf[word] * doc_tf_idf
         doc_scores[doc_id] = score
     return doc_scores