Exemple #1
0
    def rank_cosine_sim(self, doc_ids, q_terms):
        """Rank documents by a cosine-style similarity of tf-idf weights.

        For every document, each query term that ``self.get_q_doc_freq``
        finds in it yields two weights from ``self.calc_tf_idf``: one for the
        term inside the document and one for the term inside the query.  The
        score is the sum of the products of those weights divided by the
        product of the Euclidean norms of the two weight vectors.  Terms that
        are not found for the document contribute to neither the numerator
        nor the norms.

        Side effect: ``self.max_freq_docs[doc_id]`` is set for every document
        whose maximum term frequency was looked up and is not ``None``.

        Args:
            doc_ids: Iterable of document identifiers to score.
            q_terms: Sequence of query terms (iterated once per document, so
                it must be re-iterable).  Repeated terms increase the term's
                frequency inside the query.

        Returns:
            A list of ``(doc_id, score)`` tuples sorted by descending score,
            with scores rounded to 3 decimals.  A document whose denominator
            is zero (no query term found, or all weights zero) gets ``0.0``.
        """
        db_manager = DbManager()
        docs_relevant_scores = {}

        # Frequency of each term inside the query itself.
        q_freqs = {}
        for q_term in q_terms:
            q_freqs[q_term] = q_freqs.get(q_term, 0) + 1

        # Highest term frequency in the query.  An empty query has no terms,
        # so the value is never used in that case; 0 just avoids an error.
        max_q_freq = max(q_freqs.values()) if q_freqs else 0

        for doc_id in doc_ids:
            tf_idf_sum = 0
            denom_di_sum = 0
            denom_qi_sum = 0
            # The document's max term frequency does not depend on the query
            # term, so fetch it lazily on the first matching term and reuse it
            # instead of querying the database once per term.
            max_freq_doc = None
            max_freq_fetched = False
            # NOTE(review): a repeated query term is visited once per
            # occurrence, so it is added to all three sums more than once --
            # confirm this weighting is intended.
            for q_term in q_terms:
                q_doc_freq = self.get_q_doc_freq(q_term, doc_id)
                if q_doc_freq is None:
                    continue  # not found on index
                if not max_freq_fetched:
                    max_freq_doc = db_manager.get_max_freq_doc(doc_id)
                    max_freq_fetched = True
                    if max_freq_doc is not None:
                        self.max_freq_docs[doc_id] = max_freq_doc
                if max_freq_doc is None:
                    continue

                # number of documents in DC in which q_term appears at least once.
                n_docs_q_term = len(self.q_terms_freqs[q_term])

                tf_idf_doc = self.calc_tf_idf(q_doc_freq, max_freq_doc,
                                              self.docs_count,
                                              n_docs_q_term)
                tf_idf_q = self.calc_tf_idf(q_freqs[q_term], max_q_freq,
                                            self.docs_count, n_docs_q_term)
                tf_idf_sum += tf_idf_doc * tf_idf_q
                denom_di_sum += tf_idf_doc ** 2
                denom_qi_sum += tf_idf_q ** 2

            denom = math.sqrt(denom_di_sum) * math.sqrt(denom_qi_sum)
            # A zero denominator means there is nothing to compare; score the
            # document 0.0 rather than raising ZeroDivisionError.
            score = tf_idf_sum / denom if denom else 0.0
            docs_relevant_scores[doc_id] = round(score, 3)

        return sorted(docs_relevant_scores.items(),
                      key=operator.itemgetter(1),
                      reverse=True)
Exemple #2
0
    def rank(self, doc_ids, q_terms):
        """Rank documents by the summed tf-idf weight of the query terms.

        For every document, each query term that ``self.get_q_doc_freq``
        finds in it is weighted with ``self.calc_tf_idf``, which receives:

        * the term's frequency in the document,
        * the frequency of the document's most frequent term (constant per
          document, read through ``DbManager.get_max_freq_doc``),
        * the number of documents in the collection (``self.docs_count``),
        * the number of documents in which the term appears at least once
          (``len(self.q_terms_freqs[q_term])``).

        The document's score is the sum of those weights.

        Side effect: ``self.max_freq_docs[doc_id]`` is set for every document
        whose maximum term frequency was looked up and is not ``None``.

        Args:
            doc_ids: Iterable of document identifiers to score.
            q_terms: Sequence of query terms (iterated once per document, so
                it must be re-iterable).  A repeated term is counted once per
                occurrence.

        Returns:
            A list of ``(doc_id, score)`` tuples sorted by descending score,
            with scores rounded to 3 decimals.  Documents with no matching
            term score 0.
        """
        db_manager = DbManager()
        docs_relevant_scores = {}

        for doc_id in doc_ids:
            tf_idf_sum = 0
            # The document's max term frequency does not depend on the query
            # term, so fetch it lazily on the first matching term and reuse it
            # instead of querying the database once per term.
            max_freq_doc = None
            max_freq_fetched = False
            for q_term in q_terms:
                q_doc_freq = self.get_q_doc_freq(q_term, doc_id)
                if q_doc_freq is None:
                    continue  # not found on index
                if not max_freq_fetched:
                    max_freq_doc = db_manager.get_max_freq_doc(doc_id)
                    max_freq_fetched = True
                    if max_freq_doc is not None:
                        self.max_freq_docs[doc_id] = max_freq_doc
                if max_freq_doc is None:
                    continue

                # number of documents in DC in which q_term appears at least once.
                n_docs_q_term = len(self.q_terms_freqs[q_term])

                tf_idf_sum += self.calc_tf_idf(q_doc_freq, max_freq_doc,
                                               self.docs_count,
                                               n_docs_q_term)
            docs_relevant_scores[doc_id] = round(tf_idf_sum, 3)

        return sorted(docs_relevant_scores.items(),
                      key=operator.itemgetter(1),
                      reverse=True)