Code example
    def create_with_ponderation_normal_frequency(self, index):
        # Weighting scheme: w(t, d) = tf(t, d) / max_t' tf(t', d),
        # i.e. each raw term frequency is divided by the largest term
        # frequency found in the same document.
        # Requires `from collections import defaultdict` at module scope.
        reverse_index = Reverse_index(self.index_type)
        reverse_index.idf = self.create_idf_counter(index)
        reverse_index.other_infos['norms'] = defaultdict(
            lambda: defaultdict(float))
        id_full_list = []
        max_frequency_in_document = defaultdict(int)

        # First, build the unnormalized reverse index and record each
        # document's maximum term frequency.
        for (document_id, tf_counter) in index:
            for term in tf_counter:
                tf_ponderation = tf_counter[term]
                reverse_index.add_entry(term, document_id, tf_ponderation)
                max_frequency_in_document[document_id] = max(
                    max_frequency_in_document[document_id], tf_ponderation)

                # Duplicate ids are collapsed into a set at the end.
                id_full_list.append(document_id)

        # Then, normalize each term by the maximum frequency occurring in
        # the document, and accumulate the per-document norms from the
        # normalized weights.
        for word in reverse_index.get_all_words():
            entry = reverse_index.get_entry(word)
            for document_id in entry:
                weight = entry[document_id] / float(
                    max_frequency_in_document[document_id])
                entry[document_id] = weight
                reverse_index.other_infos['norms'][document_id][
                    'linear'] += weight
                reverse_index.other_infos['norms'][document_id][
                    'quadratic'] += weight * weight

        reverse_index.set_id_set(set(id_full_list))

        return reverse_index
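
For intuition, here is a minimal standalone sketch of the same max-tf normalization on a toy corpus. It assumes, as the method above implies, that `index` is an iterable of `(document_id, term-frequency counter)` pairs; the `Reverse_index` class is replaced by plain dictionaries, so the names `weights`, `norms`, and `max_tf` are illustrative only:

from collections import Counter, defaultdict

# Toy corpus in the shape the method consumes:
# an iterable of (document_id, term-frequency counter) pairs.
index = [
    (1, Counter({'cat': 3, 'dog': 1})),
    (2, Counter({'dog': 2, 'fish': 2})),
]

weights = defaultdict(dict)                      # term -> {document_id: w}
norms = defaultdict(lambda: defaultdict(float))  # document_id -> norms

# Largest term frequency per document (the normalization denominator).
max_tf = {document_id: max(tf.values()) for document_id, tf in index}

for document_id, tf in index:
    for term, count in tf.items():
        w = count / float(max_tf[document_id])
        weights[term][document_id] = w
        norms[document_id]['linear'] += w
        norms[document_id]['quadratic'] += w * w

print(weights['dog'])   # {1: 0.3333..., 2: 1.0}
print(dict(norms[1]))   # {'linear': 1.3333..., 'quadratic': 1.1111...}

The 'linear' and 'quadratic' accumulators are the L1 norm and the squared L2 norm of a document's weight vector, which downstream ranking code can presumably use for cosine-style length normalization.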