コード例 #1
0
    def create_with_ponderation_tf_idf(self, index, compute_norm=True):
        """Build a reverse index weighted by tf-idf.

        Each (term, document) weight is
        ``(1 + custom_log(tf)) * log10(N / df(term))`` where ``N`` is the
        number of documents and ``df`` comes from ``create_idf_counter``.

        Args:
            index: iterable of ``(document_id, tf_counter)`` pairs, where
                ``tf_counter`` maps each term to its raw frequency.
            compute_norm: when True, also accumulate per-document 'linear'
                and 'quadratic' norms of the weights in
                ``other_infos['norms']``.

        Returns:
            The populated Reverse_index.
        """
        N = len(index)
        reverse_index = Reverse_index(self.index_type)
        reverse_index.idf = self.create_idf_counter(index)
        reverse_index.other_infos['norms'] = defaultdict(
            lambda: defaultdict(float))
        id_full_list = []

        for (document_id, tf_counter) in index:
            # Only the *set* of ids is kept below, so one append per
            # non-empty document suffices (previously this appended once
            # per term; the guard keeps documents with an empty counter
            # out of the id set, exactly as before).
            if tf_counter:
                id_full_list.append(document_id)
            for term in tf_counter:
                tf_idf_ponderation = (
                    1 + self.custom_log(tf_counter[term])) * log10(
                        float(N) / reverse_index.idf[term])
                reverse_index.add_entry(term, document_id, tf_idf_ponderation)

                if compute_norm:
                    reverse_index.other_infos['norms'][document_id][
                        'linear'] += tf_idf_ponderation
                    reverse_index.other_infos['norms'][document_id][
                        'quadratic'] += tf_idf_ponderation * tf_idf_ponderation

        reverse_index.set_id_set(set(id_full_list))

        return reverse_index
コード例 #2
0
    def create_with_ponderation_tf_idf(self, index, compute_norm=True):
        """Build a tf-idf weighted reverse index.

        Each (term, document) weight is (1 + custom_log(tf)) multiplied by
        log10(N / df(term)), with N the document count and df taken from
        ``create_idf_counter``. When ``compute_norm`` is True, per-document
        'linear' and 'quadratic' norms of the weights are accumulated in
        ``other_infos['norms']``.
        """
        doc_count = len(index)
        rindex = Reverse_index(self.index_type)
        rindex.idf = self.create_idf_counter(index)
        rindex.other_infos['norms'] = defaultdict(lambda: defaultdict(float))
        seen_ids = []

        for doc_id, term_freqs in index:
            for term, freq in term_freqs.items():
                weight = (1 + self.custom_log(freq)) * log10(
                    float(doc_count) / rindex.idf[term])
                rindex.add_entry(term, doc_id, weight)

                seen_ids.append(doc_id)
                if compute_norm:
                    doc_norms = rindex.other_infos['norms'][doc_id]
                    doc_norms['linear'] += weight
                    doc_norms['quadratic'] += weight * weight

        rindex.set_id_set(set(seen_ids))

        return rindex
コード例 #3
0
    def create_with_ponderation_normal_frequency(self, index):
        """Build a reverse index with frequency normalized per document.

        Each weight is ``tf(term, doc) / max_tf(doc)``: the raw term
        frequency divided by the highest term frequency in that document.
        Per-document 'linear' and 'quadratic' norms of the *normalized*
        weights are accumulated in ``other_infos['norms']``.

        Args:
            index: iterable of ``(document_id, tf_counter)`` pairs.

        Returns:
            The populated Reverse_index.
        """
        reverse_index = Reverse_index(self.index_type)
        reverse_index.idf = self.create_idf_counter(index)
        reverse_index.other_infos['norms'] = defaultdict(
            lambda: defaultdict(float))
        id_full_list = []
        max_frequency_in_document = defaultdict(int)

        # First, create the unnormalized reverse index and record each
        # document's maximum term frequency.
        for (document_id, tf_counter) in index:
            for term in tf_counter:
                tf_ponderation = tf_counter[term]
                reverse_index.add_entry(term, document_id, tf_ponderation)
                max_frequency_in_document[document_id] = max(
                    max_frequency_in_document[document_id], tf_ponderation)

                id_full_list.append(document_id)

        # Then, normalize each weight by the maximum frequency occurrence
        # in its document.
        for word in reverse_index.get_all_words():
            entry = reverse_index.get_entry(word)
            for document_id in entry:
                normalized = entry[document_id] / float(
                    max_frequency_in_document[document_id])
                entry[document_id] = normalized
                # BUG FIX: the norms previously accumulated the stale
                # `tf_ponderation` left over from the first loop (its last
                # assigned value) instead of the normalized weight.
                reverse_index.other_infos['norms'][document_id][
                    'linear'] += normalized
                reverse_index.other_infos['norms'][document_id][
                    'quadratic'] += normalized * normalized

        reverse_index.set_id_set(set(id_full_list))

        return reverse_index
コード例 #4
0
    def create_with_ponderation_normal_frequency(self, index):
        """Build a reverse index with frequency normalized per document.

        Each weight is ``tf(term, doc) / max_tf(doc)``: the raw term
        frequency divided by the highest term frequency in that document.
        Per-document 'linear' and 'quadratic' norms of the *normalized*
        weights are accumulated in ``other_infos['norms']``.

        Args:
            index: iterable of ``(document_id, tf_counter)`` pairs.

        Returns:
            The populated Reverse_index.
        """
        reverse_index = Reverse_index(self.index_type)
        reverse_index.idf = self.create_idf_counter(index)
        reverse_index.other_infos['norms'] = defaultdict(lambda: defaultdict(float))
        id_full_list = []
        max_frequency_in_document = defaultdict(int)

        # First, create the unnormalized reverse index and record each
        # document's maximum term frequency.
        for (document_id, tf_counter) in index:
            for term in tf_counter:
                tf_ponderation = tf_counter[term]
                reverse_index.add_entry(term, document_id, tf_ponderation)
                max_frequency_in_document[document_id] = max(max_frequency_in_document[document_id], tf_ponderation)

                id_full_list.append(document_id)

        # Then, normalize each weight by the maximum frequency occurrence in its document.
        for word in reverse_index.get_all_words():
            entry = reverse_index.get_entry(word)
            for document_id in entry:
                normalized = entry[document_id] / float(max_frequency_in_document[document_id])
                entry[document_id] = normalized
                # BUG FIX: the norms previously accumulated the stale `tf_ponderation`
                # left over from the first loop (its last assigned value) instead of
                # the normalized weight.
                reverse_index.other_infos['norms'][document_id]['linear'] += normalized
                reverse_index.other_infos['norms'][document_id]['quadratic'] += normalized * normalized

        reverse_index.set_id_set(set(id_full_list))

        return reverse_index