コード例 #1
0
    def compare(self, text_a, text_b):
        """Return the Jaccard similarity of the stemmed token sets of two texts.

        Each input is passed through ``remove_spec_chars``,
        ``remove_punctuation``, ``lower`` and ``split`` (in that order), the
        resulting tokens are stemmed with ``Stemmer.stem_words``, and the
        score is ``|A & B| / |A | B|`` over the two sets of stems.

        Args:
            text_a: First text to compare.
            text_b: Second text to compare.

        Returns:
            The Jaccard coefficient of the two stem sets, or ``0.0`` when
            neither text yields any token (the ratio is undefined there).
        """
        tokens_a = self.split(
            self.lower(self.remove_punctuation(
                self.remove_spec_chars(text_a))))
        tokens_b = self.split(
            self.lower(self.remove_punctuation(
                self.remove_spec_chars(text_b))))

        stemmer = Stemmer()

        set_a = set(stemmer.stem_words(tokens_a))
        set_b = set(stemmer.stem_words(tokens_b))

        union = set_a | set_b
        if not union:
            # Both texts are empty after preprocessing: avoid dividing by zero.
            return 0.0

        return len(set_a & set_b) / len(union)
コード例 #2
0
    def compare(self, text_a, text_b):
        """Return the cosine similarity of the term-frequency vectors of two texts.

        Each input is passed through ``remove_spec_chars``,
        ``remove_punctuation``, ``lower`` and ``split`` (in that order), then
        ``remove_stopwords``, and the remaining tokens are stemmed with
        ``Stemmer.stem_words``. Every distinct stem is one vector dimension
        whose value is the number of times the stem occurs in the text.

        Args:
            text_a: First text to compare.
            text_b: Second text to compare.

        Returns:
            ``dot(a, b) / (|a| * |b|)`` for the two count vectors, or ``0.0``
            when either text yields no tokens (its vector has zero length, so
            the ratio is undefined).
        """
        # Local import keeps this method self-contained.
        from collections import Counter

        tokens_a = self.split(
            self.lower(self.remove_punctuation(
                self.remove_spec_chars(text_a))))
        tokens_b = self.split(
            self.lower(self.remove_punctuation(
                self.remove_spec_chars(text_b))))

        tokens_a = self.remove_stopwords(tokens_a)
        tokens_b = self.remove_stopwords(tokens_b)

        stemmer = Stemmer()

        counts_a = Counter(stemmer.stem_words(tokens_a))
        counts_b = Counter(stemmer.stem_words(tokens_b))

        # A Counter returns 0 for a missing key, so stems that occur only in
        # text_a contribute nothing to the dot product.
        dot_product = sum(
            count * counts_b[stem] for stem, count in counts_a.items())
        squared_norm_a = sum(count ** 2 for count in counts_a.values())
        squared_norm_b = sum(count ** 2 for count in counts_b.values())

        if squared_norm_a == 0 or squared_norm_b == 0:
            # At least one text is empty after preprocessing: avoid dividing
            # by zero.
            return 0.0

        return dot_product / (
            math.sqrt(squared_norm_a) * math.sqrt(squared_norm_b))
コード例 #3
0
    def compare(self, text_a, text_b):
        """Return the Dice similarity coefficient of the stem sets of two texts.

        Each input is passed through ``remove_spec_chars``,
        ``remove_punctuation``, ``lower`` and ``split`` (in that order), then
        ``remove_stopwords``, and the remaining tokens are stemmed with
        ``Stemmer.stem_words``. Only the presence of a stem matters, not how
        often it occurs, so the texts are compared as sets of distinct stems.

        Args:
            text_a: First text to compare.
            text_b: Second text to compare.

        Returns:
            ``2 * |A & B| / (|A| + |B|)`` for the two stem sets, or ``0.0``
            when neither text yields any token (the ratio is undefined there).
        """
        tokens_a = self.split(
            self.lower(self.remove_punctuation(
                self.remove_spec_chars(text_a))))
        tokens_b = self.split(
            self.lower(self.remove_punctuation(
                self.remove_spec_chars(text_b))))

        tokens_a = self.remove_stopwords(tokens_a)
        tokens_b = self.remove_stopwords(tokens_b)

        stemmer = Stemmer()

        set_a = set(stemmer.stem_words(tokens_a))
        set_b = set(stemmer.stem_words(tokens_b))

        total_size = len(set_a) + len(set_b)
        if total_size == 0:
            # Both texts are empty after preprocessing: avoid dividing by zero.
            return 0.0

        return (2 * len(set_a & set_b)) / total_size
コード例 #4
0
    def compare(self, text_a, text_b):
        """Return the overlap coefficient of the stemmed token sets of two texts.

        Each input is passed through ``remove_spec_chars``,
        ``remove_punctuation``, ``lower`` and ``split`` (in that order), the
        resulting tokens are stemmed with ``Stemmer.stem_words``, and the
        number of shared stems is divided by the size of the smaller set.

        Args:
            text_a: First text to compare.
            text_b: Second text to compare.

        Returns:
            ``|A & B| / min(|A|, |B|)`` for the two stem sets, or ``0.0`` when
            either text yields no tokens (the ratio is undefined there).
        """
        tokens_a = self.split(
            self.lower(self.remove_punctuation(
                self.remove_spec_chars(text_a))))
        tokens_b = self.split(
            self.lower(self.remove_punctuation(
                self.remove_spec_chars(text_b))))

        stemmer = Stemmer()

        set_a = set(stemmer.stem_words(tokens_a))
        set_b = set(stemmer.stem_words(tokens_b))

        smaller_size = min(len(set_a), len(set_b))
        if smaller_size == 0:
            # At least one text is empty after preprocessing: avoid dividing
            # by zero.
            return 0.0

        return len(set_a & set_b) / smaller_size