Example #1
0
    def test_several_string_distances(self):
        """Print similarity scores of common misspelled variants of a name
        against the correct reference value, using several string metrics."""
        reference = "Michael Beat Stolz"

        # variants a real user might type instead of the reference value
        variants = [
            "Michael Test Stolz",
            "Michael Beat Stoltz",
            "Micheal Beat Stolz",
            "Michel Beat Stoltz",
            "Beat Michael Stolz",
        ]

        # given
        edit_distance = Levenshtein()
        jaro_winkler = JaroWinkler()
        plain_jaro = Jaro()
        soft_tf_idf = SoftTfIdf(None,
                                sim_func=Jaro().get_raw_score,
                                threshold=0.8)

        for nr, variant in enumerate(variants):
            print(f"====================Mistake Nr {nr} ======================")
            print(f"Levenshtein (Edit Distance) "
                  f"{edit_distance.get_sim_score(variant, reference)}")
            print(f"JaroWinkler {jaro_winkler.get_sim_score(variant, reference)}")
            print(f"Jaro {plain_jaro.get_sim_score(variant, reference)}")
            soft_score = soft_tf_idf.get_raw_score(bag_of_words(variant),
                                                   bag_of_words(reference))
            print(f"SoftTfIdf Cosine {soft_score}")
            print("==========================================")
Example #2
0
def tf_idf_with_corpse():
    """Load a pre-computed TF-IDF similarity model from disk and print the
    similarity between one Abt and one Buy product description."""
    # NOTE(review): pickle.load executes arbitrary code when deserialising —
    # acceptable for a locally produced model file, never for untrusted data.
    with open('tf_idf_similarity_jaro_95', 'rb') as handle:
        tf_idf_similarity = pickle.load(handle)

    abt = 'ge futura indoor tv antenna tv24746 specially designed to receive digital tv signal 20db gain amplification noise eliminator circuitry filter designed to mount horizontally or vertically'
    buy = 'ge 24746 futura tm indoor hdtv antenna ge'

    score = tf_idf_similarity.calculate_similarity(bag_of_words(abt),
                                                   bag_of_words(buy))
    print(score)
Example #3
0
    def test_get_word_vector_similarities_tf_idf(self):
        """TF-IDF weighted word-vector similarities for the two complex
        sentences must contain exactly 8 entries."""
        tokens_a = bag_of_words(self.s1_complex)
        tokens_b = bag_of_words(self.s2_complex)
        corpus = [tokens_a, tokens_b]

        engine = WordVectorSimilarity(corpus,
                                      sim_func=Jaro().get_raw_score,
                                      threshold=0.8)

        similarities = engine.get_word_vector_similarities_tf_idf(
            tokens_a, tokens_b)
        print(similarities)

        self.assertEqual(len(similarities), 8)
Example #4
0
    def test_get_word_vector_similarities(self):
        """Compute and print simple word-vector similarities between two of
        the three simple sentences used to build the corpus."""
        tokens_1 = bag_of_words(self.s1_simple)
        tokens_2 = bag_of_words(self.s2_simple)
        tokens_3 = bag_of_words(self.s3_simple)
        corpus = [tokens_1, tokens_2, tokens_3]

        engine = WordVectorSimilarity(corpus,
                                      sim_func=JaroWinkler().get_raw_score,
                                      threshold=0.8)

        similarities = engine.get_word_vector_similarities_simple(
            tokens_1, tokens_2)

        print(similarities)
Example #5
0
 def test_clean_word_set(self):
     """Tokenise a raw product title and print the resulting bag of words."""
     raw_title = ("Panasonic 2-Line Integrated Telephone - KXTSC14W/ Call Waiting/ 50-Station Caller ID/ Voice "
                  "Mail Message-Waiting Indicator/ Speakerphone/ 3-Line LCD Display/ White Finish ")
     print(raw_title)
     tokenized = bag_of_words(raw_title)
     print(tokenized)
Example #6
0
 def transform_to_bag_of_words_name(self):
     """Tokenise the product name into a bag of words.

     Dash-separated expressions are joined first: manual data analysis showed
     that many good keys were lost when they were split on '-'.
     """
     return bag_of_words(concat_dasherized_expressions(self.name))
Example #7
0
 def transform_to_bag_of_words(self):
     """Tokenise name, description and manufacturer into one bag of words.

     Dash-separated expressions in the name are joined first: manual data
     analysis showed that many good keys were lost when split on '-'.
     """
     joined_text = " ".join([
         concat_dasherized_expressions(self.name),
         self.description,
         self.manufacturer,
     ])
     return bag_of_words(joined_text)
Example #8
0
 def transform_to_bag_of_words_name(self):
     """Return the product name tokenised into a bag of words."""
     name_text = self.name
     return bag_of_words(name_text)
Example #9
0
 def transform_to_bag_of_words(self):
     """Return the product description tokenised into a bag of words."""
     description_text = self.description
     return bag_of_words(description_text)