def test_several_string_distances(self):
    # Score common real-world input mistakes (typos, transpositions,
    # swapped tokens) against the true value with four measures.
    real_value = "Michael Beat Stolz"
    real_world_input_string = "Michael Test Stolz"
    common_mistakes_1 = "Michael Beat Stoltz"
    common_mistakes_2 = "Micheal Beat Stolz"
    common_mistakes_3 = "Michel Beat Stoltz"
    common_mistakes_4 = "Beat Michael Stolz"
    all_mistakes = [
        real_world_input_string, common_mistakes_1, common_mistakes_2,
        common_mistakes_3, common_mistakes_4
    ]

    # given
    lev = Levenshtein()
    jw = JaroWinkler()
    jaro = Jaro()
    soft_tf_idf = SoftTfIdf(None, sim_func=Jaro().get_raw_score, threshold=0.8)

    for idx, mistake in enumerate(all_mistakes):
        print("==================== Mistake Nr {} ======================".format(idx))
        print("Levenshtein (Edit Distance) {}".format(
            lev.get_sim_score(mistake, real_value)))
        print("JaroWinkler {}".format(jw.get_sim_score(mistake, real_value)))
        print("Jaro {}".format(jaro.get_sim_score(mistake, real_value)))
        print("SoftTfIdf Cosine {}".format(
            soft_tf_idf.get_raw_score(bag_of_words(mistake),
                                      bag_of_words(real_value))))
        print("==========================================")
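# Standalone sketch of the same comparison, assuming the measures come from
# the py_stringmatching package (the constructors and the get_sim_score /
# get_raw_score calls used here match that library); plain whitespace
# tokenization stands in for the project's bag_of_words helper.
def _string_distance_demo():
    from py_stringmatching import Jaro, JaroWinkler, Levenshtein, SoftTfIdf

    real = "Michael Beat Stolz"
    typo = "Micheal Beat Stolz"
    soft = SoftTfIdf(None, sim_func=Jaro().get_raw_score, threshold=0.8)
    print(Levenshtein().get_sim_score(typo, real))         # normalized edit distance
    print(JaroWinkler().get_sim_score(typo, real))         # Jaro with prefix bonus
    print(Jaro().get_sim_score(typo, real))
    print(soft.get_raw_score(typo.split(), real.split()))  # token-level soft cosine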
def tf_idf_with_corpus():
    # Load a TF-IDF similarity model that was pickled after being fitted on
    # the full corpus (Jaro similarity with a 0.95 threshold, judging by the
    # file name) and score two descriptions of the same antenna against each
    # other; the variable names suggest the Abt-Buy product-matching data set.
    import pickle

    with open('tf_idf_similarity_jaro_95', 'rb') as handle:
        tf_idf_similarity = pickle.load(handle)

    abt = ('ge futura indoor tv antenna tv24746 specially designed to receive '
           'digital tv signal 20db gain amplification noise eliminator circuitry '
           'filter designed to mount horizontally or vertically')
    buy = 'ge 24746 futura tm indoor hdtv antenna ge'
    print(tf_idf_similarity.calculate_similarity(bag_of_words(abt),
                                                 bag_of_words(buy)))
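# Hedged sketch of how a model like 'tf_idf_similarity_jaro_95' could have
# been produced. It assumes the pickled object is a WordVectorSimilarity
# (the engine exercised in the tests below) fitted with Jaro and a 0.95
# threshold, and that such an object exposes calculate_similarity; the
# actual training code is not part of this file.
def _build_and_pickle_similarity(all_bag_of_words):
    import pickle

    sim_engine = WordVectorSimilarity(all_bag_of_words,
                                      sim_func=Jaro().get_raw_score,
                                      threshold=0.95)
    with open('tf_idf_similarity_jaro_95', 'wb') as handle:
        pickle.dump(sim_engine, handle)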
def test_get_word_vector_similarities_tf_idf(self):
    all_bag_of_words = []
    s1_tokenized = bag_of_words(self.s1_complex)
    s2_tokenized = bag_of_words(self.s2_complex)
    all_bag_of_words.append(s1_tokenized)
    all_bag_of_words.append(s2_tokenized)

    sim_engine = WordVectorSimilarity(all_bag_of_words,
                                      sim_func=Jaro().get_raw_score,
                                      threshold=0.8)
    word_similarities_vector = sim_engine.get_word_vector_similarities_tf_idf(
        s1_tokenized, s2_tokenized)
    print(word_similarities_vector)
    self.assertEqual(len(word_similarities_vector), 8)
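# Hedged sketch of the tf-idf part of the engine above: an inverse document
# frequency weight per token, computed over all_bag_of_words. This is an
# illustration of the standard idf formula, not the project's actual code.
def _idf_weights(all_bags):
    import math

    n = len(all_bags)
    document_frequency = {}
    for bag in all_bags:
        for token in set(bag):
            document_frequency[token] = document_frequency.get(token, 0) + 1
    # Rare tokens get high weights; tokens occurring in every bag get weight 0.
    return {token: math.log(n / df) for token, df in document_frequency.items()}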
def test_get_word_vector_similarities(self):
    all_bag_of_words = []
    s1_tokenized = bag_of_words(self.s1_simple)
    s2_tokenized = bag_of_words(self.s2_simple)
    s3_tokenized = bag_of_words(self.s3_simple)
    all_bag_of_words.append(s1_tokenized)
    all_bag_of_words.append(s2_tokenized)
    all_bag_of_words.append(s3_tokenized)

    sim_engine = WordVectorSimilarity(all_bag_of_words,
                                      sim_func=JaroWinkler().get_raw_score,
                                      threshold=0.8)
    word_similarities_vector = sim_engine.get_word_vector_similarities_simple(
        s1_tokenized, s2_tokenized)
    print(word_similarities_vector)
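# Illustrative, hypothetical re-implementation of a "simple" (unweighted)
# word-vector similarity: for each token in one bag, keep the best
# JaroWinkler score against the other bag, zeroing anything below the
# threshold. This mirrors what the engine above is expected to return, but
# is not the project's actual implementation.
def _best_match_scores(bag_a, bag_b, threshold=0.8):
    from py_stringmatching import JaroWinkler

    sim = JaroWinkler().get_raw_score
    scores = []
    for token_a in bag_a:
        best = max((sim(token_a, token_b) for token_b in bag_b), default=0.0)
        scores.append(best if best >= threshold else 0.0)
    return scores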
def test_clean_word_set(self):
    original_input = "Panasonic 2-Line Integrated Telephone - KXTSC14W/ Call Waiting/ 50-Station Caller ID/ Voice " \
                     "Mail Message-Waiting Indicator/ Speakerphone/ 3-Line LCD Display/ White Finish "
    print(original_input)

    as_word_bag = bag_of_words(original_input)
    print(as_word_bag)
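# Hedged sketch of what bag_of_words is expected to do with the input above,
# judging by the test name: lower-case, split on every non-alphanumeric
# character (including '-' and '/'), and drop empty tokens. The real helper
# may differ.
def _bag_of_words_sketch(text):
    import re

    return [token for token in re.split(r'[^a-z0-9]+', text.lower()) if token]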
def transform_to_bag_of_words_name(self):
    # Manual data analysis has shown that many good keys are lost because
    # they get split on '-', so dasherized expressions are concatenated first.
    name_pruned = concat_dasherized_expressions(self.name)
    return bag_of_words(name_pruned)
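# Hypothetical sketch of concat_dasherized_expressions, matching the comment
# above: expressions like "2-Line" or "KX-TSC14W" are collapsed into a single
# token ("2Line", "KXTSC14W") so that the '-' no longer splits them during
# tokenization. Not the project's actual implementation.
def _concat_dasherized_expressions_sketch(text):
    import re

    # Drop a dash only when word characters sit on both sides of it.
    return re.sub(r'(?<=\w)-(?=\w)', '', text)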
def transform_to_bag_of_words(self):
    # Manual data analysis has shown that many good keys are lost because
    # they get split on '-', so dasherized expressions are concatenated first.
    name_pruned = concat_dasherized_expressions(self.name)
    return bag_of_words(" ".join(
        [name_pruned, self.description, self.manufacturer]))
def transform_to_bag_of_words_name(self):
    return bag_of_words(self.name)
def transform_to_bag_of_words(self):
    return bag_of_words(self.description)