def __init__(self, corpus_list=None, sim_func=Jaro().get_raw_score,
             threshold=0.5, dampen=True):
    self.__corpus_list = corpus_list
    self.__document_frequency = {}
    self.__compute_document_frequency()
    self.__corpus_size = 0 if self.__corpus_list is None else (
        len(self.__corpus_list))
    self.sim_func = sim_func
    self.threshold = threshold
    self.dampen = dampen
    super(SoftTfIdf, self).__init__()
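# A minimal usage sketch (assumes the public SoftTfIdf API with a
# get_raw_score(bag1, bag2) method; the corpus and token bags below are
# illustrative, not from the original source):
# >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']],
# ...                        sim_func=Jaro().get_raw_score, threshold=0.8)
# >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])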
def get_raw_score(self, string1, string2):
    """Computes the raw Jaro-Winkler score between two strings.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Jaro-Winkler similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> jw = JaroWinkler()
        >>> jw.get_raw_score('MARTHA', 'MARHTA')
        0.9611111111111111
        >>> jw.get_raw_score('DWAYNE', 'DUANE')
        0.84
        >>> jw.get_raw_score('DIXON', 'DICKSONX')
        0.8133333333333332
    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    string1 = utils.convert_to_unicode(string1)
    string2 = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(string1, string2)

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    jw_score = Jaro().get_raw_score(string1, string2)
    min_len = min(len(string1), len(string2))

    # prefix length can be at most 4
    j = min(min_len, 4)
    i = 0
    while i < j and string1[i] == string2[i]:
        i += 1

    # boost the Jaro score in proportion to the shared prefix length
    if i:
        jw_score += i * self.prefix_weight * (1 - jw_score)

    return jw_score
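# Worked example of the prefix boost above, using the first docstring example:
# Jaro('MARTHA', 'MARHTA') = 0.944444..., the shared prefix 'MAR' has length 3,
# and the default Winkler prefix_weight is 0.1, so
#     0.944444... + 3 * 0.1 * (1 - 0.944444...) = 0.961111...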
def __init__(self, corpus_list=None, sim_func=Jaro().get_raw_score,
             threshold=0.95):
    self.__corpus_list = corpus_list
    self.__document_frequency = {}
    self.__compute_document_frequency()
    self.__corpus_size = 0 if self.__corpus_list is None else (
        len(self.__corpus_list))
    self.sim_func = sim_func
    self.threshold = threshold
import collections
from math import sqrt

# helper imports assumed to come from py_stringmatching
from py_stringmatching import Jaro
from py_stringmatching.utils import (sim_check_for_empty,
                                     sim_check_for_exact_match)


def soft_tfidf_norm(bag1, bag2):
    # if the strings match exactly return 1.0
    if sim_check_for_exact_match(bag1, bag2):
        return 1.0

    # if one of the strings is empty return 0
    if sim_check_for_empty(bag1, bag2):
        return 0

    sim_func = Jaro().get_raw_score
    threshold = 0.5

    # term frequency for input strings
    tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)

    # find unique elements in the input lists and their document frequency
    local_df = {}
    for element in tf_x:
        local_df[element] = local_df.get(element, 0) + 1
    for element in tf_y:
        local_df[element] = local_df.get(element, 0) + 1

    # no corpus is provided, so treat the two input bags as the corpus
    curr_df, corpus_size = (local_df, 2)

    # score every term in bag1 against every term in bag2 and keep, for each
    # term in bag1, its best-scoring match; this builds the similarity map
    similarity_map = {}
    for term_x in tf_x:
        max_score = 0.0
        for term_y in tf_y:
            score = sim_func(term_x, term_y)
            # record the pair only if it is above the threshold and is the
            # highest score seen so far for this element
            if score > threshold and score > max_score:
                similarity_map[term_x] = (score, term_x, term_y)
                max_score = score

    # positions of the sim score, first string and second string in the tuple
    sim_score_pos = 0
    first_string_pos = 1
    second_string_pos = 2

    result, v_x_2, v_y_2 = 0.0, 0.0, 0.0

    # soft-tfidf calculation: accumulate the squared tf-idf weights that
    # normalize the final score
    for element in local_df:
        idf = corpus_size / curr_df[element]
        v_x = idf * tf_x.get(element, 0)
        v_x_2 += v_x * v_x
        v_y = idf * tf_y.get(element, 0)
        v_y_2 += v_y * v_y

    # greedily consume the best-scoring pairs, using each term at most once
    used_x = {}
    used_y = {}
    for sim in sorted(similarity_map.values(), reverse=True):
        if (used_x.get(sim[first_string_pos]) is not None or
                used_y.get(sim[second_string_pos]) is not None):
            continue
        idf_first = corpus_size / curr_df.get(sim[first_string_pos], 1)
        idf_second = corpus_size / curr_df.get(sim[second_string_pos], 1)
        v_x = idf_first * tf_x.get(sim[first_string_pos], 0)
        v_y = idf_second * tf_y.get(sim[second_string_pos], 0)
        result += v_x * v_y * sim[sim_score_pos]
        used_x[sim[first_string_pos]] = True
        used_y[sim[second_string_pos]] = True

    return result if v_x_2 == 0 else result / (sqrt(v_x_2) * sqrt(v_y_2))
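# A minimal usage sketch of the function above (the token bags are
# illustrative, not from the original source):
if __name__ == '__main__':
    # exact term overlap on 'a'
    print(soft_tfidf_norm(['a', 'b', 'a'], ['a', 'c']))
    # 'starc' fuzzy-matches 'stark' via Jaro above the 0.5 threshold
    print(soft_tfidf_norm(['arya', 'stark'], ['arya', 'starc']))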
def __init__(self, sim_func=Jaro().get_raw_score, threshold=0.5):
    self.sim_func = sim_func
    self.threshold = threshold
    super(GeneralizedJaccard, self).__init__()
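# A minimal usage sketch (assumes the GeneralizedJaccard API from
# py_stringmatching, where get_raw_score takes two token lists; the tokens
# below are illustrative):
# >>> gj = GeneralizedJaccard(sim_func=Jaro().get_raw_score, threshold=0.8)
# >>> gj.get_raw_score(['Niall'], ['Neal', 'Njall'])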
def setup(self):
    self.jaro = Jaro()
class TimeJaro:
    def setup(self):
        self.jaro = Jaro()

    def time_short_short(self):
        self.jaro.get_raw_score(_short_string_1, _short_string_2)

    def time_medium_medium(self):
        self.jaro.get_raw_score(_medium_string_1, _medium_string_2)

    def time_long_long(self):
        self.jaro.get_raw_score(_long_string_1, _long_string_2)

    def time_short_medium(self):
        self.jaro.get_raw_score(_short_string_1, _medium_string_1)

    def time_short_long(self):
        self.jaro.get_raw_score(_short_string_1, _long_string_1)

    def time_medium_long(self):
        self.jaro.get_raw_score(_medium_string_1, _long_string_1)
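# TimeJaro follows the airspeed velocity (asv) benchmark convention (a setup
# method plus time_* methods) and references module-level fixture strings that
# are not shown in this snippet. A sketch of plausible fixtures follows; the
# contents and lengths are assumptions, not the originals:
_short_string_1, _short_string_2 = 'MARTHA', 'MARHTA'
_medium_string_1, _medium_string_2 = 'MARTHA' * 8, 'MARHTA' * 8
_long_string_1, _long_string_2 = 'MARTHA' * 64, 'MARHTA' * 64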
from gensim.models import KeyedVectors
from py_stringmatching import (Affine, BagDistance, Cosine, Dice, Editex,
                               GeneralizedJaccard, Jaccard, Jaro, JaroWinkler,
                               Levenshtein, MongeElkan, NeedlemanWunsch,
                               OverlapCoefficient, PartialRatio,
                               PartialTokenSort, Ratio, SmithWaterman,
                               SoftTfIdf, Soundex, TfIdf, TokenSort,
                               TverskyIndex)

af = Affine()
me = MongeElkan()
nw = NeedlemanWunsch()
sw = SmithWaterman()
bd = BagDistance()
cos = Cosine()
pr = PartialRatio()
sf = SoftTfIdf()
edx = Editex()
gj = GeneralizedJaccard()
jw = JaroWinkler()
lev = Levenshtein()
dice = Dice()
jac = Jaccard()
jaro = Jaro()
pts = PartialTokenSort()
rat = Ratio()
sound = Soundex()
tfidf = TfIdf()
ts = TokenSort()
tv_ind = TverskyIndex()
over_coef = OverlapCoefficient()

# Loading the word2vec model takes a while.
print('Loading word2vec model...')
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                                          binary=True)
print('Word2vec model loaded.')
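# A quick smoke test of the objects set up above (the example strings and
# words are illustrative; similarity() is gensim's KeyedVectors API):
print(jaro.get_raw_score('MARTHA', 'MARHTA'))
print(jw.get_raw_score('DWAYNE', 'DUANE'))
print(model.similarity('king', 'queen'))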