def match(title, text):
    """Return the ratio of the longest common substring length to len(title).

    Returns 1 when either argument is a float (pandas encodes missing text
    cells as NaN floats), and 0 when title is empty or the two strings share
    no common substring.
    """
    # NaN cells from pandas arrive as floats; treat them as a trivial match.
    # isinstance is the idiomatic type check (was: type(x) == float).
    if isinstance(title, float) or isinstance(text, float):
        return 1
    if not title:
        # Guard: the original divided by len(title) and raised
        # ZeroDivisionError on an empty title.
        return 0
    strs = list(distance.lcsubstrings(title, text))
    if strs:
        # Every longest common substring has the same length, so any
        # element of the returned set works here.
        return len(strs[0]) / len(title)
    return 0
def get_longest_substr_ratio(a, b):
    """Ratio of the longest common substring length to the shorter input.

    lcsubstrings finds the longest common substrings of two sequences,
    e.g. lcsubstrings("sedentar", "dentist") ==> {'dent'}.
    """
    common = list(distance.lcsubstrings(a, b))
    if not common:
        return 0
    # +1 keeps the denominator non-zero even when an input is empty.
    return len(common[0]) / (min(len(a), len(b)) + 1)
def get_longest_substr_ratio(a, b):
    """Compute the longest common substring ratio (lcs_substr_ratio) of a and b."""
    matches = list(distance.lcsubstrings(a, b))
    # +1 in the denominator guards against division by zero on empty strings.
    return len(matches[0]) / (min(len(a), len(b)) + 1) if matches else 0
def get_longest_substr_ratio(a: str, b: str) -> float:
    """Compute the ratio of the longest common substring length to the
    length of the shorter string.

    Args:
        a: String for question 1
        b: String for question 2

    Returns:
        Longest common substring ratio for the two question strings.
    """
    longest = list(distance.lcsubstrings(a, b))
    if not longest:
        return 0
    # +1 keeps the denominator positive even when one string is empty.
    return len(longest[0]) / (min(len(a), len(b)) + 1)
def distance_vec(s1, s2):
    """Build a feature vector of string-distance measures between s1 and s2."""
    # With positions=True, index 0 of the lcsubstrings result is the
    # length of the longest common substring.
    lcs_length = distance.lcsubstrings(s1, s2, positions=True)[0]
    features = [
        distance.levenshtein(s1, s2),                     # edit distance
        distance.jaccard(s1, s2),                         # Jaccard distance
        distance.sorensen(s1, s2),                        # Sorensen distance
        # distance.hamming(s1, s2),                       # Hamming distance (disabled)
        distance.fast_comp(s1, s2, transpositions=True),  # fast_comp distance
        lcs_length,                                       # longest common substring length
        len(get_common_words(s1, s2)),                    # number of common words
        tf_similarity(s1, s2),                            # single-text tf similarity
        tfidf_similarity(s1, s2),                         # single-text tf-idf similarity
    ]
    return np.array(features)
def get_longest_substr_ratio(a, b):
    # longest_substr_ratio: share of the shorter string covered by the
    # longest common substring (+1 avoids division by zero).
    subs = list(distance.lcsubstrings(a, b))
    return 0 if not subs else len(subs[0]) / (min(len(a), len(b)) + 1)
# Pairwise longest-common-substring comparison of all article texts.
# Fills an upper-triangular matrix of LCS lengths (and the substrings
# themselves) indexed by article URL.
texts = df.loc[:, ["Text", "Article URL"]]
texts = texts.dropna(axis=0, how="any")
text = texts["Text"]
num = len(text)
text_test = list(text[0:num])
results_matrix_lengths = np.zeros((num, num))
# BUG FIX: np.empty(..., dtype=str) produces a '<U1' array that silently
# truncates every stored value to a single character. An object array
# (pre-filled with "" to keep the original empty-cell default) holds the
# full strings.
results_matrix_strings = np.full((num, num), "", dtype=object)
print(type(results_matrix_strings))
for i in range(0, num):
    print(i)  # progress: which row is being processed
    for j in range(i + 1, num):  # upper triangle only; comparison is symmetric
        substrings = distance.lcsubstrings(text_test[i], text_test[j], False)
        if j % 500 == 0:
            print(datetime.datetime.now())  # coarse progress timestamp
        if len(substrings) > 0:
            s = list(substrings)
            if len(s[0]) > 100:
                # Log suspiciously long shared passages for inspection.
                print(s[0])
                print(type(len(s[0])))
            results_matrix_lengths[i, j] = len(s[0])
            # One common substring per line, each preceded by a newline
            # (matches the original accumulation: "" + "\n" + str(...)).
            results_matrix_strings[i, j] = "".join("\n" + str(sub) for sub in s)
df = pd.DataFrame(results_matrix_lengths,
                  index=texts.loc[0:num - 1, "Article URL"],
                  columns=texts.loc[0:num - 1, "Article URL"])
elif token_1[-1] == token_2[-1]: last_word_eq.append(1) else: last_word_eq.append(0) if len(token_1)== 0 or len(token_2) == 0: first_word_eq.append(0) elif token_1[0] == token_2[0]: first_word_eq.append(1) else: first_word_eq.append(0) abs_len_diff.append(abs(len(token_1)-len(token_2))) mean_len.append((len(token_1)+len(token_2) / 2)) strs = list(distance.lcsubstrings(data['question1'][x],data['question2'][x])) if len(strs) ==0: longest_substr_ratio.append(0) else: longest_substr_ratio.append(len(strs[0])/min(len(data['question1'][x]),len(data['question2'][x]))) # %% data['cwc_min'] = cwc_min data['cwc_max'] = cwc_max data['csc_min'] = csc_min data['csc_max'] = csc_max data['ctc_min'] = ctc_min data['ctc_max'] = ctc_max data['last_word_eq'] = last_word_eq data['first_word_eq'] = first_word_eq data['abs_len_diff'] = abs_len_diff data['mean_len'] = mean_len
def get_longest_substr_ratio(question1, question2):
    """Ratio of longest common substring length to the shorter question's length."""
    common = list(distance.lcsubstrings(question1, question2))
    if not common:
        return 0.0
    # The tiny epsilon keeps the denominator non-zero for empty inputs.
    denominator = min(len(question1), len(question2)) + 0.0001
    return len(common[0]) / denominator
lambda r: distance.jaccard(r['paper_title'], r['press_headline']), axis=1) logger.success('Make feature vector: f_jaccard_distance') # sorensen距离 train['f_sorensen_distance'] = train.apply( lambda r: distance.sorensen(r['paper_title'], r['press_headline']), axis=1) logger.success('Make feature vector: f_sorensen_distance') # fast compare train['f_fc_distance'] = train.apply(lambda r: distance.fast_comp( r['paper_title'], r['press_headline'], transpositions=True), axis=1) logger.success('Make feature vector: f_fc_distance') # 最长公共子串长度 train['f_substring_distince'] = train.apply(lambda r: distance.lcsubstrings( r['paper_title'], r['press_headline'], positions=True)[0], axis=1) logger.success('Make feature vector: f_substring_distince') # 公有词个数 train['f_common_words'] = train.apply( lambda r: len(get_common_words(r['paper_title'], r['press_headline'])), axis=1) logger.success('Make feature vector: f_common_words') logger.success('Make feature vector...') logger.info('Make train matrix...') col = [c for c in train.columns if c.startswith('f_')] pos_train = train[train['label'] == 1]