Beispiel #1
0
def match(title, text):
    """Return the ratio of the longest common substring of *title* and
    *text* to the length of *title*.

    Non-string inputs are treated as a trivial match: a float here is
    presumably a NaN read from a DataFrame cell — TODO confirm against
    the caller.

    Args:
        title: Candidate title string (or a float/NaN placeholder).
        text: Text to compare against (or a float/NaN placeholder).

    Returns:
        float: len(longest common substring) / len(title); 1 for
        non-string input; 0 when there is no common substring or
        *title* is empty.
    """
    # isinstance is the correct type check: type(x) == float misses
    # float subclasses such as numpy floating scalars.
    if isinstance(title, float) or isinstance(text, float):
        return 1
    # Guard the division below: an empty title would otherwise raise
    # ZeroDivisionError whenever a common substring exists.
    if not title:
        return 0
    # distance.lcsubstrings yields the set of longest common substrings;
    # all members share the same (maximal) length, so any element works.
    strs = list(distance.lcsubstrings(title, text))
    if strs:
        return len(strs[0]) / len(title)
    return 0
Beispiel #2
0
def get_longest_substr_ratio(a, b):
    """Ratio of the longest common substring length to the shorter
    string's length, smoothed with +1 to avoid division by zero.

    Example: lcsubstrings("sedentar", "dentist") -> {'dent'}
    """
    common = list(distance.lcsubstrings(a, b))
    if not common:
        return 0
    return len(common[0]) / (min(len(a), len(b)) + 1)
Beispiel #3
0
def get_longest_substr_ratio(a, b):
    """Return the longest-common-substring ratio of the two strings.

    The denominator is the shorter string's length plus one, so an
    empty input never divides by zero.
    """
    matches = list(distance.lcsubstrings(a, b))
    return len(matches[0]) / (min(len(a), len(b)) + 1) if matches else 0
def get_longest_substr_ratio(a: str, b: str) -> float:
    """Compute the ratio of the longest common substring length to the
    length of the smaller string (denominator smoothed with +1).

    Args:
        a: String for question 1.
        b: String for question 2.

    Returns:
        Longest common substring ratio for the two question strings.
    """
    subs = list(distance.lcsubstrings(a, b))
    if not subs:
        return 0
    shorter = min(len(a), len(b))
    return len(subs[0]) / (shorter + 1)
Beispiel #5
0
def distance_vec(s1, s2):
    """Build a feature vector of pairwise string-similarity measures.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        np.ndarray with, in order: Levenshtein edit distance, Jaccard
        distance, Sorensen distance, fast_comp distance, longest common
        substring length, common-word count, tf similarity, and tfidf
        similarity.
    """
    features = [
        distance.levenshtein(s1, s2),                      # edit distance
        distance.jaccard(s1, s2),                          # Jaccard distance
        distance.sorensen(s1, s2),                         # Sorensen distance
        # distance.hamming(s1, s2),                        # Hamming (disabled)
        distance.fast_comp(s1, s2, transpositions=True),   # fast comparison
        distance.lcsubstrings(s1, s2, positions=True)[0],  # LCS length
        len(get_common_words(s1, s2)),                     # common word count
        tf_similarity(s1, s2),                             # single-text tf
        tfidf_similarity(s1, s2),                          # single-text tfidf
    ]
    return np.array(features)
Beispiel #6
0
def get_longest_substr_ratio(a, b):
    """Longest common substring length over the shorter string's
    length plus one (the +1 avoids division by zero)."""
    lcs = list(distance.lcsubstrings(a, b))
    return 0 if not lcs else len(lcs[0]) / (min(len(a), len(b)) + 1)
Beispiel #7
0
# Pairwise longest-common-substring comparison over all article texts.
texts = df.loc[:, ["Text", "Article URL"]]
texts = texts.dropna(axis=0, how="any")
text = texts["Text"]

num = len(text)
text_test = list(text[0:num])

# Upper-triangular matrices: [i, j] holds the result for pair (i, j), j > i.
results_matrix_lengths = np.zeros((len(text_test), len(text_test)))
# BUG FIX: dtype=str builds a '<U1' array that silently truncates every
# stored string to a single character; dtype=object keeps full strings.
results_matrix_strings = np.empty((len(text_test), len(text_test)), dtype=object)
print(type(results_matrix_strings))

for i in range(0, num):
    print(i)
    for j in range(i + 1, num):
        # Third positional argument is positions=False (substrings only).
        substrings = distance.lcsubstrings(text_test[i], text_test[j], False)
        # Periodic progress timestamp.
        if j % 500 == 0:
            print(datetime.datetime.now())
        if len(substrings) > 0:
            s = list(substrings)
            # Only record pairs sharing a substantial (>100 chars) substring.
            if len(s[0]) > 100:
                print(s[0])
                print(type(len(s[0])))
                results_matrix_lengths[i, j] = len(s[0])
                # One maximal substring per line, each prefixed with a
                # newline (matches the original accumulation format).
                results_matrix_strings[i, j] = "".join(
                    "\n" + str(string) for string in s)
df = pd.DataFrame(results_matrix_lengths,
                  index=texts.loc[0:num - 1, "Article URL"],
                  columns=texts.loc[0:num - 1, "Article URL"])
Beispiel #8
0
    # NOTE(review): this chunk starts mid-loop — the `if` continued by this
    # `elif` (presumably an empty-token guard for last_word_eq, mirroring
    # the first_word_eq branch below) lies outside the visible range.
    elif token_1[-1] == token_2[-1]:
        last_word_eq.append(1)
    else:
        last_word_eq.append(0)
    
    # 1 when both questions start with the same token, else 0; empty token
    # lists count as not-equal.
    if len(token_1)== 0 or len(token_2) == 0:
        first_word_eq.append(0)
    elif token_1[0] == token_2[0]:
        first_word_eq.append(1)
    else:
        first_word_eq.append(0)
    
    # Absolute difference of the token counts.
    abs_len_diff.append(abs(len(token_1)-len(token_2)))
    # NOTE(review): operator precedence bug — this computes
    # len(token_1) + (len(token_2) / 2), not the mean of the two lengths;
    # the intended expression is (len(token_1) + len(token_2)) / 2.
    mean_len.append((len(token_1)+len(token_2) / 2))

    # Longest-common-substring ratio for the question pair.
    # NOTE(review): unlike the other snippets in this file, the divisor has
    # no +1/epsilon smoothing, so an empty question string would raise
    # ZeroDivisionError — confirm inputs are non-empty.
    strs = list(distance.lcsubstrings(data['question1'][x],data['question2'][x]))
    if len(strs) ==0:
        longest_substr_ratio.append(0)
    else:
        longest_substr_ratio.append(len(strs[0])/min(len(data['question1'][x]),len(data['question2'][x])))
# %%
# Attach the accumulated per-pair feature lists as DataFrame columns.
data['cwc_min'] = cwc_min
data['cwc_max'] = cwc_max
data['csc_min'] = csc_min
data['csc_max'] = csc_max
data['ctc_min'] = ctc_min
data['ctc_max'] = ctc_max
data['last_word_eq'] = last_word_eq
data['first_word_eq'] = first_word_eq
data['abs_len_diff'] = abs_len_diff
data['mean_len'] = mean_len
Beispiel #9
0
def get_longest_substr_ratio(question1, question2):
    """Longest common substring length divided by the shorter question's
    length (a 0.0001 epsilon in the denominator avoids division by zero).
    """
    subs = list(distance.lcsubstrings(question1, question2))
    if not subs:
        return 0.0
    denom = min(len(question1), len(question2)) + 0.0001
    return len(subs[0]) / denom
Beispiel #10
0
    lambda r: distance.jaccard(r['paper_title'], r['press_headline']), axis=1)
logger.success('Make feature vector: f_jaccard_distance')

# Sorensen distance between paper title and press headline.
train['f_sorensen_distance'] = train.apply(
    lambda r: distance.sorensen(r['paper_title'], r['press_headline']), axis=1)
logger.success('Make feature vector: f_sorensen_distance')

# Fast comparison distance (transpositions counted as single edits).
train['f_fc_distance'] = train.apply(lambda r: distance.fast_comp(
    r['paper_title'], r['press_headline'], transpositions=True),
                                     axis=1)
logger.success('Make feature vector: f_fc_distance')

# Longest common substring length.
# NOTE(review): with positions=True, lcsubstrings appears to return
# (length, positions) so [0] extracts the length — confirm against the
# distance package documentation.
train['f_substring_distince'] = train.apply(lambda r: distance.lcsubstrings(
    r['paper_title'], r['press_headline'], positions=True)[0],
                                            axis=1)
logger.success('Make feature vector: f_substring_distince')

# Number of words shared by the two texts.
train['f_common_words'] = train.apply(
    lambda r: len(get_common_words(r['paper_title'], r['press_headline'])),
    axis=1)
logger.success('Make feature vector: f_common_words')

logger.success('Make feature vector...')

logger.info('Make train matrix...')
# All engineered feature columns share the 'f_' prefix.
col = [c for c in train.columns if c.startswith('f_')]

# Positive (label == 1) training examples.
pos_train = train[train['label'] == 1]