Beispiel #1
0
def main():
    """Segment the converted input text and report whether any segmented
    word matches one of the global ``keywords``.

    English-tagged tokens are matched fuzzily via Jaro-Winkler similarity
    against the global ``threshold``; all other tokens are matched by
    substring containment. Prints "Match" or "Not match" to stdout.
    Relies on module-level ``args``, ``t2s``, ``s2t``, ``pseg``,
    ``keywords`` and ``threshold``.
    """
    # Convert the input to simplified characters before segmentation.
    input_s = t2s.convert(args.input)
    words = pseg.cut(input_s)

    word_list = []
    for word, flag in words:
        # BUG FIX: the original used `word is not ' '`, an identity
        # comparison on a string literal, which is implementation-
        # dependent for strings; compare by value instead.
        if word != ' ':
            word_list.append({'word': s2t.convert(word), 'tag': flag})

    result = False
    for word in word_list:
        for keyword in keywords:
            if word['tag'] in 'eng':
                # Fuzzy match for English-tagged tokens.
                distance = jaro_winkler_similarity(word['word'], keyword)
                if distance > threshold:
                    result = True
            elif word['word'] in keyword:
                result = True

    print("Match" if result else "Not match")
Beispiel #2
0
def hide_word(text, words):
    """Replace occurrences of *words* (including inflected forms) in *text*
    with question marks, for use in a quiz/test.

    *words* is a single string that may contain a parenthesized part
    (which is stripped) and several space/punctuation-separated tokens.
    A token of *text* is hidden when any of its morphological parses
    matches a parse of a target token by exact normal form, by stem, or
    with Jaro-Winkler similarity >= 0.80.

    Depends on module-level ``morph`` (morphological analyzer), ``ss``
    (stemmer), ``distance`` and ``word_tokenize`` — presumably pymorphy
    and NLTK; confirm against the imports.
    """
    start_word = words
    # NOTE(review): `eng` is never used in this function.
    eng = 'abcdefghijklmnopqrstuvwxyz'
    words_to_replace = []
    # Strip combining acute accents (stress marks) from the text.
    text = text.replace('́', '')
    # Drop any parenthesized clarification, then keep both the full
    # phrase and its individual tokens as hide targets.
    words = re.sub(r'\(.*?\)', '', words).strip()
    words = [words] + re.split(r'[\s\.\,«»]+', words)
    norm_words = [morph.parse(word) for word in words]
    for word in word_tokenize(text):
        for a in norm_words:
            for a1 in a:
                for b in morph.parse(word):
                    # Hide the token on exact normal-form match, matching
                    # stems, or high Jaro-Winkler similarity between the
                    # normal forms.
                    if a1.normal_form == b.normal_form:
                        words_to_replace.append(word)
                    elif ss.stem(a1.normal_form) == ss.stem(b.normal_form):
                        words_to_replace.append(word)
                    elif distance.jaro_winkler_similarity(
                            a1.normal_form, b.normal_form) >= 0.80:
                        words_to_replace.append(word)
    # Always hide the original full phrase as well; the set() dedupes.
    words_to_replace = list(set(words_to_replace + [start_word]))
    # Replace longest candidates first so shorter substrings don't
    # clobber longer matches.
    for w in sorted(words_to_replace, key=len, reverse=True):
        text = text.replace(w, '???')
    # Collapse runs of '?', quotes and whitespace into a single '???'.
    text = re.sub(r'[\?\»][\?\s]+[\?\»]', '???', text)
    return text
Beispiel #3
0
def get_best_match(ent, p_list):
    """Return every candidate in *p_list* whose Jaro-Winkler similarity
    to *ent* is maximal.

    Each candidate ``p`` is a sequence whose first element is the string
    compared against *ent*. Returns a list of ``[max_similarity, p]``
    pairs (empty when *p_list* is empty).
    """
    # Compute every similarity exactly once. The original looped over
    # p_list twice and recomputed each similarity in the second pass;
    # it also assigned an unused `max_entity`.
    scored = []
    for p in p_list:
        sim_val = jaro_winkler_similarity(ent, p[0])
        log("\t\tSim value between : " + ent + " | " + p[0] + "   =>   " + str(sim_val))
        scored.append((sim_val, p))

    if not scored:
        return []

    max_val = max(sim for sim, _ in scored)
    return [[max_val, p] for sim, p in scored if sim == max_val]
Beispiel #4
0
def isKeyword(tokens, template):
    """Check if there is a keyword match in tokens.

    English-tagged tokens are compared fuzzily (Jaro-Winkler) against
    ``template["eng"]`` using ``template["threshold"]``; all other tokens
    are matched by substring containment against ``template["cht"]``.
    Returns True on the first match, False otherwise.
    """
    for token in tokens:
        if token["tag"] in "eng":
            # Fuzzy match for English-tagged tokens.
            for keyword in template["eng"]:
                score = jaro_winkler_similarity(token["word"], keyword)
                if score >= template["threshold"]:
                    return True
        elif any(token["word"] in keyword for keyword in template["cht"]):
            # Plain containment match for everything else.
            return True
    return False
Beispiel #5
0
def get_dists(keyword):
    """Compute four string-distance metrics between *keyword* and every
    word in the module-level ``words_preprocessed``, returning the three
    rows with the smallest edit distance as a pandas DataFrame.
    """
    rows = [
        {
            "edit_dist": edit_distance(word, keyword),
            "jaro_simi": jaro_similarity(word, keyword),
            "jaro_winkler_simi": jaro_winkler_similarity(word, keyword),
            "jaccard_dist": jaccard_distance(set(word), set(keyword)),
            "word": word,
            "keyword": keyword,
        }
        for word in words_preprocessed
    ]
    # Keep only the three closest words by edit distance.
    return pd.DataFrame(rows).sort_values("edit_dist").iloc[0:3, :]
def run_jaro_winkler_similarity(lhs, entities):
    """Find the entities whose ``'author'`` field is most similar to *lhs*
    under Jaro-Winkler similarity.

    Args:
        lhs: the string to compare against.
        entities: iterable of dicts with ``'author'`` and ``'url'`` keys.

    Returns:
        A list of ``(author, url, similarity)`` tuples sharing the maximum
        similarity; ``[('', '', 0.0)]`` when *entities* is empty or every
        similarity is 0.
    """
    max_similarity = [
        ('', '', 0.0),
    ]

    for entity in entities:
        similarity = distance.jaro_winkler_similarity(lhs, entity['author'])
        if max_similarity[0][2] < similarity:
            # New strict maximum: discard all previous ties.
            max_similarity = [
                (entity['author'], entity['url'], similarity),
            ]
        elif max_similarity[0][2] == similarity:
            # Tie with the current maximum: keep every tied entity.
            max_similarity.append(
                (entity['author'], entity['url'], similarity))

    # BUG FIX: corrected "miximum" -> "maximum" in the status message.
    print(
        'Jaro Winkler similarity with {0} and {1} entities results in {2} maximum similarity of {3}'
        .format(lhs, len(entities), len(max_similarity), max_similarity[0][2]))

    return max_similarity
Beispiel #7
0
def is_rough_match(text, name, thresh=0.8):
    """Classify how closely *text* matches *name* after lowercasing and
    stop-word removal.

    Returns:
        Match.Full when the cleaned strings are equal;
        Match.Partial when *text* equals, or is Jaro-Winkler similar
        (> *thresh*) to, a word-suffix of *name*;
        Match.NoMatch otherwise.
    """
    text = text.lower()
    name = name.lower()

    for sw in STOP_WORDS:
        # BUG FIX: escape the stop word — the original interpolated it raw
        # into the pattern, which breaks if a stop word contains regex
        # metacharacters. Build the pattern once per stop word.
        pattern = '(^| )' + re.escape(sw) + '($| )'
        text = re.sub(' +', ' ', re.sub(pattern, ' ', text)).strip()
        name = re.sub(' +', ' ', re.sub(pattern, ' ', name)).strip()

    if text == name:
        return Match.Full

    splits = name.split(' ')

    for i in range(len(splits)):
        # Compare against each word-suffix of the cleaned name.
        suffix = ' '.join(splits[i:])
        if text == suffix:
            return Match.Partial
        if jaro_winkler_similarity(text, suffix) > thresh:
            return Match.Partial

    return Match.NoMatch
    
Beispiel #8
0
def nltk_jarowinklersimilarity(parent: str, child: str):
    """Return the Jaro-Winkler similarity of *parent* and *child* wrapped
    in a 1x1 numpy array."""
    score = jaro_winkler_similarity(parent, child)
    return np.asarray([[score]])
Beispiel #9
0
def nltk_jarowinklersimilarity(parent: str, child: str):
    """Return the Jaro-Winkler similarity score of *parent* and *child*."""
    similarity = jaro_winkler_similarity(parent, child)
    return similarity