Beispiel #1
0
def GetScore(src_name, input_name, min_score):
    """Rate how well ``input_name`` matches ``src_name`` on a 0-100 scale.

    Punctuation is removed from both names. Each whitespace-separated part
    of ``input_name`` is paired with its best-scoring part of ``src_name``;
    a pair is rated by the mean of its Jaro-Winkler similarity and the
    value returned by ``ParsedDifference``. The per-part maxima are combined
    with ``Average``. The result is the larger of that combined value and
    the Jaro-Winkler similarity of the two whole (punctuation-free) names,
    multiplied by 100.

    ``min_score`` is accepted but is not used by this function.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    src_name = src_name.translate(strip_punctuation)
    input_name = input_name.translate(strip_punctuation)
    scorer = JaroWinkler()

    best_per_part = []
    for wanted in input_name.split():
        # Best blended score of this input part against any source part.
        best_per_part.append(max(
            (scorer.similarity(wanted, candidate)
             + ParsedDifference(wanted, candidate)) / 2
            for candidate in src_name.split()))

    whole_name_score = scorer.similarity(input_name, src_name)
    parts_score = Average(best_per_part)
    if whole_name_score > parts_score:
        best = whole_name_score
    else:
        best = parts_score
    return best * 100
def check_repeat_similar(table_info: TableInfo, target_list: List,
                         target_tag: str) -> TableInfo:
    """Flag ``table_info`` as a duplicate or suspected duplicate.

    Every name in ``target_list`` is compared with ``table_info.t_name``
    using Jaro-Winkler similarity and handed to ``restore_result`` together
    with the list that collects the candidates. The collected candidates
    are then classified:

    * similarity > 0.97: result 1 (exact duplicate); scanning stops there
    * similarity > 0.7:  result 2 (suspected duplicate)
    * otherwise:         result stays 0

    ``table_info.result`` and ``table_info.msg`` are overwritten and the
    same object is returned. A ``table_info`` whose ``result`` is already 1
    is returned untouched.
    """
    # Already marked as a duplicate: nothing left to check.
    if table_info.result == 1:
        return table_info

    scorer = JaroWinkler()
    candidates: List[RepeatInfo] = []
    for candidate_name in target_list:
        score = scorer.similarity(candidate_name, table_info.t_name)
        restore_result(candidate_name, score, candidates)

    # Turn the collected candidates into a verdict plus messages.
    messages = []
    verdict = 0
    for candidate in candidates:
        if candidate.similar_point > 0.97:
            verdict = 1
            messages.append('与' + candidate.t_name + '(' +
                            target_tag + ')完全重复')
            # An exact duplicate makes further "suspected" checks pointless.
            break
        if candidate.similar_point > 0.7:
            # The verdict cannot be 1 here: reaching 1 always breaks the loop.
            verdict = 2
            messages.append('与' + candidate.t_name + '(' +
                            target_tag + ')疑似重复')

    table_info.result = verdict
    table_info.msg = messages
    return table_info
Beispiel #3
0
def textCheck(segmentRootDir, inputDir, outputDir):
    """Proofread every segment JSON under ``segmentRootDir`` and build outputs.

    For each ``*.json`` file found while walking ``segmentRootDir`` the book
    name is taken from the containing directory. The segment file is checked
    against ``<inputDir>/<book>.txt`` with ``checkSegment``. When the check
    succeeds, a book-info JSON is built under ``<outputDir>/bookinfo``; if
    that succeeds too, the MP3 is split and VTT subtitles are produced under
    ``<outputDir>/<book>``. A failed check stops processing of the remaining
    files in the current directory.
    """
    mkdir(outputDir)

    # Text-similarity scorer; created here although this function never uses it.
    jarowinkler = JaroWinkler()

    # Walk the root directory and all of its sub-directories.
    for dirpath, dirs, files in os.walk(segmentRootDir):
        for name in files:
            if os.path.splitext(name)[1] != '.json':
                continue

            segmentFile = os.path.join(dirpath, name)
            bookname = os.path.split(dirpath)[1]
            referenceText = os.path.join(inputDir, bookname + '.txt')

            # Text proofreading; give up on this directory if it fails.
            if not checkSegment(segmentFile, referenceText):
                break

            bookInfoPath = os.path.join(outputDir, 'bookinfo')
            bookInfoFileName = os.path.join(bookInfoPath, bookname + '.json')
            mkdir(bookInfoPath)

            # Generate the book info; skip audio work when that fails.
            if not buildBookInfo(segmentFile, bookInfoFileName):
                continue

            mp3FileName = os.path.join(inputDir, bookname + '.mp3')
            mp3Path = os.path.join(outputDir, bookname)
            mkdir(mp3Path)
            # Split the MP3 by chapter, then create per-chapter VTT subtitles.
            buildMP3(bookInfoFileName, mp3FileName, mp3Path)
            buildVTT(bookInfoFileName, mp3Path)
    def fuzzy_line_equality_detection(self, lines):
        """Tag each token as technical (``_T``) or normal (``_N``) per line.

        ``lines`` is split on ``"\\n"``. For every line the highest
        Jaro-Winkler similarity to any *other* line (case-insensitive) is
        computed. If that maximum reaches ``self.similarity_threshold`` and
        the line contains something other than spaces, every
        space-separated token of the line gets the suffix ``_T``; otherwise
        ``_N``.

        Args:
            lines: A string containing newline-separated lines.

        Returns:
            The tagged lines joined with ``"\\n"``.
        """
        jarowinkler = JaroWinkler()

        # Split and lower-case once; the original redid both on every
        # comparison inside the quadratic loop.
        split_lines = lines.split("\n")
        lowered = [line.lower() for line in split_lines]

        new_lines = []
        # Compare all lines against each other
        for k, line in enumerate(split_lines):
            max_sim = 0
            for l, other in enumerate(lowered):
                if k == l:
                    continue
                jaro_sim = jarowinkler.similarity(lowered[k], other)
                # Keep the maximum similarity
                if jaro_sim > max_sim:
                    max_sim = jaro_sim

            # A line is technical when it is close enough to another line
            # and is not made up of spaces only.
            is_technical = (max_sim >= self.similarity_threshold
                            and line.replace(" ", ""))
            suffix = "_T" if is_technical else "_N"
            new_lines.append(" ".join(w + suffix for w in line.split(" ")))

        return "\n".join(new_lines)
def correct(word, dictionary):
    """Return the dictionary entry closest to ``word`` and its similarity.

    Args:
        word: The word to check.
        dictionary: A re-iterable collection of known words (it is passed
            to ``is_correct`` and then iterated once more).

    Returns:
        ``(word, 1.0)`` when ``is_correct(word, dictionary)`` is truthy,
        otherwise ``(best_word, best_score)`` where ``best_score`` is the
        highest Jaro-Winkler similarity found. On ties the first candidate
        in iteration order wins.

    Raises:
        ValueError: If ``word`` is not correct and ``dictionary`` is empty.
    """
    if is_correct(word, dictionary):
        return (word, 1.0)

    jarowinkler = JaroWinkler()
    best_word = None
    best_score = None
    # Single pass; the strict ">" keeps the first of several equal maxima,
    # matching the previous ``score.index(max(score))`` behaviour.
    for dict_word in dictionary:
        score = jarowinkler.similarity(word, dict_word)
        if best_score is None or score > best_score:
            best_word, best_score = dict_word, score

    if best_score is None:
        raise ValueError(
            "cannot correct %r: dictionary is empty" % (word,))
    return (best_word, best_score)
def jw(df):
    """Rank rows by Jaro-Winkler similarity of ``Tags2`` and ``UserInput``.

    Adds a ``jarowinkler_sim`` column to ``df`` and sorts ``df`` in place by
    it, highest first (the caller's frame is modified). Returns the first
    five rows of the sorted frame without the ``Category``, ``ReviewText2``
    and ``Tags2`` columns.
    """
    scorer = JaroWinkler()

    similarities = []
    for tags, user_input in zip(df["Tags2"], df["UserInput"]):
        similarities.append(scorer.similarity(tags, user_input))
    df["jarowinkler_sim"] = similarities

    df.sort_values(by=['jarowinkler_sim'], inplace=True, ascending=False)

    trimmed = df.drop(columns=['Category', 'ReviewText2', 'Tags2'])
    return trimmed.iloc[:5, :]
def get_similarity_score(name_1, name_2):
    """Return the Jaro-Winkler similarity of two curated author names.

    Two empty names score 1; exactly one empty name scores 0. Otherwise
    both names go through ``curate_author_name`` and ``normalize_text``,
    are lower-cased, and their Jaro-Winkler similarity is returned.
    """
    first_blank = name_1 == ''
    second_blank = name_2 == ''
    if first_blank and second_blank:
        return 1
    if first_blank or second_blank:
        # Only one side is empty at this point.
        return 0

    curated_1 = normalize_text(curate_author_name(name_1)).lower()
    curated_2 = normalize_text(curate_author_name(name_2)).lower()
    return JaroWinkler().similarity(curated_1, curated_2)
Beispiel #8
0
def find_most_apt(name, results):
	"""Pick the entry of ``results`` that best matches ``name``.

	The comparison is case-insensitive. An entry equal to ``name`` (ignoring
	case) is returned immediately; otherwise the entry with the highest
	Jaro-Winkler similarity is returned (the first one on ties).
	"""
	scorer = JaroWinkler()
	scores = []
	for candidate in results:
		wanted, offered = name.upper(), candidate.upper()
		if wanted == offered:
			return candidate
		scores.append(scorer.similarity(wanted, offered))
	best_position = int(scores.index(max(scores)))
	return results[best_position]
def are_names_similar(name_1,
                      name_2,
                      use_approximation_algorithm=False,
                      similarity_threshold=0.95):
    """Tell whether two author names refer to the same name.

    Two empty names are similar; an empty and a non-empty name are not.
    Otherwise both names go through ``curate_author_name`` and
    ``normalize_text`` and are lower-cased. With
    ``use_approximation_algorithm`` the names are similar when their
    Jaro-Winkler similarity is strictly greater than
    ``similarity_threshold``; without it they must be equal.
    """
    first_blank = name_1 == ''
    second_blank = name_2 == ''
    if first_blank and second_blank:
        return True
    if first_blank or second_blank:
        # Only one side is empty at this point.
        return False

    curated_1 = normalize_text(curate_author_name(name_1)).lower()
    curated_2 = normalize_text(curate_author_name(name_2)).lower()

    if not use_approximation_algorithm:
        return curated_1 == curated_2

    score = JaroWinkler().similarity(curated_1, curated_2)
    return score > similarity_threshold
def __affiliations_to_save(affiliations, new_affiliations):
    """Return the new affiliations that are not already known.

    A new affiliation counts as known when the Jaro-Winkler similarity of
    its normalised, lower-cased text with any existing affiliation's
    normalised, lower-cased text is at least 0.95.

    Args:
        affiliations: Iterable of existing affiliation strings.
        new_affiliations: Iterable of candidate affiliation strings.

    Returns:
        A list with the entries of ``new_affiliations`` (original text, in
        order) that matched no existing affiliation.
    """
    jarowinkler = JaroWinkler()
    similarity_threshold = 0.95

    # Normalise the existing affiliations once rather than once per
    # (existing, new) pair.
    # NOTE(review): assumes normalize_text is a pure function - confirm.
    known = [normalize_text(affiliation).lower()
             for affiliation in affiliations]

    affiliations_to_save = []
    for new_affiliation in new_affiliations:
        candidate = normalize_text(new_affiliation).lower()
        # any() stops at the first sufficiently similar existing entry.
        already_known = any(
            jarowinkler.similarity(existing, candidate)
            >= similarity_threshold
            for existing in known)
        if not already_known:
            affiliations_to_save.append(new_affiliation)
    return affiliations_to_save
    def best_match(self, search_track, tracks):
        """Return the track from ``tracks`` that best matches ``search_track``.

        Each candidate is scored as the sum of the Jaro-Winkler similarities
        of the lower-cased titles and of the lower-cased artists.

        Args:
            search_track: Object with ``title`` and ``artists`` strings.
            tracks: Indexable sequence of objects with the same attributes.

        Returns:
            The highest-scoring track (the first one on ties) when its
            combined score is greater than 1.5; otherwise ``None``.
            ``None`` is also returned when ``tracks`` is empty.
        """
        # max() raises on an empty sequence; no candidates means no match.
        if not tracks:
            return None

        jw = JaroWinkler()
        wanted_title = search_track.title.lower()
        wanted_artists = search_track.artists.lower()

        totals = [
            jw.similarity(wanted_artists, track.artists.lower())
            + jw.similarity(wanted_title, track.title.lower())
            for track in tracks
        ]

        max_total = max(totals)
        if max_total > 1.5:
            return tracks[totals.index(max_total)]
        return None
Beispiel #12
0
def map_recipes_worker(json_file):
    """Map every recipe in ``json_file`` onto the preprocessed dictionary.

    Loads ``data/preprocessed/dictionary.json`` and the given file, passes
    each recipe through ``map_to_dictionary`` and keeps the truthy results
    per user. Users left without any mapped recipe are dropped. The result
    is handed to ``json_saver`` with the target path
    ``data/preprocessed/mapped_<stem>.json``.

    Args:
        json_file: Path-like object offering ``open()`` and ``stem``; its
            JSON content is read as a mapping of user to list of recipes.
    """
    # Load similarity tools. Context managers make sure both files are
    # closed; the original left the handles to the garbage collector.
    with open("data/preprocessed/dictionary.json",
              encoding="utf-8") as dictionary_file:
        dictionary = json.load(dictionary_file)
    dictionary_words = dictionary.keys()

    jarowinkler = JaroWinkler()
    print(f"Processing {json_file}")
    with json_file.open(encoding="utf-8") as recipes_file:
        curr_json = json.load(recipes_file)

    processed_json = {}
    for user, recipes in tqdm(curr_json.items()):
        mapped_recipes = []
        for recipe in recipes:
            mapped = map_to_dictionary(jarowinkler, recipe, dictionary_words)
            if mapped:
                mapped_recipes.append(mapped)
        # Users without any mapped recipe are left out of the output.
        if mapped_recipes:
            processed_json[user] = mapped_recipes

    json_saver(f"data/preprocessed/mapped_{json_file.stem}.json",
               processed_json)
Beispiel #13
0
    def predict(self, queries="Noodles Mushroom Noodles"):
        """Print the ten most similar dictionary entries for each query.

        After loading models and dictionary, every query is mapped onto the
        dictionary with ``map_to_dictionary`` (against the keys of
        ``self.zh2en`` when ``is_chinese`` says the query is Chinese,
        against its values otherwise). A mapped query that is not Chinese is
        translated through ``self.en2zh``. The ten nearest words from
        ``self.model.wv.similar_by_word`` are printed with their scores,
        translated through ``self.zh2en``.

        Args:
            queries: A single query or a list/tuple of queries.
        """
        self._load_models()
        self._load_dictionary()

        if isinstance(queries, (list, tuple)):
            batch = queries
        else:
            batch = [queries]

        matcher = JaroWinkler()

        for raw_query in batch:
            print(f"Input: {raw_query}")
            # First map the input onto the dictionary vocabulary.
            if is_chinese(raw_query):
                vocabulary = self.zh2en.keys()
            else:
                vocabulary = self.zh2en.values()
            term = map_to_dictionary(matcher, raw_query, vocabulary)
            # The model is queried in Chinese, so translate other input.
            if not is_chinese(term):
                term = self.en2zh[term]
            neighbours = self.model.wv.similar_by_word(term, topn=10)
            for idx, (name_zh, score) in enumerate(neighbours, start=1):
                print(f"#{idx} - {self.zh2en[name_zh]}: {score:.3f}")
Beispiel #14
0
from .models import (Source, Work, Contributor)
import pandas as pd 
from similarity.jarowinkler import JaroWinkler
# Module-level Jaro-Winkler scorer, created once when the module is imported.
jarowinkler = JaroWinkler()

def worksManager(csv):
    """Main function that manages the import of works from a CSV file.

    @param csv {File} a CSV file in the work_metadata format; the columns
        read here are ``iswc``, ``title``, ``contributors`` (values
        separated by ``|``), ``source`` and ``id``.

    Matching priority:
        * Check match by ISWC
        * Check match by title
        * Validate similarity of titles
        * Check existence of contributors
        * Check similarity of contributors

    In case of data conflicts the source is saved with a descriptive status:
        * ISWC Conflict
        * Title Conflict
        * Contributors Updated
        * New Work

    NOTE(review): the part of the function visible here only reads the row
    fields and looks up works by ISWC; the remaining steps listed above are
    not shown in this portion of the file.
    """
    df = pd.read_csv(csv)
    # iterrows() yields (index, row) pairs; only the row is used.
    for row in df.iterrows():
        work = row[1]
        iswc = work["iswc"]
        title = work["title"]
        # Contributors are stored as one '|'-separated string.
        contributors = work["contributors"].split('|')
        source = work["source"]
        sourceid = work["id"]
        # Existing works that share this row's ISWC.
        worksByIswc = Work.objects.all().filter(iswc=iswc)
Beispiel #15
0
    def similarity(self, question, answer):
        """Compute a vector of string distances/similarities for two texts.

        Both texts are tokenised with ``jieba.cut``, tokens found in the
        stop-word file are dropped, and the remaining tokens of each text
        are concatenated. The two resulting strings are compared with a
        fixed set of metrics.

        Args:
            question: First text (converted with ``str``).
            answer: Second text (converted with ``str``).

        Returns:
            A list of 18 values in this order: cosine similarity, cosine
            distance, Damerau distance, Jaccard distance, Jaccard
            similarity, Jaro-Winkler distance, Jaro-Winkler similarity,
            Levenshtein distance, longest-common-subsequence distance,
            metric-LCS distance, n-gram distance, normalised Levenshtein
            distance, normalised Levenshtein similarity, optimal string
            alignment distance, q-gram distance, Sorensen-Dice distance,
            Sorensen-Dice similarity, weighted Levenshtein distance.
        """
        # A set gives constant-time stop-word lookups; the original used a
        # list, which is scanned in full for every token.
        stopwords = {
            sw.strip('\n').strip(' ')
            for sw in self.read_from(folder_path + '上证专用停用词.txt')
        }

        question_text = ''.join(
            word for word in jieba.cut(str(question))
            if word not in stopwords)
        answer_text = ''.join(
            word for word in jieba.cut(str(answer))
            if word not in stopwords)

        cosine = Cosine(1)
        damerau = Damerau()
        jaccard = Jaccard(1)
        jaro_winkler = JaroWinkler()
        levenshtein = Levenshtein()
        lcs = LongestCommonSubsequence()
        metric_lcs = MetricLCS()
        ngram = NGram(2)
        normalized_levenshtein = NormalizedLevenshtein()
        osa = OptimalStringAlignment()
        qgram = QGram(1)
        sorensen_dice = SorensenDice(2)
        weighted_levenshtein = WeightedLevenshtein(
            character_substitution=CharSub())

        # The order of the entries is part of the contract; do not reorder.
        return [
            cosine.similarity(question_text, answer_text),
            cosine.distance(question_text, answer_text),
            damerau.distance(question_text, answer_text),
            jaccard.distance(question_text, answer_text),
            jaccard.similarity(question_text, answer_text),
            jaro_winkler.distance(question_text, answer_text),
            jaro_winkler.similarity(question_text, answer_text),
            levenshtein.distance(question_text, answer_text),
            lcs.distance(question_text, answer_text),
            metric_lcs.distance(question_text, answer_text),
            ngram.distance(question_text, answer_text),
            normalized_levenshtein.distance(question_text, answer_text),
            normalized_levenshtein.similarity(question_text, answer_text),
            osa.distance(question_text, answer_text),
            qgram.distance(question_text, answer_text),
            sorensen_dice.distance(question_text, answer_text),
            sorensen_dice.similarity(question_text, answer_text),
            weighted_levenshtein.distance(question_text, answer_text),
        ]
Beispiel #16
0
from enum import Enum
from collections import namedtuple
import numpy as np
# Optional dependencies: each group below is imported only when its feature
# flag is truthy. The flags (computaGenero, graficos, computaRecorrencia) are
# not defined in this part of the file.

# Gender computation: creates a module-level Genderize instance.
if computaGenero:
    from genderize import Genderize
    genderize = Genderize()

# Charts: plotting backend.
if graficos:
    import matplotlib.pyplot as plt
    #plt.close('all')

# Recurrence computation: creates a module-level Jaro-Winkler scorer `jw`.
if computaRecorrencia:
    from unidecode import unidecode
    from operator import itemgetter
    from similarity.jarowinkler import JaroWinkler
    jw = JaroWinkler()

# Charts: additional standard-library helpers.
if graficos:
    import itertools
    import calendar

# Columns that are not part of the input must leave Expressão blank.
# Columns that have an Expressão value get their names replaced by the Descrição.
Coluna = namedtuple('Coluna', ['Descrição', 'Expressão'])


class Colunas(Enum):
    """Enumeration whose member values start with a column descriptor.

    The ``Descrição`` property reads ``value[0].Descrição``, so each
    member's value is expected to be indexable with a first element that
    has a ``Descrição`` attribute.
    """

    @property
    def Descrição(self):
        '''Column name.'''
        first_entry = self.value[0]
        return first_entry.Descrição
Beispiel #17
0
def similarityFind(srcText, srcStart, dstText, maxWords=30):
    """Find the span of ``srcText`` that is most similar to ``dstText``.

    Starting at ``srcStart``, candidate windows are taken from successive
    word starts (at most ``maxWords`` of them). Each window initially spans
    about ``len(dstText)`` characters, extended to the next space, and is
    then adjusted around the last word of ``dstText`` and nearby
    punctuation. The window is lower-cased, stripped of a fixed set of
    punctuation characters and compared with the lower-cased, stripped
    ``dstText`` using Jaro-Winkler similarity. As soon as a window fails to
    improve on the best similarity so far, the best window is optionally
    refined and returned.

    Args:
        srcText: Text to search in.
        srcStart: Index in ``srcText`` at which the search starts.
        dstText: Text to look for.
        maxWords: Maximum number of candidate start positions to try.

    Returns:
        A dict with keys ``'sim'`` (best similarity), ``'begin'`` and
        ``'end'`` (slice bounds in ``srcText``). The values stay at
        ``0``, ``-1``, ``-1`` when no window scored above 0.

    Raises:
        IndexError: If ``dstText`` contains no words; the last-word lookup
            happens before the ``try`` block. An ``IndexError`` raised
            while scanning ``srcText`` is caught and printed, and the best
            result so far is returned.
    """
    jarowinkler = JaroWinkler()
    dstText = dstText.lower().strip()
    dstLen = len(dstText)
    lastword = dstText.split()[-1]
    maxSim = {'sim': 0, 'begin': -1, 'end': -1}

    try:
        idx = srcStart
        count = 0
        while count < maxWords:
            # Compute the start position (skip leading spaces).
            begin = idx
            while srcText[begin] == ' ':
                begin += 1

            # Initial window: dstLen characters, extended to the next space.
            end = begin + dstLen
            while srcText[end] != ' ':
                end += 1

            # If the last word does not occur in the search window, widen
            # the window a little (look at the next 15 characters).
            tempIdx = srcText[begin:end].lower().rfind(lastword)
            if tempIdx < 0:
                tempIdx = srcText[end:end + 15].lower().find(lastword)
                # NOTE(review): a hit at offset 0 is ignored by this test.
                if tempIdx > 0:
                    end += tempIdx + len(lastword)
                    while srcText[end] != ' ':
                        end += 1
            else:
                # Window ends at punctuation: cut just after the last
                # ", ", ". " or "! " that follows the last word.
                tempIdx2 = srcText[begin:end].lower().rfind(', ')
                if tempIdx2 > tempIdx:
                    end = begin + tempIdx2 + 1
                else:
                    tempIdx2 = srcText[begin:end].lower().rfind('. ')
                    if tempIdx2 > tempIdx:
                        end = begin + tempIdx2 + 1
                    else:
                        tempIdx2 = srcText[begin:end].lower().rfind('! ')
                        if tempIdx2 > tempIdx:
                            end = begin + tempIdx2 + 1

            # Strip punctuation.
            temp = srcText[begin:end].lower()
            temp = temp.replace('"', '')
            temp = temp.replace('!', '')
            temp = temp.replace('?', '')
            temp = temp.replace('.', '')
            temp = temp.replace(',', '')
            temp = temp.replace('“', '')
            temp = temp.replace('”', '')
            temp = temp.replace('’', '')
            print('try:%s' % (temp))

            # Check the similarity.
            sim = jarowinkler.similarity(temp, dstText)
            print('sim:', sim)
            if sim > maxSim['sim']:
                # Still improving: remember this window. The result is
                # returned once the similarity starts to drop (else branch).
                maxSim['sim'] = sim
                maxSim['begin'] = begin
                maxSim['end'] = end
            else:
                # Similarity stopped improving: refine the best window if
                # it does not end with the last word of dstText.
                srcWordList = srcText[maxSim['begin']:maxSim['end']].split()
                if len(srcWordList) > 0 and lastword != srcWordList[-1]:
                    print('aaaaaaaaaaaaaaaa', srcWordList)
                    print('bbbbbbbbbbbbbbbb', lastword)
                    # Search backwards for a word containing the last word
                    # and re-score the window truncated at that word.
                    for i in range(len(srcWordList) - 1, -1, -1):
                        if srcWordList[i].find(lastword) >= 0:
                            temp = ' '.join(srcWordList[0:i + 1]).lower()
                            temp = temp.replace('"', '')
                            temp = temp.replace('!', '')
                            temp = temp.replace('?', '')
                            temp = temp.replace('.', '')
                            temp = temp.replace(',', '')
                            temp = temp.replace('“', '')
                            temp = temp.replace('”', '')
                            temp = temp.replace('’', '')
                            print('ccccccccccccccccc1', temp)
                            print('ccccccccccccccccc2', dstText)
                            sim = jarowinkler.similarity(temp, dstText)
                            print('ccccccccccccccccc3', sim)
                            if sim > maxSim['sim']:
                                maxSim['sim'] = sim
                                # NOTE(review): the search starts at the
                                # current window's `begin`, not at
                                # maxSim['begin'], and rfind may return -1
                                # - confirm this is intended.
                                end = srcText.rfind(lastword, begin,
                                                    maxSim['end'])
                                while srcText[end] != ' ':
                                    end += 1
                                maxSim['end'] = end
                                print('eeeeeeeeeeeeeeeeeeee',
                                      srcText[maxSim['begin']:maxSim['end']])
                                break
                return maxSim

            # Continue comparing from the next word.
            while srcText[begin] != ' ':
                begin += 1
            idx = begin
            count += 1
    except IndexError as e:
        # Scanning ran past the end of srcText; fall through to the best
        # result found so far.
        print('error:', e)

    return maxSim
Beispiel #18
0
def checkSegment(filename, textFileName, segmentIdx=0, textIdx=0):
    """Align the segments of a JSON file with a reference text file.

    For each segment from ``segmentIdx`` onwards, ``similarityFind`` looks
    for the segment's ``'text'`` in the reference text starting at
    ``textIdx``. On a match (similarity >= 0.8) the matched reference span
    is stored under the segment's ``'texc'`` key (unless already present;
    an existing ``'textCheck'`` value is moved to ``'texc'`` instead) and
    the search position advances to the end of the match. If the segment
    does not match, the *next* segment is searched for and, when found,
    the reference text before it is used for the current segment. If that
    fails as well, processing stops. The segment list is always written
    back to ``filename``.

    Args:
        filename: JSON file holding a list of segment dicts; rewritten in
            place.
        textFileName: UTF-8 text file with the reference text.
        segmentIdx: Index of the first segment to process.
        textIdx: Offset in the reference text where matching starts.

    Returns:
        ``False`` when neither a segment nor its successor could be found.
        Otherwise ``True`` if no segment was processed, or the dict from
        the last ``similarityFind`` call.
        NOTE(review): ``ret`` is used both as a flag and as the match
        dict; if the final segment scores below 0.8 the (truthy) dict is
        still returned - confirm that this is intended.
    """
    # Used to judge text similarity.
    # NOTE(review): this instance is not used anywhere in this function.
    jarowinkler = JaroWinkler()

    ret = True
    # Read the segment information from the JSON file.
    segment = []
    with open(filename, 'r', encoding='UTF-8') as f:
        segment = json.load(f)

        # Read the reference text from the txt file (this rebinds `f`
        # while the JSON file is still open).
        with open(textFileName, 'r', encoding='UTF-8') as f:
            text = f.read()
            # Flatten the text to a single line.
            text = text.replace('\n', ' ')
            text = text.replace('\r', '')

            # Start correcting from the requested position.
            for i in range(segmentIdx, len(segment)):
                print('')
                print(i, textIdx)
                #print(text[textIdx:textIdx + 150])
                dstText = segment[i]['text'].lower().strip()
                print('audio say:[%s]' % dstText)

                ret = similarityFind(text, textIdx, dstText)
                print(ret)
                if ret['sim'] >= 0.8:
                    # Segment found: record the matched reference text.
                    print('[%s]%s' % (text[ret['begin']:ret['end']],
                                      text[ret['end']:ret['end'] + 150]))
                    if not ('texc' in segment[i].keys()):
                        if not ('textCheck' in segment[i].keys()):
                            segment[i]['texc'] = text[
                                ret['begin']:ret['end']].strip()
                        else:
                            segment[i]['texc'] = segment[i]['textCheck']
                    if 'textCheck' in segment[i].keys():
                        del segment[i]['textCheck']
                    textIdx = ret['end']
                elif i + 1 < len(segment):
                    # Segment not found: locate the next segment instead and
                    # use the reference text in front of it.
                    dstText = segment[i + 1]['text'].lower().strip()
                    print('next audio say:[%s]****' % dstText)
                    ret = similarityFind(text, textIdx, dstText)
                    print(ret)
                    if ret['sim'] >= 0.8:
                        print('[%s]%s' %
                              (text[textIdx:ret['begin']],
                               text[ret['begin']:ret['begin'] + 150]))
                        if not ('texc' in segment[i].keys()):
                            if not ('textCheck' in segment[i].keys()):
                                segment[i]['texc'] = text[
                                    textIdx:ret['begin']].strip()
                            else:
                                segment[i]['texc'] = segment[i]['textCheck']
                        if 'textCheck' in segment[i].keys():
                            del segment[i]['textCheck']
                        textIdx = ret['begin']
                    else:
                        # Neither this segment nor the next one was found.
                        print(segment[i + 1]['text'].lower().strip())
                        print(text[textIdx:textIdx + 150])
                        ret = False
                        break

    # Write the (possibly updated) segments back, also after a failure.
    with open(filename, 'w', encoding='UTF-8') as f:
        json.dump(segment, f, indent=4, sort_keys=True, ensure_ascii=False)
        print('--------------------------------')
    print('**********************************')
    return ret
Beispiel #19
0
    def __init__(self):
        """Create the stemmer and similarity helpers used by this object."""

        # English Snowball stemmer.
        self.stemmer = SnowballStemmer("english")
        # Jaro-Winkler string similarity scorer.
        self.jaroWinkler = JaroWinkler()
        # Dice scoring is disabled; the attribute is kept as a placeholder.
        self.diceScore = None  #SorensenDice()
Beispiel #20
0
def search():
    """Look up script pages for the nouns found in the request text.

    The request body is JSON; the text is read from
    ``data['nlp']['source']``, each space-separated word is capitalised and
    the text is tagged with the spaCy model ``de_core_news_sm``. Tokens
    tagged ``NE``, ``NNE`` or ``NN`` become search terms. A page matches a
    term when one of its values in ``dict_list_bereinigt`` has a
    Jaro-Winkler similarity above 0.95 with the term (case-insensitive).
    With two or more terms, only pages matching both of the last two terms
    are kept; with one term, all of its pages are kept.

    The module-level ``result`` list is replaced with the reply: one text
    message plus one picture entry for every third page (in sorted key
    order), or a single "nothing found" message when no page matched.

    Returns:
        The ``jsonify`` response with ``status``, ``replies`` and
        ``conversation``.
    """
    global result
    result = []
    data = json.loads(request.get_data())
    jarowinkler = JaroWinkler()
    page_list = []
    suchwort = []

    # NOTE(review): the model is loaded on every call.
    nlp = spacy.load('de_core_news_sm')

    word = ' '.join([i.capitalize() for i in data['nlp']['source'].split(' ')])
    doc = nlp(word)
    for token in doc:
        if token.tag_ in ['NE', 'NNE', 'NN']:
            suchwort.append(token.text)

    def pages_matching(term):
        """Return the keys (with repeats) whose values resemble ``term``."""
        wanted = term.lower()
        return [
            key
            for d in dict_list_bereinigt
            for key, value in d.items()
            for i in value
            if jarowinkler.similarity(i.lower(), wanted) > 0.95
        ]

    print(word)
    if suchwort:
        first_set = pages_matching(suchwort[-1])
        if len(suchwort) >= 2:
            # Keep only pages that match both of the last two terms.
            second_set = pages_matching(suchwort[-2])
            found_pages = set(first_set).intersection(second_set)
        else:
            found_pages = first_set

        # Unique page keys, sorted once for the picture selection below.
        searchlist = sorted(set(found_pages))
        page_list = [int(key.split('.')[0]) for key in searchlist]
        sentence = "Außerdem habe {} Seite(n) im Skript mit {} finden können".format(len(page_list), ' '.join(suchwort))
        # Every third page (in sorted key order) is shown as a picture.
        pic_urls = [dictionary[key] for key in searchlist[::3]]
        result.append({'type': 'text', 'content': sentence + ". Hier sind ein paar Beispiele " + " ".join(str(i) for i in sorted(page_list))})

        for url in pic_urls:
            result.append({'type': 'picture', 'content': url})

    if len(page_list) == 0:
        # Without any recognised noun there is no suchwort[0]; fall back to
        # the capitalised input instead of raising IndexError.
        missing = suchwort[0] if suchwort else word
        result = [{'type': 'text', 'content': 'Ich konnte nichts im Skript zum Wort {} finden'.format(missing)}]

    return jsonify(
        status=200,
        replies=result,
        conversation={
            'memory': {'key': 'value'}
        }
    )
Beispiel #21
0
 def __init__(self):
     """Create the Jaro-Winkler scorer and an empty check list."""
     self.jaro_winkler = JaroWinkler()
     self.list_check = []