Example #1
import string

# The snippets on this page omit their imports; JaroWinkler is presumably
# strsimpy's implementation, so that import is assumed here.
from strsimpy.jaro_winkler import JaroWinkler


def GetScore(src_name, input_name, min_score):
    src_name = src_name.translate(str.maketrans('', '', string.punctuation))
    input_name = input_name.translate(
        str.maketrans('', '', string.punctuation))
    jarowinkler = JaroWinkler()

    result = []

    for input_name_part in input_name.split():
        column = []
        for src_name_part in src_name.split():
            winkler_part = jarowinkler.similarity(
                input_name_part, src_name_part)
            difference = ParsedDifference(input_name_part, src_name_part)  # external helper (defined elsewhere)

            avg = (winkler_part + difference) / 2

            column.append(avg)
        result.append(max(column))

    full_inputted_jaro = jarowinkler.similarity(input_name, src_name)
    # Average is an external helper (defined elsewhere); keep the better of
    # the token-level average and the whole-string similarity.
    score = max(Average(result), full_inputted_jaro)
    return score * 100
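
For reference, a minimal self-contained sketch of the same per-token scoring idea, assuming strsimpy and substituting a plain mean for the Average helper and dropping the ParsedDifference term (both are project-specific helpers not shown here):

from strsimpy.jaro_winkler import JaroWinkler

def token_score(src_name, input_name):
    jw = JaroWinkler()
    # For each input token, keep its best match among the source tokens.
    best = [max(jw.similarity(ip, sp) for sp in src_name.split())
            for ip in input_name.split()]
    per_token = sum(best) / len(best)
    # Fall back to the whole-string similarity when it beats the token average.
    return max(per_token, jw.similarity(input_name, src_name)) * 100

print(token_score("John Smith", "Jon Smith"))  # close to 100 for near-identical names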
Example #2
def check_repeat_similar(table_info: TableInfo, target_list: List,
                         target_tag: str) -> TableInfo:
    # Already marked as a duplicate; return immediately.
    if table_info.result == 1:
        return table_info

    jk = JaroWinkler()
    similar_list: List[RepeatInfo] = list()
    for t_name in target_list:
        similar_point = jk.similarity(t_name, table_info.t_name)
        restore_result(t_name, similar_point, similar_list)
    # Process the lookup results.
    similar_msg_list = list()
    similar_result = 0
    for repeat_info in similar_list:
        if repeat_info.similar_point > 0.97:
            similar_result = 1
            similar_msg_list.append('exact duplicate of ' + repeat_info.t_name +
                                    ' (' + target_tag + ')')
            # An exact duplicate makes further fuzzy matching unnecessary.
            break
        elif repeat_info.similar_point > 0.7:
            similar_result = similar_result if similar_result == 1 else 2
            similar_msg_list.append('suspected duplicate of ' +
                                    repeat_info.t_name + ' (' + target_tag + ')')

    table_info.result = similar_result
    table_info.msg = similar_msg_list
    return table_info
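
The same two-tier thresholding can be shown standalone; a minimal sketch assuming strsimpy, with the TableInfo/RepeatInfo bookkeeping stripped out (0 = unique, 1 = exact duplicate, 2 = suspected duplicate):

from strsimpy.jaro_winkler import JaroWinkler

def classify(name, candidates, exact=0.97, fuzzy=0.7):
    jw = JaroWinkler()
    verdict = 0
    for c in candidates:
        s = jw.similarity(name, c)
        if s > exact:
            return 1  # exact duplicate; no need to keep scanning
        if s > fuzzy:
            verdict = 2  # suspected duplicate
    return verdict

print(classify("user_order_info", ["user_order_info_v2", "payment_log"]))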
Example #3
    def fuzzy_line_equality_detection(self, lines):
        new_lines = []
        rows = lines.split("\n")  # split once instead of on every access

        jarowinkler = JaroWinkler()
        # Compare all lines against each other
        for k in range(len(rows)):
            max_sim = 0
            for l in range(len(rows)):
                if k == l:
                    continue
                jaro_sim = jarowinkler.similarity(rows[k].lower(),
                                                  rows[l].lower())

                # Keep the maximum similarity
                if jaro_sim > max_sim:
                    max_sim = jaro_sim

            # If the maximum similarity reaches the threshold (and the line is
            # not blank), tag every token as technical (_T); otherwise (_N).
            if max_sim >= self.similarity_threshold and rows[k].replace(" ", ""):
                new_lines.append(" ".join(w + "_T" for w in rows[k].split(" ")))
            else:
                new_lines.append(" ".join(w + "_N" for w in rows[k].split(" ")))

        return "\n".join(new_lines)
Example #4
    def best_match(self, search_track, tracks):
        if not tracks:
            return None  # guard: max() below would fail on an empty list
        jw = JaroWinkler()
        title_similarities = []
        artists_similarities = []
        totals = []
        for track in tracks:
            title_similarity = jw.similarity(search_track.title.lower(), track.title.lower())
            title_similarities.append(title_similarity)
            artists_similarity = jw.similarity(search_track.artists.lower(), track.artists.lower())
            artists_similarities.append(artists_similarity)
            totals.append(artists_similarity + title_similarity)

        max_index = totals.index(max(totals))
        max_total = totals[max_index]
        # Accept only when the combined title+artist similarity clears 1.5
        # (out of a possible 2.0).
        if max_total > 1.5:
            return tracks[max_index]
        else:
            return None
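
A hypothetical smoke test; Track is a made-up stand-in for whatever track type the surrounding project uses (only .title and .artists are read), and the scoring is reproduced as a plain function so the example is self-contained (strsimpy assumed):

from collections import namedtuple
from strsimpy.jaro_winkler import JaroWinkler

Track = namedtuple("Track", ["title", "artists"])  # made-up stand-in

def best_track(search_track, tracks, threshold=1.5):
    # Same scoring as best_match above: sum of title and artist similarity.
    jw = JaroWinkler()
    scored = [(jw.similarity(search_track.title.lower(), t.title.lower()) +
               jw.similarity(search_track.artists.lower(), t.artists.lower()), t)
              for t in tracks]
    total, best = max(scored, default=(0.0, None))
    return best if total > threshold else None

wanted = Track("Bohemian Rhapsody", "Queen")
catalog = [Track("Bohemian Rapsody", "Queen"), Track("Radio Ga Ga", "Queen")]
print(best_track(wanted, catalog))  # -> the first catalog entry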
Example #5
def correct(word, dictionary):
    if is_correct(word, dictionary):
        return (word, 1.0)
    else:
        jarowinkler = JaroWinkler()
        score = []
        for dict_word in dictionary:
            score.append(jarowinkler.similarity(word, dict_word))
        return (dictionary[score.index(max(score))], max(score))
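
An illustrative call; is_correct is not shown in the snippet, so a plain membership test stands in for it (strsimpy assumed for JaroWinkler):

from strsimpy.jaro_winkler import JaroWinkler

def is_correct(word, dictionary):  # stand-in for the helper the snippet omits
    return word in dictionary

words = ["receive", "believe", "achieve"]
print(correct("recieve", words))  # -> ('receive', <similarity just below 1.0>)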
Example #6
def get_similarity_score(name_1, name_2):
    if name_1 == '' and name_2 == '':
        return 1
    if name_1 == '' or name_2 == '':  # exactly one side is empty at this point
        return 0
    c_name_1 = normalize_text(curate_author_name(name_1)).lower()
    c_name_2 = normalize_text(curate_author_name(name_2)).lower()
    jarowinkler = JaroWinkler()
    similarity_score = jarowinkler.similarity(c_name_1, c_name_2)
    return similarity_score
Example #7
def jw(df):
    jarowinkler = JaroWinkler()
    df["jarowinkler_sim"] = [
        jarowinkler.similarity(i, j)
        for i, j in zip(df["Tags2"], df["UserInput"])
    ]
    df.sort_values(by=['jarowinkler_sim'], inplace=True, ascending=False)
    final = df.drop(['Category', 'ReviewText2', 'Tags2'], axis=1).iloc[:5, :]

    return final
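
A hypothetical frame to exercise the function; the column names come from the snippet, the values are made up (pandas and strsimpy assumed):

import pandas as pd
from strsimpy.jaro_winkler import JaroWinkler

df = pd.DataFrame({
    "Category":    ["a", "b"],
    "ReviewText2": ["...", "..."],
    "Tags2":       ["fast delivery", "slow delivery"],
    "UserInput":   ["fast delvery", "fast delvery"],
})
print(jw(df))  # rows ranked by jarowinkler_sim; helper columns dropped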
Example #8
def find_most_apt(name, results):
    jarowinkler = JaroWinkler()
    deg = []
    for el in results:
        if name.upper() == el.upper():
            return el
        deg.append(jarowinkler.similarity(name.upper(), el.upper()))
    # No exact match: return the candidate with the highest similarity.
    return results[deg.index(max(deg))]
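
An illustrative call with made-up candidates (strsimpy assumed):

from strsimpy.jaro_winkler import JaroWinkler

candidates = ["Alan Turing", "Alonzo Church", "Kurt Goedel"]
print(find_most_apt("alan turning", candidates))  # -> 'Alan Turing'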
Example #9
class DuplicatesListPipeline(object):
    def __init__(self):
        self.jaro_winkler = JaroWinkler()
        self.list_check = list()

    def process_item(self, item, spider):
        name_tour = item['name_tour']
        url_tour = item['url_tour']
        number_date = item['number_date']
        if number_date is None:
            number_date = ''
        start_date = item['start_date']
        if start_date is None:
            start_date = ''

        check = name_tour + url_tour + str(number_date) + start_date

        for value in self.list_check:
            name_tour_new = value['name_tour']
            if name_tour_new is None:
                name_tour_new = ''
            url_tour_new = value['url_tour']
            number_date_new = value['number_date']
            if number_date_new is None:
                number_date_new = ''
            start_date_new = value['start_date']
            if start_date_new is None:
                start_date_new = ''

            check_new = name_tour_new + url_tour_new + str(number_date_new) + start_date_new

            score = self.jaro_winkler.similarity(check, check_new)

            if score >= 0.85:
                print("SCORE: " + str(score))
                raise DropItem("Duplicate item found")

        # No stored item was similar enough; remember this one and pass it on.
        self.list_check.append(item)
        return item
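
A hypothetical smoke test for the pipeline (scrapy and strsimpy assumed; the spider argument is unused here):

from scrapy.exceptions import DropItem

pipe = DuplicatesListPipeline()
item = {'name_tour': 'City Tour', 'url_tour': 'http://example.com/t1',
        'number_date': 3, 'start_date': '2020-01-01'}
pipe.process_item(item, spider=None)            # first sighting: kept
try:
    pipe.process_item(dict(item), spider=None)  # identical string: similarity 1.0
except DropItem:
    print("duplicate dropped")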
Example #10
def are_names_similar(name_1,
                      name_2,
                      use_approximation_algorithm=False,
                      similarity_threshold=0.95):
    if name_1 == '' and name_2 == '':
        return True
    if name_1 == '' or name_2 == '':  # exactly one side is empty at this point
        return False
    c_name_1 = normalize_text(curate_author_name(name_1)).lower()
    c_name_2 = normalize_text(curate_author_name(name_2)).lower()
    if use_approximation_algorithm:
        jarowinkler = JaroWinkler()
        similarity_score = jarowinkler.similarity(c_name_1, c_name_2)
        return similarity_score > similarity_threshold
    else:
        return c_name_1 == c_name_2
Example #11
def __affiliations_to_save(affiliations, new_affiliations):
    jarowinkler = JaroWinkler()
    similarity_threshold = 0.95
    affiliations_to_save = []
    for new_affiliation in new_affiliations:
        # Normalize once per new affiliation, not once per comparison.
        new_affiliation_nor = normalize_text(new_affiliation)
        exist_affiliation = False
        for affiliation in affiliations:
            affiliation_nor = normalize_text(affiliation)
            similarity_score = jarowinkler.similarity(
                affiliation_nor.lower(), new_affiliation_nor.lower())
            if similarity_score >= similarity_threshold:
                exist_affiliation = True
                break  # one close match is enough
        if not exist_affiliation:
            affiliations_to_save.append(new_affiliation)
    return affiliations_to_save
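
An illustrative call; normalize_text is not shown in the snippet, so a whitespace-collapsing stub stands in purely for the demo (strsimpy assumed):

from strsimpy.jaro_winkler import JaroWinkler

def normalize_text(text):  # stand-in for the project's real normalizer
    return " ".join(text.split())

known = ["MIT CSAIL"]
new = ["MIT  CSAIL", "Stanford AI Lab"]
print(__affiliations_to_save(known, new))  # -> ['Stanford AI Lab']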
Example #12
    def similarity(self, question, answer):

        # Load the stopword list (an SSE-specific stopword file; path kept as-is).
        stopword = self.read_from(folder_path + '上证专用停用词.txt')
        stopwords = [sw.strip('\n').strip(' ') for sw in stopword]

        meaningful_words1 = []
        meaningful_words2 = []

        words2 = jieba.cut(str(question))
        words3 = jieba.cut(str(answer))
        for word in words2:
            if word not in stopwords:
                meaningful_words1.append(word)
        for word in words3:
            if word not in stopwords:
                meaningful_words2.append(word)
        s2 = ''.join(meaningful_words1)
        # print(s2)
        s3 = ''.join(meaningful_words2)
        a1 = Cosine(1)
        b1 = Damerau()
        c1 = Jaccard(1)
        d1 = JaroWinkler()
        e1 = Levenshtein()
        f1 = LongestCommonSubsequence()
        g1 = MetricLCS()
        h1 = NGram(2)
        i1 = NormalizedLevenshtein()
        j1 = OptimalStringAlignment()
        k1 = QGram(1)
        l1 = SorensenDice(2)
        m1 = WeightedLevenshtein(character_substitution=CharSub())

        # Collect one feature per metric, preserving the original order of
        # similarity/distance values.
        line_sim = [
            a1.similarity(s2, s3), a1.distance(s2, s3),  # Cosine
            b1.distance(s2, s3),                         # Damerau
            c1.distance(s2, s3), c1.similarity(s2, s3),  # Jaccard
            d1.distance(s2, s3), d1.similarity(s2, s3),  # JaroWinkler
            e1.distance(s2, s3),                         # Levenshtein
            f1.distance(s2, s3),                         # LongestCommonSubsequence
            g1.distance(s2, s3),                         # MetricLCS
            h1.distance(s2, s3),                         # NGram
            i1.distance(s2, s3), i1.similarity(s2, s3),  # NormalizedLevenshtein
            j1.distance(s2, s3),                         # OptimalStringAlignment
            k1.distance(s2, s3),                         # QGram
            l1.distance(s2, s3), l1.similarity(s2, s3),  # SorensenDice
            m1.distance(s2, s3),                         # WeightedLevenshtein
        ]

        return line_sim
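
The method returns an 18-element feature vector (similarity and/or distance per metric, in the order listed). A reduced, self-contained sketch of the same idea, assuming strsimpy and skipping the jieba/stopword preprocessing:

from strsimpy.cosine import Cosine
from strsimpy.jaro_winkler import JaroWinkler
from strsimpy.normalized_levenshtein import NormalizedLevenshtein

def quick_features(s2, s3):
    feats = []
    for m in (Cosine(1), JaroWinkler(), NormalizedLevenshtein()):
        feats.append(m.similarity(s2, s3))
        feats.append(m.distance(s2, s3))
    return feats

print(quick_features("open an account", "open a new account"))  # 6 features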
        print("Matching name", name1, "from", index_path)
        numMatch = 0
        numMatchApprox = 0
        aux = True
        for name2, ids2 in map2.items():
            if name1.lower() == name2.lower():
                for id1 in ids1:
                    for id2 in ids2:
                        matches.add((index_path, id1, id2[0], id2[1]))
                numMatch += 1
                aux = False
        sim_threshold = 0.995
        # Progressively relax the threshold until a fuzzy match is found.
        while aux and sim_threshold >= 0.9:
            for name2, ids2 in map2.items():
                # Names are compared reversed so Jaro-Winkler's prefix bonus
                # rewards matching endings rather than matching beginnings.
                if jarowinkler.similarity(name1.lower()[::-1],
                                          name2.lower()[::-1]) > sim_threshold:
                    for id1 in ids1:
                        for id2 in ids2:
                            matches.add((index_path, id1, id2[0], id2[1]))
                    numMatchApprox += 1
                    aux = False
            sim_threshold -= 0.005
        num_index_places += 1
        if numMatch > 0 or numMatchApprox > 0: num_index_places_matched += 1
        if numMatch == 0 and numMatchApprox > 0:
            num_index_places_matched_approx += 1
        if numMatch == 1 or (numMatch == 0 and numMatchApprox == 1):
            num_index_places_matched_single += 1

results = []
for match in matches:
Example #14
class TextProcessing(object):
    def __init__(self):

        self.stemmer = SnowballStemmer("english")
        self.jaroWinkler = JaroWinkler()
        # The snippet left this as None, which would crash textSimilarity
        # below; instantiate SorensenDice instead.
        self.diceScore = SorensenDice()

    def removeStopWords(self, text):
        stopList = {'the', 'of', 'by', 'in', 'on', 'at', 'for', 'an'}
        textsplited = text.replace("_", " ").split(" ")
        return " ".join([w for w in textsplited if w not in stopList])

    def removeSpecialCharacters2(self, text):
        pos = text.split("__")
        if len(pos) > 1:
            _text = pos[1]
            pos = pos[0] + "__"
        else:
            pos = ''
            _text = text
        _text = _text.replace("_", " ")
        _text = _text.replace('&', ' and ').strip()
        _text = _text.replace('%', ' percentage ').strip()
        _text = _text.replace('#', ' no. ').strip()
        _text = _text.replace('$', ' currency ').strip()
        characters = [
            '/', '\\', '>', '<', "'s", "(s)", '\"', "[", "]", "(", ")", "{",
            "}", "."
        ]
        for c in characters:
            if c in _text:
                _text = _text.replace(c, ' ').strip()
        _text = re.sub('\\s+', ' ', _text).strip()
        return pos + _text.replace(" ", "_")

    def stemWord(self, text):
        pos = text.split("__")
        if len(pos) > 1:
            _text = pos[1]
            pos = pos[0] + "__"
        else:
            pos = ''
            _text = text
        _text = " ".join(_text.split("_"))
        result = [self.stemmer.stem(t) for t in _text.split(" ")]
        result = " ".join(result)

        return pos + result.strip().replace(" ", "_")

    def cleanForSimilarity(self, text):
        if "protag_article" in text:
            return ""
        if "__" in text:
            cleant = text.split("__")[1]
        else:
            cleant = text

        if " :" in cleant:
            cleant = cleant.split(" :")[1]
        if "@en" in cleant:
            cleant = cleant.split("@")[0]
            cleant = [self.stemmer.stem(t) for t in cleant.split(" ")]
            cleant = "".join(cleant)

        cleant = cleant.replace("*", "").replace("_",
                                                 "").replace("spancol",
                                                             "").split("@")[0]

        return cleant

    def textSimilarity(self, text1, text2):
        #print("text inicial: ", text1, text2)
        t1 = self.cleanForSimilarity(text1)
        t2 = self.cleanForSimilarity(text2)
        if len(t1) == 0 and len(t2) == 0:
            return 0
        score1 = self.jaroWinkler.similarity(t1, t2)
        score2 = self.diceScore.similarity(t1, t2)
        mins = min([score1, score2])
        #print(t1,t2,mins)
        return mins

    def cleanCellHeader(self, cellText):
        _cellText = re.sub('\\s+', ' ', cellText).strip()
        _cellText = self.removeSpecialCharacters2(_cellText)
        _cellText = self.stemWord(_cellText)
        return _cellText

    def orderHeaders(self, headers):
        _headers = headers[:]
        hd = {hi: [] for hi in _headers}
        for i, hi in enumerate(_headers):
            hd[hi].append(i)
        headersD = {}
        for k, v in hd.items():
            if len(v) > 1:
                _v = v
                _v.sort()
                i = 1
                for posh in _v:
                    headersD[posh] = str(i) + "__" + k
                    i += 1
            else:
                headersD[v[0]] = k
        _headers = []
        for i in range(len(headers)):
            _headers.append(headersD.get(i))
        return _headers

    def cleanTableHeader(self, headers):
        dataTypes = [h.split("@")[-1] for h in headers]
        _headers = [h.split("@")[0] for h in headers]
        _headers = [self.removeSpecialCharacters2(h) for h in _headers]
        _headers = [self.stemWord(h) for h in _headers]
        _headers = ['spancol' if hi == "" else hi for hi in _headers]
        hd = {hi: [] for hi in _headers}
        for i, hi in enumerate(_headers):
            hd[hi].append(i)
        headersD = {}
        for k, v in hd.items():
            if len(v) > 1:
                _v = v
                _v.sort()
                i = 1
                for posh in _v:
                    headersD[posh] = str(i) + "__" + k
                    i += 1
            else:
                headersD[v[0]] = k
        _headers = []
        for k, v in headersD.items():
            _headers.append(v + "@" + dataTypes[k])
        return _headers
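
Illustrative usage with made-up header strings; nltk and strsimpy are assumed to be installed, and the imports below must be in scope for the class to run:

import re
from nltk.stem.snowball import SnowballStemmer
from strsimpy.jaro_winkler import JaroWinkler
from strsimpy.sorensen_dice import SorensenDice

tp = TextProcessing()
# Duplicate headers get numbered; '$' becomes 'currency'; words are stemmed.
print(tp.cleanTableHeader(["Total Revenue ($)@number", "Total Revenue ($)@number"]))
print(tp.textSimilarity("total_revenu", "total_revenue"))  # min of JW and Dice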
Example #15
    temp_article.append(data[i][0])

print(len(data))

my_string = "human moblity prediction spatiotemporal next place future location point-of-interest hotspot forecasting modelling mobility behaviors traffic trajectory mobile phone"

p = []

filter_thresh_45 = []

jarowinkler = JaroWinkler()  # create once, outside the loop

for i in range(len(temp_article)):

    sim = jarowinkler.similarity(my_string, temp_article[i])

    if sim > 0.45:

        filter_thresh_45.append(data[i])

normalized_levenshtein = NormalizedLevenshtein()

filter_normalized_levenshtein = []

for i in range(len(filter_thresh_45)):

    sim = normalized_levenshtein.distance(my_string, filter_thresh_45[i][0])  # a distance, despite the variable name

    if sim >= 0.7:
Example #16
def search():
    global result
    result = []
    data = json.loads(request.get_data())
    jarowinkler = JaroWinkler()
    page_list = []
    suchwort = []
    first_set = []
    second_set = []

    nlp = spacy.load('de_core_news_sm')
    # nlp = spacy.load('en_core_web_sm', disable=["parser", 'ner'])

    word = ' '.join([i.capitalize() for i in data['nlp']['source'].split(' ')])
    doc = nlp(word)
    for token in doc:
        # if token.tag_ in ['NNP','NNPS', 'NN', 'NNS']:
        if token.tag_ in ['NE','NNE', 'NN']:
            suchwort.append(token.text)
        
    print(word)
    if suchwort:
        if len(suchwort) >= 2:

            for d in dict_list_bereinigt:
                for key, value in d.items():
                    for i in value:
                        if jarowinkler.similarity(i.lower(), suchwort[-1].lower()) > 0.95:
                            first_set.append(key)

            for d in dict_list_bereinigt:
                for key, value in d.items():
                    for i in value:
                        if jarowinkler.similarity(i.lower(), suchwort[-2].lower()) > 0.95:
                            second_set.append(key)
            found_pages = list(set(first_set).intersection(set(second_set)))
        else:
            for d in dict_list_bereinigt:
                for key, value in d.items():
                    for i in value:
                        if jarowinkler.similarity(i.lower(), suchwort[-1].lower()) > 0.95:
                            first_set.append(key)
            found_pages = first_set

        searchlist = list(set(found_pages))
        page_list = [int(i[0]) for i in [i.split('.') for i in searchlist]]
        sentence = "Außerdem habe {} Seite(n) im Skript mit {} finden können".format(len(page_list),' '.join(suchwort))  
        pic_urls = [dictionary[sorted(searchlist)[i]] for i in range(0,len(searchlist),3)]    
        result.append({'type': 'text', 'content':sentence + ". Hier sind ein paar Beispiele " + " ".join(str(i) for i in sorted(page_list))})

        for i in pic_urls:
            myd = {'type': 'picture','content':''}
            myd['content'] = i
            result.append(myd)
            
    if len(page_list) == 0:
        searched = suchwort[0] if suchwort else word  # guard: suchwort may be empty
        result = [{'type': 'text', 'content': 'I could not find anything in the script for the word {}'.format(searched)}]

    return jsonify(
        status=200,
        replies=result,
        conversation={
            'memory': {'key': 'value'}
        }
    )
Example #17
def similarityFind(srcText, srcStart, dstText, maxWords=30):
    jarowinkler = JaroWinkler()
    dstText = dstText.lower().strip()
    dstLen = len(dstText)
    lastword = dstText.split()[-1]
    maxSim = {'sim': 0, 'begin': -1, 'end': -1}

    try:
        idx = srcStart
        count = 0
        while count < maxWords:
            # Compute the start position
            begin = idx
            while srcText[begin] == ' ':
                begin += 1

            end = begin + dstLen
            while srcText[end] != ' ':
                end += 1

            # If the last word did not appear in the window, widen it a little
            tempIdx = srcText[begin:end].lower().rfind(lastword)
            if tempIdx < 0:
                tempIdx = srcText[end:end + 15].lower().find(lastword)
                if tempIdx > 0:
                    end += tempIdx + len(lastword)
                    while srcText[end] != ' ':
                        end += 1
            else:
                # Prefer ending the window at sentence punctuation
                tempIdx2 = srcText[begin:end].lower().rfind(', ')
                if tempIdx2 > tempIdx:
                    end = begin + tempIdx2 + 1
                else:
                    tempIdx2 = srcText[begin:end].lower().rfind('. ')
                    if tempIdx2 > tempIdx:
                        end = begin + tempIdx2 + 1
                    else:
                        tempIdx2 = srcText[begin:end].lower().rfind('! ')
                        if tempIdx2 > tempIdx:
                            end = begin + tempIdx2 + 1

            # Strip punctuation from the candidate window
            temp = srcText[begin:end].lower()
            for ch in '"!?.,“”’':
                temp = temp.replace(ch, '')
            print('trying window: %s' % temp)

            # Check how similar the candidate is
            sim = jarowinkler.similarity(temp, dstText)
            print('sim:', sim)
            if sim > maxSim['sim']:
                # Still improving: remember this window and keep widening.
                maxSim['sim'] = sim
                maxSim['begin'] = begin
                maxSim['end'] = end
            else:
                # Similarity started to drop: trim the best window and return.
                srcWordList = srcText[maxSim['begin']:maxSim['end']].split()
                if len(srcWordList) > 0 and lastword != srcWordList[-1]:
                    print('best window words:', srcWordList)
                    print('target last word:', lastword)
                    for i in range(len(srcWordList) - 1, -1, -1):
                        if srcWordList[i].find(lastword) >= 0:
                            temp = ' '.join(srcWordList[0:i + 1]).lower()
                            for ch in '"!?.,“”’':
                                temp = temp.replace(ch, '')
                            print('trimmed candidate:', temp)
                            print('target:', dstText)
                            sim = jarowinkler.similarity(temp, dstText)
                            print('similarity:', sim)
                            if sim > maxSim['sim']:
                                maxSim['sim'] = sim
                                end = srcText.rfind(lastword, begin,
                                                    maxSim['end'])
                                while srcText[end] != ' ':
                                    end += 1
                                maxSim['end'] = end
                                print('matched span:',
                                      srcText[maxSim['begin']:maxSim['end']])
                                break
                return maxSim

            # Move on and compare starting from the next word.
            while srcText[begin] != ' ':
                begin += 1
            idx = begin
            count += 1
    except IndexError as e:
        print('error:', e)

    return maxSim
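
An illustrative call with a made-up source text (strsimpy assumed); the function scans forward from srcStart and returns the best-matching window:

from strsimpy.jaro_winkler import JaroWinkler

src = ('He said, "Hello world!" and walked away. '
       'The quick brown fox jumps over the lazy dog.')
print(similarityFind(src, 0, "Hello world"))
# -> a dict like {'sim': ..., 'begin': ..., 'end': ...}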