Example #1
    def generate_dataframe_with_scores2(self, data, tokens, question_category):

        data_df = pd.DataFrame(columns=["url", "score", "category"])
        for t in tokens:
            # print("Generating for token:", t)
            for (url, keywords, category) in data:
                text = "".join(keywords).replace("'", "").replace(",", "")
                # print(text)

                # Jaro similarity between the URL basename and the token, scaled to 0-100;
                # entries whose category differs from the question category get a flat +20.
                score = round(lev.jaro(os.path.basename(url), t), 3) * 100
                if category != question_category:
                    score += 20

                data_df = data_df.append(
                    {
                        'url': url,
                        # 'keywords': text,
                        'score': score,
                        'category': category,
                    },
                    ignore_index=True)
        data_df = data_df.sort_values(by=['score'], ascending=False)
        return data_df
Example #2
def calculateMethoddistance1(methodinfo1, methodinfo2):
    distance1_singlemethodlist = []
    distance1_singlemethodlist.append(Levenshtein.jaro(methodinfo1.getMethodname(), methodinfo2.getMethodname()))
    distance1_singlemethodlist.append(Levenshtein.jaro(methodinfo1.getReturntype(), methodinfo2.getReturntype()))
    if(methodinfo1.getTotalparameter() != 0):
        distance1_singlemethodlist.append(abs(methodinfo1.getTotalparameter() - methodinfo2.getTotalparameter())/methodinfo1.getTotalparameter())
        distance1_singlemethodlist.append(calculateparameter(methodinfo1, methodinfo2) / (methodinfo1.getTotalparameter() + methodinfo2.getTotalparameter()))
    elif(methodinfo2.getTotalparameter() == 0):
        distance1_singlemethodlist.append(0)
        distance1_singlemethodlist.append(0)
    else:
        distance1_singlemethodlist.append(1)
        distance1_singlemethodlist.append(1)

    if(methodinfo1.getMethodLOC() != 0):
        distance1_singlemethodlist.append(abs(methodinfo1.getMethodLOC() - methodinfo2.getMethodLOC())/methodinfo1.getMethodLOC())
    elif(methodinfo2.getMethodLOC() == 0):
        distance1_singlemethodlist.append(0)
    else:
        distance1_singlemethodlist.append(1)

    distance1_method = 0.0
    for i in distance1_singlemethodlist:
        distance1_method = distance1_method + i/5
    return distance1_method
Example #3
    def getTreffer(self): 
        #a list of ids is returned
        #print "get Treffer"
        
        daten = self.Datenbank.getDataAsList("select deutsch, fremd from vokabeln where id like "+ str(self.ids))
        
        #print "comparison between "+str(daten[0][1]) +" and "+str(self.wort)
        
        if self.richtung == 1:
            if leve.distance(daten[0][1], self.wort) <= int(self.distanz) and leve.jaro(daten[0][1], self.wort) > round((self.minTreffer/100), 2):
                self.direktTreffer = True
                #print self.ids
                return [self.ids]
        else:
            if leve.distance(self.Vergeleichsfaehigkeit(daten[0][0]), self.Vergeleichsfaehigkeit(self.wort)) <= int(self.distanz) \
                    and leve.jaro(self.Vergeleichsfaehigkeit(daten[0][0]), self.Vergeleichsfaehigkeit(self.wort)) > round((self.minTreffer/100), 2):
                #print "Levenshtein comparison between "+ str(daten[0][0])+ " and "+ str(self.wort)
                self.direktTreffer = True
                #print self.id
                return [self.ids] 

       
        rueckgabe = []
        for i in self.liste:
            #print "Current comparison between "+unicode(i[0]) +" and "+unicode(self.wort)
            if leve.distance(i[0], self.wort) <= int(self.distanz) and leve.jaro(i[0], self.wort) > 0.7:
                rueckgabe.append(i[1])
        return rueckgabe
Example #4
 def prettyprint(self):
     print "Timestamp:     " + self.data["timeseed"]
     print "Expected Data: " + self.data["expected_data"]
     print "PSK31 Data:    " + self.data["psk_data"]
     print "PSK31 Jaro Dist: " + str(Levenshtein.jaro(self.data["expected_data"],self.data["psk_data"]))
     print "DOMEX8 Data:   " + self.data["domex_data"]
     print "DOMEX Jaro Dist: " + str(Levenshtein.jaro(self.data["expected_data"],self.data["domex_data"]))
Example #5
def test_compare_implementations():
    # Compare the implementations of python-Levenshtein to our
    # pure-Python implementations
    if Levenshtein is False:
        raise unittest.SkipTest
    # Test on strings with randomly placed common char
    for string1, string2 in _random_common_char_pairs(n_pairs=50):
        assert (string_distances._jaro_winkler(
            string1, string2,
            winkler=False) == Levenshtein.jaro(string1, string2))
        assert (string_distances._jaro_winkler(
            string1, string2,
            winkler=True) == Levenshtein.jaro_winkler(string1, string2))
        assert (string_distances.levenshtein_ratio(
            string1, string2) == Levenshtein.ratio(string1, string2))
    # Test on random strings
    for string1, string2 in _random_string_pairs(n_pairs=50):
        assert (string_distances._jaro_winkler(
            string1, string2,
            winkler=False) == Levenshtein.jaro(string1, string2))
        assert (string_distances._jaro_winkler(
            string1, string2,
            winkler=True) == Levenshtein.jaro_winkler(string1, string2))
        assert (string_distances.levenshtein_ratio(
            string1, string2) == Levenshtein.ratio(string1, string2))
Example #6
def top_sample(ratio=0.2):
    data_cases = []
    names_cases = []
    data_controls = []
    names_controls = []
    path_cases = run_path + "/cases_encoding_str.txt"
    f = open(path_cases, 'r', encoding="UTF-8")
    for line in f:
        data_cases.append(line.split(":")[-1])
        names_cases.append(line.split(":")[0])
    f.close()
    acc_cases = []
    for d in data_cases:
        sum = 0
        for ds in data_cases:
            sum += Levenshtein.jaro(d, ds)
        result = sum / data_cases.__len__()
        acc_cases.append(result)
    result = dict(zip(names_cases, acc_cases))
    result = sorted(result.items(), key=lambda x: -x[-1])
    number = int(data_cases.__len__() * ratio)
    # low = int(number*(0.5-ratio/2))
    # high = int(number*(0.5+ratio/2))  # used to take the median
    f = open(run_path + "/top_cases.csv", "w", encoding="UTF-8")
    first_line = "name,acc\n"
    f.write(first_line)
    for a in range(number):
        result_tmp = "%s,%.4f\n" % (result[a][0], result[a][1])
        print(result_tmp)
        f.write(result_tmp)
    f.close()
    path_cases = run_path + "/controls_encoding_str.txt"
    f = open(path_cases, 'r', encoding="UTF-8")
    for line in f:
        data_controls.append(line.split(":")[-1])
        names_controls.append(line.split(":")[0])
    f.close()
    acc_controls = []
    for d in data_controls:
        sum = 0
        for ds in data_controls:
            sum += Levenshtein.jaro(d, ds)
        result = sum / data_controls.__len__()
        acc_controls.append(result)
    result = dict(zip(names_controls, acc_controls))
    result = sorted(result.items(), key=lambda x: -x[-1])
    number = int(data_controls.__len__() * ratio)
    # low = int(number * (0.5 - ratio / 2))
    # high = int(number * (0.5 + ratio / 2))  # used to take the median
    f = open(run_path + "/top_controls.csv", "w", encoding="UTF-8")
    first_line = "name,acc\n"
    f.write(first_line)
    for a in range(number):
        result_tmp = "%s,%.4f\n" % (result[a][0], result[a][1])
        print(result_tmp)
        f.write(result_tmp)
    f.close()
    return True
Example #7
 def prettyprint(self):
     print "Timestamp:     " + self.data["timeseed"]
     print "Expected Data: " + self.data["expected_data"]
     print "PSK31 Data:    " + self.data["psk_data"]
     print "PSK31 Jaro Dist: " + str(
         Levenshtein.jaro(self.data["expected_data"],
                          self.data["psk_data"]))
     print "DOMEX8 Data:   " + self.data["domex_data"]
     print "DOMEX Jaro Dist: " + str(
         Levenshtein.jaro(self.data["expected_data"],
                          self.data["domex_data"]))
Example #8
    def __filter_res(self, itunes_res_list, music_tag):

        # print('*******************************************')
        # print("Music Info:\n File: " + music_tag_dict[gl.FILE_PATH] + "\nAlbum: " + music_album + "\n Artist: " + music_album_artist + '\n Track Artist: ' + music_track_artist + '\n')

        # best match
        best_match = {}
        # best-match ratio
        best_ratio = 0.0

        for dic in itunes_res_list:
            ratio_album = 0.0
            ratio_artist = 0.0
            ratio_track_artist = 0.0

            if music_tag[music.ALBUM] != "":
                ratio_album = Levenshtein.jaro(
                    music_tag[music.ALBUM], process_album_info(dic.get(self.ITUNES_COLLECTION_NAME, ""))
                )
                # print("Apple Info: \n Album:" + process_album_info(dic['collectionName']) + "\n")
                # print(type(ratio_album))
            if music_tag[music.ALBUM_ARTIST] != "":
                ratio_artist = Levenshtein.jaro(
                    music_tag[music.ALBUM_ARTIST], process_artist_info(dic.get(self.ITUNES_ARTIST_NAME, ""))
                )
                # print("Artist: " + dic['artistName'] + "\n")
                # print(ratio_artist)
            if music_tag[music.ARTIST] != "":
                ratio_track_artist = Levenshtein.jaro(
                    music_tag[music.ARTIST], process_artist_info(dic.get(self.ITUNES_ARTIST_NAME, ""))
                )
                # print("Artist: " + dic['artistName'] + "\n")
                # print(ratio_track_artist)
            # print('---------------------')
            ratio_artist = ratio_artist if ratio_artist > ratio_track_artist else ratio_track_artist

            cur_ratio = (ratio_album + ratio_artist) / 2
            if cur_ratio > best_ratio:
                best_ratio = cur_ratio
                best_match = dic

        # if the best match ratio is greater than the user-configured threshold, the match is considered successful
        if best_ratio > self._trust_prob:
            # print(best_match['collectionViewUrl'])
            # print(process_album_info(best_match['collectionName']))
            # print(process_artist_info(best_match['artistName']))
            # print("hit")
            # print(best_ratio)
            return best_match
        else:
            # print("not hit")
            return {}
Example #9
def test(clf):
    dvds = []
    with open("dvd.csv") as f:
        for i, j in enumerate(f):
            dvds.append(j)

    movies = []
    with open("movies.csv") as f:
        for i, j in enumerate(f):
            movies.append(j)

    dvds = [dvd for dvd in dvds if dvd > "B"]
    movies = [movie for movie in movies if movie > "B"]
    print(len(dvds), len(movies))

    with open("test.csv", "w") as f:
        i = 0
        for dvd in dvds:
            prefix = dvd[0]
            i += 1
            maxSimil = 0.0
            for movie in movies:
                if movie[0] == prefix:
                    tempSim = lev.jaro(dvd, movie)
                    if tempSim > maxSimil:
                        maxSimil = tempSim
                        maxMovie = movie

            temp = [
                1.0 - (lev.distance(dvd, maxMovie) / len(dvd)),
                lev.jaro(dvd, maxMovie),
                lev.jaro_winkler(dvd, maxMovie),
                lev.ratio(dvd, maxMovie),
            ]
            print("%s\t%s\t%f\t%f" % (dvd.rstrip(), maxMovie.rstrip(), clf.decision_function(temp), clf.predict(temp)))
            f.write(
                "%s\t%s\t%f\t%f\t%f\t%f\t%f\t%i\n"
                % (
                    dvd.rstrip(),
                    maxMovie.rstrip(),
                    1.0 - (lev.distance(dvd, maxMovie) / len(dvd)),
                    lev.jaro(dvd, maxMovie),
                    lev.jaro_winkler(dvd, maxMovie),
                    lev.ratio(dvd, maxMovie),
                    clf.decision_function(temp),
                    clf.predict(temp),
                )
            )
Example #10
def similarity(str1, str2):
    # 1. difflib
    seq = difflib.SequenceMatcher(None, str1, str2)
    ratio = seq.ratio()
    # print('difflib similarity1: ', ratio)
    #a result above 0.6 counts as very similar; the current synonym dictionary is built automatically with this similarity

    # 3. Edit distance: the minimum number of operations (insert, delete, substitute) needed to turn one string into the other
    sim1 = Levenshtein.distance(str1, str2)
    # print('Levenshtein similarity: ', sim1)

    # 4. Levenshtein ratio
    sim2 = Levenshtein.ratio(str1, str2)
    # print('Levenshtein.ratio similarity: ', sim2)

    # 5. Jaro distance
    sim3 = Levenshtein.jaro(str1, str2)
    # print('Levenshtein.jaro similarity: ', sim3)

    # 6. Jaro-Winkler distance
    sim4 = Levenshtein.jaro_winkler(str1, str2)
    # print('Levenshtein.jaro_winkler similarity: ', sim4)

    if ratio > 0.6 or sim1 < 50 or ((sim2 + sim3 + sim4) / 3) > 0.8:
        return True
Example #11
def doCompare(str1, str2):
    print(u"%s - %s similarity calculation" % (str1, str2))

    seq = difflib.SequenceMatcher(None, str1, str2)
    ratio = seq.ratio()

    # similarity, 1 at most
    # print(u"difflib similarity")
    print((u"difflib similarity: %s" % ratio))  # works
    # print(u"difflib similarity: " + str(ratio))  # also works

    # edit distance: smaller is better, but it does not reflect the similarity of long strings well
    sim = Levenshtein.distance(str1, str2)
    print(u"Levenshtein edit distance: %s" % sim)

    # in tests this matches difflib
    levenRatio = Levenshtein.ratio(str1, str2)
    print(u"Levenshtein ratio: %s" % levenRatio)

    # in tests this matches difflib
    sim = Levenshtein.seqratio(str1, str2)
    print(u"Levenshtein seqratio: %s" % sim)

    # larger is better, 1 at most
    jaro = Levenshtein.jaro(str1, str2)
    print(u"Levenshtein jaro distance: %s" % jaro)

    # larger is better, identical strings give 1; strings that are not fully identical often still score 1,
    # e.g. when they differ only in digits and punctuation
    jaroWinkler = Levenshtein.jaro_winkler(str1, str2)
    print(u"Levenshtein jaro_winkler distance: %s" % jaroWinkler)

    print("\n")
Example #12
	def get_closest_email(self, emails, entity): 
		"""
		Compares and scores each email in an array against the entity provided by Amazon Lex. Returns a string containing the three emails that best match the entity. 
		:param emails: an array of emails returned by get_emails()
		:type emails: array
		:param entity: an entity derived from the user's input, which originates from Amazon's Lex service
		:type entity: str
		"""

		# Create an empty DataFrame with column names only
		df = pd.DataFrame(columns=['entity', 'score'])

		# Loop through each email...
		for email in emails:
			# Append a new row: (email, similarity between this email and the entity)
			df = df.append(
					{
						'entity': email,
						'score': round(Levenshtein.jaro(email, entity), 3) * 1000
					},
					ignore_index=True)
		# Sort the dataframe by highest to lowest score
		df = df.sort_values(by=['score'], ascending=False)
		# Concatenate a string with the top 3 scoring emails
		answer = "\nHere are a few possible answers:"
		for entity_match in df['entity'].head(3):
			answer += f"\n{entity_match}"

		# return the answer
		return answer
Example #13
def extract_features(document_tfidf, question_tfidf, answer_tfidf, document,
                     question, answer):
    qa_cos_d = spatial.distance.cosine(question_tfidf, answer_tfidf)
    qd_cos_d = spatial.distance.cosine(question_tfidf, document_tfidf)
    ad_cos_d = spatial.distance.cosine(answer_tfidf, document_tfidf)

    qa_euc_d = np.linalg.norm(question_tfidf - answer_tfidf)
    qd_euc_d = np.linalg.norm(question_tfidf - document_tfidf)
    ad_euc_d = np.linalg.norm(answer_tfidf - document_tfidf)

    qa_lev_d = Levenshtein.distance(question, answer)
    qa_lev_r = Levenshtein.ratio(question, answer)
    qa_jar_s = Levenshtein.jaro(question, answer)
    qa_jaw_s = Levenshtein.jaro_winkler(question, answer)

    qa_tfidf_score = np.sum(question_tfidf * answer_tfidf.T)
    qd_tfidf_score = np.sum(question_tfidf * document_tfidf.T)
    ad_tfidf_score = np.sum(answer_tfidf * document_tfidf.T)

    document_tfidf_sum = np.sum(document_tfidf)
    question_tfidf_sum = np.sum(question_tfidf)
    answer_tfidf_sum = np.sum(answer_tfidf)

    f = [
        qa_cos_d, qd_cos_d, ad_cos_d, qa_euc_d, qd_euc_d, ad_euc_d, qa_lev_d,
        qa_lev_r, qa_jar_s, qa_jaw_s, qa_tfidf_score, qd_tfidf_score,
        ad_tfidf_score, document_tfidf_sum, question_tfidf_sum,
        answer_tfidf_sum
    ]
    return f
Example #14
def matchKeyWords(path, keys, keysWeight, colsWeight, ansNum):
    # read the relevant information from the Excel file
    data = xlrd.open_workbook(path)
    sheet1 = data.sheet_by_name('sheet1')
    rowsNum = sheet1.nrows  # total number of rows
    colsNum = 3

    # initialize value to zero
    value = [0 for x in range(0, rowsNum)]
    # match each keyword against every string in the sheet
    for i in range(0, 5):
        key = keys[i]
        for row in range(1, rowsNum):
            arrRow = sheet1.row_values(row)  # strings in row `row`
            for col in range(0, 3):
                # add the weighted similarity to the row that contains the string
                value[row] += Levenshtein.jaro(key, arrRow[col]) * colsWeight[col] * keysWeight[i]

    # pair each row number with its value and sort by value in descending order
    ans = []
    for i in range(0, rowsNum):
        ans.append((value[i], i))
    ans = sorted(ans, reverse=True)
    for i in range(0, ansNum):
        print(ans[i][1])
Example #15
    def __get_suggest(self, word, rating_limit, count):
        word_len = str(len(word) / 2)
        trigrammed_word = '"{}"/1'.format(trigram(word))

        self.__configure(SphinxConfig.index_sugg, word_len)
        result = self.client_sugg.Query(trigrammed_word, SphinxConfig.index_sugg)

        # If no suggestions were found for this word (does that ever happen?)
        # we return []

        if not result['matches']:
            return []

        maxrank = result['matches'][0]['attrs']['krank']
        maxleven = None

        outlist = list()
        for match in result['matches']:
            if len(outlist) >= count:
                break

            if maxrank - match['attrs']['krank'] < self.default_rating_delta:
                jaro_rating = Levenshtein.jaro(word, match['attrs']['word'])
                if not maxleven:
                    maxleven = jaro_rating - jaro_rating * self.regression_coef
                if jaro_rating >= rating_limit and jaro_rating >= maxleven:
                    outlist.append([match['attrs']['word'], jaro_rating])
                del jaro_rating

        outlist.sort(key=lambda x: x[1], reverse=True)

        return outlist
Example #16
def calculateD(example):
    '''
    Compute several distance measures.
    :param example:
    :param request_template:
    :return: the weighted distance against each template; the weighting here is simple, just an average
    '''
    # sim = {'hamming':0,'distance':0,'Leven':0,..}
    sim_all = []
    # if example in request_template:
    #     return
    for request_M in request_template:
        if example != request_M['request_data']:
            sim = {'hamming': 0, 'distance': 0, 'Leven': 0,'jaro':0,'jaro_winkler':0,'function':request_M['function'],'sum':0}
            sim['distance'] = 1/Levenshtein.distance(example, request_M['request_data'])
            sim['Leven'] = Levenshtein.ratio(example, request_M['request_data'])
            sim['jaro'] = Levenshtein.jaro(example,request_M['request_data'])
            sim['jaro_winkler'] = Levenshtein.jaro_winkler(example,request_M['request_data'])
            try:
                sim['hamming'] = 1/Levenshtein.hamming(example, request_M['request_data'])
            except ValueError:
                sim['hamming'] = 0

            sim['sum'] = (sim['hamming']+sim['distance']+sim['Leven']+sim['jaro']+sim['jaro_winkler'])/5
            sim_all.append(sim)
        else:
            return [{'hamming': 1, 'distance': 1, 'Leven': 1,'jaro':1,'jaro_winkler':1,'function':request_M['function'],'sum':1}]
        # print(sim)
    return sim_all
Example #17
def similarity(str1, str2):
    seq = difflib.SequenceMatcher(None, str1, str2)
    ratio = seq.ratio()
    sim3 = Levenshtein.jaro(str1, str2)
    sim4 = jaro.jaro_metric(str1, str2)
    if ratio > 0.731104540194254 and (sim3 + sim4) / 2 > 0.7890962851907381:
        return True
def find(s):
    getVec = 0
    query = 0
    linearSearch = 0
    t1 = time.time()
    sv = getvec(s)
    t2 = time.time()
    res = lsh.query(sv, num_results = 20)
    t3 = time.time()
    resList = []
    choice = (0, 'none')
    aboveThresh = 0
    for r in res:
        resList.append([nameDict[toStr(r[0])],r[1]])
    t4 = time.time()
    if len(resList) >= 1:
        rlen = len(resList)
        for i in range(rlen):
            candidate = resList[i][0][0]
            resList[i].append(Levenshtein.jaro(candidate,s))
        if len(resList) > 1:
            resList = sorted(resList, reverse = True, key=distSort)
        choice = (resList[0][2], resList[0][0][0])  # choice = (dist, name)
        if choice[0] >= thresh:
            aboveThresh = 1
    t5 = time.time()
    getVec = (t2 - t1)
    query = (t3 - t2)
    linearSearch = (t5 - t4)
    timeList = [getVec, query, linearSearch]
    
    return (aboveThresh, choice, timeList)
def rec_results_parser(tbody, rec_BDMC):
    """
    Returns the closest song name using Levenshtein ratio
    """
    
    tr = tbody.findAll('tr')
    # print tr
    rec_BDMC = unicode(rec_BDMC)
    
    print '\n', rec_BDMC
    
    ratios = []
    for i, entries in enumerate(tr[1:]):
        entry = entries.findAll('td')
        sco = entry[0].text
        rec = entry[1].text
        art = entry[3].text
        rel = entry[4].text
        
        if sco == str(100):
            ratio = l.jaro(rec_BDMC, rec)
            ratios.append(ratio)
            # print sco, '\t', rec, ratio, '\t', art, '\t', rel
    idx = ratios.index(max(ratios))
    return idx
def getMatches(name, inDict = False):
    name = name.lower()
    ti = time.time()
    aboveThresh = 1
    tup = tuple(lsh.hshingle(name, num_shingles))
    sig = c.signer.sign(tup)
    resSet = set()
    choice = (0, "none")
    matchList = []
    for band_inx, hshval in enumerate(c.hasher.hash(sig)):
        for h in c.hashmaps[band_inx][hshval]: 
            resSet.add(h)
    for r in resSet:
        sim = lev.jaro(r, name)
        if sim > .7:
            matchList.append((sim, r))
    dt = time.time() - ti
    if len(matchList) > 0:
        matchList = sorted(matchList, reverse = True, key=simSort)
        choice = tuple(matchList[0])
        if inDict and choice[0] == 1:
            #print "skipping match"
            choice = tuple(matchList[int(inDict)])
        if choice[0] < thresh:
            aboveThresh = 0
    return (aboveThresh, choice, [0,dt,0])
 def check_cons(name1, name2):
     ratio = Levenshtein.ratio(name1, name2)
     jaro = Levenshtein.jaro(name1, name2)
     jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
     if ratio > .6 or jaro > .7 or jaro_winkler > .7:
         return True
     else:
         return False
 def check_sure(name1, name2):
     ratio = Levenshtein.ratio(name1, name2)
     jaro = Levenshtein.jaro(name1, name2)
     jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
     if ratio >= 0.9 and jaro >= 0.95 and jaro_winkler >= 0.95:
         return True
     else:
         return False
def gensamples(
        skips, k, batch_size, short, temperature, use_unk, model, sequence, data, idx2word,
        maxlen, maxlenh, maxlend, oov0, glove_idx2idx, vocab_size, nb_unknown_words):
    """Generate text samples."""
    X_test, Y_test = data  # unpack data
    i = random.randint(0, len(X_test) - 1)
    print('HEAD:', ' '.join(idx2word[w] for w in Y_test[i][:maxlenh]))
    print('DESC:', ' '.join(idx2word[w] for w in X_test[i][:maxlend]))
    sys.stdout.flush()

    print('HEADS:')
    x = X_test[i]
    samples = []
    if maxlend == 0:
        skips = [0]
    else:
        skips = range(min(maxlend, len(x)), max(maxlend, len(x)), abs(maxlend - len(x)) // skips + 1)
    for s in skips:
        start = lpadd(x[:s], maxlend, eos)
        fold_start = vocab_fold(start, oov0, glove_idx2idx, vocab_size, nb_unknown_words)
        sample, score = beamsearch(
            predict=keras_rnn_predict,
            start=fold_start,
            k=k,
            maxsample=maxlen,
            empty=empty,
            eos=eos,
            temperature=temperature,
            use_unk=use_unk,
            nb_unknown_words=nb_unknown_words,
            vocab_size=vocab_size,
            model=model,
            maxlen=maxlen,
            maxlend=maxlend,
            sequence=sequence,
            batch_size=batch_size
        )
        assert all(s[maxlend] == eos for s in sample)
        samples += [(s, start, scr) for s, scr in zip(sample, score)]

    samples.sort(key=lambda x: x[-1])
    codes = []
    for sample, start, score in samples:
        code = ''
        words = []
        sample = vocab_unfold(start, sample, oov0)[len(start):]
        for w in sample:
            if w == eos:
                break
            words.append(idx2word[w])
            code += chr(w // (256 * 256)) + chr((w // 256) % 256) + chr(w % 256)
        if short:
            distance = min([100] + [-Levenshtein.jaro(code, c) for c in codes])
            if distance > -0.6:
                print(score, ' '.join(words))
        else:
            print(score, ' '.join(words))
        codes.append(code)
Example #24
def get_simhash_dis(str1, str2):
    """Compute the simhash-based similarity between two texts."""
    simhash_str1 = simhash.Simhash(str1)
    simhash_str2 = simhash.Simhash(str2)
    dis_simhash = 1 - simhash_str1.distance(simhash_str2) / 64
    dis_ratio = Levenshtein.ratio(str1, str2)
    dis_jaro = Levenshtein.jaro(str1, str2)
    res = (dis_simhash + dis_ratio + dis_jaro) / 3
    return res
Example #25
def gensamples(skips=2,
               k=10,
               batch_size=constants.BATCH_SIZE,
               short=True,
               temperature=1.,
               use_unk=False):
    i = random.randint(0, len(X_test) - 1)
    #print('DESC:',' '.join(index2word[w] for w in Y_test[i][:DESC_SEQ_LEN]))
    #print('CONTENT:',' '.join(index2word[w] for w in X_test[i][:CONTENT_SEQ_LEN]))
    sys.stdout.flush()

    print('DESCRIPTION:')
    x = X_test[i]
    samples = []
    if CONTENT_SEQ_LEN == 0:
        skips = [0]
    else:
        skips = range(min(CONTENT_SEQ_LEN, len(x)),
                      max(CONTENT_SEQ_LEN, len(x)),
                      abs(CONTENT_SEQ_LEN - len(x)) // skips + 1)

    for s in skips:
        start = lpadd(x[:s])
        fold_start = vocab_fold(start)
        print('Length of list of foldstart: ', len(list(fold_start)))
        sample, score = beamsearch(
            predict=keras_rnn_predict,
            start=fold_start,
            k=k,
            temperature=temperature,
            use_unk=use_unk)  #k = 10 , use_unk = False, temperature = 1.
        try:
            assert all(s[CONTENT_SEQ_LEN] == constants.eos for s in sample)
        except:
            print("Assertion error in gensamples---- proceed")
        samples += [(s, start, scr) for s, scr in zip(sample, score)]

    samples.sort(key=lambda x: x[-1])
    codes = []
    for sample, start, score in samples:
        code = ''
        words = []
        sample = vocab_unfold(start, sample)[len(start):]
        for w in sample:
            if w == constants.eos:
                break
            words.append(index2word[w])
            code += chr(w // (256 * 256)) + chr(
                (w // 256) % 256) + chr(w % 256)
        if short:
            distance = min([100] + [-Levenshtein.jaro(code, c) for c in codes])
            if distance > -0.6:
                print(score, ' '.join(words))
        #         print '%s (%.2f) %f'%(' '.join(words), score, distance)
        else:
            print(score, ' '.join(words))
        codes.append(code)
Example #26
def mysimilar():
    import difflib
    import Levenshtein as ls

    str1 = "我的骨骼雪白 也长不出青稞"
    str2 = "雪的日子 我只想到雪中去si"

    # 1. difflib
    seq = difflib.SequenceMatcher(None, str1, str2)
    ratio = seq.ratio()
    print('difflib similarity1: ', ratio)

    # difflib, ignoring characters that should not be compared
    seq = difflib.SequenceMatcher(lambda x: x in ' 我的雪', str1, str2)
    ratio = seq.ratio()
    print('difflib similarity2: ', ratio)

    # 2. Hamming distance: str1 and str2 must be the same length; counts the positions
    #    at which the two equal-length strings differ
    # sim = ls.hamming(str1, str2)
    # print 'hamming similarity: ', sim

    # 3. Edit distance: the minimum number of operations (insert, delete, substitute)
    #    needed to turn one string into the other
    sim = ls.distance(str1, str2)
    print('ls similarity: ', sim)

    # 4. Levenshtein ratio
    sim = ls.ratio(str1, str2)
    print('ls.ratio similarity: ', sim)

    # 5. Jaro distance
    sim = ls.jaro(str1, str2)
    print('ls.jaro similarity: ', sim)

    # 6. Jaro-Winkler distance
    sim = ls.jaro_winkler(str1, str2)
    print('ls.jaro_winkler similarity: ', sim)
Example #27
def test(name, key):
    """
    test if the two names are the same
    :param name: one of the name
    :param key: another name
    :return: bool result
    """
    if Levenshtein.jaro(name, key) == 1:
        return 1
Example #28
 def check_beli(name1, name2):
     ratio = Levenshtein.ratio(name1, name2)
     jaro = Levenshtein.jaro(name1, name2)
     jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
     if ratio >= 0.9 or jaro >= 0.9 or jaro_winkler >= 0.9:
         return True
     elif ratio >= .7 and jaro >= .8 and jaro_winkler >= .8:
         return True
     else:
         return False
Example #29
def test(clf):
    dvds = []
    with open("dvd.csv") as f:
        for i, j in enumerate(f):
            dvds.append(j)

    movies = []
    with open("movies.csv") as f:
        for i, j in enumerate(f):
            movies.append(j)

    dvds = [dvd for dvd in dvds if dvd > "B"]
    movies = [movie for movie in movies if movie > "B"]
    print(len(dvds), len(movies))

    with open("test.csv", "w") as f:
        i = 0
        for dvd in dvds:
            prefix = dvd[0]
            i += 1
            maxSimil = 0.
            for movie in movies:
                if movie[0] == prefix:
                    tempSim = lev.jaro(dvd, movie)
                    if tempSim > maxSimil:
                        maxSimil = tempSim
                        maxMovie = movie

            temp = [
                1. - (lev.distance(dvd, maxMovie) / len(dvd)),
                lev.jaro(dvd, maxMovie),
                lev.jaro_winkler(dvd, maxMovie),
                lev.ratio(dvd, maxMovie),
            ]
            print("%s\t%s\t%f\t%f" %
                  (dvd.rstrip(), maxMovie.rstrip(),
                   clf.decision_function(temp), clf.predict(temp)))
            f.write("%s\t%s\t%f\t%f\t%f\t%f\t%f\t%i\n" %
                    (dvd.rstrip(), maxMovie.rstrip(), 1. -
                     (lev.distance(dvd, maxMovie) / len(dvd)),
                     lev.jaro(dvd, maxMovie), lev.jaro_winkler(
                         dvd, maxMovie), lev.ratio(dvd, maxMovie),
                     clf.decision_function(temp), clf.predict(temp)))
Example #30
 def match_two_list_jaro(listx, listy):
     res = {}
     for animLine in listx:
         animTerm = animLine.split(";")[1]
         for dbpediaLine in listy:
             dbpediaTerm = dbpediaLine.split(";")[1]
             if Levenshtein.jaro(animTerm, dbpediaTerm) > 0.83 and len(
                     animTerm) > 4 and len(dbpediaTerm) > 4:
                 res[animLine.split(";")[0]] = dbpediaLine.split(";")[0]
     return res
Example #31
def autocomplete(string, sl):
    bestMatch = None
    value = 0

    for item in sl.items:
        tmpValue = Levenshtein.jaro(string.upper(), itemStr(item).upper())
        if not value or tmpValue > value:
            value = tmpValue
            bestMatch = item
    return bestMatch, value
Example #32
def gensamples(X=None, X_test=None, Y_test=None, avoid=None, avoid_score=1, skips=2, k=10, batch_size=batch_size, short=True, temperature=1., use_unk=True):
    if X is None or isinstance(X,int):
        if X is None:
            i = random.randint(0,len(X_test)-1)
        else:
            i = X
        print 'HEAD %d:'%i,' '.join(idx2word[w] for w in Y_test[i])
        print 'DESC:',' '.join(idx2word[w] for w in X_test[i])
        sys.stdout.flush()
        x = X_test[i]
    else:
        x = [word2idx[w.rstrip('^')] for w in X.split()]
        
    if avoid:
        # avoid is a list of avoids. Each avoid is a string or a list of word indices
        if isinstance(avoid,str) or isinstance(avoid[0], int):
            avoid = [avoid]
        avoid = [a.split() if isinstance(a,str) else a for a in avoid]
        avoid = [vocab_fold([w if isinstance(w,int) else word2idx[w] for w in a])
                 for a in avoid]

    print 'HEADS:'
    samples = []
    if maxlend == 0:
        skips = [0]
    else:
        skips = range(min(maxlend,len(x)), max(maxlend,len(x)), abs(maxlend - len(x)) // skips + 1)
    for s in skips:
        start = lpadd(x[:s])
        fold_start = vocab_fold(start)
        sample, score = beamsearch(predict=keras_rnn_predict, start=fold_start, avoid=avoid, avoid_score=avoid_score,
                                   k=k, temperature=temperature, use_unk=use_unk)
        assert all(s[maxlend] == eos for s in sample)
        samples += [(s,start,scr) for s,scr in zip(sample,score)]

    samples.sort(key=lambda x: x[-1])
    codes = []
    for sample, start, score in samples:
        code = ''
        words = []
        sample = vocab_unfold(start, sample)[len(start):]
        for w in sample:
            if w == eos:
                break
            words.append(idx2word[w])
            code += chr(w//(256*256)) + chr((w//256)%256) + chr(w%256)
        if short:
            distance = min([100] + [-Levenshtein.jaro(code,c) for c in codes])
            if distance > -0.6:
                print score, ' '.join(words)
        #         print '%s (%.2f) %f'%(' '.join(words), score, distance)
        else:
            print score, ' '.join(words)
        codes.append(code)
    return samples
Example #33
def gensamples(skips=2,
               k=10,
               batch_size=batch_size,
               short=True,
               temperature=1.,
               use_unk=True):
    i = random.randint(0, len(X_test) - 1)
    print 'HEAD:', ' '.join(idx2word[w] for w in Y_test[i][:maxlenh])
    print 'DESC:', ' '.join(idx2word[w] for w in X_test[i][:maxlend])
    sys.stdout.flush()

    print 'HEADS:'
    x = X_test[i]
    samples = []
    if maxlend == 0:
        skips = [0]
    else:
        skips = range(min(maxlend, len(x)), max(maxlend, len(x)),
                      abs(maxlend - len(x)) // skips + 1)
    for s in skips:
        start = lpadd(x[:s])
        fold_start = vocab_fold(start, vocab_size, glove_idx2idx)
        sample, score = beamsearch_t(start=fold_start,
                                     k=k,
                                     temperature=temperature,
                                     use_unk=use_unk,
                                     maxsample=maxlen,
                                     vocab_size=vocab_size,
                                     model=model,
                                     maxlen=maxlen,
                                     maxlend=maxlend,
                                     sequence=sequence)
        assert all(s[maxlend] == eos for s in sample)
        samples += [(s, start, scr) for s, scr in zip(sample, score)]

    samples.sort(key=lambda x: x[-1])
    codes = []
    for sample, start, score in samples:
        code = ''
        words = []
        sample = vocab_unfold(start, sample, oov0)[len(start):]
        for w in sample:
            if w == eos:
                break
            words.append(idx2word[w])
            code += chr(w // (256 * 256)) + chr(
                (w // 256) % 256) + chr(w % 256)
        if short:
            distance = min([100] + [-Levenshtein.jaro(code, c) for c in codes])
            if distance > -0.6:
                print score, ' '.join(words)
        #         print '%s (%.2f) %f'%(' '.join(words), score, distance)
        else:
            print score, ' '.join(words)
        codes.append(code)
Example #34
def mention_estimate(mention, mentions):
    best_href = ''
    if mention != '':
        # print mention
        max = 0.
        for href in mentions[mention]:
            l = Levenshtein.jaro(href[6:], mention)
            if l >= max:
                max = l
                best_href = href
    return best_href
Example #35
def get_simm(str1, str2):
    # 1. difflib
    seq = difflib.SequenceMatcher(None, str1, str2)
    sim1 = seq.ratio()
    # 2. Levenshtein ratio
    sim2 = Levenshtein.ratio(str1, str2)
    # 3. Jaro distance
    sim3 = Levenshtein.jaro(str1, str2)
    # 4. Jaro-Winkler distance
    sim4 = Levenshtein.jaro_winkler(str1, str2)
    return (sim1 + sim2 + sim3 + sim4) / 4
Example #36
def calculate_lev_distance(name_1, name_2, ln_length, jw):

    if jw:
        #print(name_2)
        #print(ln_length)
        if ln_length <= 5 and ln_length > 0:
            return Levenshtein.jaro_winkler(name_1, name_2, .1)
        else:
            return Levenshtein.jaro_winkler(name_1, name_2, .13)

    else:
        return Levenshtein.jaro(name_1, name_2)
Example #37
def processSentence(sentence, i):
    newX = []
    for word in sentence.split(" "):
        if word in words:
            newX.append(words.index(word))
        else:
            jaros = [Levenshtein.jaro(word, w) for w in words]
            highest_index = jaros.index(max(jaros))
            newX.append(highest_index)
    newX = torch.tensor(newX).to(device).long()
    print(str(i / total_samples * 100.0) + "%\r", end="")
    return newX
Example #38
def get_closest_match_leven(text, comparison_list, minimum_match_value):
    closest_match = ''
    closest_match_value=0
    for comparison_text in comparison_list:
        temp_match_value = leven.jaro(text, comparison_text)
        if temp_match_value>closest_match_value:
            closest_match = comparison_text
            closest_match_value = temp_match_value
    if closest_match_value>minimum_match_value:
        return closest_match
    else:
        return '' 
Example #39
def getSortedJaroScoreList(name, refIndexNames):
	scoredIndexedNames = []
	for r in refIndexNames:
		jaroTests = []
		test1 = Levenshtein.jaro(name['WrittenFirst'].lower().replace(' ',''), r['YearbookFirst'].lower().replace(' ',''))
		test2 = Levenshtein.jaro(name['WrittenLast'].lower().replace(' ',''), r['YearbookLast'].lower().replace(' ',''))
		jaroTests.append((test1,test2))

		test1 = Levenshtein.jaro(name['WrittenFirst'].lower().replace(' ',''), r['YearbookLast'].lower().replace(' ',''))
		test2 = Levenshtein.jaro(name['WrittenLast'].lower().replace(' ',''), r['YearbookFirst'].lower().replace(' ',''))
		jaroTests.append((test1,test2))

		test1 = Levenshtein.jaro((name['WrittenFirst'] + name['WrittenLast']).lower().replace(' ',''), (r['YearbookFirst'] + r['YearbookLast']).lower().replace(' ',''))
		test2 = test1
		jaroTests.append((test1,test2))

		test1 = Levenshtein.jaro((name['WrittenFirst'] + name['WrittenLast']).lower().replace(' ',''), (r['YearbookLast'] + r['YearbookFirst']).lower().replace(' ',''))
		test2 = test1
		jaroTests.append((test1,test2))
				
		jaroScore = max(map(lambda t:(t[0] + t[1])/2, jaroTests))

		spellingDict = {'Spelling':str(jaroScore)}
		spellingDict.update(r)
		# print(str(spellingDict) + '\n')
		scoredIndexedNames.append(spellingDict)
		if jaroScore == 1: #if you found an exact match, exit early
			break

	return sorted(scoredIndexedNames, key=lambda k: k['Spelling'], reverse=True)
Example #40
def _find_song(name):
    max_similarity = -0.1
    ind = -1
    threshold = 0.5
    print("match with name %s" % (name))
    for i in song_name_dict.keys():
        sim = leven.jaro(song_name_dict[i], name)
        print("similarity %f" % (sim))
        if sim >= max_similarity:
            ind = i
            max_similarity = sim

    return ind if max_similarity >= threshold else None
Example #41
def get_matches(needle, haystack, ratio=0.6):
    needle = unicode(needle)

    result = {}
    for s in haystack:
        if s != needle:
            assert unicode(s)

            distance = Levenshtein.jaro(needle, unicode(s))
            if distance > ratio:
                result[s] = distance

    return result
Example #42
def getClosest(name):
    aboveThresh = 0
    choice = (0, 'none')    
    ti = time.time()    
    for n in nameList:
        dist = Levenshtein.jaro(n, name)
        if dist > choice[0]:
            choice = (dist, n)
    tf = time.time()
    dt = tf - ti
    if choice[0] >= thresh:
        aboveThresh = 1
    return (aboveThresh, choice, [0,dt,0])
Example #43
def search_vindicat(name):
    results = []
    url = 'https://vcat.pl/gielda-dlugow/oferty/api/?draw=2&columns[0][data]=title&columns[0][name]=&columns[0][searchable]=true&columns[0][orderable]=true&columns[0][search][value]=&columns[0][search][regex]=false&columns[1][data]=firm_name&columns[1][name]=&columns[1][searchable]=true&columns[1][orderable]=true&columns[1][search][value]={}&columns[1][search][regex]=false&columns[2][data]=city&columns[2][name]=&columns[2][searchable]=true&columns[2][orderable]=false&columns[2][search][value]=&columns[2][search][regex]=false&columns[3][data]=claim_type&columns[3][name]=&columns[3][searchable]=true&columns[3][orderable]=false&columns[3][search][value]=&columns[3][search][regex]=false&columns[4][data]=debts_sum&columns[4][name]=&columns[4][searchable]=true&columns[4][orderable]=false&columns[4][search][value]=0%2C0&columns[4][search][regex]=false&columns[5][data]=for_sale&columns[5][name]=&columns[5][searchable]=true&columns[5][orderable]=false&columns[5][search][value]=&columns[5][search][regex]=false&columns[6][data]=site_details&columns[6][name]=&columns[6][searchable]=true&columns[6][orderable]=false&columns[6][search][value]=&columns[6][search][regex]=false&order[0][column]=0&order[0][dir]=desc&start=0&length=10000&search[value]=&search[regex]=false&_=1615275634918'.format(
        name.split(' ')[1])
    data_json = requests.get(url).json()
    for elem in data_json['packages']:
        name_part = name.split(' ')[0]
        surname_part = name.split(' ')[1]
        if Levenshtein.jaro(
                elem['firm_name'],
                name) > 0.8 and elem['firm_name'].find(surname_part) != -1:
            results.append(elem)
    return results
 def valueOf(self, _word):
     if _word in self.aggSum:
         # Stop if word already exists
         return self.aggSum[_word]
     else:
         prev = 0
         for i, word in enumerate(self.aggSum):
             # https://rawgit.com/ztane/python-Levenshtein/master/docs/Levenshtein.html#Levenshtein-jaro
             jaro = Levenshtein.jaro(_word, word)
             if jaro > prev:
                 nArr = self.aggSum[word]
                 prev = jaro
         return nArr
def getClosest(name):
    aboveThresh = 0
    choice = (0, 'none')    
    ti = time.time()    
    for n in surnames.dic.keys():
        dist = lev.jaro(n, name)
        if dist > choice[0]:
            choice = (dist, n)
    tf = time.time()
    dt = tf - ti
    if choice[0] >= thresh:
        aboveThresh = 1
    return (aboveThresh, choice, [0,dt,0])
 def distance_feature(self, df):
     column1, column2 = self.column1, self.column2
     columns = df.columns
     df['distance'] = df[[column1, column2]].apply(
         lambda x: Levenshtein.distance(x[column1], x[column2]), axis=1)
     df['ratio'] = df[[column1, column2]].apply(
         lambda x: Levenshtein.ratio(x[column1], x[column2]), axis=1)
     df['jaro'] = df[[column1, column2]].apply(
         lambda x: Levenshtein.jaro(x[column1], x[column2]), axis=1)
     df['jaro_winkler'] = df[[column1, column2]].apply(
         lambda x: Levenshtein.jaro_winkler(x[column1], x[column2]), axis=1)
     new_columns = list(set(df.columns) - set(columns))
     return df[new_columns]
Example #47
def getEntities(sentence):
	
	
	tagged_sent= nltk.pos_tag(nltk.word_tokenize(sentence))
	tree= nltk.ne_chunk(tagged_sent)
	
	print tagged_sent
	
	print tree
	
	
	""" Entities is a list of lists to facilitate pairing
	 	  with multiple entities in a sentence
	"""
	entities=[]
	for subtree in tree.subtrees():
		if subtree.node == 'PERSON':
			nelist=[]
			for child in subtree:
				name,tag = child
				if tag == 'NNP': nelist.append(name)
			if len(nelist) > 0: entities.append(nelist)
		
		
	actors = [line.split(" ", 1) for line in open('actors_index.txt').readlines()]
		
	
	print entities
		
	matches= {}
	for list in entities:
		for pair in pairs(list):
			first, last= pair
			name= first + ' ' + last
			print name
			ratios = []
			for id, actor in actors:
				s= actor.strip().replace(' ', '').split(',')
				if len(s) == 1:
					actor= s[0]
				else:
					actor = s[1] + ' ' + s[0]
				ratio= lev.jaro(name,actor)
				if ratio >= 0.9:
					ratios.append((ratio,id,actor))
			ratios.sort(key=lambda x: x[0], reverse= True)
			if len(ratios) > 0:
				r, id, actor = ratios[0]
				matches[pair] = {'id':id, 'name': actor, 'class': 'actor'}
	return matches
Example #48
def train(tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,delete = []):
    allTrainX = list()
    allTrainY = list()
    with open("./data/train.csv") as f:
        for line in f:
            lin = line.split(",")
            if len(lin) == 3:
                st1 = lin[0].lower()
                st2 = lin[1].lower()

                temp = [
                        1.-(lev.distance(st1,st2)*2/(len(st1)+len(st2))),
                        lev.jaro(st1,st2),
                        lev.jaro_winkler(st1,st2),
                        lev.ratio(st1,st2),
                        distance.sorensen(st1,st2),
                        jaccard(set(st1),set(st2)),
                        1. - distance.nlevenshtein(st1,st2,method=1),
                        1. - distance.nlevenshtein(st1,st2,method=2),
                        dice_coefficient(st1,st2,lenGram=2),
                        dice_coefficient(st1,st2,lenGram=3),
                        dice_coefficient(st1,st2,lenGram=4),
                        cosineWords(st1,st2,dictTrain,tfidf_matrix_train),
                        cosineBigrams(st1,st2,dictTrainBigrams,tfidf_matrix_trainBigrams,lenGram)
                    ]
                if len(delete) > 0:
                    for elem in delete:
                        temp[elem] = 0.
                allTrainX.append(temp)
                allTrainY.append(int(lin[2]))


    X = np.array(allTrainX,dtype=float)
    y = np.array(allTrainY,dtype=float)
    clf = svm.LinearSVC(C=1.,dual=False,loss='l2', penalty='l1')
    clf2 = linear_model.LogisticRegression(C=1.,dual=False, penalty='l1')
    clf.fit(X, y)
    clf2.fit(X, y)
    weights = np.array(clf.coef_[0])
    print(weights)
    weights = np.array(clf2.coef_[0])
    print(weights)


    return clf,clf2
Example #49
 def get_psk_jaro(self):
     return Levenshtein.jaro(self.data["expected_data"],self.data["psk_data"])
Example #50
#     Jaro Distance
#     Jaro-Winkler Distance
#     Match Rating Approach Comparison
#     Hamming Distance

# Phonetic encoding:
#     American Soundex
#     Metaphone
#     NYSIIS (New York State Identification and Intelligence System)
#     Match Rating Codex
import jellyfish
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))  # 2; edit distance
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))  # 0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))  # 1; edit distance with transpositions
print(jellyfish.metaphone('Jellyfish'))  # 'JLFX'
print(jellyfish.soundex('Jellyfish'))  # 'J412'
print(jellyfish.nysiis('Jellyfish'))  # 'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))  # 'JLLFSH'

##################################################################
## Lenvenshtein
import Levenshtein
print(Levenshtein.hamming('hello', 'helol'))  # 2; Hamming distance; str1 and str2 must be the same length; counts positions where the two equal-length strings differ
print(Levenshtein.distance('hello', 'helol'))  # 2; edit distance (Levenshtein distance); the minimum number of insertions, deletions and substitutions needed to turn one string into the other
print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf'))  # 5
print(Levenshtein.ratio('hello', 'helol'))  # 0.8; Levenshtein ratio; computed as r = (lensum - ldist) / lensum, where lensum is the combined length of str1 and str2 and ldist is a modified edit distance
# note: this modified edit distance is not the plain edit distance above: there every operation costs 1, while here insertions and deletions still cost 1 but a substitution costs 2
# the reason for this design: for ratio('a', 'c'), lensum = 2, and with the plain distance the result would be (2-1)/2 = 0.5 even though 'a' and 'c' have nothing in common; charging 2 for a substitution fixes this
print(Levenshtein.jaro('hello', 'helol'))  # 0.9333333333333332; Jaro distance; originally used for census record linkage
print(Levenshtein.jaro_winkler('hello', 'helol'))  # 0.9533333333333333; Jaro-Winkler distance
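The substitution-costs-2 behaviour of Levenshtein.ratio noted in the comments above can be checked directly. A minimal sketch, assuming the same python-Levenshtein package; the 'a'/'c' pair is the one from the comment:

# Check of the ratio formula above: one substitution gives a plain distance of 1,
# but ratio charges 2 for it, so r = (lensum - 2) / lensum = (2 - 2) / 2 = 0.0 rather than 0.5.
import Levenshtein
print(Levenshtein.distance('a', 'c'))  # 1
print(Levenshtein.ratio('a', 'c'))     # 0.0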
Example #51
def stats(tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,delete = [],plotX=False):
    with open("./data/stats.csv") as infile:
        for i,line in enumerate(infile):
            pass

    dimMatrix = 16
    predict = np.zeros((i+1,dimMatrix))


    clf1,clf2 = train(tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,delete=delete)

    with open("./data/stats.csv") as infile:
        for i,line in enumerate(infile):
            a = line.rstrip().split("\t")

            ## create same vector with more distances
            st1 = a[0].lower()
            st2 = a[1].lower()

            temp = [
            1.-(lev.distance(st1,st2)*2/(len(st1)+len(st2))),
            lev.jaro(st1,st2),
            lev.jaro_winkler(st1,st2),
            lev.ratio(st1,st2),
            distance.sorensen(st1,st2),
            jaccard(set(st1),set(st2)),
            1. - distance.nlevenshtein(st1,st2,method=1),
            1. - distance.nlevenshtein(st1,st2,method=2),
            dice_coefficient(st1,st2,lenGram=2),
            dice_coefficient(st1,st2,lenGram=3),
            dice_coefficient(st1,st2,lenGram=4),
            cosineWords(st1,st2),
            cosineBigrams(st1,st2)]

            if len(delete) > 0:
                for elem in delete:
                    temp[elem] = 0.

            predict[i,:-3] = temp
            predict[i,-3] = clf1.decision_function(np.array(temp,dtype=float))
            predict[i,-2] = clf2.decision_function(np.array(temp,dtype=float))
            predict[i,-1] = a[-1]


    if plotX:
        labelsM = ["Lev","Jaro","Jaro-Winkler","Ratio","Sorensen","Jaccard","Lev1","Lev2","Dice_2","Dice_3","Dice_4","cosineWords","cosineBigrams","SVM","Logit"]
        f1matrix = np.zeros((100,dimMatrix-1))

        fig = plt.figure()
        fig.set_size_inches(9,6)
        ax = fig.add_subplot(111)
        iC = -1
        for i in np.linspace(0,1,100):
            iC += 1
            for j in range(dimMatrix-1):
                t = np.array(predict[:,j])
                if j >= dimMatrix-3:
                    t = (t - np.min(t))/(np.max(t)-np.min(t))
                f1matrix[iC,j] = f1_score(y_pred=t>i ,y_true=predict[:,-1])
        F1scores = []
        for j in range(dimMatrix-1):
            F1scores.append(np.max(f1matrix[:,j]))
            #ax.plot(np.linspace(0,1,100),f1matrix[:,j],label=labelsM[j],color=tableau20[j])
        ax.bar(range(dimMatrix-1),F1scores)
        plt.xticks(np.arange(dimMatrix-1)+0.5,["Lev","Jaro","Jaro-Winkler","Ratio","Sorensen","Jaccard","Lev1","Lev2","Dice_2","Dice_3","Dice_4","cosineWords","cosineBigrams","SVM","Logit"],rotation=45)
        ax.set_ylabel("F1 score")
        ax.set_xlabel("Parameter")
        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("f1_bar.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)

        AUCScores = []
        for j in range(dimMatrix-1):
            # Compute ROC curve and area the curve
            fpr, tpr, thresholds = roc_curve(predict[:,-1], predict[:,j])
            AUCScores.append(auc(fpr, tpr))


            # Plot ROC curve
            ax.plot(fpr, tpr, label=labelsM[j],color=tableau20[j])
            ax.plot([0, 1], [0, 1], 'k--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.0])
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('ROC Curve')

        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("roc.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        ax.bar(range(dimMatrix-1),AUCScores)
        ax.set_ylabel('Area Under Curve')
        plt.xticks(np.arange(dimMatrix-1)+0.5,["Lev","Jaro","Jaro-Winkler","Ratio","Sorensen","Jaccard","Lev1","Lev2","Dice_2","Dice_3","Dice_4","cosineWords","cosineBigrams","SVM","Logit"],rotation=45)
        customaxis(ax)
        plt.savefig("roc_bar.pdf")
        plt.show()
Example #52
    print "Sequences:", asmLCS.seq.sequences  # asmLCS.seq is the LCSequence object
    print "Substrings:", asmLCS.substr.substrings
    lenSeqOne = (float)(len(asmLCS.seq.seqOne))
    lenSeqOneBuiltin = (float)(asmLCS.seq.matrix.seqOneLen)
    lenSeqTwo = (float)(len(asmLCS.seq.seqTwo))
    lenSeqTwoBuiltin = (float)(asmLCS.seq.matrix.seqTwoLen)
    lenLCSeq = (float)(len(asmLCS.seq))
    lenLCSub = (float)(len(asmLCS.substr))
    perSim = ((lenLCSeq / lenSeqOne) + (lenLCSeq / lenSeqTwo)) / 2
    perExact = ((lenLCSub / lenSeqOne) + (lenLCSub / lenSeqTwo)) / 2
    print "Length of SeqOne:", lenSeqOne
    print "Length of SeqOne (builtin):", lenSeqOneBuiltin
    print "Length of SeqTwo:", lenSeqTwo
    print "Length of SeqTwo (builtin):", lenSeqTwoBuiltin
    print "Length of LCSeq:", lenLCSeq
    print "Length of LCSub:", lenLCSub
    print "Substring in SeqOne starts at position:", asmLCS.seq.seqOne.find(list(asmLCS.substr.substrings)[0])
    print "Substring in SeqTwo starts at position:", asmLCS.seq.seqTwo.find(list(asmLCS.substr.substrings)[0])
    print "Percent Similar:", perSim
    print "Percent Exact Copy:", perExact
    print "Levenshtein Distance:", ldistance.distance(asmLCS.seq.seqOne, asmLCS.seq.seqTwo)
    print "Jaro Similarity:", ldistance.jaro(asmLCS.seq.seqOne, asmLCS.seq.seqTwo)
    print "Jaro-Winkler:", ldistance.jaro_winkler(asmLCS.seq.seqOne, asmLCS.seq.seqTwo)
    print "Similarity ratio:", ldistance.ratio(asmLCS.seq.seqOne, asmLCS.seq.seqTwo)
    print "\nSeconds to process and calculate:", time.time() - start_time

    # Levenshtein distance - character operations (add, remove, swap) needed to transform one string into the other.
    # Jaro Similarity - similarity of short strings; 0 if completely different, 1 if identical
    # Jaro-Winkler - prefix-weighted version of Jaro, because typos and divergence happen near the end of seqs
    # Similarity Ratio - the real minimal edit distance, aka diff sequence matching
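The prefix weighting mentioned in the comments above is easy to see on the textbook MARTHA/MARHTA pair. A minimal sketch, assuming the python-Levenshtein package used elsewhere in these examples:

# Jaro vs. Jaro-Winkler on two strings that share the prefix 'mar' and differ by one transposition.
# Jaro-Winkler boosts the Jaro score by p * l * (1 - jaro) for a shared prefix of length l (default p = 0.1).
import Levenshtein
print(Levenshtein.jaro('martha', 'marhta'))          # ~0.944
print(Levenshtein.jaro_winkler('martha', 'marhta'))  # ~0.961, higher because of the shared prefix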
Example #53
 def get_domex_jaro(self):
     return Levenshtein.jaro(self.data["expected_data"],self.data["domex_data"])
Example #54
def jaroDistance(form1, form2):
    return Levenshtein.jaro(form1, form2) if (len(form1) * len(form2) > 0) else 0.0
Example #55
import time
import os
import json
import difflib
import Levenshtein

start = time.time()

with open('/Users/fan/anaconda/bin/Workspace/data/openriceName.json', 'r') as f:
    openrice = json.load(f, encoding='utf8')

with open('/Users/fan/anaconda/bin/Workspace/data/ifoodName.json', 'r') as f:
    ifood = json.load(f, encoding='utf8')

for o in ifood:
    ifname =ifood[o].split('|')[0]
    ifaddress =ifood[o].split('|')[1]
    temp = {}
    for o in openrice:
        opname = openrice[o].split('|')[0]
        opaddress = openrice[o].split('|')[1]
        jw = Levenshtein.jaro_winkler(ifname, opname, 0.25)
        if jw not in temp:
            temp[jw] = '%s|%s' % (opname, opaddress)
        else:
            addressjw0 = Levenshtein.jaro(ifaddress, temp[jw].split('|')[1])
            addressjw1 = Levenshtein.jaro(ifaddress, opaddress)
            if addressjw1 > addressjw0:
                temp[jw] = '%s|%s' % (opname, opaddress)

    print '%s|%s' % (ifname, temp[max(temp.keys())].split('|')[0])
    print '%s|%s' % (ifaddress, temp[max(temp.keys())].split('|')[1])
def main():
    ifName   ='梁記麻辣火鍋冰棒豆腐'
    
    orName   ='桔園'
    orName2  ='火鍋冰棒豆腐'
    orName3  ='梁記'
    orName4  ='梁記麻辣火鍋'
    orName5  ='梁記石頭火鍋'
    orName6  ='梁記火鍋'
    
    
    print  'jaro'
    print  orName,':',Levenshtein.jaro(ifName, orName)
    print  orName2,':',Levenshtein.jaro(ifName, orName2)
    print  orName3,':',Levenshtein.jaro(ifName, orName3)
    print  orName4,':',Levenshtein.jaro(ifName, orName4)
    print  orName5,':',Levenshtein.jaro(ifName, orName5)
    print  orName6,':',Levenshtein.jaro(ifName, orName6)
    
    
    print  '---------------------------'
    print  'jaro_winkler'
    print  orName,':',Levenshtein.jaro_winkler(ifName, orName, 0.25)
    print  orName2,':',Levenshtein.jaro_winkler(ifName, orName2, 0.25)
    print  orName3,':',Levenshtein.jaro_winkler(ifName, orName3, 0.25)
    print  orName4,':',Levenshtein.jaro_winkler(ifName, orName4, 0.25)
    print  orName5,':',Levenshtein.jaro_winkler(ifName, orName5, 0.25)
    print  orName6,':',Levenshtein.jaro_winkler(ifName, orName6, 0.25)
    print  '---------------------------'
    print  'distance'
    print  orName,':',Levenshtein.distance(ifName, orName)
    print  orName2,':',Levenshtein.distance(ifName, orName2)
    print  orName3,':',Levenshtein.distance(ifName, orName3)
    print  orName4,':',Levenshtein.distance(ifName, orName4)
    print  orName5,':',Levenshtein.distance(ifName, orName5)
    print  orName6,':',Levenshtein.distance(ifName, orName6)
    print  '---------------------------'
    print  'ratio'
    print  orName,':',Levenshtein.ratio(ifName, orName)
    print  orName2,':',Levenshtein.ratio(ifName, orName2)
    print  orName3,':',Levenshtein.ratio(ifName, orName3)
    print  orName4,':',Levenshtein.ratio(ifName, orName4)
    print  orName5,':',Levenshtein.ratio(ifName, orName5)
    print  orName6,':',Levenshtein.ratio(ifName, orName6)
    print  '---------------------------'
    print  'fuzzywuzzyRatio'
    print  orName,':',fuzz.ratio(ifName, orName)
    print  orName2,':',fuzz.ratio(ifName, orName2)
    print  orName3,':',fuzz.ratio(ifName, orName3)
    print  orName4,':',fuzz.ratio(ifName, orName4)
    print  orName5,':',fuzz.ratio(ifName, orName5)
    print  orName6,':',fuzz.ratio(ifName, orName6)
    print  '---------------------------'
    print  'fuzzywuzzyPartial_ratio'
    print  orName,':',fuzz.partial_ratio(ifName, orName)
    print  orName2,':',fuzz.partial_ratio(ifName, orName2)
    print  orName3,':',fuzz.partial_ratio(ifName, orName3)
    print  orName4,':',fuzz.partial_ratio(ifName, orName4)
    print  orName5,':',fuzz.partial_ratio(ifName, orName5)
    print  orName6,':',fuzz.partial_ratio(ifName, orName6)
    print  '---------------------------'
    print  'fuzzywuzzyToken_sort_ratio'
    print  orName,':',fuzz.token_sort_ratio(ifName, orName)
    print  orName2,':',fuzz.token_sort_ratio(ifName, orName2)
    print  orName3,':',fuzz.token_sort_ratio(ifName, orName3)
    print  orName4,':',fuzz.token_sort_ratio(ifName, orName4)
    print  orName5,':',fuzz.token_sort_ratio(ifName, orName5)
    print  orName6,':',fuzz.token_sort_ratio(ifName, orName6)
    print  '---------------------------'
    print  'fuzzywuzzyToken_set_ratio'
    print  orName,':',fuzz.token_set_ratio(ifName, orName)
    print  orName2,':',fuzz.token_set_ratio(ifName, orName2)
    print  orName3,':',fuzz.token_set_ratio(ifName, orName3)
    print  orName4,':',fuzz.token_set_ratio(ifName, orName4)
    print  orName5,':',fuzz.token_set_ratio(ifName, orName5)
    print  orName6,':',fuzz.token_set_ratio(ifName, orName6)
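For reference, the name/address matching heuristic in the loop above can be restated as a standalone helper: rank candidates by Jaro-Winkler on the name (prefix weight 0.25, as in the original) and break exact ties by Jaro similarity of the addresses. best_match is a hypothetical name and this is a sketch, not part of the original script:

def best_match(query_name, query_address, candidates):
    # candidates: iterable of (name, address) pairs
    best_key, best_pair = None, None
    for name, address in candidates:
        name_score = Levenshtein.jaro_winkler(query_name, name, 0.25)
        addr_score = Levenshtein.jaro(query_address, address)
        key = (name_score, addr_score)  # tuple compare: name score first, address as tiebreak
        if best_key is None or key > best_key:
            best_key, best_pair = key, (name, address)
    return best_pair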
Example #57
0
 def distances(st1, st2):
     return lev.jaro(st1, st2)
Example #58
0
def build_spanish(args, prefixes_gen=None):
  corpus_files = os.listdir(args.es_corpus)
  sentences = []
  wordcount = 0
  for filename in corpus_files:
    print 'Processing %s' % filename
    with codecs.open(os.path.join(args.es_corpus, filename), encoding='utf-8') as f:
      sent = []
      for line_no, line in enumerate(f):
        if line.strip() == '' or line.strip()[0] == '<' :
          sentences.append(sent)
          sent = []
        else:
          try:
            word, lemm, morph, num = line.strip().split(' ')
            if num != '0':  # '0' marks punctuation and other non-word tokens
              sent.append( (word, lemm, morph) )
              wordcount += 1
          except Exception:
            print line_no
            print line
  print 'Loaded sentences: %d, wordcount = %d' % (len(sentences), wordcount)

  word_to_sents = defaultdict(set)
  words = []
  sent_sets = []

  for sent_no, s in enumerate(sentences):
    sent_set = set()
    for i, w in enumerate(s):
      words.append( (w[0], w[1], sent_no, len(words)) )
      sent_set.add(w[0].lower())
    for w in sent_set:
      word_to_sents[w].add(len(sent_sets))
    sent_sets.append(sent_set)


  goodwords = []
  word_used = defaultdict(int)
  for w in words:
    key = (w[0], w[1])
    if word_used[key] < 10:
      word_used[key] += 1
      goodwords.append(w)

  print 'Total goodwords: %d' % len(goodwords)

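  # Bucket goodwords by their two-character prefix, then repeatedly split any bucket larger than PREFIX_THRESH by extending its prefix one character.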
  prefix_map = defaultdict(list)
  for w in goodwords:
    prefix_map[w[0][:2].lower()].append(w)
  print 'Splitting prefix map'
  while True:
    splitted = False
    for k in prefix_map.keys():
      if len(prefix_map[k]) > PREFIX_THRESH:
        lst = prefix_map.pop(k)
        print 'splitting %d items for key %s' % (len(lst), k)
        splitted = True
        for w in lst:
          prefix_map[w[0][:(len(k)+1)].lower()].append(w)
        break
    if not splitted:
      break
  prefix_map_keys = prefix_map.keys()
  print 'Spanish prefix map split into %d groups' % len(prefix_map)
  print 'Spanish prefix map max pairs: %d' % sum([len(v) ** 2 for k, v in prefix_map.iteritems()])


  print 'Loading word2vec data...'
  vectors = wv_common.load_text_vectors(args.es_text_vectors)

  articles = []  # wikipedia articles
  word_to_articles = defaultdict(set)
  word_to_id = {}
  id_to_word = {}
  word_to_freq = defaultdict(int)

  print 'Processing wiki data...'
  with codecs.open(args.spanish_wikidata, encoding='utf-8') as f:
    for line_no, line in enumerate(f):
      elems = line.strip().split()
      if elems[0] == 'WRD':
        article_words_ids = set()
        for word in elems[1:]:
          wl = word.lower()
          word_to_freq[wl] += 1
          if wl not in word_to_id:
            idx = len(word_to_id)
            word_to_id[wl] = idx
            id_to_word[idx] = wl
          idx = word_to_id[wl]
          article_words_ids.add(idx)
          word_to_articles[wl].add(len(articles))
        articles.append(article_words_ids)
      if line_no % 10000 == 0:
        print 'Processed %d lines of wikidata' % line_no
      if line_no > 100000:
        break

  print 'Total wiki vocab size: %d' % (len(word_to_freq))
  total_wiki_words = float(sum(word_to_freq.values()))

  alphabet = set()
  for w in word_to_freq.iterkeys():
    alphabet |= set(w)
  print 'Alphabet size: %d' % len(alphabet)

  trie_fwd = FreqTrie(alphabet)
  trie_inv = FreqTrie(alphabet)

  print 'Building tries'
  alphabet = set(ES_ALPHABET)
  trie_skipped = 0
  for wf_no, wf in enumerate(word_to_freq.iteritems()):
    if len(set(wf[0]) & alphabet) > 0:
      trie_fwd.add(wf[0], wf[1])
      trie_inv.add(wf[0][::-1], wf[1])
    else:
      trie_skipped += 1
    if wf_no % 10000 == 0:
      print 'Added to trie: %d words, nodecounts: %d %d skipped: %d' % (wf_no, trie_fwd.nodecount(), trie_inv.nodecount(), trie_skipped)

  print 'Building freqs'
  # 2,3,4-suffixes
  suffix_freqs = defaultdict(int)
  for w in goodwords:
    wl = w[0].lower()
    suffix_freqs[wl[-2:]] += 1
    suffix_freqs[wl[-3:]] += 1
    suffix_freqs[wl[-4:]] += 1

  word_freqs = defaultdict(int)
  for w in goodwords:
    word_freqs[w[0].lower()] += 1

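  # Monte-Carlo estimate of how often two randomly sampled goodwords share a lemma (a baseline rate for positive pairs).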
  prc = 0
  tst = 1000000
  for i in xrange(tst):
    w1, w2 = random.sample(goodwords, 2)
    if w1[1]==w2[1]:
      prc += 1
  print 'prc:', prc, tst

  lemm_map = defaultdict(dict)
  for w in goodwords:
    lemm_map[w[1]][w[0]] = set()
  for i, w in enumerate(goodwords):
    lemm_map[w[1]][w[0]].add(i)  # lemm -> forms -> positions


  positive_examples = gen_positive_examples(lemm_map)
  print 'Generated %d positive examples' % len(positive_examples)

  print 'Generating features...'
  features = []
  if prefixes_gen == None:
    for i in xrange(args.gen_pos + args.gen_neg):
      if i != 0 and (i % 10000) == 0 :
        print '%d...' % i
      answer = True if i < args.gen_pos else False
      w1 = -1
      w2 = -1
      feature = None
      if answer:
        w1, w2 = positive_examples[i]
        feature = {'answer': answer, 'id1': goodwords[w1][3], 'id2': goodwords[w2][3]}
      else:
        prefix_group = None
        while True:
          prefix = random.choice(prefix_map_keys)
          prefix_group = prefix_map[prefix]
          w1 = random.randint(0, len(prefix_group)-1)
          w2 = random.randint(0, len(prefix_group)-1)
          t1 = words[prefix_group[w1][3]][0].lower()
          t2 = words[prefix_group[w2][3]][0].lower()
          if w1 == w2 or answer != (prefix_group[w1][1].lower() == prefix_group[w2][1].lower()) or t1 == t2:
            continue
          break
        feature = {'answer': answer, 'id1': prefix_group[w1][3], 'id2': prefix_group[w2][3]}
      features.append(feature)
  else:
    pos_ex = set([(min(wid1, wid2), max(wid1, wid2)) for wid1, wid2 in positive_examples])
    def fgen():
      for prefix_no, prefix in enumerate(sorted(prefix_map_keys)):
        print 'Processing prefix: %s (%d of %d) with %d words' % (prefix, prefix_no, len(prefix_map_keys), len(prefix_map[prefix]))
        prefix_group = []
        wcnt = defaultdict(int)
        for line in prefix_map[prefix]:
          if wcnt[line[0].lower()] < 1:
            wcnt[line[0].lower()] += 1
            prefix_group.append(line)
        print 'Lines in prefix group: %d' % (len(prefix_group))

        for i in xrange(len(prefix_group)):
          for j in xrange(i+1, len(prefix_group)):
            wid1 = prefix_group[i][3]
            wid2 = prefix_group[j][3]
            p_key = (min(wid1, wid2), max(wid1, wid2))
            feature = {'answer': p_key in pos_ex, 'id1': wid1, 'id2': wid2}
            yield feature
    features = fgen()


  print 'Filling features...'
  not_found_vectors_count = 0
  zero_mutual_info_corpus_count = 0
  zero_mutual_info_wiki_count = 0
  for feature_no, f in enumerate(features):
    i1 = f['id1'];  w1 = words[i1][0];  wl1 = w1.lower()
    i2 = f['id2'];  w2 = words[i2][0];  wl2 = w2.lower()
    f['tag1'] = '0'
    f['tag2'] = '0'
    f['w1'] = w1
    f['w2'] = w2
    # TODO: add tags: "max common length", "left-pos-tag" and "right-pos-tag" for both of them
    f['common_prefix_len'] = common_prefix_len(w1.lower(), w2.lower())
    f['common_prefix_len_rel'] = f['common_prefix_len'] * 2.0 / (len(w1) + len(w2))
    f['levenshtein'] = levenshtein.distance(wl1, wl2)
    f['jaro_winkler'] = levenshtein.jaro_winkler(wl1, wl2)
    f['jaro'] = levenshtein.jaro(wl1, wl2)

    common_prefix = wl1[:f['common_prefix_len']]
    suffix1 = wl1[f['common_prefix_len']:]
    suffix2 = wl2[f['common_prefix_len']:]

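    # Frequency features from the tries: how common the shared prefix is (forward trie) and how common each word's remaining suffix is (reversed trie), each normalized by the trie's total count.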
    f['freq_common_prefix'] = trie_fwd.get(common_prefix) * 1.0 / trie_fwd.getsum()
    f['freq_suffix1'] = trie_inv.get(suffix1) * 1.0 / trie_inv.getsum()
    f['freq_suffix2'] = trie_inv.get(suffix2) * 1.0 / trie_inv.getsum()

    f['freq1'] = word_freqs[w1.lower()]
    f['freq2'] = word_freqs[w2.lower()]

    f['suf2freq1'] = suffix_freqs[wl1[-2:]]
    f['suf2freq2'] = suffix_freqs[wl2[-2:]]
    f['suf3freq1'] = suffix_freqs[wl1[-3:]]
    f['suf3freq2'] = suffix_freqs[wl2[-3:]]
    f['suf4freq1'] = suffix_freqs[wl1[-4:]]
    f['suf4freq2'] = suffix_freqs[wl2[-4:]]

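    # Word-vector feature: despite the name, 'wv_dist' is a dot product of the two word2vec vectors (a similarity), with 1.0 as the fallback when either vector is missing.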
    if wl1 in vectors and wl2 in vectors:
      f['wv_dist']   = vectors[wl1]['vec'].dot(vectors[wl2]['vec'])
    else:
      not_found_vectors_count += 1
      f['wv_dist'] = 1.0

    # calculating mutual information
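    # This is log p(w1) + log p(w2) - log p(w1, w2) over sentence co-occurrence, i.e. the negative of the usual pointwise mutual information.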
    w1fsc  = len(word_to_sents[wl1])
    w2fsc  = len(word_to_sents[wl2])
    w12fsc = len(word_to_sents[wl1] & word_to_sents[wl2])
    if w12fsc > 0:
      w1fsc /= 1.0 * len(sent_sets)
      w2fsc /= 1.0 * len(sent_sets)
      w12fsc /= 1.0 * len(sent_sets)
      f['mut_info_corpus'] = math.log(w1fsc) + math.log(w2fsc) - math.log(w12fsc)
    else:
      f['mut_info_corpus'] = 0.0
      zero_mutual_info_corpus_count += 1

    w1fsw  = len(word_to_articles[wl1])
    w2fsw  = len(word_to_articles[wl2])
    w12fsw = len(word_to_articles[wl1] & word_to_articles[wl2])
    if w12fsw > 0:
      w1fsw /= 1.0 * len(articles)
      w2fsw /= 1.0 * len(articles)
      w12fsw /= 1.0 * len(articles)
      f['mut_info_wiki'] = math.log(w1fsw) + math.log(w2fsw) - math.log(w12fsw)
    else:
      f['mut_info_wiki'] = 0.0
      zero_mutual_info_wiki_count += 1

    if feature_no % 1000 == 0:
      print 'Samples processed: %d' % (feature_no)

    if prefixes_gen != None:
      yield f

  if prefixes_gen == None:

    print 'Not found word vectors for:    %d pairs of %d' % (not_found_vectors_count, len(features))
    print 'Zeroed mutual corpus info for: %d pairs of %d' % (zero_mutual_info_corpus_count, len(features))
    print 'Zeroed mutual wiki info for:   %d pairs of %d' % (zero_mutual_info_wiki_count, len(features))

    print 'Saving features...'
    save_features(args.es_features_output, features)
Example #59
0
 def domex_valid(self):
     return Levenshtein.jaro(self.data["expected_data"],self.data["domex_data"]) > self.threshold