Example #1
def test(clf):
    dvds = []
    with open("dvd.csv") as f:
        for i, j in enumerate(f):
            dvds.append(j)

    movies = []
    with open("movies.csv") as f:
        for i, j in enumerate(f):
            movies.append(j)

    dvds = [dvd for dvd in dvds if dvd > "B"]
    movies = [movie for movie in movies if movie > "B"]
    print(len(dvds), len(movies))

    with open("test.csv", "w") as f:
        i = 0
        for dvd in dvds:
            prefix = dvd[0]
            i += 1
            maxSimil = 0.0
            maxMovie = ""  # avoid a NameError when no movie shares the first letter
            for movie in movies:
                if movie[0] == prefix:
                    tempSim = lev.jaro(dvd, movie)
                    if tempSim > maxSimil:
                        maxSimil = tempSim
                        maxMovie = movie

            temp = [
                1.0 - (lev.distance(dvd, maxMovie) / len(dvd)),
                lev.jaro(dvd, maxMovie),
                lev.jaro_winkler(dvd, maxMovie),
                lev.ratio(dvd, maxMovie),
            ]
            print("%s\t%s\t%f\t%f" % (dvd.rstrip(), maxMovie.rstrip(), clf.decision_function([temp])[0], clf.predict([temp])[0]))
            f.write(
                "%s\t%s\t%f\t%f\t%f\t%f\t%f\t%i\n"
                % (
                    dvd.rstrip(),
                    maxMovie.rstrip(),
                    1.0 - (lev.distance(dvd, maxMovie) / len(dvd)),
                    lev.jaro(dvd, maxMovie),
                    lev.jaro_winkler(dvd, maxMovie),
                    lev.ratio(dvd, maxMovie),
                    clf.decision_function([temp])[0],
                    clf.predict([temp])[0],
                )
            )
def are_similar(name1, name2):
    name1, name2 = (mangle_name(s) for s in (name1, name2))
    ratio = Levenshtein.jaro_winkler(name1, name2)
    # TODO: remove this debug print
    if ratio < 0.8:
        print(" * ratio = %s => name1 = '%s' vs name2 = '%s'" % (ratio, name1, name2))
    return ratio >= 0.8
Example #3
def predictionRatio(df, metric="Levenshtein"):
    #Generate all possible combinations for string matching
    soc_media_1, soc_media_2 = df.columns
    # Convert everything to lower case
    df[soc_media_1] = df[soc_media_1].str.lower()
    df[soc_media_2] = df[soc_media_2].str.lower()

    df_known = DataFrame([df[soc_media_1].tolist()] * df.shape[0], index=df.index, columns=df.index)
    df_search = DataFrame([df[soc_media_2].tolist()] * df.shape[0], index=df.index, columns=df.index)
    df_known_list = df_known.applymap(lambda x: list([x]))
    df_search_list = df_search.applymap(lambda x: list([x]))
    df_search_list = df_known_list+df_search_list.T

    # Find the indices of columns for each row  based on metric
    # For Levenshtein get the min., for JaroWinkler get the max.
    if metric == 'Levenshtein':
        search_res = df_search_list.applymap(lambda x: Levenshtein.distance(x[0], x[1]))
        indices = search_res.idxmin(axis=1)
    else:
        search_res = df_search_list.applymap(lambda x: Levenshtein.jaro_winkler(x[0], x[1]))
        indices = search_res.idxmax(axis=1)
    
    # Get the matches for social media account
    match = df[soc_media_2].loc[indices]
    df_t = DataFrame()
    df_t['actual'] = df[soc_media_2].reset_index(drop=True)
    df_t['match'] = match.reset_index(drop=True)
    # Find the ratio of correct matches
    match_count = (df_t.actual == df_t.match).value_counts()
    ratio = float(match_count[True]) / (match_count[True] + match_count[False])
    return ratio
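# A minimal sketch (not part of the example above) of the idxmin/idxmax selection
# used in predictionRatio: for a distance metric the best candidate per row is the
# column with the smallest value, for a similarity metric the largest. The account
# names below are made up purely for illustration.
import pandas as pd
import Levenshtein

known = ["alice_smith", "bob_jones"]
candidates = ["bsmith_alice", "jones.bob", "carol_w"]

# Pairwise Jaro-Winkler similarity matrix: rows are known accounts, columns are candidates.
sim = pd.DataFrame(
    [[Levenshtein.jaro_winkler(k, c) for c in candidates] for k in known],
    index=known, columns=candidates,
)

print(sim.idxmax(axis=1))  # similarity: take the column with the max value per row
# For a distance such as Levenshtein.distance, idxmin(axis=1) would be used instead.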
Example #4
def response(db, user, inStr):
    inStr = common.que_init(inStr)
    ans = ''
    colls = db.collection_names()
    random.shuffle(colls)
    for coll in colls:
        if coll[-4:] != '_yml':
            continue
        reqs = db[coll].find_one({'tag': 'dia'})
        if not reqs:
            continue
        qas = reqs['qas']
        if not qas:
            continue
        random.shuffle(qas)
        for qa in qas:
            ques = qa['que']
            random.shuffle(ques)
            for que in ques:
                que = str(que)
                que = common.que_init(que)
                if Leven.jaro_winkler(inStr, que) > JARO_WINKLER_PERCENT:
                    ans = qa['ans']
                    if type(ans) is list:
                        ans = random.choice(ans)
                    return ans
    return ans
Example #5
 def __get_match(self, query, words):
     _match = []
     for word in words:
         distance = Levenshtein.jaro_winkler(word, query, self.weight)
         if distance > self.accuracy:
             _match.append((distance, word))
     return _match
 def check_cons(name1, name2):
     ratio = Levenshtein.ratio(name1, name2)
     jaro = Levenshtein.jaro(name1, name2)
     jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
     if ratio > .6 or jaro > .7 or jaro_winkler > .7:
         return True
     else:
         return False
 def check_sure(name1, name2):
     ratio = Levenshtein.ratio(name1, name2)
     jaro = Levenshtein.jaro(name1, name2)
     jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
     if ratio >= 0.9 and jaro >= 0.95 and jaro_winkler >= 0.95:
         return True
     else:
         return False
Example #8
def choose(s, possibilities, threshold=.6):
    """
    Returns the closest match to string s if it exceeds the threshold, else returns None
    """
    if s in possibilities:
        return s
    startswith = [x for x in possibilities if x.lower().startswith(s.lower())]
    if len(startswith) == 1: return startswith[0]
    contained = [x for x in possibilities if s.lower() in x.lower()]
    if len(contained) > 1: return contained[0]
    close = sorted([(x, Levenshtein.jaro_winkler(s, x, .05)) for x in possibilities], key=itemgetter(1))
    best = max([(x, Levenshtein.jaro_winkler(s, x, .05)) for x in possibilities], key=itemgetter(1))
    if best[1] < threshold:
        print('returning None because', best, 'is below threshold of', threshold)
        print('out of', close)
        return None
    return best[0]
Example #9
 def clusterStrings(self, stringList):
     for string_1 in stringList:
         for string_2 in stringList:
             similarity = Levenshtein.jaro_winkler(string_1, string_2)
             if(similarity > 0.95):
                 print(similarity)
                 print(string_1)
                 print(string_2)
             break
 def check_beli(name1, name2):
     ratio = Levenshtein.ratio(name1, name2)
     jaro = Levenshtein.jaro(name1, name2)
     jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
     if ratio >= 0.9 or jaro >= 0.9 or jaro_winkler >= 0.9:
         return True
     elif ratio >= .7 and jaro >= .8 and jaro_winkler >= .8:
         return True
     else:
         return False
Example #11
def find_closest_string(query, dictionary, thresh=0.90):
    """ This function returns the closest match for 
         a query string against a dictionary of terms
        using Jaro-Winkler similarity
    """
    dist = {i:Levenshtein.jaro_winkler(query, i) for i in dictionary}
    dist = sorted(dist.items(), key=operator.itemgetter(1), reverse=True)
    if dist[0][1] >= thresh:
        return dist[0][0]
    else:
        return None
 def dp(s1, s2):
     key = (tuple(s1), tuple(s2))
     if key in d:
         return d[key]
     if not s1 or not s2:
         return 0
     best = dp(s1[1:], s2)
     for s2i in s2:
         w = Levenshtein.jaro_winkler(s1[0], s2i)
         best = max(best, w + dp(s1[1:], s2 - set([s2i])))
     d[key] = best
     return best
Example #13
    def __best_country_match(self, raw):
        max_jw = 0
        max_country = ''
        for country in self.country_list:
            jw = lev.jaro_winkler(country, raw)
            if jw > max_jw:
                max_jw = jw
                max_country = country

        if max_jw > self.threshhold_jw:
            latitude, longitude = self.countries[max_country]
            return max_country, max_country, latitude, longitude
        else:
            return None, None, None, None
Example #14
 def __call__(self, ua, devices):
     """
     @param ua: The user agent
     @type ua: string
     @param devices: The devices object to search
     @type devices: Devices
     @rtype: Device
     @raises pywurfl.DeviceNotFound
     """
     match = max((Levenshtein.jaro_winkler(x, ua, self.weight), x) for
                 x in devices.devuas)
     if match[0] >= self.accuracy:
         return devices.devuas[match[1]]
     else:
         raise DeviceNotFound(ua)
Example #15
def choose(s, possibilities, threshold=.6):
    """
    Returns the closest match to string s if it exceeds the threshold, else returns None
    """
    if not possibilities: return None
    if s in possibilities: return s
    if s == '': return None
    startswith = [x for x in possibilities if x.lower().startswith(s.lower())]
    if len(startswith) == 1: return startswith[0]
    contained = [x for x in possibilities if s.lower() in x.lower()]
    if len(contained) == 1: return contained[0]
    best = max([(x, Levenshtein.jaro_winkler(s, x, .05)) for x in possibilities], key=itemgetter(1))
    if best[1] < threshold:
        #print 'did you mean %s?' % best[0]
        return None
    return best[0]
Example #16
def train(tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,delete = []):
    allTrainX = list()
    allTrainY = list()
    with open("./data/train.csv") as f:
        for line in f:
            lin = line.split(",")
            if len(lin) == 3:
                st1 = lin[0].lower()
                st2 = lin[1].lower()

                temp = [
                        1.-(lev.distance(st1,st2)*2/(len(st1)+len(st2))),
                        lev.jaro(st1,st2),
                        lev.jaro_winkler(st1,st2),
                        lev.ratio(st1,st2),
                        distance.sorensen(st1,st2),
                        jaccard(set(st1),set(st2)),
                        1. - distance.nlevenshtein(st1,st2,method=1),
                        1. - distance.nlevenshtein(st1,st2,method=2),
                        dice_coefficient(st1,st2,lenGram=2),
                        dice_coefficient(st1,st2,lenGram=3),
                        dice_coefficient(st1,st2,lenGram=4),
                        cosineWords(st1,st2,dictTrain,tfidf_matrix_train),
                        cosineBigrams(st1,st2,dictTrainBigrams,tfidf_matrix_trainBigrams,lenGram)
                    ]
                if len(delete) > 0:
                    for elem in delete:
                        temp[elem] = 0.
                allTrainX.append(temp)
                allTrainY.append(int(lin[2]))


    X = np.array(allTrainX,dtype=float)
    y = np.array(allTrainY,dtype=float)
    clf = svm.LinearSVC(C=1., dual=False, loss='squared_hinge', penalty='l1')
    clf2 = linear_model.LogisticRegression(C=1., dual=False, penalty='l1', solver='liblinear')
    clf.fit(X, y)
    clf2.fit(X, y)
    weights = np.array(clf.coef_[0])
    print(weights)
    weights = np.array(clf2.coef_[0])
    print(weights)


    return clf,clf2
Example #17
def suggestions(s, possibilities):
    #TODO don't use jaro_winkler, or use it more intelligently;
    # ie break up words and match on each of them
    # jaro_winkler weighs the front more
    startswith = [x for x in possibilities if x.lower().startswith(s.lower())]
    if startswith: return startswith
    contained = [x for x in possibilities if s.lower() in x.lower()]
    if contained: return contained
    jws = [(x, Levenshtein.jaro_winkler(s, x)) for x in possibilities]
    jws.sort(key=lambda x:0-x[1])
    diffs = [x[1] - y[1] for x, y in zip(jws[:-1], jws[1:])]
    output = []
    for (card_name, score), diff in zip(jws[:-1], diffs):
        output.append(card_name)
        print(diff)
        if diff > .05: break
        if len(output) > 5: break
    return output
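# A small sketch (not from the example above) of why the TODO matters: jaro_winkler
# boosts the score when the strings share a common prefix, so a typo at the front of
# a name is penalized more than the same typo at the end. The strings are made up.
import Levenshtein

a, b = "xlack lotus", "black lotus"    # mismatch at the front
print(Levenshtein.jaro(a, b))          # ~0.939
print(Levenshtein.jaro_winkler(a, b))  # no common prefix, so no boost: ~0.939

c, d = "black lotux", "black lotus"    # same edit, but at the end
print(Levenshtein.jaro(c, d))          # ~0.939
print(Levenshtein.jaro_winkler(c, d))  # long common prefix, boosted: ~0.96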
Example #18
def numMatch(boxesds,num):
    matchedProb = None
    if (num is None):
        return matchedProb
#     tempSim = 0
#     maxSim = 0
#     matchedProb = dict()
    matchedProb = []
#     print('+------------------+')
#     print(boxesds[0].boxname,boxesds[1].boxname,boxesds[2].boxname,boxesds[3].boxname,boxesds[4].boxname)
    for item in boxesds:
        tempSim = Levenshtein.jaro_winkler(str(item.number),num)
        matchedProb.append(tempSim)
#         matchedProb.update({item.boxname:tempSim})
#         print(item.boxname+': '+ str(tempSim))
#         if(tempSim > maxSim):
#             maxSim = tempSim
#             matchRst = item.boxname
#     print('+------------------+\n')
    return matchedProb
Example #19
    def __call__(self, ua, devices):
        """
        @param ua: The user agent
        @type ua: string
        @param devices: The devices object to search
        @type devices: Devices
        @rtype: Device
        @raises pywurfl.DeviceNotFound
        """
        match = max((Levenshtein.jaro_winkler(x, ua, self.weight), x) for
                    x in devices.devuas)

        if match[0] >= self.accuracy:
            
            dev_clone = copy.copy(devices.devuas[match[1]])
            dev_clone.accuracy = match[0]
            # print "Got accuracy " + match[1] + " " + str(match[0])
            return dev_clone
        else:
            raise DeviceNotFound(ua)
Example #20
def jaro_winkler(str1, str2):
    jaro_dist = Levenshtein.jaro_winkler(str1, str2)
    return jaro_dist
Example #21
    print("Sequences:", asmLCS.seq.sequences)  # asmLCS.seq is the LCSequence object
    print("Substrings:", asmLCS.substr.substrings)
    lenSeqOne = float(len(asmLCS.seq.seqOne))
    lenSeqOneBuiltin = float(asmLCS.seq.matrix.seqOneLen)
    lenSeqTwo = float(len(asmLCS.seq.seqTwo))
    lenSeqTwoBuiltin = float(asmLCS.seq.matrix.seqTwoLen)
    lenLCSeq = float(len(asmLCS.seq))
    lenLCSub = float(len(asmLCS.substr))
    perSim = ((lenLCSeq / lenSeqOne) + (lenLCSeq / lenSeqTwo)) / 2
    perExact = ((lenLCSub / lenSeqOne) + (lenLCSub / lenSeqTwo)) / 2
    print("Length of SeqOne:", lenSeqOne)
    print("Length of SeqOne (builtin):", lenSeqOneBuiltin)
    print("Length of SeqTwo:", lenSeqTwo)
    print("Length of SeqTwo (builtin):", lenSeqTwoBuiltin)
    print("Length of LCSeq:", lenLCSeq)
    print("Length of LCSub:", lenLCSub)
    print("Substring in SeqOne starts at position:", asmLCS.seq.seqOne.find(list(asmLCS.substr.substrings)[0]))
    print("Substring in SeqTwo starts at position:", asmLCS.seq.seqTwo.find(list(asmLCS.substr.substrings)[0]))
    print("Percent Similar:", perSim)
    print("Percent Exact Copy:", perExact)
    print("Levenshtein Distance:", ldistance.distance(asmLCS.seq.seqOne, asmLCS.seq.seqTwo))
    print("Jaro Similarity:", ldistance.jaro(asmLCS.seq.seqOne, asmLCS.seq.seqTwo))
    print("Jaro-Winkler:", ldistance.jaro_winkler(asmLCS.seq.seqOne, asmLCS.seq.seqTwo))
    print("Similarity ratio:", ldistance.ratio(asmLCS.seq.seqOne, asmLCS.seq.seqTwo))
    print("\nSeconds to process and calculate:", time.time() - start_time)

    # Levenshtein distance - character operations (add, remove, swap) needed to transform one string into the other.
    # Jaro Similarity - similarity of short strings; 0 if completely different, 1 if identical
    # Jaro-Winkler - Prefix-weighted version of Jaro, because typos and divergence happen near the end of seqs
    # Similarity Ratio - The real minimal edit distance, aka diff sequence matching
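# A minimal sketch of what the measures described above return, assuming the
# python-Levenshtein package (imported as ldistance in the snippet above, imported
# directly here); the example strings are arbitrary.
import Levenshtein

a, b = "kitten", "sitting"
print("distance:", Levenshtein.distance(a, b))                     # 3 edits
print("jaro:", round(Levenshtein.jaro(a, b), 3))                   # ~0.746
print("jaro_winkler:", round(Levenshtein.jaro_winkler(a, b), 3))   # no common prefix, same as jaro
print("ratio:", round(Levenshtein.ratio(a, b), 3))                 # ~0.615 (substitutions cost 2)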
def main():
    ifName   ='梁記麻辣火鍋冰棒豆腐'
    
    orName   ='桔園'
    orName2  ='火鍋冰棒豆腐'
    orName3  ='梁記'
    orName4  ='梁記麻辣火鍋'
    orName5  ='梁記石頭火鍋'
    orName6  ='梁記火鍋'
    
    
    print('jaro')
    print(orName, ':', Levenshtein.jaro(ifName, orName))
    print(orName2, ':', Levenshtein.jaro(ifName, orName2))
    print(orName3, ':', Levenshtein.jaro(ifName, orName3))
    print(orName4, ':', Levenshtein.jaro(ifName, orName4))
    print(orName5, ':', Levenshtein.jaro(ifName, orName5))
    print(orName6, ':', Levenshtein.jaro(ifName, orName6))


    print('---------------------------')
    print('jaro_winkler')
    print(orName, ':', Levenshtein.jaro_winkler(ifName, orName, 0.25))
    print(orName2, ':', Levenshtein.jaro_winkler(ifName, orName2, 0.25))
    print(orName3, ':', Levenshtein.jaro_winkler(ifName, orName3, 0.25))
    print(orName4, ':', Levenshtein.jaro_winkler(ifName, orName4, 0.25))
    print(orName5, ':', Levenshtein.jaro_winkler(ifName, orName5, 0.25))
    print(orName6, ':', Levenshtein.jaro_winkler(ifName, orName6, 0.25))
    print('---------------------------')
    print('distance')
    print(orName, ':', Levenshtein.distance(ifName, orName))
    print(orName2, ':', Levenshtein.distance(ifName, orName2))
    print(orName3, ':', Levenshtein.distance(ifName, orName3))
    print(orName4, ':', Levenshtein.distance(ifName, orName4))
    print(orName5, ':', Levenshtein.distance(ifName, orName5))
    print(orName6, ':', Levenshtein.distance(ifName, orName6))
    print('---------------------------')
    print('ratio')
    print(orName, ':', Levenshtein.ratio(ifName, orName))
    print(orName2, ':', Levenshtein.ratio(ifName, orName2))
    print(orName3, ':', Levenshtein.ratio(ifName, orName3))
    print(orName4, ':', Levenshtein.ratio(ifName, orName4))
    print(orName5, ':', Levenshtein.ratio(ifName, orName5))
    print(orName6, ':', Levenshtein.ratio(ifName, orName6))
    print('---------------------------')
    print('fuzzywuzzyRatio')
    print(orName, ':', fuzz.ratio(ifName, orName))
    print(orName2, ':', fuzz.ratio(ifName, orName2))
    print(orName3, ':', fuzz.ratio(ifName, orName3))
    print(orName4, ':', fuzz.ratio(ifName, orName4))
    print(orName5, ':', fuzz.ratio(ifName, orName5))
    print(orName6, ':', fuzz.ratio(ifName, orName6))
    print('---------------------------')
    print('fuzzywuzzyPartial_ratio')
    print(orName, ':', fuzz.partial_ratio(ifName, orName))
    print(orName2, ':', fuzz.partial_ratio(ifName, orName2))
    print(orName3, ':', fuzz.partial_ratio(ifName, orName3))
    print(orName4, ':', fuzz.partial_ratio(ifName, orName4))
    print(orName5, ':', fuzz.partial_ratio(ifName, orName5))
    print(orName6, ':', fuzz.partial_ratio(ifName, orName6))
    print('---------------------------')
    print('fuzzywuzzyToken_sort_ratio')
    print(orName, ':', fuzz.token_sort_ratio(ifName, orName))
    print(orName2, ':', fuzz.token_sort_ratio(ifName, orName2))
    print(orName3, ':', fuzz.token_sort_ratio(ifName, orName3))
    print(orName4, ':', fuzz.token_sort_ratio(ifName, orName4))
    print(orName5, ':', fuzz.token_sort_ratio(ifName, orName5))
    print(orName6, ':', fuzz.token_sort_ratio(ifName, orName6))
    print('---------------------------')
    print('fuzzywuzzyToken_set_ratio')
    print(orName, ':', fuzz.token_set_ratio(ifName, orName))
    print(orName2, ':', fuzz.token_set_ratio(ifName, orName2))
    print(orName3, ':', fuzz.token_set_ratio(ifName, orName3))
    print(orName4, ':', fuzz.token_set_ratio(ifName, orName4))
    print(orName5, ':', fuzz.token_set_ratio(ifName, orName5))
    print(orName6, ':', fuzz.token_set_ratio(ifName, orName6))
s = len(res2_unmatch)
if s == 0:
    console('All records were matched exactly!')
    res_final = pd.concat([res1_remain, res2_remain],
                          axis=0).reset_index(drop=True)
    res_final.to_csv(path + 'output_data.csv', index=False)  # write the exact-match results directly
else:
    console('%d codes remain for fuzzy matching' % s)  # report how many codes need fuzzy matching
    # fuzzy-match using the tokenized names
    df = pd.DataFrame(columns=['icd_code_yb', 'name_yb'])
    console("Starting fuzzy matching")
    for item in list(zip(res2_unmatch['icd_code'], res2_unmatch['icd_name'])):
        count += 1
        console("Fuzzy matching progress: matching code #%d" % count)
        yb_diag['codeScore'] = yb_diag['icd_code_yb'].apply(
            lambda x: Levenshtein.jaro_winkler(item[0][:-1], x)
        )  # use Levenshtein.jaro_winkler on the ICD code
        # match ignoring token order
        yb_diag['nameScore'] = yb_diag['icd_name_yb'].apply(
            lambda x: fuzz.token_sort_ratio(item[1], x) / 100
        )  # use fuzz.token_sort_ratio on the ICD name

        yb_diag['finalScore'] = yb_diag[['codeScore', 'nameScore']].apply(
            lambda x: Score(x['nameScore'], x['codeScore']),
            axis=1)  # combine the code and name similarities into a final score

        df1 = yb_diag.iloc[
            yb_diag.finalScore.argmax(), :]  #[['icd_code_yb','icd_name_yb']]
        df = df.append(df1)
        if count % 10 == 0:
            _ = count // 10 * 10
Example #24
def jaroWinklerDistance(form1, form2):
    return Levenshtein.jaro_winkler(form1, form2, 0.1) if (len(form1) * len(form2) > 0) else 0.0
Example #25
    lenLCSeq = float(len(asmLCS.seq))
    lenLCSub = float(len(asmLCS.substr))
    perSim = ((lenLCSeq / lenSeqOne) + (lenLCSeq / lenSeqTwo)) / 2
    perExact = ((lenLCSub / lenSeqOne) + (lenLCSub / lenSeqTwo)) / 2
    print("Length of SeqOne:", lenSeqOne)
    print("Length of SeqOne (builtin):", lenSeqOneBuiltin)
    print("Length of SeqTwo:", lenSeqTwo)
    print("Length of SeqTwo (builtin):", lenSeqTwoBuiltin)
    print("Length of LCSeq:", lenLCSeq)
    print("Length of LCSub:", lenLCSub)
    print("Substring in SeqOne starts at position:", asmLCS.seq.seqOne.find(
        list(asmLCS.substr.substrings)[0]))
    print("Substring in SeqTwo starts at position:", asmLCS.seq.seqTwo.find(
        list(asmLCS.substr.substrings)[0]))
    print("Percent Similar:", perSim)
    print("Percent Exact Copy:", perExact)
    print("Levenshtein Distance:", ldistance.distance(asmLCS.seq.seqOne,
                                                      asmLCS.seq.seqTwo))
    print("Jaro Similarity:", ldistance.jaro(asmLCS.seq.seqOne,
                                             asmLCS.seq.seqTwo))
    print("Jaro-Winkler:", ldistance.jaro_winkler(asmLCS.seq.seqOne,
                                                  asmLCS.seq.seqTwo))
    print("Similarity ratio:", ldistance.ratio(asmLCS.seq.seqOne,
                                               asmLCS.seq.seqTwo))
    print("\nSeconds to process and calculate:", time.time() - start_time)

    # Levenshtein distance - character operations (add, remove, swap) needed to transform one string into the other.
    # Jaro Similarity - similarity of short strings; 0 if completely different, 1 if identical
    # Jaro-Winkler - Prefix-weighted version of Jaro, because typos and divergence happen near the end of seqs
    # Similarity Ratio - The real minimal edit distance, aka diff sequence matching
Example #26
def stats(tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,delete = [],plotX=False):
    with open("./data/stats.csv") as infile:
        for i,line in enumerate(infile):
            pass

    dimMatrix = 16
    predict = np.zeros((i+1,dimMatrix))


    clf1,clf2 = train(tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,delete=delete)

    with open("./data/stats.csv") as infile:
        for i,line in enumerate(infile):
            a = line.rstrip().split("\t")

            ## create same vector with more distances
            st1 = a[0].lower()
            st2 = a[1].lower()

            temp = [
            1.-(lev.distance(st1,st2)*2/(len(st1)+len(st2))),
            lev.jaro(st1,st2),
            lev.jaro_winkler(st1,st2),
            lev.ratio(st1,st2),
            distance.sorensen(st1,st2),
            jaccard(set(st1),set(st2)),
            1. - distance.nlevenshtein(st1,st2,method=1),
            1. - distance.nlevenshtein(st1,st2,method=2),
            dice_coefficient(st1,st2,lenGram=2),
            dice_coefficient(st1,st2,lenGram=3),
            dice_coefficient(st1,st2,lenGram=4),
            cosineWords(st1,st2),
            cosineBigrams(st1,st2)]

            if len(delete) > 0:
                for elem in delete:
                    temp[elem] = 0.

            predict[i,:-3] = temp
            predict[i,-3] = clf1.decision_function(np.array([temp],dtype=float))[0]
            predict[i,-2] = clf2.decision_function(np.array([temp],dtype=float))[0]
            predict[i,-1] = a[-1]


    if plotX:
        labelsM = ["Lev","Jaro","Jaro-Winkler","Ratio","Sorensen","Jaccard","Lev1","Lev2","Dice_2","Dice_3","Dice_4","cosineWords","cosineBigrams","SVM","Logit"]
        f1matrix = np.zeros((100,dimMatrix-1))

        fig = plt.figure()
        fig.set_size_inches(9,6)
        ax = fig.add_subplot(111)
        iC = -1
        for i in np.linspace(0,1,100):
            iC += 1
            for j in range(dimMatrix-1):
                t = np.array(predict[:,j])
                if j >= dimMatrix-3:
                    t = (t - np.min(t))/(np.max(t)-np.min(t))
                f1matrix[iC,j] = f1_score(y_pred=t>i ,y_true=predict[:,-1])
        F1scores = []
        for j in range(dimMatrix-1):
            F1scores.append(np.max(f1matrix[:,j]))
            #ax.plot(np.linspace(0,1,100),f1matrix[:,j],label=labelsM[j],color=tableau20[j])
        ax.bar(range(dimMatrix-1),F1scores)
        plt.xticks(np.arange(dimMatrix-1)+0.5,["Lev","Jaro","Jaro-Winkler","Ratio","Sorensen","Jaccard","Lev1","Lev2","Dice_2","Dice_3","Dice_4","cosineWords","cosineBigrams","SVM","Logit"],rotation=45)
        ax.set_ylabel("F1 score")
        ax.set_xlabel("Parameter")
        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("f1_bar.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)

        AUCScores = []
        for j in range(dimMatrix-1):
            # Compute ROC curve and area the curve
            fpr, tpr, thresholds = roc_curve(predict[:,-1], predict[:,j])
            AUCScores.append(auc(fpr, tpr))


            # Plot ROC curve
            ax.plot(fpr, tpr, label=labelsM[j],color=tableau20[j])
            ax.plot([0, 1], [0, 1], 'k--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.0])
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('ROC Curve')

        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("roc.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        ax.bar(range(dimMatrix-1),AUCScores)
        ax.set_ylabel('Area Under Curve')
        plt.xticks(np.arange(dimMatrix-1)+0.5,["Lev","Jaro","Jaro-Winkler","Ratio","Sorensen","Jaccard","Lev1","Lev2","Dice_2","Dice_3","Dice_4","cosineWords","cosineBigrams","SVM","Logit"],rotation=45)
        customaxis(ax)
        plt.savefig("roc_bar.pdf")
        plt.show()
Example #27
def make_feature(data_or,vec_model):
    print('get features:')
    from gensim.models import Word2Vec
    vec_model = Word2Vec.load('pretrain_model/w2v_300.model')
    dictionary = corpora.Dictionary.load('temp_data/train_dictionary.dict')
    tfidf = models.TfidfModel.load("temp_data/train_tfidf.model")
    index = similarities.SparseMatrixSimilarity.load('temp_data/train_index.index')
    item_id_list = joblib.load('temp_data/paper_id.pkl')

    with open('temp_data/train_content.pkl','rb') as fr:
        corpus = pickle.load(fr)
    data = data_or.copy()

    data['abstract_pre'] = data['abstract_pre'].apply(
        lambda x: np.nan if str(x) == 'nan' or len(x) < 9 else x)

    data['abstract_pre'] = data['abstract_pre'].apply(
        lambda x: 'none' if str(x) == 'nan' or str(x).split(' ') == ['n', 'o', 'n', 'e'] else x)
    data['key_text_pre'] = data['key_text_pre'].fillna('none')
    data['description_text'] = data['description_text'].fillna('none')
    data['title_pro'] = data['title_pro'].fillna('none')
    data['description_text_pre'] = data['description_text_pre'].fillna('none')
    prefix = 'num_'
    
    # length
    data[prefix + 'key_text_len'] = data['key_text_pre'].apply(lambda x: len(x.split(' ')))

    # length (append)
    data[prefix + 'description_text_len'] = data['description_text'].apply(lambda x: len(x.split(' ')))

    data.loc[data[prefix + 'key_text_len'] < 7, 'key_text_pre'] = data[data[prefix + 'key_text_len'] < 7][
        'description_text'].apply(
        lambda x: ' '.join(pre_process(re.sub(r'[\[|,]+\*\*\#\#\*\*[\]|,]+', '', x)))).values

    # whether abstract is empty
    data[prefix + 'cate_pa_isnull'] = data['abstract_pre'].apply(lambda x: 1 if str(x) == 'none' else 0)

    # whether key_words is empty
    data[prefix + 'cate_pkeywords_isnull'] = data['keywords'].apply(lambda x: 1 if str(x) == 'nan' else 0)


    # number of keywords that appear in the description
    def get_num_key(x,y):
        if str(y)=='nan':
            return -1
        y=y.strip(';').split(';')
        num=0
        for i in y:
            if i in x:
                num+=1
        return num

    data[prefix+'key_in_key_word_number']=list(map(lambda x,y: get_num_key(x,y),data['key_text_pre'],data['keywords']))
    # number of keywords appearing in the description / total number of keywords
    data[prefix+'key_in_key_word_number_rate']=list(map(lambda x,y: 0 if x==-1 else x/len(y.strip(';').split(';')),data[prefix+'key_in_key_word_number'],
                                                data['keywords']))

    #append
    data[prefix+'key_in_key_word_number2']=list(map(lambda x,y: get_num_key(x,y),data['description_text'],data['keywords']))
    # number of keywords appearing in the description / total number of keywords
    data[prefix+'key_in_key_word_number2_rate']=list(map(lambda x,y: 0 if x==-1 else x/len(y.strip(';').split(';')),data[prefix+'key_in_key_word_number2'],
                                                data['keywords']))

    # count of description words that appear in the title
    def get_num_common_words_and_ratio(merge, col):
        # merge data
        merge = merge[col]
        merge.columns = ['q1', 'q2']
        merge['q2'] = merge['q2'].apply(lambda x: 'none' if str(x) == 'nan' else x)

        q1_word_set = merge.q1.apply(lambda x: x.split(' ')).apply(set).values
        q2_word_set = merge.q2.apply(lambda x: x.split(' ')).apply(set).values

        q1_word_len = merge.q1.apply(lambda x: len(x.split(' '))).values
        q2_word_len = merge.q2.apply(lambda x: len(x.split(' '))).values

        q1_word_len_set = merge.q1.apply(lambda x: len(set(x.split(' ')))).values
        q2_word_len_set = merge.q2.apply(lambda x: len(set(x.split(' ')))).values

        result = [len(q1_word_set[i] & q2_word_set[i]) for i in range(len(q1_word_set))]
        result_ratio_q = [result[i] / q1_word_len[i] for i in range(len(q1_word_set))]
        result_ratio_t = [result[i] / q2_word_len[i] for i in range(len(q1_word_set))]

        result_ratio_q_set = [result[i] / q1_word_len_set[i] for i in range(len(q1_word_set))]
        result_ratio_t_set = [result[i] / q2_word_len_set[i] for i in range(len(q1_word_set))]

        return result, result_ratio_q, result_ratio_t, q1_word_len, q2_word_len, q1_word_len_set, q2_word_len_set, result_ratio_q_set, result_ratio_t_set

    data[prefix + 'common_words_k_pt'], \
    data[prefix + 'common_words_k_pt_k'], \
    data[prefix + 'common_words_k_pt_pt'], \
    data[prefix + 'k_len'], \
    data[prefix + 'pt_len'], \
    data[prefix + 'k_len_set'], \
    data[prefix + 'pt_len_set'], \
    data[prefix + 'common_words_k_pt_k_set'], \
    data[prefix + 'common_words_k_pt_pt_set'] = get_num_common_words_and_ratio(data, ['key_text_pre', 'title_pro'])

    data[prefix + 'common_words_k_at'], \
    data[prefix + 'common_words_k_at_k'], \
    data[prefix + 'common_words_k_at_at'], \
    data[prefix + 'k_len'], \
    data[prefix + 'at_len'], \
    data[prefix + 'k_len_set'], \
    data[prefix + 'at_len_set'], \
    data[prefix + 'common_words_k_at_k_set'], \
    data[prefix + 'common_words_k_at_at_set'] = get_num_common_words_and_ratio(data, ['key_text_pre', 'abstract_pre'])

    #append
    data[prefix + 'common_words_k_pt_2'], \
    data[prefix + 'common_words_k_pt_k_2'], \
    data[prefix + 'common_words_k_pt_pt_2'], \
    data[prefix + 'k_len_2'], \
    data[prefix + 'pt_len'], \
    data[prefix + 'k_len_set_2'], \
    data[prefix + 'pt_len_set'], \
    data[prefix + 'common_words_k_pt_k_set_2'], \
    data[prefix + 'common_words_k_pt_pt_set_2'] = get_num_common_words_and_ratio(data, ['description_text', 'title_pro'])

    data[prefix + 'common_words_k_at_2'], \
    data[prefix + 'common_words_k_at_k_2'], \
    data[prefix + 'common_words_k_at_at_2'], \
    data[prefix + 'k_len_2'], \
    data[prefix + 'at_len'], \
    data[prefix + 'k_len_set_2'], \
    data[prefix + 'at_len_set'], \
    data[prefix + 'common_words_k_at_k_set_2'], \
    data[prefix + 'common_words_k_at_at_set_2'] = get_num_common_words_and_ratio(data, ['description_text', 'abstract_pre'])



    # Jaccard similarity
    def jaccard(x, y):
        if str(y) == 'nan':
            y = 'none'
        x = set(x)
        y = set(y)
        return float(len(x & y) / len(x | y))

    data[prefix + 'jaccard_sim_k_pt'] = list(map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['title_pro']))
    data[prefix + 'jaccard_sim_k_pa'] = list(
        map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['abstract_pre']))

    #append
    data[prefix + 'jaccard_sim_k_pt2'] = list(map(lambda x, y: jaccard(x, y), data['description_text'], data['title_pro']))
    data[prefix + 'jaccard_sim_k_pa2'] = list(
        map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['description_text']))

    # edit distance
    print('get edit distance:')
    data[prefix + 'edict_distance_k_pt'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'edict_jaro'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'edict_ratio'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'edict_jaro_winkler'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['key_text_pre']), data['title_pro']))

    data[prefix + 'edict_distance_k_pa'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1), tqdm(data['key_text_pre']),
            data['abstract_pre']))
    data[prefix + 'edict_jaro_pa'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'edict_ratio_pa'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'edict_jaro_winkler_pa'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['key_text_pre']), data['abstract_pre']))

    #append
    print('get edit distance:')
    data[prefix + 'edict_distance_k_pt_2'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'edict_jaro_2'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'edict_ratio_2'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'edict_jaro_winkler_2'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['description_text']), data['title_pro']))

    data[prefix + 'edict_distance_k_pa_2'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1), tqdm(data['description_text']),
            data['abstract_pre']))
    data[prefix + 'edict_jaro_pa_2'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['description_text']), data['abstract_pre']))
    data[prefix + 'edict_ratio_pa_2'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['description_text']), data['abstract_pre']))
    data[prefix + 'edict_jaro_winkler_pa_2'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['description_text']), data['abstract_pre']))

    # cosine similarity
    def get_sim(doc, corpus):
        corpus = corpus.split(' ')
        corpus_vec = [dictionary.doc2bow(corpus)]
        corpus_tfidf = tfidf[corpus_vec]
        featurenum = len(dictionary.token2id.keys())
        index_i = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=featurenum)
        doc = doc.split(' ')
        vec = dictionary.doc2bow(doc)
        vec_tfidf = tfidf[vec]
        sim = index_i.get_similarities(vec_tfidf)
        return sim[0]

    data[prefix + 'sim'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'sim_pa'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['key_text_pre']), data['abstract_pre']))

    #append
    data[prefix + 'sim_2'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'sim_pa_2'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['description_text']), data['abstract_pre']))

    # tfidf
    def get_simlilary(query, title):
        def get_weight_counter_and_tf_idf(x, y):
            x = x.split()
            y = y.split()
            corups = x + y
            obj = dict(collections.Counter(corups))
            x_weight = []
            y_weight = []
            idfs = []
            for key in obj.keys():
                idf = 1
                w = obj[key]
                if key in x:
                    idf += 1
                    x_weight.append(w)
                else:
                    x_weight.append(0)
                if key in y:
                    idf += 1
                    y_weight.append(w)
                else:
                    y_weight.append(0)
                idfs.append(math.log(3.0 / idf) + 1)
            return [np.array(x_weight), np.array(y_weight), np.array(x_weight) * np.array(idfs),
                    np.array(y_weight) * np.array(idfs), np.array(list(obj.keys()))]

        weight = list(map(lambda x, y: get_weight_counter_and_tf_idf(x, y),
                          tqdm(query), title))
        x_weight_couner = []
        y_weight_couner = []
        x_weight_tfidf = []
        y_weight_tfidf = []
        words = []
        for i in weight:
            x_weight_couner.append(i[0])
            y_weight_couner.append(i[1])
            x_weight_tfidf.append(i[2])
            y_weight_tfidf.append(i[3])
            words.append(i[4])

        # Manhattan distance
        def mhd_simlilary(x, y):
            return np.linalg.norm(x - y, ord=1)

        mhd_simlilary_counter = list(map(lambda x, y: mhd_simlilary(x, y),
                                         x_weight_couner, y_weight_couner))
        mhd_simlilary_tfidf = list(map(lambda x, y: mhd_simlilary(x, y),
                                       x_weight_tfidf, y_weight_tfidf))

        # cosine similarity
        def cos_simlilary(x, y):
            return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

        cos_simlilary_counter = list(map(lambda x, y: cos_simlilary(x, y),
                                         x_weight_couner, y_weight_couner))
        cos_simlilary_tfidf = list(map(lambda x, y: cos_simlilary(x, y),
                                       x_weight_tfidf, y_weight_tfidf))

        # Euclidean distance
        def Euclidean_simlilary(x, y):
            return np.sqrt(np.sum(x - y) ** 2)

        Euclidean_simlilary_counter = list(map(lambda x, y: Euclidean_simlilary(x, y),
                                               x_weight_couner, y_weight_couner))
        Euclidean__simlilary_tfidf = list(map(lambda x, y: Euclidean_simlilary(x, y),
                                              x_weight_tfidf, y_weight_tfidf))

        return mhd_simlilary_counter, mhd_simlilary_tfidf, cos_simlilary_counter, \
               cos_simlilary_tfidf, Euclidean_simlilary_counter, Euclidean__simlilary_tfidf

    data[prefix + 'mhd_similiary'], data[prefix + 'tf_mhd_similiary'], \
    data[prefix + 'cos_similiary'], data[prefix + 'tf_cos_similiary'], \
    data[prefix + 'os_similiary'], data[prefix + 'tf_os_similiary'] = get_simlilary(data['key_text_pre'],data['title_pro'])


    data[prefix + 'mhd_similiary_pa'], data[prefix + 'tf_mhd_similiary_pa'], \
    data[prefix + 'cos_similiary_pa'], data[prefix + 'tf_cos_similiary_pa'], \
    data[prefix + 'os_similiary_pa'], data[prefix + 'tf_os_similiary_pa'] = get_simlilary(data['key_text_pre'],data['abstract_pre'])

    # similarity of averaged word vectors

    def get_vec(x):
        vec = []
        for word in x.split():
            if word in vec_model:
                vec.append(vec_model[word])
        if len(vec) == 0:
            return np.nan
        else:
            return np.mean(np.array(vec), axis=0)

    data['key_text_pre_vec'] = data['key_text_pre'].progress_apply(lambda x: get_vec(x))
    data['title_pro_vec'] = data['title_pro'].progress_apply(lambda x: get_vec(x))
    data['abstract_pre_vec'] = data['abstract_pre'].progress_apply(lambda x: get_vec(x))
    data['description_text_vec'] = data['description_text'].progress_apply(lambda x: get_vec(x))

    # cos
    data[prefix + 'cos_mean_word2vec'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  tqdm(data['key_text_pre_vec']), data['title_pro_vec']))
    data[prefix + 'cos_mean_word2vec'] = data[prefix + 'cos_mean_word2vec'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)

    # Euclidean distance
    data[prefix + 'os_mean_word2vec'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 tqdm(data['key_text_pre_vec']), data['title_pro_vec']))

    # mhd
    data[prefix + 'mhd_mean_word2vec'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), tqdm(data['key_text_pre_vec']), data['title_pro_vec']))


    # cos
    data[prefix + 'cos_mean_word2vec_pa'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  tqdm(data['key_text_pre_vec']), data['abstract_pre_vec']))
    data[prefix + 'cos_mean_word2vec_pa'] = data[prefix + 'cos_mean_word2vec_pa'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)

    # Euclidean distance
    data[prefix + 'os_mean_word2vec_pa'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 tqdm(data['key_text_pre_vec']), data['abstract_pre_vec']))

    # mhd
    data[prefix + 'mhd_mean_word2vec_pa'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), tqdm(data['key_text_pre_vec']), data['abstract_pre_vec']))


    #append
    data[prefix + 'cos_mean_word2vec_2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  tqdm(data['description_text_vec']), data['title_pro_vec']))
    data[prefix + 'cos_mean_word2vec_2'] = data[prefix + 'cos_mean_word2vec_2'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)

    # Euclidean distance
    data[prefix + 'os_mean_word2vec_2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 tqdm(data['description_text_vec']), data['title_pro_vec']))

    # mhd
    data[prefix + 'mhd_mean_word2vec_2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), tqdm(data['description_text_vec']), data['title_pro_vec']))

    # cos
    data[prefix + 'cos_mean_word2vec_pa2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  tqdm(data['description_text_vec']), data['abstract_pre_vec']))
    data[prefix + 'cos_mean_word2vec_pa2'] = data[prefix + 'cos_mean_word2vec_pa2'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)

    # Euclidean distance
    data[prefix + 'os_mean_word2vec_pa2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 tqdm(data['description_text_vec']), data['abstract_pre_vec']))

    # mhd
    data[prefix + 'mhd_mean_word2vec_pa2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), tqdm(data['description_text_vec']), data['abstract_pre_vec']))




    # n-gram distance features
    data[prefix+'n_gram_sim'],data[prefix+'sim_numeber_rate']=get_df_grams(data,2,['key_text_pre','title_pro'])
    data[prefix+'n_gram_sim_pa'],data[prefix+'sim_numeber_rate_pa']=get_df_grams(data,2,['key_text_pre','abstract_pre'])

    #append
    # n-gram distance features
    data[prefix+'n_gram_sim_2'],data[prefix+'sim_numeber_rate_2']=get_df_grams(data,2,['description_text','title_pro'])
    data[prefix+'n_gram_sim_pa_2'],data[prefix+'sim_numeber_rate_pa_2']=get_df_grams(data,2,['description_text','abstract_pre'])

    
################################################# already done by Peng-ge ##################################
#     def apply_fun(df):
#         df.columns = ['d_id', 'key', 'doc']
#         df['d_id'] = df['d_id'].fillna('always_nan')
#         query_id_group = df.groupby(['d_id'])
#         bm_list = []
#         for name, group in tqdm(query_id_group):
#             corpus = group['doc'].values.tolist()
#             corpus = [sentence.strip().split() for sentence in corpus]
#             query = group['key'].values[0].strip().split()
#             bm25Model = BM25(corpus)
#             bmscore = bm25Model.get_scores(query)
#             bm_list.extend(bmscore)

#         return bm_list

#     data[prefix + 'bm25'] = apply_fun(data[['description_id', 'key_text_pre', 'title_pro']])
#     data[prefix + 'bm25_pa'] = apply_fun(data[['description_id', 'key_text_pre', 'abstract_pre']])

#     #append
#     data[prefix + 'bm25_2'] = apply_fun(data[['description_id', 'description_text', 'title_pro']])
#     data[prefix + 'bm25_pa_2'] = apply_fun(data[['description_id', 'description_text', 'abstract_pre']])


#     # get bm25
#     def get_bm25(p_id, query):
#         query = query.split(' ')
#         score = bm25Model.get_score(query, item_id_list.index(p_id))
#         return score

#     data[prefix + 'bm_25_all'] = list(map(lambda x, y: get_bm25(x, y), tqdm(data['paper_id']), data['key_text_pre']))
#     #append
#     data[prefix + 'bm_25_all_2'] = list(map(lambda x, y: get_bm25(x, y), tqdm(data['paper_id']), data['description_text']))
################################################# already done by Peng-ge ##################################
    data[prefix + 'Hamming_kt'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).normalized_distance(x, y),
                                           tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'Hamming_dt'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).normalized_distance(x, y),
                                           tqdm(data['description_text_pre']), data['title_pro']))
    
    data[prefix + 'Hamming_ka'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).normalized_distance(x, y),
                                           tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'Hamming_da'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).normalized_distance(x, y),
                                           tqdm(data['description_text_pre']), data['abstract_pre']))
    
    data[prefix + 'Hamming_sim_kt'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).similarity(x, y),
                                           tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'Hamming_sim_dt'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).similarity(x, y),
                                           tqdm(data['description_text_pre']), data['title_pro']))
    
    data[prefix + 'Hamming_sim_ka'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).similarity(x, y),
                                           tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'Hamming_sim_da'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).similarity(x, y),
                                           tqdm(data['description_text_pre']), data['abstract_pre']))
   
    def edit_distance(df,w1, w2):
        word1 = df[w1].split()
        word2 = df[w2].split()
        len1 = len(word1)
        len2 = len(word2)
        dp = np.zeros((len1 + 1, len2 + 1))
        for i in range(len1 + 1):
            dp[i][0] = i
        for j in range(len2 + 1):
            dp[0][j] = j

        for i in range(1, len1 + 1):
            for j in range(1, len2 + 1):
                delta = 0 if word1[i - 1] == word2[j - 1] else 1
                dp[i][j] = min(dp[i - 1][j - 1] + delta, min(dp[i - 1][j] + 1, dp[i][j - 1] + 1))
        return dp[len1][len2]
    
    data[prefix + 'edit_distance_kt'] = data.apply(edit_distance, axis=1, 
                                                   args=('key_text_pre', 'title_pro'))
    data[prefix + 'edit_distance_dt'] = data.apply(edit_distance, axis=1, 
                                                   args=('description_text_pre', 'title_pro'))
    data[prefix + 'edit_distance_ka'] = data.apply(edit_distance, axis=1, 
                                                   args=('key_text_pre', 'abstract_pre'))
    data[prefix + 'edit_distance_da'] = data.apply(edit_distance, axis=1, 
                                                   args=('description_text_pre', 'abstract_pre'))
    
    def get_same_word_features(query, title):
        q_list = query.split()
        t_list = title.split()
        set_query = set(q_list)
        set_title = set(t_list)
        count_words = len(set_query.union(set_title))

        comwords = [word for word in t_list if word in q_list]
        comwords_set = set(comwords)
        unique_rate = len(comwords_set) / count_words

        same_word1 = [w for w in q_list if w in t_list]
        same_word2 = [w for w in t_list if w in q_list]
        same_len_rate = (len(same_word1) + len(same_word2)) / (len(q_list) + len(t_list))
        if len(comwords) > 0:
            com_index1 = len(comwords)
            same_word_q = com_index1 / len(q_list)
            same_word_t = com_index1 / len(t_list)

            for word in comwords_set:
                index_list = [i for i, x in enumerate(q_list) if x == word]
                com_index1 += sum(index_list)
            q_loc = com_index1 / (len(q_list) * len(comwords))
            com_index2 = len(comwords)
            for word in comwords_set:
                index_list = [i for i, x in enumerate(t_list) if x == word]
                com_index2 += sum(index_list)
            t_loc = com_index2 / (len(t_list) * len(comwords))

            same_w_set_q = len(comwords_set) / len(set_query)
            same_w_set_t = len(comwords_set) / len(set_title)
            word_set_rate = 2 * len(comwords_set) / (len(set_query) + len(set_title))

            com_set_query_index = len(comwords_set)
            for word in comwords_set:
                index_list = [i for i, x in enumerate(q_list) if x == word]
                if len(index_list) > 0:
                    com_set_query_index += index_list[0]
            loc_set_q = com_set_query_index / (len(q_list) * len(comwords_set))
            com_set_title_index = len(comwords_set)
            for word in comwords_set:
                index_list = [i for i, x in enumerate(t_list) if x == word]
                if len(index_list) > 0:
                    com_set_title_index += index_list[0]
            loc_set_t = com_set_title_index / (len(t_list) * len(comwords_set))
            set_rate = (len(comwords_set) / len(comwords))
        else:
            unique_rate, same_len_rate, same_word_q, same_word_t, q_loc, t_loc, same_w_set_q, same_w_set_t, word_set_rate, loc_set_q, loc_set_t, set_rate = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        return unique_rate, same_len_rate, same_word_q, same_word_t, q_loc, t_loc, same_w_set_q, same_w_set_t, word_set_rate, loc_set_q, loc_set_t, set_rate
    
    data[prefix+"unique_rate_kt"],data[prefix+"same_len_rate_kt"],data[prefix+"same_word_q_kt"],\
    data[prefix+"same_word_t_kt"],data[prefix+"q_loc_kt"],data[prefix+"t_loc_kt"],data[prefix+"same_w_set_q_kt"],data[prefix+"same_w_set_t_kt"],data[prefix+"word_set_rate_kt"],\
    data[prefix+"loc_set_q_kt"], data[prefix+"loc_set_t_kt"], data[prefix+"set_rate_kt"]= zip(
    *data.apply(lambda line: get_same_word_features(line["key_text_pre"], line["title_pro"]), axis=1))
    
    data[prefix+"unique_rate_dt"],data[prefix+"same_len_rate_dt"],data[prefix+"same_word_q_dt"],\
    data[prefix+"same_word_t_dt"],data[prefix+"q_loc_dt"],data[prefix+"t_loc_dt"],data[prefix+"same_w_set_q_dt"],data[prefix+"same_w_set_t_dt"],data[prefix+"word_set_rate_dt"],\
    data[prefix+"loc_set_q_dt"], data[prefix+"loc_set_t_dt"], data[prefix+"set_rate_dt"]= zip(
    *data.apply(lambda line: get_same_word_features(line["description_text_pre"], line["title_pro"]), axis=1))

    data[prefix+"unique_rate_ka"],data[prefix+"same_len_rate_ka"],data[prefix+"same_word_q_ka"],\
    data[prefix+"same_word_t_ka"],data[prefix+"q_loc_ka"],data[prefix+"t_loc_ka"],data[prefix+"same_w_set_q_ka"],data[prefix+"same_w_set_t_ka"],data[prefix+"word_set_rate_ka"],\
    data[prefix+"loc_set_q_ka"], data[prefix+"loc_set_t_ka"], data[prefix+"set_rate_ka"]= zip(
    *data.apply(lambda line: get_same_word_features(line["key_text_pre"], line["abstract_pre"]), axis=1))
    
    data[prefix+"unique_rate_da"],data[prefix+"same_len_rate_da"],data[prefix+"same_word_q_da"],\
    data[prefix+"same_word_t_da"],data[prefix+"q_loc_da"],data[prefix+"t_loc_da"],data[prefix+"same_w_set_q_da"],data[prefix+"same_w_set_t_da"],data[prefix+"word_set_rate_da"],\
    data[prefix+"loc_set_q_da"], data[prefix+"loc_set_t_da"], data[prefix+"set_rate_da"]= zip(
    *data.apply(lambda line: get_same_word_features(line["description_text_pre"], line["abstract_pre"]), axis=1))

    
    
    def get_df_grams_3(train_sample,values,cols):
        def create_ngram_set(input_list, ngram_value=3):
            return set(zip(*[input_list[i:] for i in range(ngram_value)]))

        def get_n_gram(df, values=3):
            train_query = df.values
            train_query = [[word for word in str(sen).replace("'", '').split(' ')] for sen in train_query]
            train_query_n = []
            for input_list in train_query:
                train_query_n_gram = set()
                for value in range(3, values + 1):
                    train_query_n_gram = train_query_n_gram | create_ngram_set(input_list, value)
                train_query_n.append(train_query_n_gram)
            return train_query_n

        train_query = get_n_gram(train_sample[cols[0]], values)
        train_title = get_n_gram(train_sample[cols[1]], values)
        sim = list(map(lambda x, y: len(x) + len(y) - 2 * len(x & y),
                           train_query, train_title))
        sim_number_rate=list(map(lambda x, y:   len(x & y)/ len(x)  if len(x)!=0 else 0,
                           train_query, train_title))
        return sim ,sim_number_rate
    data[prefix+'3_gram_sim'],data[prefix+'sim_numeber_rate_3']=get_df_grams_3(data,3,['key_text_pre','title_pro'])
    data[prefix+'3_gram_sim_pa'],data[prefix+'sim_numeber_rate_pa_3']=get_df_grams_3(data,3,['key_text_pre','abstract_pre'])

    #append
    # n-gram distance features
    data[prefix+'3_gram_sim_2'],data[prefix+'sim_numeber_rate_2_3']=get_df_grams_3(data,3,['description_text_pre','title_pro'])
    data[prefix+'3_gram_sim_pa_2'],data[prefix+'sim_numeber_rate_pa_2_3']=get_df_grams_3(data,3,['description_text_pre','abstract_pre'])
    
    
    def get_son_str_feature(query, title):
        q_list = query.split()
        query_len = len(q_list)
        t_list = title.split()
        title_len = len(t_list)
        count1 = np.zeros((query_len + 1, title_len + 1))
        index = np.zeros((query_len + 1, title_len + 1))
        for i in range(1, query_len + 1):
            for j in range(1, title_len + 1):
                if q_list[i - 1] == t_list[j - 1]:
                    count1[i][j] = count1[i - 1][j - 1] + 1
                    index[i][j] = index[i - 1][j - 1] + j
                else:
                    count1[i][j] = 0
                    index[i][j] = 0
        max_count1 = count1.max()

        if max_count1 != 0:
            row = int(np.where(count1 == np.max(count1))[0][0])
            col = int(np.where(count1 == np.max(count1))[1][0])
            mean_pos = index[row][col] / (max_count1 * title_len)
            begin_loc = (col - max_count1 + 1) / title_len
            rows = np.where(count1 != 0.0)[0]
            cols = np.where(count1 != 0.0)[1]
            total_loc = 0
            for i in range(0, len(rows)):
                total_loc += index[rows[i]][cols[i]]
            density = total_loc / (query_len * title_len)
            rate_q_len = max_count1 / query_len
            rate_t_len = max_count1 / title_len
        else:
            begin_loc, mean_pos, total_loc, density, rate_q_len, rate_t_len = 0, 0, 0, 0, 0, 0
        return max_count1, begin_loc, mean_pos, total_loc, density, rate_q_len, rate_t_len    

    data[prefix+"long_same_max_count1_kt"], data[prefix+"long_same_local_begin_kt"], data[prefix+"long_same_local_mean_kt"],data[prefix+"long_same_total_loc_kt"],\
    data[prefix+"long_same_density_kt"], data[prefix+"long_same_rate_q_len_kt"], data[prefix+"long_same_rate_t_len_kt"]= zip(
        *data.apply(lambda line: get_son_str_feature(line["key_text_pre"], line["title_pro"]), axis=1))
    
    data[prefix+"long_same_max_count1_dt"], data[prefix+"long_same_local_begin_dt"], data[prefix+"long_same_local_mean_dt"],data[prefix+"long_same_total_loc_dt"],\
    data[prefix+"long_same_density_dt"], data[prefix+"long_same_rate_q_len_dt"], data[prefix+"long_same_rate_t_len_dt"]= zip(
        *data.apply(lambda line: get_son_str_feature(line["description_text_pre"], line["title_pro"]), axis=1))
    
    data[prefix+"long_same_max_count1_da"], data[prefix+"long_same_local_begin_da"], data[prefix+"long_same_local_mean_da"],data[prefix+"long_same_total_loc_da"],\
    data[prefix+"long_same_density_da"], data[prefix+"long_same_rate_q_len_da"], data[prefix+"long_same_rate_t_len_da"]= zip(
        *data.apply(lambda line: get_son_str_feature(line["description_text_pre"], line["abstract_pre"]), axis=1))
    
    data[prefix+"long_same_max_count1_ka"], data[prefix+"long_same_local_begin_ka"], data[prefix+"long_same_local_mean_ka"],data[prefix+"long_same_total_loc_ka"],\
    data[prefix+"long_same_density_ka"], data[prefix+"long_same_rate_q_len_ka"], data[prefix+"long_same_rate_t_len_ka"]= zip(
        *data.apply(lambda line: get_son_str_feature(line["key_text_pre"], line["abstract_pre"]), axis=1))
    
    def q_t_common_words(query, title):
        query = set(query.split(' '))
        title = set(title.split(' '))
        return len(query & title)
    
    data[prefix+'common_words_kt'] = data.apply(lambda index: q_t_common_words(index.key_text_pre, index.title_pro), axis=1)
    data[prefix+'common_words_dt'] = data.apply(lambda index: q_t_common_words(index.description_text_pre, index.title_pro), axis=1)
    data[prefix+'common_words_ka'] = data.apply(lambda index: q_t_common_words(index.key_text_pre, index.abstract_pre), axis=1)
    data[prefix+'common_words_da'] = data.apply(lambda index: q_t_common_words(index.description_text_pre, index.abstract_pre), axis=1)

    
    data['key_text_len'] = data['key_text_pre'].apply(lambda x: len(x.split(' ')))
    data['description_text_pre_len'] = data['description_text_pre'].apply(lambda x: len(x.split(' ')))
    data['title_pro_len'] = data['title_pro'].apply(lambda x: len(x.split(' ')))
    data['abstract_pre_len'] = data['abstract_pre'].apply(lambda x: len(x.split(' ')))
    
    
    data[prefix+'common_words_kt_rate_k'] = data[prefix+'common_words_kt'] / data['key_text_len']
    data[prefix+'common_words_kt_rate_t'] = data[prefix+'common_words_kt'] / data['title_pro_len']

    data[prefix+'common_words_dt_rate_d'] = data[prefix+'common_words_dt'] / data['description_text_pre_len']
    data[prefix+'common_words_dt_rate_t'] = data[prefix+'common_words_dt'] / data['title_pro_len']

    data[prefix+'common_words_ka_rate_k'] = data[prefix+'common_words_ka'] / data['key_text_len']
    data[prefix+'common_words_ka_rate_a'] = data[prefix+'common_words_ka'] / data['abstract_pre_len']

    data[prefix+'common_words_da_rate_d'] = data[prefix+'common_words_da'] / data['description_text_pre_len']
    data[prefix+'common_words_da_rate_a'] = data[prefix+'common_words_da'] / data['abstract_pre_len']

    
    
    
    
    feat = ['description_id','paper_id']
    for col in data.columns:
        if re.match('num_', col) is not None:
            feat.append(col)

    data = data[feat]

    return data
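
# A minimal standalone sketch (made-up strings, not part of the pipeline above) of what the
# trigram features built by get_df_grams_3 compute: the size of the symmetric difference of
# word trigrams, and the share of the first text's trigrams that also occur in the second.
def word_ngrams(text, n=3):
    words = str(text).split(' ')
    return set(zip(*[words[i:] for i in range(n)]))

a = word_ngrams("deep learning for scientific paper matching")
b = word_ngrams("scientific paper matching with deep models")
gram_sim = len(a) + len(b) - 2 * len(a & b)        # symmetric-difference size, as in '3_gram_sim'
rate = len(a & b) / len(a) if len(a) != 0 else 0   # shared-trigram rate, as in the rate column
print(gram_sim, rate)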
Example #28
0
for index, row in df_train.iterrows():
    jacc = jaccard_similarity(row.question1,row.question2)
    tr_jacc_coef.append(jacc)
    
train_feat['jacc_coef']=tr_jacc_coef
# jaccard coefficient of test set
te_jacc_coef = list()
for index, row in df_test.iterrows():
    jacc = jaccard_similarity(row.question1,row.question2)
    te_jacc_coef.append(jacc)
test_feat['jacc_coef']=te_jacc_coef   
    
#jarowinkler
tr_jarowinkler = list()
for index, row in df_train.iterrows():
    jaro = Levenshtein.jaro_winkler(row.question1,row.question2)
    tr_jarowinkler.append(jaro)
    
train_feat['jarowinkler'] = tr_jarowinkler
te_jarowinkler = list()
for index, row in df_test.iterrows():
    jaro = Levenshtein.jaro_winkler(row.question1,row.question2)
    te_jarowinkler.append(jaro)
test_feat['jarowinkler'] = te_jarowinkler

#dice distance


tr_dice = list()
for i in range(len(train_q1_words_s)):
    total = len(train_q1_words_s[i])+ len(train_q2_words_s[i])
Example #29
0
 def value(self, word, correction):
     return Levenshtein.jaro_winkler(word, correction)
Example #30
0
def getwikidatacity(_step, list_wikidataid, ne_fid, ne_xid, ne_lon, ne_lat, ne_wikidataid, ne_name ,ne_namealt ,ne_adm0name,ne_adm1name,ne_ls_name,ne_geonameid, ne_scalerank,ne_labelrank,ne_natscale):

    query_template="""
        PREFIX geo: <http://www.opengis.net/ont/geosparql#>
        SELECT
            ?place
            ?placeLabel
            ?placeDescription
            (group_concat(distinct  ?pLabel       ; separator = "#")        as ?type_grp)
            (group_concat(distinct  ?placeLabelru ; separator = "#")        as ?placeLabelru)               
            (group_concat(distinct  ?sitelink_en  ; separator = "#")        as ?sitelink_en)
            (group_concat(distinct  ?sitelink_es  ; separator = "#")        as ?sitelink_es)  
            (group_concat(distinct  ?sitelink_ru  ; separator = "#")        as ?sitelink_ru)   
            (group_concat(distinct  ?sitelink_zh  ; separator = "#")        as ?sitelink_zh)                                    
            (group_concat(distinct  ?sitelink_ceb ; separator = "#")        as ?sitelink_ceb)
            (group_concat(distinct  ?countryLabelx; separator = "#")        as ?countryLabel)
            (SAMPLE(?sistercity)                                            as ?sistercity_sample)
            (AVG(?distance)                                                 as ?distance   )
            (MAX(?population)                                               as ?max_population )
            (group_concat(distinct ?place_alternative ; separator = "#")    as ?place_alternative_grp)
            (group_concat(distinct ?GeoNames_ID       ; separator = "#")    as ?GeoNames_ID_grp)
        WITH {
            SELECT DISTINCT ?place ?distance {

                    #S1#     ?place p:P31/ps:P31  wd:Q515.

                    #S2#     ?place p:P31/ps:P31  wd:Q3957.

                    #S3#           {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q532.     }
                    #S3#     UNION {?place  p:P31/ps:P31              wd:Q532.     }
                    #S3#     UNION {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q15078955.}
                    #S3#     UNION {?place  p:P31/ps:P31              wd:Q15078955.}
                    #S3#     UNION {
                    #S3#      ?place (p:P31/wdt:P31/wdt:P279*) wd:Q486972 .
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q131596.    }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q5084.      }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q2514025    }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P36 ?capitalplace  }.
                    #S3#      ?place rdfs:label ?placeLabel_en FILTER (lang(?placeLabel_en) = "en").
                    #S3#     }
                    #S3#     UNION {
                    #S3#      ?place p:P31/ps:P31  wd:Q486972.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q131596.    }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q5084.      }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q2514025    }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P36 ?capitalplace  }.
                    #S3#      ?place rdfs:label ?placeLabel_en FILTER (lang(?placeLabel_en) = "en").
                    #S3#     }
                    #S3#     UNION {
                    #S3#      ?place p:P31/ps:P31/wdt:P279*  wd:Q486972.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q131596.    }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q5084.      }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q2514025    }.
                    #S3#      FILTER NOT EXISTS  { ?place wdt:P36 ?capitalplace  }.
                    #S3#      ?place rdfs:label ?placeLabel_en FILTER (lang(?placeLabel_en) = "en").
                    #S3#     }

                    #S4#            {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q2039348. }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q2039348. }
                    #S4#     UNION  {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q1867183. }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q1867183. }
                    #S4#     UNION  {?place wdt:P1376     ?admin_ara.               }
                    #S4#     UNION  {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q1637706. }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q1637706. }
                    #S4#     UNION  {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q16861602.}
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q16861602.}
                    #S4#     UNION  {?place p:P31/ps:P31  wd:Q188509.  ?place p:P17/ps:P17  wd:Q408. }
                    #S4#     UNION  {?place (p:P31/wdt:P31/wdt:P279*)  wd:Q1070990. }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q1070990. }
                    #S4#     UNION  {?place p:P31/wdt:P31/wdt:P279*    wd:Q748149.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q748149.  }
                    #S4#     UNION  {?place p:P31/wdt:P31/wdt:P279*    wd:Q735428.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q735428.  }
                    #S4#     UNION  {?place p:P31/wdt:P31/wdt:P279*    wd:Q318727.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q318727.  }
                    #S4#     UNION  {?place p:P31/wdt:P31/wdt:P279*    wd:Q15284.   }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q15284.   }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q15284.   }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q532.     }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q15078955.}
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q498162.  }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3389680. }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q1639634. }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1639634. }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q2112349. }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q749622.  }

                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q11618417.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q11618417. }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q640364.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q640364. }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q2555896.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q2555896. }
                    #S4#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q109108.  }
                    #S4#     UNION  {?place p:P31/ps:P31               wd:Q109108. }



                    #S5#            {?place p:P31/ps:P31/wdt:P279*     wd:Q1763214.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q1763214. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1840161.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q1840161. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q4249901.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q4249901. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3685463.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3685463. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q12081657.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q12081657. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q27676416.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q27676416. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3076994.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3076994. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3360771.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3360771. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3685463.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3685463. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q605291.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q605291. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1539014.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q1539014. }


  

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q7830262.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q7830262. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3327862.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3327862. }


                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q956318.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q956318. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q155239.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q155239. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q27676428.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q27676428. }

                    #S5#     UNION  {?place p:P31/ps:P31  wd:Q5084.  ?place p:P17/ps:P17  wd:Q16. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q17305746.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q17305746. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q14762300.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q14762300. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q17366755.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q17366755. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3327873.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3327873. }

                    #S5#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3788231.  }
                    #S5#     UNION  {?place p:P31/ps:P31               wd:Q3788231. }

            # --- S6 -------------------

                    #S6#            {?place p:P31/ps:P31/wdt:P279*     wd:Q6609799.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q6609799. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q3685430.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q3685430. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q2679157.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q2679157. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q2989470.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q2989470. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q6593035.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q6593035. }


                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q43742.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q43742. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q83020.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q83020. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q2706302.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q2706302. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q482821.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q482821. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q2225003.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q2225003. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q133442.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q133442. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1500350.  }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q1500350. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q16725943. }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q16725943. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q9316670. }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q9316670. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1065118. }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q1065118. }
                     
                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1289426. }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q1289426. }

                    #S6#     UNION  {?place p:P31/ps:P31/wdt:P279*     wd:Q1336099. }
                    #S6#     UNION  {?place p:P31/ps:P31               wd:Q1336099. }
                                   
                    #S6#     {
                    #S6#      ?place (p:P31/wdt:P31/wdt:P279*) wd:Q486972 .
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q131596.    }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q5084.      }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q2514025    }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P36 ?capitalplace  }.
                    #S6#     # FILTER(NOT EXISTS  { ?item rdfs:label ?lang_labelx. FILTER(LANG(?lang_labelx) = "en")  }).
                    #S6#      ?place rdfs:label ?placeLabel_xru  FILTER (lang(?placeLabel_xru) = "ru").
                    #S6#     }
                    #S6#     UNION {
                    #S6#      ?place p:P31/ps:P31  wd:Q486972.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q131596.    }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q5084.      }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q2514025    }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P36 ?capitalplace  }.
                    #S6#      #FILTER(NOT EXISTS  { ?item rdfs:label ?lang_labelx. FILTER(LANG(?lang_labelx) = "en")  }).
                    #S6#      ?place rdfs:label ?placeLabel_xru  FILTER (lang(?placeLabel_xru) = "ru").
                    #S6#     }
                    #S6#     UNION {
                    #S6#      ?place p:P31/ps:P31/wdt:P279*  wd:Q486972.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q131596.    }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q5084.      }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P31 wd:Q2514025    }.
                    #S6#      FILTER NOT EXISTS  { ?place wdt:P36 ?capitalplace  }.
                    #S6#      #FILTER(NOT EXISTS  { ?item rdfs:label ?lang_labelx. FILTER(LANG(?lang_labelx) = "en")  }).
                    #S6#      ?place rdfs:label ?placeLabel_xru  FILTER (lang(?placeLabel_xru) = "ru").
                    #S6#     }

                    #S7#     FILTER EXISTS { ?place wdt:P190 ?sistercity_x.}

                    #S8#     VALUES ?GeoNames_ID {"3383494"}
                    #S8#     ?place wdt:P1566 ?GeoNames_ID.

                    #S9#      VALUES ?searchnames {"#ne_name#"@en "#ne_name#"@es "#ne_name#"@sv 
                    #S9#                           "#ne_name#"@de "#ne_name#"@fr "#ne_name#"@pt 
                    #S9#                           "#ne_name#"@it "#ne_name#"@da "#ne_name#"@pl
                    #S9#                           "#ne_name#"@cz "#ne_name#"@sk "#ne_name#"@hu
                    #S9#                           "#ne_name#"@lt "#ne_name#"@et "#ne_name#"@lv                    
                    #S9#                           "#ne_name#"@no "#ne_name#"@nl "#ne_name#"@fi  }  
                    #S9#      ?place rdfs:label ?searchnames .

                    SERVICE wikibase:around {     # "#ne_name#" , "#ne_adm0name#"
                        ?place wdt:P625 ?location.
                        bd:serviceParam wikibase:center "Point(16.373064 48.20833)"^^geo:wktLiteral.
                        bd:serviceParam wikibase:radius "#distance#".
                        bd:serviceParam wikibase:distance ?distance.
                    }
                }
            } AS %places
            WHERE {
            INCLUDE %places .
            SERVICE wikibase:label { bd:serviceParam wikibase:language "en".}
            OPTIONAL {?place rdfs:label ?placeLabelru FILTER (lang(?placeLabelru)="ru").}
            OPTIONAL {?place wdt:P31 ?property. ?property rdfs:label ?pLabel FILTER (lang(?pLabel)="en").}
            OPTIONAL {?place wdt:P17 ?country. ?country rdfs:label ?countryLabelx FILTER (lang(?countryLabelx)="en").}
            OPTIONAL {?place wdt:P17       ?country.}
            OPTIONAL {?place wdt:P1566     ?GeoNames_ID.}
            OPTIONAL {?place wdt:P190      ?sistercity.}
            OPTIONAL {?place wdt:P1082     ?population.}
            OPTIONAL {?sitelink_en  schema:about ?place . ?sitelink_en schema:isPartOf  <https://en.wikipedia.org/>.}
            OPTIONAL {?sitelink_es  schema:about ?place . ?sitelink_es schema:isPartOf  <https://es.wikipedia.org/>.}  
            OPTIONAL {?sitelink_ru  schema:about ?place . ?sitelink_ru schema:isPartOf  <https://ru.wikipedia.org/>.}   
            OPTIONAL {?sitelink_zh  schema:about ?place . ?sitelink_zh schema:isPartOf  <https://zh.wikipedia.org/>.}                                  
            OPTIONAL {?sitelink_ceb schema:about ?place . ?sitelink_ceb schema:isPartOf <https://ceb.wikipedia.org/>.}
            OPTIONAL {?place skos:altLabel ?place_alternative   FILTER((LANG(?place_alternative)) = "en").}
        }
        GROUP BY ?place ?placeLabel   ?placeDescription
        ORDER BY ?distance
    """

    q=query_template.replace('16.373064',ne_lon).replace('48.20833',ne_lat)
    q=q.replace('#ne_name#',ne_name).replace('#ne_adm0name#',ne_adm0name)
    q=q.replace('"3383494"','"'+ne_geonameid+'"')

    if   _step==1:
        q=q.replace('#S1#','')
    elif _step==2:
        q=q.replace('#S2#','')
    elif _step==3:
        q=q.replace('#S3#','')
    elif _step==4:
        q=q.replace('#S4#','')
    elif _step==5:
        q=q.replace('#S5#','')
    elif _step==6:
        q=q.replace('#S6#','')    
    elif _step==7:
        q=q.replace('#S7#','')
    elif _step==8:
        q=q.replace('#S8#','')
    elif _step==9:
        q=q.replace('#S9#','')                
    else:
        print("Internal error, _step: ", _step )
        sys.exit(1)



    search_distance=0
    if  ( -10 <=  float(ne_lon) <= 60)  and  (  float(ne_lat) >30  ):
        if   _step==1:
            search_distance=50
        elif _step==2:
            search_distance=50
        elif _step==3:
            search_distance=50
        elif _step==4:
            search_distance=50
        elif _step==5:
            search_distance=50
        elif _step==6:
            search_distance=50
        elif _step==7:
            search_distance=50                        
        elif _step==8:
            search_distance=1200
        elif _step==9:
            search_distance=100

    else:
        if   _step==1:
            search_distance=150
        elif _step==2:
            search_distance=150
        elif _step==3:
            search_distance=120
        elif _step==4:
            search_distance=100
        elif _step==5:
            search_distance=100            
        elif _step==6:
            search_distance=100
        elif _step==7:
            search_distance=100            
        elif _step==8:
            search_distance=1200
        elif _step==9:
            search_distance=100


    print("_step:",_step , "    search_distance=", search_distance)


    # remove double spaces
    while '  ' in q:
        q = q.replace('  ', ' ')

    # remove comments
    qs=''
    for line in q.splitlines():
        if len(line)>0 and line[:2] != ' #'  and  line[:2] != '#S' :
            qs+=line+'\n'
    q=qs

    ts = datetime.datetime.now()

    max_score=-1000

    results = None
    retries = 0
    max_retries=14
    while results == None and retries <  max_retries:
        try:

            results = None

            sleeptime= retries*10 + 5

            qs=q.replace('#distance#', str(search_distance) )
            print("distance-ok")
            if retries > 0:
                print("Try - retries:",retries,"   Distance:",search_distance," Sleeptime:",sleeptime)
            if args.filter_name!='':
                print(qs)
            sparql.setQuery(qs)
            sparql.setTimeout(2000)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()

        except SPARQLExceptions.EndPointNotFound as e:
            print("ERRwikidata-SPARQLExceptions-EndPointNotFound:  Retrying in (seconds) : ",sleeptime, flush=True )
            time.sleep(sleeptime)
            retries += 1
            continue

        except SPARQLExceptions.EndPointInternalError as e:
            print("ERRwikidata-SPARQLExceptions-EndPointInternalError: Retrying in (seconds) : ",sleeptime, flush=True )
            time.sleep(sleeptime)
            retries += 1
            # Decrease search distance
            if retries > 3:
                search_distance=int( search_distance*0.9)
            continue

        except TimeoutError:
            print("ERRwikidata-SPARQLExceptions  TimeOut : Retrying in (seconds) : ",sleeptime, flush=True )
            time.sleep(sleeptime)
            retries += 1
            continue

        except SPARQLExceptions.QueryBadFormed as e:
            print("ERRwikidata-SPARQLExceptions-QueryBadFormed : Check!  "  ,  flush=True )
            return "error"

        except HTTPError as e:
            print("ERRwikidata: Got an HTTPError while querying. Retrying in (seconds) : ",sleeptime, flush=True )
            time.sleep(sleeptime)
            retries += 1
            continue

        except:
            print("ERRwikidata: other error. Retrying in (seconds) : ",sleeptime,  flush=True )
            time.sleep(sleeptime)
            retries += 1
            continue


    if results == None and retries >=  max_retries :
        print("Wikidata request failed ; system stopped! ")
        sys.exit(1)

    _runtime=   (datetime.datetime.now() - ts).total_seconds()



    rc_list_wikidataid=[]
#TODO empty answer ..

    for result in results['results']['bindings']:

        _score=0;

        wd_id = result['place']['value'].split('/')[4]


        wd_distance = float( result['distance']['value'] )

        if 'placeLabel' in result:
            wd_label = result['placeLabel']['value']
        else:
            wd_label = ''



        # Check if this place was already queried
        if wd_id in list_wikidataid:
            print("Already exist:", wd_id, wd_label)
            continue
        else:
            rc_list_wikidataid.append(wd_id)

        if 'placeLabelru' in result:
            wd_label_ru = result['placeLabelru']['value']
        else:
            wd_label_ru = ''


        if 'placeDescription' in result:
            wd_description = result['placeDescription']['value']
        else:
            wd_description = ''

        if 'type_grp' in result:
            wd_type = "#"+result['type_grp']['value']+"#"
        else:
            wd_type = ''

        if 'countryLabel' in result:
            wd_countrylabel = result['countryLabel']['value']

            cldiff=  - ( 20 -  ( 20 * Levenshtein.jaro_winkler( unidecode.unidecode(ne_adm0name) ,  unidecode.unidecode(wd_countrylabel) )   ) )
            #print( cldiff, ne_adm0name, wd_countrylabel )
            _score+= cldiff

        else:
            wd_countrylabel =''


        if 'sitelink_en' in result:
            wd_sitelink_en = result['sitelink_en']['value']
        else:
            wd_sitelink_en=''


        if wd_sitelink_en != '':
            _score+=   40
        else:
            _score+=  -120
            

        if 'sitelink_es' in result:
            wd_sitelink_es = result['sitelink_es']['value']
        else:
            wd_sitelink_es=''

        if 'sitelink_ru' in result:
            wd_sitelink_ru = result['sitelink_ru']['value']
        else:
            wd_sitelink_ru=''

        if 'sitelink_zh' in result:
            wd_sitelink_zh = result['sitelink_zh']['value']
        else:
            wd_sitelink_zh=''

        if 'sitelink_ceb' in result:
            wd_sitelink_ceb = result['sitelink_ceb']['value']

        else:
            wd_sitelink_ceb=''
 


        if wd_sitelink_en == '':
            if wd_sitelink_es != '':
                _score+= 100
            elif wd_sitelink_ru != '':
                _score+= 80
            elif wd_sitelink_zh != '':
                _score+= 60
            elif wd_sitelink_ceb != '':
                _score+=  -1000        # penalty for   only ceb import






        if 'GeoNames_ID_grp' in result:
            wd_geonames_id_grp="#"+result['GeoNames_ID_grp']['value']+"#"
        else:
            wd_geonames_id_grp=''

        if 'max_population' in result:
            wd_max_population = result['max_population']['value']
            if wd_max_population!='':
                _score+=8
        else:
            wd_max_population=''

        if 'place_alternative_grp' in result:
            wd_place_alternative_grp="#"+result['place_alternative_grp']['value']+"#"
        else:
            wd_place_alternative_grp=''


        _in_altnames='N'
        if ('#'+ne_name+'#' in wd_place_alternative_grp)  :
            _in_altnames='Y'
            _score+=72
        if ('#'+unidecode.unidecode(ne_name)+'#' in unidecode.unidecode(wd_place_alternative_grp))  :
            _in_altnames='Y'
            _score+=58

        wd_has_sistercity=""
        if ('sistercity_sample' in result):
            if result['sistercity_sample']['value'] !=  '':
                wd_has_sistercity="Y"
                _score+=15




        uni_ne_name=unidecode.unidecode(ne_name)
        uni_ne_ls_name=unidecode.unidecode(ne_ls_name)
        uni_ne_namealt=unidecode.unidecode(ne_namealt)
        uni_ne_adm0name=unidecode.unidecode(ne_adm0name)
        uni_ne_adm1name=unidecode.unidecode(ne_adm1name)

        uni_wd_name=unidecode.unidecode(wd_label)

        if wd_label==wd_id and wd_label_ru != '':    
            _lev_jaro_winkler_ru = Levenshtein.jaro_winkler( uni_ne_name, unidecode.unidecode(wd_label_ru))
        else:
            _lev_jaro_winkler_ru =  0

        _lev_ratio        = Levenshtein.ratio(uni_ne_name, uni_wd_name)
        _lev_distance     = Levenshtein.distance(uni_ne_name, uni_wd_name)
        _lev_jaro         = Levenshtein.jaro(uni_ne_name, uni_wd_name)

        _lev_jaro_winkler       = Levenshtein.jaro_winkler(uni_ne_name, uni_wd_name)
        _lev_jaro_winkler_ls    = Levenshtein.jaro_winkler(uni_ne_ls_name, uni_wd_name)
        _lev_jaro_winkler_alt   = Levenshtein.jaro_winkler(uni_ne_namealt, uni_wd_name)

        _lev_jaro_winkler_adm0  = Levenshtein.jaro_winkler(uni_ne_name+','+uni_ne_adm0name, uni_wd_name )
        _lev_jaro_winkler_adm1  = Levenshtein.jaro_winkler(uni_ne_name+','+uni_ne_adm1name, uni_wd_name )

        _max_lev_jaro_winkler = max(_lev_jaro_winkler,_lev_jaro_winkler_ls,_lev_jaro_winkler_alt,_lev_jaro_winkler_adm0,_lev_jaro_winkler_adm1, _lev_jaro_winkler_ru)

        _match_rating_comparison     = jellyfish.match_rating_comparison(uni_ne_name, uni_wd_name)
        _damerau_levenshtein_distance= jellyfish.damerau_levenshtein_distance(uni_ne_name, uni_wd_name)
        _hamming_distance            = jellyfish.hamming_distance(uni_ne_name, uni_wd_name)

        _score+= _max_lev_jaro_winkler*10;

        if ne_name == wd_label:
            _name_status='R01-Equal'
            _score+=100
        elif ne_name.lower()==wd_label.lower():
            _name_status='R12-Lowcase_equal'
            _score+=99
        elif uni_ne_name==uni_wd_name:
            _name_status='R13-Unidecode_equal'
            _score+=90
        elif uni_ne_ls_name==uni_wd_name:
            _name_status='R31-ls_name eq'
            _score+=60
        elif uni_ne_namealt==uni_wd_name:
            _name_status='R32-namealt eq'
            _score+=60
        elif _max_lev_jaro_winkler == 1.0 :
            _name_status='R41- max(jaro_winkler)=1'
            _score+=50
        elif _max_lev_jaro_winkler >= 0.9 :
            _name_status='R42- max(jaro_winkler) 0.9-1.0'
            _score+=40
        elif _max_lev_jaro_winkler >= 0.8 :
            _name_status='R43- max(jaro_winkler) 0.8-0.9'
            _score+=30
        else:
            _name_status=''


        if wd_distance < 5:
            _score += 10
        elif wd_distance < 10:
            _score += 5
        elif wd_distance > 60:
            _score +=  -30
        elif wd_distance > 30:
            _score +=  -15
        elif wd_distance > 15:
            _score +=  -5

        if ne_geonameid != '' and ('#'+ne_geonameid+'#' in wd_geonames_id_grp)  :
            _geonames_status='EQ'
            _score+=40
        elif ne_geonameid != '' and ne_geonameid != '-1' and wd_geonames_id_grp!='##' and ('#'+ne_geonameid+'#' not in wd_geonames_id_grp)  :
            _geonames_status='NE'
            _score+=0
        else:
            _geonames_status='Na'


        if (ne_wikidataid != '' ) and (wd_id !='' ) and (ne_wikidataid==wd_id):
            _wikidata_status='EQ'
            _score+=15
        elif (ne_wikidataid != '' ) and (wd_id !='' ):
            _wikidata_status='NE'

            # smaller wikidataid is sometimes better
            if float(  ne_wikidataid[1:]) > float(wd_id[1:]):
                _score+=  3
            else:
                _score+= -3

        else:
            _wikidata_status='Na'

        if _score > max_score:
            max_score=_score

        if _score > 140:
            print("@@_score>140:" , ne_name , " :: ",  wd_id, wd_label, wd_description, wd_type )


        c.execute("INSERT INTO wd VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
            (

                        ne_fid,
                        ne_wikidataid,
                        wd_id,
                        ne_name,
                        wd_label,
                        ne_adm0name,
                        wd_countrylabel,
                        ne_adm1name,
                        ne_ls_name,
                        ne_namealt,
                        wd_description,
                        wd_type,
                        ne_geonameid,
                        wd_geonames_id_grp,
                        _geonames_status,
                        wd_place_alternative_grp,
                        wd_sitelink_en,
                        wd_sitelink_es,   
                        wd_sitelink_ru,  
                        wd_sitelink_zh,                                                                          
                        wd_sitelink_ceb,
                        wd_label_ru,
                        wd_has_sistercity,
                        wd_max_population,
                        wd_distance,
                        _step,
                        _score,
                        _name_status,
                        _wikidata_status,
                        _in_altnames,
                        _lev_ratio,
                        _lev_distance,
                        _lev_jaro,
                        _lev_jaro_winkler,
                        ne_scalerank,
                        ne_labelrank,
                        ne_natscale,
                        ne_xid,
                        ts,
                        search_distance,
                        retries,
                        _runtime
            ))

    conn.commit()
    sys.stdout.flush()
    if max_score <= 30:
        print(" Low score .. stop ", max_score)



    return  list_wikidataid + rc_list_wikidataid , max_score
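
# A hedged, standalone sketch (made-up names, not from the original script) of the core
# name-agreement step scored above: take the best Jaro-Winkler over the raw, unidecoded
# and "name,country" variants and scale it by 10, as in `_score+= _max_lev_jaro_winkler*10`.
import unidecode
import Levenshtein

def name_agreement(ne_name, wd_label, ne_adm0name):
    uni_ne = unidecode.unidecode(ne_name)
    uni_wd = unidecode.unidecode(wd_label)
    candidates = [
        Levenshtein.jaro_winkler(ne_name, wd_label),
        Levenshtein.jaro_winkler(uni_ne, uni_wd),
        Levenshtein.jaro_winkler(uni_ne + ',' + unidecode.unidecode(ne_adm0name), uni_wd),
    ]
    return 10 * max(candidates)

print(name_agreement("Wien", "Vienna", "Austria"))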
Example #31
0
# -*-coding:utf-8-*-
# @ auth ivan
# @ time 2021-02-09
# @ goal 105.Test_LevenshteinDistance,

import Levenshtein

s1, s2 = "ABCD", "ACE"
print(Levenshtein.distance(s1, s2), Levenshtein.distance(s2, s1),
      Levenshtein.editops(s1, s2), Levenshtein.ratio(s1, s2),
      Levenshtein.jaro(s1, s2), Levenshtein.jaro_winkler(s1, s2))

# s1, s2 = "广东省广州市番禺区luoxi海bin花园A座1房", "广州洛溪海滨花园A座1房"
Example #32
0
def jarowinkler_sim(field_1, field_2):
    similarity = Levenshtein.jaro_winkler(field_1, field_2)
    return similarity
Example #33
0
def Levenshtein_jaro_winkler(text1, text2):
    text1 = text1.replace(" ", "")
    text2 = text2.replace(" ", "")
    return Levenshtein.jaro_winkler(text1, text2)
Example #34
0
def lvmatch(s1, s2):
    "how well does s2 match s1?"
    return int(Levenshtein.jaro_winkler(s1, s2) * 100)
Example #35
0
def test_jarao_winkler(s, arr):
    for x in arr:
        print Levenshtein.jaro_winkler(s, x)
Example #36
0
    def proba_duplicate(cls, person1, person2):
        """ SEE: check with levenshtein / soundex.....
        Probas can be calculated as the worst case (most popular french names
        (Martin, Marie) since 1881

        Order matters for some tests (C03) : person1 have to be the old value,
        person2 the new value.

        Checks :
            C01: same lastname, firstnames, birthday => we are sure !
            C02: missing other firstnames, same lastname, firstname, birthday
            C03: married :-) lastname to birthname, same firstnames, birthdays
            C04: fuzzy : errors in lastname, firstname, but same birthday
            C05: same names but not matching birthdays (not nulls)
            C06: same names, all firstnames, but a missing birthday
            C07: same lastname and firstname only, with a missing birthday
         """

        # Check C01
        if (person1.lastname == person2.lastname and
                person1.firstname == person2.firstname and
                person1.firstnames == person2.firstnames and
                person1.birthday == person2.birthday and
                person1.birthday is not None):
            return 1.0
        # Check C02
        if (person1.lastname == person2.lastname and
                person1.firstname == person2.firstname and
                person1.birthday is not None and
                person1.birthday == person2.birthday):
            return 1.0
        # Check C03
        if (person2.lastname == person1.birthname and
                person1.firstname == person2.firstname and
                person1.firstnames == person2.firstnames and
                person1.birthday == person2.birthday and
                person1.birthday is not None):
            return 1.0
        # Check C04
        if ((Levenshtein.jaro_winkler(unicode(person1.lastname),
                                      unicode(person2.lastname)) > 0.85)
                and
                (Levenshtein.jaro_winkler(unicode(person1.firstname),
                                          unicode(person2.firstname)) > 0.85)
                and
                (person1.birthday == person2.birthday) and
                (person1.birthday is not None)):
            return 0.96
        # Check C05
        if (person1.lastname == person2.lastname and
                person1.firstname == person2.firstname and
                person1.firstnames == person2.firstnames and
                person1.birthday is not None and
                person2.birthday is not None and
                Levenshtein.distance(
                    datetime.datetime.strftime(person1.birthday, '%d/%m/%Y'),
                    datetime.datetime.strftime(person2.birthday, '%d/%m/%Y')
                ) < 3):
            return 0.96
        # Check 06
        if (person1.lastname == person2.lastname and
                person1.firstname == person2.firstname and
                len(person1.firstnames) > 2 and
                person1.firstnames == person2.firstnames and
                (person1.birthday is None or person2.birthday is None)):
            return 0.96
        # Check 07
        if (person1.lastname == person2.lastname and
                person1.firstname == person2.firstname and
                (person1.birthday is None or person2.birthday is None)):
            return 0.5
        return 0
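
# A minimal stand-in (invented here, not from the original project) for the person objects
# that proba_duplicate reads; it only carries the attributes the checks above touch.
import datetime
from collections import namedtuple

Person = namedtuple("Person", "lastname firstname firstnames birthname birthday")

p1 = Person(u"Martin", u"Marie", u"Marie Louise", u"Martin", datetime.datetime(1881, 1, 1))
p2 = Person(u"Martin", u"Marie", u"Marie Louise", u"Martin", datetime.datetime(1881, 1, 1))
# With the enclosing class in scope (its name is not shown above, so 'Matcher' is hypothetical),
# check C01 applies and the call returns 1.0:
# Matcher.proba_duplicate(p1, p2)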
Example #37
0
    def org_alternate_names(self, slot_type):
        # load china province city list
        china_province_city = cPickle.load(open('data/dict/china_province_city.pkl', 'rb'))
        city_list = []
        for p in china_province_city:
            if p['type'] == 0 and p['name'] not in (u'台湾', u'臺灣'):  # type 0 means a municipality directly under the central government
                continue
            for c in p['sub']:
                city_list.append(c['name'])
                if p['name'] in (u'台湾', u'臺灣'):
                    continue
                for d in c['sub']:
                    city_list.append(d['name'])

        # load china province list
        province_dict = []
        f = io.open('data/dict/china_province_dict', 'r', -1, 'utf-8')
        for line in f:
            province_dict.append(line.strip())

        # load country list
        country_list = []
        f = io.open('data/dict/country_list', 'r', -1, 'utf-8')
        for line in f:
            country_list.append(line.strip())

        line_outputs = []
        # find query name segmentation
        query_name_seg = []
        for e in self.evidences[slot_type]:
            if self.query.name not in ''.join(e.parse_result['text']):
                continue
            org_list = self.find_org(e.parse_result['words'])

            for org in org_list:
                if self.query.name in ''.join([word[0] for word in org]):
                    query_name_seg = org

        for e in self.evidences[slot_type]:
            org_list = self.find_org(e.parse_result['words'])
            alternate_name = []
            for org in org_list:
                org_name = ''.join([w[0] for w in org])
                if org_name == self.query.name:
                    continue

                # ======================== organization name pattern match ======================= #
                # edit distance
                simi_score = Levenshtein.distance(self.query.name, org_name)
                if simi_score < 2:
                    alternate_name.append(org)
                    continue

                # alternate name must consist of words from query name
                if set(org_name) - set(self.query.name):
                    continue

                # org name should not be the name of a single city, country or state/province
                def foo():
                    for element in list(itertools.chain(city_list, province_dict, country_list)):
                        if org_name in element:
                            return False
                    return True
                if not foo():
                    continue

                # abbreviation match
                query_name_abbre = ''.join(w[0][0] for w in query_name_seg)
                if query_name_abbre in org_name or org_name in query_name_abbre:
                    alternate_name.append(org)
                    continue

                # jaro_winkler score: the closer the word to the beginning, the higher weight it has.
                simi_score = Levenshtein.jaro_winkler(self.query.name, org_name)
                if simi_score > 0.8:
                    alternate_name.append(org)
                    continue

            for org in alternate_name:
                slot_filler = ''.join([w[0] for w in org])
                l = self.create_line_output(e, slot_filler, 0, slot_type, combined_slot_filler=True)

                line_outputs.append(l)

        return line_outputs
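
# Toy sketch (invented data) of the abbreviation match used above: concatenate the first
# character of every segmented word of the query name and test containment in either direction.
query_name_seg = [(u'北京', 'ns'), (u'大学', 'n')]            # (word, POS) pairs, hand-made here
org_name = u'北大'
query_name_abbre = ''.join(w[0][0] for w in query_name_seg)   # -> u'北大'
print(query_name_abbre in org_name or org_name in query_name_abbre)  # True for this toy pair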
Example #38
0
 def value(self, word, correction):
     return Levenshtein.jaro_winkler(word, correction)
#     Jaro Distance
#     Jaro-Winkler Distance
#     Match Rating Approach Comparison
#     Hamming Distance

# Phonetic encoding:
#     American Soundex
#     Metaphone
#     NYSIIS (New York State Identification and Intelligence System)
#     Match Rating Codex
import jellyfish
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))  # 2; edit distance
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))  # 0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))  # 1; edit distance allowing transpositions
print(jellyfish.metaphone('Jellyfish'))  # 'JLFX'
print(jellyfish.soundex('Jellyfish'))  # 'J412'
print(jellyfish.nysiis('Jellyfish'))  # 'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))  # 'JLLFSH'

##################################################################
## Levenshtein
import Levenshtein
print(Levenshtein.hamming('hello', 'helol'))  # 2; Hamming distance: str1 and str2 must have equal length; counts positions where the two strings differ
print(Levenshtein.distance('hello', 'helol'))  # 2; edit (Levenshtein) distance: minimum number of insertions, deletions and substitutions to turn one string into the other
print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf'))  # 5
print(Levenshtein.ratio('hello', 'helol'))  # 0.8; Levenshtein ratio: r = (sum - ldist) / sum, where sum is the combined length of str1 and str2 and ldist is a Levenshtein-like distance
# Note: ldist is not the edit distance above; there every operation costs 1, while here deletions and insertions still cost 1 but substitutions cost 2
# Rationale: for ratio('a', 'c'), sum=2, and the plain edit distance would give (2-1)/2 = 0.5 even though 'a' and 'c' share nothing; charging 2 for a substitution fixes this
print(Levenshtein.jaro('hello', 'helol'))  # 0.9333333333333332; Jaro distance (used e.g. in census record linkage)
print(Levenshtein.jaro_winkler('hello', 'helol'))  # 0.9533333333333333; Jaro-Winkler distance
Example #40
0
def get_videos():
    """
        @api {GET} /api/v1/videos/videos Get videos (implemented)
        @apiName Get videos (implemented)
        @apiGroup Videos
        @apiVersion 1.0.0
        @apiDescription
        Fetches all videos, a limited number of videos, videos in a given sort order, keyword-matched videos, or paginated videos.

        @apiHeader {String="application/json"} Content-Type Browser encoding type

        @apiParam{String="likes -- 点赞数降序","views -- 观看量降序","releaseTime -- 发布时间降序","total -- 综合"}[order="likes"]
        用来指定视频的排序方式。
        @apiParam {String} [searchValue] 搜索内容
        @apiParam {Boolean} [isPagination=false] 是否分页
        @apiParam {Number{大于0}} [pageNumber=1] 页码
        @apiParam {Number{大于0}} [pageSize=10] 页面大小
        @apiParamExample {json} 参数示例
        {
            "order": "likes",
            "searchValue":"hello",
            "isPagination":true,
            "pageNumber":1,
            "pageSize":5
        }

        @apiUse Success200
        @apiSuccess {object[]} data.videos The videos returned
        @apiSuccess {Number} data.videos.id Video id
        @apiSuccess {String} data.videos.name Video name
        @apiSuccess {Number} data.videos.authorId Author id
        @apiSuccess {String} data.videos.authorName Author name
        @apiSuccess {String} data.videos.introduction Video introduction
        @apiSuccess {Number} data.videos.likes Number of likes
        @apiSuccess {Number} data.videos.views Number of views
        @apiSuccess {String} data.videos.releaseTime Release time
        @apiSuccess {String} data.videos.imageUrl Cover image URL
        @apiSuccess {String} data.videos.videoUrl Video URL
        @apiSuccessExample {json} Example response
        {
            "result":true,
            "code":200,
            "message":"",
            "header":{},
            "data":{
                "videos": [
                    {
                        "id": 12,
                        "name": "testVideo",
                        "authorId": 1,
                        "authorName": "testMan",
                        "introduction": "test video",
                        "likes": 50,
                        "views": 5023,
                        "releaseTime": "2021-4-21",
                        "imageUrl": "http://xxx",
                        "videoUrl": "http://xxx"
                    },
                    {...},
                ]
            }
        }

        @apiUse Errors
    """
    data = request.args

    order = data.get("order")
    search_value = data.get("searchValue")
    is_pagination = data.get("isPagination")
    page_number = data.get("pageNumber")
    page_size = data.get("pageSize")
    video_filter = None

    # handle search
    if search_value is not None:
        # names of the videos that matched the search
        match_video_name_list = list()
        # fetch all video names from the database
        videos_name = Videos.query.with_entities(Videos.name).all()
        for video_name in videos_name:
            # similarity between the search keyword and the video name
            similarity = Levenshtein.jaro_winkler(search_value, video_name[0])
            # print(video_name[0] + ':' + str(similarity))
            # if the similarity exceeds the configured threshold, keep the video name
            if similarity >= Config.SEARCH_SIMILARITY:
                match_video_name_list.append(video_name[0])
        video_filter = Videos.query.filter(Videos.name.in_(match_video_name_list))
    else:
        video_filter = Videos.query
    # handle sorting
    if order is None:
        order = "likes"
    if order == "likes":
        video_filter = video_filter.order_by(Videos.likes.desc())
    if order == "views":
        video_filter = video_filter.order_by(Videos.views.desc())
    if order == "releaseTime":
        video_filter = video_filter.order_by(Videos.releaseTime.desc())

    if is_pagination is None:
        is_pagination = False
    if is_pagination == 'true':
        is_pagination = True
    elif is_pagination == 'false':
        is_pagination = False
    if is_pagination:
        # apply pagination defaults
        if page_size is None:
            page_size = 10
        if page_number is None:
            page_number = 1
        try:
            page_size = int(page_size)
            page_number = int(page_number)
        except ValueError:
            return jsonify(result=False, code=400, message="Invalid parameter type!", header={}, data={}), 400
        video_filter = video_filter.paginate(page_number, page_size, False).items
        # print(video_filter)
    else:
        video_filter = video_filter.all()
        # print(video_filter)

    return_data = {
        "videos": []
    }

    for v in video_filter:
        author_name = Users.query.filter_by(id=v.author_id).with_entities(Users.username).first()[0]
        video_dict = {
            "id": v.id,
            "name": v.name,
            "authorId": v.author_id,
            "authorName": author_name,
            "introduction": v.introduction,
            "likes": v.likes,
            "views": v.views,
            "releaseTime": str(v.releaseTime.strftime("%Y年%m月%d日 %H:%M")),
            "imageUrl": v.image_url,
            "videoUrl": v.video_url
        }
        return_data.get("videos").append(video_dict)
        # print(return_data)
    return jsonify(result=True, code=200, message="", header={}, data=return_data), 200
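
# The search branch above keeps any stored title whose Jaro-Winkler similarity to the query
# reaches Config.SEARCH_SIMILARITY; the same filter on a plain list, with an invented threshold:
import Levenshtein

SEARCH_SIMILARITY = 0.8   # illustrative value; the real threshold lives in Config
titles = ["hello world", "hello would", "goodbye"]
query = "hello world"
matches = [t for t in titles if Levenshtein.jaro_winkler(query, t) >= SEARCH_SIMILARITY]
print(matches)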
Example #41
0
def are_similar(name1, name2):
    name1, name2 = (asciipunct(s.strip().lower()) for s in (name1, name2))
    ratio = Levenshtein.jaro_winkler(name1, name2)
    return ratio >= 0.8 or name1 in name2 or name2 in name1
Example #42
0
def are_similar(name1, name2):
    name1, name2 = (asciipunct(s.strip().lower()) for s in (name1, name2))
    ratio = Levenshtein.jaro_winkler(name1, name2, 0.0)  # prefix weight 0.0: no Winkler bonus for a shared prefix
    return ratio >= 0.8
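
# Quick check of the third argument used above: python-Levenshtein's jaro_winkler accepts an
# optional prefix weight, and with 0.0 the score reduces to the plain Jaro similarity
# (the strings below are arbitrary).
import Levenshtein

s1, s2 = "dixon", "dicksonx"
print(Levenshtein.jaro(s1, s2))                # plain Jaro
print(Levenshtein.jaro_winkler(s1, s2))        # default prefix weight rewards the shared prefix
print(Levenshtein.jaro_winkler(s1, s2, 0.0))   # zero prefix weight: equals the plain Jaro value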
 def jaro_winkle_distance(str1, str2):
     sim = Levenshtein.jaro_winkler(str1, str2)
     return sim