def gen_corrections(only_last, uncorrected_string):
    global transcript_corrections, expected_word, current_word_number_temporary_offset

    if not only_last:
        current_word_number_temporary_offset = -1

    words = uncorrected_string.split(' ')
    if only_last:
        words_to_check = [len(words) - 1]
    else:
        words_to_check = range(0, len(words) - 1)

    for i in words_to_check:
        set_expected_word()
        found_correction = False  # initial value

        # Perform a phonetic comparison.
        if jellyfish.match_rating_comparison(words[i], expected_word):
            # If it's close, indicate the correction to be made.
            found_correction = True

        if found_correction:
            # Create a correction object (must instantiate, not assign the class).
            o = correction()
            o.expected_index = i
            o.old_string = words[i]
            o.new_string = expected_word
            # Add it to the list of pending corrections.
            transcript_corrections.append(o)
def simple_example():
    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'
    print("jellyfish.levenshtein_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.levenshtein_distance(str1, str2)))
    print("jellyfish.damerau_levenshtein_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.damerau_levenshtein_distance(str1, str2)))
    print("jellyfish.hamming_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.hamming_distance(str1, str2)))
    print("jellyfish.jaro_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_distance(str1, str2)))
    print("jellyfish.jaro_similarity({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_similarity(str1, str2)))
    print("jellyfish.jaro_winkler({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_winkler(str1, str2)))
    print("jellyfish.jaro_winkler_similarity({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_winkler_similarity(str1, str2)))
    print("jellyfish.match_rating_comparison({}, {}) = {}.".format(
        str1, str2, jellyfish.match_rating_comparison(str1, str2)))

    # --------------------
    # Phonetic encoding.
    ss = u'Jellyfish'
    print("jellyfish.metaphone({}) = {}.".format(ss, jellyfish.metaphone(ss)))
    print("jellyfish.soundex({}) = {}.".format(ss, jellyfish.soundex(ss)))
    print("jellyfish.nysiis({}) = {}.".format(ss, jellyfish.nysiis(ss)))
    print("jellyfish.match_rating_codex({}) = {}.".format(
        ss, jellyfish.match_rating_codex(ss)))
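# Note: the example above prints both the old and the new names because
# jellyfish (around 0.8) renamed jaro_distance -> jaro_similarity and
# jaro_winkler -> jaro_winkler_similarity, deprecating and later removing the
# old aliases. A minimal compatibility shim, assuming only these two pairs of
# names matter to the caller:
import jellyfish

jaro_similarity = getattr(jellyfish, 'jaro_similarity', None) or jellyfish.jaro_distance
jaro_winkler_similarity = (getattr(jellyfish, 'jaro_winkler_similarity', None)
                           or jellyfish.jaro_winkler)

print(jaro_similarity(u'jellyfish', u'smellyfish'))
print(jaro_winkler_similarity(u'jellyfish', u'smellyfish'))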
def noisify_word(self, word):
    # Normalize the word (strip surrounding whitespace and punctuation).
    w1 = word.strip().strip(".").strip().strip(",").strip().strip("\"").strip().strip(",").strip()

    # TODO: we are doing naive substitution; representing words in terms of
    # features would be much better. Two alternative improvements:
    #   v1: create & store metaphone/soundex codes and compare them with a
    #       suitable algorithm (match_rating_comparison)
    #   v2: create & store features which represent sounds, then search the
    #       feature space using kNN / cosine similarity
    if w1 not in self.word_wID:
        return word
    # Do the substitution with the given probability & thresholds.
    else:
        # First obtain all similar words.
        wID = self.word_wID[w1]
        lst_of_similar_words = []
        # A word can be part of multiple sets (homonymy is not transitive).
        lst_setID = self.wID_setID_lst[wID]
        for setID in lst_setID:
            wID_list = self.setID_wID_lst[setID]
            for wid in wID_list:
                w = self.wID_word[wid]
                lst_of_similar_words.append(w)

        # Next, calculate a suitable score for each candidate.
        similar = []
        for w2 in lst_of_similar_words:
            if w1 == w2:
                # Skip the word itself (just a safety check; will not happen).
                continue
            elif jellyfish.match_rating_comparison(w1, w2):
                # Must be phonetically similar...
                score = jellyfish.jaro_winkler(w1, w2)
                if score > self.threshold:
                    # ...and the score must be above the threshold.
                    similar.append((w2, score))

        if len(similar) == 0:
            return word

        # Sort the candidates by this score, best first.
        similar = sorted(similar, key=lambda tup: tup[1], reverse=True)

        # TODO: sample those within the threshold & top_k range.

        # Finally, substitute with the given probability.
        if random.uniform(0, 1) >= 1 - self.prob:
            idx = random.randint(0, len(similar) - 1)
            w2, _ = similar[idx]
            print("returning w2")
            return w2

    # Return the original word if no substitution was made.
    print("returning original")
    return word
def test_match_rating_comparison(self):
    cases = [("Bryne", "Boern", True),
             ("Smith", "Smyth", True),
             ("Catherine", "Kathryn", True),
             ("Michael", "Mike", False),
             ]
    for (s1, s2, value) in cases:
        self.assertEqual(jellyfish.match_rating_comparison(s1, s2), value)
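# For context: match_rating_comparison returns a boolean for comparable pairs,
# as the test above asserts, and (per the jellyfish docs) None when the two
# strings are not comparable under the Match Rating Approach. A quick sanity
# check with the same pairs:
import jellyfish

print(jellyfish.match_rating_comparison(u'Catherine', u'Kathryn'))  # True
print(jellyfish.match_rating_comparison(u'Michael', u'Mike'))       # False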
def get_name_similarity(self, candidate):
    import jellyfish
    return {
        # Phonetic distance
        'mra': jellyfish.match_rating_comparison(self.app_name.replace(' ', ''),
                                                 candidate.app_name.replace(' ', '')),
        # String distance
        'jaro': jellyfish.jaro_winkler(self.app_name, candidate.app_name),
    }
def determine_matching_stats(string1: str, string2: str) -> MatchingStats:
    """Determines the different distances between two strings."""
    return MatchingStats(
        string1=string1,
        string2=string2,
        damerau_levenshtein_distance=jellyfish.damerau_levenshtein_distance(
            string1, string2),
        jaro_winkler_distance=jellyfish.jaro_winkler(string1, string2),
        match_rating_approach_comparison=jellyfish.match_rating_comparison(
            string1, string2),
        exact_match=string1.strip().lower() == string2.strip().lower(),
    )
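# MatchingStats itself is not part of this snippet. A plausible minimal
# definition, assuming a plain dataclass whose fields mirror the keyword
# arguments used above (Optional[bool] reflects that match_rating_comparison
# can return None for non-comparable strings):
from dataclasses import dataclass
from typing import Optional

@dataclass
class MatchingStats:
    string1: str
    string2: str
    damerau_levenshtein_distance: int
    jaro_winkler_distance: float
    match_rating_approach_comparison: Optional[bool]
    exact_match: bool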
def mra_1_to_all(word, all_words, threshold):
    similar_list = []
    for w2 in all_words:
        if word == w2:
            # Skip -- same word.
            continue
        # Must be similar according to the Match Rating comparison
        # (similarity on MRA codes)...
        if jellyfish.match_rating_comparison(word, w2):
            # ...and the Jaro-Winkler score must also reach the threshold.
            if jellyfish.jaro_winkler_similarity(word, w2) >= threshold:
                similar_list.append(w2)
    return similar_list
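# Usage sketch for mra_1_to_all; the word list and threshold are made up for
# illustration, and the exact members returned depend on the scores:
import jellyfish

words = [u'smith', u'smyth', u'smithe', u'stone']
print(mra_1_to_all(u'smith', words, threshold=0.8))
# -> something like ['smyth', 'smithe']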
def getSimilarity(str1, str2):
    distance = {}
    if distance_metric1 == "JaroWinkler":
        distance[distance_metric1] = jellyfish.jaro_winkler(str1, str2)
    if distance_metric2 == "Jaro":
        distance[distance_metric2] = jellyfish.jaro_distance(str1, str2)
    if distance_metric3 == "MatchRating":
        distance[distance_metric3] = jellyfish.match_rating_comparison(str1, str2)
    if distance_metric4 == "Levenshtein":
        distance[distance_metric4] = jellyfish.levenshtein_distance(str1, str2)
    if distance_metric5 == "Hamming":
        distance[distance_metric5] = jellyfish.hamming_distance(str1, str2)
    return distance
def mrc():
    # english -----------------------------
    tokens = [
        'Ball Bearing', 'bll brng', 'Centrifugal', 'centrifigal', 'PUmp', 'pmp'
    ]
    print('Running Match Rating Codex (EN)...')

    # print tokens
    print('Tokens: ', end='')
    for i in tokens:
        print(i, ' | ', end='')

    # print codes
    print('\n', end="")
    print('Codes: ', end='')
    for i in tokens:
        print(jellyfish.match_rating_codex(i), ' | ', end='')

    # print string match comparisons
    print('\n', end="")
    print('Comparisons: ', end='')
    print('Ball Bearing, bll brng: ',
          jellyfish.match_rating_comparison('Ball Bearing', 'bll brng'))
    print('Centrifugal, centrifigal: ',
          jellyfish.match_rating_comparison('Centrifugal', 'centrifigal'))
    print('PUmp, pmp: ', jellyfish.match_rating_comparison('PUmp', 'pmp'))

    # german -----------------------------
    tokens = [
        'Kugellager', 'kugelagr', 'Zentrifugal', 'zentrifkl', 'PUmpe', 'pmp'
    ]
    print('\n\nRunning Match Rating Codex Comparison (DE)...')

    # print tokens
    print('Tokens: ', end='')
    for i in tokens:
        print(i, ' | ', end='')

    # print codes
    print('\n', end="")
    print('Codes: ', end='')
    for i in tokens:
        print(jellyfish.match_rating_codex(i), ' | ', end='')

    # print string match comparisons
    print('\n', end="")
    print('Comparisons: ', end='')
    print('Kugellager, kugelagr: ',
          jellyfish.match_rating_comparison('Kugellager', 'kugelagr'))
    print('Zentrifugal, zentrifkl: ',
          jellyfish.match_rating_comparison('Zentrifugal', 'zentrifkl'))
    print('PUmpe, pmp: ', jellyfish.match_rating_comparison('PUmpe', 'pmp'))
def distance(string_1, string_2):
    """Compute the edit distance between two strings."""
    return jsonify({
        "levenshtein": jellyfish.levenshtein_distance(string_1, string_2),
        "damerau-levenshtein": jellyfish.damerau_levenshtein_distance(
            string_1, string_2
        ),
        "jaro": jellyfish.jaro_distance(string_1, string_2),
        "jaro-winkler": jellyfish.jaro_winkler(string_1, string_2),
        # Note: despite the key name, this is match_rating_comparison
        # (a boolean), not match_rating_codex.
        "match_rating_codex": jellyfish.match_rating_comparison(
            string_1, string_2
        ),
        "sift3": pymailcheck.sift3_distance(string_1, string_2),
    })
def _word_similarity_score(a, b):
    if a == b:
        return 1.

    # Case- and whitespace-insensitive comparison.
    if a.lower().strip() == b.lower().strip():
        return 0.95

    # Penalize whitespace matching to non-whitespace.
    if ((_isspace(a) and not _isspace(b)) or
            (not _isspace(a) and _isspace(b))):
        return 0

    # Exceptions to punctuation.
    if _match_ampersand(a, b):
        return 0.85

    # Punctuation matching punctuation is nearly as good as equality...
    if _ispunc(a) and _ispunc(b):
        return 0.95

    # ...but penalize punctuation matching to non-punctuation.
    if ((_ispunc(a) and not _ispunc(b)) or
            (not _ispunc(a) and _ispunc(b))):
        return 0

    # The phonetic match functions can segfault on empty strings, so strip to
    # alphabetic characters first. It is also beneficial to match strings with
    # no alpha characters to each other (e.g., line numbers).
    a_alpha = u''.join([c for c in a if c.isalpha()])
    b_alpha = u''.join([c for c in b if c.isalpha()])
    if a_alpha == '' and b_alpha == '':
        return 0.85

    # Strings sound alike (approximate phonetic match).
    if jf.match_rating_comparison(a_alpha, b_alpha):
        return 0.9
    if jf.metaphone(a_alpha) == jf.metaphone(b_alpha):
        return 0.9
    if jf.soundex(a_alpha) == jf.soundex(b_alpha):
        return 0.9
    if jf.nysiis(a_alpha) == jf.nysiis(b_alpha):
        return 0.9

    # Fall back to the scaled Jaro-Winkler distance.
    return jf.jaro_winkler(a, b)
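# The _isspace/_ispunc/_match_ampersand helpers are not shown in this snippet.
# Plausible minimal versions, assuming the obvious semantics implied by the
# comments above (illustrative only, not the original implementations):
import string

def _isspace(s):
    # True when the token is empty or entirely whitespace.
    return s.strip() == ''

def _ispunc(s):
    # True when the token is non-empty and consists only of punctuation.
    return s != '' and all(c in string.punctuation for c in s)

def _match_ampersand(a, b):
    # Treat '&' and the word 'and' as an acceptable cross-match.
    return {a.lower().strip(), b.lower().strip()} == {u'&', u'and'}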
def gen_corrections(only_last, uncorrected_string):
    global transcript_corrections, expected_word, current_word_number_temporary_offset, \
        transcript_variations, transcript_variations_temporary

    words = uncorrected_string.split(' ')
    while '' in words:
        words.remove('')
    print("Gen corrections: {}".format(words))

    if only_last:
        words_to_check = [-1]
    else:
        words_to_check = range(0, len(words))

    for i in words_to_check:
        set_expected_word(current_word_number + i)
        found_correction = False  # initial value
        found_variation = False   # initial value

        # Check whether the word is already correct.
        if words[i] == expected_word:
            pass
        elif jellyfish.match_rating_comparison(words[i], expected_word):
            # Phonetically close: indicate the correction to be made.
            found_correction = True
        else:
            found_variation = True

        if found_correction or found_variation:
            o = correction()
            o.expected_index = current_word_number + i
            o.old_string = words[i]
            o.new_string = expected_word
            if found_correction:
                transcript_corrections.append(o)
            if found_variation:
                if only_last:
                    transcript_variations_temporary.append(o)
                else:
                    transcript_variations.append(o)
def string_comparison(self, text1, text2, choice='levenshtein_distance'):
    '''
    text1: first input string
    text2: second input string
    choice: one of 'levenshtein_distance', 'damerau_levenshtein_distance',
            'hamming_distance', 'jaro_distance', 'jaro_winkler' or
            'match_rating_comparison'
    '''
    # https://jellyfish.readthedocs.io/en/latest/comparison.html
    if choice == 'levenshtein_distance':
        return jellyfish.levenshtein_distance(text1, text2)
    elif choice == 'damerau_levenshtein_distance':
        return jellyfish.damerau_levenshtein_distance(text1, text2)
    elif choice == 'hamming_distance':
        return jellyfish.hamming_distance(text1, text2)
    elif choice == 'jaro_distance':
        return jellyfish.jaro_distance(text1, text2)
    elif choice == 'jaro_winkler':
        return jellyfish.jaro_winkler(text1, text2)
    elif choice == 'match_rating_comparison':
        return jellyfish.match_rating_comparison(text1, text2)
    else:
        print("Wrong Choice")
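# Usage sketch (the enclosing class is not shown in the snippet; `sc` stands
# in for a hypothetical instance of it):
#   sc.string_comparison(u'jellyfish', u'smellyfish')                          # -> 2
#   sc.string_comparison(u'jellyfish', u'smellyfish', choice='jaro_winkler')
#   sc.string_comparison(u'jellyfish', u'smellyfish', choice='match_rating_comparison')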
def generate_data(words):
    word_dict = {}
    for w1 in words:
        matching_words = []
        for w2 in words:
            if w1 == w2:
                continue
            # Candidates must first pass the phonetic Match Rating comparison...
            if jellyfish.match_rating_comparison(w1, w2):
                # ...then get scored with Jaro-Winkler.
                jaro_winkler_score = jellyfish.jaro_winkler(w1, w2)
                if jaro_winkler_score > 0.0:
                    matching_words.append((w2, jaro_winkler_score))
        # Sort matches best-first once the inner loop is done. (The original
        # sorted inside the loop, guarded by j == len(words) - 1, which
        # silently skipped words whose last candidate was the word itself.)
        word_dict[w1] = sorted(matching_words, key=lambda tup: tup[1],
                               reverse=True)
    return word_dict
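# Usage sketch for generate_data with a made-up word list; the scores shown
# are illustrative:
data = generate_data([u'smith', u'smyth', u'smithe', u'jones'])
# data[u'smith'] is a best-first list, e.g. [(u'smithe', 0.96...), (u'smyth', 0.88...)]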
def calcula_similaridade(documents):
    """
    documents = ["The legal system is made up of civil courts, criminal courts and specialty courts such as family law courts and bankruptcy court. Each court has its own jurisdiction, which refers to the cases that the court is allowed to hear. In some instances, a case can only be heard in one type of court. For example, a bankruptcy case must be heard in a bankruptcy court. In other instances, there may be several potential courts with jurisdiction. For example, a federal criminal court and a state criminal court would each have jurisdiction over a crime that is a federal drug offense but that is also an offense on the state level.",
                 "The legal system is comprised of criminal and civil courts and specialty courts like bankruptcy and family law courts. Every one of the courts is vested with its own jurisdiction. Jurisdiction means the types of cases each court is permitted to rule on. Sometimes, only one type of court can hear a particular case. For instance, bankruptcy cases can be ruled on only in bankruptcy court. In other situations, it is possible for more than one court to have jurisdiction. For instance, both a state and federal criminal court could have authority over a criminal case that is illegal under federal and state drug laws.",
                 "In many jurisdictions the judicial branch has the power to change laws through the process of judicial review. Courts with judicial review power may annul the laws and rules of the state when it finds them incompatible with a higher norm, such as primary legislation, the provisions of the constitution or international law. Judges constitute a critical force for interpretation and implementation of a constitution, thus de facto in common law countries creating the body of constitutional law."]
    """
    shingles = []
    # Handle the documents one by one: build a list of sets, each made of
    # k-word shingle strings.
    for doc in documents:
        # sh is a set of tokens for this document.
        sh = make_a_set_of_tokens(doc)
        # shingles: list of sets (sh)
        shingles.append(sh)
    # print("shingles=%s" % shingles)

    combinations = list(
        itertools.combinations([x for x in range(len(shingles))], 2))
    # print("combinations=", combinations)

    # Compare each pair of shingle sets.
    for c in combinations:
        i1 = c[0]
        i2 = c[1]
        jac = jaccard_set(shingles[i1], shingles[i2])
        # print(c, ": jaccard=", jac)

    # Comparison of the whole documents (without tokenizing).
    N = len(documents)
    mtx_lv = numpy.empty((N, N,))
    mtx_lv[:] = numpy.nan
    mtx_jd = numpy.empty((N, N,))
    mtx_jd[:] = numpy.nan
    mtx_dlv = numpy.empty((N, N,))
    mtx_dlv[:] = numpy.nan
    mtx_jw = numpy.empty((N, N,))
    mtx_jw[:] = numpy.nan
    mtx_hd = numpy.empty((N, N,))
    mtx_hd[:] = numpy.nan
    mtx_mr = numpy.empty((N, N,))
    mtx_mr[:] = numpy.nan
    mtx_fuz = numpy.empty((N, N,))
    mtx_fuz[:] = numpy.nan

    comb = list(itertools.combinations([x for x in range(len(documents))], 2))
    # print("comb=", comb)
    for d in comb:
        i1 = d[0]
        i2 = d[1]
        # lv = jellyfish.levenshtein_distance(documents[i1], documents[i2])
        mtx_lv[i1][i2] = jellyfish.levenshtein_distance(documents[i1], documents[i2])
        mtx_jd[i1][i2] = jellyfish.jaro_distance(documents[i1], documents[i2])
        mtx_dlv[i1][i2] = jellyfish.damerau_levenshtein_distance(documents[i1], documents[i2])
        mtx_jw[i1][i2] = jellyfish.jaro_winkler(documents[i1], documents[i2])
        mtx_hd[i1][i2] = jellyfish.hamming_distance(documents[i1], documents[i2])
        mtx_mr[i1][i2] = jellyfish.match_rating_comparison(documents[i1], documents[i2])
        mtx_fuz[i1][i2] = fuzz.ratio(documents[i1], documents[i2])
        """
        print("\n\nlv dist", d, ":\t\t", lv)
        jd = jellyfish.jaro_distance(documents[i1], documents[i2])
        print("jaro dist", d, ":\t\t", jd)
        dlv = jellyfish.damerau_levenshtein_distance(documents[i1], documents[i2])
        print("damerau dist", d, ":\t\t", dlv)
        jw = jellyfish.jaro_winkler(documents[i1], documents[i2])
        print("jaro_wink dist", d, ":\t\t", jw)
        hd = jellyfish.hamming_distance(documents[i1], documents[i2])
        print("hamming dist", d, ":\t\t", hd)
        mr = jellyfish.match_rating_comparison(documents[i1], documents[i2])
        print("match_rat dist", d, ":\t\t", mr)
        fuz = fuzz.ratio(documents[i1], documents[i2])
        print("fuzzy dist", d, ":\t\t", fuz)
        """

    print("levenshtein_distance\n", mtx_lv)
    print("\n\njaro distance\n", mtx_jd)
    print("\n\ndamerau levenshtein distance\n", mtx_dlv)
    print("\n\njaro winkler\n", mtx_jw)
    print("\n\nhamming distance\n", mtx_hd)
    print("\n\nmatch_rating\n", mtx_mr)
    print("\n\nfuzz.ratio\n", mtx_fuz)

    import seaborn as sns
    import pandas as pd
    import matplotlib.pyplot as plt

    # Transform the upper-triangular matrix into lower-triangular for plotting.
    mtx_lv = numpy.tril(mtx_lv.T, 1)
    sns.set(style="white")
    mask = numpy.zeros_like(mtx_lv, dtype=bool)
    mask[numpy.triu_indices_from(mask)] = True
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title("levenshtein_distance")
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(mtx_lv, mask=mask, cmap=cmap, vmax=500, center=0, square=True,
                linewidths=.5, cbar_kws={"shrink": .5})
    plt.show()
def match_rating_comparison(s1, s2):
    # Returns a single value: True, False, or None (when not comparable).
    return jellyfish.match_rating_comparison(s1, s2)
def match_rating_comparison(x, y):
    """The Match Rating comparison score of the Jellyfish package."""
    return 100 if jf.match_rating_comparison(x, y) else 0
sim_arry4 = [
    1.0 - jellyfish.damerau_levenshtein_distance(unicode(string[0]), unicode(s))
    / ((len(string[0]) + len(s)) / 2.0) for s in string
]
print 'damerau', sim_arry4
sim_arry5 = [
    jellyfish.jaro_distance(unicode(string[0]), unicode(s)) for s in string
]
print 'jaro', sim_arry5
sim_arry6 = [
    jellyfish.jaro_winkler(unicode(string[0]), unicode(s)) for s in string
]
print 'jaro winkler', sim_arry6
sim_arry7 = [
    jellyfish.match_rating_comparison(unicode(string[0]), unicode(s))
    for s in string
]
print 'match rating comparison', sim_arry7
# tokens = word_tokenize([string])
# print(string_token)
# print tfidf_matrix
# print(y.toarray())
ngram_array = [word_grams(s.split(' ')) for s in string]
# print ngram_array
n = NGram()
# print list(n.split(string[0]))
ngram_array = [list(n.split(s)) for s in string]
# print ngram_array
sim_arry8 = [NGram.compare(string[0].lower(), s.lower(), N=4) for s in string]
def main():
    # declare test strings
    # rem: the u prefix is jellyfish convention (unicode strings)
    str1 = u'Jellyfish'
    str2 = u'Smellyfish'

    # test Phonetic Encoding
    print('\nPhonetic Encoding ----------------------------')

    # Metaphone
    r1 = jellyfish.metaphone(str1)
    r2 = jellyfish.metaphone(str2)
    print('Metaphone: ', r1, ", ", r2)

    # American Soundex
    r1 = jellyfish.soundex(str1)
    r2 = jellyfish.soundex(str2)
    print('Soundex: ', r1, ", ", r2)

    # NYSIIS
    r1 = jellyfish.nysiis(str1)
    r2 = jellyfish.nysiis(str2)
    print('NYSIIS: ', r1, ", ", r2)

    # Match Rating Codex
    r1 = jellyfish.match_rating_codex(str1)
    r2 = jellyfish.match_rating_codex(str2)
    print('Match Rating Codex: ', r1, ", ", r2)

    # test Stemming
    print('\nStemming -------------------------------------')
    pStr1 = u'Jellyfished'
    pStr2 = u'Smellyfishing'
    r1 = jellyfish.porter_stem(pStr1)
    r2 = jellyfish.porter_stem(pStr2)
    print('Porter Stemmer: ', r1, ", ", r2)

    # test String Comparison
    print('\nString Comparisons ---------------------------')

    # Levenshtein Distance
    r = jellyfish.levenshtein_distance(str1, str2)
    print('Levenshtein Distance: ', r)

    # Damerau-Levenshtein Distance
    r = jellyfish.damerau_levenshtein_distance(str1, str2)
    print('Damerau-Levenshtein Distance: ', r)

    # Hamming Distance
    r = jellyfish.hamming_distance(str1, str2)
    print('Hamming Distance: ', r)

    # Jaro Distance
    r = jellyfish.jaro_distance(str1, str2)
    print('Jaro Distance: ', r)

    # Jaro-Winkler Distance
    r = jellyfish.jaro_winkler(str1, str2)
    print('Jaro-Winkler Distance: ', r)

    # Match Rating Approach (comparison)
    r = jellyfish.match_rating_comparison(str1, str2)
    print('Match Rating Comparison: ', r)

    # end program
    print('Done.')
def test_match_rating_comparison_segfault(self):
    import hashlib
    sha1s = [hashlib.sha1(str(v).encode('utf-8')).hexdigest()
             for v in range(100)]
    # this segfaulted on 0.1.2
    r = [[jellyfish.match_rating_comparison(h1, h2) for h1 in sha1s]
         for h2 in sha1s]
def match_rating_comparison(d1, d2):
    return jellyfish.match_rating_comparison(d1, d2)
import pandas as pd
import jellyfish

importer_list = pd.read_csv(
    r'C:\Users\S\PycharmProjects\CompanyNames\HMRC\importsNames.csv')
importer_names = importer_list[['NAME']].drop_duplicates()

# sample_df = pd.read_csv(r'C:\Users\S\PycharmProjects\CompanyNames\data\raw\company_names.csv')
#
# x = pd.merge(sample_df, importer_names, how='inner', left_on=['CompanyName'], right_on=['NAME'])
# x = x[['NAME']].sample(100)
# x.to_csv('matched.csv', index=None)

x = pd.read_csv(r'./HMRC/matched.csv')
y = x['NAME'][0]
z = [[i, jellyfish.jaro_similarity(i, y)] for i in x['NAME'] if y != i]
z3 = [[i, jellyfish.match_rating_comparison(i, y)] for i in x['NAME'] if y != i]
z2 = pd.DataFrame(z)
def match_rating_comparison(s1, s2):
    return None if s1 is None or s2 is None else J.match_rating_comparison(s1, s2)
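# Usage sketch of the None-safe wrapper above (assuming jellyfish is imported
# as J, as the snippet implies); a missing input short-circuits to None:
import jellyfish as J

print(match_rating_comparison(u'Smith', u'Smyth'))  # True (cf. the test cases above)
print(match_rating_comparison(None, u'Smyth'))      # None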
def match_rating(query, template):
    return jellyfish.match_rating_comparison(query, template)
def getwikidatacity(_step, list_wikidataid, ne_fid, ne_xid, ne_lon, ne_lat,
                    ne_wikidataid, ne_name, ne_namealt, ne_adm0name, ne_adm1name,
                    ne_ls_name, ne_geonameid, ne_scalerank, ne_labelrank, ne_natscale):
    # The #S1#..#S9# markers select alternative place-type filters: exactly one
    # marker is stripped below, and every remaining #S-prefixed line is removed
    # before the query is sent.
    query_template = """
    PREFIX geo: <http://www.opengis.net/ont/geosparql#>
    SELECT ?place ?placeLabel ?placeDescription
        (group_concat(distinct ?pLabel ; separator = "#") as ?type_grp)
        (group_concat(distinct ?placeLabelru ; separator = "#") as ?placeLabelru)
        (group_concat(distinct ?sitelink_en ; separator = "#") as ?sitelink_en)
        (group_concat(distinct ?sitelink_es ; separator = "#") as ?sitelink_es)
        (group_concat(distinct ?sitelink_ru ; separator = "#") as ?sitelink_ru)
        (group_concat(distinct ?sitelink_zh ; separator = "#") as ?sitelink_zh)
        (group_concat(distinct ?sitelink_ceb ; separator = "#") as ?sitelink_ceb)
        (group_concat(distinct ?countryLabelx; separator = "#") as ?countryLabel)
        (SAMPLE(?sistercity) as ?sistercity_sample)
        (AVG(?distance) as ?distance )
        (MAX(?population) as ?max_population )
        (group_concat(distinct ?place_alternative ; separator = "#") as ?place_alternative_grp)
        (group_concat(distinct ?GeoNames_ID ; separator = "#") as ?GeoNames_ID_grp)
    WITH {
      SELECT DISTINCT ?place ?distance {
    #S1# ?place p:P31/ps:P31 wd:Q515.
    #S2# ?place p:P31/ps:P31 wd:Q3957.
    #S3# {?place (p:P31/wdt:P31/wdt:P279*) wd:Q532. }
    #S3# UNION {?place p:P31/ps:P31 wd:Q532. }
    #S3# UNION {?place (p:P31/wdt:P31/wdt:P279*) wd:Q15078955.}
    #S3# UNION {?place p:P31/ps:P31 wd:Q15078955.}
    #S3# UNION {
    #S3#   ?place (p:P31/wdt:P31/wdt:P279*) wd:Q486972 .
    #S3#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q131596. }.
    #S3#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q5084. }.
    #S3#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q2514025 }.
    #S3#   FILTER NOT EXISTS { ?place wdt:P36 ?capitalplace }.
    #S3#   ?place rdfs:label ?placeLabel_en FILTER (lang(?placeLabel_en) = "en").
    #S3# }
    #S3# UNION {
    #S3#   ?place p:P31/ps:P31 wd:Q486972.
    #S3#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q131596. }.
    #S3#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q5084. }.
    #S3#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q2514025 }.
    #S3#   FILTER NOT EXISTS { ?place wdt:P36 ?capitalplace }.
    #S3#   ?place rdfs:label ?placeLabel_en FILTER (lang(?placeLabel_en) = "en").
    #S3# }
    #S3# UNION {
    #S3#   ?place p:P31/ps:P31/wdt:P279* wd:Q486972.
    #S3#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q131596. }.
    #S3#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q5084. }.
    #S3#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q2514025 }.
    #S3#   FILTER NOT EXISTS { ?place wdt:P36 ?capitalplace }.
    #S3#   ?place rdfs:label ?placeLabel_en FILTER (lang(?placeLabel_en) = "en").
    #S3# }
    #S4# {?place (p:P31/wdt:P31/wdt:P279*) wd:Q2039348. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q2039348. }
    #S4# UNION {?place (p:P31/wdt:P31/wdt:P279*) wd:Q1867183. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q1867183. }
    #S4# UNION {?place wdt:P1376 ?admin_ara. }
    #S4# UNION {?place (p:P31/wdt:P31/wdt:P279*) wd:Q1637706. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q1637706. }
    #S4# UNION {?place (p:P31/wdt:P31/wdt:P279*) wd:Q16861602.}
    #S4# UNION {?place p:P31/ps:P31 wd:Q16861602.}
    #S4# UNION {?place p:P31/ps:P31 wd:Q188509. ?place p:P17/ps:P17 wd:Q408. }
    #S4# UNION {?place (p:P31/wdt:P31/wdt:P279*) wd:Q1070990. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q1070990. }
    #S4# UNION {?place p:P31/wdt:P31/wdt:P279* wd:Q748149. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q748149. }
    #S4# UNION {?place p:P31/wdt:P31/wdt:P279* wd:Q735428. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q735428. }
    #S4# UNION {?place p:P31/wdt:P31/wdt:P279* wd:Q318727. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q318727. }
    #S4# UNION {?place p:P31/wdt:P31/wdt:P279* wd:Q15284. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q15284. }
    #S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q15284. }
    #S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q532. }
    #S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q15078955.}
    #S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q498162. }
    #S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3389680. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q1639634. }
    #S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1639634. }
    #S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q2112349. }
    #S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q749622. }
    #S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q11618417. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q11618417. }
    #S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q640364. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q640364. }
    #S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q2555896. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q2555896. }
    #S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q109108. }
    #S4# UNION {?place p:P31/ps:P31 wd:Q109108. }
    #S5# {?place p:P31/ps:P31/wdt:P279* wd:Q1763214. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q1763214. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1840161. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q1840161. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q4249901. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q4249901. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3685463. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q3685463. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q12081657. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q12081657. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q27676416. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q27676416. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3076994. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q3076994. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3360771. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q3360771. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3685463. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q3685463. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q605291. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q605291. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1539014. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q1539014. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q7830262. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q7830262. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3327862. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q3327862. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q956318. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q956318. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q155239. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q155239. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q27676428. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q27676428. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q5084. ?place p:P17/ps:P17 wd:Q16. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q17305746. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q17305746. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q14762300. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q14762300. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q17366755. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q17366755. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3327873. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q3327873. }
    #S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3788231. }
    #S5# UNION {?place p:P31/ps:P31 wd:Q3788231. }
    # --- S6 -------------------
    #S6# {?place p:P31/ps:P31/wdt:P279* wd:Q6609799. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q6609799. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3685430. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q3685430. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q2679157. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q2679157. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q2989470. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q2989470. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q6593035. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q6593035. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q43742. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q43742. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q83020. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q83020. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q2706302. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q2706302. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q482821. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q482821. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q2225003. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q2225003. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q133442. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q133442. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1500350. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q1500350. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q16725943. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q16725943. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q9316670. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q9316670. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1065118. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q1065118. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1289426. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q1289426. }
    #S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1336099. }
    #S6# UNION {?place p:P31/ps:P31 wd:Q1336099. }
    #S6# {
    #S6#   ?place (p:P31/wdt:P31/wdt:P279*) wd:Q486972 .
    #S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q131596. }.
    #S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q5084. }.
    #S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q2514025 }.
    #S6#   FILTER NOT EXISTS { ?place wdt:P36 ?capitalplace }.
    #S6#   # FILTER(NOT EXISTS { ?item rdfs:label ?lang_labelx. FILTER(LANG(?lang_labelx) = "en") }).
    #S6#   ?place rdfs:label ?placeLabel_xru FILTER (lang(?placeLabel_xru) = "ru").
    #S6# }
    #S6# UNION {
    #S6#   ?place p:P31/ps:P31 wd:Q486972.
    #S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q131596. }.
    #S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q5084. }.
    #S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q2514025 }.
    #S6#   FILTER NOT EXISTS { ?place wdt:P36 ?capitalplace }.
    #S6#   # FILTER(NOT EXISTS { ?item rdfs:label ?lang_labelx. FILTER(LANG(?lang_labelx) = "en") }).
    #S6#   ?place rdfs:label ?placeLabel_xru FILTER (lang(?placeLabel_xru) = "ru").
    #S6# }
    #S6# UNION {
    #S6#   ?place p:P31/ps:P31/wdt:P279* wd:Q486972.
    #S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q131596. }.
    #S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q5084. }.
    #S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q2514025 }.
    #S6#   FILTER NOT EXISTS { ?place wdt:P36 ?capitalplace }.
    #S6#   # FILTER(NOT EXISTS { ?item rdfs:label ?lang_labelx. FILTER(LANG(?lang_labelx) = "en") }).
    #S6#   ?place rdfs:label ?placeLabel_xru FILTER (lang(?placeLabel_xru) = "ru").
    #S6# }
    #S7# FILTER EXISTS { ?place wdt:P190 ?sistercity_x.}
    #S8# VALUES ?GeoNames_ID {"3383494"}
    #S8# ?place wdt:P1566 ?GeoNames_ID.
    #S9# VALUES ?searchnames {"#ne_name#"@en "#ne_name#"@es "#ne_name#"@sv
    #S9#   "#ne_name#"@de "#ne_name#"@fr "#ne_name#"@pt
    #S9#   "#ne_name#"@it "#ne_name#"@da "#ne_name#"@pl
    #S9#   "#ne_name#"@cz "#ne_name#"@sk "#ne_name#"@hu
    #S9#   "#ne_name#"@lt "#ne_name#"@et "#ne_name#"@lv
    #S9#   "#ne_name#"@no "#ne_name#"@nl "#ne_name#"@fi }
    #S9# ?place rdfs:label ?searchnames .
        SERVICE wikibase:around {
            # "#ne_name#" , "#ne_adm0name#"
            ?place wdt:P625 ?location.
            bd:serviceParam wikibase:center "Point(16.373064 48.20833)"^^geo:wktLiteral.
            bd:serviceParam wikibase:radius "#distance#".
            bd:serviceParam wikibase:distance ?distance.
        }
      }
    } AS %places
    WHERE {
        INCLUDE %places .
        SERVICE wikibase:label { bd:serviceParam wikibase:language "en".}
        OPTIONAL {?place rdfs:label ?placeLabelru FILTER (lang(?placeLabelru)="ru").}
        OPTIONAL {?place wdt:P31 ?property. ?property rdfs:label ?pLabel FILTER (lang(?pLabel)="en").}
        OPTIONAL {?place wdt:P17 ?country. ?country rdfs:label ?countryLabelx FILTER (lang(?countryLabelx)="en").}
        OPTIONAL {?place wdt:P17 ?country.}
        OPTIONAL {?place wdt:P1566 ?GeoNames_ID.}
        OPTIONAL {?place wdt:P190 ?sistercity.}
        OPTIONAL {?place wdt:P1082 ?population.}
        OPTIONAL {?sitelink_en schema:about ?place . ?sitelink_en schema:isPartOf <https://en.wikipedia.org/>.}
        OPTIONAL {?sitelink_es schema:about ?place . ?sitelink_es schema:isPartOf <https://es.wikipedia.org/>.}
        OPTIONAL {?sitelink_ru schema:about ?place . ?sitelink_ru schema:isPartOf <https://ru.wikipedia.org/>.}
        OPTIONAL {?sitelink_zh schema:about ?place . ?sitelink_zh schema:isPartOf <https://zh.wikipedia.org/>.}
        OPTIONAL {?sitelink_ceb schema:about ?place . ?sitelink_ceb schema:isPartOf <https://ceb.wikipedia.org/>.}
        OPTIONAL {?place skos:altLabel ?place_alternative FILTER((LANG(?place_alternative)) = "en").}
    }
    GROUP BY ?place ?placeLabel ?placeDescription
    ORDER BY ?distance
    """

    q = query_template.replace('16.373064', ne_lon).replace('48.20833', ne_lat)
    q = q.replace('#ne_name#', ne_name).replace('#ne_adm0name#', ne_adm0name)
    q = q.replace('"3383494"', '"' + ne_geonameid + '"')

    # Enable exactly one of the #S1#..#S9# variants by stripping its marker.
    if 1 <= _step <= 9:
        q = q.replace('#S{}#'.format(_step), '')
    else:
        print("Internal error, _step: ", _step)
        sys.exit(1)

    # Pick the search radius: tighter in Europe (roughly lon -10..60, lat > 30),
    # wider elsewhere; step 8 (GeoNames ID lookup) always gets the widest radius.
    if (-10 <= float(ne_lon) <= 60) and (float(ne_lat) > 30):
        search_distance = {8: 1200, 9: 100}.get(_step, 50)
    else:
        search_distance = {1: 150, 2: 150, 3: 120, 8: 1200}.get(_step, 100)
    print("_step:", _step, " search_distance=", search_distance)

    # remove double spaces
    while '  ' in q:
        q = q.replace('  ', ' ')

    # remove comments (any line still starting with ' #' or '#S')
    qs = ''
    for line in q.splitlines():
        if len(line) > 0 and line[:2] != ' #' and line[:2] != '#S':
            qs += line + '\n'
    q = qs

    ts = datetime.datetime.now()
    max_score = -1000
    results = None
    retries = 0
    max_retries = 14
    while results is None and retries < max_retries:
        try:
            results = None
            sleeptime = retries * 10 + 5
            qs = q.replace('#distance#', str(search_distance))
            print("distance-ok")
            if retries > 0:
                print("Try - retries:", retries, " Distance:", search_distance,
                      " Sleeptime:", sleeptime)
            if args.filter_name != '':
                print(qs)
            sparql.setQuery(qs)
            sparql.setTimeout(2000)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
        except SPARQLExceptions.EndPointNotFound as e:
            print("ERRwikidata-SPARQLExceptions-EndPointNotFound: Retrying in (seconds) : ",
                  sleeptime, flush=True)
            time.sleep(sleeptime)
            retries += 1
            continue
        except SPARQLExceptions.EndPointInternalError as e:
            print("ERRwikidata-SPARQLExceptions-EndPointInternalError: Retrying in (seconds) : ",
                  sleeptime, flush=True)
            time.sleep(sleeptime)
            retries += 1
            # Decrease the search distance after repeated failures.
            if retries > 3:
                search_distance = int(search_distance * 0.9)
            continue
        except TimeoutError:
            print("ERRwikidata-SPARQLExceptions TimeOut : Retrying in (seconds) : ",
                  sleeptime, flush=True)
            time.sleep(sleeptime)
            retries += 1
            continue
        except SPARQLExceptions.QueryBadFormed as e:
            print("ERRwikidata-SPARQLExceptions-QueryBadFormed : Check! ", flush=True)
            return "error"
        except HTTPError as e:
            print("ERRwikidata: Got an HTTPError while querying. Retrying in (seconds) : ",
                  sleeptime, flush=True)
            time.sleep(sleeptime)
            retries += 1
            continue
        except:
            print("ERRwikidata: other error. Retrying in (seconds) : ",
                  sleeptime, flush=True)
            time.sleep(sleeptime)
            retries += 1
            continue

    if results is None and retries >= max_retries:
        print("Wikidata request failed ; system stopped! ")
        sys.exit(1)

    _runtime = (datetime.datetime.now() - ts).total_seconds()
    rc_list_wikidataid = []

    # TODO: handle an empty answer
    for result in results['results']['bindings']:
        _score = 0
        wd_id = result['place']['value'].split('/')[4]
        wd_distance = float(result['distance']['value'])
        if 'placeLabel' in result:
            wd_label = result['placeLabel']['value']
        else:
            wd_label = ''

        # Check whether this place was already queried.
        if wd_id in list_wikidataid:
            print("Already exist:", wd_id, wd_label)
            continue
        else:
            rc_list_wikidataid.append(wd_id)

        if 'placeLabelru' in result:
            wd_label_ru = result['placeLabelru']['value']
        else:
            wd_label_ru = ''
        if 'placeDescription' in result:
            wd_description = result['placeDescription']['value']
        else:
            wd_description = ''
        if 'type_grp' in result:
            wd_type = "#" + result['type_grp']['value'] + "#"
        else:
            wd_type = ''
        if 'countryLabel' in result:
            wd_countrylabel = result['countryLabel']['value']
            cldiff = -(20 - (20 * Levenshtein.jaro_winkler(
                unidecode.unidecode(ne_adm0name),
                unidecode.unidecode(wd_countrylabel))))
            # print(cldiff, ne_adm0name, wd_countrylabel)
            _score += cldiff
        else:
            wd_countrylabel = ''

        if 'sitelink_en' in result:
            wd_sitelink_en = result['sitelink_en']['value']
        else:
            wd_sitelink_en = ''
        if wd_sitelink_en != '':
            _score += 40
        else:
            _score += -120
        if 'sitelink_es' in result:
            wd_sitelink_es = result['sitelink_es']['value']
        else:
            wd_sitelink_es = ''
        if 'sitelink_ru' in result:
            wd_sitelink_ru = result['sitelink_ru']['value']
        else:
            wd_sitelink_ru = ''
        if 'sitelink_zh' in result:
            wd_sitelink_zh = result['sitelink_zh']['value']
        else:
            wd_sitelink_zh = ''
        if 'sitelink_ceb' in result:
            wd_sitelink_ceb = result['sitelink_ceb']['value']
        else:
            wd_sitelink_ceb = ''
        if wd_sitelink_en == '':
            if wd_sitelink_es != '':
                _score += 100
            elif wd_sitelink_ru != '':
                _score += 80
            elif wd_sitelink_zh != '':
                _score += 60
            elif wd_sitelink_ceb != '':
                _score += -1000  # penalty for a ceb-only import

        if 'GeoNames_ID_grp' in result:
            wd_geonames_id_grp = "#" + result['GeoNames_ID_grp']['value'] + "#"
        else:
            wd_geonames_id_grp = ''
        if 'max_population' in result:
            wd_max_population = result['max_population']['value']
            if wd_max_population != '':
                _score += 8
        else:
            wd_max_population = ''
        if 'place_alternative_grp' in result:
            wd_place_alternative_grp = "#" + result['place_alternative_grp']['value'] + "#"
        else:
            wd_place_alternative_grp = ''

        if '#' + ne_name + '#' in wd_place_alternative_grp:
            _in_altnames = 'Y'
            _score += 72
        if '#' + unidecode.unidecode(ne_name) + '#' in unidecode.unidecode(wd_place_alternative_grp):
            _in_altnames = 'Y'
            _score += 58
        else:
            _in_altnames = 'N'

        wd_has_sistercity = ""
        if 'sistercity_sample' in result:
            if result['sistercity_sample']['value'] != '':
                wd_has_sistercity = "Y"
                _score += 15

        uni_ne_name = unidecode.unidecode(ne_name)
        uni_ne_ls_name = unidecode.unidecode(ne_ls_name)
        uni_ne_namealt = unidecode.unidecode(ne_namealt)
        uni_ne_adm0name = unidecode.unidecode(ne_adm0name)
        uni_ne_adm1name = unidecode.unidecode(ne_adm1name)
        uni_wd_name = unidecode.unidecode(wd_label)

        if wd_label == wd_id and wd_label_ru != '':
            _lev_jaro_winkler_ru = Levenshtein.jaro_winkler(
                uni_ne_name, unidecode.unidecode(wd_label_ru))
        else:
            _lev_jaro_winkler_ru = 0

        _lev_ratio = Levenshtein.ratio(uni_ne_name, uni_wd_name)
        _lev_distance = Levenshtein.distance(uni_ne_name, uni_wd_name)
        _lev_jaro = Levenshtein.jaro(uni_ne_name, uni_wd_name)
        _lev_jaro_winkler = Levenshtein.jaro_winkler(uni_ne_name, uni_wd_name)
        _lev_jaro_winkler_ls = Levenshtein.jaro_winkler(uni_ne_ls_name, uni_wd_name)
        _lev_jaro_winkler_alt = Levenshtein.jaro_winkler(uni_ne_namealt, uni_wd_name)
        _lev_jaro_winkler_adm0 = Levenshtein.jaro_winkler(
            uni_ne_name + ',' + uni_ne_adm0name, uni_wd_name)
        _lev_jaro_winkler_adm1 = Levenshtein.jaro_winkler(
            uni_ne_name + ',' + uni_ne_adm1name, uni_wd_name)
        _max_lev_jaro_winkler = max(_lev_jaro_winkler, _lev_jaro_winkler_ls,
                                    _lev_jaro_winkler_alt, _lev_jaro_winkler_adm0,
                                    _lev_jaro_winkler_adm1, _lev_jaro_winkler_ru)
        _match_rating_comparison = jellyfish.match_rating_comparison(uni_ne_name, uni_wd_name)
        _damerau_levenshtein_distance = jellyfish.damerau_levenshtein_distance(uni_ne_name, uni_wd_name)
        _hamming_distance = jellyfish.hamming_distance(uni_ne_name, uni_wd_name)

        _score += _max_lev_jaro_winkler * 10

        if ne_name == wd_label:
            _name_status = 'R01-Equal'
            _score += 100
        elif ne_name.lower() == wd_label.lower():
            _name_status = 'R12-Lowcase_equal'
            _score += 99
        elif uni_ne_name == uni_wd_name:
            _name_status = 'R13-Unidecode_equal'
            _score += 90
        elif uni_ne_ls_name == uni_wd_name:
            _name_status = 'R31-ls_name eq'
            _score += 60
        elif uni_ne_namealt == uni_wd_name:
            _name_status = 'R32-namealt eq'
            _score += 60
        elif uni_ne_namealt == uni_wd_name:
            # note: duplicate of the R32 condition, so this branch never fires
            _name_status = 'R33-namealt eq'
            _score += 60
        elif _max_lev_jaro_winkler == 1.0:
            _name_status = 'R41- max(jaro_winkler)=1'
            _score += 50
        elif _max_lev_jaro_winkler >= 0.9:
            _name_status = 'R42- max(jaro_winkler) 0.9-1.0'
            _score += 40
        elif _max_lev_jaro_winkler >= 0.8:
            _name_status = 'R43- max(jaro_winkler) 0.8-0.9'
            _score += 30
        else:
            _name_status = ''

        if wd_distance < 5:
            _score += 10
        elif wd_distance < 10:
            _score += 5
        elif wd_distance > 60:
            _score += -30
        elif wd_distance > 30:
            _score += -15
        elif wd_distance > 15:
            _score += -5

        if ne_geonameid != '' and ('#' + ne_geonameid + '#' in wd_geonames_id_grp):
            _geonames_status = 'EQ'
            _score += 40
        elif (ne_geonameid != '' and ne_geonameid != '-1' and wd_geonames_id_grp != '##'
              and ('#' + ne_geonameid + '#' not in wd_geonames_id_grp)):
            _geonames_status = 'NE'
            _score += 0
        else:
            _geonames_status = 'Na'

        if (ne_wikidataid != '') and (wd_id != '') and (ne_wikidataid == wd_id):
            _wikidata_status = 'EQ'
            _score += 15
        elif (ne_wikidataid != '') and (wd_id != ''):
            _wikidata_status = 'NE'
            # a smaller wikidata id is sometimes better
            if float(ne_wikidataid[1:]) > float(wd_id[1:]):
                _score += 3
            else:
                _score += -3
        else:
            _wikidata_status = 'Na'

        if _score > max_score:
            max_score = _score

        if _score > 140:
            print("@@_score>140:", ne_name, " :: ", wd_id, wd_label,
                  wd_description, wd_type)

        c.execute("INSERT INTO wd VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,"
                  "?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                  (ne_fid, ne_wikidataid, wd_id, ne_name, wd_label,
                   ne_adm0name, wd_countrylabel, ne_adm1name, ne_ls_name, ne_namealt,
                   wd_description, wd_type, ne_geonameid, wd_geonames_id_grp,
                   _geonames_status, wd_place_alternative_grp, wd_sitelink_en,
                   wd_sitelink_es, wd_sitelink_ru, wd_sitelink_zh, wd_sitelink_ceb,
                   wd_label_ru, wd_has_sistercity, wd_max_population, wd_distance,
                   _step, _score, _name_status, _wikidata_status, _in_altnames,
                   _lev_ratio, _lev_distance, _lev_jaro, _lev_jaro_winkler,
                   ne_scalerank, ne_labelrank, ne_natscale, ne_xid, ts,
                   search_distance, retries, _runtime))

    conn.commit()
    sys.stdout.flush()

    if max_score <= 30:
        print(" Low score .. stop ", max_score)

    return list_wikidataid + rc_list_wikidataid, max_score