def find_searchedperson(first_name, last_name, national_id):
    '''
    Try super hard to match this person up. Rather return a bad match than
    no match - we can manually search bad matches.
    '''
    if national_id in persons_by_id:
        yield persons_by_id[national_id]

    last_name_norm = normalize_string(last_name)
    matches = []
    if last_name_norm in persons_by_last_name:
        matches = persons_by_last_name[last_name_norm]
    else:
        for key in persons_by_last_name.keys():
            if jaro(last_name_norm, key) > 0.9:
                matches.extend(persons_by_last_name[key])

    first_name_norm = normalize_string(first_name)
    for match in matches:
        key = normalize_string(match.first_name)
        # A match is valid if:
        # 1. We don't have a first name because only the last name was used
        #    in a failed search.
        # 2. The entire first name string is similar.
        # 3. One of the first names is similar.
        if (not key and isinstance(match, SearchedPersonNotFound)) \
                or jaro(first_name_norm, key) > 0.9 \
                or has_matching_word(first_name_norm, key):
            yield match
def compare_two_names(name1, name2, max_splits=7):
    def normalize_name(s):
        return re.sub(r"\s+", " ", s.lower().strip().replace("-", " "))

    def slugify_name(s):
        return (
            s.replace(" ", "")
            .replace(".", "")
            .replace('"', "")
            .replace("'", "")
            .replace("’", "")
        )

    name1 = normalize_name(name1)
    name2 = normalize_name(name2)

    if slugify_name(name1) == slugify_name(name2):
        return 1
    if jaro(name1, name2) > 0.95:
        return 1

    splits = name2.split(" ")
    limit = reduce(mul, range(1, max_splits + 1))
    if len(splits) > max_splits:
        print("Too many permutations for {}".format(name2))

    return max(
        jaro(name1, " ".join(opt))
        for opt in islice(permutations(splits), limit)
    )
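# A minimal usage sketch for compare_two_names (hypothetical names; assumes the
# snippet's dependencies are in scope, e.g. import re; from functools import
# reduce; from operator import mul; from itertools import islice, permutations;
# and a jaro implementation such as Levenshtein.jaro):
print(compare_two_names("Ivan Petrenko", "Petrenko Ivan"))     # 1.0 via the permutation pass
print(compare_two_names("Ivan Petrenko", "Olena Shevchenko"))  # noticeably lower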
def full_compare(name1, name2):
    name1 = _normalize_name(name1)
    name2 = _normalize_name(name2)

    slugified_name1 = _slugify_name(name1)
    slugified_name2 = _slugify_name(name2)

    if slugified_name1 == slugified_name2:
        return True
    if slugified_name1.startswith(slugified_name2) and len(slugified_name2) >= 10:
        return True
    if slugified_name2.startswith(slugified_name1) and len(slugified_name1) >= 10:
        return True
    if slugified_name1.endswith(slugified_name2) and len(slugified_name2) >= 10:
        return True
    if slugified_name2.endswith(slugified_name1) and len(slugified_name1) >= 10:
        return True
    if jaro(slugified_name1, slugified_name2) > 0.95:
        return True
    if jaro(slugified_name2, slugified_name1) > 0.95:
        return True
    if _compare_two_names(name1, name2):
        return True
    if _compare_two_names(name2, name1):
        return True

    return _thorough_compare(name1, name2) or _thorough_compare(name2, name1)
def get_jaro_to_list(first4jaro, list4jaro, factor=0.9):
    result = [[0 for x in range(len(list4jaro))] for y in range(len(first4jaro))]
    loc_data = 0.0  # If loc_data == 0, we take the first one
    loc_i = 0
    loc_j = 0
    for i, item in enumerate(first4jaro):
        for j, data in enumerate(list4jaro):
            if (item[1] == "") or (data[1] == ""):
                result[i][j] = jaro(item[0], data[0])
            else:
                result[i][j] = jaro(item[0], data[0]) * jaro(item[1], data[1])
            if result[i][j] > loc_data:
                loc_data = result[i][j]
                loc_i = i
                loc_j = j
    first2return = first4jaro[:loc_i] + first4jaro[loc_i + 1:]
    list4return = list4jaro[:loc_j] + list4jaro[loc_j + 1:]
    if (len(first2return) == 0) or (len(list4return) == 0):
        dif = abs(len(first2return) - len(list4return))
        return loc_data * loc_data * math.pow(factor, dif)
    else:
        # Pass factor through so a non-default value also applies to the recursion.
        return loc_data * loc_data * get_jaro_to_list(first2return, list4return,
                                                      factor=factor)
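# Hypothetical usage sketch for get_jaro_to_list: each element is assumed to be
# a (token, phonetic_code) pair, matching the item[0]/item[1] access above, with
# math and a jaro implementation (e.g. Levenshtein.jaro) imported. The function
# greedily pairs the best-matching tokens, recurses on the remainder, and applies
# the factor once per unmatched token.
given = [("juan", "JN"), ("garcia", "KRS")]
candidate = [("garcia", "KRS"), ("juan", "JN"), ("lopez", "LPS")]
print(get_jaro_to_list(given, candidate))  # 0.9: perfect pairs, one leftover token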
def _get_name(name, first_name, gender):
    for prefix in ('eng', 'dr', 'hr', 'phd'):
        if first_name.startswith(prefix + ' '):
            first_name = first_name[len(prefix) + 1:]
    first_name = first_name.replace(' ', '')
    if first_name in ('abd', 'abdel', 'عبد') or first_name == '':
        ns = name.split(' ')
        segs = []
        for s in ns:
            segs.append(s)
            if s not in ('abd', 'el', 'عبد'):
                break
        first_name = ' '.join(segs).replace(' ', '')
    if is_arabic(first_name):
        return first_name

    if 'female' in gender:
        gender = 'female'
    elif 'male' in gender:
        gender = 'male'
    if gender == 'unknown':
        if first_name in males_en or first_name in males:
            gender = 'male'
        elif first_name in females_en or first_name in females:
            gender = 'female'
    if gender == 'unknown':
        nearest = -1
        nearest_gender = 'unknown'
        for en_name in males_en:
            similarity = jaro(first_name, en_name)
            if similarity > nearest:
                nearest_gender = 'male'
                nearest = similarity
            if similarity == nearest and nearest_gender == 'female':
                nearest_gender = 'unknown'
        for en_name in females_en:
            similarity = jaro(first_name, en_name)
            if similarity > nearest:
                nearest_gender = 'female'
                nearest = similarity
            if similarity == nearest and nearest_gender == 'male':
                nearest_gender = 'unknown'
        gender = nearest_gender

    d = males if gender == 'male' else females if gender == 'female' else unknowns
    if first_name in d:
        return d[first_name]
    res = translator.translate(first_name, dest='ar', src='en').text
    d[first_name] = res
    return res
def get_compared_data_file(data, language="en", data_kind="surname"):
    '''
    This function will compare the given name with the current data input
    '''
    if language in LANGUAGES_FILES.keys():
        if data_kind in LANGUAGES_FILES[language].keys():
            data_in_met = adapted_doublemetaphone(data, language=language)
            total_data = []
            for word, met_value in LANGUAGES_DATA[language][data_kind].items():
                if met_value == data_in_met:
                    total_data.append(word)
            # If the value is already available, we just return it
            if data in LANGUAGES_DATA[language][data_kind].keys():
                return data, 1.0
            else:
                data_temp = data.lower()
                norm = LANGUAGES_FILES[language]["normalize"]
                for notnorm in norm.keys():
                    data_temp = data_temp.replace(notnorm, norm[notnorm])
                results = {}
                for candidate in total_data:
                    candidate_temp = candidate.lower()
                    for notnorm in norm.keys():
                        candidate_temp = candidate_temp.replace(notnorm, norm[notnorm])
                    results[candidate] = jaro(candidate_temp, data_temp)
                if any(results):
                    return max(results, key=results.get), max(results.values())
                else:
                    return data, -1.0
    return data, -1.0
def score_reconciliation(txn, payment):
    words = list(filter(None, re.split(r"\W+", txn.payee)))

    bankref_parts = [payment.bankref[:4], payment.bankref[4:]]
    bankref_distances = [ratio(w, p) for w in words for p in bankref_parts]
    # Get the two best matches, for the two parts of the bankref
    # A match gives 1.0, a 2-char substring 0.666, and a 6-char superstring 0.857
    bankref_score = sum(sorted(bankref_distances)[-2:])

    name_score = jaro(txn.payee, payment.user.name)

    other_score = 0.0
    if txn.amount == payment.amount:
        other_score += 0.4
    if txn.account.currency == payment.currency:
        other_score += 0.6
    # check posted against expiry?

    app.logger.debug(
        "Scores for txn %s payment %s: %s %s %s",
        txn.id,
        payment.id,
        bankref_score,
        name_score,
        other_score,
    )
    return bankref_score + name_score + other_score
def update(self, idx, idy, get_word):
    # ox = self.store.get(idx)
    # oy = self.store.get(idy)
    # save distance:
    # self.vec[idx,idy] =
    stime = datetime.now()
    try:
        idx = int(idx)
        idy = int(idy)
        if idx == idy:
            return
        ox = self.store.get(str(idx))
        oy = self.store.get(str(idy))
        if ox is None or oy is None:
            return
        w1 = get_word(ox)
        w2 = get_word(oy)
        simi = jaro(w1, w2)
        logger.debug("({},{}) vs ({},{}) : {}".format(idx, w1, idy, w2, simi))
        self.store.set_entry(idx, idy, simi)
    except Exception as e:
        logger.debug("(idx : {} type : {})".format(idx, type(idx)))
        raise e
    ntime = datetime.now()
    logger.debug("processing time : {}".format(ntime - stime))
def full_compare(name1, name2):
    def normalize_name(s):
        return re.sub(r"\s+", " ", s.strip().replace("-", " "))

    def slugify_name(s):
        s = (s.replace(" ", "").replace(".", "").replace('"', "")
             .replace("'", "").replace("’", "").replace("є", "е")
             .replace("i", "и").replace("ь", "").replace("'", "")
             .replace('"', "").replace('`', "").replace("’", "")
             .replace("ʼ", ""))
        return re.sub(r"\d+", "", s)

    name1 = normalize_name(name1)
    name2 = normalize_name(name2)
    slugified_name1 = slugify_name(name1)
    slugified_name2 = slugify_name(name2)

    if slugified_name1 == slugified_name2:
        return True
    if slugified_name1.startswith(slugified_name2) and len(slugified_name2) >= 10:
        return True
    if slugified_name2.startswith(slugified_name1) and len(slugified_name1) >= 10:
        return True
    if jaro(slugified_name1, slugified_name2) < 0.6:
        return False
    if jaro(slugified_name1, slugified_name2) > 0.95:
        return True
    if jaro(slugified_name2, slugified_name1) > 0.95:
        return True
    if _compare_two_names(name1, name2):
        return True
    if _compare_two_names(name2, name1):
        return True
    return False
def fetch_answer(Q_input):
    score_list = list(map(lambda x: jaro(x, Q_input), Q_list))
    highest_score = max(score_list)
    highest_score_index = score_list.index(highest_score)
    selected_answer = A_list[highest_score_index]
    score_list.pop(highest_score_index)
    second_highest_score = max(score_list)
    return selected_answer, highest_score, second_highest_score
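# Hypothetical FAQ-style usage of fetch_answer: Q_list / A_list are the
# module-level globals the function relies on, and jaro (e.g. Levenshtein.jaro)
# is assumed imported.
Q_list = ["what are your opening hours", "where is the office located"]
A_list = ["We are open 9am-5pm.", "The office is at 1 Example Street."]
answer, best, runner_up = fetch_answer("what are your opening hourz")
print(answer)            # the answer tied to the closest question
print(best > runner_up)  # True - the top match clearly beats the runner-up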
def predict_ner():
    '''
    This will return the similarity between two strings
    '''
    str1 = str(request.form['string1'])
    str2 = str(request.form['string2'])
    result = jaro(str1, str2)
    return render_template('ner.html', prediction_text='{}'.format(result))
def cmp_auth(self, s1, s2):
    if len(s1) != len(s2):
        return False, False
    matching, sorted = True, True
    sim_mat = [[jaro(s1[y], s2[x]) for x in range(len(s2))] for y in range(len(s1))]
    for i in range(len(s2)):
        if max(sim_mat[i]) != sim_mat[i][i]:
            sorted = False
        if max(sim_mat[i]) < self.threshold['author']:
            matching = False
    return matching, sorted
def compare_fingerprints(left, right):
    result = 0
    left_list = ensure_list(left.get('fingerprints'))
    right_list = ensure_list(right.get('fingerprints'))
    for (left, right) in itertools.product(left_list, right_list):
        similarity = jaro(left, right)
        score = similarity * dampen(3, 20, min(left, right, key=len))
        result = max(result, score)
    return result
def compare_names(left, right):
    result = 0
    left_list = list(_normalize_names(left.names))
    right_list = list(_normalize_names(right.names))
    for (left, right) in itertools.product(left_list, right_list):
        similarity = jaro(left, right)
        score = similarity * dampen(2, 20, shortest(left, right))
        result = max(result, score)
    return result
def compare_names(left, right):
    result = 0
    left_list = [normalize(n, latinize=True) for n in left.names]
    right_list = [normalize(n, latinize=True) for n in right.names]
    for (left, right) in itertools.product(left_list, right_list):
        similarity = jaro(left, right)
        score = similarity * dampen(2, 20, shortest(left, right))
        result = max(result, score)
    return result
def get_classify(tuple_cont):
    '''
    Determine which category the current site belongs to; this function is the
    target function for the process pool.
    '''
    try:
        if jaro(Config.ipc_r_kw.value[0], tuple_cont[0]) > Config.edit_dist_benchmark:
            Config.ipc_list_url.append(tuple_cont[1] + '\n')
    except:
        # print '#############################################'
        # print Config.ipc_r_kw.value[1]
        pass
def get_name_from_fullname(full_name, list_father_surnames, list_mother_surnames, language="en"):
    '''
    Given a full name, including surname, this function will return the first
    name of the person, removing the surname.
    '''
    merged_list = list_father_surnames + list_mother_surnames
    for surname in merged_list:
        temp_surname = surname.split(" ")
        if len(temp_surname) > 1:
            for i, _ in enumerate(temp_surname):
                if temp_surname[i] in LANGUAGES_ADDS[language]:
                    temp_surname[i] = ""
            new_surname = " ".join(temp_surname).rstrip().strip()
            if not new_surname in merged_list:
                merged_list.append(new_surname)

    merged_metaphore = []
    for data in merged_list:
        if adapted_doublemetaphone(data, language) not in merged_metaphore:
            merged_metaphore.append(adapted_doublemetaphone(data, language))

    full_name_list = get_splitted_name_from_complete_name(full_name, language)
    for i, value in enumerate(full_name_list[0]):
        # We remove the particles from each language that are used inside
        # surnames as connectors.
        check_surname = value.split(" ")
        if len(check_surname) > 1:
            for j, value in enumerate(check_surname):
                if check_surname[j].lower() in LANGUAGES_ADDS[language]:
                    check_surname[j] = ""
        adapted_surname = "".join(check_surname).rstrip()
        if (adapted_doublemetaphone(value, language) in merged_metaphore) or (
                adapted_doublemetaphone(adapted_surname, language) in merged_metaphore):
            # The metaphone algorithm is not perfect... so we cross-check data
            # that is phonetically very close but written very differently.
            similar = 0
            for compared in merged_list:
                if jaro(adapted_surname, compared) > similar:
                    similar = jaro(adapted_surname, compared)
            if similar > THRESHOLD_JARO:
                full_name_list[0][i] = ""
    return " ".join(full_name_list[0]).rstrip()
def fuzzyStr(perc, str_one, str_two):
    if all(i in str_two.split() for i in str_one.split()):
        return True
    try:
        perc = int(perc)
    except:
        return False
    if int(jaro(str_one, str_two) * 100) >= perc:
        return True
    else:
        return False
def score_of_given_name_and_meta(first4jaro, list4jaro, name1, name2, factor=0.9):
    '''
    This function will take the maximum score between the direct comparison of
    the name and the phonetic comparison
    '''
    score_compare = jaro(name1, name2)
    score_met = get_jaro_to_list(first4jaro, list4jaro, factor=factor)
    return max(score_met, score_compare * score_compare)
def compare(x, y):
    if x is None:
        return [0, x, y]
    x = normalize(x)
    x_tokens = [word.lower() for word in tokenize(x) if word.isalnum()]
    y = normalize(y)
    y_tokens = [word.lower() for word in tokenize(y) if word.isalnum()]
    ppx = untokenize(x_tokens)
    ppy = untokenize(y_tokens)
    return [jaro(ppx, ppy), ppx, ppy]
def _compare_two_names(name1, name2, max_splits=7, straight_limit=0.93, smart_limit=0.95):
    splits = name2.split(" ")
    straight_similarity = jaro(name1, name2)
    if straight_similarity > smart_limit:
        return True

    if straight_similarity > straight_limit:
        min_pair_distance = 1
        for a, b in zip_longest(name1.split(" "), splits):
            if a is not None and b is not None:
                min_pair_distance = min(jaro(a, b), min_pair_distance)
        if min_pair_distance > 0.8:
            if len(splits) > 1 and DEBUG:
                tqdm.write("Hmmm, looks like a match {}\t{}".format(name1, name2))
            return True
        else:
            if len(splits) > 1 and DEBUG:
                tqdm.write("Check if it's a match: {}\t{}".format(name1, name2))

    limit = reduce(mul, range(1, max_splits + 1))
    if len(splits) > max_splits and DEBUG:
        tqdm.write("Too many permutations for {}".format(name2))

    max_similarity = max(
        jaro(name1, " ".join(opt))
        for opt in islice(permutations(splits), limit))

    return max_similarity > smart_limit
def score_of_given_name_and_meta(first4jaro, list4jaro, name1, name2, factor=0.9):
    '''
    This function will take the maximum score between the direct comparison of
    the name and the phonetic comparison
    '''
    # Jaro produces odd results for names that differ a lot in length, so this
    # modification penalizes length differences heavily.
    len_factor = abs(len(name1) - len(name2)) / max(len(name1), len(name2))
    score_compare = jaro(name1, name2)
    score_met = get_jaro_to_list(first4jaro, list4jaro, factor=factor)
    if (len_factor < 0.33) or (1 - len_factor) * (1 - len_factor) > max(
            score_met, score_compare * score_compare):
        return max(score_met, score_compare * score_compare)
    # We only override when the length-penalized score is lower
    else:
        return (1 - len_factor) * (1 - len_factor)
def test_parse(self):
    for file in os.listdir(SAMPLE_DIR):
        if not file.endswith(".rst"):
            continue
        filename = os.path.join(SAMPLE_DIR, file)
        article = parse_article(filename)
        rendered = article.render().strip()
        with open(filename) as f:
            source = f.read().strip()
        source = source.expandtabs(4).decode("utf8")
        if source != rendered:
            lev_ = distance(source, rendered)
            jaro_ = jaro(source, rendered)
            if lev_ > 10 and jaro_ < 0.8 and file not in MUTATED_FILES:
                print("%d %f %s" % (lev_, jaro_, filename))
                raise AssertionError(filename)
def _compare_two_names(name1, name2, max_splits=7, straight_limit=0.70, smart_limit=0.96):
    straight_similarity = jaro(name1, name2)
    if straight_similarity > smart_limit:
        return True

    if straight_similarity > straight_limit:
        min_pair_distance = 1
        for a, b in zip_longest(name1.split(" "), name2.split(" ")):
            if a is not None and b is not None:
                chunk_distance = _smart_jaro(a, b, func=jaro_winkler)
                min_pair_distance = min(chunk_distance, min_pair_distance)
        if min_pair_distance > 0.88:
            return True

    return False
def match_location(self, location):
    """
    We will mutate the score a bit to add +0.1 to the jaro distance for
    starting with the same letter. Alexa's speech processing system really
    sucks at this.

    location -- the location to match against
    """
    if location:
        matches = []
        for switch_id, switch_func in self.server.switches.items():
            similarity = jaro(location, switch_id)
            if location[0].lower() == switch_id[0].lower():
                similarity += 0.1
            matches += [(similarity, switch_id, switch_func)]
        matches.sort(key=lambda x: x[0], reverse=True)
        if matches[0][0] >= _MATCH_THRESHOLD:
            return matches[0][1:]
    raise ActionParseError("I didn't understand the location. "
                           "Could you please repeat?")
def annotate(self, training_set):
    # Levenshtein distance - minimum number of single character edits
    distance_udf = udf(lambda x, y: distance(x, y), IntegerType())
    # Levenshtein ratio - similarity of two strings
    ratio_udf = udf(lambda x, y: ratio(x, y), DoubleType())
    # Jaro - similarity score
    jaro_udf = udf(lambda x, y: jaro(x, y), DoubleType())
    # Jaro-Winkler - similarity score that favors strings sharing a common prefix
    jaro_winkler_udf = udf(lambda x, y: jaro_winkler(x, y), DoubleType())
    # fuzz partial ratio - gives a score based on how well parts of a string match another
    fuzz_partial_ratio_udf = udf(
        lambda x, y: fuzz.partial_ratio(x, y) / 100, DoubleType())

    training_set = training_set.withColumn("distance", distance_udf("concept_name_1", "concept_name_2")) \
        .withColumn("ratio", ratio_udf("concept_name_1", "concept_name_2")) \
        .withColumn("jaro", jaro_udf("concept_name_1", "concept_name_2")) \
        .withColumn("jaro_wrinkler", jaro_winkler_udf("concept_name_1", "concept_name_2")) \
        .withColumn("fuzz_partial_ratio", fuzz_partial_ratio_udf("concept_name_1", "concept_name_2"))

    return training_set
def score_reconciliation(txn, payment):
    words = txn.payee.replace('-', ' ').split(' ')

    bankref_distances = [ratio(w, payment.bankref) for w in words]
    # Get the two best matches, for the two parts of the bankref
    bankref_score = sum(sorted(bankref_distances)[-2:])

    name_score = jaro(txn.payee, payment.user.name)

    other_score = 0.0
    if txn.amount == payment.amount:
        other_score += 0.4
    if txn.account.currency == payment.currency:
        other_score += 0.6
    # check posted against expiry?

    app.logger.debug('Scores for txn %s payment %s: %s %s %s',
                     txn.id, payment.id, bankref_score, name_score, other_score)

    return bankref_score + name_score + other_score
def match_locality(string, localities):
    '''
    Try to figure out which locality 'string' is by finding the known
    localities that have the maximum (jaro) similarity score
    '''
    if ',' in string:
        parts = string.split(',')
        string = parts[1].strip() + ' ' + parts[0]

    best = 0
    matches = []
    jaro_hits = []
    substring_hits = []

    for loc in localities:
        ulower_string = strip_accents(string.lower().decode("utf-8"))
        ulower_loc_name = strip_accents(loc['loc_name'].lower().decode("utf-8"))

        smaller, bigger = ulower_string, ulower_loc_name
        if len(bigger) < len(smaller):
            smaller, bigger = bigger, smaller
        if len(smaller) < len(bigger):  # they might actually be the same size
            if smaller in bigger:
                hit = dict(loc)
                hit['score'] = 'sub'
                substring_hits.append(hit)

        similarity = jaro(ulower_string, ulower_loc_name)
        if similarity > best:
            matches = []
            best = similarity
        if similarity == best:
            hit = dict(loc)
            hit['score'] = similarity
            matches.append(hit)

    jaro_hits = [(m['loc_name'], m['muni_name']) for m in matches]
    for s in substring_hits:
        if (s['loc_name'], s['muni_name']) not in jaro_hits:
            matches.append(s)

    return matches
def score_reconciliation(txn, payment):
    words = list(filter(None, re.split(r'\W+', txn.payee)))

    bankref_parts = [payment.bankref[:4], payment.bankref[4:]]
    bankref_distances = [ratio(w, p) for w in words for p in bankref_parts]
    # Get the two best matches, for the two parts of the bankref
    # A match gives 1.0, a 2-char substring 0.666, and a 6-char superstring 0.857
    bankref_score = sum(sorted(bankref_distances)[-2:])

    name_score = jaro(txn.payee, payment.user.name)

    other_score = 0.0
    if txn.amount == payment.amount:
        other_score += 0.4
    if txn.account.currency == payment.currency:
        other_score += 0.6
    # check posted against expiry?

    app.logger.debug('Scores for txn %s payment %s: %s %s %s',
                     txn.id, payment.id, bankref_score, name_score, other_score)

    return bankref_score + name_score + other_score
async def special_matches(self, monster):
    max_score = 0
    for class_attrs in self.monster_class_attributes:
        val: MonsterModel = monster
        for class_attr in class_attrs:
            val: str = getattr(val, class_attr, None)
        if val is None:
            continue
        val: str = val.lower()
        if self.match == "=" and val == self.string:
            # Exact match
            return True, MatchData(self)
        elif self.match == "r" and bool(re.search(self.string, val)):
            # Regex match
            return True, MatchData(self)
        elif self.match == "g" and fnmatch(val, '*' + self.string + '*'):
            # Glob match
            return True, MatchData(self)
        elif self.string in val:
            return True, MatchData(self)
        max_score = max(max_score, jaro(self.string, val))
    if max_score >= TOKEN_JW_DISTANCE:
        return max_score, MatchData(self)
    return False, MatchData(self)
def get_common_audios(login: str, password: str, *ids):
    # So far only works for 2 accounts
    # TODO: add lru_cache
    vk_session = vk_api.VkApi(login, password, app_id=app_id, scope=scope)
    vk_session.auth()
    # vk = vk_session.get_api()
    vkaudio = audio.VkAudio(vk_session)

    users_track_list = []
    for id in ids:
        user_track_list = set([
            track['artist'] + '-' + track['title']
            for track in vkaudio.get_iter(id)
        ])
        print("I'm not dead")
        users_track_list.append(user_track_list)

    shares = [len(i) for i in users_track_list]
    common_audios_lst = set()
    while len(users_track_list) > 1:
        common_audios_lst = set()
        users_track_list.sort(key=lambda x: len(x))
        for i in users_track_list[0]:
            for j in users_track_list[1]:
                if jaro(i, j) > 0.75:
                    common_audios_lst.add(i)
        users_track_list.pop(0)
        print("I'm not dead")
        users_track_list[0] = common_audios_lst

    print("_______________________")
    print(common_audios_lst)
    shares = [len(common_audios_lst) / i for i in shares.copy()]

    returnable_text = ''
    for i in common_audios_lst:
        returnable_text += (i + '\n')
    returnable_text += "Процент общих песен\n"  # "Percentage of shared songs"
    for i in shares:
        returnable_text += (str(round(100 * i, 2)) + r'% ')
    return returnable_text[:-1]
ddir = '/home/ngaude/workspace/data/cdiscount/'

test = pd.read_csv(ddir + 'test.csv', sep=';').fillna('')
test['lib'] = map(normalize_guess, test.Libelle.values)
test = test.sort('lib').reset_index(drop=True)

resultat = pd.read_csv(ddir + 'test.csv', sep=';').fillna('')

a = test.lib.values
b = [0]
for i in range(0, len(a) - 1):
    if len(a[i]) < 8 or len(a[i + 1]) < 8:
        b.append(0)
    else:
        b.append(jaro(a[i], a[i + 1]))

"""
plt.hist(b, bins=300, cumulative=True)
plt.show()
"""

cut_threshold = np.percentile(b, 50)
same_categorie_than_previous_item = [i > cut_threshold for i in b]
group_categorie = [0] * len(same_categorie_than_previous_item)
for i in range(1, len(same_categorie_than_previous_item)):
    if same_categorie_than_previous_item[i] == True:
        group_categorie[i] = group_categorie[i - 1]
    else:
word_phonemes = dict()
matched = dict()

# Map words to their phonemes
for i in range(len(words)):
    word_phonemes[words[i]] = phonemes[i]

print("word1,word2,semantic_similarity,phonetic_similarity,word_similarity,sem_x_phon_similarity")

for triade in semsim:
    a = triade[0]
    b = triade[1]

    # ignore duplicated (b, a)
    dup = matched.get(b + "_" + a, None)
    if dup:
        continue
    matched[a + "_" + b] = True

    # ignore pairs with the same stem
    if stem(a) == stem(b):
        continue

    # Get their phonemes
    ph1 = word_phonemes.get(a, None)
    ph2 = word_phonemes.get(b, None)
    if ph1 is None or ph2 is None:
        continue

    # Semantic similarity
    ss = float(triade[2])
    # Phonetic similarity
    ps = jaro(ph1, ph2)
    # Word similarity
    ld = jaro(a, b)

    print("%s,%s,%.4f,%.4f,%.4f,%.4f" % (a, b, ss, ps, ld, ss * ps))
    if office_holder_dict not in columbus_file:
        columbus_file.append(office_holder_dict)
# puts detroit at large ***office holder*** dicts in a separate dictList
elif office_holder_dict['OCDID'] == 'ocd-division/country:us/state:mi/place:detroit' and office_holder_dict['Office Name'] != 'Mayor':
    if office_holder_dict not in detroit_file:
        detroit_file.append(office_holder_dict)
# puts boston at large ***office holder*** dicts in a separate dictList
elif office_holder_dict['OCDID'] == 'ocd-division/country:us/state:ma/place:boston' and office_holder_dict['Office Name'] != 'Mayor':
    if office_holder_dict not in boston_file:
        boston_file.append(office_holder_dict)
# if the dicts have UIDs, and are not charlotte at large, detroit at large,
# boston at large, or columbus council members, then start string comparison
else:
    if scraped_dict['UID'] == office_holder_dict['UID']:
        if scraped_dict['official.name'] == office_holder_dict['Official Name']:
            print office_holder_dict['UID'], "scraped name: ", scraped_dict['official.name'], "file name: ", office_holder_dict['Official Name'], '\n\t>>>all good, exact match'
        elif jaro(scraped_dict['official.name'].lower().replace(' ', '').replace('"', '').replace('.', '').replace(',', ''),
                  office_holder_dict['Official Name'].lower().replace(' ', '').replace('"', '').replace('.', '').replace(',', '')) > .65:
            print office_holder_dict['UID'], scraped_dict['official.name'], office_holder_dict['Official Name'], '\n\t>>>not exact match, but high lev score'
        else:
            print "\n\t>>>found a difference!"
            print office_holder_dict['UID'], "scraped name: ", scraped_dict['official.name'], "file name: ", office_holder_dict['Official Name']
            print jaro(scraped_dict['official.name'], office_holder_dict['Official Name'])
            # answer = raw_input("\n\t>>>is this a meaningful difference? Y/N")
            # if answer == "Y" or answer == "y":
            checkList.append(office_holder_dict['UID'])

# output from initial scrape compare string comparisons
txt_file.append("\nCheck List: " + ",".join(checkList))
txt_file.append("\nNo UID for:" + ",".join(GPmissingList))
def handle(self, *args, **options):
    activate(settings.LANGUAGE_CODE)

    all_companies = []
    keys = ["pk", "code", "name", "name_en", "short_name", "short_name_en"]

    for p in Company.objects.all():
        all_companies.append(dict(zip(keys, [
            p.pk,
            p.edrpou,
            p.name_uk,
            p.name_en,
            p.short_name_uk,
            p.short_name_en,
        ])))

    grouped_by_code = defaultdict(list)
    grouped_by_name = defaultdict(list)

    # First pass: exact matches by code, full name or short name
    for l in all_companies:
        code = self.cleanup(l["code"])
        if len(code) > 2:
            grouped_by_code[code].append(l["pk"])

        for k in ["name", "name_en", "short_name", "short_name_en"]:
            name = self.cleanup(l[k])
            if len(name) > 3:
                grouped_by_name[name].append(l["pk"])

    spoiled_ids = set()
    chunks_to_review = list()

    for k, v in grouped_by_code.items():
        if len(set(v)) > 1:
            spoiled_ids |= set(v)
            chunks_to_review.append(v)

    for k, v in grouped_by_name.items():
        if len(set(v)) > 1:
            spoiled_ids |= set(v)
            chunks_to_review.append(v)

    for chunk in chunks_to_review:
        try:
            CompanyDeduplication(
                company1_id=chunk[0],
                company2_id=chunk[1],
                company1_json=Company.objects.get(pk=chunk[0]).to_dict(),
                company2_json=Company.objects.get(pk=chunk[1]).to_dict(),
            ).save()
        except IntegrityError:
            pass

    candidates_for_fuzzy = [
        l for l in all_companies if l["pk"] not in spoiled_ids
    ]

    for a, b in combinations(candidates_for_fuzzy, 2):
        for field_a, field_b in product(["name", "short_name"], repeat=2):
            val_a = self.cleanup(a[field_a])
            val_b = self.cleanup(b[field_b])
            if len(val_a) < 4 or len(val_b) < 4:
                continue
            if self.cleanup_digits(a[field_a]) == self.cleanup_digits(b[field_b]):
                continue

            score = jaro(val_a, val_b)
            if score > 0.97:
                try:
                    CompanyDeduplication(
                        company1_id=a["pk"],
                        company2_id=b["pk"],
                        company1_json=Company.objects.get(pk=a["pk"]).to_dict(),
                        company2_json=Company.objects.get(pk=b["pk"]).to_dict(),
                        fuzzy=True,
                    ).save()
                    break
                except IntegrityError:
                    pass

        for field_a, field_b in product(["name_en", "short_name_en"], repeat=2):
            val_a = self.cleanup(a[field_a])
            val_b = self.cleanup(b[field_b])
            if len(val_a) < 4 or len(val_b) < 4:
                continue
            if self.cleanup_digits(a[field_a]) == self.cleanup_digits(b[field_b]):
                continue

            score = jaro(val_a, val_b)
            if score > 0.97:
                try:
                    CompanyDeduplication(
                        company1_id=a["pk"],
                        company2_id=b["pk"],
                        company1_json=Company.objects.get(pk=a["pk"]).to_dict(),
                        company2_json=Company.objects.get(pk=b["pk"]).to_dict(),
                        fuzzy=True,
                    ).save()
                    break
                except IntegrityError:
                    pass
def has_matching_word(phrase1, phrase2):
    for word1 in phrase1.split():
        for word2 in phrase2.split():
            if jaro(word1, word2) > 0.9:
                return True
    return False
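# Small illustrative check for has_matching_word (hypothetical names; assumes a
# jaro implementation such as Levenshtein.jaro is imported): a single fuzzily
# matching token is enough.
print(has_matching_word("jonathan smith", "mr jonathon"))   # True ("jonathan" ~ "jonathon")
print(has_matching_word("jonathan smith", "maria garcia"))  # False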
def _jaro(a, b):
    """Jaro

    The Jaro string similarity metric is intended for short strings like
    personal last names."""
    return jaro(a, b)
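# Quick illustrative values (assuming jaro is Levenshtein.jaro or an equivalent
# Jaro implementation): the classic MARTHA/MARHTA pair scores about 0.944, while
# unrelated short surnames share no characters within the Jaro matching window.
print(_jaro("martha", "marhta"))  # ~0.944
print(_jaro("smith", "jones"))    # 0.0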
def run(self, entry):
    self.logger.create(entry['ID'])
    article = search(entry)
    crit, missing, support = self.check_tags(entry)

    if not support:
        self.logger.log(
            'No support for ' + entry['ENTRYTYPE'] +
            '. Currently, only the following entry types are supported: ' +
            ', '.join(set(self.tags.keys()).symmetric_difference({'all'})), 2)
        self.logger.print()
        return entry

    if not crit:
        self.logger.log('missing critical tag(s): ' + ' '.join(missing), 3)
        self.logger.print()
        return entry

    if article is None:
        self.logger.log('article could not be found on PubMed', 2)
        self.logger.print()
        return entry

    if len(missing) > 0:
        self.logger.log(
            'the following tags are missing: ' + ' '.join(missing), 1)
        self.logger.unindent()

    # compare authors list
    matching, sorted = self.cmp_auth(self.parse_auth(entry['author']), article.authors)
    if not matching:
        self.logger.log('authors mismatch:', 3)
        self.logger.log('PM: ' + ' & '.join(article.authors), 3)
        self.logger.unindent()
        self.logger.log('bib: ' + ' & '.join(self.parse_auth(entry['author'])), 3)
        self.logger.unindent()
        self.logger.unindent()
        entry['author'] = self.pack_auth(article.authors)
    elif not sorted:
        self.logger.log('authors list misordered:', 3)
        self.logger.log('PM: ' + ' & '.join(article.authors), 3)
        self.logger.unindent()
        self.logger.log('bib: ' + ' & '.join(self.parse_auth(entry['author'])), 3)
        self.logger.unindent()
        self.logger.unindent()
        entry['author'] = self.pack_auth(article.authors)

    # critical comparisons
    for tag in set(self.tags[entry['ENTRYTYPE']]).symmetric_difference({'author'}):
        idx = [entry[tag]]
        if tag in self.aliases.keys():
            for i in self.aliases[tag].keys():
                if jaro(idx[0].strip(stripsym).lower(),
                        i.strip(stripsym).lower()) > self.threshold[tag]:
                    idx.append(self.aliases[tag][i])
        if not any([
                self.threshold[tag] <= jaro(
                    article.__dict__[dictionary[tag]].strip(stripsym).lower(),
                    i.strip(stripsym).lower()) for i in idx
        ]):
            self.logger.log(tag + ' mismatch:', 3)
            self.logger.log('PM: ' + article.__dict__[dictionary[tag]].strip(stripsym), 3)
            self.logger.unindent()
            self.logger.log('bib: ' + entry[tag].strip(stripsym), 3)
            self.logger.unindent()
            self.logger.unindent()
            entry[tag] = article.__dict__[dictionary[tag]]

    self.logger.print()
    return entry
def discogs_ordered_search(query, item_type, limit=100):
    name_pattern = ' \([0-9]+\)'
    q_stripped = query.strip("'\"")

    # special case when searching directly by id
    if q_stripped.isdigit():
        url = 'http://{host}/{item_type}s/{query}'.format(
            host=DISCOGS_HOST,
            query=urllib.quote_plus(query.lower()),
            item_type=item_type
        )
        log.debug('search by id: {0}'.format(url))
        r = requests.get(url)
        if not r.status_code == 200:
            return []
        data = json.loads(r.text.replace('api.discogs.com', DISCOGS_HOST))

        # TODO: not very nice - remap some fields
        if item_type == 'release':
            if 'title' in data:
                data['title'] = re.sub(name_pattern, '', data['title'])
            if 'formats' in data:
                formats = []
                for format in [f['name'] for f in data['formats'] if 'name' in f]:
                    formats.append(format)
                data['format'] = formats
            if 'labels' in data:
                try:
                    data['catno'] = data['labels'][0]['catno']
                except KeyError:
                    pass

        if item_type == 'artist':
            if 'name' in data:
                data['title'] = re.sub(name_pattern, '', data['name'])
            if 'aliases' in data:
                aliases = []
                for alias in [a['name'] for a in data['aliases'] if 'name' in a]:
                    aliases.append(re.sub(name_pattern, '', alias))
                data['aliases'] = aliases
            if 'members' in data:
                members = []
                for member in [m['name'] for m in data['members'] if 'name' in m]:
                    members.append(re.sub(name_pattern, '', member))
                data['members'] = members
            if 'images' in data:
                for image in [i['uri150'] for i in data['images']
                              if 'type' in i and i['type'] == 'primary']:
                    data['thumb'] = image
                    break

        return [data, ]

    url = 'http://{host}/database/search?q={query}&type={item_type}&per_page=100'.format(
        host=DISCOGS_HOST,
        query=urllib.quote_plus(query.encode('utf8').lower()),
        item_type=item_type
    )

    results = []
    results_unsorted = []
    results_exact = []
    results_start = []
    results_other = []

    x = 0
    while url and x < API_MAX_REQUESTS:
        log.debug(url)
        r = requests.get(url)
        if not r.status_code == 200:
            return []
        data = json.loads(r.text.replace('api.discogs.com', DISCOGS_HOST))
        url = reduce(dict.get, ['pagination', 'urls', 'next'], data)

        for r in data['results']:
            if 'title' in r:
                title = r['title']
                formatted_title = re.sub(name_pattern, '', title)
                r['index'] = get_index(title)
                r['formatted_title'] = formatted_title
                r['uri'] = 'https://www.discogs.com%s' % r['uri']
                r['dist'] = distance(formatted_title.lower(), q_stripped.lower())
                r['dist1'] = jaro(formatted_title.lower(), q_stripped.lower())
                r['dist2'] = jaro_winkler(formatted_title.lower(), q_stripped.lower())
                r['dist3'] = ratio(formatted_title.lower(), q_stripped.lower())

                # print r['dist'],
                # print r['dist1'],
                # print r['dist2'],
                # print r['dist3'],
                # print formatted_title.lower(),
                # print '::: {0} <> {1}'.format(formatted_title.lower(), q_stripped.lower())

                results_unsorted.append(r)
                if formatted_title.lower() == q_stripped.lower():
                    # print 'exact', formatted_title.lower()
                    results_exact.append(r)
                elif formatted_title.lower().startswith(q_stripped.lower()[0:10]):
                    # print 'start', formatted_title.lower()
                    results_start.append(r)
                else:
                    # print 'other', formatted_title.lower()
                    results_other.append(r)
        x += 1

    # results = sort_results(results_exact) + sort_results(results_start) + sort_results(results_other)
    results = sort_results_by_distance(results_unsorted)

    if item_type == 'artist':
        results = populate_results(results)

    return results[0:limit]