def suggest_identifier(id, names):
    sorted_names = sorted(names, key=lambda other: jaro_winkler(id, other), reverse=True)
    if len(sorted_names) > 0:
        if jaro_winkler(id, sorted_names[0]) > 0.0 and similarity(id, sorted_names[0]) > 0.5:
            return sorted_names[0]

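# A minimal usage sketch (assuming python-Levenshtein's `jaro_winkler`; the
# `similarity` helper above is an external function): a close typo scores
# near 1.0, so it survives the cut-offs and gets suggested.
from Levenshtein import jaro_winkler

print(jaro_winkler("lenght", "length"))  # high: likely a typo of "length"
print(jaro_winkler("lenght", "count"))   # low: unrelated identifier
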
def _most_similar_ident(self, id):
    names = set()
    names.update(self.host_environment.keys())
    for typing_env in reversed(self.env_stack):
        names.update(typing_env.keys())
    sorted_names = sorted(names, key=lambda other: jaro_winkler(id, other), reverse=True)
    if len(sorted_names) > 0:
        if jaro_winkler(id, sorted_names[0]) > 0.0:
            return sorted_names[0]

def entity_similarity(left, right):
    left_name = left.get('name')
    right_name = right.get('name')
    score = 0
    if left_name is not None and right_name is not None:
        name_sim = jaro_winkler(chomp(left_name), chomp(right_name))
        score += (name_sim * 0.6)
        left_fp = fingerprints.generate(left_name)
        right_fp = fingerprints.generate(right_name)
        if left_fp is not None and right_fp is not None:
            fp_sim = jaro_winkler(left_fp, right_fp)
            score += (fp_sim * 0.4)
    return min(1.0, score)

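# Hedged arithmetic sketch of the blend above: name similarity is weighted
# 0.6, fingerprint similarity 0.4, and the sum is capped at 1.0. The numbers
# here are illustrative, not taken from real entities.
name_sim, fp_sim = 0.9, 0.8
score = min(1.0, name_sim * 0.6 + fp_sim * 0.4)
print(score)  # 0.86
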
def start(self):
    threads_ids = []
    threads = self.client.fetchThreadList()
    full_msgs = []
    for i in threads:
        if i.name in self.DANGEROUS_PEOPLE:
            a = self.client.fetchThreadMessages(i.uid)
            full_msgs.append(a)
    msgs_to_delete = []
    for i in range(len(full_msgs)):
        flag = False
        for j in range(len(full_msgs[i])):
            if full_msgs[i][j].text:
                words = full_msgs[i][j].text.split(' ')
                for l in words:
                    for m in self.DANGEROUS_WORDS:
                        if jaro_winkler(l, m) >= 0.75:
                            msgs_to_delete.append(full_msgs[i][j])
                            flag = True
                if flag:
                    break
        if flag:
            break
    for i in msgs_to_delete:
        self.client.deleteMessages(i.uid)

def detect(self, company_name, company_website=None):
    request = 'site:linkedin.com/company "%s"' % company_name
    result = self._fetch(request, company_name)
    if result is None and company_website is not None:
        company_domain = urlparse(company_website).netloc
        if company_domain != "":
            request = 'site:linkedin.com/company "%s"' % company_domain
            result = self._fetch(request, company_name)
    if result is None:
        return result
    if not LINKEDIN_URL.match(result.url):
        # sys.stderr.write("Not a linkedin url: " + result.url + "\n")
        return None
    company_identifier = LINKEDIN_URL.search(result.url).groupdict()["company"]
    # If the identifier is the universal name and not the id, we test for similarity
    try:
        int(company_identifier)
    except ValueError:
        score = jaro_winkler(normalize(company_name), normalize(company_identifier))
        if score < 0.7:
            # sys.stderr.write("%s too distant from %s (%.2f)\n" % (normalize(company_name),
            #                                                       normalize(company_identifier),
            #                                                       score))
            return None
    return result

def jarowinkler_distance(words: Iterator[str], vocabulary: Dict[str, int]):
    """Corrects the words based on Jaro-Winkler distances

    Args:
        words (Iterator[str]): iterator over the misspelled words
        vocabulary (Dict[str, int]): dictionary mapping words to their frequency
    """
    for word in words:
        distances = []
        suggestions = []
        vocab_list = list(vocabulary)
        for vocab in vocab_list:
            distances.append(jaro_winkler(word, vocab))
        # Top five candidates by similarity, highest first
        idx = np.array(distances).argsort()[::-1][:5]
        # Break similarity ties in favor of the more frequent word
        for i in range(5):
            for j in range(i + 1, 5):
                if distances[idx[i]] == distances[idx[j]]:
                    if vocabulary.get(vocab_list[idx[i]]) < vocabulary.get(vocab_list[idx[j]]):
                        idx[i], idx[j] = idx[j], idx[i]
        for i in idx:
            suggestions.append(vocab_list[i])
        output("{misspelled}\t{corrections}".format(
            misspelled=word,
            corrections="\t".join(suggestions)
        ))  # may cause IO bottleneck

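# An equivalent top-5 selection in one pass, sketched under the assumption
# that `vocabulary` is a {word: frequency} dict as the docstring says: sort
# by similarity first and break ties by corpus frequency via a tuple key.
from Levenshtein import jaro_winkler

def top5(word, vocabulary):
    ranked = sorted(vocabulary.items(),
                    key=lambda kv: (jaro_winkler(word, kv[0]), kv[1]),
                    reverse=True)
    return [w for w, _ in ranked[:5]]

print(top5("helo", {"hello": 120, "help": 80, "halo": 40,
                    "hell": 30, "yellow": 10, "world": 5}))
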
def gen_text_similarity_feature(sa, sb, prefix='',
                                ngrams_word_jaccard=[],
                                use_char_ngram_jaccard=False,
                                ngrams_char_jaccard=[3, 4, 5]):
    if not isinstance(sa, str) or not isinstance(sb, str):
        return {}
    feats = {}
    wa0 = tokenize0(sa)
    wb0 = tokenize0(sb)
    wa1 = tokenize1(sa)
    wb1 = tokenize1(sb)
    feats[prefix + 'word0_jaccard'] = jaccard(wa0, wb0)
    feats[prefix + 'word1_jaccard'] = jaccard(wa1, wb1)
    for n in ngrams_word_jaccard:
        feats[prefix + 'word0_jaccard_{}gram'.format(n)] = word_jaccard_ngram(wa0, wb0, n)
        feats[prefix + 'word1_jaccard_{}gram'.format(n)] = word_jaccard_ngram(wa1, wb1, n)
    if use_char_ngram_jaccard:
        for n in ngrams_char_jaccard:
            feats[prefix + 'char_jaccard_{}gram'.format(n)] = char_jaccard_ngram(sa, sb, n)
    feats[prefix + 'jw'] = jaro_winkler(sa, sb)
    feats[prefix + 'edit_distance_ratio'] = edit_distance(sa, sb) / (len(sa) + len(sb))
    return feats

def calc_ratio_name(token: Union[QueryToken, str], full_word: str,
                    prefix_weight: float, index: MonsterIndex) -> float:
    """Calculate the name distance between two tokens"""
    string = token.value if isinstance(token, QueryToken) else token
    mw = index.mwt_to_len[full_word] != 1
    jw = jaro_winkler(string, full_word, prefix_weight)

    if string != full_word:
        if isinstance(token, QueryToken) and token.exact:
            return 0.0
        if string.isdigit() and full_word.isdigit():
            return 0.0

    if full_word == string:
        score = 1.0
    elif len(string) >= 3 and full_word.startswith(string):
        score = .995
        if mw and jw < score:
            return score
    else:
        score = jw

    if mw:
        score = score**10 * index.mwt_to_len[full_word]

    return score

def find_assignee(self, bz_patchers, hg_patchers, bz_commenters, bz_info):
    """Find a potential assignee.

    If an email is common between patchers (people who made patches on
    bugzilla) and hg patchers then return this email. If "Foo Bar [:foobar]"
    made a patch and his hg name is "Bar Foo" return the corresponding
    Bugzilla email.
    """
    if not bz_patchers:
        # we've no patch in the bug
        # so try to find an assignee in the commenters
        bz_patchers = set(bz_commenters.keys())
    potential = set()
    hg_patchers_mail = set(mail for _, mail in hg_patchers)
    common = bz_patchers & hg_patchers_mail
    if len(common) == 1:
        # there is a common email between Bz patchers & Hg email
        return list(common)[0]

    # here we try to find at least 2 common elements
    # in the creator real name and in the hg author name
    hg_patchers_name = [self.clean_name(name) for name, _ in hg_patchers]
    for bz_patcher in bz_patchers:
        if bz_patcher not in bz_info:
            continue
        real_name = self.clean_name(bz_info[bz_patcher])
        for name in hg_patchers_name:
            if len(name & real_name) >= 2:
                potential.add(bz_patcher)

    # try to find similarities between email and name
    for name in hg_patchers_name:
        possible_mail_parts = self.mk_possible_mails(name)
        for bz_patcher in bz_patchers:
            _bz_patcher = self.clean_mail(bz_patcher)
            for part in possible_mail_parts:
                if len(part) >= 5 and part in _bz_patcher:
                    potential.add(bz_patcher)

    # try to find similarities between emails using the Jaro-Winkler metric
    for b in bz_patchers:
        _b = self.clean_mail(b)
        for h in hg_patchers_mail:
            _h = self.clean_mail(h)
            d = 1 - jaro_winkler(_b, _h)
            if d <= 0.2:
                potential.add(b)

    if potential:
        potential = list(potential)
        if len(potential) == 1:
            return potential[0]
        return max(
            ((p, bz_commenters.get(p, 0)) for p in potential),
            key=lambda x: x[1]
        )[0]

    return None

def leven_music(agg_musics):
    """
    Absorb spelling variants of song titles via jaro_winkler.

    The variants of 観覧車 could not be absorbed this way, so both
    観覧車 ~あの日と、昨日と今日と明日と~
    観覧車
    are listed separately in music_lists.
    """
    """
    music_lists = [
        'Ever Spiral','ラムネ','Blue Planet','Square of the moon','散歩日和',
        '雨のちキミと晴れ模様','アマオト','七色の空','星に想いを夜に願いを','ひとひら',
        'ありがとう','向日葵','Aozora','happiness','あした天気になあれ','アイの庭','夢遥か',
        '夜空','My song','12 Stories','Melody','蕾','青×春☆','甘い罠','Platonic syndrome',
        'Love letter','フタリ','No.51','カラフル','二人色','Revolution!','ISI','光の溢れるときには',
        '夢の通り道','コイノハナ','恋をしよーよ','手紙','アルビナ','Dear','たからもの','Snow wish',
        'With you','こころの種','Cafe','アリガト','Love Clover','クローバー','カラフルDiary',
        'Temptation (Duca Ver)','久遠の夢','たいせつなきみのために、ぼくにできるいちばんのこと',
        '僕らの日々','Only you','ADABANA -仇華-','終わりのはじまり','ニブルヘイム','祈りの虹',
        'いろんなカタチ','ツナグミライ','キミガスキ','キミの大きな手',"絶対Darli'n",'大好きだよ。',
        'シアワセ定義','愛しいキズナ','Brand-New World','Lie','風の唄','恋せよ!乙女',
        'Save the Tale',"Welcome☆Berry's",'また好きになる','赤い薔薇、銀色の月','COLD BUTTERFLY',
        'スターライン','ひとひら ゆらゆらり','幸せのオトシモノ','キミとなら','ボク恋','桜色の想い','Story',
        '君がいてくれたから','太陽とキミと',"Eden's healing",'タイムカプセル','ことば旅行','marry me?',
        'Wishing you','inertia world','My First Love','シアワセのハジマリ','Make a Wish',
        'しあわせの場所','ロケット☆ライド','シアワセsummer','Dreamer','叶えたい未来','Aria','キミとメロディ',
        'Moon Beams','Rainbow Color','恋の記憶','snow crystal','恋をしようよ Let it snow',
        'Jewel Days','メグルmerry-go-round','未来トラベリング','My Darling','想いのハーモニー','恋のAria',
        '0 ~zero~',"I'm in the side",'Nothing','恋想葬','約束','想い出のパズル','Passion',
        'アイオライト','恋するまでの時間','かさねた気持ち','eternal','雪の街 キミと','Confession Eve','雫',
        '記憶×ハジマリ','Chaser×Chaser','1/5 (ゴブンノイチ)','シアワセの理由','笑顔のレシピ','イロドリ',
        '灼熱 Heart Beat','be confidence','恋するletter','かさなるココロ','コイイロセカイ',
        'beloved story','優しい雨','キミと...','Hello,Future!','ナツコイ','あいのうた','Say to you',
        'Fate Line','キミのオト','eyes to eyes','キミへ贈るメロディー','End of the Line','Still',
        '君がいない明日','achromia','ネコイチ','Growing','内緒のホント','Blooming',
        '観覧車 ~あの日と、昨日と今日と明日と~','観覧車','candy♥girl','Dribing story',
        'ラムネ (12 Stories ver)','ラムネ -strings arrange-','星に想いを夜に願いを Ending arrange ver',
        '夢遥か Piano Arrange Ver','カラフル (ロックバージョン)','二人色 (Jump out mix)',
        'フルスロットルHeart (cobalt green Remix)','光の溢れるときには (arrange ver)',
        'いろんなカタチ Piano Arrange Ver','ロケット☆ライド (AUG Remix)',
        'カラフルDiary Piano Arrange Ver','snow crystal ~Acoustic Arrange~',
        'また好きになる Piano Arrange Ver'
    ]
    """
    music_lists = ['ロケット☆ライド', '恋をしよーよ', "Welcome☆Berry's", '観覧車']
    result = {}
    for music_name, count in agg_musics.items():
        for music_list_name in music_lists:
            if music_list_name not in result:
                result[music_list_name] = 0
            jalo = jaro_winkler(music_list_name, music_name)
            if jalo > 0.84:
                result[music_list_name] += int(count)
            else:
                continue
    return result

def match_names(name1_df, name2_df, reduce=True):
    name_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
    name_vectorizer.fit(list(name1_df['name']))
    name1_X = name_vectorizer.transform(name1_df['name'])
    name2_X = name_vectorizer.transform(name2_df['name'])
    similarity_matrix = cosine_similarity(name1_X, name2_X)

    matches = []
    for i, j in zip(*similarity_matrix.nonzero()):
        # Similarity between names
        name_sim_cs = similarity_matrix[i, j]
        if name_sim_cs < 0.5:
            continue
        match1 = name1_df.iloc[i]
        match2 = name2_df.iloc[j]
        name1 = match1['name']
        name2 = match2['name']
        name_sim_jw = jaro_winkler(name1, name2, 0.1)
        similarity = (name_sim_cs + name_sim_jw) / 2

        # Similarity between titles
        title1 = match1['title']
        title2 = match2['title']
        titles = title1.split()
        title_sim = jaccard(titles, title2.split())
        if title1 != '':
            title_weight = min(len(titles), 2) * 0.1
            similarity = (title_weight * title_sim
                          + (1 - title_weight) * similarity)

        matches.append([
            match1['id'], match1['seq'], match1['full_name'], name1, title1,
            match2['id'], match2['seq'], match2['full_name'], name2, title2,
            name_sim_cs, name_sim_jw, title_sim, similarity
        ])

    headers = [
        'id', 'seq', 'full_name', 'name', 'title',
        'match_id', 'match_seq', 'match_full_name', 'match_name', 'match_title',
        'name_similarity_cs', 'name_similarity_jw', 'title_similarity', 'similarity'
    ]
    match_df = pd.DataFrame(data=matches, columns=headers)
    if reduce:
        return match_df.groupby(['id']).apply(choose_match)
    else:
        return match_df

def _is_close(self, text, thresh=0.94):
    words = self.__preprocess(text)
    self.df["Scores"] = self.df["Profane"].apply(
        lambda x: max(jaro_winkler(w, x) for w in words))
    bword, score = self.df.sort_values("Scores", ascending=False).iloc[0].tolist()
    if score > thresh:
        return True
    return False

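# Minimal sketch of the table `_is_close` assumes: a DataFrame with a
# "Profane" column holding the blocklist. The words, entries, and scores
# here are placeholders.
import pandas as pd
from Levenshtein import jaro_winkler

df = pd.DataFrame({"Profane": ["badword", "slur"]})
words = ["badw0rd", "hello"]
df["Scores"] = df["Profane"].apply(lambda x: max(jaro_winkler(w, x) for w in words))
print(df.sort_values("Scores", ascending=False).iloc[0].tolist())
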
def lev_menu_for(word):
    with_ratio = [(jaro_winkler(word, key), key) for key in keys_without_accents.keys()]
    with_ratio.sort(reverse=True)
    options = [(ratio, key) for ratio, key in with_ratio[0:4] if ratio > 0.9]
    if options:
        print("Quizás quiso decir:")  # "Did you mean:"
        for ratio, key in options:
            print(search(key))

def calc_ratio_modifier(s1: Union[QueryToken, str], s2: str, prefix_weight: float = .05) -> float:
    """Calculate the modifier distance between two tokens"""
    if isinstance(s1, QueryToken):
        if s1.exact:
            return 1.0 if s1.value == s2 else 0.0
        s1 = s1.value
    return jaro_winkler(s1, s2, prefix_weight)

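# Sketch of what `prefix_weight` does (assuming python-Levenshtein): the
# Winkler bonus scales with the length of the shared prefix, so a larger
# weight rewards strings that start the same. The classic "martha"/"marhta"
# pair illustrates it.
from Levenshtein import jaro_winkler

a, b = "martha", "marhta"
print(jaro_winkler(a, b, 0.05))  # smaller prefix bonus
print(jaro_winkler(a, b, 0.1))   # default weight, higher score
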
def text_score(match, candidates):
    if isinstance(candidates, str):
        candidates = [candidates]
    match_n = normalize(match)
    best_score = 0
    for candidate in candidates:
        cand_n = normalize(candidate)
        score = jaro_winkler(match_n, cand_n, 0.02) * 100
        best_score = max(int(score), best_score)
    return best_score

def participant_autocomplete(request):
    ctx = request.context
    keyword = request.GET.get('q')
    if not keyword:
        raise HTTPBadRequest("please specify search terms (q)")
    limit = request.GET.get('limit', 20)
    try:
        limit = int(limit)
    except (TypeError, ValueError):
        raise HTTPBadRequest("limit must be an integer")
    if limit > 100:
        raise HTTPBadRequest("be reasonable")
    query = AgentProfile.default_db.query(
        AgentProfile.id, AgentProfile.name, User.username
    ).outerjoin(User).filter((User.verified == True) | (User.id == None))
    discussion = ctx.get_instance_of_class(Discussion)
    if discussion:
        query = query.filter(AgentProfile.id.in_(
            discussion.get_participants_query(True, True).subquery()))
    if len(keyword) < 6:
        query = query.add_column(literal(0))
        matchstr = '%'.join(keyword)
        matchstr = '%'.join(('', matchstr, ''))
        agents = query.filter(AgentProfile.name.ilike(matchstr)
                              | User.username.ilike(matchstr)
                              ).limit(limit * 5).all()
        agents.sort(key=lambda u: max(
            jaro_winkler(u[1], keyword),
            jaro_winkler(u[2], keyword) if u[2] else 0
        ), reverse=True)
        num = min(len(agents), limit)
        agents = agents[:num]
    else:
        matchstr = keyword
        query, rank = add_simple_text_search(
            query, [AgentProfile.name], keyword.split())
        agents = query.order_by(rank.desc()).limit(limit).all()
    return {'results': [{
        'id': AgentProfile.uri_generic(id),
        'text': name} for (id, name, username, rank) in agents]}

def find_similar_words(word_to_match: str):
    """Find vocabulary words similar to word_to_match (Jaro-Winkler >= 0.85)."""
    topologic_response = requests.get(
        os.path.join(TOPOLOGIC["api"], "get_all_field_values", TOPOLOGIC["dbname"]),
        params={"field": "word"})
    words = topologic_response.json()["field_values"]
    similar_words: List[Tuple[str, float]] = []
    for word in words:
        similarity = jaro_winkler(word_to_match, word, 0.15)
        if similarity >= 0.85:
            similar_words.append((word, similarity))
    similar_words.sort(key=lambda x: x[1], reverse=True)
    return [word for word, _ in similar_words]

def _find_module_match(self, pattern, exact=False):
    logging.debug('matching on {}'.format(pattern))
    matches = []
    if isinstance(pattern, unicode):
        pattern = pattern.encode('ascii', 'ignore')
    logging.debug('_find_module_match: {}'.format(pattern))
    noext = pattern.replace('.py', '').replace('.ps1', '')

    # exact is looking for a very precise name such as "vmware_guest"
    if exact:
        candidates = [pattern]
    else:
        candidates = [pattern, '_' + pattern, noext, '_' + noext]

    for k, v in self.MODULES.items():
        if v['name'] in candidates:
            logging.debug('match {} on name: {}'.format(k, v['name']))
            matches = [v]
            break

    if not matches:
        # search by key ... aka the filepath
        for k, v in self.MODULES.items():
            if k == pattern:
                logging.debug('match {} on key: {}'.format(k, k))
                matches = [v]
                break

    # spellcheck
    if not exact and not matches and '/' not in pattern:
        _pattern = pattern
        if not isinstance(_pattern, unicode):
            _pattern = _pattern.decode('utf-8')
        candidates = []
        for k, v in self.MODULES.items():
            vname = v['name']
            if not isinstance(vname, unicode):
                vname = vname.decode('utf-8')
            jw = jaro_winkler(vname, _pattern)
            if jw > .9:
                candidates.append((jw, k))
        for candidate in candidates:
            matches.append(self.MODULES[candidate[1]])

    return matches

def checkForMatches(newAgents, agents):
    # Both agent lists are treated as dicts keyed by 'name' throughout
    merged = defaultdict(dict)
    for agent in agents:
        merger = list(filter(
            lambda x: jaro_winkler(x['name'].lower(), agent['name'].lower()) > 0.8,
            newAgents))
        if len(merger) > 0:
            mergedAgent = merger[0]
            merged[mergedAgent['name']] = Agent.mergeFromDict(agent, mergedAgent)
        else:
            merged[agent['name']] = agent
    for newAgent in newAgents:
        if newAgent['name'] not in merged:
            merged[newAgent['name']] = newAgent
    return merged.values()

def calc_ratio_name(token, full_word, index2, factor=.05):
    mw = index2.mwt_to_len[full_word] != 1
    jw = jaro_winkler(token, full_word, factor)

    if full_word == token:
        score = 1
    elif len(token) >= 3 and full_word.startswith(token):
        score = .995
        if mw and jw < score:
            return score
    else:
        score = jw

    if mw:
        score = score ** 10 * index2.mwt_to_len[full_word]
    return score

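# Quick arithmetic behind the multi-word damping above: raising a sub-1.0
# score to the 10th power penalizes anything but near-perfect matches before
# re-scaling by the multi-word term count.
print(0.995 ** 10)  # ~0.951, a strong prefix match survives
print(0.9 ** 10)    # ~0.349, a merely decent match is crushed
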
def resolve_country(country_text):
    """
    Look for the best match between the informed country_text and the
    ISO 3166-1 alpha-3 codes

    :param country_text: the text about the country
    :return: the ISO 3166-1 alpha-3 code
    """
    country_text = str(country_text.lower())

    # Check if the country_text is an ISO 3166-1 alpha-3 code
    if country_text in COUNTRIES_3LETTER_CODE:
        return country_text.upper()

    # Check if the country_text is an ISO 3166-1 alpha-2 code
    if country_text in COUNTRIES_2LETTER_CODE:
        return COUNTRIES_3LETTER_CODE[
            COUNTRIES_2LETTER_CODE.index(country_text)].upper()

    # Check if the country_text is a recognized name
    if country_text in COUNTRIES_NAME:
        return COUNTRIES_3LETTER_CODE[
            COUNTRIES_NAME.index(country_text)].upper()

    # Otherwise look for the closest name to the one informed using
    # Jaro-Winkler similarity
    similarity_ratios = []
    for index, valid_country_name in enumerate(COUNTRIES_NAME):
        similarity_ratios.append((COUNTRIES_3LETTER_CODE[index].upper(),
                                  jaro_winkler(valid_country_name, country_text)))
    return max(similarity_ratios, key=lambda x: x[1])[0]

def is_new(self, user_req=False, test=False):
    with open(self.log) as log, open(self.other_log) as other_log, \
            open(self.decisions, 'a') as decisions:
        decisions.write('Compare case: ' + self.url + '\n' + self.img_url + '\n')
        for url in other_log.readlines() + log.readlines()[::-1]:
            comp_article = cache.get(url.strip())
            if not comp_article:
                print('set cache')
                comp_article = NewArticle(url=url.strip(), compare=True)
                array = comp_article.img_array
                title = comp_article.url_title
                lang = comp_article.language
                val = CacheArticle(array, title, lang)
                cache.set(url.strip(), val, 86400)
            img_index = mse(self.img_array, comp_article.img_array)
            text_index = jaro_winkler(
                self.url_title, comp_article.url_title, 0.25)
            line = url + comp_article.img_url + '\n' + \
                'mse=' + str(img_index) + ', ' + 'text=' + str(text_index) + '\n\n'
            decisions.write(line)
            if img_index < 25:  # depends on the order of checks
                if comp_article.language != self.language:
                    decisions.write('found version\n\n')
                    self.similar_url = comp_article.url
                    return False
                if text_index == 1:
                    self.update_url = comp_article.url
                    decisions.write('update\n\n')
                    return False
        decisions.write('write to db\n\n')
        if user_req:
            with open(self.other_log, 'a') as other_log:
                other_log.write(self.url + '\n')
        print('new')
        return True

def merge_positions(self, position1, position2):
    lposition1 = strip_punctuation(position1.lower())
    lposition2 = strip_punctuation(position2.lower())
    if lposition1 == lposition2:
        return position1
    if jaro_winkler(lposition1, lposition2) >= 0.7:
        return position1
    is_fr1 = self._is_french(lposition1)
    is_fr2 = self._is_french(lposition2)
    if is_fr1 and not is_fr2:
        return position1
    elif is_fr2 and not is_fr1:
        return position2
    return position1 if len(lposition1) > len(lposition2) else position2

def annotate(self, training_set):
    # Levenshtein distance - minimum number of single character edits
    distance_udf = udf(lambda x, y: distance(x, y), IntegerType())
    # Levenshtein ratio - similarity of two strings
    ratio_udf = udf(lambda x, y: ratio(x, y), DoubleType())
    # Jaro - similarity score
    jaro_udf = udf(lambda x, y: jaro(x, y), DoubleType())
    # Jaro-Winkler - similarity score, which favors strings that match from the beginning
    jaro_winkler_udf = udf(lambda x, y: jaro_winkler(x, y), DoubleType())
    # fuzz partial ratio - gives a score based on how well parts of a string match another
    fuzz_partial_ratio_udf = udf(
        lambda x, y: fuzz.partial_ratio(x, y) / 100, DoubleType())

    training_set = training_set.withColumn("distance", distance_udf("concept_name_1", "concept_name_2")) \
        .withColumn("ratio", ratio_udf("concept_name_1", "concept_name_2")) \
        .withColumn("jaro", jaro_udf("concept_name_1", "concept_name_2")) \
        .withColumn("jaro_winkler", jaro_winkler_udf("concept_name_1", "concept_name_2")) \
        .withColumn("fuzz_partial_ratio", fuzz_partial_ratio_udf("concept_name_1", "concept_name_2"))
    return training_set

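# Minimal usage sketch, assuming a local SparkSession and python-Levenshtein:
# the same UDF pattern as `annotate`, applied to a one-row DataFrame. The
# concept names here are placeholders.
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from Levenshtein import jaro_winkler

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("heart attack", "myocardial infarction")],
                           ["concept_name_1", "concept_name_2"])
jw = udf(lambda x, y: jaro_winkler(x, y), DoubleType())
df.withColumn("jaro_winkler", jw("concept_name_1", "concept_name_2")).show()
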
def agentParser(self, agents, fields):
    outAgents = {}
    for agent in agents:
        if agent is None:
            continue
        agentParts = agent.split('|')
        while len(agentParts) < len(fields):
            agentParts.insert(1, '')
        rec = dict(zip(fields, agentParts))
        recKey = re.sub(r'[.,:\(\)]+', '', rec['name'].lower())
        if rec['name'] == '' and rec['viaf'] == '' and rec['lcnaf'] == '':
            continue
        existingMatch = False
        for oaKey, oa in outAgents.items():
            for checkField in ['viaf', 'lcnaf']:
                if rec[checkField] and rec[checkField] != '' and rec[checkField] == oa[checkField]:
                    existingMatch = True
                    SFRRecordManager.mergeAgents(oa, rec)
                    break
            if existingMatch is False:
                if jaro_winkler(oaKey, recKey) > 0.9:
                    SFRRecordManager.mergeAgents(oa, rec)
                    existingMatch = True
                    break
        if existingMatch is False:
            if 'role' in rec.keys():
                rec['roles'] = list(set([rec['role']]))
                del rec['role']
            outAgents[recKey] = rec
    return [a for _, a in outAgents.items()]

def search_schedule_matches(self, sch_tweet_id, tweet_text_tokens,
                            entities_matched, entities_token_matched,
                            matched_sch_ids):
    """
    Search for matches between schedule tweets' annotated entities and a
    ugc tweet. It returns the matches found.
    """
    # Iterate over the entities of the schedule
    for sch_entity in self.DictSched[sch_tweet_id]['entities']:
        sch_entity_strip = sch_entity[0].lower().split()
        sch_entity_type = sch_entity[1]
        # Sanity check over schedule entities annotated
        if not sch_entity_strip:
            continue
        # Get tokens matched between ugc and schedule tweet
        token_matches = [
            (t, sch_entity_type) for t in sch_entity_strip
            if [s for s in tweet_text_tokens
                if jaro_winkler(t.lower(), s.lower()) >= 0.95]
            and t not in self.stopwords and t not in string.punctuation
        ]
        # Compute score for string similarity
        score = len(token_matches) / float(len(sch_entity_strip))
        # Check if string similarity conditions are valid
        if (sch_entity_type.endswith('Contributor') and score >= self.contr_tsl) or \
                (sch_entity_type.endswith('Work') and score >= self.work_tsl):
            # Discard matches against already matched entities
            if sch_entity_strip not in entities_matched:
                entities_matched.append(sch_entity_strip)
                if token_matches not in entities_token_matched:
                    entities_token_matched += token_matches
                matched_sch_ids.append(sch_tweet_id)
    return entities_matched, entities_token_matched, matched_sch_ids

def name_match(self, name_a, name_b):
    '''
    For example, we cannot totally assert that Professor 'J. Tang' and
    Professor 'Jie Tang' are the same person. We use this function to
    estimate the probability that two professors' names belong to one
    person.
    '''
    name_a = name_a.lower().strip().replace('.', '').replace('-', '').replace(u'\xa0', '')
    name_b = name_b.lower().strip().replace('.', '').replace('-', '')
    if name_a == name_b:
        return 1
    elif name_a[0] != name_b[0]:
        return 0
    lastname_a = name_a.split(' ')[-1]
    lastname_b = name_b.split(' ')[-1]
    if lastname_a != lastname_b:
        return 0
    firstname_a = name_a.split(' ')[0]
    firstname_b = name_b.split(' ')[0]
    if len(firstname_a) != 1 and len(firstname_b) != 1:
        return 0
    return jaro_winkler(name_a, name_b)

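# Walking the rules above for 'J. Tang' vs 'Jie Tang': after normalization
# both share the first initial and the last name, and one first name is a
# single letter, so the score falls through to jaro_winkler.
from Levenshtein import jaro_winkler

print(jaro_winkler('j tang', 'jie tang'))  # the value name_match would return
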
def select_best_company(query_params, candidates):
    """
    Calculate the average similarity between the expected value for each
    parameter (name, domain) and the candidates' values for them

    :param query_params: the values to check
    :param candidates: all the companies that matched with the previous params
    :return: the candidate that matches best
    """
    logging.debug("Looking for the best match...")

    # All the similarity ratios for each param for each candidate
    ratios = []
    for index, candidate in enumerate(candidates):
        logging.debug("Candidate #%d: %s" % (index, candidate))
        candidate_ratios = []
        for param_name, param_value in query_params.items():
            if param_name in candidate and candidate[param_name]:
                candidate_ratios.append(
                    jaro_winkler(str(param_value), str(candidate[param_name])))
        # Calculate the ratio as the AVG between all the computed
        # ratios for the candidate
        ratios.append(
            (float(sum(candidate_ratios)) / (len(query_params) - 1), candidate))

    # Return the candidate with the best match
    return max(ratios, key=lambda x: x[0])[1]

def randomize_choices(self, filename):
    qs = []
    for idq, question in enumerate(self.questions):
        if question['filename'] == filename:
            qs.append([idq, question])

    # all_choices = [x[1]['answer'] for x in qs]
    all_choices = []
    for x in qs:
        answer = x[1]['answer']
        if isinstance(answer, list):
            for y in answer:
                all_choices.append(y)
        else:
            all_choices.append(answer)

    for question in qs:
        answers = question[1]['answer']
        if isinstance(answers, list):
            continue
        # import epdb; epdb.st()
        answer = question[1]['answer']
        choices = [
            answer,
            random.choice(all_choices),
            random.choice(all_choices)
        ]
        # Rank the remaining choices numerically by similarity and append
        # the two closest ones as distractors
        matches = []
        for ac in all_choices:
            if ac.lower() == answer.lower():
                continue
            jw = jaro_winkler(answer, ac)
            matches.append([jw, str(ac).lower()])
        matches = sorted(matches, key=lambda x: x[0])
        choices.append(matches[-1][1])
        choices.append(matches[-2][1])
        self.questions[question[0]]['choices'] = choices[:]

def create_jw_blocks(list_of_lawyers):
    """
    Receives list of blocks, where a block is a list of lawyers that
    all begin with the same letter. Within each block, does a pairwise
    jaro winkler comparison to block lawyers together
    """
    global blocks
    consumed = defaultdict(int)
    print('Doing pairwise Jaro-Winkler...', len(list_of_lawyers))
    for i, primary in enumerate(list_of_lawyers):
        if consumed[primary]:
            continue
        consumed[primary] = 1
        blocks[primary].append(primary)
        for secondary in list_of_lawyers[i:]:
            if consumed[secondary]:
                continue
            if primary == secondary:
                blocks[primary].append(secondary)
                continue
            if jaro_winkler(primary, secondary, 0.0) >= THRESHOLD:
                consumed[secondary] = 1
                blocks[primary].append(secondary)
    pickle.dump(blocks, open('lawyer.pickle', 'wb'))
    print('lawyer blocks created!')

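# Toy run of the same blocking idea on three names, assuming
# python-Levenshtein; names scoring at or above the threshold are absorbed
# into the first unconsumed name's block.
from collections import defaultdict
from Levenshtein import jaro_winkler

THRESHOLD = 0.9
names = ['smith, john', 'smith, jon', 'smythe, jane']
toy_blocks, seen = defaultdict(list), set()
for i, primary in enumerate(names):
    if primary in seen:
        continue
    seen.add(primary)
    toy_blocks[primary].append(primary)
    for secondary in names[i + 1:]:
        if secondary not in seen and jaro_winkler(primary, secondary, 0.0) >= THRESHOLD:
            seen.add(secondary)
            toy_blocks[primary].append(secondary)
print(dict(toy_blocks))
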
def is_new(self, user_req=False, test=False):
    '''Compare title with titles of already existing articles'''
    with open(self.log) as log, open(self.other_log) as other_log:
        for url in other_log.readlines() + log.readlines()[::-1]:
            comp_url_title = url.split('/')[-1]
            text_index = jaro_winkler(self.url_title, comp_url_title)
            if text_index == 1:
                if user_req:
                    return url
                print(url)
                self.update_url = url.strip()
                return False
    if user_req:
        with open(self.other_log, 'a') as other_log:
            other_log.write(self.url + '\n')
        return False
    return True

def update(self, idx, idy, get_word):
    '''
    idx, idy : index x and y from the enumerated items
    get_word : a function to fetch the saved structure
    '''
    # idx = int(idx.encode('utf-8'))
    # idy = int(idy.encode('utf-8'))
    # logger.debug("index : ({} TYPE={}, {} TYPE={})".format(idx, type(idx), idy, type(idy)))
    stime = datetime.now()
    try:
        idx = int(idx)
        idy = int(idy)
        if idx == idy:
            return
        ox = self.store.get(str(idx))
        oy = self.store.get(str(idy))
        if ox is None or oy is None:
            return
        w1 = get_word(ox)
        w2 = get_word(oy)
        simi = jaro_winkler(w1, w2)
        logger.debug("({},{}) vs ({},{}) : {}".format(idx, w1, idy, w2, simi))
        self.store.set_entry(idx, idy, simi)
    except Exception:
        logger.debug("(idx : {} type : {})".format(idx, type(idx)))
        raise
    ntime = datetime.now()
    logger.debug("processing time : {}".format(ntime - stime))

async def roleinfo(self, ctx: CustomContext, *, rolename: typing.Optional[str] = None):
    """
    Shows information about the given role.

    If used without any arguments, shows you a list of all roles
    supported in the bot.
    """
    if rolename is None:
        def accumulator(facroles, role):
            facroles[role().faction.category_name].append(role.name)
            return facroles

        fac_roles = reduce(accumulator, all_roles.values(), defaultdict(list))

        embed = discord.Embed()
        embed.color = 0x000000
        embed.set_author(name='All supported roles', icon_url=self.bot.user.avatar_url)
        embed.set_footer(text='For information on a specific role, use roleinfo command.')
        embed.description = ''
        for faction, roles in fac_roles.items():
            roles.sort()
            for role in roles:
                emote_name = role if faction == 'Neutral' else faction
                emote = emotes.get(emote_name, '❓')
                embed.description += '{} **{}**\n'.format(emote, role)
            embed.description += '\n'
        return await ctx.send(embed=embed)

    if rolename in role_categories:
        return await ctx.invoke(self.bot.get_command('categoryinfo'), category=rolename)

    for role in all_roles.values():
        role = role()  # initialize the class
        if role.name.lower() == rolename.lower():
            if role.__doc__ is None:
                return await ctx.send('No documentation on {} available.'.format(rolename))
            annotations = []
            annotations.append(role.faction.category_name)
            if role.unique:
                annotations.append('Unique')
            embed = discord.Embed()
            embed.color = 0x000000
            embed.set_author(name=f'{role.name} ({"; ".join(annotations)})',
                             icon_url=self.bot.user.avatar_url)
            embed.description = '```diff\n'
            embed.description += inspect.getdoc(role)
            embed.description += '```'
            embed.set_footer(text=f'Categories: {", ".join(sorted(role.categories))}')
            return await ctx.send(embed=embed)

    for role in all_roles.values():
        if jaro_winkler(role.name.lower(), rolename.lower()) > 0.85:
            await ctx.send('Couldn\'t find the role "{}". Did you mean {}?'.format(
                rolename, role.name))

            def check(msg):
                return (msg.author == ctx.author
                        and msg.content.lower() in ['yes', 'y', 'yeah'])

            try:
                response = await self.bot.wait_for('message', timeout=10.0, check=check)
                return await ctx.invoke(ctx.command, rolename=role.name)
            except asyncio.TimeoutError:
                return await ctx.send("Couldn't find that role!")

def get_similarity(uuid1, uuid2):
    clean1 = uuid_to_cleanid[uuid1]
    clean2 = uuid_to_cleanid[uuid2]
    if clean1 == clean2:
        return 1.0
    return jaro_winkler(clean1, clean2, 0.0)

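# With prefix_weight 0.0 the Winkler prefix bonus vanishes:
# JW = J + l * p * (1 - J) reduces to plain Jaro when p = 0, so this call
# should coincide with python-Levenshtein's `jaro`.
from Levenshtein import jaro, jaro_winkler

print(jaro("dixon", "dicksonx"))
print(jaro_winkler("dixon", "dicksonx", 0.0))  # same value
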
from sys import argv
from PIL import Image
from pytesseract import image_to_string
from Levenshtein import jaro_winkler
import glob

states = open("states.txt").read().split("\n")[:-1]

for infile in sorted(glob.iglob(argv[1])):
    im = Image.open(infile)
    # rect = (444, 63, 661, 116)
    rect = (383, 58, 686, 110)
    im = im.crop(rect).point(lambda p: p > 111)
    im = im.convert("1")
    # print(image_to_string(im, config="--user-words states.txt --user-patterns states.txt -c load_system_dawg=0 -c load_freq_dawg=0"))
    text = image_to_string(im, config="-l states -c tessdata_char_blacklist='0123456789' bazaar").split("\n")[0]
    if text:
        match = text if text in states else max(states, key=lambda s: jaro_winkler(text, s))
        print(infile, match, text, jaro_winkler(text, match))

def discogs_ordered_search(query, item_type, limit=100):
    name_pattern = r' \([0-9]+\)'
    q_stripped = query.strip("'\"")

    # special case when searching directly by id
    if q_stripped.isdigit():
        url = 'http://{host}/{item_type}s/{query}'.format(
            host=DISCOGS_HOST,
            query=urllib.quote_plus(query.lower()),
            item_type=item_type
        )
        log.debug('search by id: {0}'.format(url))
        r = requests.get(url)
        if not r.status_code == 200:
            return []
        data = json.loads(r.text.replace('api.discogs.com', DISCOGS_HOST))

        # TODO: not very nice - remap some fields
        if item_type == 'release':
            if 'title' in data:
                data['title'] = re.sub(name_pattern, '', data['title'])
            if 'formats' in data:
                formats = []
                for format in [f['name'] for f in data['formats'] if 'name' in f]:
                    formats.append(format)
                data['format'] = formats
            if 'labels' in data:
                try:
                    data['catno'] = data['labels'][0]['catno']
                except KeyError:
                    pass

        if item_type == 'artist':
            if 'name' in data:
                data['title'] = re.sub(name_pattern, '', data['name'])
            if 'aliases' in data:
                aliases = []
                for alias in [a['name'] for a in data['aliases'] if 'name' in a]:
                    aliases.append(re.sub(name_pattern, '', alias))
                data['aliases'] = aliases
            if 'members' in data:
                members = []
                for member in [m['name'] for m in data['members'] if 'name' in m]:
                    members.append(re.sub(name_pattern, '', member))
                data['members'] = members
            if 'images' in data:
                for image in [i['uri150'] for i in data['images']
                              if 'type' in i and i['type'] == 'primary']:
                    data['thumb'] = image
                    break

        return [data]

    url = 'http://{host}/database/search?q={query}&type={item_type}&per_page=100'.format(
        host=DISCOGS_HOST,
        query=urllib.quote_plus(query.encode('utf8').lower()),
        item_type=item_type
    )

    results = []
    results_unsorted = []
    results_exact = []
    results_start = []
    results_other = []
    x = 0
    while url and x < API_MAX_REQUESTS:
        log.debug(url)
        r = requests.get(url)
        if not r.status_code == 200:
            return []
        data = json.loads(r.text.replace('api.discogs.com', DISCOGS_HOST))
        url = reduce(dict.get, ['pagination', 'urls', 'next'], data)
        for r in data['results']:
            if 'title' in r:
                title = r['title']
                formatted_title = re.sub(name_pattern, '', title)
                r['index'] = get_index(title)
                r['formatted_title'] = formatted_title
                r['uri'] = 'https://www.discogs.com%s' % r['uri']
                r['dist'] = distance(formatted_title.lower(), q_stripped.lower())
                r['dist1'] = jaro(formatted_title.lower(), q_stripped.lower())
                r['dist2'] = jaro_winkler(formatted_title.lower(), q_stripped.lower())
                r['dist3'] = ratio(formatted_title.lower(), q_stripped.lower())
                results_unsorted.append(r)
                if formatted_title.lower() == q_stripped.lower():
                    results_exact.append(r)
                elif formatted_title.lower().startswith(q_stripped.lower()[0:10]):
                    results_start.append(r)
                else:
                    results_other.append(r)
        x += 1

    # results = sort_results(results_exact) + sort_results(results_start) + sort_results(results_other)
    results = sort_results_by_distance(results_unsorted)

    if item_type == 'artist':
        results = populate_results(results)

    return results[0:limit]
