def REACH_extraction(candidate_sentence, query_genes, Positive, Negative):
    """Classify the interaction between two query genes as positive/negative.

    Runs the REACH reader over each candidate sentence, parses the textual
    form of each extracted statement, and for the first statement whose
    (source, target) pair matches ``query_genes`` compares the statement
    type against the ``Positive``/``Negative`` vocabularies by Jaro
    similarity.

    :param candidate_sentence: dataframe with a ``sentence`` column
    :param query_genes: indexable; [0] holds source gene names, [1] targets
    :param Positive: statement-type terms treated as positive regulation
    :param Negative: statement-type terms treated as negative regulation
    :return: 'positive' or 'negative' for the first matching statement;
        implicitly None when no statement matches  # NOTE(review): confirm
    """
    all_statements = []
    for sent in candidate_sentence.itertuples():
        reach_processor = reach.process_text(sent.sentence)
        if reach_processor is not None:
            all_statements += reach_processor.statements
    for st in all_statements:
        # Statements stringify roughly as "Type(src(), trg())"; pull the
        # argument list out of the outermost parentheses.
        st = str(st)
        event1 = st[st.find("(") + 1:st.rfind(")")]
        src = event1.split(',')[0].replace('()', '').strip().upper()
        trg = event1.split(',')[1].replace('()', '').strip().upper()
        # What remains after removing the argument list is the statement type.
        mode = st.replace('(' + event1 + ')', '').strip().upper()
        if ((src in query_genes[0]) and trg in query_genes[1]):
            # Best Jaro similarity of the type against each vocabulary.
            max_pos = 0
            max_neg = 0
            for term in Positive:
                score = jellyfish.jaro_distance(term, mode)
                if (score > max_pos):
                    max_pos = score
            for term in Negative:
                score = jellyfish.jaro_distance(term, mode)
                if (score > max_neg):
                    max_neg = score
            if max_pos > max_neg:
                return 'positive'
            else:
                return 'negative'
def fix(word):
    """Print the Spanish dictionary word most similar to *word*.

    Scans the NLTK Spanish word list, tracks the maximum Jaro similarity,
    and prints the first word (in list order) that attains it.

    Fixes over the original: each similarity is computed once instead of
    twice, the unused accumulators (``op``, ``opl``, ``ind``, ``done``,
    string-containment score filtering) are gone, and the output is
    deterministic — the original printed an arbitrary member of an
    unordered set of tied candidates.
    """
    word_list = words.words(lang='es')
    best_score = 0.0
    best_matches = []  # all candidates tied at best_score, in list order
    for candidate in word_list:
        score = jellyfish.jaro_distance(word, candidate)
        if score > best_score:
            best_score = score
            best_matches = [candidate]
        elif score == best_score:
            best_matches.append(candidate)
    print(best_matches[0])
def fda_process(url, gname, HLGT_dict, SOC_dict, all_reports):
    """Collect adverse-event reports for a drug from an openFDA query.

    Keeps only physician-reported results (qualification == '1') whose
    suspect drug (drugcharacterization == '1') fuzzy-matches ``gname``
    (Jaro >= 0.8 on activesubstancename, falling back to medicinalproduct).
    For each kept report the reaction preferred terms are mapped to MedDRA
    HLGT and SOC levels and stored into ``all_reports`` keyed by
    safetyreportid.

    :param url: openFDA endpoint URL returning JSON with a 'results' list
    :param gname: drug/generic name to match (punctuation stripped here)
    :param HLGT_dict: PT -> HLGT mapping used by map_adr_to_meddra
    :param SOC_dict: PT -> SOC mapping used by map_adr_to_meddra
    :param all_reports: dict mutated in place and also returned
    """
    resp = requests.get(url=url)
    data = resp.json()
    try:
        for result in data['results']:
            try:
                if ('primarysource' in result.keys()):
                    if ('qualification' in result['primarysource'].keys()):
                        if (result['primarysource']['qualification'] == '1'):
                            for drug in result['patient']['drug']:
                                if ('drugcharacterization' in drug.keys()):
                                    if (drug['drugcharacterization'] == '1'):
                                        # Normalize the query name once per drug
                                        # (idempotent after the first pass).
                                        exclude = set(string.punctuation)
                                        gname = ''.join(ch for ch in gname if ch not in exclude)
                                        gname = gname.strip().upper()
                                        flag = 0
                                        if ('activesubstance' in drug.keys()):
                                            drugname = drug['activesubstance'][
                                                'activesubstancename'].upper()
                                            score = jellyfish.jaro_distance(
                                                gname, drugname)
                                            if (score >= 0.8):
                                                flag = 1
                                        # Fall back to the product name only when
                                        # the active substance did not match.
                                        if ('medicinalproduct' in drug.keys() and flag == 0):
                                            drugname = drug[
                                                'medicinalproduct'].upper()
                                            score = jellyfish.jaro_distance(
                                                gname, drugname)
                                            if (score >= 0.8):
                                                flag = 1
                                    else:
                                        # Not a suspect drug: skip this entry.
                                        continue
                                    if (flag == 1):
                                        reportid = result['safetyreportid']
                                        PTs = []
                                        HGLTs = []
                                        SOCs = []
                                        for reaction in result['patient']['reaction']:
                                            if ('reactionmeddrapt' in reaction.keys()):
                                                PT = reaction[
                                                    'reactionmeddrapt'].lower()
                                                PTs.append(PT)
                                                mapped_hglt = map_adr_to_meddra(
                                                    PT, HLGT_dict)
                                                HGLTs.append(mapped_hglt)
                                                mapped_soc = map_adr_to_meddra(
                                                    PT, SOC_dict)
                                                SOCs.append(mapped_soc)
                                        all_reports[reportid] = {
                                            'PTs': PTs,
                                            'HGLTs': HGLTs,
                                            'SOCs': SOCs
                                        }
            except KeyError:
                # Malformed result entry: skip it.
                continue
    except KeyError:
        # No 'results' key in the response (e.g. error payload): return as-is.
        pass
    return all_reports
def entity_cell_score(e, c):
    """Binary score for how well a SPARQL result ``e`` matches cell text ``c``.

    Returns 1.0 on a match, 0.0 otherwise.  Python-2-era code (uses the
    builtin ``unicode``).

    :param e: sparql.IRI or sparql.Literal result value
    :param c: table cell content as a string
    """
    if type(e) == sparql.IRI:
        e_v = str(e)
        # Strip the DBpedia prefix and de-underscore to get a readable label.
        if dbp_prefix in e_v:
            e_v = e_v.split(dbp_prefix)[1].replace('_', ' ')
        if not Is_Number(c):
            jd = jaro_distance(unicode(e_v), c)
            if jd > FLAGS.str_match_threshold:
                return 1.0
    elif type(e) == sparql.Literal:
        try:
            e_v = str(e)
            # object is a number: equality
            if Is_Number(e_v):
                if Is_Number(c) and float(c) == float(e_v):
                    return 1.0
            # object is date or datetime: consider the year only
            elif e.datatype in date_types:
                year = e_v.split('-')[0]
                if year in c:
                    return 1.0
            # object is text
            elif e.datatype is None:
                if not Is_Number(c):
                    jd = jaro_distance(unicode(e_v), c)
                    if jd > FLAGS.str_match_threshold:
                        return 1.0
        except UnicodeEncodeError:
            # Literal that str() could not encode: treat as no match.
            pass
    return 0.0
def get_closest_jaro(needle, haystack):
    """Return the element of *haystack* most similar to *needle* (Jaro).

    Ties keep the earliest element; an empty haystack yields None.

    Cleanup: removed the C-style semicolons and replaced the non-idiomatic
    ``== None`` comparisons with ``is None``.
    """
    closest = None
    for candidate in haystack:
        scored = (candidate, jellyfish.jaro_distance(needle, candidate))
        if closest is None or scored[1] > closest[1]:
            closest = scored
    return None if closest is None else closest[0]
def get_closest_jaro(needle, haystack):
    """Return the haystack element with the highest Jaro similarity to
    *needle*, or None when the haystack is empty.  Ties keep the first."""
    best_item = None
    best_score = -1.0
    for item in haystack:
        score = jellyfish.jaro_distance(needle, item)
        if best_item is None or score > best_score:
            best_item = item
            best_score = score
    return best_item
def _compare_inv(f, o=None):
    """Reconcile a freshly-read weapon name *f* with the previous one *o*.

    Keeps *o* when *f* is empty or already close to *o* (Jaro > .75);
    otherwise canonicalizes *f* against the known weapon list, falling back
    to *o* when nothing is similar enough.
    """
    if not f:
        return o
    if o and jfish.jaro_distance(f, o) > .75:
        return o
    if f not in weapons:
        f = common_errors.get(f, f)
        scores = [jfish.jaro_distance(f, w) for w in weapons]
        best = max(scores)
        if best <= .75:
            return o
        f = weapons[scores.index(best)]
    return f
def jelly():
    """Ad-hoc demo comparing several string-similarity metrics on two
    hospital names (Python 2 print statements)."""
    import jellyfish
    a = u'Korle Bu Teaching Hospital Sickle Cell Dept'
    b = u'Korle Bu Teaching Hospital'
    # a = u'x'
    # b = u'a'
    print jellyfish.levenshtein_distance(a, b)
    print jellyfish.jaro_distance(a, b)
    print jellyfish.damerau_levenshtein_distance(a, b)
    # print jellyfish.match_rating_comparison(a,b)
    from fuzzywuzzy import fuzz
    print fuzz.ratio(a, b)
def test_jellyfish():
    """Ad-hoc demo: print Jaro / Jaro-Winkler scores for two ad texts, raw
    and normalized, plus their NYSIIS codes, then exit."""
    text1 = 'Телефон в хорошем состоянии, трещин и сколов нет, за все время менялся только аккумулятор(поэтому заряд держит хорошо), остальное все родное, в целом работает отлично! В комплекте кабель. Обмен не интересен.'
    text2 = 'Продам телефон в хорошем состоянии Полностью рабочий есть WiFi'
    text_norm1 = ' '.join(normalize(text1))
    text_norm2 = ' '.join(normalize(text2))
    for metric in (jellyfish.jaro_distance, jellyfish.jaro_winkler):
        print(metric(text1, text2))
        print(metric(text_norm1, text_norm2))
    print(jellyfish.nysiis(text1))
    print(jellyfish.nysiis(text2))
    exit()
def fuzzyLocMatch_wGT(locList1, locList2):
    """
    Fuzzy location match using string comparision with jellyfish, can be
    applied to tweets extracted local gazetteers and ground truth or url
    extracted local gazetteers and ground truth
    :param locList1: tw extracted local gazetteers
    :param locList2: ground truth
    :return: score with tid (how reliable the tw is based only on either
        address or place name fuzzy match)
    """
    scores = []
    for entry in locList1:
        print(entry[-1])
        best = 0
        for raw_name in entry[0]:
            name1 = roadNameFormat(raw_name)
            for truth in locList2:
                name2 = roadNameFormat(truth)
                candidate = jellyfish.jaro_distance(str(name1), str(name2))
                # print(str(name1), '###', str(name2), candidate)
                best = max(best, candidate)
        scores.append((round(best, 2), entry[-1]))
    return scores
def get_similar_titles(title: str) -> list:
    """Query BakaUpdates for titles similar to *title*.

    :type title: str
    :return: list of {'title', 'link', 'similarity'} dicts sorted by
        descending Jaro similarity; duplicate titles appear once.
    """
    response = requests.get(url=BakaUpdates.SEARCH_URL,
                            params={'stype': 'title', 'search': title})
    soup = Soup(response.text, 'html.parser')
    seen_titles = []
    results = []
    for cell in soup.find_all('td', attrs={"class": "text pad col1"}):
        cleaned = BakaUpdates.clean_title(cell.text)
        # Skip repeats: the search lists the plain title first, and a later
        # duplicate is most likely the novel edition of the same work.
        if cleaned in seen_titles:
            continue
        seen_titles.append(cleaned)
        results.append({
            'title': cleaned,
            'link': cell.find_next('a', href=True)['href'],
            'similarity': jellyfish.jaro_distance(cleaned.lower(), title.lower()),
        })
    results.sort(key=lambda item: item['similarity'], reverse=True)
    return results
def ActionneEntity(name, action, myListSceneOrSwitch, conf):
    """Fuzzy-match *name* against the Domoticz scene/switch list and, when a
    device matches within MAX_JARO_DISTANCE, trigger *action* on it.

    Returns (matched: bool, device_name: str); device_name is "" when no
    device matched.

    Fix: removed an unreachable ``hermes.publish_end_session(...)`` call
    that sat after ``return True`` and referenced names (``hermes``,
    ``intent_message``) not defined in this scope.
    """
    #derived from nice work of https://github.com/iMartyn/domoticz-snips
    lowest_distance = MAX_JARO_DISTANCE
    lowest_idx = 65534  # sentinel: no device found
    lowest_name = "Unknown"
    lowest_Type = None
    MyWord = name
    DomoticzRealName = ""
    print(" - ActionneEntity: " + MyWord)
    for idx, scene in myListSceneOrSwitch.items():
        print("Scene/Schalter: " + str(scene['Name'], 'utf-8') + " idx: " + idx)
        # Jaro similarity is in [0, 1]; convert to a distance (0 = identical).
        distance = 1 - jellyfish.jaro_distance(str(scene['Name'], 'utf-8'), MyWord)
        print(" Distance is " + str(distance))
        if distance < lowest_distance:
            print(" Low enough and lowest!")
            lowest_distance = distance
            lowest_idx = idx
            lowest_name = scene['Name']
            lowest_Type = scene['Type']
    if lowest_distance < MAX_JARO_DISTANCE:
        print(" - ActionneEntity - lowest_Type: " + lowest_Type)
        DomoticzRealName = str(lowest_name, 'utf-8')
        print(" - ActionneEntity - DomoticzRealName: " + DomoticzRealName)
        print(" - ActionneEntity - lowest_idx: " + lowest_idx)
        curlCmd(lowest_idx, action, lowest_Type, conf)
        return True, DomoticzRealName
    else:
        return False, DomoticzRealName
def calc_similarity(self, target: Entity or any, distance_table: dict, threshold: float = 1) -> float: """ to calculate the similarity between this cluster and an entity, i.e the possibility of an entity to be a member of this cluster :param target: the target entity to be compared with, or the target Cluster to be compared with (error when put Cluster in param type T_T) :param distance_table: {(ent_1_uri, ent_2_uri): distance_float} where ent_1_uri < ent_2_uri :param threshold: float, return if over this threshold :return: float number in [0, 1] to represent the similarity """ # TODO: ignore outliers? or use more sophisticated methods max_similarity = 0 if isinstance(target, Entity): if target.type in self.types: for name, cnt in self.names.items(): _key = (name, target.name) if name < target.name else ( target.name, name) if _key not in distance_table: distance_table[_key] = jaro_distance(name, target.name) if max_similarity < distance_table[_key]: max_similarity = distance_table[_key] if max_similarity > threshold: return max_similarity else: if set(target.types.keys()).union(self.types.keys()): for ent_uri, ent in target.members.items(): similarity = self.calc_similarity(ent, distance_table) if max_similarity < similarity: max_similarity = similarity if max_similarity > threshold: return max_similarity return max_similarity
def get_links_edge_list(path_to_cluster_heads, path_to_output):
    """Link cluster heads into an undirected graph and dump its edge list.

    Two heads are linked when field [1] matches and either field [2] (an id)
    matches exactly, or one side has a "NIL" id and the names (field [0])
    have Jaro similarity > 0.9.  Python-2-era code (print statements,
    ``unicode``).

    :param path_to_cluster_heads: JSON file {id: [name, field1, field2, ...]}
    :param path_to_output: text file receiving the node list then one edge
        per line
    """
    cluster_heads = json.load(open(path_to_cluster_heads))
    IDs = list(cluster_heads.keys())
    G = nx.Graph()
    G.add_nodes_from(IDs)
    # Compare every unordered pair of cluster heads once.
    for i, id1 in enumerate(IDs):
        for j in range(i + 1, len(IDs)):
            id2 = IDs[j]
            if cluster_heads[id1][1] == cluster_heads[id2][1]:
                if cluster_heads[id1][2] == cluster_heads[id2][
                        2] and cluster_heads[id1][2] != '':
                    G.add_edge(id1, id2)
                elif "NIL" in cluster_heads[id1][2] or "NIL" in cluster_heads[
                        id2][2]:
                    name1 = unicode(cluster_heads[id1][0])
                    name2 = unicode(cluster_heads[id2][0])
                    #score = jf.jaro_winkler(name1,name2)
                    score = jf.jaro_distance(name1, name2)
                    if score > 0.9:
                        print cluster_heads[id1]
                        print cluster_heads[id2]
                        print ""
                        G.add_edge(id1, id2)
    with open(path_to_output, 'w') as output:
        output.write(str(G.nodes()) + '\n')
        for e in G.edges:
            output.write(str(e) + '\n')
def pair_name(name, names_and_ids, existing_names):
    """Interactively pair ``name`` with a record id.

    Known names resolve immediately from ``existing_names``.  Otherwise the
    five closest candidates (Jaro similarity, case-folded) are shown; the
    user picks one by index (Enter = first option).  ^C offers to create a
    new MP record instead; declining leaves the selection empty.

    :return: (name, selected_or_created_id_or_'')
    """
    if name in existing_names:
        return name, existing_names[name]
    options = ((jellyfish.jaro_distance(name.lower(), new_name.lower()), id_)
               for id_, new_name in names_and_ids.items())
    # Keep the five highest-scoring candidates, indexed for menu selection.
    options = tuple(enumerate(sorted(options, reverse=True)[:5]))
    selection = ''
    try:
        _, (_, selection) = options[int(input('''\
Select one of the following for {!r}. Press Enter to select the first option and ^C and Enter to skip or ^C again to exit. {}
'''.format(name, '\n'.join(map(repr, options)))) or 0)]
    except KeyboardInterrupt:
        if input('''\
Create record? [y/N] ''') == 'y':
            # Build the name in all three scripts via transliteration.
            mp = MP(name=MultilingualField(el=name,
                                           en=translit_elGrek2Latn(name),
                                           tr=translit_el2tr(name)))
            mp.insert()
            selection = mp._id
    return name, selection
def main(filename, motiflength):
    """Run the motif search over all FASTA records and report the best motif.

    :param filename: FASTA file parsed with Biopython SeqIO
    :param motiflength: motif length passed through to run()
    :return: the best (profile, score, [gapsize]) tuple (lowest score wins)
    """
    motif_list = {}
    records = list(SeqIO.parse(filename, "fasta"))
    pool = ThreadPool(processes=cpu_count())
    for batch_counter in range(Config.batch):
        async_result = pool.apply_async(run, (records, motiflength))
        # NOTE(review): .get() blocks right away, so the batches actually run
        # one after another despite the pool — confirm whether the intent was
        # to submit all batches first and collect afterwards.
        motif_list[batch_counter] = async_result.get()
    pool.close()
    pool.join()
    # getting the best motif (lower score = better solution)
    best = None
    for count, motif in motif_list.items():
        if best is None or motif[1] < best[1]:
            best = motif
    # printing the result
    # enable loggin for this part
    sys.stdout = sys.__stdout__
    print("Finale Profile")
    print_pseudo(best[0])
    print("Consensus sequence")
    if Config.max_gapsize > 0:
        print("gapsize: " + str(best[2]))
    print("new solution: %d" % best[1])
    if Config.palindrome_enable:
        # Jaro similarity of consensus vs. its reverse complement measures
        # how palindromic the found motif is.
        print(best[0].consensus + "----" + best[0].consensus.reverse_complement())
        print(jaro_distance(str(best[0].consensus),
                            str(best[0].consensus.reverse_complement())))
    else:
        print(best[0].consensus)
    return best
def get_jaro_avg(row1, row2):
    """Return the mean Jaro similarity over columns 1..14 of the two rows.

    (Fix: the accumulator previously shadowed the builtin ``sum``.)
    Python-2-era code: keeps ``xrange``.
    """
    total = 0
    for columnIndex in xrange(1, 15):
        total += jellyfish.jaro_distance(row1[columnIndex], row2[columnIndex])
    return total / 14.0
def check_nan(df):
    '''
    Match fuzzy city names

    For rows whose fips is the "00nan" placeholder, look the city up in the
    uscities.csv reference (same state, Jaro >= 0.85 or substring match) and
    patch City/fips in place; rows with no reference match are dropped.
    '''
    cities = pd.read_csv("uscities.csv")
    cities = cities.astype({
        'city': 'str',
        "state_id": "str",
        "county_name": "str",
        "county_fips": "str"
    })
    df_fuzzy = df[df.fips == "00nan"]
    df_tuple = list(zip(df_fuzzy.state_id, df_fuzzy.City, df_fuzzy.index))
    cities_tuple = list(zip(cities.state_id, cities.city, cities.county_fips))
    rv = []  # NOTE(review): unused — the function returns df, not rv
    for d in df_tuple:
        # d = (state_id, city_name, row_index)
        for c in cities_tuple:
            if c[0] == d[0]:
                if jellyfish.jaro_distance(c[1], d[1]) >= 0.85 or c[1] in d[1]\
                        or d[1] in c[1]:
                    df.loc[df.index == d[2], ['City']] = c[1]
                    df.loc[df.index == d[2], ['fips']] = c[2]
                    break
        else:
            # for-else: no reference city matched, drop the row.
            df = df.drop(d[2])
    return df
def sceneOn_received(hermes, intent_message): print('Intent {}'.format(intent_message.intent)) for (slot_value, slot) in intent_message.slots.items(): print('Slot {} -> \n\tRaw: {} \tValue: {}'.format( slot_value, slot[0].raw_value, slot[0].slot_value.value.value)) scenes = getSceneNames(domoticz_base_url) lowest_distance = MAX_JARO_DISTANCE lowest_idx = 65534 lowest_name = "Unknown" for idx, scene in scenes.items(): print "Comparing " + scene + " and " + slot[0].slot_value.value.value distance = 1 - jellyfish.jaro_distance( scene, unicode(slot[0].slot_value.value.value, "utf-8")) print "Distance is " + str(distance) if distance < lowest_distance: print "Low enough and lowest!" lowest_distance = distance lowest_idx = idx lowest_name = scene if lowest_distance < MAX_JARO_DISTANCE: command_url = global_conf.get("secret").get( "domoticz url" ) + '/json.htm?type=command¶m=switchscene&idx=' + str( lowest_idx) + '&switchcmd=On' print '"curl"ing ' + command_url ignore_result = urllib2.urlopen(command_url) #ignore_result.read() # So we finish the connection correctly. hermes.publish_end_session(intent_message.session_id, "Turning on scene " + lowest_name) else: hermes.publish_end_session( intent_message.session_id, "I'm sorry, I couldn't find a scene like " + lowest_name)
def strCompare(self, new, ratiomin=0.8):
    """
    Compare a new string against self.Lastswithstring.
    Uses one of the "similarity hashing" modules if availble.
    Returns a ratio describing the similarity level of new vs
    self.Lastswithstring where a ratio of 1.0 means full equality and 0.0
    means no similarity.
    """
    if self.Lastswithstring is None:
        self.Lastswithstring = ""
    old = self.Lastswithstring  # can be Lastswithstring or LastComparedString
    if levenshtein_available:
        ratio = Levenshtein.ratio(new, old)
    elif jellyfish_available:
        ratio = jellyfish.jaro_distance(new, old)
    elif fuzzywuzzy_available:
        # Fix: the result was previously discarded, leaving ``ratio`` unbound
        # on this branch (NameError at the return).  fuzz.ratio() reports
        # 0-100, so normalize to the 0.0-1.0 contract documented above.
        ratio = fuzzywuzzy.fuzz.ratio(new, old) / 100.0
    elif difflib_available:
        print("Comparing {} with {} using difflib.SequenceMatcher.".format(
            new, old))
        ratio = difflib.SequenceMatcher(None, new, old).ratio()
    elif simhash_available:
        ratio = simhash_compare(new, old)
    else:
        print("No string diff lib available!")
        ratio = 0.9  # arbitrary fallback so callers still get a value
    return ratio
def get_distance(string_a, string_b):
    """Mean of three normalized string distances (Levenshtein ratio, Jaro,
    Jaro-Winkler).  Each similarity is flipped into a distance first, since
    the underlying functions report similarity, not distance."""
    lev = 1 - fuzz.ratio(string_a, string_b) / 100  # fuzz reports 1-100, not 0-1
    jar = 1 - jellyfish.jaro_distance(string_a, string_b)
    jw = 1 - jellyfish.jaro_winkler(string_a, string_b)
    return (lev + jar + jw) / 3
def distance(word):
    """Indices of vocabulary words (excluding *word* itself) whose Jaro
    similarity to *word* exceeds the module-level ``thresh``."""
    return [
        idx for idx in range(num)
        if words[idx] != word and jaro_distance(word, words[idx]) > thresh
    ]
async def message_resolve(client, message, cmd_prefix):
    """Top-level Discord message dispatcher.

    Ignores bots and blocked users, routes prefixed commands to their
    handlers (suggesting close command names on a miss), then forwards the
    message to every registered handler.
    """
    if message.author.bot:
        return
    if message.author.id in variables.noflylist:
        return
    if message.content.startswith(cmd_prefix):
        await log(message.author, message.guild, message.content)
        args = split_args(message.content[len(cmd_prefix):])
        command = args[0].lower()
        if command == "help":
            # NOTE(review): args is already prefix-stripped, so slicing it by
            # len(cmd_prefix) drops that many *tokens* — args[1:] looks like
            # what was intended; confirm.
            await print_help(client, message, *args[len(cmd_prefix):], full=False)
        elif command == "fullhelp":
            await print_help(client, message, *args[len(cmd_prefix):], full=True)
        elif command in functions.keys():
            await functions[command][0](client, message, *args[len(cmd_prefix):])
        else:
            # Unknown command: suggest registered names with Jaro similarity > 0.8.
            jaro_dists = [(i, jellyfish.jaro_distance(command, i))
                          for i in functions.keys()]
            jaro_dists = [i for i in jaro_dists if i[1] > 0.8]
            if len(jaro_dists) == 0:
                return
            jaro_dists.sort(key=lambda i: i[1], reverse=True)
            txt = ",".join([f"`{i[0]}`" for i in jaro_dists])
            await message.channel.send(
                f"`{variables.PREFIX}{command}` not found. Did you mean: {txt}"
            )
    for handler in handlers:
        await handler(client, message)
def ActionneEntity(name, action, myListSceneOrSwitch, conf):
    """Fuzzy-match *name* against Domoticz scene/switch names (Python 2:
    decodes with the builtin ``unicode``) and, on a match within
    MAX_JARO_DISTANCE, fire *action* on the device via curlCmd.

    :return: True when a device matched and the command was sent, else False
    """
    #derived from nice work of https://github.com/iMartyn/domoticz-snips
    lowest_distance = MAX_JARO_DISTANCE
    lowest_idx = 65534  # sentinel: no device found
    lowest_name = "Unknown"
    MyWord = name
    for idx, scene in myListSceneOrSwitch.items():
        # Jaro similarity in [0,1] -> distance (0 = identical).
        distance = 1 - jellyfish.jaro_distance(unicode(scene['Name'], 'utf-8'),
                                               MyWord)
        # print "Distance is "+str(distance)
        if distance < lowest_distance:
            # print "Low enough and lowest!"
            lowest_distance = distance
            lowest_idx = idx
            lowest_name = scene['Name']
            lowest_Type = scene['Type']
    if lowest_distance < MAX_JARO_DISTANCE:
        #print (lowest_Type)
        #print(lowest_name)
        #print(lowest_idx)
        curlCmd(lowest_idx, action, lowest_Type, conf)
        return True
        #hermes.publish_end_session(intent_message.session_id, "j'allume "+lowest_name)
    else:
        return False
def add_query_features(df, inc, exc, k1list, k2list):
    """
    Return a copy of a dataframe with summary features added for the named
    text files defining the query
    """
    df_new = df.copy()
    # Keyword-list summary statistics.
    df_new['k1_count'] = len(k1list)
    df_new['k2_count'] = len(k2list)
    df_new['k1_max'] = max(len(k) for k in k1list)
    df_new['k2_max'] = max(len(k) for k in k2list)
    # Pairwise similarity/distance of the include vs. exclude texts.
    #jellyfish.damerau_levenshtein_distance(inc,exc)
    #jellyfish.jaro_winkler(inc,exc)
    df_new['inc_jaro_exc'] = jellyfish.jaro_distance(inc, exc)
    df_new['inc_lev_exc'] = jellyfish.levenshtein_distance(inc, exc)
    df_new['inc_ji_exc'] = textdistance.jaccard(inc, exc)
    df_new['inc_sd_exc'] = textdistance.sorensen(inc, exc)
    df_new['inc_ro_exc'] = textdistance.ratcliff_obershelp(inc, exc)
    return df_new
def A_vio(self, col_target, alpha=0.8):
    '''Build the N_lbl x N_lbl "violation" matrix self.A_V.

    V[i, j] = 1 when the texts mapped to labels i and j have Jaro similarity
    below ``alpha`` (the pair is considered dissimilar); labels without a
    mapped text compare as the empty string.

    :param col_target: column to extract from each data split
    :param alpha: similarity threshold in [0, 1]
    '''
    logging.info('A_vio')
    J_train = self.extract(TRAIN, col_target)
    J_valid = self.extract(VALID, col_target)
    J_test = self.extract(TEST, col_target)
    J = []
    J.extend(J_train)
    J.extend(J_valid)
    J.extend(J_test)
    L = []
    L.extend(self.labels_train)
    L.extend(self.labels_test)
    # Map label -> text; set() drops duplicate (label, text) pairs first.
    # NOTE(review): L covers train+test while J covers train+valid+test, so
    # zip silently truncates to the shorter — confirm the splits align.
    d = dict(set(zip(L, J)))
    n_lbl = self.N_lbl
    V = np.zeros((n_lbl, n_lbl), dtype='int16')
    print(self.N_lbl)
    for (i, j) in itertools.product(range(n_lbl), range(n_lbl)):
        sim = jellyfish.jaro_distance(d.get(i, ''), d.get(j, ''))
        # if i % 1000 == 0:
        #     print(i)
        if sim < alpha:
            V[i, j] = 1
    self.A_V = V
def call_reia():
    """Poll the REIA message-queue file and dispatch user commands.

    Loop: read the first queued line ("<user> <text>"), classify the text,
    fuzzy-map it (Jaro) onto a known command template for that label, and
    either execute it via construct_command or reply with suggestions.
    """
    while (True):
        max_score = 0.1  # floor below which no mapping is accepted outright
        map_val = ""
        with open('/media/ubuntu/Local Disk/MAJOR_PROJECT/REIA/mqueue.txt', 'r') as f:
            first_line = f.readline()
            # NOTE(review): recursing here instead of re-reading the file
            # grows the call stack unboundedly while the queue stays empty —
            # confirm; a sleep-and-retry loop was probably intended.
            while first_line == "":
                time.sleep(1)
                call_reia()
        print('-----------------------')
        # Queue line format: "<sender> <message text>".
        user_input = first_line.split(' ', 1)[1]
        user_name = get_username(first_line.split(' ', 1)[0])
        suggest_list = []
        suggest_message = ""
        #prev_ts = ts
        print("\nINPUT = ")
        print(user_input)
        label = classify(user_input)
        if label == "":
            post_message(
                "Sorry, I could not understand. Please rephrase and try again."
            )
            consume_message()
            continue
        print("Classified as : " + str(label))
        tokens = nltk.word_tokenize(user_input)
        print(tokens)
        st = StanfordPOSTagger(config['tagger']['model'],
                               path_to_jar=config['tagger']['path'])
        stanford_tag = st.tag(user_input.split())
        print("Tags")
        print(stanford_tag)
        # Fuzzy-match the input against every template for this label.
        with open(MAPPING_PATH, 'r') as data_file:
            data = json.load(data_file)
            for i in data[label]:
                dist = jf.jaro_distance(str(user_input), str(i))
                suggest_list.append(tuple((dist, i)))
                print(dist)
                if (dist > max_score):
                    max_score = dist
                    map_val = i
        if max_score < config['preferences']['similarity_threshold']:
            post_message(
                "Sorry, I could not understand. Please rephrase and try again."
            )
            consume_message()
            if config['preferences']['suggestions'] == True:
                suggest = suggestions(suggest_list)
                post_message("Did you mean :")
                for i in suggest:
                    suggest_message += (str(i[1]) + "\n")
                post_message(suggest_message)
            continue
        print("\nMapped to : " + map_val)
        #post_message(map_val)
        construct_command(user_input, label, tokens, map_val, stanford_tag,
                          exec_command, user_name)
        #call('sed -i -e "1d " REIA/mqueue.txt')
        consume_message()
def build_list(file1, file2):
    """Pair each operator in *file2* with its closest full name in *file1*
    (Jaro on the lowercased, space-stripped name after the last underscore),
    write a tab-padded table to particle_operators.txt, and return the rows
    as (name, partner, partner_group, count) tuples."""
    operator_rows = []
    for name, count in file2.items():
        short_name = name[name.rfind("_") + 1:]
        best_ratio = 0
        best_partner = ""
        best_group = ""
        for full_name, group in file1:
            candidate = full_name.lower().replace(" ", "")
            ratio = jellyfish.jaro_distance(short_name, candidate)
            if ratio > best_ratio:
                best_ratio = ratio
                best_partner = full_name
                best_group = group
        operator_rows.append((name, best_partner, best_group, count))
    out_path = os.path.join(os.getcwd(), "particle_operators.txt")
    with open(out_path, "w") as out_file:
        chunks = []
        for a, b, c, d in operator_rows:
            # Tab-pad each column to a fixed visual width.
            chunks.append(a + "\t" * (12 - int(len(a) / 4))
                          + b + "\t" * (16 - int(len(b) / 4))
                          + c + "\t" * (4 - int(len(c) / 4))
                          + d + "\n")
        out_file.write("".join(chunks))
    return operator_rows
def score_jaro_distance(string1, string2):
    """Return (flag, score): flag is True when the Jaro similarity of the
    two strings exceeds 0.90.  Python-2-era: coerces inputs via unicode()."""
    threshold = float(0.90)
    score = jellyfish.jaro_distance(unicode(string1), unicode(string2))
    return score > threshold, score
def match2(self, query):
    """Find the sentence in self.sents best matching ``query`` by sliding a
    window over the longer string and scoring windows with Jaro similarity.

    :return: ((best_index, best_index), best_ratio)
    """
    bestRatio = 0
    bestIndex = None
    print(len(self.sents))
    # dists = set()
    for index, target in enumerate(self.sents):
        smaller = query
        larger = target
        if (len(smaller) > len(larger)):
            smaller, larger = larger, smaller
        smallerLength, largerLength = len(smaller), len(larger)
        words = larger.split(" ")
        offset = 0
        # matcher = SequenceMatcher(None, smaller)
        # NOTE(review): the lengths above are in *characters* but the window
        # below slices a *word* list, and the slice end stays pinned at
        # smallerLength instead of offset + window size, so the window
        # shrinks as offset grows.  Looks buggy; confirm intent before use.
        while offset <= largerLength - smallerLength:
            windowed = " ".join(words[offset:smallerLength])
            # matcher.set_seq2(windowed)
            # current = matcher.quick_ratio()
            current = jellyfish.jaro_distance(windowed, smaller)
            # current = jellyfish.jaro_winkler(windowed, smaller)
            # dists.add(current)
            if (current > bestRatio):
                bestRatio = current
                bestIndex = index
            offset += 1
    # print(list(reversed(sorted(list(dists))))[0:20])
    return (bestIndex, bestIndex), bestRatio
def alldist(filex, filey):
    """Compute a bundle of normalized distance metrics between two files.

    Python-2-era code (print statements).  Each raw metric is scaled into a
    distance: edit distances are divided by 100 (NOTE(review): this assumes
    raw distances <= 100 — confirm for long files), Jaro/Jaro-Winkler
    similarities are flipped via 1 - x, and the spamsum match score is
    inverted the same way.

    :return: (lev, damerau_lev, jaro, jaro_winkler, hamming, kl_div, spamsum)
    """
    xread = open(filex, 'r').read()
    yread = open(filey, 'r').read()
    lvd = jellyfish.levenshtein_distance(xread,yread)
    dlvd= jellyfish.damerau_levenshtein_distance(xread,yread)
    spsum = spamsum.match(xread,yread)
    spsum = 100 - spsum
    spsum = float(spsum/100.00)
    # print lvd
    res = float( lvd / 100.00 )
    dres= float(dlvd / 100.00 )
    # print res
    # print "Levenshtein Distance=",res
    jaro = jellyfish.jaro_distance(xread,yread)
    ## Added jaro-winkler distance by fahim 20111011
    jarowink = jellyfish.jaro_winkler(xread,yread)
    jaro = 1.0 - jaro
    jarowink = 1.0 - jarowink
    # print "Jaro Distance = ",jaro
    ham = jellyfish.hamming_distance(xread,yread)
    ham = float ( ham / 100.00)
    print "Hamming Distance = ", ham
    # print "KL-divergence between d1 and d2:", kldiv(tokenize(d1), tokenize(d2))
    # print "KL-divergence between d2 and d1:", kldiv(tokenize(d2), tokenize(d1))
    # print "Spamsum Match score: ", spsum
    kl = kldiv(tokenize(xread), tokenize(yread))
    return res, dres , jaro, jarowink, ham, kl, spsum
def join_by_name_distance(yelp_result, candidates, threshold=0.75):
    """Return the candidate name closest to ``yelp_result`` by Jaro similarity.

    Inputs:
      - yelp_result: a yelp business name (string)
      - candidates: list of candidate matching names (strings)

    Output: the best-matching candidate whose similarity is >= threshold,
    or None when nothing clears the threshold (or candidates is empty).

    Fixes: the original referenced an undefined ``yelp_list`` (NameError on
    every call) and compared the matched *string* — rather than its
    similarity score — against the numeric threshold.
    """
    best_name = None
    best_similarity = 0.0
    for candidate in candidates:
        similarity = jaro_distance(yelp_result, candidate)
        if similarity > best_similarity:
            best_similarity = similarity
            best_name = candidate
    if best_name is not None and best_similarity >= threshold:
        return best_name
    return None
def CleanVillageNames():
    """Attach MCTS village ids to villages lacking one by matching their
    NYSIIS phonetic codes against villages that already have an id, per
    subcenter.  Matched villages get '_m' appended to their value and are
    saved (Django ORM side effects)."""
    import jellyfish
    subcenters = SubCenter.objects.all()
    for subc in subcenters:
        villages = Address.objects.filter(beneficiaries__subcenter=subc).distinct()
        nl_vills = villages.filter(village_mcts_id = None)   # missing ids
        l_vills = villages.exclude(village_mcts_id = None)   # legitimate ids
        phonetic_codes = []
        for l_vill in l_vills:
            phonetic_codes.append(jellyfish.nysiis(l_vill.village))
        #match the non-legitimate ones
        for nl_vill in nl_vills:
            pc = jellyfish.nysiis(nl_vill.village)
            min_dist = 100
            min_ind = 0
            ind = 0
            for spc in phonetic_codes:
                # NOTE(review): jaro_distance is a *similarity* (1.0 means
                # identical), yet this keeps the smallest value — i.e. the
                # LEAST similar code wins, and the `< 1.0` gate below only
                # excludes exact matches.  Likely inverted; confirm before
                # trusting the matches this produces.
                dist = jellyfish.jaro_distance(spc ,pc)
                if dist <= min_dist:
                    min_ind = ind
                    min_dist = dist
                ind +=1
            if min_dist < 1.0:
                match_vill = l_vills[min_ind]
                nl_vill.village_mcts_id = match_vill.village_mcts_id
                nl_vill.value = nl_vill.value+'_m'
                nl_vill.save()
def interpretAsColor(self, words):
    '''tries to map to first couple of words to a color

    Slides over the input words, comparing consecutive words against each
    (possibly multi-word) colour name by Jaro similarity; returns the best
    colour as an [r, g, b, 1] list normalized to [0, 1], or None.
    Python-2-era code (``unicode``, list-returning ``map`` + concatenation).
    '''
    # The namedtuple *class* is used as a mutable record holder here
    # (attributes assigned on the class, never instantiated) — unusual but
    # functional.
    best_match_color = collections.namedtuple('Match', 'index chance')
    best_match_color.chance = 0
    for i, word in enumerate(words):
        for j, color in enumerate(COLORS):
            matches = []
            # Compare each word of the colour name against the input words
            # starting at position i.
            for k, c in enumerate(color['name'].split(), start=0):
                try:
                    m = jellyfish.jaro_distance(unicode(c), unicode(words[i + k]))
                except IndexError:
                    break  # end of words
                else:
                    matches.append(m)
            match = sum(matches) / len(matches)
            if match > self.threshold and match > best_match_color.chance:
                best_match_color.chance = match
                best_match_color.index = j
    if best_match_color.chance > 0:
        # COLORS[...]['rgb'] holds a literal like "(255, 0, 0)"; eval parses
        # it (trusted, bundled data), then scale to [0,1] and append alpha 1.
        rgb = map(
            lambda x: x / 255.0, list(eval(
                COLORS[best_match_color.index]['rgb']))) + [1]
        return rgb
    return None
def error_highlight_table(dropdown_value, data): df = pd.DataFrame(data) # print(df) a = dropdown_value b = "winter_{a}".format(a=dropdown_value) # print(a,b) for i in range(0, len(combined_columns), 2): df[combined_columns[i]] = df[combined_columns[i]].fillna( df[combined_columns[i + 1]]) df[combined_columns[i]] = df[combined_columns[i]].mask(df[combined_columns[i]] == 0).fillna( df[combined_columns[i + 1]]) for i in range(1, len(combined_columns), 2): df[combined_columns[i]] = df[combined_columns[i]].fillna( df[combined_columns[i - 1]]) df[combined_columns[i]] = df[combined_columns[i]].mask(df[combined_columns[i]] == 0).fillna( df[combined_columns[i - 1]]) temp = df.loc[df[a] != df[b], [a, b]].drop_duplicates() # print(temp) temp["jaro_distance"] = temp.apply( lambda x: jellyfish.jaro_distance(x[a], x[b]), axis=1) temp = temp.sort_values(by="jaro_distance", ascending=False) columns = [{'id': c, 'name': c, } for c in temp.columns] return temp.to_dict('records'), columns
def _similarTeams(self, optteam, opttable=None):
    """Do fuzzy string matching to find similar team names.

    Scores every row's nickname and team name against *optteam* with Jaro
    similarity and returns the five best candidates as dicts with keys
    'jaro', 'team', 'id'.
    """
    candidates = []
    with sqlite3.connect(self._cfbdb) as db:
        cursor = db.cursor()
        # select all fullnames, eid, rid.
        cursor.execute("SELECT nn, team, %s FROM cfb" % opttable)
        for nn, team, row_id in cursor.fetchall():
            # Score both the nickname and the team name for each row.
            for key in (nn, team):
                candidates.append({'jaro': jellyfish.jaro_distance(optteam, key),
                                   'team': team,
                                   'id': row_id})
    # Highest similarity first; keep the top five.
    return sorted(candidates, key=itemgetter('jaro'), reverse=True)[0:5]
def jaro_apply(x):
    """Jaro similarity of a two-element row; NaN when either side is null,
    otherwise re-raise whatever jellyfish complained about."""
    left, right = x[0], x[1]
    try:
        return jellyfish.jaro_distance(left, right)
    except Exception as err:
        if pandas.isnull(left) or pandas.isnull(right):
            return np.nan
        raise err
def jaroWinklerDistanceAffiliation(authorId, paperId):
    """Similarity between an author's profile affiliation and the affiliation
    recorded on the corresponding paper-author row.

    Returns 0.5 (neutral) when either *name* field is empty/falsy.
    NOTE(review): despite the function name this uses jaro_distance, not
    jaro_winkler, and it gates on the name fields while comparing the
    affiliation fields — confirm both are intended.
    """
    authors = __builtin__.authors
    paperauthor = __builtin__.paperauthor
    if authors[authorId]['name'] and paperauthor[paperId][authorId]['authorName']:
        return jellyfish.jaro_distance(authors[authorId]['affiliation'],
                                       paperauthor[paperId][authorId]['affiliation'])
    else:
        return 0.5
def fuzzy_match(s1, s2, max_dist=.9):
    """Return (is_match, similarity): is_match is True when the Jaro
    similarity of s1 and s2 is at least ``max_dist``.

    Inputs jellyfish cannot handle (e.g. None) yield (False, 0).
    (Fix: the bare ``except:`` also swallowed KeyboardInterrupt/SystemExit;
    narrowed to Exception.)
    """
    try:
        distance = jellyfish.jaro_distance(s1, s2)
        is_match = distance >= max_dist
    except Exception:
        is_match = False
        distance = 0
    return is_match, distance
def test_jaro_distance(self):
    """Known-answer checks for jellyfish.jaro_distance (3 decimal places)."""
    for s1, s2, expected in [
        ("dicksonx", "dixon", 0.767),
        ("dixon", "dicksonx", 0.767),
        ("martha", "marhta", 0.944),
        ("dwayne", "duane", 0.822),
    ]:
        self.assertAlmostEqual(jellyfish.jaro_distance(s1, s2), expected, places=3)
def _fuzzy_match(self, term, text):
    """
    Fuzzy match on phrases.

    Returns True when any n-gram of ``text`` (n derived from the phrase
    length of ``term``) reaches Jaro similarity >= self.fuzzy_threshold.

    (Fix: the original fell off the end and returned None on a miss;
    ``any`` returns a proper False.)
    """
    n = phrase_grams(term)
    return any(
        jellyfish.jaro_distance(term, gram) >= self.fuzzy_threshold
        for gram in tokenizer(text, n)
    )
def compare_two_texts(self, string_a, string_b):
    """Jaro similarity of two strings, normalized to [0, 1].

    Both arguments must share a string type (both ``unicode`` or both
    ``str`` — Python-2-era check); otherwise TypeError is raised.
    """
    both_unicode = isinstance(string_a, unicode) and isinstance(string_b, unicode)
    both_bytes = isinstance(string_a, str) and isinstance(string_b, str)
    if both_unicode or both_bytes:
        return jellyfish.jaro_distance(string_a, string_b)
    raise TypeError
def flag(self, text):
    """Very simple check for naughty words"""
    total_weight = 0
    tokens = text.lower().split()
    # Accumulate weighted scores for every (naughty word, token) pair that
    # clears the Jaro similarity floor.
    for naughty, weight in self.words.items():
        for token in tokens:
            score = jellyfish.jaro_distance(token, naughty)
            if score > 0.7:
                total_weight = total_weight + (score * weight)
    return total_weight > self.threshold
def getCityStateResolved(address):
    """Resolve the city/state parsed from *address* against the Google
    gazetteer CSVs by maximizing the mean Jaro similarity of city and state.

    :param address: free-form address passed to getCityState()
    :return: (canonical_city, canonical_state)

    Fix: ``DataFrame.ix`` was removed from pandas; replaced with ``.loc``
    (idxmax returns a label, so label-based indexing is the correct form).
    """
    dfCities = pd.read_csv("./locdata/google_cities_nodups.csv")
    dfStates = pd.read_csv("./locdata/google_states.csv")
    city, state = getCityState(address)
    ## Merge the two dfs
    df_merged = pd.merge(dfCities, dfStates, left_on='SID', right_on='ID')
    ##Add two new columns for jaro scores in the df
    df_merged['jaroCity'] = df_merged['city'].map(
        lambda x: jf.jaro_distance(x.lower(), city.lower()))
    df_merged['jaroState'] = df_merged['state'].map(
        lambda x: jf.jaro_distance(x.lower(), state.lower()))
    # Equal-weight combination of the two scores.
    df_merged['jaroFinal'] = 0.5*df_merged['jaroCity'] + 0.5*df_merged['jaroState']
    ##Select those rows whose jaro is over a threshold
    ##df_merged = df_merged[df_merged['jaroFinal'] > 0.9]
    ##Find row with max value for jaro
    resolvedCityState = df_merged.loc[df_merged['jaroFinal'].idxmax()]
    return resolvedCityState['city'], resolvedCityState['state']
def findParameters(self, words, effectname):
    ''' only check words after a parameter indicator

    Starts from the effect's default parameters and overwrites any parameter
    whose name fuzzy-matches (Jaro > self.threshold) one of the input words;
    the words after the matching word are interpreted as the value.
    Python-2-era code (``unicode``).
    '''
    parameters = LEDMaster.getDefaultParameters(
        effectname)  # always load default
    for i, word in enumerate(words):
        # NOTE(review): j is unused; assigning into an existing key while
        # iterating keys() is safe (the dict's size never changes).
        for j, p in enumerate(parameters.keys()):
            match = jellyfish.jaro_distance(unicode(p), unicode(word))
            if match > self.threshold:
                value = self.understandParameterValue(
                    p, parameters[p], words[i+1:])
                parameters[p] = value
    return parameters
def findAreas(self, words):
    '''find every word that could indicate the area.
    TODO: handle numbers, handle aliases, choose lowest granularity'''
    areas = []
    # TODO detect stuff like Balken1, Wand2
    for word in words:
        for area in AREAS:
            if area[-1] in '1234':  # skip Balken1, 2 and so on
                continue
            # TODO detect if number is attached
            is_similar = jellyfish.jaro_distance(unicode(area), unicode(word)) > self.threshold
            if is_similar and area not in areas:
                areas.append(area)
    return areas
def fgen_normname_tokens(x, y):
    """Name-similarity feature: Jaro similarity of the two normalized names
    with their final token removed.  Returns -1.0 when the two signatures
    have equal length but differ (a definite mismatch)."""
    left_tokens = x[1].split(' ')
    right_tokens = y[1].split(' ')
    sig_left = get_signature_from_tokens(x[2])
    sig_right = get_signature_from_tokens(y[2])
    if len(sig_left) == len(sig_right) and sig_left != sig_right:
        return -1.0
    # Drop the trailing token from each name before scoring.
    return jellyfish.jaro_distance(' '.join(left_tokens[:-1]),
                                   ' '.join(right_tokens[:-1]))
def get_similar(queryset, criteria):
    """Return the id of the queryset element whose name is most similar to
    ``criteria`` (Jaro), or 0 when the queryset is empty or the best score
    does not exceed JARO_SIMULARITY.

    NOTE(review): both sides are .encode("utf8")-ed before comparison, i.e.
    bytes — recent jellyfish versions accept only str, so this looks
    Python-2-era; confirm before running under Python 3.
    """
    if not queryset.count():
        return 0
    # element.id -> similarity score for every element.
    result = {
        element.id: jellyfish.jaro_distance(element.name.encode("utf8"),
                                            criteria.encode("utf8"))
        for element in queryset
    }
    max_simular = max(result, key=result.get)
    if result[max_simular] > JARO_SIMULARITY:
        return max_simular
    else:
        return 0
def findClosest(g, actor): import jellyfish i = -1 maxMatch = -1 rs = None for node in g.nodes(): tmp = jellyfish.jaro_distance(node, actor) if tmp>maxMatch: rs = node maxMatch = tmp print actor + " is matched with value " + str(maxMatch) + " against " + rs return rs
def getParameterIndicatorIndex(self, words):
    '''return index of word which could mostly indicate the beginning of parameter inputs'''
    best_score = 0
    best_position = 0
    for position, token in enumerate(words):
        for indicator in self.parameters_indicators:
            score = jellyfish.jaro_distance(indicator, unicode(token))
            if score > self.threshold and score > best_score:
                best_score = score
                best_position = position
                if score == 1:
                    return position  # shortcut, no need to search for more
    return best_position
def flag(self, text):
    """Very simple check for naughty words"""
    # Normalize diacritic characters into ASCII since current version of
    # jaro_distance cannot handle them.
    stripped = ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn')
    tokens = stripped.lower().split()
    weight = 0
    for naughty in self.words:
        for token in tokens:
            similarity = jellyfish.jaro_distance(token, naughty)
            if similarity > 0.7:
                weight += similarity * self.words[naughty]
    return weight > self.threshold
def flag(text, threshold, words):
    """Very simple check for naughty words"""
    # Normalize diacritic characters into ASCII since current version of
    # jaro_distance cannot handle them.
    ascii_text = unicodedata.normalize(
        'NFKD', force_unicode(text)).encode('ascii', 'ignore')
    tokens = ascii_text.lower().split()
    weight = 0
    for naughty in words:
        for token in tokens:
            similarity = jellyfish.jaro_distance(token, naughty)
            if similarity > 0.7:
                weight += similarity * words[naughty]
    return weight > threshold
def autocorrect(request, word):
    """Django view: return the Hindi dictionary words whose Jaro similarity
    to *word* exceeds 0.85.

    Fixes over the original:
    - `a` could be unbound (NameError) on the first iteration, or hold a
      stale score from a previous iteration, whenever jaro_distance raised;
      a failing comparison could therefore still append the word. Each
      iteration now scores independently and skips on error.
    - The bare `except` is narrowed to Exception (still logged).
    - The manual `while i < count` index walk over a materialized list is
      replaced by direct iteration over the queryset.
    """
    suggestions = []
    for candidate in Master_HindiWords.objects.all():
        try:
            score = jellyfish.jaro_distance(word, str(candidate))
        except Exception:
            log.exception("Something wrong with call")
            continue  # cannot score this word; never reuse a stale score
        if score > 0.85:
            suggestions.append(str(candidate))
    return HttpResponse(suggestions)
def similarityMeasures(row1, row2):
    """Return a CSV string: the four averaged similarity measures over
    columns 1-14 (Jaro, Jaro-Winkler, normalized Levenshtein, normalized
    Damerau-Levenshtein), followed by the per-column Jaro scores."""
    jaro_total = 0
    jaro_winkler_total = 0
    levenshtein_total = 0
    damerau_total = 0
    for col in range(1, 15):  # skips id column
        a = row1[col]
        b = row2[col]
        longest = float(max(len(a), len(b)))
        jaro_total += jellyfish.jaro_distance(a, b)
        jaro_winkler_total += jellyfish.jaro_winkler(a, b)
        levenshtein_total += 1 - jellyfish.levenshtein_distance(a, b) / longest
        damerau_total += 1 - jellyfish.damerau_levenshtein_distance(a, b) / longest
    parts = ["%.6f,%.6f,%.6f,%.6f" % (
        jaro_total / 14.0, jaro_winkler_total / 14.0,
        levenshtein_total / 14.0, damerau_total / 14.0)]
    for col in range(1, 15):
        parts.append("%.6f" % (jellyfish.jaro_distance(row1[col], row2[col])))
    return ",".join(parts)
def findEffectName(self, words):
    '''search only in words before buzzword paramerers

    Returns the effect from self.effect_choices whose name best (fuzzily)
    matches any word, or None when no word clears self.threshold.

    Fixes over the original: the old code assigned attributes on the
    collections.namedtuple *class* object, mutating shared module-level
    state instead of an instance, and raised AttributeError when nothing
    matched. Plain locals avoid both problems.
    '''
    best_effect = None
    best_chance = 0
    for effect in self.effect_choices:
        for word in words:
            match = jellyfish.jaro_distance(unicode(effect), unicode(word))
            if match > best_chance and match > self.threshold:
                best_chance = match
                best_effect = effect
    return best_effect
def string_compare(str1, str2, method='JARO'):
    ''' (string, string, string) -> double

    returns the similarity of str1 and str2 according to the method:
    LEV or JARO

    LEV  -> case-insensitive Levenshtein distance (integer >= 0; larger
            means LESS similar)
    JARO -> Jaro similarity in [0, 1] (case-sensitive)

    Raises:
        ValueError: if *method* is neither "LEV" nor "JARO". (The original
        printed an error and implicitly returned None, which silently
        poisoned any downstream arithmetic.)
    '''
    if method == "LEV":
        # computes Levenshtein distance which is an integer larger or equal to zero
        return jellyfish.levenshtein_distance(str1.lower(), str2.lower())
    if method == "JARO":
        # computes the Jaro measure which is always between 0 and 1
        # (the original comment said "Jaro Winkler", but jaro_distance is
        # plain Jaro)
        return jellyfish.jaro_distance(str1, str2)
    raise ValueError(
        "ERROR: Choose the right string similarity measure : LEV or JARO")
def distance(string_1, string_2):
    """Compute the edit distance between two strings.
    """
    metrics = {
        "levenshtein": jellyfish.levenshtein_distance(string_1, string_2),
        "damerau-levenshtein": jellyfish.damerau_levenshtein_distance(
            string_1, string_2
        ),
        "jaro": jellyfish.jaro_distance(string_1, string_2),
        "jaro-winkler": jellyfish.jaro_winkler(string_1, string_2),
        # NOTE(review): the key says "codex" but the value is the
        # match-rating *comparison* result; kept byte-identical because the
        # key is client-facing API surface.
        "match_rating_codex": jellyfish.match_rating_comparison(
            string_1, string_2
        ),
        "sift3": pymailcheck.sift3_distance(string_1, string_2),
    }
    return jsonify(metrics)
def stringDistance(str1, str2):
    """
    Return distance between two strings
    String distance : jaro + levenshtein + damerau

    Returns 0 when either string is empty; otherwise a weighted mix of
    0.5 * jaro + 0.25 * normalized Levenshtein similarity
    + 0.25 * normalized Damerau-Levenshtein similarity.
    """
    distance = 0
    if len(str1) > 0 and len(str2) > 0:
        str1 = str1.decode('utf-8')
        str2 = str2.decode('utf-8')
        jaro = jellyfish.jaro_distance(str1, str2)
        leven = jellyfish.levenshtein_distance(str1, str2)
        damerau = jellyfish.damerau_levenshtein_distance(str1, str2)
        # FIX: norm must be a float. This file is Python 2 dialect
        # (str.decode, print statements), where `leven / norm` with two
        # ints is floor division — the normalized edit-distance terms were
        # being truncated to 0 or 1.
        norm = float(max(len(str1), len(str2)))
        distance = 0.5 * jaro + 0.25 * (1 - leven / norm) \
            + 0.25 * (1 - damerau / norm)
    return distance
def index(request):
    """Debug-only person-dedupe view.

    Optionally merges the people listed in ?combine= into the person given
    by ?into=, then renders every unmerged person, ranked by name-hash
    similarity to the person given by ?id= when one is supplied.
    """
    # Guard: only usable in DEBUG builds (returns None otherwise, which
    # Django will reject as a response — effectively disabling the view).
    if not DEBUG:
        return
    DEFAULT_DISTANCE = 0
    person_into = request.GET.get('into', False)
    # NOTE(review): map() returning a list implies Python 2 here; under
    # Python 3 the .remove() below would fail on the map object.
    victims = map(lambda x: int(x), request.GET.getlist('combine'))
    if person_into is not False:
        # The merge target must not also be in the victims list.
        victims.remove(int(person_into))
        args_array = [person_into] + victims
        # call_command('mail_combine_people', *args_array)
        combcomm = CombineCommand()
        print person_into, victims
        # Side effect: merges victim Person rows into person_into.
        result = combcomm.merge(person_into, victims, noprint=True)
    # Collect everyone who has not already been merged away.
    people = []
    for p in Person.objects.filter(merged_into=None).order_by('name_hash'):
        people.append({'obj': p, 'dist': DEFAULT_DISTANCE})
    target_person = None
    target_id = request.GET.get('id', False)
    if target_id is not False:
        target_person = Person.objects.get(id=target_id)
    if target_person:
        # Score each person's name_hash against the target and sort the
        # most-similar candidates first.
        for (i, p) in enumerate(people):
            people[i]['dist'] = jellyfish.jaro_distance(target_person.name_hash, p['obj'].name_hash)
        people.sort(key=lambda x: x['dist'], reverse=True)
    total = len(people)
    template_vars = {'people': people, 'total': total}
    return render_to_response('dedupe.html', template_vars, context_instance=RequestContext(request))