def maj_argus_zoe(db_zoe, df_argus):
    """Fill the 'argus' and 'official_version' columns of db_zoe.

    For every id of db_zoe, the rows of df_argus (restricted to the same
    year when the year starts with '20') receive a Levenshtein score in a
    'select' column; the 'argus' value of the lowest-scoring row is then
    written back into db_zoe for that id.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm which statements belong to the
    inner loop against the original file.

    :param db_zoe: table with 'id', 'description', 'version' and 'year'
        columns (presumably a pandas DataFrame -- TODO confirm)
    :param df_argus: table with 'year', 'version' and 'argus' columns
    :return: db_zoe, updated in place
    """
    for el in db_zoe['id']:
        # NOTE(review): these are .values arrays, not plain strings
        text_description = db_zoe[db_zoe['id'] == el]['description'].values
        text_version = db_zoe[db_zoe['id'] == el]['version'].values
        year_tmp = db_zoe[db_zoe['id'] == el]['year'].values[0]
        # Only filter on the year when it looks like a 20xx value
        if str(year_tmp)[:2] == '20' and year_tmp != '':
            df_argus_tmp = df_argus[df_argus.year == year_tmp]
        else:
            df_argus_tmp = df_argus
        for el_argus in df_argus_tmp['version'].values:
            # NOTE(review): plain assignment, no copy -- both names refer to
            # the same object, so the second 'select' write below replaces
            # the first one
            df_argus_sub_tmp = df_argus_tmp
            # Look for the 'La centrale' Zoe version closest to the
            # leboncoin version (title/description)
            df_argus_tmp.loc[df_argus_tmp.version == el_argus, 'select'] = pylev.levenshtein(
                text_description, str(el_argus))
            df_argus_sub_tmp.loc[df_argus_sub_tmp.version == el_argus, 'select'] = pylev.levenshtein(
                text_version, str(el_argus))
        df_argus_fin_tmp = df_argus_tmp.append(df_argus_sub_tmp)
        distance_min = df_argus_fin_tmp['select'].min()
        argus_price = df_argus_fin_tmp[df_argus_fin_tmp['select'] == distance_min]['argus'].values[0]
        db_zoe.loc[db_zoe.id == el, 'argus'] = float(argus_price)
        # NOTE(review): el_argus is the last version iterated above, not
        # necessarily the best-scoring one -- confirm intent
        db_zoe.loc[db_zoe.id == el, 'official_version'] = el_argus
    return db_zoe
def isIndexRevComp(indexfile, indexes, n=500000):
    """Decide whether the index reads are reverse complemented.

    Reads ``n`` records from the index Fastq file, tallies the leading
    ``len(indexes[0])`` bases of each read, then compares every distinct
    prefix with each expected index (and, failing that, with its reverse
    complement), accepting an edit distance of at most 1.

    :param indexfile: filename of the Fastq index file
    :param indexes: list or tuple of index strings
    :param n: integer number of reads to sample
    :return: True when reverse-complement hits outnumber direct hits
    """
    print("HERE")
    reader = Fastq(indexfile)
    ilength = len(indexes[0])
    print(ilength)

    prefix_counts = collections.defaultdict(int)
    for _ in range(n):
        prefix_counts[reader.next().sequence[:ilength]] += 1

    counts = {'normal': 0, 'revcomp': 0}
    for prefix, seen in list(prefix_counts.items()):
        print(prefix, seen)
        for index in indexes:
            # A direct hit takes precedence over a reverse-complement hit
            if pylev.levenshtein(prefix, index) <= 1:
                counts['normal'] += seen
            elif pylev.levenshtein(prefix, revcomp(index)) <= 1:
                counts['revcomp'] += seen

    is_revcomp = counts['revcomp'] > counts['normal']
    print('using revcomp' if is_revcomp else 'NOT revcomp')
    return is_revcomp
def equal_levenshtein(string1: str, string2: str, min_index: int) -> int:
    """Sectioned, recursive Levenshtein comparison with an early exit.

    The strings are compared in windows of ``min_index + 1`` characters.
    As soon as the accumulated distance exceeds ``min_index`` the function
    gives up, which avoids the O(m*n) cost of comparing the full strings.

    :param string1: first string; its length decides when recursion stops
    :param string2: second string
    :param min_index: largest distance still of interest
    :return: the accumulated distance, or ``False`` once it exceeds
        ``min_index`` (callers must test with ``is False``, since 0 is a
        valid distance)
    """
    # Short remainder: compare it directly
    if len(string1) <= min_index:
        return pylev.levenshtein(string1, string2)

    window = min_index + 1
    index = pylev.levenshtein(string1[:window], string2[:window])
    if index > min_index:
        return False

    # Spend what is left of the budget on the rest of the strings
    sub_index = equal_levenshtein(string1[window:], string2[window:],
                                  min_index - index)
    if sub_index is False:
        return False

    index += sub_index
    return False if index > min_index else index
def isIndexRevComp(indexfile, indexes, n=500000):
    """Decide whether the index reads are reverse complemented.

    Reads ``n`` records from the index Fastq file, tallies the leading
    ``len(indexes[0])`` bases of each read, then compares every distinct
    prefix with each expected index (and, failing that, with its reverse
    complement), accepting an edit distance of at most 1.

    :param indexfile: filename of the Fastq index file
    :param indexes: list or tuple of index strings
    :param n: integer number of reads to sample
    :return: True when reverse-complement hits outnumber direct hits
    """
    print("HERE")
    reader = Fastq(indexfile)
    ilength = len(indexes[0])
    print(ilength)

    prefix_counts = collections.defaultdict(int)
    for _ in range(n):
        prefix_counts[reader.next().sequence[:ilength]] += 1

    counts = {'normal': 0, 'revcomp': 0}
    for prefix, seen in list(prefix_counts.items()):
        print(prefix, seen)
        for index in indexes:
            # A direct hit takes precedence over a reverse-complement hit
            if pylev.levenshtein(prefix, index) <= 1:
                counts['normal'] += seen
            elif pylev.levenshtein(prefix, revcomp(index)) <= 1:
                counts['revcomp'] += seen

    is_revcomp = counts['revcomp'] > counts['normal']
    print('using revcomp' if is_revcomp else 'NOT revcomp')
    return is_revcomp
def lev_distance(q1, q2, process):
    """Return three length-normalised Levenshtein distances.

    :param q1: first sequence; a sequence of tokens when ``process`` is
        truthy, otherwise compared as given
    :param q2: second sequence, same convention as ``q1``
    :param process: when truthy, the tokens are joined with single spaces
        before the comparison
    :return: list ``[d / (len1 + len2), d / min(len1, len2),
        d / max(len1, len2)]`` where every denominator is at least 1
    """
    if process:
        left, right = ' '.join(q1), ' '.join(q2)
    else:
        left, right = q1, q2

    lev = float(levenshtein(left, right))
    size_left, size_right = len(left), len(right)
    denominators = (
        size_left + size_right,
        min(size_left, size_right),
        max(size_left, size_right),
    )
    # max(1, ...) guards against dividing by zero on empty inputs
    return [lev / float(max(1, denom)) for denom in denominators]
def find_alternatives(self, name, collection):
    """
    Finds alternatives of name in collection

    Names are compared part by part (parts are separated by ``:``) and
    then as a whole; the accumulated edit distance is a candidate's score.

    @param name: The string
    @type name: str

    @param collection: The collection
    @type collection: list

    @return: Similar strings, sorted from best (lowest score) to worst
    """
    threshold = 1e3
    alternatives = {}

    collection_parts = {item: item.split(':') for item in collection}

    for i, subname in enumerate(name.split(':')):
        for collection_name, parts in collection_parts.items():
            exists = collection_name in alternatives
            # The candidate has fewer parts than the searched name:
            # penalise it if it was already retained, otherwise skip it
            if i >= len(parts):
                if exists:
                    alternatives[collection_name] += threshold
                continue

            lev = levenshtein(subname, parts[i])
            if lev <= (len(subname) / 3) or parts[i].find(subname) != -1:
                if exists:
                    alternatives[collection_name] += lev
                else:
                    alternatives[collection_name] = lev
            elif exists:
                alternatives[collection_name] += threshold

    # Whole-name comparison
    for item in collection:
        lev = levenshtein(name, item)
        if lev <= (len(name) / 3) or item.find(name) != -1:
            if item in alternatives:
                alternatives[item] = alternatives[item] - lev
            else:
                alternatives[item] = lev

    # Drop candidates that were penalised at least twice, best score first
    ranked = sorted(
        (pair for pair in alternatives.items() if pair[1] < 2 * threshold),
        key=lambda pair: pair[1])
    return [candidate for candidate, _ in ranked]
def find_alternatives(self, name, collection):
    """
    Finds alternatives of name in collection

    Names are compared part by part (parts are separated by ``:``) and
    then as a whole; the accumulated edit distance is a candidate's score.

    :param name: The string
    :type name: str

    :param collection: The collection
    :type collection: list

    :return: Similar strings, sorted from best (lowest score) to worst
    """
    threshold = 1e3
    alternatives = {}

    collection_parts = {item: item.split(':') for item in collection}

    for i, subname in enumerate(name.split(':')):
        for collection_name, parts in collection_parts.items():
            exists = collection_name in alternatives
            # The candidate has fewer parts than the searched name:
            # penalise it if it was already retained, otherwise skip it
            if i >= len(parts):
                if exists:
                    alternatives[collection_name] += threshold
                continue

            lev = levenshtein(subname, parts[i])
            if lev <= (len(subname) / 3) or parts[i].find(subname) != -1:
                if exists:
                    alternatives[collection_name] += lev
                else:
                    alternatives[collection_name] = lev
            elif exists:
                alternatives[collection_name] += threshold

    # Whole-name comparison
    for item in collection:
        lev = levenshtein(name, item)
        if lev <= (len(name) / 3) or item.find(name) != -1:
            if item in alternatives:
                alternatives[item] = alternatives[item] - lev
            else:
                alternatives[item] = lev

    # Drop candidates that were penalised at least twice, best score first
    ranked = sorted(
        (pair for pair in alternatives.items() if pair[1] < 2 * threshold),
        key=lambda pair: pair[1])
    return [candidate for candidate, _ in ranked]
def find_alternatives(self, name, collection):
    """
    Finds alternatives of name in collection

    Names are compared part by part (parts are separated by ``:``) and
    then as a whole; the accumulated edit distance is a candidate's score.

    @param name: The string
    @type name: str

    @param collection: The collection
    @type collection: list

    @return: Similar strings, sorted from best (lowest score) to worst
    """
    threshold = 1e3
    alternatives = {}

    collection_parts = {item: item.split(':') for item in collection}

    for i, subname in enumerate(name.split(':')):
        for collection_name, parts in collection_parts.items():
            exists = collection_name in alternatives
            # The candidate has fewer parts than the searched name:
            # penalise it if it was already retained, otherwise skip it
            if i >= len(parts):
                if exists:
                    alternatives[collection_name] += threshold
                continue

            lev = levenshtein(subname, parts[i])
            if lev <= (len(subname) / 3) or parts[i].find(subname) != -1:
                if exists:
                    alternatives[collection_name] += lev
                else:
                    alternatives[collection_name] = lev
            elif exists:
                alternatives[collection_name] += threshold

    # Whole-name comparison
    for item in collection:
        lev = levenshtein(name, item)
        if lev <= (len(name) / 3) or item.find(name) != -1:
            if item in alternatives:
                alternatives[item] = alternatives[item] - lev
            else:
                alternatives[item] = lev

    # Drop candidates that were penalised at least twice, best score first
    ranked = sorted(
        (pair for pair in alternatives.items() if pair[1] < 2 * threshold),
        key=lambda pair: pair[1])
    return [candidate for candidate, _ in ranked]
def suggest(self, key, distance=3):
    """Collect the values whose index key is close to ``key``.

    :param key: string to look up
    :param distance: largest accepted Levenshtein distance (inclusive)
    :return: set of ``index[candidate]`` values over all ``self.indices``
        whose candidate key lies within ``distance`` of ``key``
    """
    return {
        index[candidate]
        for index in self.indices
        for candidate in index
        if levenshtein(key, candidate) <= distance
    }
def patterns(f1, dist=55, outlier=10):
    """Will partition elements into subsets.

    Every stripped, non-empty element that is not grouped yet seeds a new
    subset, which then absorbs each still-ungrouped element whose
    Levenshtein distance to the seed is at most :dist:. Members are only
    compared with the seed, not with each other.

    :param f1: iterable of strings (e.g. lines). It is read exactly once,
        so one-shot iterables such as open files are supported.
    :param dist: largest accepted distance to the seed (inclusive)
    :param outlier: percentage; subsets holding a smaller share of all
        elements than this are reported together with their members
    :return: list of summary strings, one per subset
    """
    # Materialise once: the input may be a one-shot iterable
    items = [x.strip() for x in f1 if x.strip()]
    unique = set(items)

    sets = []
    seen = set()
    for i in items:
        if i in seen:
            continue
        s = set([i])
        seen.add(i)
        # Candidates are fixed before the loop; additions to `seen` made
        # inside it only matter for later seeds
        for j in unique - seen:
            if levenshtein(i, j) <= dist:
                s.add(j)
                seen.add(j)
        sets.append(s)

    # Format for printing
    outlier /= 100.0
    retval = []
    total = len(items)
    for i in sets:
        l = float(len(i))
        if l / total < outlier:
            retval.append("{} elements - {}".format(len(i), i))
        else:
            retval.append("{} elements".format(len(i)))
    return retval
def filter_brand_name(v, threshold, logger=None):
    """Strip a leading brand name from a store address.

    When the first comma-separated part of the address is within
    ``threshold`` edits of the brand name, that part is dropped and the
    rest is stored in ``addr_e_rev``.

    :param v: ``(record, modified)`` pair; the record is copied, never
        changed in place
    :param threshold: Levenshtein distance threshold (exclusive)
    :param logger: logger to report to; defaults to the root logger
    :return: ``(record_copy, modified)`` where ``modified`` becomes True
        when the address was rewritten
    """
    record, modified = v
    if logger is None:
        logger = logging.getLogger()
    record = record.copy()

    # Prefer the already revised address when there is one
    addr = record[u'addr_e_rev']
    if addr is None:
        addr = record[u'addr_e']
    if addr is None:
        return record, modified

    addr_list = tuple(part.strip() for part in addr.split(u','))
    if len(addr_list) <= 1:
        return record, modified

    leading_part = addr_list[0].lower()
    brand = record[u'brandname_e'].strip().lower()
    # Does the address start with the brand name?
    if pylev.levenshtein(leading_part, brand) < threshold:
        logger.info(
            unicode.format(u'{0} is similar to {1}, idstores={2}', addr,
                           record[u'brandname_e'], record[u'idstores']))
        record[u'addr_e_rev'] = u', '.join(addr_list[1:])
        modified = True

    return record, modified
def _search_subnode(word, node_id, tolerance):
    """Collect the words within ``tolerance`` edits of ``word`` in a subtree.

    The node's own word is tested first; the search then descends only
    into children whose stored edge distance lies in
    ``[dist - tolerance, dist + tolerance]``.

    :param word: word being searched for
    :param node_id: id of the subtree root in the ``words`` table
    :param tolerance: largest accepted Levenshtein distance (inclusive)
    :return: set of matching words
    """
    # NOTE(review): queries are built with str.format; node_id is expected
    # to come from the database itself, never from user input
    word_from_id_query = "SELECT word FROM words WHERE id = {0};"
    get_children_query = """SELECT e.child_id, e.dist, w.word FROM edges e INNER JOIN words w ON e.child_id = w.id WHERE parent_id = {0};"""

    node_word = _perform_selection(word_from_id_query.format(node_id))[0][0]
    dist = levenshtein(word, node_word)

    result = set()
    if dist <= tolerance:
        result.add(node_word)

    children = _perform_selection(get_children_query.format(node_id))
    for child_id, child_parent_dist, _child_word in children:
        if dist - tolerance <= child_parent_dist <= dist + tolerance:
            # Union, so a word reachable through several branches is kept
            result |= _search_subnode(word, child_id, tolerance)
    return result
def link_author(author):
    """Map an author name onto its entry in the module-level author_list.

    The first key of ``author_list`` whose Levenshtein distance to
    ``author``, divided by the longer of the two lengths, is below 0.30
    wins.

    :param author: author name to resolve
    :return: the ``author_list`` value of the first close-enough key, or
        ``author`` unchanged when no key matches
    """
    for lab_author, linked in author_list.items():
        longest = max(len(lab_author), len(author))
        if longest == 0:
            # Both names are empty, hence identical
            return linked
        # float() keeps the ratio fractional on Python 2 as well
        if pylev.levenshtein(lab_author, author) / float(longest) < .30:
            return linked
    return author
def find_similar_command_names(name, commands):
    # type: (str, CommandCollection) -> List[str]
    """
    Finds names similar to a given command name.

    A command name (aliases included) is retained when its Levenshtein
    distance to ``name`` is at most a third of ``len(name)`` or when it
    contains ``name``. Results are ordered by increasing distance.
    """
    threshold = 1e3
    distance_by_name = {}

    # Include aliases in the search
    for actual_name in commands.get_names(True):
        # Get Levenshtein distance between the input and each command name
        distance = levenshtein(name, actual_name)

        is_similar = distance <= len(name) / 3
        is_sub_string = actual_name.find(name) != -1

        if is_similar or is_sub_string:
            distance_by_name[actual_name] = distance

    # Only keep results with a distance below the threshold
    distance_by_name = {
        k: v for k, v in distance_by_name.items() if v < 2 * threshold
    }

    # Display results with shortest distance first. The key function
    # receives one (name, distance) pair, not two arguments.
    ranked = sorted(distance_by_name.items(), key=lambda item: item[1])
    return [k for k, _ in ranked]
def cached(a, b):
    """Return the Levenshtein distance of ``a`` and ``b``, memoised.

    Results are kept in ``cached.data``, a dict keyed by ``(a, b)`` that
    is created on first use and lives on the function object itself.
    """
    store = cached.__dict__.setdefault('data', {})
    pair = (a, b)
    if pair not in store:
        store[pair] = pylev.levenshtein(a, b)
    return store[pair]
def find_similar_names(name, names):
    # type: (str, List[str]) -> List[str]
    """
    Finds names similar to a given command name.

    A candidate is retained when its Levenshtein distance to ``name`` is
    at most a third of ``len(name)`` or when it contains ``name``.
    Results are ordered by distance, then by the position at which
    ``name`` occurs in the candidate (no occurrence sorts last).
    """
    threshold = 1e3
    scored = {}

    for candidate in names:
        # Get Levenshtein distance between the input and each command name
        distance = levenshtein(name, candidate)
        position = candidate.find(name)

        if position != -1:
            scored[candidate] = (distance, position)
        elif distance <= len(name) / 3:
            scored[candidate] = (distance, float("inf"))

    # Only keep results with a distance below the threshold,
    # shortest distance first
    ranked = sorted(
        (entry for entry in scored.items() if entry[1][0] < 2 * threshold),
        key=lambda entry: entry[1])
    return [candidate for candidate, _ in ranked]
def patterns(f1, dist=55, outlier=10):
    """Will partition elements into subsets.

    Every stripped, non-empty element that is not grouped yet seeds a new
    subset, which then absorbs each still-ungrouped element whose
    Levenshtein distance to the seed is at most :dist:. Members are only
    compared with the seed, not with each other.

    :param f1: iterable of strings (e.g. lines). It is read exactly once,
        so one-shot iterables such as open files are supported.
    :param dist: largest accepted distance to the seed (inclusive)
    :param outlier: percentage; subsets holding a smaller share of all
        elements than this are reported together with their members
    :return: list of summary strings, one per subset
    """
    # Materialise once: the input may be a one-shot iterable
    items = [x.strip() for x in f1 if x.strip()]
    unique = set(items)

    sets = []
    seen = set()
    for i in items:
        if i in seen:
            continue
        s = set([i])
        seen.add(i)
        # Candidates are fixed before the loop; additions to `seen` made
        # inside it only matter for later seeds
        for j in unique - seen:
            if levenshtein(i, j) <= dist:
                s.add(j)
                seen.add(j)
        sets.append(s)

    # Format for printing
    outlier /= 100.0
    retval = []
    total = len(items)
    for i in sets:
        l = float(len(i))
        if l / total < outlier:
            retval.append("{} elements - {}".format(len(i), i))
        else:
            retval.append("{} elements".format(len(i)))
    return retval
def get_best_match(self, response, tool_name, max_edition_percentage: Optional[float] = 0.1):
    """Pick the entry of ``response['list']`` closest to ``tool_name``.

    Both the ``biotoolsID`` and the ``name`` of every entry are normalised
    with ``self.normalize`` and compared to the normalised tool name; the
    entry with the smallest Levenshtein distance wins (first one on ties).

    :param response: mapping with a ``'list'`` of entries, each providing
        ``'biotoolsID'`` and ``'name'``
    :param tool_name: name being looked up
    :param max_edition_percentage: largest accepted distance, as a
        fraction of ``len(tool_name)``; ``None`` disables the check
    :return: the best entry, or ``None`` when there is none or it is too
        far from ``tool_name``
    """
    normalized_name = self.normalize(tool_name)
    best_item = None
    # Larger than any distance that can actually occur
    best_edit = len(tool_name) * 10000

    for item in response['list']:
        for field in ('biotoolsID', 'name'):
            edit = pylev.levenshtein(normalized_name, self.normalize(item[field]))
            if edit < best_edit:
                best_edit = edit
                best_item = item

    too_far = (max_edition_percentage is not None
               and best_edit > len(tool_name) * max_edition_percentage)
    return None if too_far else best_item
def levenshtein_worker(queue, results):
    """Consume comparison jobs forever and publish their distances.

    Each job taken from ``queue`` is a tuple
    ``(hash1, hash2, sector1, sector2, score)``; the tuple
    ``(hash1, hash2, distance, score)`` is put on ``results`` without
    blocking and the job is then marked done. The function never returns.
    """
    while True:
        hash1, hash2, sector1, sector2, score = queue.get()
        outcome = (hash1, hash2, pylev.levenshtein(sector1, sector2), score)
        results.put_nowait(outcome)
        queue.task_done()
def _is_duplicate(a: str, b: str) -> bool: """Determine whether two stacktraces are for the same error.""" la = len(a) lb = len(b) diff = abs(la - lb) if diff > 50: return False denom = min(la, lb) + diff / 2 ratio = levenshtein(a.casefold(), b.casefold()) / denom return ratio < 0.1
def selectSeries(self, series, allSeries):
    """Return the search result whose name is closest to the query.

    TVDB results are sometimes poorly ranked, so instead of trusting the
    first hit, the Levenshtein edit distance between the query (series)
    and the ``"seriesName"`` of every result (allSeries) is computed and
    the result with the smallest distance is returned; the earliest one
    wins on ties. Raises ``ValueError`` when ``allSeries`` is empty.
    """
    return min(
        allSeries,
        key=lambda show: pylev.levenshtein(series, show["seriesName"]))
def evaluate_individual_sentence(self, original_sentence, paraphrase) -> Dict:
    """Compute similarity metrics between a sentence and its paraphrase.

    :param original_sentence: source sentence
    :param paraphrase: candidate paraphrase
    :return: dict holding both raw and normalised sentences plus the
        BLEU score, embedding cosine similarity, token-level edit distance
        (raw and normalised), Jaccard distance and Jaccard * cosine factor
    """
    normalized_original = normalize_spaces_remove_urls(original_sentence)
    normalized_paraphrase = normalize_spaces_remove_urls(paraphrase)
    original_tokens = nltk.word_tokenize(normalized_original)
    paraphrase_tokens = nltk.word_tokenize(normalized_paraphrase)

    # Bleu score
    # NOTE(review): the normalised strings, not the token lists, are passed
    bleu_score = nltk.translate.bleu_score.sentence_bleu(
        [normalized_original], normalized_paraphrase)

    # Sentence embedding cosine similarity
    cosine = util.pytorch_cos_sim(
        self.model.encode(original_sentence),
        self.model.encode(paraphrase)).item()

    # Levenshtein distance over tokens, scaled by the longer sentence
    edit_distance = pylev.levenshtein(original_tokens, paraphrase_tokens)
    length = max(len(original_tokens), len(paraphrase_tokens))
    normalized_edit_distance = (length - edit_distance) / length

    # Jaccard
    jaccard = nltk.jaccard_distance(set(original_tokens),
                                    set(paraphrase_tokens))

    return {
        'original_sentence': original_sentence,
        'paraphrase': paraphrase,
        'bleu_score': bleu_score,
        'normalized_original_sentence': normalized_original,
        'normalized_paraphrase': normalized_paraphrase,
        'embedding_cosine_similarity': cosine,
        'edit_distance': edit_distance,
        'normalized_edit_distance': normalized_edit_distance,
        'jaccard': jaccard,
        # Jaccard * cosine similarity
        'jaccard_embedding_factor': jaccard * cosine
    }
def planet_constellation(update, context):
    """Reply with the constellation of the planet named in the message.

    Every word of the message is compared with every entry of
    ``planet_list``; the closest pair (first one on ties) selects the
    planet. The reply holds the constellation name and its Russian
    translation, prefixed by a "Did you mean" hint when the matched word
    differs from the planet name (case-insensitively).
    """
    translator = Translator()
    words = update.message.text.split()

    min_distance = 1000
    best_planet_choice = ''
    user_planet_in_text = ''
    for word in words:
        for planet in planet_list:
            candidate_distance = pylev.levenshtein(word, planet)
            if candidate_distance < min_distance:
                min_distance = candidate_distance
                best_planet_choice = planet
                user_planet_in_text = word

    full_name = find_constellation(best_planet_choice)
    full_name_ru = translator.translate(full_name, dest='russian', src='en').text

    exact_match = user_planet_in_text.upper() == best_planet_choice.upper()
    if exact_match:
        ans_text = f'{full_name} / {full_name_ru}'
    else:
        ans_text = f'Did you mean {best_planet_choice}? \n {full_name} / {full_name_ru}'
    update.message.reply_text(ans_text)
def levenshtein(a, b):
    """Return the similarity of ``a`` and ``b`` as a percentage.

    The Levenshtein distance is subtracted from the length of the longer
    input and the remainder is expressed as a share of that length.

    :param a: first sequence
    :param b: second sequence
    :return: value between 0 and 100; 0 when both inputs are empty
    """
    max_length = max(len(a), len(b))
    if max_length == 0:
        # Nothing to compare; 0 is the value this function has always
        # returned for two empty inputs
        return 0
    distance = pylev.levenshtein(a, b)
    # float() keeps the ratio fractional on Python 2 as well
    return (float(max_length - distance) / max_length) * 100
def compare(a, b):
    """Compute the distance between ``a`` and ``b`` with four libraries.

    :return: dict mapping a library name to the distance it reports
    """
    results = {}
    results['editdistance'] = editdistance.eval(a, b)
    results['pylev'] = pylev.levenshtein(a, b)
    results['python-Levenshtein'] = Levenshtein.distance(a, b)
    results['pyxdameraulevenshtein'] = (
        pyxdameraulevenshtein.damerau_levenshtein_distance(a, b))
    return results
def search(self, query, threshold):
    """Yield every value of this subtree within ``threshold`` of ``query``.

    The node's own value is tested first. Children are keyed by their
    distance to this node, so only those whose key lies within
    ``threshold`` of the distance just measured are searched recursively.
    """
    own_distance = levenshtein(self.value, query)
    if own_distance <= threshold:
        yield self.value

    for edge_distance, child in self.children.items():
        if abs(edge_distance - own_distance) > threshold:
            continue
        for match in child.search(query, threshold):
            yield match
def similarity(s1, s2):
    """Return the Levenshtein distance of the truncated inputs.

    Both strings are cut to ``TRUNC`` characters; when the module-level
    switch ``NO_LEN`` equals 1 they are cut to the shorter input's length
    instead (still capped by ``TRUNC``), which takes length differences
    out of the comparison.
    """
    # Length considerations on/off
    limit = min(len(s1), len(s2), TRUNC) if NO_LEN == 1 else TRUNC
    return pylev.levenshtein(s1[:limit], s2[:limit])
def compare_to_gold_labels(entity: str, gold_entities: List[str], para_id: int) -> str:
    """Map an entity onto the gold label it most likely stands for.

    :param entity: entity string to resolve
    :param gold_entities: gold labels, tried in order
    :param para_id: paragraph id, printed when nothing matches
    :return: ``entity`` itself when it is a gold label, otherwise the
        first gold label fewer than 3 edits away, otherwise ``entity``
        unchanged (after reporting the miss on stdout)
    """
    if entity in gold_entities:
        return entity

    near_matches = (gold for gold in gold_entities
                    if pylev.levenshtein(entity, gold) < 3)
    for gold in near_matches:
        # The first close label wins
        return gold

    print(para_id)
    print(f"Cannot find {entity}")
    return entity
def similarity_index_per_item(item1, item2):
    """Return a dissimilarity score for two items of the same exact type.

    * ``str``: Levenshtein distance
    * ``int`` / ``float`` (and ``long`` on Python 2):
      ``|item1 - item2| / max(item1 + 1, item2 + 1)``
    * ``bool``: 0 when equal, 1 otherwise
    * ``dict`` / ``list``: ``1 - jaccard(item1, item2)``

    :return: the score, or ``None`` when the types differ or are not
        supported
    """
    try:
        numeric_types = (int, float, long)  # Python 2
    except NameError:
        numeric_types = (int, float)  # Python 3 has no separate long

    kind = type(item1)
    # Exact type match on BOTH items; bool is deliberately not an int here
    if kind is not type(item2):
        return None

    if kind is str:
        return pylev.levenshtein(item1, item2)
    if kind in numeric_types:
        # NOTE(review): the denominator is zero when the larger item is -1
        return abs(float(item1 - item2)) / float(max([item1 + 1, item2 + 1]))
    if kind is bool:
        return 0 if item1 == item2 else 1
    if kind is dict or kind is list:
        return 1 - jaccard(item1, item2)
    return None
def _iter_fuzzy_entries(catalog: Catalog, search_key: Key) -> typ.Iterable[Entry]:
    """Yield the catalog entries whose key approximately matches search_key.

    A candidate key is accepted when both its message text and its source
    line stay within the absolute (``FUZZY_MATCH_MAX_EDIT_DISTANCE_ABS``)
    and the relative (``FUZZY_MATCH_MAX_EDIT_DISTANCE_PCT``, percent of
    the longer string) edit-distance limits.
    """
    for key in _iter_candidate_keys(catalog, search_key):
        msg_dist = pylev.levenshtein(key.msg_text, search_key.msg_text)
        src_dist = pylev.levenshtein(key.source_line, search_key.source_line)

        # Absolute limits first
        if (msg_dist > FUZZY_MATCH_MAX_EDIT_DISTANCE_ABS
                or src_dist > FUZZY_MATCH_MAX_EDIT_DISTANCE_ABS):
            continue

        msg_longest = max(len(key.msg_text), len(search_key.msg_text))
        src_longest = max(len(key.source_line), len(search_key.source_line))
        msg_pct = 100 * msg_dist / msg_longest
        src_pct = 100 * src_dist / src_longest

        within_pct = (msg_pct <= FUZZY_MATCH_MAX_EDIT_DISTANCE_PCT
                      and src_pct <= FUZZY_MATCH_MAX_EDIT_DISTANCE_PCT)
        if within_pct:
            yield catalog[key]
def main():
    """Write the pairwise Levenshtein distances of a file's lines.

    Expects exactly one command-line argument, the input filename. Every
    ordered pair of lines (self-pairs included) is written to ``out.txt``
    as ``a,b,distance``.
    """
    if len(sys.argv) != 2:
        # sys.exit rather than the site-provided exit(), which is meant
        # for the interactive shell and may be unavailable
        sys.exit(f"Usage: {sys.argv[0]} filename")
    filename = sys.argv[1]
    outfile = 'out.txt'

    with open(filename) as fh:
        rows = [row.rstrip("\n") for row in fh]

    with open(outfile, 'w') as fh:
        for a in rows:
            for b in rows:
                dist = pylev.levenshtein(a, b)
                fh.write(f"{a},{b},{dist}\n")
def transform(self, question_list):
    """Return a column vector of normalised Levenshtein distances.

    :param question_list: pair ``(q1_list, q2_list)`` of equally ordered
        questions; each question is iterated element by element
        (presumably a list of tokens -- TODO confirm)
    :return: array of shape ``(n_pairs, 1)``; each distance is divided by
        the sum over both questions of ``element.count('')``, i.e. the
        element length plus one
    """
    q1_list, q2_list = question_list[0], question_list[1]

    def weight(question):
        # str.count('') is len + 1 for every element
        return float(sum(element.count('') for element in question))

    ratios = [
        float(levenshtein(first, second)) / (weight(first) + weight(second))
        for first, second in zip(q1_list, q2_list)
    ]
    lev_dist_array = np.array(ratios)
    return lev_dist_array.reshape(len(lev_dist_array), 1)
def compare_to_gold_labels(participants, system_participants):
    """Map participants onto the system participants they correspond to.

    Each participant is lower-cased. An exact member of
    ``system_participants`` is kept as is; otherwise every system
    participant fewer than 3 edits away is added. A participant without
    any match is reported on stdout.

    :param participants: iterable of participant strings
    :param system_participants: collection of known participant strings
    :return: list of matched system participants
    """
    ret = []
    for p in participants:
        p = p.lower()
        if p in system_participants:
            ret.append(p)
            continue
        # Reset per participant, so an earlier match cannot hide this miss
        found = False
        for g in system_participants:
            if pylev.levenshtein(p, g) < 3:
                ret.append(g)
                found = True
        if not found:
            print(f"Cannot find {p}")
    return ret
def get_similar_members():
    """Render the pairs of members whose names are nearly identical.

    The ``distance`` query parameter sets the (exclusive) Levenshtein
    limit; a missing, non-integer or zero value falls back to 4. For every
    matching pair the serialized members, extended with their payment
    count under ``'pmts'``, are appended to the parallel lists ``'a'`` and
    ``'b'`` handed to the ``similar.html`` template.
    """
    # type=int makes .get() return None instead of raising when the
    # parameter is absent or not an integer
    allowed_distance = request.args.get('distance', type=int) or 4
    members = Member.query.all()
    similar = {'a': [], 'b': []}
    for left, right in itertools.combinations(members, 2):
        distance = pylev.levenshtein(left.name, right.name)
        if distance < allowed_distance:
            left_json = left.serialize
            left_json['pmts'] = len(left.payments)
            similar['a'].append(left_json)
            right_json = right.serialize
            right_json['pmts'] = len(right.payments)
            similar['b'].append(right_json)
    return render_template('similar.html', similar_members=similar)
def choose_best(city, uris):
    """
    Chooses the string that most closely resemble to the city name.

    EXAMPLE
    =======

    >>> choose_best('Montreal',['http://dbpedia.org/resource/Montreal','http://dbpedia.org/resource/Westmount_(Montreal)'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('Montreal',['http://dbpedia.org/resource/Mountreal','http://dbpedia.org/resource/Moscow','http://dbpedia.org/resource/Montreal'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('New York',['http://dbpedia.org/resource/New_York_City','http://dbpedia.org/Harlem'])
    'http://dbpedia.org/resource/New_York_City'
    """
    # Two-stage filter: rank the URIs by the length of the longest
    # subsequence they share with the city name, then break ties with the
    # smallest Levenshtein distance.
    ranked = sorted((strdist.longest_sub_len(city, uri), uri) for uri in uris)
    best_length = ranked[-1][0]
    ties = [uri for length, uri in ranked if length == best_length]

    if len(ties) > 1:
        # Smallest (distance, uri) pair wins
        return min((pylev.levenshtein(city, uri), uri) for uri in ties)[1]
    return ties[0]
def choose_best(city, uris):
    """
    Chooses the string that most closely resemble to the city name.

    EXAMPLE
    =======

    >>> choose_best('Montreal',['http://dbpedia.org/resource/Montreal','http://dbpedia.org/resource/Westmount_(Montreal)'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('Montreal',['http://dbpedia.org/resource/Mountreal','http://dbpedia.org/resource/Moscow','http://dbpedia.org/resource/Montreal'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('New York',['http://dbpedia.org/resource/New_York_City','http://dbpedia.org/Harlem'])
    'http://dbpedia.org/resource/New_York_City'
    """
    # Two-stage filter: rank the URIs by the length of the longest
    # subsequence they share with the city name, then break ties with the
    # smallest Levenshtein distance.
    ranked = sorted((strdist.longest_sub_len(city, uri), uri) for uri in uris)
    best_length = ranked[-1][0]
    ties = [uri for length, uri in ranked if length == best_length]

    if len(ties) > 1:
        # Smallest (distance, uri) pair wins
        return min((pylev.levenshtein(city, uri), uri) for uri in ties)[1]
    return ties[0]
def score_domain(provided_ioc):
    """Return the scores of the provided domain.

    The score accumulates: 20 per matching suspicious TLD, the Shannon
    entropy of the domain times 50, 10 for a leading ``com``/``net``/``org``
    word, the weight of every suspicious keyword contained in the domain,
    70 per word one edit away from a heavily weighted (>= 70) keyword, and
    3 per dash (from 4 dashes on, unless punycode) and per dot (from 3
    dots on).

    :param provided_ioc: domain name to score
    :return: integer score; higher means more suspicious
    """
    score = 0
    for suspicious_tld in suspicious["tlds"]:
        if provided_ioc.endswith(suspicious_tld):
            score += 20

    try:
        res = tld.get_tld(provided_ioc, as_object=True, fail_silently=True, fix_protocol=True)
        domain = ".".join([res.subdomain, res.domain])
    except Exception:
        # Best effort: when the TLD cannot be parsed, score the raw value
        domain = provided_ioc

    score += int(round(entropy.shannon_entropy(domain) * 50))
    domain = confusables.unconfuse(domain)
    # Raw string: "\W" is an invalid escape in a plain string literal
    words_in_domain = re.split(r"\W+", domain)

    if domain.startswith("*."):
        domain = domain[2:]

    # NOTE(review): nesting was lost in the source; this check is assumed
    # to be independent of the wildcard test above -- confirm
    if words_in_domain[0] in ["com", "net", "org"]:
        score += 10

    for word in suspicious["keywords"]:
        if word in domain:
            score += suspicious["keywords"][word]

    # Both lists are loop-invariant, so build them once
    strong_keywords = [k for k, v in suspicious["keywords"].items() if v >= 70]
    candidate_words = [
        w for w in words_in_domain if w not in ["email", "mail", "cloud"]
    ]
    for key in strong_keywords:
        for word in candidate_words:
            if pylev.levenshtein(str(word), str(key)) == 1:
                score += 70

    if "xn--" not in domain and domain.count("-") >= 4:
        score += domain.count("-") * 3
    if domain.count(".") >= 3:
        score += domain.count(".") * 3
    return score
def uniqify(corpus, occ_dict, distance):
    """Collapse near-duplicate words, keeping the most frequent spelling.

    The first remaining word is the centre of a cluster made of every
    remaining word within ``distance`` edits of it. The member with the
    highest count in ``occ_dict`` (looked up title-cased) is kept and the
    whole cluster is removed from ``corpus``, which is emptied in place.

    :param corpus: mutable list of words; consumed by this function
    :param occ_dict: occurrence count per title-cased word
    :param distance: largest accepted Levenshtein distance (inclusive)
    :return: the kept words, title-cased, in order of discovery
    """
    winners = []
    while corpus:
        center = corpus[0]
        cluster = [word for word in corpus
                   if pylev.levenshtein(center, word) <= distance]
        # Most frequent member first
        ranked = sorted(
            ((word, occ_dict[word.title()]) for word in cluster),
            key=lambda pair: pair[1],
            reverse=True)
        print(ranked)
        winner = ranked[0][0]
        print(corpus)
        for pair in ranked:
            print(pair)
            corpus.remove(pair[0])
        winners.append(winner)
    return [word.title() for word in winners]
def _connect_word_to_tree(word):
    """Attach the most recently stored word to the tree kept in ``edges``.

    Starting at the root (id 1), the walk follows the child stored at the
    same distance as the one measured between ``word`` and the current
    node until no such child exists; an edge from that node to the word
    (the row with the highest id in ``words``) is then inserted.
    """
    last_id_query = "SELECT MAX(id) AS max_id FROM words;"
    word_from_id_query = "SELECT word FROM words WHERE id = {0};"
    find_child_at_dist_query = """SELECT child_id FROM edges WHERE parent_id = {0} AND dist = {1};"""
    connect_to_tree_query = """INSERT INTO edges (parent_id, child_id, dist) VALUES ({0}, {1}, {2});"""

    node_id = 1  # root
    word_id = _perform_selection(last_id_query)[0][0]

    while True:
        node_word = _perform_selection(word_from_id_query.format(node_id))[0][0]
        dist = levenshtein(word, node_word)
        child_rows = _perform_selection(find_child_at_dist_query.format(node_id, dist))
        if len(child_rows) == 0:
            # Free slot found at this distance
            break
        node_id = child_rows[0][0]

    _perform_insertion(connect_to_tree_query.format(node_id, word_id, dist))
def insert(self, string):
    """Insert ``string`` into the subtree rooted at this node.

    Children are keyed by their distance to this node's value: when the
    slot for the measured distance is taken, insertion continues in that
    child, otherwise a new child node is created there.
    """
    dist = levenshtein(string, self.value)
    if dist in self.children:
        self.children[dist].insert(string)
    else:
        self.children[dist] = Node(string)
def assoc_sites_districts():
    """
    Associates orphan sites with districts either through fuzzy matching
    or creating schools as intermediaries between sites and districts.

    Pass 1 links a site to a school when exactly one school matches it
    (one name contains the other, or they are at most 3 edits apart).
    Pass 2 links each remaining site to the first matching district by
    creating an intermediary school. Sites still unmatched are printed.
    """
    with current_app.app_context():
        g.db_session = create_db_session()

        from orvsd_central.models import Site, School, District
        from orvsd_central.util import create_school_by_district_site
        from collections import namedtuple
        import pylev

        orphan_sites = set(Site.query.filter_by(school_id=None).all())
        assigned_sites = set()
        schools = School.query.all()
        match_tuple = namedtuple('match', ['id', 'name'])

        # If a site belongs to more than 1 school, just default to creating
        # by a district.
        print('School Matching:')
        for site in orphan_sites:
            num_matches = 0
            match = None
            for school in schools:
                # Check for names as subsets or <=3 levenshtein distance.
                if (site.name in school.name or
                        school.name in site.name or
                        pylev.levenshtein(site.name, school.name) <= 3):
                    num_matches += 1
                    match = match_tuple(id=school.id, name=school.name)

            if num_matches == 1:
                print('School: {0} and Site: {1} matched.'
                      .format(match.name, site.name))
                site.school_id = match.id
                assigned_sites.add(site)

        g.db_session.commit()
        orphan_sites = orphan_sites - assigned_sites

        print('\nDistrict Matching: ')
        # Districts next, with anything that's left.
        districts = District.query.all()
        for site in orphan_sites:
            for district in districts:
                if site.name in district.name or district.name in site.name:
                    print('District: {0} or Site: {1} contained in the other.'
                          .format(district.name, site.name))
                    school = create_school_by_district_site(district, site)
                    site.school_id = school.id
                    assigned_sites.add(site)
                    break
                # Use Levenshtein Distance for fuzzy matching, against the
                # district itself (not a school left over from pass 1)
                elif pylev.levenshtein(site.name, district.name) <= 3:
                    print('District: {0} and Site: {1} fuzzy matched.'
                          .format(district.name, site.name))
                    school = create_school_by_district_site(district, site)
                    site.school_id = school.id
                    assigned_sites.add(site)
                    break

        g.db_session.commit()
        orphan_sites = orphan_sites - assigned_sites

        print('\nRemaining Sites: ')
        print('\t' + '\n\t'.join((site.name for site in orphan_sites)))
def test_painful(self):
    """Distance for a mixed-case pair that differs almost everywhere."""
    # This is much faster than the above.
    expected = 8
    actual = pylev.levenshtein('CUNsperrICY', 'conspiracy')
    self.assertEqual(actual, expected)
a = g.replace('"', '').replace("/", ' ').replace("-", " ").strip() if a not in SIRS: temp1 = one_ave(a.lower(), pattern, "av") gtfs_terms.append(temp1) orig_gtfs.append(g) f2.close() bestmatches = {} #Where we'll store matches. #Compare each station in the turnstile data to each station in the gtfs feed. for t in xrange(0, len(turn_terms)): for g in xrange(0, len(gtfs_terms)): #Compute distance: tinylist = [int(distanceoffset(turn_terms[t], gtfs_terms[g])) + int(pylev.levenshtein(gtfs_terms[g], turn_terms[t])) + int(isinside(turn_terms[t], gtfs_terms[g])) + int(samewords(turn_terms[t], gtfs_terms[g])) + int(penalize(gtfs_terms[g], turn_terms[t])), orig_gtfs[g], gtfs_terms[g], orig_turn[t]] #Make the highest default so anything better will take its place. bestmatches.setdefault(turn_terms[t], [len(turn_terms[t])]) r_best.setdefault(g, [len(gtfs_terms[g])]) #Check against previous, update if it's a better match for both words than the things they matched before. if tinylist[0] < bestmatches[turn_terms[t]][0] and tinylist[0] < r_best[g]: bestmatches[turn_terms[t]] = tinylist r_best[g] = [tinylist[0], turn_terms[t]] # print turn_terms[t], tinylist # if "av n" in turn_terms[t]: # print bestmatches[turn_terms[t]], tinylist f3 = open('./matchtable.txt', 'w') #Now stick it all in a nice file.
import pylev
import editdistance
import distance

# Print the edit distance between a short string and a longer one that
# contains it. The parenthesised single-argument form prints the same
# output on Python 2 and Python 3.
print(pylev.levenshtein('abc', '123abc567'))
#print editdistance.eval('abc', 'abc')
def test_long(self):
    """Distance between two words of different lengths."""
    expected = 6
    actual = pylev.levenshtein('confide', 'deceit')
    self.assertEqual(actual, expected)
def test_classic(self):
    """The textbook kitten/sitting example."""
    expected = 3
    actual = pylev.levenshtein('kitten', 'sitting')
    self.assertEqual(actual, expected)
B.add_node("Dummy1", demand = 1) B.add_node("Dummy2", demand = 1) turn_terms.append("Dummy1") turn_terms.append("Dummy2") f2.close() bestmatches = {} sawts = {} for t in turn_terms: for g in google_terms: #Compute distance with levenshtein and numbers distance = int(pylev.levenshtein(g,t)) + int(distanceoffset(t, g)) + int(isinside(t, g)) # int(samewords(t, g)) #turnstrings = orig_google[g], google_terms[g], orig_turn[t] B.add_edge(g, t, weight = distance) #if distance < 3: # print "google = ", g, "turn = ", t, "distance = ", distance #print B.number_of_edges() p_match = [] c = list(B.edges()) #(< probably don't need) for (n1, n2) in c: if B.edge[n1][n2]['weight'] <= 0: B.remove_edge(n1, n2) p_match.append((n1,n2)) #otherwise print out top five matches
f1.close() f2.close() perfectmatches = {} bestmatches = {} nextbestmatches = {} #Compare every station in the turnstile feed with every station in the google feed. for g_station in gtfs_terms: for ts_station in ts_terms: turnstile = wordnospaces(ts_station) google = wordnospaces(g_station) if pylev.levenshtein(turnstile, google) == 0: #If the distance is 0, we have a perfect match! tinylist1 = [0, ts_station] perfectmatches[g_station] = tinylist1 break else: bestmatches.setdefault(g_station, [len(g_station)]) nextbestmatches.setdefault(g_station, [len(g_station)]) tinylist = [int(distanceoffset(ts_station, g_station)) + int(pylev.levenshtein(turnstile, google)), ts_station] if tinylist[0] < bestmatches[g_station][0]: nextbestmatches[g_station] = bestmatches[g_station] bestmatches[g_station] = tinylist f3 = open('./matchtable.txt', 'w') for p in perfectmatches:
print "DEG read: " + a + " " + b print rev_comp(a+b) print "SRA" # Given a seed, for every seed-matching pair of DEG and SRA, do pairwise alignment for k in seeds[i]: if DEBUG: print k.seq l = len(k.seq) # s1: seq from around DEG sites ab = a + b s1 = rev_comp(ab)[0: l] # s2: seq from SRA (excluding the first pos, i.e. the 1st position does not matter) s2 = k.seq # print "s1: " + s1 # print "s2: " + s2 ed = pylev.levenshtein(s1[10:], s2[10:]) if ed <= 100: if DEBUG: print "DEG:" + a + b print "s1 from DEG: " + s1[0] + "|" + s1[1:10] + "|" + s1[10:] print "s2 from SRA: " + s2[0] + "|" + s2[1:10] + "|" + s2[10:] print "ed_x_pos1: " + str(ed) # Only do alignment for the rest of the seq (i.e. ignore the 1st position and the seed region ( 2-10, or [1, 10) ) # since they are supposed to be perfectly matched # alignments = pairwise2.align.globalxx(s1[10:], s2[10:]) # emphasize g10-g21 # m: A match score is the score of identical chars, otherwise mismatch score. # s: Same open and extend gap penalties for both sequences. # d: The sequences have different open and extend gap penalties. # alignments = pairwise2.align.globalms(s1[10:], s2[10:], 2, -1, -4, -1)
def levenshteinIndex(str1, str2):
    """Return a normalized similarity score for two strings.

    The score is ``1 - distance / longest`` where ``distance`` is
    ``pylev.levenshtein(str1, str2)`` and ``longest`` is the length of the
    longer input.

    :param str1: first string to compare
    :param str2: second string to compare
    :return: float score; 1.0 when the edit distance is 0
    """
    longest = max(len(str1), len(str2))
    if longest == 0:
        # Both inputs are empty, hence identical. Without this guard the
        # division below would raise ZeroDivisionError.
        return 1.0
    distance = pylev.levenshtein(str1, str2)
    return 1 - float(distance) / longest
# print ts_terms[v], orig_ts[v] f1.close() f2.close() bestmatches = {} sawts = {} #Compare each station in the turnstile data to each station in the google feed. for t in xrange(0, len(turn_terms)): for g in xrange(0, len(google_terms)): #Make the highest default so anything better will take its place. bestmatches.setdefault(turn_terms[t], [len(turn_terms[t])]) #Compute distance with levenshtein and numbers tinylist = [int(distanceoffset(turn_terms[t], google_terms[g])) + int(pylev.levenshtein(google_terms[g], turn_terms[t])) + isinside(turn_terms[t], google_terms[g]) + samewords(turn_terms[t], turn_terms[t]), orig_google[g], google_terms[g], orig_turn[t]] if tinylist[0] < bestmatches[turn_terms[t]][0]: bestmatches[turn_terms[t]] = tinylist f3 = open('./matchtable.txt', 'w') #for g in xrange(0, len(bestmatches)): # for x in xrange(0, len(g)): # print for g in bestmatches: f3.write(g + ",") for x in xrange(0, len(bestmatches[g])): if x == len(bestmatches[g])-1: f3.write(str(bestmatches[g][x]).strip()) else:
def levenshtein_ratio(str_one, str_two):
    """Return the Levenshtein ratio of two strings.

    Computed as ``(total - distance) / total`` where ``total`` is the sum of
    both lengths and ``distance`` is ``pylev.levenshtein(str_one, str_two)``.

    :param str_one: first string to compare
    :param str_two: second string to compare
    :return: float ratio; 1.0 when the edit distance is 0
    """
    # Sum the lengths directly instead of concatenating just to measure.
    total_len = len(str_one) + len(str_two)
    if total_len == 0:
        # Two empty strings are identical; dividing by the zero total length
        # would otherwise raise ZeroDivisionError.
        return 1.0
    distance = pylev.levenshtein(str_one, str_two)
    return (total_len - distance) / float(total_len)
# print ts_terms[v], orig_ts[v] f1.close() f2.close() bestmatches = {} sawts = {} #Compare each station in the turnstile data to each station in the google feed. for g in xrange(0, len(gtfs_terms)): for t in xrange(0, len(ts_terms)): #Make the highest default so anything better will take its place. bestmatches.setdefault(gtfs_terms[g], [len(gtfs_terms[g])]) #Compute distance with levenshtein and numbers tinylist = [int(distanceoffset(ts_terms[t], gtfs_terms[g])) + int(pylev.levenshtein(gtfs_terms[g], ts_terms[t])) + isinside(ts_terms[t], gtfs_terms[g]) + samewords(ts_terms[t], gtfs_terms[g]), orig_gtfs[g], ts_terms[t], orig_ts[t]] if tinylist[0] < bestmatches[gtfs_terms[g]][0]: bestmatches[gtfs_terms[g]] = tinylist f3 = open('./matchtable2.txt', 'w') print bestmatches #for g in xrange(0, len(bestmatches)): # for x in xrange(0, len(g)): # print #for g in bestmatches: # f3.write(g + ",") # for x in xrange(0, len(bestmatches[g])): # if x == len(bestmatches[g])-1: # f3.write(str(bestmatches[g][x]).strip())
def test_same(self):
    """A string compared with itself has a Levenshtein distance of 0."""
    word = 'kitten'
    distance = pylev.levenshtein(word, word)
    self.assertEqual(distance, 0)
def get_sense(word, lang=u"pl_PL"):
    """Return the sense closest to *word* by Levenshtein distance.

    Candidates come from ``get_senses(word, lang)``. The one with the smallest
    ``pylev.levenshtein`` distance to ``word`` is returned.

    Side effects: increments ``counter[0]`` on every call and prints a
    progress line each time it reaches a multiple of 100.

    :param word: the word to look up
    :param lang: language code forwarded to ``get_senses``
    :return: the closest sense, or ``None`` when ``get_senses`` returns
        something falsy
    """
    senses = get_senses(word, lang)
    counter[0] += 1
    if counter[0] % 100 == 0:
        # One pre-formatted argument prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print("sense %s" % counter[0])
    if not senses:
        return None
    return min(senses, key=lambda candidate: pylev.levenshtein(candidate, word))
def test_empty(self):
    """Two empty strings have a Levenshtein distance of 0."""
    empty = ''
    expected = 0
    self.assertEqual(pylev.levenshtein(empty, empty), expected)
def penalize(string1, string2):
    """Return a penalty of 3 for very dissimilar strings, otherwise 0.

    The penalty applies when the Levenshtein distance between the two
    strings is strictly greater than the length of the shorter one.
    """
    distance = pylev.levenshtein(string1, string2)
    shorter_len = min(len(string1), len(string2))
    return 3 if distance > shorter_len else 0
def apply_lev(self, threshold):
    """Remove proposals that are too far from the goal.

    Deletes from ``self.proposals`` every key whose Levenshtein distance to
    ``self.goal`` is strictly greater than ``threshold``. The mapping is
    modified in place; nothing is returned.

    :param threshold: largest distance a proposal may have and still be kept
    """
    if not self.proposals:
        return
    # Iterate a snapshot of the keys: deleting entries while iterating the
    # live keys() view raises RuntimeError on Python 3.
    for value in list(self.proposals.keys()):
        if pylev.levenshtein(value, self.goal) > threshold:
            del self.proposals[value]