Python levenshtein Examples, pylev.levenshtein Python Examples

Example #1

0

Show file

def maj_argus_zoe(db_zoe, df_argus):
    """Attach the closest Argus price and official version to each listing.

    For every listing in ``db_zoe`` the Argus reference rows are narrowed to
    the listing's year (when the year starts with ``'20'``). Each reference
    ``version`` is then scored by its Levenshtein distance to the listing's
    description and to its version text; the smaller of the two distances is
    the score. The first reference row with the lowest score wins.

    :param db_zoe: DataFrame with ``id``, ``description``, ``version`` and
        ``year`` columns; ``argus`` and ``official_version`` are written to it.
    :param df_argus: DataFrame with ``year``, ``version`` and ``argus``
        columns. It is only read.
    :return: ``db_zoe``, updated in place.
    """
    for el in db_zoe['id']:
        row_mask = db_zoe['id'] == el
        listing = db_zoe[row_mask]
        # Compare plain strings: ``.values`` alone is an array, not the text.
        text_description = str(listing['description'].values[0])
        text_version = str(listing['version'].values[0])
        year_tmp = listing['year'].values[0]
        if str(year_tmp)[:2] == '20' and year_tmp != '':
            df_argus_tmp = df_argus[df_argus.year == year_tmp]
        else:
            df_argus_tmp = df_argus

        versions = df_argus_tmp['version'].values
        if len(versions) == 0:
            # No reference row to compare against: leave the listing untouched.
            continue

        # Find the reference version closest to the listing's description or
        # version text. Strict ``<`` keeps the first of several equal scores.
        best_position = 0
        best_distance = None
        for position, el_argus in enumerate(versions):
            candidate = str(el_argus)
            distance = min(pylev.levenshtein(text_description, candidate),
                           pylev.levenshtein(text_version, candidate))
            if best_distance is None or distance < best_distance:
                best_distance = distance
                best_position = position

        argus_price = df_argus_tmp['argus'].values[best_position]
        db_zoe.loc[row_mask, 'argus'] = float(argus_price)
        db_zoe.loc[row_mask, 'official_version'] = versions[best_position]
    return db_zoe

Example #2

0

Show file

File: demultiplexer.py Project: lowks/SDST

def isIndexRevComp(indexfile, indexes, n=500000):
    """Report whether sampled index reads look reverse complemented.

    The first ``n`` reads of the index file are cut to the length of the
    first expected index and tallied. Each distinct read is compared with
    every expected index: a Levenshtein distance of at most 1 to the index
    itself adds the read's tally to the "normal" total; otherwise a distance
    of at most 1 to ``revcomp(index)`` adds it to the "revcomp" total.

    :param indexfile: filename of the Fastq index file
    :param indexes: list or tuple of index strings
    :param n: integer number of reads to sample
    :return: True when the revcomp total is strictly greater than the
        normal total
    """
    print("HERE")
    reader = Fastq(indexfile)
    index_length = len(indexes[0])
    print(index_length)

    tally = collections.defaultdict(int)
    for _ in range(n):
        read_prefix = reader.next().sequence[:index_length]
        tally[read_prefix] += 1

    normal_total = 0
    revcomp_total = 0
    for read_prefix, occurrences in list(tally.items()):
        print(read_prefix, occurrences)
        for expected in indexes:
            if pylev.levenshtein(read_prefix, expected) <= 1:
                normal_total += occurrences
            elif pylev.levenshtein(read_prefix, revcomp(expected)) <= 1:
                revcomp_total += occurrences

    is_revcomp = revcomp_total > normal_total
    print('using revcomp' if is_revcomp else 'NOT revcomp')
    return is_revcomp

Example #3

0

Show file

File: algorithms.py Project: austrian-code-wizard/duplicateDetector

def equal_levenshtein(string1: str, string2: str, min_index: int) -> int:
	"""Sectioned Levenshtein check with an early exit.

	Only the first ``min_index + 1`` characters of both strings are compared
	at a time. If that section alone already costs more than ``min_index``
	edits the function gives up; otherwise it recurses on the remainders
	with the budget reduced by the edits spent so far. This avoids running
	the full O(m*n) comparison when the budget is blown early.

	Returns the summed section distances, or ``False`` as soon as the sum
	exceeds ``min_index``. When ``string1`` is no longer than ``min_index``
	the plain Levenshtein distance of the two strings is returned as is.
	"""
	if len(string1) <= min_index:
		return pylev.levenshtein(string1, string2)

	window = min_index + 1
	head_distance = pylev.levenshtein(string1[:window], string2[:window])
	if head_distance > min_index:
		return False

	tail_distance = equal_levenshtein(
		string1[window:], string2[window:], min_index - head_distance)
	# ``is False`` on purpose: a tail distance of 0 is a valid result.
	if tail_distance is False:
		return False

	total = head_distance + tail_distance
	return False if total > min_index else total

Example #4

0

Show file

File: demultiplexer.py Project: seandavi/SDST

def isIndexRevComp(indexfile,indexes,n=500000):
    """Decide whether the index reads are reverse complemented.

    Samples ``n`` reads, keeps the leading ``len(indexes[0])`` bases of each
    and counts how often every distinct prefix occurs. A prefix within one
    edit of an expected index counts towards 'normal'; failing that, a
    prefix within one edit of ``revcomp(index)`` counts towards 'revcomp'.

    :param indexfile: filename of the Fastq index file
    :param indexes: list or tuple of index strings
    :param n: integer number of reads to sample
    :return: True if 'revcomp' collected strictly more reads than 'normal'
    """
    print("HERE")
    fastq = Fastq(indexfile)
    width = len(indexes[0])
    print(width)

    seen = collections.defaultdict(int)
    remaining = n
    while remaining > 0:
        seen[fastq.next().sequence[:width]] += 1
        remaining -= 1

    totals = {'normal': 0, 'revcomp': 0}
    for barcode, hits in list(seen.items()):
        print(barcode, hits)
        for index in indexes:
            if pylev.levenshtein(barcode, index) <= 1:
                bucket = 'normal'
            elif pylev.levenshtein(barcode, revcomp(index)) <= 1:
                bucket = 'revcomp'
            else:
                continue
            totals[bucket] += hits

    if totals['revcomp'] > totals['normal']:
        print('using revcomp')
        return True
    print('NOT revcomp')
    return False

Example #5

0

Show file

    def lev_distance(q1, q2, process):
        """Return three length-normalised Levenshtein ratios for a pair.

        When ``process`` is truthy both inputs are first joined with single
        spaces and compared as strings; otherwise they are compared as given.

        The result is ``[d / (len1 + len2), d / min(len1, len2),
        d / max(len1, len2)]`` where ``d`` is the distance and every
        denominator is clamped to at least 1.
        """
        if process:
            left, right = ' '.join(q1), ' '.join(q2)
        else:
            left, right = q1, q2

        distance = float(levenshtein(left, right))
        len_left, len_right = len(left), len(right)
        denominators = (
            len_left + len_right,
            min(len_left, len_right),
            max(len_left, len_right),
        )
        return [distance / float(max(1, d)) for d in denominators]

Example #6

0

Show file

File: application.py Project: hason/cleo

    def find_alternatives(self, name, collection):
        """
        Finds alternatives of name in collection

        Names are compared part by part (split on ``:``) and then as a
        whole, using the Levenshtein distance. A candidate is kept when
        its accumulated score stays below twice the internal threshold.

        @param name: The string
        @type name: str
        @param collection: The collection
        @type collection: list

        @return: A sorted list of similar strings, lowest score first
        """
        threshold = 1e3
        alternatives = {}

        collection_parts = {item: item.split(':') for item in collection}

        for i, subname in enumerate(name.split(':')):
            for collection_name, parts in collection_parts.items():
                exists = collection_name in alternatives
                # ``i`` is a position, so test it against the number of
                # parts; a membership test would compare an int with strings.
                if i >= len(parts):
                    if exists:
                        alternatives[collection_name] += threshold
                    continue

                lev = levenshtein(subname, parts[i])
                if lev <= (len(subname) / 3) or parts[i].find(subname) != -1:
                    alternatives[collection_name] = (
                        alternatives.get(collection_name, 0) + lev)
                elif exists:
                    alternatives[collection_name] += threshold

        for item in collection:
            lev = levenshtein(name, item)
            if lev <= (len(name) / 3) or item.find(name) != -1:
                if item in alternatives:
                    alternatives[item] = alternatives[item] - lev
                else:
                    alternatives[item] = lev

        # sorted() returns a new list, so its result must be the one used.
        ranked = sorted(
            (entry for entry in alternatives.items()
             if entry[1] < 2 * threshold),
            key=lambda entry: entry[1])

        return [candidate for candidate, _ in ranked]

Example #7

0

Show file

    def find_alternatives(self, name, collection):
        """
        Finds alternatives of name in collection

        Every ``:``-separated part of ``name`` is compared with the part at
        the same position in each collection item, then the full names are
        compared. Scores are Levenshtein distances; candidates that collect
        a score of twice the internal threshold or more are dropped.

        :param name: The string
        :type name: str
        :param collection: The collection
        :type collection: list

        :return: A sorted list of similar strings, lowest score first
        """
        threshold = 1e3
        alternatives = {}

        collection_parts = {}
        for item in collection:
            collection_parts[item] = item.split(':')

        for i, subname in enumerate(name.split(':')):
            for collection_name, parts in collection_parts.items():
                exists = collection_name in alternatives
                # The item has no part at this position. The check is on the
                # index: ``i in parts`` would look for the int among strings.
                if i >= len(parts):
                    if exists:
                        alternatives[collection_name] += threshold
                    continue

                lev = levenshtein(subname, parts[i])
                if lev <= (len(subname) / 3) or parts[i].find(subname) != -1:
                    if exists:
                        alternatives[collection_name] += lev
                    else:
                        alternatives[collection_name] = lev
                elif exists:
                    alternatives[collection_name] += threshold

        for item in collection:
            lev = levenshtein(name, item)
            if lev <= (len(name) / 3) or item.find(name) != -1:
                if item in alternatives:
                    alternatives[item] = alternatives[item] - lev
                else:
                    alternatives[item] = lev

        # Keep the plausible candidates and order them as the docstring
        # promises: best (lowest) score first.
        kept = [(alt, score) for alt, score in alternatives.items()
                if score < 2 * threshold]
        kept.sort(key=lambda pair: pair[1])

        return [alt for alt, _ in kept]

Example #8

0

Show file

File: application.py Project: hason/cleo

    def find_alternatives(self, name, collection):
        """
        Finds alternatives of name in collection

        The name and every collection item are split on ``:`` and compared
        position by position, then once more as complete strings, using the
        Levenshtein distance. Items whose total score reaches twice the
        internal threshold are discarded.

        @param name: The string
        @type name: str
        @param collection: The collection
        @type collection: list

        @return: A sorted list of similar strings, lowest score first
        """
        threshold = 1e3
        alternatives = {}

        collection_parts = {}
        for item in collection:
            collection_parts[item] = item.split(':')

        for i, subname in enumerate(name.split(':')):
            for collection_name, parts in collection_parts.items():
                exists = collection_name in alternatives
                # Positional check: does this item have an i-th part at all?
                # (A membership test would compare the int with strings.)
                has_part = i < len(parts)
                if not has_part and exists:
                    alternatives[collection_name] += threshold
                    continue
                elif not has_part:
                    continue

                lev = levenshtein(subname, parts[i])
                if lev <= (len(subname) / 3) or parts[i].find(subname) != -1:
                    if exists:
                        alternatives[collection_name] = alternatives[collection_name] + lev
                    else:
                        alternatives[collection_name] = lev
                elif exists:
                    alternatives[collection_name] += threshold

        for item in collection:
            lev = levenshtein(name, item)
            if lev <= (len(name) / 3) or item.find(name) != -1:
                if item in alternatives:
                    alternatives[item] = alternatives[item] - lev
                else:
                    alternatives[item] = lev

        alternatives = list(filter(lambda a: a[1] < 2 * threshold, alternatives.items()))
        # Sort in place; a bare sorted() call would throw its result away.
        alternatives.sort(key=lambda x: x[1])

        return list(map(lambda x: x[0], alternatives))

Example #9

0

Show file

File: __init__.py Project: pombredanne/python-addressable

 def suggest(self, key, distance=3):
     """Return the values whose index key is close to ``key``.

     Every candidate key of every index in ``self.indices`` is compared with
     ``key``; the value stored under a candidate is included when the
     Levenshtein distance is at most ``distance``.

     :return: a set of the matching values
     """
     return {
         index[candidate]
         for index in self.indices
         for candidate in index
         if levenshtein(key, candidate) <= distance
     }

Example #10

0

Show file

File: lines.py Project: nibrahim/lines

def patterns(f1, dist = 55, outlier=10):
    """Will partition elements into subsets. The elements of a subset will
    not have a Levenshtein distance of more than :dist: from the first
    member of the same subset.

    :param f1: iterable of strings; each is stripped and blank ones are
        ignored. It is read exactly once, so open files and generators work.
    :param dist: largest distance at which an element joins a subset
    :param outlier: percentage of all non-blank elements below which a
        subset is reported together with its members
    :return: one formatted line per subset, in order of first appearance
    """
    # Materialise the input once: iterating f1 repeatedly exhausts one-shot
    # iterables and re-strips every line on each pass.
    lines = [s for s in (x.strip() for x in f1) if s]
    distinct = set(lines)

    sets = []
    seen = set()
    for i in lines:
        if i in seen:
            continue
        s = set([i])
        seen.add(i)
        # ``distinct - seen`` is a fresh set, so growing ``seen`` is safe here
        for j in distinct - seen:
            if levenshtein(i, j) <= dist:
                s.add(j)
                seen.add(j)
        sets.append(s)

    # Format for printing
    outlier /= 100.0
    total = len(lines)
    retval = []
    for i in sets:
        if float(len(i)) / total < outlier:
            retval.append("{} elements - {}".format(len(i), i))
        else:
            retval.append("{} elements".format(len(i)))

    return retval

Example #11

0

Show file

File: addr_filter.py Project: haizi-zh/ofashion

def filter_brand_name(v, threshold, logger=None):
    """
    Strip the brand name from the start of an address.

    If the first comma-separated segment of the address is fewer than
    ``threshold`` Levenshtein edits away from the record's brand name
    (compared case-insensitively), the remaining segments are stored back
    in ``addr_e_rev``.

    :param v: a ``(record, modified)`` pair; ``record`` is a mapping with the
        keys ``addr_e``, ``addr_e_rev``, ``brandname_e`` and ``idstores``
    :param threshold: threshold for the Levenshtein distance
    :param logger: logger that receives a message for every rewrite; the
        root logger when omitted
    :return: ``(record, modified)`` where ``record`` is a copy of the input
        record and ``modified`` is True once the address was rewritten
    """
    record, modified = v
    logger = logging.getLogger() if logger is None else logger
    record = record.copy()

    # Prefer the already revised address when there is one
    addr = record[u'addr_e_rev']
    if addr is None:
        addr = record[u'addr_e']
    if addr is None:
        return record, modified
    addr_list = tuple(temp.strip() for temp in addr.split(u','))
    if len(addr_list) <= 1:
        # A single segment cannot be "brand name + rest"
        return record, modified
    str1 = addr_list[0].lower()
    str2 = record[u'brandname_e'].strip().lower()
    dist = pylev.levenshtein(str1, str2)
    if dist < threshold:
        # Format through the literal: the ``unicode`` builtin only exists on
        # Python 2, while ``u''.format`` behaves the same on 2 and 3.
        logger.info(
            u'{0} is similar to {1}, idstores={2}'.format(
                addr, record[u'brandname_e'], record[u'idstores']))
        record[u'addr_e_rev'] = u', '.join(addr_list[1:])
        modified = True
    return record, modified

Example #12

0

Show file

File: sqltree.py Project: wojcikk2903/ocr

def _search_subnode(word, node_id, tolerance):
    """Collect the words in a subtree that are within ``tolerance`` of ``word``.

    The node's own word is included when its Levenshtein distance to
    ``word`` is at most ``tolerance``. A child is searched recursively only
    when the distance stored on its edge lies within ``tolerance`` of the
    node's distance to ``word``.

    :param word: the word being looked up
    :param node_id: id of the node (row in ``words``) to start from
    :param tolerance: largest accepted distance
    :return: set of matching words
    """
    # NOTE(review): ids are interpolated into the SQL text; this looks safe
    # only as long as they come from the database itself - confirm.
    word_from_id_query = "SELECT word FROM words WHERE id = {0};"
    get_children_query = """SELECT e.child_id, e.dist, w.word FROM edges e
        INNER JOIN words w ON e.child_id = w.id WHERE parent_id = {0};"""
    node_word = _perform_selection(word_from_id_query.format(node_id))[0][0]
    dist = levenshtein(word, node_word)
    result = set()
    if dist <= tolerance:
        result.add(node_word)
    children = _perform_selection(get_children_query.format(node_id))
    for child_id, child_parent_dist, _child_word in children:
        if dist - tolerance <= child_parent_dist <= dist + tolerance:
            # Union, not symmetric difference: a word found in two subtrees
            # must stay in the result.
            result |= _search_subnode(word, child_id, tolerance)

    return result

Example #13

0

Show file

File: process_bibtex.py Project: cmusmashlab/cmusmashlab.github.io

def link_author(author):
    """Map an author name to its entry in the module-level ``author_list``.

    Returns the value stored for the first key of ``author_list`` whose
    Levenshtein distance to ``author``, divided by the longer of the two
    lengths, is below 0.30. Returns ``author`` unchanged if no key is
    that close.
    """
    global author_list
    for lab_author, linked in author_list.items():
        distance = pylev.levenshtein(lab_author, author)
        longest = max(len(lab_author), len(author))
        if distance / longest < .30:
            return linked
    return author

Example #14

0

Show file

File: command.py Project: pmav99/clikit

def find_similar_command_names(
        name, commands):  # type: (str, CommandCollection) -> List[str]
    """
    Finds names similar to a given command name.

    A command name (aliases included) is a candidate when its Levenshtein
    distance to ``name`` is at most a third of ``len(name)`` or when it
    contains ``name`` as a substring.

    :param name: the name that was typed
    :param commands: collection queried through ``get_names(True)``
    :return: the candidate names, shortest distance first
    """
    threshold = 1e3
    distance_by_name = {}

    # Include aliases in the search
    actual_names = commands.get_names(True)

    for actual_name in actual_names:
        # Get Levenshtein distance between the input and each command name
        distance = levenshtein(name, actual_name)

        is_similar = distance <= len(name) / 3
        is_sub_string = actual_name.find(name) != -1

        if is_similar or is_sub_string:
            distance_by_name[actual_name] = distance

    # Everything below runs once, after all names have been scored.

    # Only keep results with a distance below the threshold
    distance_by_name = {
        k: v
        for k, v in distance_by_name.items() if v < 2 * threshold
    }

    # Display results with shortest distance first. The key function gets
    # one (name, distance) pair per call.
    ranked = sorted(distance_by_name.items(), key=lambda item: item[1])

    return [k for k, _ in ranked]

Example #15

0

Show file

File: levenshtein_pylev.py Project: tboenig/code-maven.com

def cached(a, b):
    """Return ``pylev.levenshtein(a, b)``, computing each pair only once.

    Results are kept in the dict ``cached.data`` (created on first use),
    keyed by the ``(a, b)`` tuple.
    """
    store = cached.__dict__.setdefault('data', {})
    key = (a, b)
    try:
        return store[key]
    except KeyError:
        store[key] = pylev.levenshtein(a, b)
        return store[key]

Example #16

0

Show file

File: _utils.py Project: meschac38700/cleo

def find_similar_names(name, names):  # type: (str, List[str]) -> List[str]
    """
    Finds names similar to a given command name.

    A candidate qualifies when its Levenshtein distance to ``name`` is at
    most a third of ``len(name)`` or when it contains ``name``. Results are
    ordered by distance, then by the position at which ``name`` occurs in
    the candidate (candidates that do not contain it come last).
    """
    threshold = 1e3
    scored = {}

    for candidate in names:
        edit_distance = levenshtein(name, candidate)
        position = candidate.find(name)
        contains = position != -1

        if edit_distance <= len(name) / 3 or contains:
            scored[candidate] = (
                edit_distance,
                position if contains else float("inf"),
            )

    # Drop anything at or above the threshold, then rank by
    # (distance, substring position).
    ranked = sorted(
        (entry for entry in scored.items() if entry[1][0] < 2 * threshold),
        key=lambda entry: entry[1],
    )

    return [candidate for candidate, _ in ranked]

Example #17

0

Show file

def patterns(f1, dist=55, outlier=10):
    """Will partition elements into subsets. The elements of a subset will
    not have a Levenshtein distance of more than :dist: from the first
    member of the same subset.

    ``f1`` is consumed exactly once, so it may be a list, an open file or
    any other iterable of strings. Elements are stripped and blank ones
    are skipped.

    Returns one formatted line per subset. Subsets holding less than
    ``outlier`` percent of all non-blank elements are listed with their
    members.
    """
    # Snapshot the cleaned input up front; re-reading f1 inside the loop
    # would drain a file or generator after the first element.
    cleaned = []
    for raw in f1:
        stripped = raw.strip()
        if stripped:
            cleaned.append(stripped)
    candidates = set(cleaned)

    sets = []
    seen = set()
    for i in cleaned:
        if i in seen:
            continue
        s = set([i])
        seen.add(i)
        others = candidates - seen
        for j in others:
            v = levenshtein(i, j)
            if v <= dist:
                s.add(j)
                seen.add(j)
        sets.append(s)

    # Format for printing
    outlier /= 100.0
    retval = []
    total = len(cleaned)
    for i in sets:
        l = float(len(i))
        if l / total < outlier:
            retval.append("{} elements - {}".format(len(i), i))
        else:
            retval.append("{} elements".format(len(i)))

    return retval

Example #18

0

Show file

File: load_tools.py Project: IFB-ElixirFr/ifbcat

 def get_best_match(self, response, tool_name, max_edition_percentage: Optional[float] = 0.1):
     """Pick the entry of ``response['list']`` that best matches ``tool_name``.

     The normalized tool name is compared, by Levenshtein distance, with the
     normalized ``biotoolsID`` and ``name`` of every entry; the first entry
     reaching the smallest distance is kept.

     :param response: mapping with a ``'list'`` of entries
     :param tool_name: name to look for
     :param max_edition_percentage: when not None, the best distance may not
         exceed ``len(tool_name) * max_edition_percentage``
     :return: the best entry, or None when there is no entry or the best one
         is too far away
     """
     target = self.normalize(tool_name)
     best_item = None
     best_distance = len(tool_name) * 10000
     for item in response['list']:
         for field in ('biotoolsID', 'name'):
             distance = pylev.levenshtein(target, self.normalize(item[field]))
             if distance < best_distance:
                 best_distance, best_item = distance, item
     too_far = (max_edition_percentage is not None
                and best_distance > len(tool_name) * max_edition_percentage)
     return None if too_far else best_item

Example #19

0

Show file

def levenshtein_worker(queue, results):
    """Consume work items forever and publish their Levenshtein distance.

    Each item taken from ``queue`` is a ``(hash1, hash2, sector1, sector2,
    score)`` tuple. The tuple ``(hash1, hash2, distance, score)`` is put on
    ``results`` without blocking, then the item is marked done on ``queue``.
    The loop never returns.
    """
    while True:
        hash1, hash2, sector1, sector2, score = queue.get()
        results.put_nowait(
            (hash1, hash2, pylev.levenshtein(sector1, sector2), score))
        queue.task_done()

Example #20

0

Show file

def _is_duplicate(a: str, b: str) -> bool:
    """Determine whether two stacktraces are for the same error.

    Traces whose lengths differ by more than 50 characters are never
    duplicates. Otherwise the case-insensitive Levenshtein distance is
    divided by the shorter length plus half the length difference; a ratio
    below 0.1 means duplicate. Two empty traces count as duplicates.
    """
    la = len(a)
    lb = len(b)
    diff = abs(la - lb)
    if diff > 50:
        return False
    denom = min(la, lb) + diff / 2
    if denom == 0:
        # Only reachable when both traces are empty: they are identical and
        # the ratio below would be 0 / 0.
        return True
    ratio = levenshtein(a.casefold(), b.casefold()) / denom
    return ratio < 0.1

Example #21

0

Show file

File: tvdb_api.py Project: jdintruff/tvdb_api

 def selectSeries(self, series, allSeries):
     """Return the entry of ``allSeries`` whose name is closest to ``series``.

     The search results are sometimes poorly ranked, so instead of trusting
     the first one, the Levenshtein distance between the query (``series``)
     and each result's ``"seriesName"`` is computed. The first result with
     the smallest distance is returned.
     """
     scores = [
         pylev.levenshtein(series, candidate["seriesName"])
         for candidate in allSeries
     ]
     best_position = scores.index(min(scores))
     return allSeries[best_position]

Example #22

0

Show file

    def evaluate_individual_sentence(self, original_sentence,
                                     paraphrase) -> Dict:
        """Compute similarity metrics between a sentence and its paraphrase.

        Both texts are passed through ``normalize_spaces_remove_urls`` and
        tokenized with ``nltk.word_tokenize``. The metrics are: a BLEU score
        on the normalized strings, the cosine similarity of the
        ``self.model`` encodings of the raw texts, the token-level
        Levenshtein distance, that distance rescaled as
        ``(length - distance) / length`` with ``length`` the longer token
        count, the Jaccard distance of the token sets, and the product of
        the Jaccard distance and the cosine similarity.

        :return: dict holding the inputs, their normalized forms and the
            metrics listed above
        """
        clean_original = normalize_spaces_remove_urls(original_sentence)
        clean_paraphrase = normalize_spaces_remove_urls(paraphrase)
        source_tokens = nltk.word_tokenize(clean_original)
        target_tokens = nltk.word_tokenize(clean_paraphrase)

        # Bleu score
        bleu = nltk.translate.bleu_score.sentence_bleu([clean_original],
                                                       clean_paraphrase)

        # Sentence embedding cosine similarity
        source_embedding = self.model.encode(original_sentence)
        target_embedding = self.model.encode(paraphrase)
        cosine = util.pytorch_cos_sim(source_embedding,
                                      target_embedding).item()

        # Levenshtein distance over tokens
        token_edits = pylev.levenshtein(source_tokens, target_tokens)
        longest = max(len(source_tokens), len(target_tokens))
        rescaled_edits = (longest - token_edits) / longest

        # Jaccard
        jaccard = nltk.jaccard_distance(set(source_tokens),
                                        set(target_tokens))

        return {
            'original_sentence': original_sentence,
            'paraphrase': paraphrase,
            'bleu_score': bleu,
            'normalized_original_sentence': clean_original,
            'normalized_paraphrase': clean_paraphrase,
            'embedding_cosine_similarity': cosine,
            'edit_distance': token_edits,
            'normalized_edit_distance': rescaled_edits,
            'jaccard': jaccard,
            # Jaccard * cosine similarity
            'jaccard_embedding_factor': jaccard * cosine,
        }

Example #23

0

Show file

def planet_constellation(update, context):
    """Reply with the constellation of the planet named in the message.

    Every word of the message text is compared with every entry of
    ``planet_list``; the first pair with the smallest Levenshtein distance
    selects the planet. The reply contains the ``find_constellation`` result
    and its Russian translation, prefixed with a "Did you mean" hint when
    the matched word differs (ignoring case) from the planet name.
    """
    translator = Translator()
    words = update.message.text.split()

    min_distance = 1000
    best_planet_choice = ''
    user_planet_in_text = ''
    for cur_word in words:
        for cur_planet in planet_list:
            distance = pylev.levenshtein(cur_word, cur_planet)
            if distance < min_distance:
                min_distance = distance
                best_planet_choice = cur_planet
                user_planet_in_text = cur_word

    full_name = find_constellation(best_planet_choice)
    full_name_ru = translator.translate(full_name,dest='russian', src='en').text

    exact_match = user_planet_in_text.upper() == best_planet_choice.upper()
    if exact_match:
        ans_text = f'{full_name} / {full_name_ru}'
    else:
        ans_text = f'Did you mean {best_planet_choice}? \n {full_name} / {full_name_ru}'
    update.message.reply_text(ans_text)

Example #24

0

Show file

File: lionel_enrichie.py Project: dorg-aharris/lionel-enrichie

def levenshtein(a, b):
    """Return how similar ``a`` and ``b`` are, as a percentage.

    The value is ``(max_length - distance) / max_length * 100`` where
    ``distance`` is the Levenshtein distance and ``max_length`` the length
    of the longer input. Returns 0 when both inputs are empty.
    """
    len_a = len(a)
    len_b = len(b)
    distance = pylev.levenshtein(a, b)
    max_length = max(len_a, len_b)
    if max_length == 0:
        # Nothing to divide by. Handled explicitly so that unrelated errors
        # are no longer swallowed by a catch-all.
        return 0
    return ((max_length - distance) / max_length) * 100

Example #25

0

Show file

File: code.py Project: collab-dev/python-levenshtein-compare

def compare(a, b):
    """Return the distance between ``a`` and ``b`` according to each library.

    :return: dict mapping a library label to that library's result
    """
    backends = (
        ('editdistance', editdistance.eval),
        ('pylev', pylev.levenshtein),
        ('python-Levenshtein', Levenshtein.distance),
        ('pyxdameraulevenshtein',
         pyxdameraulevenshtein.damerau_levenshtein_distance),
    )
    return {label: measure(a, b) for label, measure in backends}

Example #26

0

Show file

File: bktree.py Project: eugene-eeo/algoaday

    def search(self, query, threshold):
        """Yield every value in this subtree within ``threshold`` of ``query``.

        The node's own value is yielded first when its Levenshtein distance
        to ``query`` is at most ``threshold``. Only children whose edge
        distance lies in ``[d - threshold, d + threshold]`` are visited,
        ``d`` being this node's distance to ``query``.
        """
        own_distance = levenshtein(self.value, query)
        if own_distance <= threshold:
            yield self.value

        lower = own_distance - threshold
        upper = own_distance + threshold
        for edge, child in self.children.items():
            if edge < lower or edge > upper:
                continue
            yield from child.search(query, threshold)

Example #27

0

Show file

def similarity(s1, s2):
    """Return the Levenshtein distance between truncated ``s1`` and ``s2``.

    Both strings are cut to ``TRUNC`` characters. When ``NO_LEN`` is 1 they
    are instead cut to the shortest of ``len(s1)``, ``len(s2)`` and
    ``TRUNC``, so a difference in length does not add to the distance.
    """
    # Length considerations on/off
    limit = min(len(s1), len(s2), TRUNC) if NO_LEN == 1 else TRUNC
    return pylev.levenshtein(s1[:limit], s2[:limit])

Example #28

0

Show file

File: format_evalQA_pred.py Project: ytyz1307zzh/KOALA

def compare_to_gold_labels(entity: str, gold_entities: List[str],
                           para_id: int) -> str:
    """Map a predicted entity onto a gold entity.

    Returns ``entity`` itself when it is one of ``gold_entities``; otherwise
    the first gold entity within a Levenshtein distance of less than 3.
    When neither exists, ``para_id`` and a notice are printed and ``entity``
    is returned unchanged.
    """
    if entity in gold_entities:
        return entity

    near_match = next(
        (gold for gold in gold_entities
         if pylev.levenshtein(entity, gold) < 3),
        None,
    )
    if near_match is not None:
        return near_match

    print(para_id)
    print(f"Cannot find {entity}")
    return entity

Example #29

0

Show file

File: main.py Project: johnl79/word_similarity

def similarity(s1, s2):
  """Levenshtein distance between the leading parts of ``s1`` and ``s2``.

  The strings are truncated to ``TRUNC`` characters, or - when ``NO_LEN``
  equals 1 - to the smallest of their own lengths and ``TRUNC``.
  """
  cutoff = TRUNC
  if NO_LEN == 1:
    # Ignore length differences: compare equally long prefixes only
    cutoff = min(len(s1), len(s2), cutoff)

  head1, head2 = s1[:cutoff], s2[:cutoff]
  return pylev.levenshtein(head1, head2)

Example #30

0

Show file

File: task_correction.py Project: zachamid/mypy

def similarity_index_per_item(item1, item2):
	"""Return a dissimilarity score for two items of comparable type.

	* two ``str``: their Levenshtein distance
	* two numbers (``int``/``float``, plus ``long`` on Python 2; not
	  ``bool``): ``abs(item1 - item2) / max(item1 + 1, item2 + 1)``
	* two ``bool``: 0 when equal, 1 otherwise
	* two ``dict`` or two ``list``: ``1 - jaccard(item1, item2)``

	Any other combination yields ``None``.
	"""
	type1, type2 = type(item1), type(item2)
	if type1 == str and type2 == str:
		return pylev.levenshtein(item1, item2)

	numeric_types = [int, float]
	try:
		numeric_types.append(long)  # only defined on Python 2
	except NameError:
		pass
	# Both operands must be numeric; checking only item1 let (int, str)
	# through to the subtraction.
	if type1 in numeric_types and type2 in numeric_types:
		return abs(float(item1 - item2)) / float(max([item1 + 1, item2 + 1]))

	if type1 == bool and type2 == bool:
		return 0 if item1 == item2 else 1

	if (type1 == dict and type2 == dict) or (type1 == list and type2 == list):
		return 1 - jaccard(item1, item2)

	return None

Example #31

0

Show file

def _iter_fuzzy_entries(catalog: Catalog,
                        search_key: Key) -> typ.Iterable[Entry]:
    """Yield catalog entries whose key is a fuzzy match for ``search_key``.

    A candidate key (from ``_iter_candidate_keys``) matches when both its
    ``msg_text`` and its ``source_line`` are within
    ``FUZZY_MATCH_MAX_EDIT_DISTANCE_ABS`` edits of the search key's, and
    within ``FUZZY_MATCH_MAX_EDIT_DISTANCE_PCT`` percent of the longer of
    the two compared strings.
    """
    for key in _iter_candidate_keys(catalog, search_key):
        # Check each absolute limit as soon as its distance is known, so the
        # second distance is not computed for candidates that already failed.
        msg_text_dist = pylev.levenshtein(key.msg_text, search_key.msg_text)
        if msg_text_dist > FUZZY_MATCH_MAX_EDIT_DISTANCE_ABS:
            continue

        src_line_dist = pylev.levenshtein(key.source_line,
                                          search_key.source_line)
        if src_line_dist > FUZZY_MATCH_MAX_EDIT_DISTANCE_ABS:
            continue

        # ``or 1``: two empty strings have distance 0 and must give 0 %,
        # not a division by zero.
        msg_text_len = max(len(key.msg_text), len(search_key.msg_text)) or 1
        src_line_len = max(len(key.source_line),
                           len(search_key.source_line)) or 1

        msg_text_dist_pct = 100 * msg_text_dist / msg_text_len
        src_line_dist_pct = 100 * src_line_dist / src_line_len

        if msg_text_dist_pct > FUZZY_MATCH_MAX_EDIT_DISTANCE_PCT:
            continue
        if src_line_dist_pct > FUZZY_MATCH_MAX_EDIT_DISTANCE_PCT:
            continue

        yield catalog[key]

Example #32

0

Show file

def main():
    """Write the Levenshtein distance of every ordered pair of input lines.

    Expects exactly one command-line argument, the input filename; exits
    with a usage message otherwise. Each line of the file (without its
    trailing newline) is compared with every line, itself included, and
    ``a,b,distance`` rows are written to ``out.txt``.
    """
    if len(sys.argv) != 2:
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive shell and is absent under ``python -S``.
        sys.exit(f"Usage: {sys.argv[0]} filename")
    filename = sys.argv[1]
    outfile = 'out.txt'

    with open(filename) as fh:
        rows = [row.rstrip("\n") for row in fh]
    with open(outfile, 'w') as fh:
        for a in rows:
            for b in rows:
                dist = pylev.levenshtein(a, b)
                fh.write(f"{a},{b},{dist}\n")

Example #33

0

Show file

	def transform(self, question_list):
		"""Return a column of length-normalised Levenshtein distances.

		``question_list[0]`` and ``question_list[1]`` are paired element by
		element. For each pair the Levenshtein distance is divided by the
		sum, over both questions, of ``x.count('')`` for every element ``x``
		(for a ``str`` element that is ``len(x) + 1``).

		Returns a numpy array of shape ``(number_of_pairs, 1)``.
		"""
		first_questions = question_list[0]
		second_questions = question_list[1]

		def padded_size(question):
			# Sum of x.count('') over the elements of one question
			return float(sum([x.count('') for x in question]))

		ratios = []
		for q1, q2 in zip(first_questions, second_questions):
			distance = float(levenshtein(q1, q2))
			ratios.append(distance / (padded_size(q1) + padded_size(q2)))

		lev_dist_array = np.array(ratios)
		return lev_dist_array.reshape(len(lev_dist_array), 1)

Example #34

0

Show file

File: evalQA.py Project: ytyz1307zzh/KOALA

def compare_to_gold_labels(participants, system_participants):
    """Map participants onto the system's participants.

    Each participant is lower-cased. An exact member of
    ``system_participants`` is kept as is; otherwise every system
    participant within a Levenshtein distance of less than 3 is added.
    A participant with no match at all is reported on stdout.

    :param participants: iterable of participant strings
    :param system_participants: collection of participant strings to match
        against
    :return: list of matched system participants
    """
    ret = []
    for p in participants:
        p = p.lower()
        if p in system_participants:
            ret.append(p)
            continue
        # Reset per participant: a match for an earlier participant must not
        # hide the "Cannot find" report for this one.
        found = False
        for g in system_participants:
            if pylev.levenshtein(p, g) < 3:
                ret.append(g)
                found = True
        if not found:
            print(f"Cannot find {p}")
    return ret

Example #35

0

Show file

File: __init__.py Project: charliestrawn/thundersnow

def get_similar_members():
    """Render the pairs of members whose names are nearly identical.

    Every pair of members is compared by the Levenshtein distance of their
    names. Pairs closer than the ``distance`` query parameter (4 when the
    parameter is missing, empty or 0) are collected side by side: the first
    member of each pair in ``similar['a']``, the second in ``similar['b']``,
    each with a ``pmts`` payment count.

    :return: the rendered ``similar.html`` template
    """
    # Fall back to 4 when the parameter is absent; int(None) would raise
    # before the ``or`` could apply.
    raw_distance = request.args.get('distance')
    allowed_distance = (int(raw_distance) if raw_distance else 0) or 4
    members = Member.query.all()
    similar = {'a': [], 'b': []}
    for left, right in itertools.combinations(members, 2):
        distance = pylev.levenshtein(left.name, right.name)
        if distance < allowed_distance:
            left_json = left.serialize
            left_json['pmts'] = len(left.payments)
            similar['a'].append(left_json)

            right_json = right.serialize
            right_json['pmts'] = len(right.payments)
            similar['b'].append(right_json)

    return render_template('similar.html', similar_members=similar)

Example #36

0

Show file

File: cityres.py Project: jopela/cityres

def choose_best(city, uris):
    """
    Chooses the string that most closely resemble to the city name.

    EXAMPLE
    =======
    >>> choose_best('Montreal',['http://dbpedia.org/resource/Montreal','http://dbpedia.org/resource/Westmount_(Montreal)'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('Montreal',['http://dbpedia.org/resource/Mountreal','http://dbpedia.org/resource/Moscow','http://dbpedia.org/resource/Montreal'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('New York',['http://dbpedia.org/resource/New_York_City','http://dbpedia.org/Harlem'])
    'http://dbpedia.org/resource/New_York_City'

    """

    # Two-stage filter: keep the uris with the highest
    # strdist.longest_sub_len score against the city name, then, if several
    # remain, take the one with the smallest Levenshtein distance.
    scored = [(strdist.longest_sub_len(city, uri), uri) for uri in uris]
    scored.sort()

    top_length = scored[-1][0]
    contenders = [uri for length, uri in scored if length == top_length]

    if len(contenders) == 1:
        return contenders[0]

    # Ties on distance fall back to the ordering of the uri strings
    ranked = sorted((pylev.levenshtein(city, uri), uri) for uri in contenders)
    return ranked[0][1]

Example #37

0

Show file

File: cityres.py Project: jopela/cityres

def choose_best(city, uris):
    """
    Return the URI from ``uris`` that best matches the city name.

    The primary criterion is the longest common subsequence length between
    ``city`` and each URI; the Levenshtein distance is only consulted to
    separate URIs that share the best subsequence length.

    EXAMPLE
    =======
    >>> choose_best('Montreal',['http://dbpedia.org/resource/Montreal','http://dbpedia.org/resource/Westmount_(Montreal)'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('Montreal',['http://dbpedia.org/resource/Mountreal','http://dbpedia.org/resource/Moscow','http://dbpedia.org/resource/Montreal'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('New York',['http://dbpedia.org/resource/New_York_City','http://dbpedia.org/Harlem'])
    'http://dbpedia.org/resource/New_York_City'

    """

    # Sorted ascending by (subsequence length, uri): the last entry holds
    # the longest common subsequence.
    ranked = sorted(
        (strdist.longest_sub_len(city, uri), uri) for uri in uris
    )
    longest = ranked[-1][0]

    contenders = [uri for length, uri in ranked if length == longest]

    if len(contenders) > 1:
        # Several URIs share the best length: prefer the closest one in
        # terms of edit distance.
        closest = sorted(
            (pylev.levenshtein(city, uri), uri) for uri in contenders
        )
        return closest[0][1]

    return ranked[-1][1]

Example #38

0

Show file

def score_domain(provided_ioc):
    """Return the suspicion score of the provided domain.

    The score is the sum of several independent heuristics: suspicious TLD
    suffixes, Shannon entropy of the name, configured keywords found in the
    name, near-misses (edit distance 1) of high-value keywords, and an
    unusual number of dashes or dots.

    :param provided_ioc: domain name to score (string)
    :return: integer score; larger means more suspicious
    """
    score = 0

    # Every configured suspicious TLD the name ends with adds 20 points.
    for suspicious_tld in suspicious["tlds"]:
        if provided_ioc.endswith(suspicious_tld):
            score += 20

    # Work on "<subdomain>.<domain>" when the TLD can be split off;
    # any failure while doing so falls back to the raw input.
    try:
        res = tld.get_tld(provided_ioc,
                          as_object=True,
                          fail_silently=True,
                          fix_protocol=True)
        domain = ".".join([res.subdomain, res.domain])
    except Exception:
        domain = provided_ioc

    score += int(round(entropy.shannon_entropy(domain) * 50))
    domain = confusables.unconfuse(domain)
    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    words_in_domain = re.split(r"\W+", domain)

    if domain.startswith("*."):
        domain = domain[2:]

        # NOTE(review): words_in_domain was split before the "*." prefix
        # was removed, so its first element is the empty string here and
        # this check looks unable to match -- confirm the intent.
        if words_in_domain[0] in ["com", "net", "org"]:
            score += 10

    # Each configured keyword contained in the name adds its own weight.
    for word in suspicious["keywords"]:
        if word in domain:
            score += suspicious["keywords"][word]

    # Words one edit away from a heavy keyword (weight >= 70) add 70 each;
    # a few common words are exempt from this fuzzy check.
    for key in [k for k, v in suspicious["keywords"].items() if v >= 70]:
        for word in [
                w for w in words_in_domain
                if w not in ["email", "mail", "cloud"]
        ]:
            if pylev.levenshtein(str(word), str(key)) == 1:
                score += 70

    # Many dashes (outside punycode names) or many dots add 3 points each.
    if "xn--" not in domain and domain.count("-") >= 4:
        score += domain.count("-") * 3

    if domain.count(".") >= 3:
        score += domain.count(".") * 3
    return score

Example #39

0

Show file

File: extract_PO_location.py Project: FAB4D/agrigater

def uniqify(corpus, occ_dict, distance):
    """Collapse near-duplicate words, keeping the most frequent spelling.

    The first remaining word of ``corpus`` is repeatedly taken as a centre;
    every word within ``distance`` edits of it forms a group, the group
    member with the highest count in ``occ_dict`` (looked up by title-cased
    word) is kept, and the whole group is removed from ``corpus``.

    Note that ``corpus`` is consumed in place: it is empty on return.
    Returns the kept words, title-cased, in the order they were chosen.
    """
    kept = []
    while corpus:
        pivot = corpus[0]
        cluster = [
            candidate for candidate in corpus
            if pylev.levenshtein(pivot, candidate) <= distance
        ]
        counted = [(member, occ_dict[member.title()]) for member in cluster]
        # Highest occurrence count first.
        ranked = sorted(counted, key=lambda pair: pair[1], reverse=True)
        print(ranked)
        best = ranked[0][0]
        print(corpus)
        # Drop the entire cluster so it is not considered again.
        for pair in ranked:
            print(pair)
            corpus.remove(pair[0])
        kept.append(best)
    return [word.title() for word in kept]

Example #40

0

Show file

File: sqltree.py Project: wojcikk2903/ocr

def _connect_word_to_tree(word):
    """Attach the most recently stored word to the tree held in ``edges``.

    Starting at the root (id 1), the walk computes the Levenshtein distance
    between ``word`` and the current node's word and follows the child edge
    labelled with that distance. When no such child exists, an edge from
    the current node to the word (taken to be the row with the highest id
    in ``words``) is inserted with that distance.
    """
    last_id_query = "SELECT MAX(id) AS max_id FROM words;"
    word_from_id_query = "SELECT word FROM words WHERE id = {0};"
    find_child_at_dist_query = """SELECT child_id FROM edges 
        WHERE parent_id = {0} AND dist = {1};"""
    connect_to_tree_query = """INSERT INTO edges (parent_id, child_id, dist) 
        VALUES ({0}, {1}, {2});"""

    new_word_id = _perform_selection(last_id_query)[0][0]
    current_id = 1  # the walk always starts at the root node

    while True:
        current_word = _perform_selection(
            word_from_id_query.format(current_id))[0][0]
        dist = levenshtein(word, current_word)
        matches = _perform_selection(
            find_child_at_dist_query.format(current_id, dist))
        if len(matches) == 0:
            # No child at this distance: the free slot has been found.
            break
        current_id = matches[0][0]

    _perform_insertion(
        connect_to_tree_query.format(current_id, new_word_id, dist))

Example #41

0

Show file

File: bktree.py Project: eugene-eeo/algoaday

 def insert(self, string):
     """Insert ``string`` below this node.

     Children are keyed by their Levenshtein distance to this node's
     value: a free distance slot receives a new node, an occupied one
     delegates the insertion to the child stored there.
     """
     distance = levenshtein(string, self.value)
     if distance in self.children:
         self.children[distance].insert(string)
     else:
         self.children[distance] = Node(string)

Example #42

0

Show file

File: manage.py Project: orvsd/orvsd_central

def assoc_sites_districts():
    """
    Associates orphan sites with districts either through fuzzy matching
    or creating schools as intermediaries between sites and districts.

    Pass 1 links a site to a school when exactly one school matches it
    (one name contains the other, or their Levenshtein distance is <= 3).
    Pass 2 takes the sites that are still orphans and, for the first
    district that matches by the same criteria, creates a school for the
    district/site pair and links the site to it. Sites left over after
    both passes are printed.
    """

    with current_app.app_context():
        g.db_session = create_db_session()
        from orvsd_central.models import Site, School, District
        from orvsd_central.util import create_school_by_district_site
        from collections import namedtuple
        import pylev

        orphan_sites = set(Site.query.filter_by(school_id=None).all())
        assigned_sites = set()
        schools = School.query.all()
        match_tuple = namedtuple('match', ['id', 'name'])

        # If a site belongs to more than 1 school, just default to creating
        # by a district.

        print('School Matching:')
        for site in orphan_sites:
            num_matches = 0
            match = None
            for school in schools:
                # Check for names as subsets or <=3 levenshtein distance.
                if (site.name in school.name or school.name in site.name or
                        pylev.levenshtein(site.name, school.name) <= 3):
                    num_matches += 1
                    match = match_tuple(id=school.id, name=school.name)
            # Only an unambiguous (single) match is accepted.
            if num_matches == 1:
                print('School: {0} and Site: {1} matched.'
                      .format(match.name, site.name))
                site.school_id = match.id
                assigned_sites.add(site)

        g.db_session.commit()
        orphan_sites = orphan_sites - assigned_sites

        print('\nDistrict Matching: ')

        # Districts next, with anything that's left.
        districts = District.query.all()
        for site in orphan_sites:
            for district in districts:
                if site.name in district.name or district.name in site.name:
                    message = ('District: {0} or Site: {1} '
                               'contained in the other.')
                # Use Levenshtein Distance for fuzzy matching. The site is
                # compared with the district under consideration (the
                # previous code compared it with a leftover school).
                elif pylev.levenshtein(site.name, district.name) <= 3:
                    message = 'District: {0} and Site: {1} fuzzy matched.'
                else:
                    continue

                print(message.format(district.name, site.name))
                school = create_school_by_district_site(district, site)
                site.school_id = school.id
                assigned_sites.add(site)
                break  # first matching district wins

        g.db_session.commit()
        orphan_sites = orphan_sites - assigned_sites

        print('\nRemaining Sites: ')
        print('\t' + '\n\t'.join((site.name for site in orphan_sites)))

Example #43

0

Show file

File: tests.py Project: tauhid12/pylev

 def test_painful(self):
     """Mixed-case 'CUNsperrICY' against 'conspiracy' is expected to cost 8."""
     # This is much faster than the above.
     measured = pylev.levenshtein('CUNsperrICY', 'conspiracy')
     self.assertEqual(measured, 8)

Example #44

0

Show file

File: a_names_script_v2.py Project: msr-ds3/subway-flow

    a = g.replace('"', '').replace("/", ' ').replace("-", " ").strip()
    if a not in SIRS:
        temp1 = one_ave(a.lower(), pattern, "av")
        gtfs_terms.append(temp1)
        orig_gtfs.append(g)

f2.close()

bestmatches = {}  # Where we'll store matches.

# Compare each station in the turnstile data to each station in the gtfs feed.
for t, turn_term in enumerate(turn_terms):
    for g, gtfs_term in enumerate(gtfs_terms):

        # Compute distance: the sum of all the individual heuristics.
        score = (int(distanceoffset(turn_term, gtfs_term))
                 + int(pylev.levenshtein(gtfs_term, turn_term))
                 + int(isinside(turn_term, gtfs_term))
                 + int(samewords(turn_term, gtfs_term))
                 + int(penalize(gtfs_term, turn_term)))
        tinylist = [score, orig_gtfs[g], gtfs_term, orig_turn[t]]

        # Make the highest default so anything better will take its place.
        bestmatches.setdefault(turn_term, [len(turn_term)])
        r_best.setdefault(g, [len(gtfs_term)])

        # Check against previous, update if it's a better match for both
        # words than the things they matched before. r_best[g] is a list
        # whose first element is the score, so compare against that element
        # (comparing against the list itself is always true on Python 2
        # and a TypeError on Python 3).
        if tinylist[0] < bestmatches[turn_term][0] and tinylist[0] < r_best[g][0]:
            bestmatches[turn_term] = tinylist
            r_best[g] = [tinylist[0], turn_term]
#            print turn_terms[t], tinylist
#            if "av n" in turn_terms[t]:
#                print bestmatches[turn_terms[t]], tinylist

f3 = open('./matchtable.txt', 'w') #Now stick it all in a nice file.

Example #45

0

Show file

File: tester.py Project: iamyaro/wikipy

import pylev
import editdistance
import distance



# Quick manual check of pylev on a string embedded in a longer one.
# Written as a call so it runs on both Python 2 and Python 3.
print(pylev.levenshtein('abc', '123abc567'))
#print editdistance.eval('abc', 'abc')

Example #46

0

Show file

File: tests.py Project: tauhid12/pylev

 def test_long(self):
     """'confide' against 'deceit' is expected to cost 6 edits."""
     measured = pylev.levenshtein('confide', 'deceit')
     self.assertEqual(measured, 6)

Example #47

0

Show file

File: tests.py Project: tauhid12/pylev

 def test_classic(self):
     """The textbook pair 'kitten' / 'sitting' is expected to cost 3 edits."""
     measured = pylev.levenshtein('kitten', 'sitting')
     self.assertEqual(measured, 3)

Example #48

0

Show file

File: graph_x.py Project: vazquezs123/subway-flow

# Two placeholder nodes are added to the graph and to the turnstile terms.
for dummy_name in ("Dummy1", "Dummy2"):
    B.add_node(dummy_name, demand=1)
for dummy_name in ("Dummy1", "Dummy2"):
    turn_terms.append(dummy_name)

f2.close()

bestmatches = {}
sawts = {}

# Connect every google term to every turnstile term; the edge weight is the
# sum of the Levenshtein distance and the two helper heuristics.
for t in turn_terms:
    for g in google_terms:
        lev_part = int(pylev.levenshtein(g, t))
        offset_part = int(distanceoffset(t, g))
        inside_part = int(isinside(t, g))
        distance = lev_part + offset_part + inside_part
        # int(samewords(t, g)) is deliberately left out of the sum.
        B.add_edge(g, t, weight=distance)

# Edges with a non-positive weight are removed from the graph and recorded
# in p_match.
p_match = []
c = list(B.edges())  # snapshot, since edges are removed while looping
for (n1, n2) in c:
    if B.edge[n1][n2]['weight'] > 0:
        continue
    B.remove_edge(n1, n2)
    p_match.append((n1, n2))
    #otherwise print out top five matches

Example #49

0

Show file

File: edit.py Project: vazquezs123/subway-flow

f1.close()
f2.close()


perfectmatches = {}
bestmatches = {}
nextbestmatches = {}

# Compare every station in the turnstile feed with every station in the
# google feed.
for g_station in gtfs_terms:
    # The normalised google name does not depend on the inner loop.
    google = wordnospaces(g_station)
    for ts_station in ts_terms:
        turnstile = wordnospaces(ts_station)
        # Computed once and reused for both the perfect-match test and the
        # combined score below.
        edit_distance = pylev.levenshtein(turnstile, google)
        if edit_distance == 0:
            # If the distance is 0, we have a perfect match!
            tinylist1 = [0, ts_station]
            perfectmatches[g_station] = tinylist1
            break

        # Start from the name length so that anything better replaces it.
        bestmatches.setdefault(g_station, [len(g_station)])
        nextbestmatches.setdefault(g_station, [len(g_station)])
        tinylist = [int(distanceoffset(ts_station, g_station)) + int(edit_distance), ts_station]

        if tinylist[0] < bestmatches[g_station][0]:
            # The previous best becomes the runner-up.
            nextbestmatches[g_station] = bestmatches[g_station]
            bestmatches[g_station] = tinylist

f3 = open('./matchtable.txt', 'w')
for p in perfectmatches:

Example #50

0

Show file

File: guide_target_detector.py Project: yfu/tools

                print "DEG read: " + a + " " + b
                print rev_comp(a+b)
                print "SRA"
            # Given a seed, for every seed-matching pair of DEG and SRA, do pairwise alignment 
            for k in seeds[i]:
                if DEBUG:
                    print k.seq
                l = len(k.seq)
                # s1: seq from around DEG sites
                ab = a + b
                s1 = rev_comp(ab)[0: l]
                # s2: seq from SRA (excluding the first pos, i.e. the 1st position does not matter)
                s2 = k.seq
                # print "s1: " + s1
                # print "s2: " + s2
                ed = pylev.levenshtein(s1[10:], s2[10:])
                if ed <= 100:
                    if DEBUG:
                        print "DEG:" + a + b
                        print "s1 from DEG: " + s1[0] + "|" + s1[1:10] + "|" + s1[10:]
                        print "s2 from SRA: " + s2[0] + "|" + s2[1:10] + "|" + s2[10:]
                        print "ed_x_pos1: " + str(ed)
                    # Only do alignment for the rest of the seq (i.e. ignore the 1st position and the seed region ( 2-10, or [1, 10) )
                    # since they are supposed to be perfectly matched
                    # alignments = pairwise2.align.globalxx(s1[10:], s2[10:])
                    # emphasize g10-g21
                    # m: A match score is the score of identical chars, otherwise mismatch score.

                    # s: Same open and extend gap penalties for both sequences.
                    # d: The sequences have different open and extend gap penalties.
                    # alignments = pairwise2.align.globalms(s1[10:], s2[10:], 2, -1, -4, -1)

Example #51

0

Show file

File: task_correction.py Project: zachamid/mypy

def levenshteinIndex(str1, str2):
    """Return a similarity index derived from the Levenshtein distance.

    The index is ``1 - distance / max(len(str1), len(str2))``: 1.0 for
    identical strings, decreasing as more edits are needed.

    Two empty strings are identical, so they yield 1.0 (this case used to
    raise ZeroDivisionError).
    """
    longest = max(len(str1), len(str2))
    if longest == 0:
        return 1.0
    distance = pylev.levenshtein(str1, str2)
    return 1 - float(distance) / longest

Example #52

0

Show file

File: r_names_script.py Project: vazquezs123/subway-flow

#    print ts_terms[v], orig_ts[v]

f1.close()
f2.close()

bestmatches = {}
sawts = {}

# Compare each station in the turnstile data to each station in the google feed.
for t, turn_term in enumerate(turn_terms):
    for g, google_term in enumerate(google_terms):
        # Make the highest default so anything better will take its place.
        bestmatches.setdefault(turn_term, [len(turn_term)])

        # Compute distance with levenshtein and numbers. samewords compares
        # the turnstile term with the google term (it used to receive the
        # turnstile term twice, so it could not tell candidates apart).
        score = (int(distanceoffset(turn_term, google_term))
                 + int(pylev.levenshtein(google_term, turn_term))
                 + isinside(turn_term, google_term)
                 + samewords(turn_term, google_term))
        tinylist = [score, orig_google[g], google_term, orig_turn[t]]

        if tinylist[0] < bestmatches[turn_term][0]:
            bestmatches[turn_term] = tinylist

f3 = open('./matchtable.txt', 'w')

#for g in xrange(0, len(bestmatches)):
#    for x in xrange(0, len(g)):
#        print 
for g in bestmatches:
    f3.write(g + ",")
    for x in xrange(0, len(bestmatches[g])):
        if x == len(bestmatches[g])-1:
            f3.write(str(bestmatches[g][x]).strip())
        else:

Example #53

0

Show file

File: duplicates.py Project: ColCarroll/bugbug

def levenshtein_ratio(str_one, str_two):
  """
  Levenshtein ratio

  Computed as ``(total_len - distance) / total_len`` where ``total_len``
  is the combined length of both strings. Identical strings give 1.0.

  Two empty strings are treated as identical and give 1.0 (this case
  used to raise ZeroDivisionError).
  """
  str_len = len(str_one) + len(str_two)
  if str_len == 0:
    return 1.0
  return (str_len - pylev.levenshtein(str_one, str_two)) / float(str_len)

Example #54

0

Show file

File: a_names_script.py Project: vazquezs123/subway-flow

#    print ts_terms[v], orig_ts[v]

f1.close()
f2.close()

bestmatches = {}
sawts = {}

# For every gtfs station, look for the turnstile station with the lowest
# combined score.
for g, gtfs_term in enumerate(gtfs_terms):
    for t, ts_term in enumerate(ts_terms):
        # Seed with the name length so that any lower score replaces it.
        bestmatches.setdefault(gtfs_term, [len(gtfs_term)])

        # Score = offset heuristic + edit distance + containment and
        # shared-word heuristics.
        score = (int(distanceoffset(ts_term, gtfs_term))
                 + int(pylev.levenshtein(gtfs_term, ts_term))
                 + isinside(ts_term, gtfs_term)
                 + samewords(ts_term, gtfs_term))
        tinylist = [score, orig_gtfs[g], ts_term, orig_ts[t]]

        if tinylist[0] < bestmatches[gtfs_term][0]:
            bestmatches[gtfs_term] = tinylist

f3 = open('./matchtable2.txt', 'w')

print bestmatches
#for g in xrange(0, len(bestmatches)):
#    for x in xrange(0, len(g)):
#        print 
#for g in bestmatches:
#    f3.write(g + ",")
#    for x in xrange(0, len(bestmatches[g])):
#        if x == len(bestmatches[g])-1:
#            f3.write(str(bestmatches[g][x]).strip())

Example #55

0

Show file

File: tests.py Project: tauhid12/pylev

 def test_same(self):
     """Identical strings are expected to be zero edits apart."""
     measured = pylev.levenshtein('kitten', 'kitten')
     self.assertEqual(measured, 0)

Example #56

0

Show file

File: wordnet_wrapper.py Project: PWr-Projects-For-Courses/NLP

def get_sense(word, lang=u"pl_PL"):
    """Return the sense of ``word`` that is closest to it in edit distance.

    Senses are fetched with ``get_senses(word, lang)``; the one with the
    smallest Levenshtein distance to ``word`` is returned, or ``None`` when
    there are no senses. Each call bumps the shared ``counter`` and a
    progress line is printed on every 100th call.
    """
    senses = get_senses(word, lang)
    counter[0] += 1
    if counter[0] % 100 == 0:
        # Single formatted argument: prints the same text on Python 2 and 3.
        print("sense %s" % (counter[0],))
    if not senses:
        return None
    return min(senses, key=lambda x: pylev.levenshtein(x, word))

Example #57

0

Show file

File: tests.py Project: tauhid12/pylev

 def test_empty(self):
     """Two empty strings are expected to be zero edits apart."""
     measured = pylev.levenshtein('', '')
     self.assertEqual(measured, 0)

Example #58

0

Show file

File: a_names_script_v2.py Project: msr-ds3/subway-flow

def penalize(string1, string2):
    """Return a penalty of 3 for clearly dissimilar strings, else 0.

    The penalty applies when the Levenshtein distance between the two
    strings is larger than the length of the shorter one.
    """
    edit_distance = pylev.levenshtein(string1, string2)
    shorter_length = min(len(string1), len(string2))
    return 3 if edit_distance > shorter_length else 0

Example #59

0

Show file

File: gitter.py Project: freedesktop-unofficial-mirror/libreoffice__contrib__dev-tools

 def apply_lev(self, threshold):
     """Drop every proposal that is too far from the goal.

     A proposal is removed from ``self.proposals`` when its Levenshtein
     distance to ``self.goal`` is greater than ``threshold``. Nothing
     happens when there are no proposals.
     """
     if not self.proposals:
         return
     # Iterate over a snapshot of the keys: deleting entries while
     # iterating the live key view raises RuntimeError on Python 3.
     for value in list(self.proposals.keys()):
         if pylev.levenshtein(value, self.goal) > threshold:
             del self.proposals[value]