Code example #1
def maj_argus_zoe(db_zoe, df_argus):
    for el in db_zoe['id']:
        text_description = db_zoe[db_zoe['id'] == el]['description'].values[0]
        text_version = db_zoe[db_zoe['id'] == el]['version'].values[0]
        year_tmp = db_zoe[db_zoe['id'] == el]['year'].values[0]
        if str(year_tmp)[:2] == '20' and year_tmp != '':
            df_argus_tmp = df_argus[df_argus.year == year_tmp]
        else:
            df_argus_tmp = df_argus

        for el_argus in df_argus_tmp['version'].values:
            # Work on a copy so the description-based and version-based
            # 'select' columns do not overwrite each other.
            df_argus_sub_tmp = df_argus_tmp.copy()

            # Find the 'La centrale' Zoe version closest to the leboncoin
            # version (title/description).
            df_argus_tmp.loc[df_argus_tmp.version == el_argus,
                             'select'] = pylev.levenshtein(
                                 text_description, str(el_argus))
            df_argus_sub_tmp.loc[df_argus_sub_tmp.version == el_argus,
                                 'select'] = pylev.levenshtein(
                                     text_version, str(el_argus))
            # DataFrame.append was removed in pandas 2.0; concatenate instead
            # (assumes pandas is imported as pd).
            df_argus_fin_tmp = pd.concat([df_argus_tmp, df_argus_sub_tmp])
            distance_min = df_argus_fin_tmp['select'].min()

            argus_price = df_argus_fin_tmp[df_argus_fin_tmp['select'] ==
                                           distance_min]['argus'].values[0]
            db_zoe.loc[db_zoe.id == el, 'argus'] = float(argus_price)
            db_zoe.loc[db_zoe.id == el, 'official_version'] = el_argus
    return db_zoe
Code example #2
File: demultiplexer.py Project: lowks/SDST
def isIndexRevComp(indexfile, indexes, n=500000):
    """Determine if the indexes are reverse complemented or not
    
    :param indexfile: filename of the Fastq index file
    :param indexes: list or tuple of index strings
    :param n: integer number of reads to sample
    """
    print("HERE")
    ifile = Fastq(indexfile)
    ilength = len(indexes[0])
    print(ilength)
    indexreads = collections.defaultdict(int)
    for i in range(n):
        indexreads[ifile.next().sequence[:ilength]] += 1
    counts = {'normal': 0, 'revcomp': 0}
    for k, v in list(indexreads.items()):
        print(k, v)
        for i in indexes:
            if (pylev.levenshtein(k, i) <= 1):
                counts['normal'] += v
                continue
            if (pylev.levenshtein(k, revcomp(i)) <= 1):
                counts['revcomp'] += v
    if (counts['revcomp'] > counts['normal']):
        print('using revcomp')
    else:
        print('NOT revcomp')

    return (counts['revcomp'] > counts['normal'])
Code example #3
def equal_levenshtein(string1: str, string2: str, min_index: int):
	"""Recursive Levenshtein with early exit. Only looks at the next chunk of
	chars that could push the Levenshtein distance of this string pair above
	the required minimum. If it does not, the function calls itself on the
	rest of the strings. This way, we do not check the whole strings at once
	but work in sections: the minimum distance may already be exceeded after
	checking half of the string, saving valuable computing resources
	(Levenshtein has complexity O(m*n), where m and n are the lengths of the
	two compared strings). Returns the running distance, or False as soon as
	it exceeds min_index."""

	length = len(string1)
	if length <= min_index:
		return pylev.levenshtein(string1, string2)
	else:
		index = pylev.levenshtein(string1[0:min_index+1], string2[
				0:min_index+1])
		if index > min_index:
			return False
		else:
			sub_index = equal_levenshtein(string1[min_index+1:], string2[min_index+1:], min_index-index)
			if sub_index is False:
				return False
			else:
				index += sub_index
				if index > min_index:
					return False
				else:
					return index
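A minimal usage sketch of the early exit (our own; it assumes pylev is installed and the function above is in scope). Here min_index is the largest distance that still counts as a match:

import pylev

# Distance 1 stays within a budget of 2, so the true distance is returned.
print(equal_levenshtein("kitten", "mitten", 2))   # -> 1
# The first prefix check already exceeds a budget of 0: early False.
print(equal_levenshtein("kitten", "sitting", 0))  # -> False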
Code example #4
File: demultiplexer.py Project: seandavi/SDST
def isIndexRevComp(indexfile,indexes,n=500000):
    """Determine if the indexes are reverse complemented or not
    
    :param indexfile: filename of the Fastq index file
    :param indexes: list or tuple of index strings
    :param n: integer number of reads to sample
    """
    print("HERE")
    ifile = Fastq(indexfile)
    ilength=len(indexes[0])
    print(ilength)
    indexreads = collections.defaultdict(int)
    for i in range(n):
        indexreads[ifile.next().sequence[:ilength]]+=1
    counts = {'normal':0,
              'revcomp':0}
    for k,v in list(indexreads.items()):
        print(k,v)
        for i in indexes:
            if(pylev.levenshtein(k,i)<=1):
                counts['normal']+=v
                continue
            if(pylev.levenshtein(k,revcomp(i))<=1):
                counts['revcomp']+=v
    if(counts['revcomp']>counts['normal']):
        print('using revcomp')
    else:
        print('NOT revcomp')
        
    return(counts['revcomp']>counts['normal'])
Code example #5
    def lev_distance(q1, q2, process):
        if process:
            lev = float(levenshtein(' '.join(q1), ' '.join(q2)))
            return [lev / float(max(1, len(' '.join(q1)) + len(' '.join(q2)))),
                   lev / float(max(1, min(len(' '.join(q1)), len(' '.join(q2))))),
                   lev / float(max(1, max(len(' '.join(q1)), len(' '.join(q2)))))]

        else:
            lev = float(levenshtein(q1, q2))
            return [lev / float(max(1, len(q1) + len(q2))), 
                   lev / float(max(1, min(len(q1), len(q2)))),
                   lev / float(max(1, max(len(q1), len(q2))))]
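A quick check of the three normalizations (our own sketch, treating the method as a plain function; pylev's levenshtein is assumed imported as in the snippet). With process=True the inputs are token lists that get joined with spaces:

from pylev import levenshtein

q1 = ["how", "are", "you"]
q2 = ["how", "old", "are", "you"]
# Distance divided by the sum, min, and max of the two lengths.
print(lev_distance(q1, q2, process=True))
print(lev_distance("kitten", "sitting", process=False))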
Code example #6
File: application.py Project: hason/cleo
    def find_alternatives(self, name, collection):
        """
        Finds alternatives of name in collection

        @param name: The string
        @type name: str
        @param collection: The collection
        @type collection: list

        @return: A sorted list of similar strings
        """
        threshold = 1e3
        alternatives = {}

        collection_parts = {}
        for item in collection:
            collection_parts[item] = item.split(':')

        for i, subname in enumerate(name.split(':')):
            for collection_name, parts in collection_parts.items():
                exists = collection_name in alternatives
                if i not in parts and exists:
                    alternatives[collection_name] += threshold
                    continue
                elif i not in parts:
                    continue

                lev = levenshtein(subname, parts[i])
                if lev <= (len(subname) / 3) or parts[i].find(subname) != -1:
                    if exists:
                        alternatives[collection_name] = alternatives[
                            collection_name] + lev
                    else:
                        alternatives[collection_name] = lev
                elif exists:
                    alternatives[collection_name] += threshold

        for item in collection:
            lev = levenshtein(name, item)
            if lev <= (len(name) / 3) or item.find(name) != -1:
                if item in alternatives:
                    alternatives[item] = alternatives[item] - lev
                else:
                    alternatives[item] = lev

        alternatives = list(
            filter(lambda a: a[1] < 2 * threshold, alternatives.items()))
        alternatives = sorted(alternatives, key=lambda x: x[1])

        return list(map(lambda x: x[0], alternatives))
Code example #7
    def find_alternatives(self, name, collection):
        """
        Finds alternatives of name in collection

        :param name: The string
        :type name: str
        :param collection: The collection
        :type collection: list

        :return: A sorted list of similar strings
        """
        threshold = 1e3
        alternatives = {}

        collection_parts = {}
        for item in collection:
            collection_parts[item] = item.split(':')

        for i, subname in enumerate(name.split(':')):
            for collection_name, parts in collection_parts.items():
                exists = collection_name in alternatives
                if i not in parts and exists:
                    alternatives[collection_name] += threshold
                    continue
                elif i not in parts:
                    continue

                lev = levenshtein(subname, parts[i])
                if lev <= (len(subname) / 3) or parts[i].find(subname) != -1:
                    if exists:
                        alternatives[collection_name] += lev
                    else:
                        alternatives[collection_name] = lev
                elif exists:
                    alternatives[collection_name] += threshold

        for item in collection:
            lev = levenshtein(name, item)
            if lev <= (len(name) / 3) or item.find(name) != -1:
                if item in alternatives:
                    alternatives[item] = alternatives[item] - lev
                else:
                    alternatives[item] = lev

        alts = []
        for alt, score in alternatives.items():
            if score < 2 * threshold:
                alts.append(alt)

        return alts
Code example #8
File: application.py Project: hason/cleo
    def find_alternatives(self, name, collection):
        """
        Finds alternatives of name in collection

        @param name: The string
        @type name: str
        @param collection: The collection
        @type collection: list

        @return: A sorted list of similar strings
        """
        threshold = 1e3
        alternatives = {}

        collection_parts = {}
        for item in collection:
            collection_parts[item] = item.split(':')

        for i, subname in enumerate(name.split(':')):
            for collection_name, parts in collection_parts.items():
                exists = collection_name in alternatives
                if i not in parts and exists:
                    alternatives[collection_name] += threshold
                    continue
                elif i not in parts:
                    continue

                lev = levenshtein(subname, parts[i])
                if lev <= (len(subname) / 3) or parts[i].find(subname) != -1:
                    if exists:
                        alternatives[collection_name] = alternatives[collection_name] + lev
                    else:
                        alternatives[collection_name] = lev
                elif exists:
                    alternatives[collection_name] += threshold

        for item in collection:
            lev = levenshtein(name, item)
            if lev <= (len(name) / 3) or item.find(name) != -1:
                if item in alternatives:
                    alternatives[item] = alternatives[item] - lev
                else:
                    alternatives[item] = lev

        alternatives = list(filter(lambda a: a[1] < 2 * threshold, alternatives.items()))
        alternatives = sorted(alternatives, key=lambda x: x[1])

        return list(map(lambda x: x[0], alternatives))
Code example #9
 def suggest(self, key, distance=3):
     suggestions = set()
     for index in self.indices:
         for candidate in index:
             if levenshtein(key, candidate) <= distance:
                 suggestions.add(index[candidate])
     return suggestions
Code example #10
File: lines.py Project: nibrahim/lines
def patterns(f1, dist = 55, outlier=10):
    """Will partition elements into subsets. The elements of a subset will
    not have a Levenshtein distance of more than :dist: from the other
    members of the same subset
    """
    sets = []
    seen = set()
    for i in (x.strip() for x in f1 if x.strip()):
        if i in seen:
            continue
        s = set([i])
        seen.add(i)
        others = set(x.strip() for x in f1 if x.strip()) - seen
        for j in others:
            v = levenshtein(i, j) 
            if v <= dist:
                s.add(j)
                seen.add(j)
        sets.append(s)

    # Format for printing
    outlier /= 100.0
    retval = []
    total = len(list(x.strip() for x in f1 if x.strip()))
    for i in sets:
        l = float(len(i))
        if l/total < outlier:
            retval.append("{} elements - {}".format(len(i), i))
        else:
            retval.append("{} elements".format(len(i)))
    
    return retval
Code example #11
File: addr_filter.py Project: haizi-zh/ofashion
def filter_brand_name(v, threshold, logger=None):
    # Handles addresses that start with the brand name
    """
    Strip the brand name when the address begins with it.
    :param v: a (record, modified) pair
    :param threshold: threshold on the Levenshtein distance
    :return:
    """
    record, modified = v
    logger = logging.getLogger() if logger is None else logger
    record = record.copy()

    addr = record[u'addr_e_rev'] if record[
        u'addr_e_rev'] is not None else record[u'addr_e']
    if addr is None:
        return record, modified
    addr_list = tuple(temp.strip() for temp in addr.split(u','))
    if len(addr_list) <= 1:
        return record, modified
    str1 = addr_list[0].lower()
    str2 = record[u'brandname_e'].strip().lower()
    dist = pylev.levenshtein(str1, str2)
    if dist < threshold:
        logger.info(
            unicode.format(u'{0} is similar to {1}, idstores={2}', addr,
                           record[u'brandname_e'], record[u'idstores']))
        record[u'addr_e_rev'] = u', '.join(addr_list[1:])
        modified = True
    return record, modified
Code example #12
File: sqltree.py Project: wojcikk2903/ocr
def _search_subnode(word, node_id, tolerance):
    word_from_id_query = "SELECT word FROM words WHERE id = {0};"
    get_children_query = """SELECT e.child_id, e.dist, w.word FROM edges e
        INNER JOIN words w ON e.child_id = w.id WHERE parent_id = {0};"""
    node_word = _perform_selection(word_from_id_query.format(node_id))[0][0]
    dist = levenshtein(word, node_word)
    result = set()
    if dist <= tolerance:
        result.add(node_word)
    children = _perform_selection(get_children_query.format(node_id))
    for child_id, child_parent_dist, child_word in children:
        # BK-tree pruning: by the triangle inequality, only children whose
        # edge distance lies within tolerance of dist can contain matches.
        if dist - tolerance <= child_parent_dist <= dist + tolerance:
            result |= _search_subnode(word, child_id, tolerance)

    return result
Code example #13
def link_author(author):
    global author_list
    for lab_author in author_list.keys():
        if pylev.levenshtein(lab_author, author) / max(len(lab_author),
                                                       len(author)) < .30:
            return author_list[lab_author]
    return author
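A quick sketch of the fuzzy author linking (the author_list contents here are hypothetical; the function reads it as a module-level global):

import pylev

author_list = {"Jane A. Smith": "jsmith"}
print(link_author("Jane A Smith"))  # ratio 1/13 < .30 -> 'jsmith'
print(link_author("John Doe"))      # no close key -> returned unchanged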
Code example #14
File: command.py Project: pmav99/clikit
def find_similar_command_names(
        name, commands):  # type: (str, CommandCollection) -> List[str]
    """
    Finds names similar to a given command name.
    """
    threshold = 1e3
    distance_by_name = {}

    # Include aliases in the search
    actual_names = commands.get_names(True)

    for actual_name in actual_names:
        # Get Levenshtein distance between the input and each command name
        distance = levenshtein(name, actual_name)

        is_similar = distance <= len(name) / 3
        is_sub_string = actual_name.find(name) != -1

        if is_similar or is_sub_string:
            distance_by_name[actual_name] = distance

    # Only keep results with a distance below the threshold
    distance_by_name = {
        k: v
        for k, v in distance_by_name.items() if v < 2 * threshold
    }

    # Display results with shortest distance first
    suggested_names = []
    for k, v in sorted(distance_by_name.items(), key=lambda item: item[1]):
        if k not in suggested_names:
            suggested_names.append(k)

    return suggested_names
Code example #15
def cached(a, b):
    if 'data' not in cached.__dict__:
        cached.data = {}
    k = (a,b)
    if k not in cached.data:
        cached.data[k] = pylev.levenshtein(a, b)
    return cached.data[k]
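The standard library provides the same memoization pattern; a sketch of an equivalent (the name cached_lev is ours):

import functools

import pylev


@functools.lru_cache(maxsize=None)
def cached_lev(a, b):
    return pylev.levenshtein(a, b)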
Code example #16
File: _utils.py Project: meschac38700/cleo
def find_similar_names(name, names):  # type: (str, List[str]) -> List[str]
    """
    Finds names similar to a given command name.
    """
    threshold = 1e3
    distance_by_name = {}
    suggested_names = []

    for actual_name in names:
        # Get Levenshtein distance between the input and each command name
        distance = levenshtein(name, actual_name)

        is_similar = distance <= len(name) / 3
        is_sub_string = actual_name.find(name) != -1

        if is_similar or is_sub_string:
            distance_by_name[actual_name] = (
                distance,
                actual_name.find(name) if is_sub_string else float("inf"),
            )

    # Only keep results with a distance below the threshold
    distance_by_name = {
        k: v
        for k, v in distance_by_name.items() if v[0] < 2 * threshold
    }

    # Display results with shortest distance first
    for k, v in sorted(distance_by_name.items(),
                       key=lambda i: (i[1][0], i[1][1])):
        if k not in suggested_names:
            suggested_names.append(k)

    return suggested_names
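A quick check of the ranking with a hypothetical command list (assuming `from pylev import levenshtein` as in the snippet): substring hits are kept even when the distance test fails, and results are ordered by distance first, then by match position:

commands = ["install", "uninstall", "update", "list", "show"]
print(find_similar_names("instal", commands))  # -> ['install', 'uninstall']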
Code example #17
def patterns(f1, dist=55, outlier=10):
    """Will partition elements into subsets. The elements of a subset will
    not have a Levenshtein distance of more than :dist: from the other
    members of the same subset
    """
    sets = []
    seen = set()
    for i in (x.strip() for x in f1 if x.strip()):
        if i in seen:
            continue
        s = set([i])
        seen.add(i)
        others = set(x.strip() for x in f1 if x.strip()) - seen
        for j in others:
            v = levenshtein(i, j)
            if v <= dist:
                s.add(j)
                seen.add(j)
        sets.append(s)

    # Format for printing
    outlier /= 100.0
    retval = []
    total = len(list(x.strip() for x in f1 if x.strip()))
    for i in sets:
        l = float(len(i))
        if l / total < outlier:
            retval.append("{} elements - {}".format(len(i), i))
        else:
            retval.append("{} elements".format(len(i)))

    return retval
Code example #18
File: load_tools.py Project: IFB-ElixirFr/ifbcat
 def get_best_match(self, response, tool_name, max_edition_percentage: Optional[float] = 0.1):
     # biotoolsID = response['list'][0]['biotoolsID']
     biotools_item = None
     normalized_name = self.normalize(tool_name)
     min_edit = len(tool_name) * 10000
     for item in response['list']:
         choice_edit = pylev.levenshtein(normalized_name, self.normalize(item['biotoolsID']))
         if choice_edit < min_edit:
             min_edit = choice_edit
             biotools_item = item
         choice_edit = pylev.levenshtein(normalized_name, self.normalize(item['name']))
         if choice_edit < min_edit:
             min_edit = choice_edit
             biotools_item = item
     if max_edition_percentage is not None and min_edit > len(tool_name) * max_edition_percentage:
         return None
     return biotools_item
Code example #19
def levenshtein_worker(queue, results):
    while True:
        work = queue.get()
        (hash1, hash2, sector1, sector2, score) = work

        distance = pylev.levenshtein(sector1, sector2)
        results.put_nowait((hash1, hash2, distance, score))

        queue.task_done()
Code example #20
def _is_duplicate(a: str, b: str) -> bool:
    """Determine whether two stacktraces are for the same error."""
    la = len(a)
    lb = len(b)
    diff = abs(la - lb)
    if diff > 50:
        return False
    denom = min(la, lb) + diff / 2
    ratio = levenshtein(a.casefold(), b.casefold()) / denom
    return ratio < 0.1
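For example (strings are our own; `from pylev import levenshtein` is assumed, as in the snippet), two traces that differ only in a line number are folded together:

trace_a = "Traceback: ValueError in parse() at line 120"
trace_b = "Traceback: ValueError in parse() at line 121"
print(_is_duplicate(trace_a, trace_b))  # distance 1 over 44 chars -> True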
Code example #21
File: tvdb_api.py Project: jdintruff/tvdb_api
 def selectSeries(self, series, allSeries):
     """The results the TVDB returns are sometimes poorly ranked and the first result is often not what we're looking
     for. This function attempts to find the closest match between a series named in the results and the user's
     search query by calculating the Levenshtein edit distance between the search query (series) and each of the
     results (allSeries) in order to find the result that most precisely matches our query
     """
     distances = []
     for show in allSeries:
         distances.append(pylev.levenshtein(series, show["seriesName"]))
     return allSeries[distances.index(min(distances))]
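A sketch of the selection on hypothetical TVDB-style results; since self is unused in the method body, None can stand in for the instance when calling it as a plain function:

all_series = [{"seriesName": "Doctor Who (2005)"},
              {"seriesName": "Doctor Who Confidential"},
              {"seriesName": "Doctor Who"}]
print(selectSeries(None, "Doctor Who", all_series))
# -> {'seriesName': 'Doctor Who'} (edit distance 0)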
Code example #22
    def evaluate_individual_sentence(self, original_sentence,
                                     paraphrase) -> Dict:

        original_sentence_tokens = nltk.word_tokenize(
            normalize_spaces_remove_urls(original_sentence))
        paraphrase_tokens = nltk.word_tokenize(
            normalize_spaces_remove_urls(paraphrase))

        # Bleu score
        bleu_score = nltk.translate.bleu_score.sentence_bleu(
            [normalize_spaces_remove_urls(original_sentence)],
            normalize_spaces_remove_urls(paraphrase))

        # Sentence embedding cosine similarity
        emb1 = self.model.encode(original_sentence)
        emb2 = self.model.encode(paraphrase)
        cos_sim = util.pytorch_cos_sim(emb1, emb2)

        # Levenshtein distance
        edit_distance = pylev.levenshtein(original_sentence_tokens,
                                          paraphrase_tokens)
        length = max(len(original_sentence_tokens), len(paraphrase_tokens))
        normalized_edit_distance = (length - edit_distance) / length

        # Jaccard
        jaccard = nltk.jaccard_distance(set(original_sentence_tokens),
                                        set(paraphrase_tokens))

        # Jaccard * cosine similarity
        jaccard_embedding_factor = jaccard * cos_sim.item()

        metrics = {
            'original_sentence': original_sentence,
            'paraphrase': paraphrase,
            'bleu_score': bleu_score,
            'normalized_original_sentence':
                normalize_spaces_remove_urls(original_sentence),
            'normalized_paraphrase': normalize_spaces_remove_urls(paraphrase),
            'embedding_cosine_similarity': cos_sim.item(),
            'edit_distance': edit_distance,
            'normalized_edit_distance': normalized_edit_distance,
            'jaccard': jaccard,
            'jaccard_embedding_factor': jaccard_embedding_factor,
        }

        return metrics
Code example #23
def planet_constellation(update, context):
    translator = Translator()
    text = update.message.text
    text = text.split()
    min_distance = 1000
    best_planet_choice = ''
    user_planet_in_text = ''
    for cur_word in text:
        for cur_planet in planet_list:
            distance = pylev.levenshtein(cur_word, cur_planet)
            if distance < min_distance:
                min_distance = distance
                best_planet_choice = cur_planet
                user_planet_in_text = cur_word
    full_name = find_constellation(best_planet_choice)
    full_name_ru = translator.translate(full_name, dest='russian', src='en').text
    if user_planet_in_text.upper() != best_planet_choice.upper():
        ans_text = f'Did you mean {best_planet_choice}? \n {full_name} / {full_name_ru}'
    else:
        ans_text = f'{full_name} / {full_name_ru}'
    update.message.reply_text(ans_text)
Code example #24
def levenshtein(a, b):
    len_a = len(a)
    len_b = len(b)
    distance = pylev.levenshtein(a, b)
    try:
        maxLength = max(len_a, len_b)
        result = maxLength - distance
        percentage = (result / maxLength) * 100
        return percentage
    except ZeroDivisionError:
        return 0
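A worked check using a pair from the pylev tests: 'confide' vs 'deceit' has distance 6 and the longer length is 7, so the similarity is (7 - 6) / 7 * 100:

print(levenshtein("confide", "deceit"))  # -> 14.285714...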
Code example #25
def compare(a, b):
    results = {
        'editdistance': editdistance.eval(a, b),
        'pylev': pylev.levenshtein(a, b),
        'python-Levenshtein': Levenshtein.distance(a, b),
        'pyxdameraulevenshtein':
            pyxdameraulevenshtein.damerau_levenshtein_distance(a, b),
    }
    return results
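A quick check (assuming all four packages are installed); the libraries agree on the plain edit distance here, since no transpositions are involved:

print(compare("kitten", "sitting"))
# {'editdistance': 3, 'pylev': 3, 'python-Levenshtein': 3,
#  'pyxdameraulevenshtein': 3}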
Code example #26
File: bktree.py Project: eugene-eeo/algoaday
    def search(self, query, threshold):
        d = levenshtein(self.value, query)
        if d <= threshold:
            yield self.value

        lo = d - threshold
        hi = d + threshold

        for dist, node in self.children.items():
            if lo <= dist <= hi:
                for rv in node.search(query, threshold):
                    yield rv
Code example #27
def similarity(s1, s2):
    # Length considerations on/off
    if NO_LEN == 1:
        trunc_len = min(len(s1), len(s2), TRUNC)
    else:
        trunc_len = TRUNC

    # Truncate
    s1 = s1[:trunc_len]
    s2 = s2[:trunc_len]

    # Return the levenshtein distance between the two modified strings
    return pylev.levenshtein(s1, s2)
Code example #28
def compare_to_gold_labels(entity: str, gold_entities: List[str],
                           para_id: int) -> str:

    if entity in gold_entities:
        return entity

    for gold_ent in gold_entities:
        if (pylev.levenshtein(entity, gold_ent) < 3):
            return gold_ent

    print(para_id)
    print(f"Cannot find {entity}")
    return entity
Code example #29
File: main.py Project: johnl79/word_similarity
def similarity(s1, s2):
  # Length considerations on/off
  if NO_LEN == 1:
    trunc_len = min(len(s1), len(s2), TRUNC)
  else:
    trunc_len = TRUNC

  # Truncate
  s1 = s1[:trunc_len]
  s2 = s2[:trunc_len]

  # Return the levenshtein distance between the two modified strings
  return pylev.levenshtein(s1, s2)
Code example #30
File: task_correction.py Project: zachamid/mypy
def similarity_index_per_item(item1, item2):
	if type(item1)==str and type(item2)==str:
		return pylev.levenshtein(item1,item2)
	if ((type(item1)==int and type(item2)==int)
			or (type(item1)==float and type(item2)==float) 
			or (type(item1)==long and type(item2)==long)):
		return (abs((float)(item1 - item2)))/float(max([item1+1,item2+1]))
	if type(item1)==bool and type(item2)==bool:
		if item1 == item2:
			return 0
		else:
			return 1
	if (type(item1)==dict and type(item2)==dict) or (type(item1)==list and type(item2)==list):
		return 1-jaccard(item1,item2)
Code example #31
def _iter_fuzzy_entries(catalog: Catalog,
                        search_key: Key) -> typ.Iterable[Entry]:
    for key in _iter_candidate_keys(catalog, search_key):
        msg_text_dist = pylev.levenshtein(key.msg_text, search_key.msg_text)
        src_line_dist = pylev.levenshtein(key.source_line,
                                          search_key.source_line)

        if msg_text_dist > FUZZY_MATCH_MAX_EDIT_DISTANCE_ABS:
            continue
        if src_line_dist > FUZZY_MATCH_MAX_EDIT_DISTANCE_ABS:
            continue

        msg_text_dist_pct = 100 * msg_text_dist / max(len(key.msg_text),
                                                      len(search_key.msg_text))
        src_line_dist_pct = (
            100 * src_line_dist /
            max(len(key.source_line), len(search_key.source_line)))

        if msg_text_dist_pct > FUZZY_MATCH_MAX_EDIT_DISTANCE_PCT:
            continue
        if src_line_dist_pct > FUZZY_MATCH_MAX_EDIT_DISTANCE_PCT:
            continue

        yield catalog[key]
Code example #32
def main():
    if len(sys.argv) != 2:
        exit(f"Usage: {sys.argv[0]} filename")
    filename = sys.argv[1]
    outfile = 'out.txt'

    rows = []
    with open(filename) as fh:
        for row in fh:
            rows.append(row.rstrip("\n"))
    with open(outfile, 'w') as fh:
        for a in rows:
            for b in rows:
                dist = pylev.levenshtein(a, b)
                fh.write(f"{a},{b},{dist}\n")
Code example #33
	def transform(self, question_list):
		q1_list = question_list[0]
		q2_list = question_list[1]

		lev_distance_strings = [[a, b] for a, b in zip(q1_list, q2_list)]

		lev_dist_array = np.array([
			float(levenshtein(pair[0], pair[1])) /
			(float(sum(x.count('') for x in pair[0])) +
			 float(sum(x.count('') for x in pair[1])))
			for pair in lev_distance_strings
		])

		return lev_dist_array.reshape(len(lev_dist_array), 1)
Code example #34
File: evalQA.py Project: ytyz1307zzh/KOALA
def compare_to_gold_labels(participants, system_participants):
    ret = []
    for p in participants:
        p = p.lower()
        found = False
        if p in system_participants:
            ret.append(p)
            continue
        for g in system_participants:
            if (pylev.levenshtein(p,g) < 3):
                #print (p, "===", g)
                ret.append(g)
                found = True
        if not found:
            print(f"Cannot find {p}")
    return ret
Code example #35
File: __init__.py Project: charliestrawn/thundersnow
def get_similar_members():
    allowed_distance = int(request.args.get('distance', 4))  # avoids int(None) when the param is absent
    members = Member.query.all()
    similar = {'a': [], 'b': []}
    for left, right in itertools.combinations(members, 2):
        distance = pylev.levenshtein(left.name, right.name)
        if distance < allowed_distance:
            left_json = left.serialize
            left_json['pmts'] = len(left.payments)
            similar['a'].append(left_json)

            right_json = right.serialize
            right_json['pmts'] = len(right.payments)
            similar['b'].append(right_json)

    return render_template('similar.html', similar_members=similar)
Code example #36
File: cityres.py Project: jopela/cityres
def choose_best(city, uris):
    """
    Chooses the string that most closely resemble to the city name.

    EXAMPLE
    =======
    >>> choose_best('Montreal',['http://dbpedia.org/resource/Montreal','http://dbpedia.org/resource/Westmount_(Montreal)'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('Montreal',['http://dbpedia.org/resource/Mountreal','http://dbpedia.org/resource/Moscow','http://dbpedia.org/resource/Montreal'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('New York',['http://dbpedia.org/resource/New_York_City','http://dbpedia.org/Harlem'])
    'http://dbpedia.org/resource/New_York_City'

    """

    # The strategy is to use the longest common subsequence first and
    # take the uri whose string has the longest one.
    # If there are ties, break the tie by computing the levenshtein and
    # taking the uri that has the smallest.

    # this creates a kind of band-pass filter, so to speak.

    distances = [(strdist.longest_sub_len(city, uri), uri) for uri in uris]

    # sort them by sub sequence length
    distances.sort()

    result_subseq_length = distances[-1][0]

    #print("distances",distances)

    ties = [e for e in distances if e[0] == result_subseq_length]

    #print("ties")

    # break the tie with the levenshtein distance.
    if len(ties) > 1:
        tie_distances = [(pylev.levenshtein(city, t[1]), t[1]) for t in ties]
        tie_distances.sort()
        result = tie_distances[0][1]
    else:
        result = distances[-1][1]

    return result
Code example #37
File: cityres.py Project: jopela/cityres
def choose_best(city, uris):
    """
    Chooses the string that most closely resemble to the city name.

    EXAMPLE
    =======
    >>> choose_best('Montreal',['http://dbpedia.org/resource/Montreal','http://dbpedia.org/resource/Westmount_(Montreal)'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('Montreal',['http://dbpedia.org/resource/Mountreal','http://dbpedia.org/resource/Moscow','http://dbpedia.org/resource/Montreal'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('New York',['http://dbpedia.org/resource/New_York_City','http://dbpedia.org/Harlem'])
    'http://dbpedia.org/resource/New_York_City'

    """

    # The strategy is to use the longest common subsequence first and
    # take the uri whose string has the longest one.
    # If there are ties, break the tie by computing the levenshtein and
    # taking the uri that has the smallest.

    # this creates a kind of band-pass filter, so to speak.

    distances = [(strdist.longest_sub_len(city, uri), uri) for uri in uris]

    # sort them by sub sequence length
    distances.sort()

    result_subseq_length = distances[-1][0]

    #print("distances",distances)

    ties = [e for e in distances if e[0] == result_subseq_length]

    #print("ties")

    # break the tie with the levenshtein distance.
    if len(ties) > 1:
        tie_distances = [(pylev.levenshtein(city, t[1]),t[1]) for t in ties]
        tie_distances.sort()
        result = tie_distances[0][1]
    else:
        result = distances[-1][1]

    return result
Code example #38
def score_domain(provided_ioc):
    """Return the scores of the provided domain."""
    score = 0

    for suspicious_tld in suspicious["tlds"]:
        if provided_ioc.endswith(suspicious_tld):
            score += 20

    try:
        res = tld.get_tld(provided_ioc,
                          as_object=True,
                          fail_silently=True,
                          fix_protocol=True)
        domain = ".".join([res.subdomain, res.domain])
    except Exception:
        domain = provided_ioc

    score += int(round(entropy.shannon_entropy(domain) * 50))
    domain = confusables.unconfuse(domain)
    words_in_domain = re.split(r"\W+", domain)

    if domain.startswith("*."):
        domain = domain[2:]

        if words_in_domain[0] in ["com", "net", "org"]:
            score += 10

    for word in suspicious["keywords"]:
        if word in domain:
            score += suspicious["keywords"][word]

    for key in [k for k, v in suspicious["keywords"].items() if v >= 70]:
        for word in [
                w for w in words_in_domain
                if w not in ["email", "mail", "cloud"]
        ]:
            if pylev.levenshtein(str(word), str(key)) == 1:
                score += 70

    if "xn--" not in domain and domain.count("-") >= 4:
        score += domain.count("-") * 3

    if domain.count(".") >= 3:
        score += domain.count(".") * 3
    return score
Code example #39
def uniqify(corpus, occ_dict, distance):
	# augment with value counts (which one to keep)
	words = []
	while corpus:
		center = corpus[0]
		related = [word for word in corpus if pylev.levenshtein(center, word) <= distance]
		tuples = [(word, occ_dict[word.title()]) for word in related]
		sorted_ts = sorted(tuples, key=lambda x: x[1], reverse=True)
		print(sorted_ts)
		winner = sorted_ts[0][0]
		print(corpus)
		for t in sorted_ts:
			print(t)
			corpus.remove(t[0])
		# keep taluk with highest number of occurrences
		# create dict by taking difference between corpae
		words.append(winner)
	return [x.title() for x in words]
Code example #40
File: sqltree.py Project: wojcikk2903/ocr
def _connect_word_to_tree(word):
    last_id_query = "SELECT MAX(id) AS max_id FROM words;"
    word_from_id_query = "SELECT word FROM words WHERE id = {0};"
    find_child_at_dist_query = """SELECT child_id FROM edges 
        WHERE parent_id = {0} AND dist = {1};"""
    connect_to_tree_query = """INSERT INTO edges (parent_id, child_id, dist) 
        VALUES ({0}, {1}, {2});"""
    root_id = 1
    child_id = ['initial_id']
    node_id = root_id
    word_id = _perform_selection(last_id_query)[0][0]

    while len(child_id) > 0:
        node_word = _perform_selection(word_from_id_query.format(node_id))[0][0]
        dist = levenshtein(word, node_word)
        child_id = _perform_selection(find_child_at_dist_query.format(node_id, dist))
        if len(child_id) > 0:
            node_id = child_id[0][0]

    _perform_insertion(connect_to_tree_query.format(node_id, word_id, dist))
Code example #41
File: bktree.py Project: eugene-eeo/algoaday
 def insert(self, string):
     dist = levenshtein(string, self.value)
     if dist not in self.children:
         self.children[dist] = Node(string)
         return
     self.children[dist].insert(string)
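Examples #26 and #41 are the search and insert methods of the same BK-tree node; a minimal self-contained sketch of how they fit together (the Node class layout is inferred from the two snippets):

import pylev


class Node:
    def __init__(self, value):
        self.value = value
        self.children = {}  # edit distance -> child Node

    def insert(self, string):
        dist = pylev.levenshtein(string, self.value)
        if dist not in self.children:
            self.children[dist] = Node(string)
            return
        self.children[dist].insert(string)

    def search(self, query, threshold):
        d = pylev.levenshtein(self.value, query)
        if d <= threshold:
            yield self.value
        # Triangle inequality: only subtrees whose edge distance lies in
        # [d - threshold, d + threshold] can contain matches.
        for dist, node in self.children.items():
            if d - threshold <= dist <= d + threshold:
                yield from node.search(query, threshold)


root = Node("book")
for word in ("books", "boo", "cake", "cape", "cart"):
    root.insert(word)
print(sorted(root.search("bo", 2)))  # -> ['boo', 'book']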
Code example #42
File: manage.py Project: orvsd/orvsd_central
def assoc_sites_districts():
    """
    Associates orphan sites with districts either through fuzzy matching
    or creating schools as intermediaries between sites and districts.
    """

    with current_app.app_context():
        g.db_session = create_db_session()
        from orvsd_central.models import Site, School, District
        from orvsd_central.util import create_school_by_district_site
        from collections import namedtuple
        import pylev

        orphan_sites = set(Site.query.filter_by(school_id=None).all())
        assigned_sites = set()
        schools = School.query.all()
        num_matches = 0
        match_tuple = namedtuple('match', ['id', 'name'])
        match = None

        # If a site belongs to more than 1 school, just default to creating
        # by a district.

        print 'School Matching:'
        for site in orphan_sites:
            for school in schools:
                # Check for names as subsets or <=3 levenshtein distance.
                if (site.name in school.name or school.name in site.name or
                        pylev.levenshtein(site.name, school.name) <= 3):
                    num_matches += 1
                    match = match_tuple(id=school.id, name=school.name)
            if num_matches == 1:
                print ('School: {0} and Site: {1} matched.'
                       .format(match.name, site.name))
                site.school_id = match.id
                assigned_sites.add(site)
            num_matches = 0
            match = None

        g.db_session.commit()
        orphan_sites = orphan_sites - assigned_sites

        print '\nDistrict Matching: '

        # Districts next, with anything that's left.
        districts = District.query.all()
        for site in orphan_sites:
            for district in districts:
                if site.name in district.name or district.name in site.name:
                    print ('District: {0} or Site: {1} contained in the other.'
                           .format(district.name, site.name))
                    school = create_school_by_district_site(district, site)
                    site.school_id = school.id
                    assigned_sites.add(site)
                    break

                # Use Levenshtein Distance for fuzzy matching
                elif pylev.levenshtein(site.name, district.name) <= 3:
                    print ('District: {0} and Site: {1} fuzzy matched.'
                           .format(district.name, site.name))
                    school = create_school_by_district_site(district, site)
                    site.school_id = school.id
                    assigned_sites.add(site)
                    break

        g.db_session.commit()
        orphan_sites = orphan_sites - assigned_sites

        print '\nRemaining Sites: '
        print '\t' + '\n\t'.join((site.name for site in orphan_sites))
Code example #43
File: tests.py Project: tauhid12/pylev
 def test_painful(self):
     # This is much faster than the above.
     self.assertEqual(pylev.levenshtein('CUNsperrICY', 'conspiracy'), 8)
Code example #44
    a = g.replace('"', '').replace("/", ' ').replace("-", " ").strip()
    if a not in SIRS:
        temp1 = one_ave(a.lower(), pattern, "av")
        gtfs_terms.append(temp1)
        orig_gtfs.append(g)

f2.close()

bestmatches = {} #Where we'll store matches.
r_best = {} #Best reverse match seen so far for each gtfs term.

#Compare each station in the turnstile data to each station in the gtfs feed. 
for t in xrange(0, len(turn_terms)):
    for g in xrange(0, len(gtfs_terms)):
       
        #Compute distance:
        tinylist = [int(distanceoffset(turn_terms[t], gtfs_terms[g])) +
                    int(pylev.levenshtein(gtfs_terms[g], turn_terms[t])) +
                    int(isinside(turn_terms[t], gtfs_terms[g])) +
                    int(samewords(turn_terms[t], gtfs_terms[g])) +
                    int(penalize(gtfs_terms[g], turn_terms[t])),
                    orig_gtfs[g], gtfs_terms[g], orig_turn[t]]

        #Make the highest default so anything better will take its place.
        bestmatches.setdefault(turn_terms[t], [len(turn_terms[t])])
        r_best.setdefault(g, [len(gtfs_terms[g])])

        #Check against previous, update if it's a better match for both words
        #than the things they matched before.
        if (tinylist[0] < bestmatches[turn_terms[t]][0] and
                tinylist[0] < r_best[g][0]):
            bestmatches[turn_terms[t]] = tinylist
            r_best[g] = [tinylist[0], turn_terms[t]]
#            print turn_terms[t], tinylist
#            if "av n" in turn_terms[t]:
#                print bestmatches[turn_terms[t]], tinylist

f3 = open('./matchtable.txt', 'w') #Now stick it all in a nice file.
Code example #45
File: tester.py Project: iamyaro/wikipy
import pylev
import editdistance
import distance



print pylev.levenshtein('abc', '123abc567')
#print editdistance.eval('abc', 'abc')
Code example #46
File: tests.py Project: tauhid12/pylev
 def test_long(self):
     self.assertEqual(pylev.levenshtein('confide', 'deceit'), 6)
Code example #47
File: tests.py Project: tauhid12/pylev
 def test_classic(self):
     self.assertEqual(pylev.levenshtein('kitten', 'sitting'), 3)
Code example #48
File: graph_x.py Project: vazquezs123/subway-flow
B.add_node("Dummy1", demand = 1)
B.add_node("Dummy2", demand = 1)

turn_terms.append("Dummy1")
turn_terms.append("Dummy2")

f2.close()

bestmatches = {}
sawts = {}


for t in turn_terms:		
    for g in google_terms:      
         #Compute distance with levenshtein and numbers
        distance = int(pylev.levenshtein(g,t)) + int(distanceoffset(t, g)) + int(isinside(t, g)) 
# int(samewords(t, g))
        #turnstrings = orig_google[g], google_terms[g], orig_turn[t]
        B.add_edge(g, t, weight = distance)
        #if distance < 3:
        #    print "google = ", g, "turn = ", t, "distance = ",  distance
#print B.number_of_edges()

p_match = []
c = list(B.edges()) #(< probably don't need)
for (n1, n2) in c:
    if B.edge[n1][n2]['weight'] <= 0:
        B.remove_edge(n1, n2)
        p_match.append((n1,n2))
    #otherwise print out top five matches
Code example #49
File: edit.py Project: vazquezs123/subway-flow
f1.close()
f2.close()


perfectmatches = {}
bestmatches = {}
nextbestmatches = {}

#Compare every station in the turnstile feed with every station in the google feed. 

for g_station in gtfs_terms:		
    for ts_station in ts_terms:	
        turnstile = wordnospaces(ts_station)
        google = wordnospaces(g_station)
        if pylev.levenshtein(turnstile, google) == 0: 	#If the distance is 0, we have a perfect match!
            tinylist1 = [0, ts_station]
            perfectmatches[g_station] = tinylist1
            break
        else:
            bestmatches.setdefault(g_station, [len(g_station)])
            nextbestmatches.setdefault(g_station, [len(g_station)])
            tinylist = [int(distanceoffset(ts_station, g_station)) + int(pylev.levenshtein(turnstile, google)), ts_station]

            if tinylist[0] < bestmatches[g_station][0]:
                nextbestmatches[g_station] = bestmatches[g_station]
                bestmatches[g_station] = tinylist


f3 = open('./matchtable.txt', 'w')
for p in perfectmatches:
Code example #50
File: guide_target_detector.py Project: yfu/tools
                print "DEG read: " + a + " " + b
                print rev_comp(a+b)
                print "SRA"
            # Given a seed, for every seed-matching pair of DEG and SRA, do pairwise alignment 
            for k in seeds[i]:
                if DEBUG:
                    print k.seq
                l = len(k.seq)
                # s1: seq from around DEG sites
                ab = a + b
                s1 = rev_comp(ab)[0: l]
                # s2: seq from SRA (excluding the first pos, i.e. the 1st position does not matter)
                s2 = k.seq
                # print "s1: " + s1
                # print "s2: " + s2
                ed = pylev.levenshtein(s1[10:], s2[10:])
                if ed <= 100:
                    if DEBUG:
                        print "DEG:" + a + b
                        print "s1 from DEG: " + s1[0] + "|" + s1[1:10] + "|" + s1[10:]
                        print "s2 from SRA: " + s2[0] + "|" + s2[1:10] + "|" + s2[10:]
                        print "ed_x_pos1: " + str(ed)
                    # Only do alignment for the rest of the seq (i.e. ignore the 1st position and the seed region ( 2-10, or [1, 10) )
                    # since they are supposed to be perfectly matched
                    # alignments = pairwise2.align.globalxx(s1[10:], s2[10:])
                    # emphasize g10-g21
                    # m: A match score is the score of identical chars, otherwise mismatch score.

                    # s: Same open and extend gap penalties for both sequences.
                    # d: The sequences have different open and extend gap penalties.
                    # alignments = pairwise2.align.globalms(s1[10:], s2[10:], 2, -1, -4, -1)
Code example #51
File: task_correction.py Project: zachamid/mypy
def levenshteinIndex(str1,str2):
	distance = pylev.levenshtein(str1,str2)
	return (1-(float)(distance)/max([len(str1),len(str2)]))
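A worked check: for 'kitten' vs 'sitting' the distance is 3 and the longer length is 7, giving 1 - 3/7:

print(levenshteinIndex("kitten", "sitting"))  # -> 0.5714...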
Code example #52
#    print ts_terms[v], orig_ts[v]

f1.close()
f2.close()

bestmatches = {}
sawts = {}

#Compare each station in the turnstile data to each station in the google feed. 
for t in xrange(0, len(turn_terms)):		
    for g in xrange(0, len(google_terms)):
        #Make the highest default so anything better will take its place.
        bestmatches.setdefault(turn_terms[t], [len(turn_terms[t])])

        #Compute distance with levenshtein and numbers
        tinylist = [int(distanceoffset(turn_terms[t], google_terms[g])) +
                    int(pylev.levenshtein(google_terms[g], turn_terms[t])) +
                    isinside(turn_terms[t], google_terms[g]) +
                    samewords(turn_terms[t], google_terms[g]),
                    orig_google[g], google_terms[g], orig_turn[t]]

        if tinylist[0] < bestmatches[turn_terms[t]][0]:
            bestmatches[turn_terms[t]] = tinylist

f3 = open('./matchtable.txt', 'w')

#for g in xrange(0, len(bestmatches)):
#    for x in xrange(0, len(g)):
#        print 
for g in bestmatches:
    f3.write(g + ",")
    for x in xrange(0, len(bestmatches[g])):
        if x == len(bestmatches[g])-1:
            f3.write(str(bestmatches[g][x]).strip())
        else:
Code example #53
File: duplicates.py Project: ColCarroll/bugbug
def levenshtein_ratio(str_one, str_two):
  """
  Levenshtein ratio
  """
  str_len = len(str_one + str_two)
  return (str_len - pylev.levenshtein(str_one, str_two)) / float(str_len)
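A worked check: the combined length of 'kitten' and 'sitting' is 13 and the distance is 3, so the ratio is (13 - 3) / 13:

print(levenshtein_ratio("kitten", "sitting"))  # -> 0.7692...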
Code example #54
#    print ts_terms[v], orig_ts[v]

f1.close()
f2.close()

bestmatches = {}
sawts = {}

#Compare each station in the turnstile data to each station in the google feed. 
for g in xrange(0, len(gtfs_terms)):		
    for t in xrange(0, len(ts_terms)):
        #Make the highest default so anything better will take its place.
        bestmatches.setdefault(gtfs_terms[g], [len(gtfs_terms[g])])

        #Compute distance with levenshtein and numbers
        tinylist = [int(distanceoffset(ts_terms[t], gtfs_terms[g])) +
                    int(pylev.levenshtein(gtfs_terms[g], ts_terms[t])) +
                    isinside(ts_terms[t], gtfs_terms[g]) +
                    samewords(ts_terms[t], gtfs_terms[g]),
                    orig_gtfs[g], ts_terms[t], orig_ts[t]]

        if tinylist[0] < bestmatches[gtfs_terms[g]][0]:
            bestmatches[gtfs_terms[g]] = tinylist

f3 = open('./matchtable2.txt', 'w')

print bestmatches
#for g in xrange(0, len(bestmatches)):
#    for x in xrange(0, len(g)):
#        print 
#for g in bestmatches:
#    f3.write(g + ",")
#    for x in xrange(0, len(bestmatches[g])):
#        if x == len(bestmatches[g])-1:
#            f3.write(str(bestmatches[g][x]).strip())
Code example #55
File: tests.py Project: tauhid12/pylev
 def test_same(self):
     self.assertEqual(pylev.levenshtein('kitten', 'kitten'), 0)
Code example #56
def get_sense(word, lang=u"pl_PL"):
    senses = get_senses(word, lang)
    counter[0] += 1
    if counter[0] % 100 == 0:
        print "sense", counter[0]
    return min(senses, key=lambda x: pylev.levenshtein(x, word)) if senses else None
Code example #57
File: tests.py Project: tauhid12/pylev
 def test_empty(self):
     self.assertEqual(pylev.levenshtein('', ''), 0)
Code example #58
def penalize(string1, string2):
    if pylev.levenshtein(string1, string2) > min(len(string1), len(string2)):
        return 3
    return 0
Code example #59
 def apply_lev(self, threshold):
     if self.proposals:
         # Take a snapshot of the keys first: deleting entries while
         # iterating the live dict view raises a RuntimeError in Python 3.
         for value in list(self.proposals.keys()):
             if pylev.levenshtein(value, self.goal) > threshold:
                 del self.proposals[value]