Code Example #1
    def is_response_close_enough_using_leveinstein_with_text_separation(
            self, response, expected_response):
        acceptable_levenshtein_threshold = 0.5

        if len(response) < len(expected_response):
            expected_response = self._return_response_in_same_length(
                response, expected_response)
            for res in expected_response:
                if stringdist.levenshtein(
                        response, res) / 33 < acceptable_levenshtein_threshold:
                    return True

        elif len(response) > len(expected_response):
            response = self._return_response_in_same_length(
                expected_response, response)
            for res in response:
                if stringdist.levenshtein(
                        expected_response,
                        res) / 33 < acceptable_levenshtein_threshold:
                    return True
        else:
            if stringdist.levenshtein(
                    response,
                    expected_response) / 33 < acceptable_levenshtein_threshold:
                return True
        return False
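
The hard-coded divisor of 33 above presumably stands in for a maximum expected response length. A minimal sketch of the same check using the package's built-in levenshtein_norm(), which normalizes by the length of the longer string instead:

import stringdist

def is_close_enough(response, expected_response, threshold=0.5):
    # levenshtein_norm divides the raw edit distance by the length of
    # the longer string, so the result is always in [0.0, 1.0].
    return stringdist.levenshtein_norm(response, expected_response) < threshold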
Code Example #2
def get_city(cities, city_field):

    if '/' in city_field:
        city_field = city_field.split('/')[1]

    city_name = util.remove_accents_in_string(city_field.lower().strip())

    closest_match = None
    closest_distance = 100
    for city in cities:
        db_city_name = util.remove_accents_in_string(
            city['fields']['name'].lower().strip())
        if city['fields']['alias']:
            db_city_alias = util.remove_accents_in_string(
                city['fields']['alias'].lower().strip())
        else:
            db_city_alias = 'this is never ever a city...'

        distance = stringdist.levenshtein(city_name, db_city_name)
        distance_alias = stringdist.levenshtein(city_name, db_city_alias)

        if (distance < closest_distance and distance < 6) or \
                (distance_alias < closest_distance and distance_alias < 6):  # a Levenshtein distance this close is good enough
            closest_match = city
            closest_distance = min(distance, distance_alias)
            print('city: ' + db_city_name + ' distance: ' +
                  str(closest_distance))

    return closest_match
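
A hypothetical call, with records shaped like the city['fields'] dicts the function reads (util.remove_accents_in_string is assumed importable from the surrounding project):

cities = [
    {'fields': {'name': 'Sao Paulo', 'alias': 'Sampa'}},
    {'fields': {'name': 'Rio de Janeiro', 'alias': None}},
]
match = get_city(cities, 'Brazil/Sao Paolo')  # the typo is within distance 6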
Code Example #3
def ocr_metrics(pred_texts, gt_texts, lower=True):
    '''
    Takes predicted texts and ground-truth texts and returns the
    Character Error Rate (CER), the Word Error Rate (WER) and the
    Sequence Error Rate (SER).

    lower: if True, both texts are lowercased before comparison.
    '''
    cer, wer, ser = [], [], []
    for pred, gt in zip(pred_texts, gt_texts):
        if lower:
            pred, gt = pred.lower(), gt.lower()

        # CER
        pred_cer, gt_cer = list(pred), list(gt)
        dist = stringdist.levenshtein(pred_cer, gt_cer)
        cer.append(dist / max(len(pred_cer), len(gt_cer)))

        # WER
        pred_wer, gt_wer = pred.split(), gt.split()
        dist = stringdist.levenshtein(pred_wer, gt_wer)
        wer.append(dist / max(len(pred_wer), len(gt_wer)))

        # SER
        pred_ser, gt_ser = [pred], [gt]
        dist = stringdist.levenshtein(pred_ser, gt_ser)
        ser.append(dist / max(len(pred_ser), len(gt_ser)))

    return np.mean([cer, wer, ser], axis=1)
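
A minimal usage sketch; note that the WER and SER branches pass token lists to stringdist.levenshtein, which assumes the installed implementation accepts arbitrary sequences rather than only strings:

import numpy as np
import stringdist

cer, wer, ser = ocr_metrics(pred_texts=['hella world'], gt_texts=['hello world'])
# cer ~ 0.09 (1 edit / 11 chars), wer = 0.5, ser = 1.0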
Code Example #4
File: metaextract.py Project: nuest/o2r-meta
def sort_displayfile(filename):
    dist_name = stringdist.levenshtein(os.path.splitext(filename)[0], DISPLAYFILE_PROTOTYPE_NAME)
    dist_ext = stringdist.levenshtein(os.path.splitext(filename)[1][1:], DISPLAYFILE_PROTOTYPE_EXT)
    status_note(['[displayfile] Distance between names ', DISPLAYFILE_PROTOTYPE_NAME, ' and ', os.path.splitext(filename)[0], ' is ', dist_name], d=is_debug)
    status_note(['[displayfile] Distance between extensions ', DISPLAYFILE_PROTOTYPE_EXT, ' and ', os.path.splitext(filename)[1][1:], ' is ', dist_ext], d=is_debug)
    status_note(['[displayfile] Combined distance: ', dist_name + dist_ext])
    
    return dist_name + dist_ext
Code Example #5
def bbc(headline):
    headline_list = []
    print("headline: " + headline)

    # Collect and parse first page
    url = ('https://www.bbc.co.uk/search?q=' + headline)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Pull the ordered list of search results
    news_list = soup.find("ol", class_="search-results")
    
    #check if there is an Error
    if news_list is None:
        print("Error: No article with that date and headline")
        return None
    
    news_list_items = news_list.find_all('a')
    # Collect the text and link of every search result
    for news in news_list_items:
        names = news.text
        link = news.attrs['href']
        if len(names) > 0 and names[0] != '\n':
            headline_list.append((names, link))

    # calculating Levenshtein distance between entered headline and bbc headlines
    if not headline_list:
        print("Error: No headlines found")
        return None
    l_min = stringdist.levenshtein(headline_list[0][0], headline)
    closest_headline = headline_list[0]
    for i in range(1,len(headline_list)):
        l_value = stringdist.levenshtein(headline_list[i][0], headline)
        if l_value < l_min:
            l_min = l_value
            closest_headline = headline_list[i]

    # setting up article's content
    article_text = ''
    
    # Collect and parse the headline with the smallest Levenshtein distance
    url = (closest_headline[1])
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Pull the date and the story body
    date = soup.find("div", {"class": "date"}).text
    print("Date: ", date)
    # find() returns None when the div is missing, so check it before findAll()
    body = soup.find("div", {"class": "story-body__inner"})
    if body is None:
        print("Error: Content of headline ", closest_headline[0],
              "couldn't be found at", closest_headline[1])
        return None
    for element in body.findAll('p'):
        article_text += '\n' + ''.join(element.findAll(text=True))
    
    result = {'title': closest_headline[0],
            'date': date,
            'content': article_text}
    return result
Code Example #6
File: metaextract.py Project: nuest/o2r-meta
def sort_mainfile(filename):
    dist_name = stringdist.levenshtein(os.path.splitext(filename)[0], MAINFILE_PROTOTYPE_NAME)
    padded_ext = os.path.splitext(filename)[1][1:].zfill(len(MAINFILE_PROTOTYPE_EXT))
    dist_ext = stringdist.levenshtein(padded_ext, MAINFILE_PROTOTYPE_EXT)
    status_note(['[mainfile] Distance between names ', MAINFILE_PROTOTYPE_NAME, ' and ', os.path.splitext(filename)[0], ' is ', dist_name], d=is_debug)
    status_note(['[mainfile] Distance between extensions ', MAINFILE_PROTOTYPE_EXT, ' and ', padded_ext, ' is ', dist_ext], d=is_debug)
    status_note(['[mainfile] Combined distance: ', dist_name + dist_ext])

    return dist_name + dist_ext
Code Example #7
    def MaliciusnessAnalysis(self, word_list, data, brand_list, keyword_list):
        check_similary = []
        data['ConsecutiveCharacterRepeat'] = 0
        for word in word_list:
            if word.lower() in dic_BrandNames.keys():
                brand_list.append(word.lower())

            if word.lower() in sensitive_list:
                keyword_list.append(word.lower())
        data['KeywordCount'] = len(keyword_list)
        data['BrandNameCount'] = len(brand_list)
        # get targets data
        list_key = []
        count_key = 0
        for word in keyword_list:
            if word not in list_key:
                count_key += 1
                list_key.append(word)
        list_brand = []
        count_brand = 0
        for word in brand_list:
            if word not in list_brand:
                count_brand += 1
                list_brand.append(word)
        data['TargetBrandNameCount'] = count_brand
        data['TargetKeywordCount'] = count_key

        for word in word_list:
            if (word.lower() not in brand_list
                    and word.lower() not in keyword_list):
                check_similary.append(word.lower())
        # for brandnames
        similar_word_list = []
        for word in check_similary:
            for brand in dic_BrandNames.keys():
                if ((stringdist.levenshtein(word, brand) < 2)
                        and word not in similar_word_list):
                    similar_word_list.append(word)
                    data['ConsecutiveCharacterRepeat'] = 1

        data['SimilarBrandNameCount'] = len(similar_word_list)

        for word in check_similary:
            for sens in sensitive_list:
                if (stringdist.levenshtein(word, sens) < 2
                        and word not in similar_word_list):
                    #print(word, sens)
                    similar_word_list.append(word)
                    data['ConsecutiveCharacterRepeat'] = 1
        data['SimilarKeywordCount'] = len(
            similar_word_list) - data['SimilarBrandNameCount']

        return similar_word_list
Code Example #8
File: astar.py Project: abhinav1112/OptimalPath
def getLocations(src, dest):
    '''matches the user input string with the best possible locations
    in the database using edit distance'''
    mindist = 1000
    mindist1 = 1000
    srcname = ""
    destname = ""
    for node in nodes:
        dist_src = stringdist.levenshtein(src, node.name)
        if dist_src < mindist:
            mindist = dist_src
            srcname = node.name
        dist_dest = stringdist.levenshtein(dest, node.name)
        if dist_dest < mindist1:
            mindist1 = dist_dest
            destname = node.name
    return srcname, destname
Code Example #9
def getKeyWord(rankedPhrases):
    # Start from the worst possible distance ratio (1 = completely dissimilar)
    min_dist_ratio = 1
    driv = ""
    for driver in drivers:
        indic = drivers.get(driver)
        div = 0
        total_ratio = 0
        for key_val in indic:
            for key_words in rankedPhrases:
                #This gets the levenshtein distance between each word if the row data is not 'None'
                if key_words is not None:
                    dist = stringdist.levenshtein(key_val.lower(),
                                                  key_words.lower())
                    curr_dist_ratio = (dist /
                                       longeststring([key_val, key_words]))
                    total_ratio += curr_dist_ratio
                    div = div + 1

        total_ratio = total_ratio / div
        if total_ratio < min_dist_ratio:
            min_dist_ratio = total_ratio
            driv = driver
    # The category whose indicators have the lowest average normalized
    # Levenshtein distance to the ranked phrases is used for this row.
    if min_dist_ratio < 0.87:
        driv = "Other"
    return driv
Code Example #10
def cluster_words(words, thresh=8):
    """Return clusters of words, where word are added to clusters where
    the word has an average levenshtein of less than a threshold

    Each word is actally a tuple, with the word being the first item, and any other
    data in subsequent items

    """

    import stringdist

    clusters = []

    for w1 in words:

        placed = False
        for cluster in clusters:
            # Average dist to all words in the cluster
            ad = sum(
                stringdist.levenshtein(slugify(w1[0]), slugify(w2[0]))
                for w2 in cluster) / float(len(cluster))
            if ad < thresh:
                cluster.add(w1)
                placed = True
                break

            elif any(dmeta_sub(w1[0], w2[0]) < thresh for w2 in cluster):
                cluster.add(w1)
                placed = True
                break

        if not placed:
            clusters.append(set([w1]))

    return clusters
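
A hypothetical call, assuming the slugify and dmeta_sub helpers used above are importable; each word is a (text, payload) tuple per the docstring:

words = [('Acme Corp', 1), ('ACME Corporation', 2), ('Widget Ltd', 3)]
clusters = cluster_words(words, thresh=8)
# The two Acme variants should land in one cluster, 'Widget Ltd' in another.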
Code Example #11
    def is_response_close_enough_using_leveinstein(self, response,
                                                   expected_response):
        acceptable_levenshtein_threshold = 0.5

        return stringdist.levenshtein(
            response,
            expected_response) / 33 < acceptable_levenshtein_threshold
Code Example #12
    def target_detected_call(self, data):
        global on_box
        if data.data != "invalid":
            dist_low = 1000
            corrected_target = "should not appear"
            # Snap the detected text to the closest known subject name
            for subject in subjects:
                dist = stringdist.levenshtein(subject, data.data)
                if dist < dist_low:
                    dist_low = dist
                    corrected_target = subject
            print "target: ", corrected_target
            self.pub_target.publish(corrected_target)
            self.state_after_speak = "speaking_again"
            self.state = "speaking"
            print "state: ", self.state
            self.pub_speak.publish("We will deliver to " + corrected_target)
            while self.state != "speaking_again":
                pass  # busy-wait until the speech callback advances the state
            self.last_target_read = corrected_target
        else:
            print "target: ", data.data
            self.pub_target.publish(data.data)
            self.state_after_speak = "roaming"
            self.state = "speaking"
            print "state: ", self.state
            self.pub_speak.publish("Could not read message")
            while self.state != "roaming":
                pass  # busy-wait until the speech callback advances the state
            rospy.sleep(3)
            on_box = "no"
            print "on_box :", on_box
Code Example #13
def reply(prompt):

    # cleaning
    prompt = prompt.lower()
    for char in prompt:
        if not char.isalnum() and char != ' ':
            prompt = prompt.replace(char, "")

    # check for exact query matches
    for keys, value in gettuples():
        if (prompt in keys):
            return value

    # return with max similarity if max sim > 0.3
    max_value = ["Sorry! I didn't understand that."]
    max_simil = 0.3
    for keys, value in gettuples():
        for key in keys:
            simil = (get_cosine(key, prompt) +
                     SequenceMatcher(None, key, prompt).ratio() +
                     (1 - stringdist.levenshtein(key, prompt) / 15)) / 3
            if (simil > max_simil):
                max_value = value
                max_simil = simil
    return max_value
Code Example #14
def UMI_correct(UMI):
    """Corrects UMI by up to one letter"""
    for item in UMI_dict:
        if stringdist.levenshtein(item, UMI) <= 1:
            return item
    # Only give up after every known UMI has been checked
    return "BAD"
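
For example, with a hypothetical set of known barcodes:

UMI_dict = {'AACGTG': 0, 'TTGCAA': 0}
UMI_correct('AACGTT')  # -> 'AACGTG' (one substitution away)
UMI_correct('GGGGGG')  # -> 'BAD' (no barcode within one edit)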
Code Example #15
    def rectify(self, word):
        """
            Speller predictions
        """

        # the query that is mapped to ngrams
        char_ngrams_list = self.vectorizer.transform([word]).tocoo().col
        #  print('--------')
        #  print(word)
        # calculate the number of matches for each term
        counter = Counter()

        for token_id in char_ngrams_list:
            for word_id in self.index[token_id]:
                counter[word_id] += 1

        # search for the nearest term from the selected terms
        closest_word = word
        minimal_distance = 1000
        # search for "good" fix from the top of matches by n-gramms
        for suggest in counter.most_common(n=self.n_candidates):

            suggest_word = self.words_list[suggest[0]]
            # TODO: your code here
            # you can use any libraries and sources except the original texts

            distance = Distancer.levenshtein(word, suggest_word)

            if distance < minimal_distance:
                minimal_distance = distance
                closest_word = suggest_word
            #   print(closest_word)
        return closest_word
Code Example #16
    def forward(self, prediction, target):
        logits = prediction[0]  # (logits, len)
        feature_lengths = prediction[1].int()
        labels = target
        logits = torch.transpose(logits, 0, 1)
        logits = logits.cpu()
        # beam decoder
        output, scores, timesteps, out_seq_len = self.decoder.decode(
            probs=logits, seq_lens=feature_lengths)

        ############# GREEDY DECODE ##########################
        _, max_probs = torch.max(logits, 2)
        strings, offsets = self.greedy_decoder.decode(probs=logits)
        predictions = []
        time_stamps = []
        ls = 0
        for i in range(len(strings)):
            pred = strings[i][0]
            phone_pred = []
            for j in pred:
                phone_pred.append(self.phoneme_list[self.label_map.index(j)])
            predictions.append(phone_pred)
            time_stamps.append(offsets[i][0].float() / 100)
            if target is not None:
                true = "".join(self.label_map[l] for l in labels[i])
                ls += stringdist.levenshtein(strings[i][0], true)
        return predictions, time_stamps, ls / len(strings)
Code Example #17
def get_ranked_ontology_matches(cleaned_term):
    '''
    Get ranked matches from ontology
    '''
    ontology_matches = simstring_searcher.ranked_search(
        cleaned_term, SIMILARITY_THRESHOLD)

    # Weight relevant UMLS matches based on word ordering
    weighted_matches = {}
    for ontology_match in ontology_matches:
        # Get term and cui from ontology
        ontology_term = ontology_match[1]
        ontology_cui = term_to_cui[ontology_term]

        # Calculate Levenshtein distance for ranking
        levenshtein_distance = stringdist.levenshtein(ontology_term,
                                                      cleaned_term)

        # Construct the match key with a ' :: ' separator
        key = ontology_term + ' :: UMLS ' + ontology_cui
        weighted_matches[key] = levenshtein_distance

    # Construct list of ranked terms based on Levenshtein distance value
    ranked_matches = [
        ranked_pair[0] for ranked_pair in sorted(weighted_matches.items(),
                                                 key=lambda kv: kv[1])
    ]

    return ranked_matches
Code Example #18
def get_doctor_job(input_text, LIST_SPECIALITY_NAME):
    input_text = input_text.strip().lower()
    list_values_lev = [stringdist.levenshtein(input_text, spec) for spec in LIST_SPECIALITY_NAME]
    min_dist = min(list_values_lev)
    if min_dist < 5:
        return True, LIST_SPECIALITY_NAME[list_values_lev.index(min_dist)]
    else:
        return False, LIST_SPECIALITY_NAME[list_values_lev.index(min_dist)]
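
For example, with a hypothetical specialty list:

LIST_SPECIALITY_NAME = ['cardiology', 'neurology', 'dermatology']
get_doctor_job('  Cardiolgy ', LIST_SPECIALITY_NAME)
# -> (True, 'cardiology'): distance 1 after strip().lower(), well under 5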
Code Example #19
File: evaluate.py Project: yyht/histnorm
def cer(words):
    """Character error rate (CER), defined as Levenshtein distance normalized by
       reference word length."""
    val = [
        (0 if gold == norm else stringdist.levenshtein(gold, norm) / len(gold))
        for (gold, norm) in words
    ]
    return val
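
A minimal sketch of a call; words is an iterable of (gold, norm) pairs:

cer([('the', 'the'), ('quick', 'quikc')])
# -> [0, 0.4]: swapping the last two letters costs 2 plain Levenshtein
# edits over a 5-character reference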
Code Example #20
File: server.py Project: samirillion/pun_generator
def closest_string(base_word, word_list):
    lowest_distance = 100
    closest_word = ''
    for word in word_list:
        distance = stringdist.levenshtein(base_word, word)
        if distance <= lowest_distance:
            closest_word = word
            lowest_distance = distance
    return closest_word
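
For example:

closest_string('grape', ['graph', 'grope', 'apple'])
# -> 'grope': 'graph' and 'grope' are both distance 1, and the <=
# comparison lets the later tie win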
Code Example #21
def compareSimilarity(name, list_name, dicordRate):
    print("my data ", name)
    print("DB data ", list_name)
    string_length = len(name)
    value = stringdist.levenshtein(name, list_name)
    print('error rate : ', value)
    if dicordRate >= value / string_length:
        return 1
    else:
        return 2
Code Example #22
File: metaextract.py Project: o2r-project/o2r-meta
def sort_mainfile(filename):
    dist_name = stringdist.levenshtein(
        os.path.splitext(filename)[0], MAINFILE_PROTOTYPE_NAME)
    padded_ext = os.path.splitext(filename)[1][1:].zfill(
        len(MAINFILE_PROTOTYPE_EXT))
    dist_ext = stringdist.levenshtein(padded_ext, MAINFILE_PROTOTYPE_EXT)
    help.status_note(
        ['[mainfile] Distance between names ', MAINFILE_PROTOTYPE_NAME,
         ' and ', os.path.splitext(filename)[0], ' is ', dist_name],
        d=is_debug)
    help.status_note(
        ['[mainfile] Distance between extensions ', MAINFILE_PROTOTYPE_EXT,
         ' and ', padded_ext, ' is ', dist_ext],
        d=is_debug)
    help.status_note(['[mainfile] Combined distance: ', dist_name + dist_ext])

    return dist_name + dist_ext
Code Example #23
File: sort.py Project: TimeUs-ANR/LSE-OD2M
def correct_headers(s):
    distance = 100
    mark = ""
    for k in groundtruth.headers:
        this_distance = stringdist.levenshtein(
            s.replace(" ", "").lower(),
            groundtruth.headers[k].replace(" ", "").lower())
        if this_distance < distance:
            distance = this_distance
            mark = k
    return distance, groundtruth.headers[mark]
Code Example #24
File: iv_check.py Project: Dexter192/PoGoPvPBot
def closest_name_match(local_name, group_language, translations):
    closest = float('inf')
    closest_index = -1
    for index, name in translations[group_language].iteritems():
        if "+" not in local_name and "+" in name:
            # Quick fix for people searching for forms w/o giving the form itself.
            name = name.split("+")[0]
        dst = stringdist.levenshtein(local_name, name)
        if dst < closest:
            closest = dst
            closest_index = index
    return translations[group_language][closest_index]
Code Example #25
def is_typo(name0, name1):
    """
    Get:
    name0 - string of name,
    name1 - string of name.

    Returns:
    True if the Levenshtein-distance between the names is <= 1,
    otherwise the function will return False.
    """

    return levenshtein(name0, name1) <= 1
Code Example #26
File: metaextract.py Project: o2r-project/o2r-meta
def sort_displayfile(filename):
    dist_name = stringdist.levenshtein(
        os.path.splitext(filename)[0], DISPLAYFILE_PROTOTYPE_NAME)
    dist_ext = stringdist.levenshtein(
        os.path.splitext(filename)[1][1:], DISPLAYFILE_PROTOTYPE_EXT)
    help.status_note(
        ['[displayfile] Distance between names ', DISPLAYFILE_PROTOTYPE_NAME,
         ' and ', os.path.splitext(filename)[0], ' is ', dist_name],
        d=is_debug)
    help.status_note(
        ['[displayfile] Distance between extensions ',
         DISPLAYFILE_PROTOTYPE_EXT, ' and ',
         os.path.splitext(filename)[1][1:], ' is ', dist_ext],
        d=is_debug)
    help.status_note(
        ['[displayfile] Combined distance: ', dist_name + dist_ext])

    return dist_name + dist_ext
Code Example #27
def line_data(gt_line, htr_line, htr_index):
    # Remove leading spaces and compress runs of spaces in the line.
    expected = ' '.join(gt_line.split())
    obtained = ' '.join(htr_line.split())
    # The stringdist package definition of levenshtein_norm() divides
    # by the longest of the two strings, but it is more conventional in
    # OCR papers and software to divide by the length of the reference.
    distance = levenshtein(expected, obtained)
    if len(expected) > 0:
        cer = '{:.2f}'.format(100 * float(distance) / len(expected))
    else:
        cer = '100.00'
    return Line(htr_index, distance, cer, expected, obtained)
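
A sketch of a call, assuming Line is something like the namedtuple below and levenshtein comes from stringdist:

from collections import namedtuple
from stringdist import levenshtein

# Hypothetical container matching the fields used above
Line = namedtuple('Line', 'htr_index distance cer expected obtained')

line_data('the  quick fox', 'the quikc fox', 0)
# -> Line(0, 2, '15.38', 'the quick fox', 'the quikc fox')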
Code Example #28
def handle_cocktail_recipe(req, res, tokens):
    # Trigger words: "recipe", "cocktail", "prepare", "make", "produce", "create"
    make_words = {
        "рецепт", "коктейль", "приготовить", "сделать", "изготовить", "создать"
    }
    if len(tokens.intersection(make_words)):
        for word in make_words:
            if word in tokens:
                tokens.remove(word)
    if "как" in tokens:
        tokens.remove("как")

    if "с" in tokens or "из" in tokens:
        if "сок" in tokens:
            tokens.remove("сок")
        answer = []
        wth = 'с'
        if wth in tokens or "из" in tokens:
            if wth in tokens:
                tokens.remove(wth)
            if "из" in tokens:
                tokens.remove("из")
            for token in tokens:
                if token in GLOBAL_DATA['INGREDIENTS']:
                    answer += GLOBAL_DATA['INGREDIENTS'][token]
        if "без" in tokens:
            pass

        if len(answer) > 0:
            res['response']['text'] = gen_text_cocktail(answer[0])
            res['response']['buttons'] = get_suggests_cocktails(answer[1:5])
            return True
    else:
        result_list = defaultdict(float)
        for word in GLOBAL_DATA['COCKTAILS_WORDS']:
            for unit in GLOBAL_DATA['COCKTAILS_WORDS'][word]:
                for token in tokens:
                    score = stringdist.levenshtein(word, token)
                    if score < 1:
                        score = 1
                    if score <= 8:
                        result_list[unit] += (1 / score)**2
        sorted_list = sorted(result_list.items(),
                             key=operator.itemgetter(1),
                             reverse=True)

        if sorted_list and sorted_list[0][1] > 0.25:
            res['response']['text'] = gen_text_cocktail(sorted_list[0][0])
            res['response']['buttons'] = get_suggests_cocktails(
                x[0] for x in sorted_list[1:5])
            return True
    return False
Code Example #29
def getContactDetails(name):
    random.seed(datetime.now())
    with open('./firstnames.txt') as File:
        firstnames = File.read()
    namesDistance = []
    for firstname in firstnames.splitlines():
        namesDistance.append((stringdist.levenshtein(firstname,
                                                     name), firstname))

    contact = contacts[randint(0, 42000) % len(contacts)]

    contact['firstname'] = sorted(namesDistance)[0][1]

    return contact
Code Example #30
File: main.py Project: Sargarin/Praca-in-ynierska
def parse_phrase(input_text, list_of_words, desirable_dist):
    input_text = input_text.strip().lower()
    print(input_text)
    value_of_dist = [
        stringdist.levenshtein(input_text, word) for word in list_of_words
    ]
    print(value_of_dist)
    min_dist = min(value_of_dist)
    print(min_dist)
    print(list_of_words[value_of_dist.index(min_dist)])
    if min_dist <= desirable_dist:
        return True, list_of_words[value_of_dist.index(min_dist)]
    else:
        return False, list_of_words[value_of_dist.index(min_dist)]
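
For example:

parse_phrase(' Strat ', ['start', 'stop', 'pause'], desirable_dist=2)
# -> (True, 'start'): the transposition costs 2 plain Levenshtein edits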
Code Example #31
def clean(output, map_string):
    map = get_map(map_string)
    result = "".join(
        [symbol for symbol in output if symbol in accepted_symbols])
    #Define an algorithm to check likely matching string for result
    strings = get_map_location_strings(map)
    #Error check
    matching_list = [(string, stringdist.levenshtein(result, string))
                     for string in strings]
    if matching_list:
        min_match = min(matching_list, key=lambda pairs: pairs[1])
        best_match = (min_match[0]
                      if min_match[1] <= 2 and min_match[0] != '' else None)
        return best_match
    return None
Code Example #32
    def parse_name_body(self, address_parts):
        # stick together the remaining parts
        # TODO return best matches under a value
        name = ' '.join(address_parts)
        match = [None, 10]
        if name in names:
            return name

        for n in names:
            score = levenshtein(name, n)
            if score < match[1]:
                match = [n, score]

        return match[0]
Code Example #33
File: check_matches.py Project: IISH/links
def check_matches( db_links, id_mp, lvs_max ):
	print( "check_matches() id_mp: %s, lvs_max: %s" % ( id_mp, lvs_max ) )
	
	query  = "SELECT M.id_matches, M.id_linksbase_1, M.id_linksbase_2, "
	query += "X.id_base, Y.id_base, X.ego_familyname_str, Y.ego_familyname_str " 
	query += "FROM links_match.matches as M, "
	query += "links_prematch.links_base as X, "
	query += "links_prematch.links_base as Y "
	query += "WHERE M.id_match_process = %s " % id_mp
	query += "AND X.id_base = id_linksbase_1 "
	query += "AND Y.id_base = id_linksbase_2 "
	query += "ORDER BY id_matches LIMIT 5;"

	if debug: print( query )
	resp = db_links.query( query )
	
	if len( resp ) == 0:
		print( "No corresponding links_base records found for id_match_process %d" % id_mp )
	
	for rec in resp:
		#print( str( rec ) )
		id_matches           = rec[ "id_matches" ]
		id_linksbase_1       = rec[ "id_linksbase_1" ]
		id_linksbase_2       = rec[ "id_linksbase_2" ]
		X_id_base            = rec[ "id_base" ]
		Y_id_base            = rec[ "Y.id_base" ]
		X_ego_familyname_str = rec[ "ego_familyname_str" ]
		Y_ego_familyname_str = rec[ "Y.ego_familyname_str" ]

		lvs = stringdist.levenshtein( X_ego_familyname_str, Y_ego_familyname_str )
		msg = "OK "
		if lvs > lvs_max:
			msg = "ERR"
		
		print( "id_matches: %s, id_linksbase_1&2: %s, %s, lvsd: %2d: %s ego_familyname_str 1&2: %s, %s" % 
		( id_matches, id_linksbase_1, id_linksbase_2, lvs, msg, X_ego_familyname_str, Y_ego_familyname_str ) )

	print( "" )