Code example #1
import numpy as np
from Levenshtein import StringMatcher  # assumed import (python-Levenshtein); provides module-level distance()


def computeDifferentionFromSolutionsMatrix(submits, use_visited: bool):
    """
    for each submit, compute its difference from all solutions,
    so the output matrix is N_SUBMITS x N_SOLUTIONS

    :param submits
    :param use_visited: see global variable USE_VISITED
    :return: difference matrix
    """

    solutions = [submit for submit in submits if submit.flowers_left == 0]
    print("{} solutions".format(len(solutions)))

    matrix = np.zeros((len(submits), len(solutions)))

    for i in range(len(submits)):
        print(i)

        for s in range(len(solutions)):
            dist = 0
            if use_visited:
                str1 = submits[i].visited_to_unicode(100)
                str2 = solutions[s].visited_to_unicode(100)

                dist = StringMatcher.distance(str1, str2)
            else:
                str1 = submits[i].functions_to_unicode(use_canonized=True)
                str2 = solutions[s].functions_to_unicode(use_canonized=True)

                dist = StringMatcher.distance(str1, str2)

            matrix[i, s] = dist

    return matrix
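
A minimal, self-contained sketch of the same pairwise distance-matrix idea, using Levenshtein.distance from python-Levenshtein on plain strings (the Submit objects and their *_to_unicode encodings above are project-specific, so plain strings stand in for them here):

# Sketch: N_ROWS x N_COLS matrix of pairwise edit distances.
import numpy as np
import Levenshtein

def distance_matrix(rows, cols):
    matrix = np.zeros((len(rows), len(cols)))
    for i, a in enumerate(rows):
        for j, b in enumerate(cols):
            matrix[i, j] = Levenshtein.distance(a, b)
    return matrix

submits = ["abcd", "abce", "xyz"]
solutions = ["abcd", "xy"]
print(distance_matrix(submits, solutions))
# [[0. 4.]
#  [1. 4.]
#  [4. 1.]]
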
Code example #2
    def statistic_similarity(self, paper, min_similarity):
        """Function that splits the paper text in n-grams (unigrams,bigrams,trigrams)
        and with a Levenshtein it check the similarity for each of them with the topics in the ontology.

        Args:
            paper (string): The paper to analyse. At this stage it is a string.
            cso (dictionary): the ontology previously loaded from the file.
            min_similarity (integer): minimum Levenshtein similarity between the n-gram and the topics within the CSO.

        Returns:
            found_topics (dictionary): containing the found topics with their similarity and the n-gram analysed.
        """

        # analysing grams
        found_topics = {}

        unigrams = ngrams(word_tokenize(paper, preserve_line=True), 1)
        for grams in unigrams:
            gram = " ".join(grams)
            topics = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
            for topic in topics:
                m = ls.StringMatcher(None, topic, gram).ratio()
                if m >= min_similarity:
                    if topic in found_topics:
                        found_topics[topic].append({'matched':gram, 'similarity':m})
                    else:
                        found_topics[topic] = [{'matched':gram, 'similarity':m}]

        bigrams = ngrams(word_tokenize(paper, preserve_line=True), 2)
        for grams in bigrams:
            gram = " ".join(grams)
            topics = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
            for topic in topics:
                m = ls.StringMatcher(None, topic, gram).ratio()
                if m >= min_similarity:
                    if topic in found_topics:
                        found_topics[topic].append({'matched':gram, 'similarity':m})
                    else:
                        found_topics[topic] = [{'matched':gram, 'similarity':m}]

        trigrams = ngrams(word_tokenize(paper, preserve_line=True), 3)
        for grams in trigrams:
            gram = " ".join(grams)
            topics = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
            for topic in topics:
                m = ls.StringMatcher(None, topic, gram).ratio()
                if m >= min_similarity:
                    if topic in found_topics:
                        found_topics[topic].append({'matched':gram, 'similarity':m})
                    else:
                        found_topics[topic] = [{'matched':gram, 'similarity':m}]

        return found_topics
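
The core step above is a cheap prefix filter (topics sharing the n-gram's first four characters) followed by a Levenshtein ratio test against min_similarity. A minimal sketch of that step, using Levenshtein.ratio directly instead of the ls.StringMatcher wrapper, with a small literal dict standing in for the CSO topics:

import Levenshtein

topics = {"machine learning": {}, "machine translation": {}, "databases": {}}

def match_gram(gram, topics, min_similarity=0.85):
    # pre-filter: only compare topics that share the gram's first four characters
    candidates = [t for t in topics if t.startswith(gram[:4])]
    return [(t, Levenshtein.ratio(t, gram))
            for t in candidates
            if Levenshtein.ratio(t, gram) >= min_similarity]

print(match_gram("machine learnin", topics))   # close match to "machine learning"
print(match_gram("deep learning", topics))     # no topic shares the prefix -> []
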
Code example #3
import Levenshtein as levenshtein  # assumed import; any module exposing a compatible distance() works


def validate_word(ideal_word: str, recognized_word: str,
                  levenshtein_percent: int) -> bool:
    # normalized similarity in [0, 1]: 1 - distance / longer-word length,
    # so the threshold is compared on that 0-1 scale despite the parameter name
    normalized_levenshtein = 1 - (
        levenshtein.distance(ideal_word, recognized_word) /
        max(len(ideal_word), len(recognized_word)))
    return normalized_levenshtein >= levenshtein_percent
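
A worked example of the normalization used above: similarity = 1 - distance / max(len(a), len(b)), which lies in [0, 1], so the threshold is compared on that 0-1 scale. A minimal sketch with python-Levenshtein:

import Levenshtein

a, b = "kitten", "sitting"
dist = Levenshtein.distance(a, b)            # 3
similarity = 1 - dist / max(len(a), len(b))  # 1 - 3/7 ~= 0.571
print(dist, round(similarity, 3))            # 3 0.571
print(similarity >= 0.8)                     # False for this pair
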
Code example #4
from Levenshtein import StringMatcher  # assumed import (python-Levenshtein)


def check(word, dictionary):
    if word is None or word == '' or str(word)[0].isnumeric():
        return []
    # if str(word)[0].isupper():
    #     return [' ']
    fl = str(word).lower()[0]
    sub_dict = list(filter(lambda x: x.startswith(fl), dictionary))
    similar = list(
        filter(lambda x: StringMatcher.distance(str(word).lower(), x) < 2,
               sub_dict))
    # similar = list(filter(lambda x: word.lower() == x, sub_dict))
    return similar
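
A minimal sketch of the same spell-check idea: bucket the dictionary by first letter, then keep entries within edit distance 1 of the lowercased word (Levenshtein.distance stands in for the StringMatcher.distance call above):

import Levenshtein

dictionary = ["apple", "apply", "ample", "banana"]

def suggest(word, dictionary):
    if not word or word[0].isnumeric():
        return []
    first = word.lower()[0]
    bucket = [w for w in dictionary if w.startswith(first)]
    return [w for w in bucket if Levenshtein.distance(word.lower(), w) < 2]

print(suggest("Aplle", dictionary))   # ['apple']
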
Code example #5
    def __refine_found_words(self, similar_words):
        """
        Args:
            similar_words (list): list of (word, similarity) pairs found in the word-embedding model

        Returns:
            identified_topics (list): list of all identified topics with their similarities
        """
        identified_topics = list()
        for word, sim in similar_words:
            topics = self.cso.find_closest_matches(word)
            for topic in topics:
                str_sim = ls.StringMatcher(None, topic, word).ratio()  # topic comes from the CSO; "wet" holds the word-embedding term
                if str_sim >= self.min_similarity:
                    identified_topics.append({"topic":topic,"sim_t":str_sim,"wet":word,"sim_w":sim})
        return identified_topics
Code example #6
File: unitcalc.py Project: jakeogh/unitcalc
def find_unit(
    *,
    ulist,
    in_unit,
    verbose: bool,
    debug: bool,
):

    distance = -1
    for unit in ulist:
        dist = StringMatcher.distance(in_unit, unit)
        if distance < 0 or dist < distance:
            distance = dist
            winning_unit = unit
    eprint("Warning: converting {0} to {1}".format(in_unit, winning_unit))
    return winning_unit
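
The loop above keeps the unit with the smallest edit distance seen so far. An equivalent, more compact form (assuming a non-empty ulist and python-Levenshtein):

import Levenshtein

ulist = ["meter", "metre", "mile", "millimeter"]
in_unit = "meters"
winning_unit = min(ulist, key=lambda unit: Levenshtein.distance(in_unit, unit))
print(winning_unit)   # 'meter' (distance 1)
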
Code example #7
File: segment.py Project: wilsonify/music21
def getDifflibOrPyLev(
    seq2=None,
    junk=None,
    forceDifflib=False,
):
    '''
    Returns either a difflib.SequenceMatcher or pyLevenshtein
    StringMatcher.StringMatcher object depending on what is installed.

    If forceDifflib is True, then use difflib even if pyLevenshtein is installed.
    '''
    if forceDifflib is True:
        smObject = difflib.SequenceMatcher(junk, '', seq2)
    else:
        try:
            from Levenshtein import StringMatcher as pyLevenshtein
            smObject = pyLevenshtein.StringMatcher(junk, '', seq2)
        except ImportError:
            smObject = difflib.SequenceMatcher(junk, '', seq2)
    return smObject
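
Whichever object comes back, it follows the difflib.SequenceMatcher interface, so callers can use it uniformly. A short usage sketch (forceDifflib=True here just to make the example deterministic; it assumes difflib is imported at module level, as the function above already requires):

sm = getDifflibOrPyLev(seq2='ACGT', forceDifflib=True)
sm.set_seq1('AGT')
print(round(sm.ratio(), 3))   # 0.857 -- similarity of 'AGT' vs 'ACGT'
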
Code example #8
def find_closest_string_distance(
    *,
    string_dict,
    in_string,
    verbose: bool,
    debug: bool,
):

    distances_to_paths = defaultdict(list)
    distance = -1
    if verbose:
        ic(len(string_dict))
    for path_key, string in string_dict.items():
        dist = StringMatcher.distance(in_string, string)
        if verbose:
            ic(dist, path_key)
        distances_to_paths[dist].append(path_key)
        if distance < 0 or dist < distance:
            distance = dist
            winning_key = path_key

    if verbose:
        for path_distance in distances_to_paths.keys():
            ic(path_distance)
            for path in distances_to_paths[path_distance]:
                ic(path)

        print("\n", file=sys.stderr)
        eprint('\n', in_string)
        ic(winning_key)
        eprint('\n', string_dict[winning_key])
        ic(distance, winning_key)
        winning_distances = sorted(distances_to_paths.keys())[:10]
        for distance in winning_distances:
            ic(distance, distances_to_paths[distance])

    return winning_key
Code example #9
    def compute_feature(cls, HL: Headline) -> np.ndarray:
        # replaced word & replacement word.
        words = [HL.sentence[HL.word_index], HL.edit]
        # transcribe each token to ARPAbet.
        phones = [" ".join(cls.g2p(w.lower())) for w in words]
#         for i, w in enumerate(words):
            # try:
                # s = " "
                # words[i] = s.join(cls.g2p(w))

            # except KeyError:
                # # print erroneous key
                # print(w)
                # # tracks and prints errors
                # cls.counter += 1
                # print(cls.counter)
        # calculate the Levenshtein distance between the two pronunciations.
        levenshtein_dist = StringMatcher.distance(*phones)
        # scale by the length of the longer pronunciation string
        scale_factor = max([len(w) for w in phones])
        scaled_dist = levenshtein_dist/scale_factor
        return np.array([scaled_dist])
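
The feature above is the Levenshtein distance between two ARPAbet transcriptions, scaled by the longer one. A minimal sketch with hand-written phone strings standing in for the project-specific g2p output and Headline type:

import numpy as np
import Levenshtein

phones = ["K AE1 T", "B AE1 T"]                       # "cat" vs "bat"
levenshtein_dist = Levenshtein.distance(*phones)      # 1 (K -> B)
scale_factor = max(len(p) for p in phones)            # 7
print(np.array([levenshtein_dist / scale_factor]))    # [0.14285714]
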
Code example #10
def phase_compare(docu, file, n):
    dom = docu.lower()
    drs = filter_source(file, n)
    for i, e in enumerate(drs):
        try:
            lis = []
            word = e['Word']
            phase_ = e['Pahse']
            index = dom.find(str(phase_))
            # print(e['Pahse'])
            # print(index)
            # print('\n')
            if index != -1:
                patten = re.compile(r'\d*')
                result = re.search(patten, word)
                index_begin = word.index(str(result.group()))
                index_middle = word.find('(')
                index_end = word.find(')')
                index_last = word.find('[')
                if len(result.group()) > 3:
                    if ')' in word:
                        tag1 = word[index_begin + 4:index_middle]
                        tag2 = word[index_middle + 1:index_end]
                        lis.append(tag1.strip())
                        lis.append(tag2.strip())
                    else:
                        tag3 = word[index_begin + 4:index_last]
                        lis.append(tag3.strip())
                else:
                    if ')' in word:
                        tag1 = word[index_begin + 4:index_middle]
                        tag2 = word[index_middle + 1:index_end]
                        lis.append(tag1.strip())
                        lis.append(tag2.strip())
                    else:
                        tag3 = word[index_begin + 4:index_last]
                        lis.append(tag3.strip())
                # deduplicate
                aim_word = set(lis)
                aim_word_ = list(aim_word)
                # compare
                if len(aim_word_) == 2:
                    s1 = aim_word_[0]
                    s2 = aim_word_[1]
                    len1 = StringMatcher.ratio(s1, phase_)
                    len2 = StringMatcher.ratio(s2, phase_)
                    aim_word_.clear()
                    if len1 > len2:
                        aim_word_.append(s1)
                    else:
                        aim_word_.append(s2)
                if len(aim_word_) == 1:
                    s3 = aim_word_[0]
                    aim_word_.clear()
                    aim_word_.append(s3)
                index1 = phase_.find(aim_word_[0].strip())
                if index1 != -1:
                    global index2
                    index2 = index + index1
                    yield {
                        'index': index2,
                        'word': aim_word_[0],
                        'len': len(aim_word_[0])
                    }
                    X = 'a' * len(aim_word_[0])
                    word = dom[index2:index2 + len(aim_word_[0])]
                    dom = dom.replace(word, X, 1)
        except KeyError:
            continue
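
The disambiguation step in the middle of phase_compare keeps whichever of two candidate tags has the higher Levenshtein ratio against the phrase. A minimal sketch of that choice, using Levenshtein.ratio (the module-level StringMatcher.ratio used above behaves the same way in python-Levenshtein):

import Levenshtein

phrase = "convolutional neural network"
candidates = ["convolutional network", "recurrent network"]
best = max(candidates, key=lambda tag: Levenshtein.ratio(tag, phrase))
print(best)   # 'convolutional network'
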
Code example #11
def search(name_string,
           bin_to_id,
           id_to_name,
           gender=None,
           birthdate=None,
           similarity_threshold=60):
    # TODO should distinguish between first name (less reliable match) and other names.
    # consider storing in each bin, a namepart object linking to its name linking to its subject, that has name.isFirstName:bool

    # TODO consider searching per name alias instead of per candidate (list of aliases), requires a different data structure for lookups

    # 1. calculate the phonetics bins of the input name
    name_parts = [NamePart(name_string)]
    name_parts = normalizer.normalize_name_alias(NameAlias(name_parts, None))

    bins = set()
    for name_part in name_parts:
        name_part_bins = [
            b for b in dmeta(name_part) if b
        ]  # dmeta sometimes outputs an empty 'None' bin, filter it out
        for bin in name_part_bins:
            bins.add((bin, name_part))

    # 2. find candidates with one or more matching bins
    candidates = set()
    name_parts_matched = set()
    bad_candidates = []  # candidates found to be bad matches for the query
    for (bin, name_part) in bins:
        if bin in bin_to_id:
            candidates_in_bin = bin_to_id[bin]
            for c in candidates_in_bin:
                (candidate_id, candidate_name_part) = c
                if candidate_id in bad_candidates:
                    # we already know this candidate is a bad match
                    continue

                (names, birthdates) = id_to_name[candidate_id]
                registered_genders = [
                    g for g in [x.gender for x in names] if g
                ]  # filter out None value for gender, i.e. unknown
                if gender and len(registered_genders
                                  ) == 1 and gender not in registered_genders:
                    # mark the candidate as bad, so that we don't have to consider it again for this search query
                    bad_candidates.append(candidate_id)
                    continue  # skip to next candidate
                if birthdate and birthdates:
                    # exact birthdates are known
                    if birthdate not in birthdates:
                        # mark the candidate as bad, so that we don't have to consider it again for this search query
                        bad_candidates.append(candidate_id)
                        continue  # skip to next candidate
                # TODO also check birthdate ranges, or birthyear list only
                # TODO could optionally check birth country

                if levenshtein_distance.ratio(
                        name_part, candidate_name_part
                ) >= 0.6:  # do not add really bad matches
                    candidates.add(candidate_id)
                    name_parts_matched.add(name_part)

    # 3. calculate phonetic string similarity
    name_parts_missed = name_parts - name_parts_matched
    matching_character_count = sum(map(len, name_parts_matched))
    missing_character_count = sum(map(len, name_parts_missed))
    phonetic_similarity_ratio = 100 * matching_character_count / (
        matching_character_count + missing_character_count)
    if phonetic_similarity_ratio < 25:  # performance: Early exit for really bad matches
        return []  # return no matches

    # 4. look up candidate names, filter out matches that are really bad, sort the remaining matches by similarity ratio
    normalized_query_name = " ".join(name_parts)
    # TODO word counts can be precomputed for better performance
    input_word_count = 1 if normalized_query_name.find(" ") < 0 else len(
        normalized_query_name.split()
    )  # makes sure to split only on whitespace,
    short_name_length_limit = 12
    is_short_input_name = len(normalized_query_name) <= short_name_length_limit
    shortness = max(0, short_name_length_limit - len(normalized_query_name))

    filtered_candidates = []
    for candidate_id in candidates:
        list_subject = id_to_name[candidate_id]
        (list_subject_aliases, birthdays) = list_subject
        for candidate_name in list_subject_aliases:
            normalized_candidate_name = " ".join(
                normalizer.normalize_name_alias(candidate_name)
            )  # TODO precompute this for better performance
            string_similarity = fuzz.token_sort_ratio(
                normalized_candidate_name, normalized_query_name)

            exact_match = string_similarity == 100
            similarity_score = string_similarity - 5

            if not exact_match:
                # 1. apply boosts:

                # boost phonetically similar matches
                boost_from_phonetic_similarity = similarity_threshold / 100.0 * phonetic_similarity_ratio / 16  # up to approx 6 points at 90% threshold
                similarity_score += boost_from_phonetic_similarity

                # 2. apply penalties:

                if is_short_input_name:
                    # TODO hackish, look for a better solution
                    # short matches must be extra good. Reduces false positives.
                    debuff = 2 * (similarity_threshold / 100.0) * shortness
                    similarity_score -= debuff

                # TODO word counts can be precomputed for better performance
                candidate_word_count = 1 if normalized_candidate_name.find(
                    " ") < 0 else len(normalized_candidate_name.split())
                missing_words = abs(candidate_word_count - input_word_count)
                if missing_words:
                    missing_words_score = missing_words * 5 * similarity_threshold / 100.0
                    missing_words_penalty = min(
                        20,
                        missing_words_score)  # set a ceiling for the penalty
                    similarity_score -= missing_words_penalty  # 0 if missing 0 words, -4 if missing 2 words, etc

                # 3. normalize score after applying boosts and penalties
                similarity_score = max(
                    0, min(similarity_score, 99.9)
                )  # present all non-exact matches as no more than 99.9

            if similarity_score >= similarity_threshold:
                element = (candidate_id, similarity_score, candidate_name)
                filtered_candidates.append(element)

    filtered_candidates.sort(key=lambda tup: tup[1],
                             reverse=True)  # sort by ratio, descending

    unique_candidates = []
    seen_candidates = set()
    for c in filtered_candidates:
        # only report one match against each list-subject, the best matching alias
        (candidate_id, similarity_score, candidate_name) = c
        if candidate_id not in seen_candidates:
            unique_candidates.append(c)
            seen_candidates.add(candidate_id)

    return unique_candidates
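
The final scoring step relies on token_sort_ratio, a word-order-insensitive similarity from the fuzzywuzzy (or thefuzz) package, with boosts and penalties layered on top. A minimal sketch of just the threshold filter, with hypothetical names standing in for the candidate lists:

from fuzzywuzzy import fuzz

query = "smith john"
candidates = ["John Smith", "John Smyth", "Jane Smith"]
threshold = 80

scored = [(name, fuzz.token_sort_ratio(name.lower(), query)) for name in candidates]
kept = sorted([c for c in scored if c[1] >= threshold], key=lambda c: c[1], reverse=True)
print(kept)   # the exact token match scores 100; near-misses score lower
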
Code example #12
    def statistic_similarity(self, paper, min_similarity):
        """Function that splits the paper text in n-grams (unigrams,bigrams,trigrams)
        and with a Levenshtein it check the similarity for each of them with the topics in the ontology.

        Args:
            paper (string): The paper to analyse. At this stage it is a string.
            min_similarity (integer): minimum Levenshtein similarity between the n-gram and the topics within the CSO. 

        Returns:
            found_topics (dictionary): containing the found topics with their similarity and the n-gram analysed.
        """

        # analysing grams
        found_topics = defaultdict(list)
        matches = set()
        tokens = word_tokenize(paper, preserve_line=True)
        # TODO: this is constant; factor out
        topic_stems = defaultdict(list)
        for k in self.cso['topics'].keys():
            topic_stems[k[:4]].append(k)

        for n in range(3, 0, -1):
            for i, grams in enumerate(ngrams(tokens, n)):
                if i in matches:
                    continue
                gram = " ".join(grams)
                try:
                    topic_block = topic_stems[gram[:4]]
                except KeyError:
                    continue
                for topic in topic_block:
                    m = ls.StringMatcher(None, topic, gram).ratio()
                    if m >= min_similarity:
                        topic = self.get_primary_label(topic, self.cso['primary_labels'])
                        found_topics[topic].append({'matched': gram, 'similarity': m})
                        matches.add(i)

        # idx = 0
        # trigrams = ngrams(word_tokenize(paper, preserve_line=True), 3)
        # matched_trigrams = []
        # for grams in trigrams:
        #     idx += 1
        #     gram = " ".join(grams)
        #     topic_block = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
        #     for topic in topic_block:
        #         m = ls.StringMatcher(None, topic, gram).ratio()
        #         if m >= min_similarity:
        #             topic = self.get_primary_label(topic, self.cso['primary_labels'])
        #             if topic in found_topics:
        #                 found_topics[topic].append({'matched': gram, 'similarity': m})
        #             else:
        #                 found_topics[topic] = [{'matched': gram, 'similarity': m}]
        #             matched_trigrams.append(idx)
        #
        # idx = 0
        # bigrams = ngrams(word_tokenize(paper, preserve_line=True), 2)
        # matched_bigrams = []
        # for grams in bigrams:
        #     idx += 1
        #     if (idx not in matched_trigrams) and ((idx - 1) not in matched_trigrams):
        #         gram = " ".join(grams)
        #         topic_block = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
        #         for topic in topic_block:
        #             m = ls.StringMatcher(None, topic, gram).ratio()
        #             if m >= min_similarity:
        #                 topic = self.get_primary_label(topic, self.cso['primary_labels'])
        #                 if topic in found_topics:
        #                     found_topics[topic].append({'matched': gram, 'similarity': m})
        #                 else:
        #                     found_topics[topic] = [{'matched': gram, 'similarity': m}]
        #                 matched_bigrams.append(idx)
        #
        # idx = 0
        # unigrams = ngrams(word_tokenize(paper, preserve_line=True), 1)
        # for grams in unigrams:
        #     idx += 1
        #     if (idx not in matched_trigrams) and ((idx - 1) not in matched_trigrams) and (
        #             idx not in matched_bigrams) and ((idx - 1) not in matched_bigrams) and (
        #             (idx - 1) not in matched_bigrams):
        #         gram = " ".join(grams)
        #         topic_block = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
        #         for topic in topic_block:
        #             m = ls.StringMatcher(None, topic, gram).ratio()
        #             if m >= min_similarity:
        #                 topic = self.get_primary_label(topic, self.cso['primary_labels'])
        #                 if topic in found_topics:
        #                     found_topics[topic].append({'matched': gram, 'similarity': m})
        #                 else:
        #                     found_topics[topic] = [{'matched': gram, 'similarity': m}]

        return found_topics
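
Compared with the earlier versions, the main change here is precomputing topic_stems once: topics are bucketed by their first four characters, so each n-gram needs a single dict lookup instead of a scan over every topic. A tiny sketch of that bucketing:

from collections import defaultdict

topics = ["machine learning", "machine translation", "databases", "data mining"]

topic_stems = defaultdict(list)
for topic in topics:
    topic_stems[topic[:4]].append(topic)

print(topic_stems["mach"])   # ['machine learning', 'machine translation']
print(topic_stems["data"])   # ['databases', 'data mining']
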
Code example #13
    def statistic_similarity(self, paper, min_similarity):
        """Function that splits the paper text in n-grams (unigrams,bigrams,trigrams)
        and with a Levenshtein it check the similarity for each of them with the topics in the ontology.

        Args:
            paper (string): The paper to analyse. At this stage it is a string.
            min_similarity (integer): minimum Levenshtein similarity between the n-gram and the topics within the CSO. 

        Returns:
            found_topics (dictionary): containing the found topics with their similarity and the n-gram analysed.
        """

        # analysing grams
        found_topics = {}

        idx = 0
        trigrams = ngrams(word_tokenize(paper, preserve_line=True), 3)
        matched_trigrams = []
        for grams in trigrams:
            idx += 1
            gram = " ".join(grams)
            topics = [
                key for key, _ in self.cso['topics'].items()
                if key.startswith(gram[:4])
            ]
            for topic in topics:
                m = ls.StringMatcher(None, topic, gram).ratio()
                if m >= min_similarity:
                    topic = self.get_primary_label(topic,
                                                   self.cso['primary_labels'])
                    if topic in found_topics:
                        found_topics[topic].append({
                            'matched': gram,
                            'similarity': m
                        })
                    else:
                        found_topics[topic] = [{
                            'matched': gram,
                            'similarity': m
                        }]
                    matched_trigrams.append(idx)

        idx = 0
        bigrams = ngrams(word_tokenize(paper, preserve_line=True), 2)
        matched_bigrams = []
        for grams in bigrams:
            idx += 1
            if (idx not in matched_trigrams) and ((idx - 1)
                                                  not in matched_trigrams):
                gram = " ".join(grams)
                topics = [
                    key for key, _ in self.cso['topics'].items()
                    if key.startswith(gram[:4])
                ]
                for topic in topics:
                    m = ls.StringMatcher(None, topic, gram).ratio()
                    if m >= min_similarity:
                        topic = self.get_primary_label(
                            topic, self.cso['primary_labels'])
                        if topic in found_topics:
                            found_topics[topic].append({
                                'matched': gram,
                                'similarity': m
                            })
                        else:
                            found_topics[topic] = [{
                                'matched': gram,
                                'similarity': m
                            }]
                        matched_bigrams.append(idx)

        idx = 0
        unigrams = ngrams(word_tokenize(paper, preserve_line=True), 1)
        for grams in unigrams:
            idx += 1
            if (idx not in matched_trigrams) and (
                (idx - 1) not in matched_trigrams) and (
                    idx not in matched_bigrams) and (
                        (idx - 1) not in matched_bigrams) and (
                            (idx - 1) not in matched_bigrams):
                gram = " ".join(grams)
                topics = [
                    key for key, _ in self.cso['topics'].items()
                    if key.startswith(gram[:4])
                ]
                for topic in topics:
                    m = ls.StringMatcher(None, topic, gram).ratio()
                    if m >= min_similarity:
                        topic = self.get_primary_label(
                            topic, self.cso['primary_labels'])
                        if topic in found_topics:
                            found_topics[topic].append({
                                'matched': gram,
                                'similarity': m
                            })
                        else:
                            found_topics[topic] = [{
                                'matched': gram,
                                'similarity': m
                            }]

        return found_topics