Ejemplo n.º 1
0
    def spell_score(misspelling, candidates, method=1):
        """
        Compute an edit-distance score between a misspelling and every candidate.

        :param misspelling: the misspelled token
        :param candidates: list of candidate corrections
        :param method: scoring variant, one of [1, 2, 3, 4]
        :return: list of scores, one per candidate
        :raises ValueError: if method is not in [1, 2, 3, 4]
        """
        orthographic = [damerau_levenshtein_distance(misspelling, c)
                        for c in candidates]

        if method == 1:
            return orthographic

        # Phonetic distance on the primary Double Metaphone encodings.
        phonetic = [damerau_levenshtein_distance(dm(misspelling)[0], dm(c)[0])
                    for c in candidates]

        if method == 2:
            # Map zero phonetic distances to 1 so a perfect phonetic match
            # does not collapse the score to zero.
            return [score or 1 for score in phonetic]
        if method == 3:
            # Arithmetic mean of the two distances.
            return [0.5 * (o + p) for o, p in zip(orthographic, phonetic)]
        if method == 4:
            # Weighted combination, squared (orthography counts double).
            return [(2 * o + p) ** 2 for o, p in zip(orthographic, phonetic)]
        raise ValueError('Method must be element from [1, 2, 3, 4]')
Ejemplo n.º 2
0
def homepage():
    """
    Handle the comparison form: score the two submitted inputs lexically and
    phonetically, then render the result page.
    """
    value1 = str(request.form['input1'])
    value2 = str(request.form['input2'])

    # Lexical similarity on the raw strings.
    fuzzyNoDM = fuzz.ratio(value1, value2)
    # Phonetic similarity on the Double Metaphone encodings.
    # BUG FIX: the original compared dm(value1)[0] against dm(value2)[1]
    # (primary vs. secondary encoding); both sides must use the primary
    # encoding, as homophone() below does.
    fuzzyDM = fuzz.ratio(dm(value1)[0], dm(value2)[0])
    ptr = homophone(value1, value2)
    print(ptr)
    # NOTE(review): redirect() only builds a response object; its return
    # value is discarded here, so this call has no effect — confirm intent.
    redirect('/results')

    return correct(value1, value2, ptr, fuzzyDM, fuzzyNoDM)
Ejemplo n.º 3
0
def candidates(misspellings, language='en'):
    """
    Generate correction candidates for each misspelling by combining
    Damerau-Levenshtein candidates (edit distance 2 on the surface form)
    with Double Metaphone candidates (edit distance 1 on the phonetic
    encoding).

    :param misspellings: list of misspelled tokens
    :param language: language code selecting the lexicon file
    :return: list with one deduplicated candidate list per misspelling
    """
    # Close the lexicon file deterministically (the original leaked the handle).
    with open("lexicon_" + language + ".json", 'r') as lexicon_file:
        vocab = json.load(lexicon_file)
    vocab_dict = load_vocab(vocab)

    print(str(len(misspellings)) + ' misspellings to generate candidates for')

    print("Generating Damerau-Levenshtein candidates")
    candidates_list = [
        levenshtein_candidates(misspelling, vocab_dict, editdistance=2)
        for misspelling in misspellings
    ]

    print("Generating Double Metaphone candidates edit distance 1")
    metaphone_dict = load_metaphones(vocab)
    # Re-key the vocabulary by metaphone so candidate search runs in
    # phonetic space.
    vocab_dict = load_vocab(list(metaphone_dict.keys()))
    metaphone_candidates = [
        levenshtein_candidates(dm(misspelling)[0], vocab_dict, editdistance=1)
        for misspelling in misspellings
    ]
    # Map phonetic candidates back to lexical items.
    soundslike_candidates = [
        convert_candidates(cands, detection, metaphone_dict)
        for cands, detection in zip(metaphone_candidates, misspellings)
    ]
    # Merge both candidate sources per misspelling and drop duplicates.
    candidates_list = [
        list(set(lexical + phonetic))
        for lexical, phonetic in zip(candidates_list, soundslike_candidates)
    ]

    return candidates_list
Ejemplo n.º 4
0
def homophone(str1, str2):
    """
    Return 1 if the two strings are homophones under Double Metaphone,
    otherwise None.

    Two strings match when their primary encodings are identical, or when
    both have a non-empty secondary encoding and those are identical.
    """
    # Encode each string once instead of re-running Double Metaphone for
    # every comparison (the original called dm() up to four times per string).
    dm1 = dm(str1)
    dm2 = dm(str2)
    print(dm1)
    print(dm2)
    if fuzz.ratio(dm1[0], dm2[0]) == 100:
        return 1
    # Only consult the secondary encodings when both are non-empty.
    if dm1[1] != '' and dm2[1] != '' and fuzz.ratio(dm1[1], dm2[1]) == 100:
        return 1
    # Explicit fall-through: not homophones (the original returned None
    # implicitly).
    return None
Ejemplo n.º 5
0
    def noisychannel_ranking(self, candidates_list):
        """
        An approximate implementation of the ranking method described in
        Lai et al. (2015), 'Automated Misspelling Detection and Correction
        in Clinical Free-Text Records'.

        :param candidates_list: list of candidate lists, one per misspelling
            in self.misspellings
        :return: list with one correction per misspelling when self.k == 1
            (empty string when there are no candidates), otherwise the k
            best-ranked candidates per misspelling
        """
        correction_list = []

        for misspelling, candidates in zip(self.misspellings, candidates_list):

            # No candidates: emit an empty correction and move on.
            if not candidates:
                correction_list.append('')
                continue

            score_list = []
            for candidate in candidates:
                orthographic_edit_distance = damerau_levenshtein_distance(
                    misspelling, candidate)
                # Distance between the primary Double Metaphone encodings.
                phonetic_edit_distance = damerau_levenshtein_distance(
                    dm(misspelling)[0],
                    dm(candidate)[0])
                spell_score = (2 * orthographic_edit_distance +
                               phonetic_edit_distance)**2  # P(m|c)
                # Unseen candidates default to frequency 1 so log() stays 0.
                frequency = self.frequency_dict.get(candidate, 1)
                frequency_score = 1 / (1 + log(frequency))  # P(c)
                score = spell_score * frequency_score  # P(c|m) = P(m|c)*P(c)
                score_list.append(score)

            score_list = np.array(score_list)
            if self.k == 1:
                # candidates is guaranteed non-empty here, so argmin cannot
                # raise (the original wrapped this in a dead try/except).
                correction_list.append(candidates[np.argmin(score_list)])
            else:
                # Lower score is better: keep the k smallest.
                correction_list.append(
                    [candidates[i] for i in np.argsort(score_list)[:self.k]])

        return correction_list
Ejemplo n.º 6
0
    def noisychannel_ranking(self, detection_list, candidates_list):
        """
        An approximate implementation of the ranking method described in
        (Lai et al. 2015). Also stores a per-misspelling confidence (relative
        gap between the two best scores) in self.confidences.

        :param detection_list: list of misspellings
        :param candidates_list: list of candidate lists, one per misspelling
        :return: list with one correction per misspelling when self.k == 1
            (empty string when there are no candidates), otherwise the k
            best-ranked candidates per misspelling
        :raises ValueError: if self.k is not a positive natural number
        """

        correction_list = []
        confidences = []

        for misspelling, candidates in zip(detection_list, candidates_list):
            score_list = []
            for candidate in candidates:
                orthographic_edit_distance = damerau_levenshtein_distance(misspelling, candidate)
                # Distance between the primary Double Metaphone encodings.
                phonetic_edit_distance = damerau_levenshtein_distance(dm(misspelling)[0], dm(candidate)[0])

                spell_score = (2 * orthographic_edit_distance + phonetic_edit_distance) ** 2  # P(m|c)

                # Unseen candidates default to frequency 1 so log() stays 0.
                frequency = self.frequency_dict.get(candidate, 1)

                frequency_score = 1 / (1 + log(frequency))  # P(c)

                score = spell_score * frequency_score  # P(c|m) = P(m|c)*P(c)
                score_list.append(score)

            score_list = np.array(score_list)

            if len(score_list) > 1:
                sorted_distances = [score_list[i] for i in np.argsort(score_list)]
                top1 = sorted_distances[0]
                top2 = sorted_distances[1]
                # BUG FIX: the original divided by top1 unconditionally; a
                # perfect candidate (score 0) silently produced inf/nan under
                # numpy. Handle the zero cases explicitly.
                if top1 != 0:
                    confidence = abs(top1 - top2) / top1
                elif top2 != 0:
                    # Perfect best score with a strictly worse runner-up:
                    # maximal confidence.
                    confidence = float('inf')
                else:
                    # Two tied perfect scores: no basis to prefer either.
                    confidence = 0.0
                confidences.append(confidence)
            else:
                confidences.append(0)

            if self.k == 1:
                try:
                    correction_list.append(candidates[np.argmin(score_list)])
                except ValueError:
                    # Empty candidate list: emit an empty correction.
                    correction_list.append('')
            elif self.k > 1:
                # Lower score is better: keep the k smallest.
                correction_list.append([candidates[i] for i in np.argsort(score_list)[:self.k]])
            else:
                raise ValueError('k must be positive natural number')

        self.confidences = confidences

        return correction_list
Ejemplo n.º 7
0
def load_metaphones(vocab):
    """
    Build the mapping from Double Metaphone encodings to lexical items.

    :param vocab: iterable of vocabulary items (the original docstring
        referred to a non-existent ``vocab_file`` parameter)
    :return: dict mapping each non-empty Double Metaphone encoding to the
        list of lexical items that produce it
    """

    # MAKE METAPHONE-LEXICAL MAPPING

    metaphone_dict = {}
    for item in vocab:
        # dm() yields a (primary, secondary) pair; either slot may be ''.
        for metaphone in dm(item):
            if metaphone:
                # Group items that share the same phonetic encoding.
                metaphone_dict.setdefault(metaphone, []).append(item)

    return metaphone_dict
Ejemplo n.º 8
0
            candidates_list.append(
                levenshtein_candidates(misspelling, vocab_dict,
                                       editdistance=1))
    else:
        print("Generating Damerau-Levenshtein candidates edit distance 2")
        for i, misspelling in enumerate(detection_list):
            print(i)
            candidates_list.append(
                levenshtein_candidates(misspelling, vocab_dict,
                                       editdistance=2))
        if sys.argv[2] == "all":
            print("Generating Double Metaphone candidates edit distance 1")
            metaphone_dict = load_metaphones(vocab)
            vocab_dict = load_vocab(list(metaphone_dict.keys()))
            metaphone_candidates = [
                levenshtein_candidates(dm(misspelling)[0],
                                       vocab_dict,
                                       editdistance=1)
                for misspelling in detection_list
            ]
            soundslike_candidates = [
                convert_candidates(candidates, detection, metaphone_dict)
                for candidates, detection in zip(metaphone_candidates,
                                                 detection_list)
            ]
            candidates_list = [
                list(set(candidates1 + candidates2)) for candidates1,
                candidates2 in zip(candidates_list, soundslike_candidates)
            ]

    with open(sys.argv[3], 'w') as f: