Example #1
 def test_postprocessor(self):
     self.assertEqual("The cat eats fish.", RuleBasedPostprocessor.correct("The cat eats fish."))
     self.assertEqual("The cat, who likes fish.",
                      RuleBasedPostprocessor.correct("The cat , who likes fish ."))
     self.assertEqual("Bla's 'statement': \"bli.\"",
                      RuleBasedPostprocessor.correct("Bla 's 'statement' : \" bli . \""))
     self.assertEqual('""', RuleBasedPostprocessor.correct('" "'))
     self.assertEqual('bla "" bli', RuleBasedPostprocessor.correct('bla " " bli'))
     self.assertEqual("I ate 123 apples.", RuleBasedPostprocessor.correct("I ate 1 2 3 apples."))
     self.assertEqual("I use character-based language models.",
                      RuleBasedPostprocessor.correct("I use character - based language models ."))
     self.assertEqual("bla (bli) blu", RuleBasedPostprocessor.correct("bla ( bli ) blu"))
     self.assertEqual("()", RuleBasedPostprocessor.correct("( )"))
     self.assertEqual("bla () bli", RuleBasedPostprocessor.correct("bla ( ) bli"))
Example #2
 def correct(self, sequence: str) -> str:
     # Strip all spaces, then re-segment the character sequence with dynamic
     # programming over word end positions.
     sequence = sequence.replace(' ', '')
     word_locations = self.locate_words(sequence)
     solutions = []
     for i in range(len(sequence)):
         candidates = []
         # Start positions of known words ending at position i.
         beginnings = word_locations[i]
         if len(beginnings) == 0:
             # No known word ends here: fall back to a single character.
             beginnings = [i]
         for b in beginnings:
             if b == 0:
                 token = sequence[:(i + 1)]
                 candidate = self._new_candidate(token)
                 candidates.append(candidate)
             elif solutions[b - 1] is not None:
                 # Extend the best solution that ends just before b.
                 previous = solutions[b - 1]
                 token = sequence[b:(i + 1)]
                 candidate = self._expand_candidate(previous, token)
                 candidates.append(candidate)
         solutions.append(self._pick_best_candidate(candidates))
     final_solution = solutions[-1]
     predicted = ' '.join(final_solution.tokens)
     if self.bigram_postprocessor is not None:
         predicted = self.bigram_postprocessor.correct(predicted)
     predicted = RuleBasedPostprocessor.correct(predicted)
     return predicted
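The method above is a dynamic program over word end positions: each position keeps the best candidate, built by extending the best solution that ends just before a known word. A minimal self-contained sketch of the same idea with a toy vocabulary; all names here are illustrative, not from the project:

def segment_dp(sequence: str, vocabulary: set) -> list:
    # best[i] holds the best token list covering sequence[:i], or None.
    best = [[]] + [None] * len(sequence)
    for end in range(1, len(sequence) + 1):
        for start in range(end):
            token = sequence[start:end]
            if best[start] is not None and (token in vocabulary or len(token) == 1):
                candidate = best[start] + [token]
                # Prefer segmentations with fewer tokens (fewer fallbacks).
                if best[end] is None or len(candidate) < len(best[end]):
                    best[end] = candidate
    return best[-1]

print(' '.join(segment_dp("thecat", {"the", "cat"})))  # -> "the cat"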
Example #3
from wordsegment import load, segment

class WordSegment:
    def __init__(self):
        # Load the wordsegment package's unigram/bigram frequency data once.
        load()
        self.postprocessor = RuleBasedPostprocessor()

    def correct(self, sequence: str) -> str:
        # Remove all whitespace, segment with the wordsegment package, then
        # restore punctuation and apply the rule-based postprocessor.
        sequence = ''.join(sequence.split())
        segmented = ' '.join(segment(sequence))
        predicted = reinsert_punctuation(segmented, sequence)
        predicted = self.postprocessor.correct(predicted)
        return predicted
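WordSegment delegates the actual segmentation to the wordsegment package: load() reads the package's frequency data once, and segment() splits a concatenated string into words. A quick standalone check, assuming the package is installed:

from wordsegment import load, segment

load()
print(segment("thecateatsfish"))  # -> ['the', 'cat', 'eats', 'fish']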
Example #4
class FuzzyGreedyCorrector:
    PENALTY = 0.1

    def __init__(self):
        unigrams = UnigramHolder()
        print("%i unigrams" % len(unigrams))
        bigrams = BigramHolder.load()
        print("%i bigrams" % len(bigrams))
        self.matcher = FuzzyMatcher(unigrams, bigrams, self.PENALTY)
        print("%i stumps" % len(self.matcher.stump_dict))
        self.tokenizer = Tokenizer()
        self.rule_based_postprocessor = RuleBasedPostprocessor()

    def correct(self, sequence: str) -> str:
        tokens = self.tokenizer.tokenize(sequence)
        texts = [token.text for token in tokens]
        predicted = ""
        t_i = 0
        while t_i < len(texts):
            if t_i > 0:
                predicted += ' '
            text = texts[t_i]
            if not text.isalpha():
                predicted += text
                t_i += 1
                continue
            # try merge:
            if t_i + 1 < len(texts) and texts[t_i + 1].isalpha():
                _, bigram_frequency = self.matcher.fuzzy_bigram_frequency(
                    text, texts[t_i + 1])
                merge = text + texts[t_i + 1]
                _, merge_frequency = self.matcher.fuzzy_unigram_frequency(
                    merge)
                if merge_frequency * self.PENALTY > bigram_frequency:
                    predicted += merge
                    t_i += 2
                    continue
            # try split:
            if len(text) > 1:
                _, unigram_frequency = self.matcher.fuzzy_unigram_frequency(
                    text)
                split, _, split_frequency = self.matcher.best_fuzzy_split(
                    text, lower_bound=unigram_frequency)
                if split_frequency * self.PENALTY > unigram_frequency:
                    predicted += ' '.join(split)
                    t_i += 1
                    continue
            predicted += text
            t_i += 1
        predicted = self.rule_based_postprocessor.correct(predicted)
        return predicted
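The greedy corrector only accepts a merge or split when it beats the current reading by a factor of 1 / PENALTY, which compensates for fuzzy matching finding some frequency almost anywhere. A toy illustration of the decision with made-up frequencies:

PENALTY = 0.1
bigram_frequency = 40   # hypothetical count of ("foot", "ball") as two tokens
merge_frequency = 900   # hypothetical count of "football" as one token
if merge_frequency * PENALTY > bigram_frequency:
    print("merge wins")  # 90 > 40, so the two tokens are joined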
Example #5
from typing import List

import numpy as np

# EPSILON, MAX_WORD_LEN, BigramModel, Solution and RuleBasedPostprocessor
# are project-level imports not shown in this snippet.

class BigramDynamicCorrector:
    def __init__(self):
        self.model = BigramModel()
        self.rule_based_postprocessor = RuleBasedPostprocessor()

    def is_token(self, text: str) -> bool:
        return self.model.unigrams.is_unigram(text)

    def locate_words(self, text: str) -> List[List[str]]:
        # located_words[i] collects every known word (or single character)
        # starting at position i, at most MAX_WORD_LEN characters long.
        located_words = [[] for _ in text]
        for i in range(len(text)):
            for j in range(i + 1, min(i + MAX_WORD_LEN, len(text)) + 1):
                word = text[i:j]
                if self.is_token(word) or len(word) == 1:
                    located_words[i].append(word)
        return located_words

    def correct(self, sequence: str) -> str:
        sequence = sequence.replace(' ', '')
        words_at_position = self.locate_words(sequence)
        # solutions[i] maps a word ending at position i to the best-scoring
        # segmentation of sequence[:i + 1] that ends with that word.
        solutions = [{} for _ in sequence]
        for position in range(len(sequence)):
            words = words_at_position[position]
            for word in words:
                end_pos = position + len(word) - 1
                if position == 0:
                    p = self.model.get_unigram_probability(word) + EPSILON
                    solutions[end_pos][word] = Solution(word, word, np.log(p))
                else:
                    # Extend every segmentation that ends right before this word.
                    for previous_word in solutions[position - 1]:
                        prefix_solution = solutions[position - 1][previous_word]
                        bigram = (prefix_solution.last_token, word)
                        p = self.model.get_probability(bigram) + EPSILON
                        score = prefix_solution.score + np.log(p)
                        if word not in solutions[end_pos] or score > solutions[end_pos][word].score:
                            solutions[end_pos][word] = Solution(
                                prefix_solution.sequence + ' ' + word, word, score)
        # Keep the highest-scoring segmentation that covers the whole
        # sequence; fall back to the unsegmented input if none exists.
        predicted = sequence
        best_score = -np.inf
        for last_word in solutions[-1]:
            solution = solutions[-1][last_word]
            if solution.score > best_score:
                predicted = solution.sequence
                best_score = solution.score
        predicted = self.rule_based_postprocessor.correct(predicted)
        return predicted
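Solution is constructed with three positional arguments and read back through .sequence, .last_token and .score. A plausible minimal definition, inferred from that usage rather than taken from the project:

from collections import namedtuple

# Inferred fields: the segmented text so far, the last token (needed for
# the next bigram lookup), and the accumulated log probability.
Solution = namedtuple("Solution", ["sequence", "last_token", "score"])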
Example #6
class LeftToRightCorrector:
    def __init__(self):
        self.unigrams = UnigramHolder()
        self.bigrams = BigramHolder.load()
        self.tokenizer = Tokenizer()
        self.postprocessor = RuleBasedPostprocessor()

    def try_merge(self, token: str, next_token: str) -> bool:
        # Merge when the concatenation is more frequent as one word than the
        # pair is as a bigram.
        return self.unigrams.get(token + next_token) > self.bigrams.get(
            (token, next_token))

    def best_split(self, token: str) -> str:
        best = token
        best_frequency = self.unigrams.get(token)
        best_unigram_frequency = best_frequency
        for i in range(1, len(token)):
            left, right = token[:i], token[i:]
            frequency = self.bigrams.get((left, right))
            unigram_frequency = min(self.unigrams.get(left),
                                    self.unigrams.get(right))
            # Prefer the split with the more frequent bigram; break ties by
            # the rarer of the two unigrams.
            if frequency > best_frequency or (
                    frequency == best_frequency
                    and unigram_frequency > best_unigram_frequency):
                best = left + ' ' + right
                best_frequency = frequency
                best_unigram_frequency = unigram_frequency
        return best

    def correct(self, sequence: str) -> str:
        tokens = self.tokenizer.tokenize(sequence)
        texts = [token.text for token in tokens]
        predicted = ""
        t_i = 0
        while t_i < len(texts):
            if t_i > 0:
                predicted += ' '
            if t_i + 1 < len(texts) and self.try_merge(texts[t_i],
                                                       texts[t_i + 1]):
                predicted += texts[t_i] + texts[t_i + 1]
                t_i += 2
            else:
                predicted += self.best_split(texts[t_i])
                t_i += 1
        predicted = self.postprocessor.correct(predicted)
        return predicted
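A minimal usage sketch, assuming the project's frequency files are in place so that UnigramHolder() and BigramHolder.load() succeed; the input string is illustrative:

corrector = LeftToRightCorrector()
print(corrector.correct("I use lang uage models."))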
Example #7
import project
from src.interactive.sequence_generator import interactive_sequence_generator
from src.postprocessing.rule_based import RuleBasedPostprocessor

if __name__ == "__main__":
    for sequence in interactive_sequence_generator():
        predicted = RuleBasedPostprocessor.correct(sequence)
        print(predicted)
Example #8
 def test_combined_case(self):
     self.assertEqual("bla (\"bli\") blu", RuleBasedPostprocessor.correct("bla ( \" bli \" ) blu"))
Example #9
 def test_quotation_beginning(self):
     self.assertEqual("\"bla\"", RuleBasedPostprocessor.correct("\" bla \""))