def count_unigrams(n_sequences: int):
    # Counts token frequencies over the Wikipedia training sequences, keeping
    # separate dictionaries for tokens preceded by a delimiter and tokens that
    # are not.
    total_start = timestamp()
    tokenizer = Tokenizer()
    counts_delim = {}
    counts_no_delim = {}
    tokenization_time = 0
    for s_i, sequence in enumerate(Wikipedia.training_sequences(n_sequences)):
        start = timestamp()
        tokens = tokenizer.tokenize(sequence)
        tokens[0].delimiter_before = True
        tokenization_time += time_diff(start)
        for token in tokens:
            counts = counts_delim if token.delimiter_before else counts_no_delim
            if token.text not in counts:
                counts[token.text] = 1
            else:
                counts[token.text] += 1
        # Report progress every K10 sequences and dump intermediate counts
        # every M sequences.
        if (s_i + 1) % K10 == 0:
            print("%ik sequences, %.2f s total time, %.2f s tokenization"
                  % ((s_i + 1) / K, time_diff(total_start), tokenization_time))
        if (s_i + 1) % M == 0:
            print("saving...")
            dump_object(counts_delim, paths.UNIGRAM_DELIM_FREQUENCY_DICT)
            dump_object(counts_no_delim, paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT)
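# Usage sketch (an assumption, not part of the original listing): a minimal
# driver that reads the number of training sequences from the command line and
# runs the unigram counting defined above. The argument position and the
# fallback value are illustrative only.
import sys

if __name__ == "__main__":
    n_sequences = int(sys.argv[1]) if len(sys.argv) > 1 else 100000
    count_unigrams(n_sequences)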
def search_tokens():
    n = int(sys.argv[2])
    tokenizer = Tokenizer()
    for query in interactive_sequence_generator():
        if query.startswith(' '):
            query_token = Token(query[1:], True)
        else:
            query_token = Token(query, False)
        for sequence in Wikipedia.training_sequences(n):
            tokens = tokenizer.tokenize(sequence)
            if query_token in tokens:
                print(sequence)
def count_bigrams(n_sequences: int):
    tokenizer = Tokenizer()
    holder = BigramHolder()
    for s_i, sequence in enumerate(Wikipedia.training_sequences(n_sequences)):
        tokens = tokenizer.tokenize(sequence)
        texts = [token.text for token in tokens]
        for i in range(len(tokens) - 1):
            bigram = texts[i:(i + 2)]
            holder.increment(bigram)
        if (s_i + 1) % K10 == 0:
            print("%ik sequences" % ((s_i + 1) / K))
        if (s_i + 1) % M == 0:
            print("saving...")
            holder.save()
    holder.save()
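# Usage sketch (an assumption, not part of the original listing): the stored
# bigram table can be reloaded and queried after counting. Only
# BigramHolder.load(), len() and .get(), which the correctors below rely on,
# are used here; the example bigram is arbitrary.
if __name__ == "__main__":
    holder = BigramHolder.load()
    print("%i bigrams" % len(holder))
    print(holder.get(("the", "house")))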
class FuzzyGreedyCorrector:
    # Greedy corrector based on fuzzy n-gram frequencies: for each alphabetic
    # token it first tries to merge it with the next token, then tries to split
    # it, and accepts a change only if the change's frequency, discounted by
    # PENALTY, exceeds the frequency of the current reading.
    PENALTY = 0.1

    def __init__(self):
        unigrams = UnigramHolder()
        print("%i unigrams" % len(unigrams))
        bigrams = BigramHolder.load()
        print("%i bigrams" % len(bigrams))
        self.matcher = FuzzyMatcher(unigrams, bigrams, self.PENALTY)
        print("%i stumps" % len(self.matcher.stump_dict))
        self.tokenizer = Tokenizer()
        self.rule_based_postprocessor = RuleBasedPostprocessor()

    def correct(self, sequence: str):
        tokens = self.tokenizer.tokenize(sequence)
        texts = [token.text for token in tokens]
        predicted = ""
        t_i = 0
        while t_i < len(texts):
            if t_i > 0:
                predicted += ' '
            text = texts[t_i]
            if not text.isalpha():
                predicted += text
                t_i += 1
                continue
            # try merge:
            if t_i + 1 < len(texts) and texts[t_i + 1].isalpha():
                _, bigram_frequency = self.matcher.fuzzy_bigram_frequency(
                    text, texts[t_i + 1])
                merge = text + texts[t_i + 1]
                _, merge_frequency = self.matcher.fuzzy_unigram_frequency(merge)
                if merge_frequency * self.PENALTY > bigram_frequency:
                    predicted += merge
                    t_i += 2
                    continue
            # try split:
            if len(text) > 1:
                _, unigram_frequency = self.matcher.fuzzy_unigram_frequency(text)
                split, _, split_frequency = self.matcher.best_fuzzy_split(
                    text, lower_bound=unigram_frequency)
                if split_frequency * self.PENALTY > unigram_frequency:
                    predicted += ' '.join(split)
                    t_i += 1
                    continue
            predicted += text
            t_i += 1
        predicted = self.rule_based_postprocessor.correct(predicted)
        return predicted
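# Usage sketch (an assumption, not part of the original listing): correcting
# queries interactively with the fuzzy greedy corrector, reusing the
# interactive_sequence_generator that the other scripts in this listing use.
if __name__ == "__main__":
    corrector = FuzzyGreedyCorrector()
    for sequence in interactive_sequence_generator():
        print(corrector.correct(sequence))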
class LeftToRightCorrector:
    def __init__(self):
        self.unigrams = UnigramHolder()
        self.bigrams = BigramHolder.load()
        self.tokenizer = Tokenizer()
        self.postprocessor = RuleBasedPostprocessor()

    def try_merge(self, token: str, next_token: str) -> bool:
        # Merge two neighbouring tokens if their concatenation is more frequent
        # as a unigram than the pair is as a bigram.
        return self.unigrams.get(token + next_token) > self.bigrams.get(
            (token, next_token))

    def best_split(self, token: str) -> str:
        # Try every split position; prefer the split with the highest bigram
        # frequency, breaking ties by the unigram frequency of the rarer part.
        best = token
        best_frequency = self.unigrams.get(token)
        best_unigram_frequency = best_frequency
        for i in range(1, len(token)):
            left, right = token[:i], token[i:]
            frequency = self.bigrams.get((left, right))
            unigram_frequency = min(self.unigrams.get(left),
                                    self.unigrams.get(right))
            if frequency > best_frequency or (
                    frequency == best_frequency
                    and unigram_frequency > best_unigram_frequency):
                best = left + ' ' + right
                best_frequency = frequency
                best_unigram_frequency = unigram_frequency
        return best

    def correct(self, sequence: str) -> str:
        tokens = self.tokenizer.tokenize(sequence)
        texts = [token.text for token in tokens]
        predicted = ""
        t_i = 0
        while t_i < len(texts):
            if t_i > 0:
                predicted += ' '
            if t_i + 1 < len(texts) and self.try_merge(texts[t_i], texts[t_i + 1]):
                predicted += texts[t_i] + texts[t_i + 1]
                t_i += 2
            else:
                predicted += self.best_split(texts[t_i])
                t_i += 1
        predicted = self.postprocessor.correct(predicted)
        return predicted
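# Inspection sketch (an assumption, not part of the original listing): the two
# decision helpers can be called directly to see why the greedy pass merges or
# splits a token. The example strings are arbitrary and the printed results
# depend on the loaded frequency tables.
if __name__ == "__main__":
    corrector = LeftToRightCorrector()
    print(corrector.try_merge("to", "gether"))  # True if "together" outweighs the pair ("to", "gether")
    print(corrector.best_split("thehouse"))     # e.g. "the house" if that split's bigram count is higher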
class UnigramCorrector:
    def __init__(self, n: Optional[int]):
        self.tokenizer = Tokenizer()
        self.holder = UnigramHolder(n)
        self.bigrams = BigramHolder.load()

    def split_candidates(self, token: Token) -> List[CorrectionCandidate]:
        # Propose every two-way split of the token that occurs as a bigram.
        text = token.text
        candidates = []
        for i in range(1, len(text)):
            left = text[:i]
            right = text[i:]
            frequency = self.bigrams.get((left, right))
            if frequency > 0:
                tokens = [Token(left, token.delimiter_before), Token(right, True)]
                candidates.append(
                    CorrectionCandidate(frequency, tokens, False, False))
        return candidates

    def merge_candidates(self, token: Token, previous_token: Optional[Token],
                         next_token: Optional[Token]):
        # Propose merging the token with its left or right neighbour if the
        # merged text occurs as a unigram.
        candidates = []
        if previous_token is not None:
            merged = previous_token.text + token.text
            frequency = self.holder.get(merged)
            if frequency > 0:
                candidates.append(
                    CorrectionCandidate(
                        frequency,
                        [Token(merged, previous_token.delimiter_before)],
                        consume_previous=True,
                        consume_next=False))
        if next_token is not None:
            merged = token.text + next_token.text
            frequency = self.holder.get(merged)
            if frequency > 0:
                candidates.append(
                    CorrectionCandidate(
                        frequency,
                        [Token(merged, token.delimiter_before)],
                        consume_previous=False,
                        consume_next=True))
        return candidates

    def select_best_candidate(self, candidates: List[CorrectionCandidate]):
        best_score = -1
        best = None
        for candidate in candidates:
            if candidate.score > best_score:
                best_score = candidate.score
                best = candidate
        return best

    def repair_token(self, token: Token, next_token: Optional[Token],
                     predicted_tokens: List[Token]) -> int:
        # Returns the number of input tokens consumed (2 if the next token was
        # merged into the current one).
        candidates = [
            CorrectionCandidate(self.holder.get(token.text), [token], False,
                                False)  # keep the token unchanged
        ]
        candidates.extend(self.split_candidates(token))
        previous_token = predicted_tokens[-1] if len(
            predicted_tokens) > 0 else None
        candidates.extend(
            self.merge_candidates(token, previous_token, next_token))
        if len(candidates) > 0:
            best_candidate = self.select_best_candidate(candidates)
            if best_candidate.consume_previous:
                predicted_tokens.pop()
            predicted_tokens.extend(best_candidate.tokens)
            return 2 if best_candidate.consume_next else 1
        predicted_tokens.append(token)
        return 1

    def correct(self, sequence: str) -> str:
        tokens = self.tokenizer.tokenize(sequence)
        n_tokens = len(tokens)
        tokens.append(None)  # sentinel so tokens[t_i + 1] exists for the last token
        predicted_tokens = []
        t_i = 0
        while t_i < n_tokens:
            token = tokens[t_i]
            next_token = tokens[t_i + 1]
            t_i += self.repair_token(token, next_token, predicted_tokens)
        predicted = tokens2sequence(predicted_tokens)
        return predicted
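# Usage sketch (an assumption, not part of the original listing): correcting a
# single sequence with the unigram corrector. The constructor's n is forwarded
# to UnigramHolder; passing None is assumed here to load the full unigram
# table, and the example input is arbitrary.
if __name__ == "__main__":
    corrector = UnigramCorrector(n=None)
    print(corrector.correct("th e best seller"))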
import project
from src.ngram.tokenizer import Tokenizer
from src.interactive.sequence_generator import interactive_sequence_generator


if __name__ == "__main__":
    tokenizer = Tokenizer()
    for sequence in interactive_sequence_generator():
        tokens = tokenizer.tokenize(sequence)
        print(tokens)