def count_unigrams(n_sequences: int):
    """Count unigram token frequencies over Wikipedia training sequences.

    Tokens are tallied into two separate dicts depending on whether they are
    preceded by a delimiter. Progress is printed every K10 sequences and both
    dicts are checkpointed to disk every M sequences.
    """
    run_start = timestamp()
    tokenizer = Tokenizer()
    freq_with_delim = {}
    freq_without_delim = {}
    time_tokenizing = 0
    for seq_no, sequence in enumerate(Wikipedia.training_sequences(n_sequences)):
        t0 = timestamp()
        tokens = tokenizer.tokenize(sequence)
        # The first token of a sequence always counts as delimiter-preceded.
        tokens[0].delimiter_before = True
        time_tokenizing += time_diff(t0)
        for token in tokens:
            target = freq_with_delim if token.delimiter_before else freq_without_delim
            target[token.text] = target.get(token.text, 0) + 1
        done = seq_no + 1
        if done % K10 == 0:
            print("%ik sequences, %.2f s total time, %.2f s tokenization" %
                  (done / K, time_diff(run_start), time_tokenizing))
        if done % M == 0:
            print("saving...")
            dump_object(freq_with_delim, paths.UNIGRAM_DELIM_FREQUENCY_DICT)
            dump_object(freq_without_delim, paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT)
def _get_probabilities(self, beam: SpellingBeam):
    """Return the probability distribution from the beam's model state.

    The model step is performed lazily: only when the beam is flagged as
    needing an update, and the time spent is added to total_model_time.
    """
    if beam.needs_update:
        t0 = timestamp()
        beam.state = self.model.step(beam.state, beam.label, include_sequence=False)
        self.total_model_time += time_diff(t0)
        beam.needs_update = False
    return beam.state["probabilities"]
def _update_beams(self, beams: List[SpellingBeam]):
    """Advance the model state of every beam flagged for update, in one batch.

    Batching the step over all pending beams amortizes the model call; the
    elapsed time (including the no-op case) is added to total_model_time.
    """
    t0 = timestamp()
    pending = [i for i, beam in enumerate(beams) if beam.needs_update]
    if pending:
        new_states = self.model.step_batch(
            [beams[i].state for i in pending],
            [beams[i].label for i in pending],
        )
        for state, index in zip(new_states, pending):
            beams[index].state = state
            beams[index].needs_update = False
    self.total_model_time += time_diff(t0)
def main(args):
    """CLI entry point for the ACL noise inducer.

    Modes (driven by args):
      - print_insertion_prob: report the character-insertion probability
        derived from the inducer's error dictionary.
      - runtime: corrupt a fixed sample sentence 100 times and print the
        total runtime.
      - input_file: corrupt each line of the input file, echoing to stdout
        and optionally writing to args.output_file.
      - otherwise: interactive loop, printing 100 corruptions per prompt.
    """
    inducer = ACLNoiseInducer(args.p, 0.2079, args.seed)

    if args.print_insertion_prob:
        error_dict = inducer.error_dict
        total_count = 0
        insertion_count = 0
        for correct in error_dict:
            for _wrong, freq in error_dict[correct]:
                total_count += freq
                # Entries keyed by the empty string are pure insertions.
                if correct == "":
                    insertion_count += freq
        insertion_prob = insertion_count / total_count
        print(len([e for e, f in error_dict[""] if f >= 0]), "insertions")
        print(
            f"{insertion_prob * 100:.2f}% char insertions ({insertion_count}/{total_count})"
        )

    if args.runtime:
        sequence = "Tokenization Repair in the Presence of Spelling Errors"
        start_time = timestamp()
        corrupt_sequences = []
        for _ in range(100):
            corrupt_sequences.append(inducer.induce_noise(sequence))
        runtime = time_diff(start_time)
        for s in corrupt_sequences:
            print(s)
        print(runtime)
    elif args.input_file:
        # FIX: the output file was previously closed only when the loop
        # finished normally; an exception mid-loop leaked the handle and
        # could lose buffered output. try/finally guarantees the close.
        out_file = open(args.output_file, "w") if args.output_file else None
        try:
            for line in read_lines(args.input_file):
                corrupt = inducer.induce_noise(line)
                print(corrupt)
                if out_file is not None:
                    out_file.write(corrupt + "\n")
        finally:
            if out_file is not None:
                out_file.close()
    else:
        while True:
            sequence = input("> ")
            for _ in range(100):
                corrupt = inducer.induce_noise(sequence)
                print(corrupt)
# NOTE(review): this chunk starts mid-definition — `return predicted` below is
# the tail of a function whose header is outside this view.
    return predicted


if __name__ == "__main__":
    # Benchmark mode: with CLI args, read the benchmark's corrupt sequences
    # and record predictions plus per-sequence runtimes to wordsegment.txt.
    if len(sys.argv) > 1:
        benchmark_name = sys.argv[1]
        subset = SUBSETS[sys.argv[2]]
        benchmark = Benchmark(benchmark_name, subset)
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        writer = PredictionsFileWriter(benchmark.get_results_directory() + "wordsegment.txt")
    else:
        # Interactive mode: sequences come from the prompt; nothing is saved.
        sequences = interactive_sequence_generator()
        writer = None
    segmenter = WordSegment()
    for s_i, sequence in enumerate(sequences):
        start_time = timestamp()
        try:
            predicted = segmenter.correct(sequence)
        except RecursionError:
            # Presumably long inputs can exceed the recursion limit inside
            # the segmenter — fall back to the input unchanged. TODO confirm.
            predicted = sequence
        runtime = time_diff(start_time)
        print(predicted)
        if writer is not None:
            writer.add(predicted, runtime)
    if writer is not None:
        writer.save()
# Run the corrector over every noise-level benchmark (or interactively when a
# benchmark resolves to None), writing predictions and runtimes per benchmark.
benchmarks = [
    get_benchmark(noise_level, np.inf, subset)
    for noise_level in NOISE_LEVELS
]
for bm in benchmarks:
    interactive = bm is None
    if interactive:
        # No benchmark: read from the interactive prompt, persist nothing.
        inputs = interactive_sequence_generator()
        writer = None
    else:
        print(bm.name)
        inputs = bm.get_sequences(BenchmarkFiles.CORRUPT)
        writer = PredictionsFileWriter(
            bm.get_results_directory() + parameters["file_name"])
    for text in inputs:
        if parameters["verbose"]:
            print(text)
        t0 = timestamp()
        prediction = corrector.correct(text)
        elapsed = time_diff(t0)
        if interactive or parameters["verbose"]:
            print(prediction)
        else:
            writer.add(prediction, elapsed)
    if writer is not None:
        writer.save()