Example #1
def count_unigrams(n_sequences: int):
    total_start = timestamp()

    tokenizer = Tokenizer()
    counts_delim = {}
    counts_no_delim = {}

    tokenization_time = 0

    for s_i, sequence in enumerate(Wikipedia.training_sequences(n_sequences)):
        start = timestamp()
        tokens = tokenizer.tokenize(sequence)
        # the first token of a sequence counts as preceded by a delimiter
        tokens[0].delimiter_before = True
        tokenization_time += time_diff(start)
        for token in tokens:
            # count tokens separately depending on whether a delimiter precedes them
            counts = counts_delim if token.delimiter_before else counts_no_delim
            if token.text not in counts:
                counts[token.text] = 1
            else:
                counts[token.text] += 1
        if (s_i + 1) % (10 * K) == 0:  # progress report every 10 * K sequences
            print("%ik sequences, %.2f s total time, %.2f s tokenization" %
                  ((s_i + 1) // K, time_diff(total_start), tokenization_time))
        if (s_i + 1) % M == 0:
            print("saving...")
            dump_object(counts_delim, paths.UNIGRAM_DELIM_FREQUENCY_DICT)
            dump_object(counts_no_delim, paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT)
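
All examples time their work with the helpers timestamp() and time_diff(), whose implementation (like Tokenizer, Wikipedia, dump_object and the constants K and M) is project-internal and not shown. A minimal sketch consistent with how they are used, returning elapsed seconds, could look like this:

import time

def timestamp() -> float:
    # a monotonic clock reading in seconds; perf_counter is immune
    # to system clock adjustments
    return time.perf_counter()

def time_diff(start: float) -> float:
    # seconds elapsed since the given timestamp
    return time.perf_counter() - start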
Example #2
 def _get_probabilities(self, beam: SpellingBeam):
     if beam.needs_update:
         # advance the model by one step only if the cached state is stale
         start_time = timestamp()
         beam.state = self.model.step(beam.state, beam.label, include_sequence=False)
         self.total_model_time += time_diff(start_time)
         beam.needs_update = False
     probabilities = beam.state["probabilities"]
     return probabilities
Example #3
 def _update_beams(self, beams: List[SpellingBeam]):
     start_time = timestamp()
     # collect the beams whose model state is stale
     update_indices = [i for i in range(len(beams)) if beams[i].needs_update]
     if len(update_indices) > 0:
         states = [beams[i].state for i in update_indices]
         labels = [beams[i].label for i in update_indices]
         # advance all stale beams with a single batched model call
         states = self.model.step_batch(states, labels)
         for i, index in enumerate(update_indices):
             beams[index].state = states[i]
             beams[index].needs_update = False
     self.total_model_time += time_diff(start_time)
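
Examples #2 and #3 only touch beam.state, beam.label and beam.needs_update; the SpellingBeam class itself is not part of the excerpt. A hypothetical minimal version, inferred purely from those accesses, might be:

from dataclasses import dataclass
from typing import Any, Dict

@dataclass
class SpellingBeam:
    # hypothetical reconstruction; the real class likely carries more
    # fields, e.g. a score and the decoded sequence so far
    state: Dict[str, Any]      # model state, holds "probabilities" after a step
    label: int                 # last label to feed into the model
    needs_update: bool = True  # True until state reflects label

Batching all stale beams into one step_batch call (Example #3) pays the per-call model overhead once per search step rather than once per beam, which is why both a lazy single-beam path and a batched path exist.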
Example #4
def main(args):
    inducer = ACLNoiseInducer(args.p, 0.2079, args.seed)

    if args.print_insertion_prob:
        error_dict = inducer.error_dict
        total_count = 0
        insertion_count = 0
        for correct in error_dict:
            for wrong, freq in error_dict[correct]:
                total_count += freq
                if correct == "":
                    insertion_count += freq
        insertion_prob = insertion_count / total_count
        print(len([e for e, f in error_dict[""] if f >= 0]), "insertions")
        print(
            f"{insertion_prob * 100:.2f}% char insertions ({insertion_count}/{total_count})"
        )

    if args.runtime:
        sequence = "Tokenization Repair in the Presence of Spelling Errors"
        start_time = timestamp()
        corrupt_sequences = []
        for _ in range(100):
            corrupt_sequences.append(inducer.induce_noise(sequence))
        runtime = time_diff(start_time)
        for s in corrupt_sequences:
            print(s)
        print(runtime)
    elif args.input_file:
        out_file = open(args.output_file, "w") if args.output_file else None
        lines = read_lines(args.input_file)
        for line in lines:
            corrupt = inducer.induce_noise(line)
            print(corrupt)
            if out_file is not None:
                out_file.write(corrupt + "\n")
        if out_file is not None:
            out_file.close()
    else:
        while True:
            sequence = input("> ")
            for _ in range(100):
                corrupt = inducer.induce_noise(sequence)
                print(corrupt)
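
The loop above implies that error_dict maps each correct string to a list of (wrong, frequency) pairs, with the empty string "" keying pure character insertions. A made-up toy instance, only to illustrate the computation:

# hypothetical toy data in the implied format
error_dict = {
    "a": [("e", 12), ("", 3)],  # "a" typed as "e" 12 times, dropped 3 times
    "":  [("x", 5), ("q", 1)],  # 6 spurious character insertions
}
# total_count = 12 + 3 + 5 + 1 = 21 and insertion_count = 5 + 1 = 6,
# so the printed insertion probability would be 6 / 21 = 28.57%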


Example #5
if __name__ == "__main__":
    if len(sys.argv) > 1:
        benchmark_name = sys.argv[1]
        subset = SUBSETS[sys.argv[2]]
        benchmark = Benchmark(benchmark_name, subset)
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        writer = PredictionsFileWriter(benchmark.get_results_directory() +
                                       "wordsegment.txt")
    else:
        sequences = interactive_sequence_generator()
        writer = None

    segmenter = WordSegment()

    for s_i, sequence in enumerate(sequences):
        start_time = timestamp()
        try:
            predicted = segmenter.correct(sequence)
        except RecursionError:
            # fall back to the uncorrected input if segmentation recurses too deeply
            predicted = sequence
        runtime = time_diff(start_time)
        print(predicted)
        if writer is not None:
            writer.add(predicted, runtime)

    if writer is not None:
        writer.save()
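
PredictionsFileWriter is only used through add(predicted, runtime) and save(); its real implementation is not shown. A minimal stand-in consistent with that interface, assuming one tab-separated line per prediction, could be:

class PredictionsFileWriter:
    # hypothetical minimal writer; the real class may well use a
    # different file format
    def __init__(self, path: str):
        self.path = path
        self.lines = []

    def add(self, predicted: str, runtime: float):
        self.lines.append("%s\t%f" % (predicted, runtime))

    def save(self):
        with open(self.path, "w") as file:
            file.write("\n".join(self.lines) + "\n")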
Example #6
    benchmarks = [
        get_benchmark(noise_level, np.inf, subset)
        for noise_level in NOISE_LEVELS
    ]

    for benchmark in benchmarks:
        if benchmark is None:
            # no benchmark given: read input interactively, write no results file
            sequences = interactive_sequence_generator()
            file_writer = None
        else:
            print(benchmark.name)
            sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
            file_writer = PredictionsFileWriter(
                benchmark.get_results_directory() + parameters["file_name"])

        for sequence in sequences:
            if parameters["verbose"]:
                print(sequence)

            start = timestamp()
            predicted = corrector.correct(sequence)
            runtime = time_diff(start)

            if benchmark is None or parameters["verbose"]:
                print(predicted)
            else:
                file_writer.add(predicted, runtime)

        if file_writer is not None:
            file_writer.save()
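
Examples #5 and #6 fall back to interactive_sequence_generator() when no benchmark is selected. A plausible stand-in, matching the "> " prompt of Example #4, simply yields typed lines until end of input:

def interactive_sequence_generator():
    # hypothetical stand-in: yield one user-typed line per iteration,
    # stopping at EOF (Ctrl-D)
    while True:
        try:
            yield input("> ")
        except EOFError:
            break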