import numpy as np

import project
# Assumed import path for BenchmarkFiles; Benchmark and Subset live in src.benchmark.benchmark.
from src.benchmark.benchmark import Benchmark, BenchmarkFiles, Subset

if __name__ == "__main__":
    benchmarks = [
        "ACL", "arXiv.OCR", "arXiv.pdftotext", "Wiki.no_spaces", "Wiki.spaces",
        "Wiki.typos.no_spaces", "Wiki.typos.spaces"
    ]

    approaches = [
        "corrupt.txt", "bigrams.txt", "wordsegment.txt", "google_deduced.txt",
        "BS-fwd.txt", "BS-fwd-OCR.txt", "BS-bid.txt", "BS-bid-OCR.txt"
    ]

    out_dir = "/home/hertel/tokenization-repair-dumps/data/zero_one_sequences/"

    for benchmark_name in benchmarks:
        print(benchmark_name)
        benchmark = Benchmark(benchmark_name, Subset.TEST)
        corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)
        with open(out_dir + benchmark_name + ".txt", "w") as f:
            for approach in approaches:
                f.write(approach + "\n")
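                # Compare each prediction with the ground truth: 1 = exact match, 0 = mismatch.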
                predicted_sequences = (
                    corrupt_sequences if approach == "corrupt.txt"
                    else benchmark.get_predicted_sequences(approach)
                )
                zero_one_sequence = [
                    1 if predicted == correct else 0
                    for predicted, correct in zip(predicted_sequences, correct_sequences)
                ]
                f.write("".join(str(i) for i in zero_one_sequence) + "\n")
                print("", approach, np.mean(zero_one_sequence))

    # "0" disables the bidirectional labeling model.
    if parameters["labeling_model"] == "0":
        labeling_model = None
    else:
        labeling_model = BidirectionalLabelingEstimator()
        labeling_model.load(parameters["labeling_model"])

    benchmark_name = parameters["benchmark"]

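    # Benchmark name "0" switches to interactive mode: sequences come from user input.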
    if benchmark_name == "0":
        sequences = interactive_sequence_generator()
        file_writer = None
    else:
        benchmark = Benchmark(benchmark_name, get_subset(parameters["subset"]))
        if parameters["sequences"] == "corrupt":
            sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        else:
            sequences = benchmark.get_predicted_sequences(
                parameters["sequences"])
        file_writer = PredictionsFileWriter(benchmark.get_results_directory() +
                                            parameters["out_file"])
        if parameters["continue"]:
            file_writer.load()

    penalties = parameters["penalties"]
    if penalties == "0":
        insertion_penalty = deletion_penalty = 0
    else:
        # penalty_holder = PenaltyHolder(two_pass=parameters["sequences"] != "corrupt")
        penalty_holder = PenaltyHolder(seq_acc=True)  # TODO
        penalty_name = model_name
        if parameters["labeling_model"] != "0":
            penalty_name += "_" + parameters["labeling_model"]
        # penalty_name = penalty_name.replace("_acl", "")  # dirty hack for fine-tuned models which have no fitted penalties
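        # Assumed lookup call; verify against PenaltyHolder's actual interface.
        insertion_penalty, deletion_penalty = penalty_holder.get(
            penalty_name, benchmark_name)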
    if parameters["labeling"] != "0":
        from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator
        labeling_model = BidirectionalLabelingEstimator()
        labeling_model.load(parameters["labeling"])
    else:
        labeling_model = None

    insertion_cases = []
    deletion_cases = []

    benchmark = Benchmark(benchmark_name, Subset.TUNING)
    correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)

    # Two-pass mode: refine predictions of an earlier run instead of the raw corrupt text.
    two_pass = sequence_file != "corrupt"
    if two_pass:
        input_sequences = benchmark.get_predicted_sequences(sequence_file)
    else:
        input_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)

    n_sequences = 0
    for s_i, (correct, corrupt) in enumerate(zip(correct_sequences, input_sequences)):
        if s_i >= len(sequence_cases):
            break
        n_sequences += 1
        print(s_i)
        print(corrupt)

        cases = sequence_cases[s_i]
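        # For backward ("bwd") models the first case is skipped (presumably it has no prediction).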
        if model_name.startswith("bwd"):
            cases = cases[1:]
import sys

import project
from src.postprocessing.bigram import BigramPostprocessor
from src.benchmark.benchmark import Benchmark, Subset
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import time_diff, timestamp

if __name__ == "__main__":
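    # Usage: python <script> <benchmark_name> <input_file> <output_file>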
    benchmark_name = sys.argv[1]
    in_file_name = sys.argv[2]
    out_file_name = sys.argv[3]

    corrector = BigramPostprocessor()
    benchmark = Benchmark(benchmark_name, Subset.DEVELOPMENT)
    file_writer = PredictionsFileWriter(benchmark.get_results_directory() +
                                        out_file_name)

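    # Run the bigram postprocessor over each predicted sequence, timing each call.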
    for sequence in benchmark.get_predicted_sequences(in_file_name):
        start_time = timestamp()
        predicted = corrector.correct(sequence)
        runtime = time_diff(start_time)
        if predicted != sequence:
            print(predicted)
        file_writer.add(predicted, runtime)

    file_writer.save()
Example #5
    original_sequences = list(Wikipedia.test_sequences())
    for b in BENCHMARKS:
        print("**" + b + "**")
        benchmark = Benchmark(
            b, Subset.DEVELOPMENT if b == "nastase-big" else Subset.TEST)
        path = benchmark.get_results_directory()
        ground_truth_sequences = benchmark.get_sequences(
            BenchmarkFiles.CORRECT)
        with open(paths.DUMP_DIR + "zero_one_sequences/%s.txt" % b, "w") as f:
            for approach in APPROACHES:
                print(approach)
                if approach == "do nothing":
                    predicted_sequences = benchmark.get_sequences(
                        BenchmarkFiles.CORRUPT)
                else:
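                    # Use the first results file that exists for this approach.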
                    file = None
                    for ff in FILE_NAMES[approach]:
                        if os.path.exists(path + ff):
                            file = ff
                            break
                    predicted_sequences = benchmark.get_predicted_sequences(
                        file)
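                # zero_one_sequence marks 1 where prediction == ground truth (cf. the first snippet).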
                sequence = zero_one_sequence(
                    ground_truth_sequences, predicted_sequences,
                    original_sequences if b.startswith("0.1") else None)
                f.write(approach + "\n")
                f.write(''.join(str(x) for x in sequence) + "\n")
                print(np.mean(sequence))
        print()
    results_holder = ResultsHolder()

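    # Evaluate every (noise level, error probability) benchmark combination.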
    for noise_level in NOISE_LEVELS:
        for p in ERROR_PROBABILITIES:
            benchmark_name = get_benchmark_name(noise_level, p)
            benchmark_subset = Subset.TEST
            print(benchmark_name)
            benchmark = Benchmark(benchmark_name, benchmark_subset)
            sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)

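            # "corrupt.txt" is the do-nothing baseline: score the corrupt input itself.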
            if file_name == "corrupt.txt":
                predicted_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
                mean_runtime = 0
            else:
                try:
                    predicted_sequences = benchmark.get_predicted_sequences(file_name)[:len(sequence_pairs)]
                    mean_runtime = benchmark.get_mean_runtime(file_name)
                except FileNotFoundError:
                    predicted_sequences = []
                    mean_runtime = 0

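            # Evaluate only if predictions exist for every sequence pair.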
            if len(predicted_sequences) == len(sequence_pairs):
                evaluator = Evaluator()

                for i, (correct, corrupt) in enumerate(sequence_pairs):
                    predicted = predicted_sequences[i]
                    evaluator.evaluate(file_name=None,
                                       line=None,
                                       original_sequence=correct,
                                       corrupt_sequence=corrupt,
                                       predicted_sequence=predicted,