Example #1
from enchant.checker import SpellChecker

from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import time_diff, timestamp


class PyEnchantTokenizationCorrector:
    def __init__(self):
        self.checker = SpellChecker("en")

    def correct(self, sequence: str) -> str:
        self.checker.set_text(sequence)
        for error in self.checker:
            for suggestion in self.checker.suggest():
                if is_split(error.word, suggestion):
                    self.checker.replace(suggestion)
                    break
        return self.checker.get_text()


if __name__ == "__main__":
    import sys

    corrector = PyEnchantTokenizationCorrector()

    for benchmark_name in sys.argv[1:]:
        benchmark = Benchmark(benchmark_name, Subset.DEVELOPMENT)
        print(benchmark.name)
        file_writer = PredictionsFileWriter(benchmark.get_results_directory() +
                                            "enchant.txt")
        for s_i, sequence in enumerate(
                benchmark.get_sequences(BenchmarkFiles.CORRUPT)):
            print(s_i)
            start_time = timestamp()
            predicted = corrector.correct(sequence)
            runtime = time_diff(start_time)
            file_writer.add(predicted, runtime)
        file_writer.save()
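
The is_split helper used in Example #1 is imported from a module that is not shown here. A minimal sketch consistent with its use, assuming it accepts a suggestion only when it is the misspelled token with spaces inserted (a pure tokenization repair rather than a spelling change):

# Hypothetical sketch of is_split; not the project's actual implementation.
def is_split(word: str, suggestion: str) -> bool:
    # True iff the suggestion equals the error word once spaces are removed.
    return ' ' in suggestion and suggestion.replace(' ', '') == word
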
Example #2
import sys

import project
from src.postprocessing.bigram import BigramPostprocessor
from src.benchmark.benchmark import Benchmark, Subset
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import time_diff, timestamp

if __name__ == "__main__":
    benchmark_name = sys.argv[1]
    in_file_name = sys.argv[2]
    out_file_name = sys.argv[3]

    corrector = BigramPostprocessor()
    benchmark = Benchmark(benchmark_name, Subset.DEVELOPMENT)
    file_writer = PredictionsFileWriter(benchmark.get_results_directory() +
                                        out_file_name)

    for sequence in benchmark.get_predicted_sequences(in_file_name):
        start_time = timestamp()
        predicted = corrector.correct(sequence)
        runtime = time_diff(start_time)
        if predicted != sequence:
            print(predicted)
        file_writer.add(predicted, runtime)

    file_writer.save()
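
The timing helpers imported from src.helper.time are not shown in these examples. A minimal sketch consistent with how every script calls them, assuming timestamp() returns a start marker and time_diff() the seconds elapsed since it:

# Hypothetical sketch of src.helper.time; the real module is not shown.
import time

def timestamp() -> float:
    # Current wall-clock time in seconds.
    return time.time()

def time_diff(start_time: float) -> float:
    # Seconds elapsed since start_time.
    return time.time() - start_time
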
Example #3
class WordSegment:
    # __init__, which sets self.postprocessor, is cut off in this excerpt.
    def correct(self, sequence: str) -> str:
        sequence = ''.join(sequence.split())
        segmented = ' '.join(segment(sequence))
        #print(segmented)
        predicted = reinsert_punctuation(segmented, sequence)
        #print(predicted)
        predicted = self.postprocessor.correct(predicted)
        return predicted


if __name__ == "__main__":
    if len(sys.argv) > 1:
        benchmark_name = sys.argv[1]
        subset = SUBSETS[sys.argv[2]]
        benchmark = Benchmark(benchmark_name, subset)
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        writer = PredictionsFileWriter(benchmark.get_results_directory() +
                                       "wordsegment.txt")
    else:
        sequences = interactive_sequence_generator()
        writer = None

    segmenter = WordSegment()

    for s_i, sequence in enumerate(sequences):
        start_time = timestamp()
        try:
            predicted = segmenter.correct(sequence)
        except RecursionError:
            predicted = sequence
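
The segment function in Example #3 presumably comes from the PyPI wordsegment package; if so, its unigram and bigram data must be loaded once before segmenting:

# Assuming the PyPI "wordsegment" package: load() must run before segment().
from wordsegment import load, segment

load()
print(segment("thisisatest"))  # ['this', 'is', 'a', 'test']
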
Example #4
# Truncated import: NOISE_LEVELS, ERROR_PROBABILITIES and get_benchmark_name
# come from a module cut off in this excerpt.
from src.evaluation.results_holder import ResultsHolder, Metric


if __name__ == "__main__":
    file_name = sys.argv[1]
    approach_name = sys.argv[2]

    results_holder = ResultsHolder()

    for noise_level in NOISE_LEVELS:
        for p in ERROR_PROBABILITIES:
            benchmark_name = get_benchmark_name(noise_level, p)
            benchmark_subset = Subset.TEST
            print(benchmark_name)
            benchmark = Benchmark(benchmark_name, benchmark_subset)
            sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)

            if file_name == "corrupt.txt":
                predicted_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
                mean_runtime = 0
            else:
                try:
                    predicted_sequences = benchmark.get_predicted_sequences(file_name)[:len(sequence_pairs)]
                    mean_runtime = benchmark.get_mean_runtime(file_name)
                except FileNotFoundError:
                    predicted_sequences = []
                    mean_runtime = 0

            if len(predicted_sequences) == len(sequence_pairs):
                evaluator = Evaluator()
Example #5
    sequence_cases = load_object(cases_path)
    # sequence_cases: List[Case]

    labeling = parameters["labeling"] != "0"
    if labeling:
        from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator
        labeling_model = BidirectionalLabelingEstimator()
        labeling_model.load(parameters["labeling"])
    else:
        labeling_model = None

    insertion_cases = []
    deletion_cases = []

    benchmark = Benchmark(benchmark_name, Subset.TUNING)
    correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)

    two_pass = sequence_file != "corrupt"
    if two_pass:
        input_sequences = benchmark.get_predicted_sequences(sequence_file)
    else:
        input_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)

    n_sequences = 0
    for s_i, correct, corrupt in izip(correct_sequences, input_sequences):
        n_sequences += 1
        if s_i >= len(sequence_cases):
            break
        print(s_i)
        print(corrupt)
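
The izip helper from src.helper.data_structures is not shown. Its call sites here and in the evaluation scripts below unpack an index followed by one element per iterable, so a minimal sketch is an enumerating zip:

# Hypothetical sketch of izip: enumerate over zip, flattened to
# (index, item_1, ..., item_n) tuples, stopping at the shortest iterable.
def izip(*iterables):
    for i, items in enumerate(zip(*iterables)):
        yield (i, *items)
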
Example #6
    model = UnidirectionalModel(model_name)
    backward = model.model.specification.backward

    if parameters["labeling_model"] == "0":
        labeling_model = None
    else:
        labeling_model = BidirectionalLabelingEstimator()
        labeling_model.load(parameters["labeling_model"])

    benchmark_name = parameters["benchmark"]

    if benchmark_name == "0":
        sequences = interactive_sequence_generator()
        file_writer = None
    else:
        benchmark = Benchmark(benchmark_name, get_subset(parameters["subset"]))
        if parameters["sequences"] == "corrupt":
            sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        else:
            sequences = benchmark.get_predicted_sequences(
                parameters["sequences"])
        file_writer = PredictionsFileWriter(benchmark.get_results_directory() +
                                            parameters["out_file"])
        if parameters["continue"]:
            file_writer.load()

    penalties = parameters["penalties"]
    if penalties == "0":
        insertion_penalty = deletion_penalty = 0
    else:
        # penalty_holder = PenaltyHolder(two_pass=parameters["sequences"] != "corrupt")
Example #7
# Tail of a print call inside an evaluator class; cut off in this excerpt.
          (self.space_tp + self.char_tp, self.space_fp + self.char_fp,
               self.space_fn + self.char_fn))
        print(
            "(total)    (all)    %.4f F-score (%.4f precision, %.4f recall)" %
            (f1, precision, recall))
        print("(total)    %.4f sequence accuracy (%i / %i correct)" %
              (sequence_accuracy, self.correct_sequences, self.n_sequences))
        print()


if __name__ == "__main__":
    benchmark_name = sys.argv[1]
    file_name = sys.argv[2]
    n_sequences = int(sys.argv[3])

    benchmark = Benchmark(benchmark_name, Subset.DEVELOPMENT)
    correct_sequences = Wikipedia.development_sequences()
    corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
    predicted_sequences = benchmark.get_predicted_sequences(file_name)
    if n_sequences == -1:
        n_sequences = len(predicted_sequences) - 1

    evaluator = SpellingEvaluator()

    for s_i, correct, corrupt, predicted in izip(correct_sequences,
                                                 corrupt_sequences,
                                                 predicted_sequences):
        if s_i == n_sequences:
            break
        evaluator.evaluate_sequence(correct, corrupt, predicted)
Example #8
import sys

from project import src
from src.datasets.wikipedia import Wikipedia
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.sequence.pdf2text_token_corruptor import Pdf2textTokenCorruptor
from src.helper.files import open_file

if __name__ == "__main__":
    BENCHMARK_NAME = "pdf2text"

    development_sequences = Wikipedia.development_sequences()
    test_sequences = Wikipedia.test_sequences()

    development_benchmark = Benchmark(BENCHMARK_NAME, Subset.DEVELOPMENT)
    test_benchmark = Benchmark(BENCHMARK_NAME, Subset.TEST)

    corruptor = Pdf2textTokenCorruptor()

    for benchmark, true_sequences in ((development_benchmark,
                                       development_sequences),
                                      (test_benchmark, test_sequences)):
        correct_file = open_file(benchmark.get_file(BenchmarkFiles.CORRECT))
        corrupt_file = open_file(benchmark.get_file(BenchmarkFiles.CORRUPT))
        for true_sequence in true_sequences:
            corrupt_sequence = corruptor.corrupt(true_sequence)
            correct_file.write(true_sequence + '\n')
            corrupt_file.write(corrupt_sequence + '\n')
        correct_file.close()
        corrupt_file.close()
Example #9
# Tail of get_arguments(); the beginning of the function is cut off here.
        exit(1)
    subset = SUBSETS[subset]
    file_name = sys.argv[3]
    return benchmark, subset, file_name


if __name__ == "__main__":
    if len(sys.argv) == 1 or "-h" in sys.argv or "-help" in sys.argv or "help" in sys.argv:
        print_help()
        exit(0)

    benchmark, subset, file_name = get_arguments()

    benchmark = Benchmark(benchmark, subset)
    correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)
    corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
    if file_name == "corrupt.txt":
        predicted_sequences = corrupt_sequences
    else:
        predicted_sequences = benchmark.get_predicted_sequences(file_name)
    original_sequences = correct_sequences

    evaluator = Evaluator()
    for seq_id, (original, correct, corrupt, predicted) in \
            enumerate(zip(original_sequences, correct_sequences, corrupt_sequences, predicted_sequences)):

        if benchmark.name == "acl" and original.startswith("#"):
            print(original)
            continue
Example #10
from src.helper.time import time_diff, timestamp

if __name__ == "__main__":
    model = UnidirectionalLMEstimator()
    model.load("fwd1024")
    corrector = SpellingBeamSearchCorrector(
        model,
        n_beams=parameters["n_beams"],
        branching_factor=parameters["n_beams"],
        consecutive_insertions=2,
        char_penalty=parameters["char_penalty"],
        space_penalty=parameters["space_penalty"])

    benchmark_name = parameters["benchmark"]
    if benchmark_name != "0":
        benchmark = Benchmark(benchmark_name, SUBSETS[parameters["subset"]])
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        n_sequences = parameters["n_sequences"]
        file_writer = PredictionsFileWriter(benchmark.get_results_directory() +
                                            parameters["out_file"])
        segmentation_file_writer = PredictionsFileWriter(
            benchmark.get_results_directory() +
            parameters["segmentation_file"])
        if parameters["continue"]:
            file_writer.load()
            segmentation_file_writer.load()
        corrector.verbose = False
    else:
        sequences = interactive_sequence_generator()
        n_sequences = -1
        file_writer = None
Example #11
from src.evaluation.predictions_file_writer import PredictionsFileWriter


if __name__ == "__main__":
    if len(sys.argv) == 1:
        benchmark_names = [None]
    else:
        benchmark_names = sys.argv[1:]

    corrector = BigramDynamicCorrector()

    for benchmark_name in benchmark_names:
        if benchmark_name is None:
            sequences = interactive_sequence_generator()
            file_writer = None
        else:
            benchmark = Benchmark(benchmark_name, Subset.TEST)
            sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
            file_writer = PredictionsFileWriter(benchmark.get_results_directory() + "bigrams.txt")

        for sequence in sequences:
            start_time = timestamp()
            predicted = corrector.correct(sequence)
            runtime = time_diff(start_time)
            print(predicted)
            if file_writer is not None:
                file_writer.add(predicted, runtime)

        if file_writer is not None:
            file_writer.save()
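
PredictionsFileWriter appears in nearly every example, but its implementation is not shown. A minimal sketch covering the calls these scripts make (add, save, load, n_sequences); the one-prediction-per-line file format below is an assumption, not the project's actual format:

# Hypothetical sketch of PredictionsFileWriter; the real class in
# src/evaluation/predictions_file_writer.py is not shown.
import os

class PredictionsFileWriter:
    def __init__(self, path: str):
        self.path = path
        self.predictions = []  # (sequence, runtime) pairs

    def add(self, sequence: str, runtime: float):
        self.predictions.append((sequence, runtime))

    def n_sequences(self) -> int:
        return len(self.predictions)

    def load(self):
        # Resume an interrupted run by re-reading saved predictions.
        if not os.path.exists(self.path):
            return
        with open(self.path) as f:
            for line in f:
                runtime, sequence = line.rstrip("\n").split("\t", 1)
                self.predictions.append((sequence, float(runtime)))

    def save(self):
        with open(self.path, "w") as f:
            for sequence, runtime in self.predictions:
                f.write("%f\t%s\n" % (runtime, sequence))
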
Example #12
if __name__ == "__main__":
    model = load_default_char_lm(parameters["model"])
    space_index = model.get_encoder().encode_char(' ')
    window_size = 10

    benchmark_name = parameters["benchmark"]

    if benchmark_name == "0":
        sequences = interactive_sequence_generator()
        file_writer = None
        threshold_holder = ThresholdHolder(
            fitting_method=FittingMethod.SINGLE_RUN)
    else:
        subset = Subset.TEST if parameters["test"] else Subset.DEVELOPMENT
        if parameters["two_pass"] == "0":
            benchmark = Benchmark(benchmark_name, subset=subset)
            threshold_holder = ThresholdHolder(
                fitting_method=FittingMethod.SINGLE_RUN)
        else:
            benchmark = TwoPassBenchmark(benchmark_name,
                                         parameters["two_pass"], subset)
            threshold_holder = ThresholdHolder(
                fitting_method=FittingMethod.TWO_PASS)
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        file_writer = PredictionsFileWriter(
            benchmark.get_results_directory() +
            ("two_pass_" if parameters["two_pass"] != "0" else "") +
            parameters["model"] + ".txt")
        if parameters["continue"]:
            file_writer.load()
Example #13
from src.interactive.sequence_generator import interactive_sequence_generator
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import time_diff, timestamp

if __name__ == "__main__":
    n_tokens = parameters["n_tokens"]
    corrector = MaximumMatchingCorrector(n=n_tokens)

    benchmark_name = parameters["benchmark"]
    if benchmark_name == "0":
        sequences = interactive_sequence_generator()
        file_writer = None
    else:
        subset = Subset.TEST if parameters["test"] else Subset.DEVELOPMENT
        benchmark = Benchmark(benchmark_name, subset)
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        file = benchmark.get_results_directory() + "maximum_matching_%i.txt" % n_tokens
        file_writer = PredictionsFileWriter(file)

    for sequence in sequences:
        start_time = timestamp()
        predicted = corrector.correct(sequence)
        runtime = time_diff(start_time)
        print(predicted)
        print(runtime)
        if file_writer is not None:
            file_writer.add(predicted, runtime)

    if file_writer is not None:
        file_writer.save()
Example #14
if __name__ == "__main__":
    corrector = FuzzyGreedyCorrector()

    if len(sys.argv) > 1:
        benchmark_names = sys.argv[1:]

    else:
        benchmark_names = [None]

    for benchmark_name in benchmark_names:
        if benchmark_name is None:
            sequences = interactive_sequence_generator()
            file_writer = None
        else:
            benchmark = Benchmark(benchmark_name, Subset.DEVELOPMENT)
            sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
            file_writer = PredictionsFileWriter(
                benchmark.get_results_directory() + "fuzzy_greedy.txt")

        for sequence in sequences:
            start_time = timestamp()
            predicted = corrector.correct(sequence)
            runtime = time_diff(start_time)
            print(predicted)
            if file_writer is not None:
                file_writer.add(predicted, runtime)

        if file_writer is not None:
            file_writer.save()
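
interactive_sequence_generator is the stdin-driven fallback whenever no benchmark is named. The real function from src.interactive.sequence_generator is not shown; a minimal sketch:

# Hypothetical sketch: yield sequences typed by the user until an empty
# line is entered.
def interactive_sequence_generator():
    while True:
        sequence = input("> ")
        if sequence == "":
            break
        yield sequence
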
Example #15
from src.datasets.wikipedia import Wikipedia
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.noise.typo_noise_inducer import TypoNoiseInducer
from src.sequence.token_corruptor import TokenCorruptor
from src.settings import constants
from src.helper.files import open_file

if __name__ == "__main__":
    SEED = 3010

    p = parameters["corruption_probability"]

    # create empty benchmarks
    benchmark_name = parameters["name"]
    development_benchmark = Benchmark(benchmark_name, Subset.DEVELOPMENT)
    test_benchmark = Benchmark(benchmark_name, Subset.TEST)

    # read sequences
    development_sequences = Wikipedia.development_sequences()
    test_sequences = Wikipedia.test_sequences()

    # typo inserter
    p_noise = parameters["noise_probability"]
    typo_inducer = TypoNoiseInducer(p_noise, SEED) if p_noise > 0 else None

    # token corruptor
    corruptor = TokenCorruptor(
        p=p,
        positions_per_token=constants.POSITIONS_PER_TOKEN,
        token_pairs_per_token=constants.TOKEN_PAIRS_PER_TOKEN,
Example #16
    benchmark_name = parameters["benchmark"]
    n_sequences = parameters["n_sequences"]

    LOOKAHEAD = 2

    model = UnidirectionalLMEstimator()
    model.load(model_name)

    space_label = model.encoder.encode_char(' ')

    #if parameters["continue"]:
    #    cases = load_object(path)
    #else:
    cases = []

    benchmark = Benchmark(benchmark_name, Subset.TUNING)
    sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)[:n_sequences]

    for s_i, sequence in enumerate(sequences):
        if s_i < len(cases):
            continue

        print("sequence %i" % s_i)
        print(sequence)
        cases.append([])

        encoded = model.encoder.encode_sequence(sequence)
        if model.specification.backward:
            sequence = sequence[::-1]
            encoded = encoded[::-1]
    print("p_ins:", p_ins)
    print("p_del:", p_del)
    print("benchmark:", benchmark)
    print("test:", test)
    print("out file:", out_file)

    corrector = get_corrector(approach, penalties, p_ins, p_del)
    print("P_ins = %.2f" % -corrector.insertion_penalty)
    print("P_del = %.2f" % -corrector.deletion_penalty)

    if benchmark is None:
        sequences = interactive_sequence_generator()
        file_writer = None
    else:
        subset = Subset.TEST if test else Subset.DEVELOPMENT
        benchmark = Benchmark(benchmark, subset)
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        if out_file is not None:
            file_writer = PredictionsFileWriter(
                benchmark.get_results_directory() + out_file)

    for sequence in sequences:
        if sequence.startswith("#"):
            if out_file is not None:
                file_writer.add(sequence, 0)
            continue
        start_time = timestamp()
        predicted = corrector.correct(sequence)
        runtime = time_diff(start_time)
        print(predicted)
        if out_file is not None:
Example #18
# Tail of a call whose beginning is cut off in this excerpt.
            model_name=model_name,
            noise_type=parameters["noise_type"]
        )

    # make greedy corrector:
    corrector = GreedyCorrector(model,
                                insertion_threshold=insertion_threshold,
                                deletion_threshold=deletion_threshold)

    # load benchmark sequences or interactive:
    benchmark_name = parameters["benchmark"]
    if benchmark_name == "0":
        benchmark = None
        sequences = interactive_sequence_generator()
    else:
        benchmark = Benchmark(benchmark_name,
                              subset=Subset.TEST if benchmark_name == "project" else Subset.DEVELOPMENT)
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)

    # output file:
    if benchmark is None:
        file_writer = None
    else:
        file_writer = PredictionsFileWriter(benchmark.get_results_directory() + parameters["out_file"])
        if not parameters["initialize"]:
            file_writer.load()

    # iterate over sequences:
    total_runtime = 0
    n_sequences = 0
    for s_i, sequence in enumerate(sequences):
        if file_writer is not None and s_i < file_writer.n_sequences():
Example #19

def zero_one_sequence(a, b, originals=None):
    res = []
    for i, (aa, bb) in enumerate(zip(a, b)):
        if originals is not None:
            aa, _, bb = tolerant_preprocess_sequences(originals[i], aa, aa, bb)
        res.append(1 if aa == bb else 0)
    return res


if __name__ == "__main__":
    original_sequences = list(Wikipedia.test_sequences())
    for b in BENCHMARKS:
        print("**" + b + "**")
        benchmark = Benchmark(
            b, Subset.DEVELOPMENT if b == "nastase-big" else Subset.TEST)
        path = benchmark.get_results_directory()
        ground_truth_sequences = benchmark.get_sequences(
            BenchmarkFiles.CORRECT)
        with open(paths.DUMP_DIR + "zero_one_sequences/%s.txt" % b, "w") as f:
            for approach in APPROACHES:
                print(approach)
                if approach == "do nothing":
                    predicted_sequences = benchmark.get_sequences(
                        BenchmarkFiles.CORRUPT)
                else:
                    file = None
                    for ff in FILE_NAMES[approach]:
                        if os.path.exists(benchmark.get_results_directory() +
                                          ff):
                            file = ff
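
As a quick illustration (not from the source) of the zero_one_sequence helper defined at the top of Example #19: it emits 1 where prediction and ground truth agree and 0 where they differ.

# Illustration only: per-sequence exact-match comparison.
assert zero_one_sequence(["a b c", "d e"], ["a b c", "d  e"]) == [1, 0]
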
              "str",
              help_message="Name of the file containing predicted sequences.")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

from src.benchmark.benchmark import Benchmark, BenchmarkFiles, SUBSETS
from src.evaluation.evaluator import Evaluator
from src.helper.data_structures import izip
from src.evaluation.print_methods import print_evaluator

if __name__ == "__main__":
    benchmark_name = parameters["benchmark"]
    benchmark_subset = SUBSETS[parameters["set"]]
    benchmark = Benchmark(benchmark_name, subset=benchmark_subset)

    sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)
    if parameters["file"] == "corrupt.txt":
        predicted_sequences = [corrupt for _, corrupt in sequence_pairs]
    else:
        predicted_sequences = benchmark.get_predicted_sequences(
            parameters["file"])

    evaluator = Evaluator()

    for s_i, (correct, corrupt), predicted in izip(sequence_pairs,
                                                   predicted_sequences):
        if s_i == parameters["sequences"]:
            break
        evaluator.evaluate(benchmark,
Example #21
from project import src
from src.evaluation.samples import get_space_corruptions
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.plot.histogram import plot_rate_histogram, plot_histogram, save_histogram_data

if __name__ == "__main__":
    exclude_zero = "no-zero" in sys.argv

    out_folder = "acl_error_distribution/"

    absolute_values = []
    error_rates = []

    for subset in (Subset.DEVELOPMENT, Subset.TEST):
        benchmark = Benchmark("ACL", subset)
        for correct, corrupt in benchmark.get_sequence_pairs(
                BenchmarkFiles.CORRUPT):
            print(corrupt)
            print(correct)
            edits = get_space_corruptions(correct, corrupt)
            n_edits = len(edits)
            n_chars = len(correct)
            ratio = n_edits / n_chars
            absolute_values.append(n_edits)
            error_rates.append(ratio)

    save_histogram_data(error_rates,
                        out_folder + "tokenization_character_error_rates.txt")
    plot_rate_histogram(
        error_rates,
Example #22
import numpy as np

from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles

if __name__ == "__main__":
    benchmarks = [
        "ACL", "arXiv.OCR", "arXiv.pdftotext", "Wiki.no_spaces", "Wiki.spaces",
        "Wiki.typos.no_spaces", "Wiki.typos.spaces"
    ]

    approaches = [
        "corrupt.txt", "bigrams.txt", "wordsegment.txt", "google_deduced.txt",
        "BS-fwd.txt", "BS-fwd-OCR.txt", "BS-bid.txt", "BS-bid-OCR.txt"
    ]

    out_dir = "/home/hertel/tokenization-repair-dumps/data/zero_one_sequences/"

    for benchmark in benchmarks:
        print(benchmark)
        benchmark = Benchmark(benchmark, Subset.TEST)
        corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)
        with open(out_dir + benchmark.name + ".txt", "w") as f:
            for approach in approaches:
                f.write(approach + "\n")
                predicted_sequences = corrupt_sequences if approach == "corrupt.txt" \
                    else benchmark.get_predicted_sequences(approach)
                zero_one_sequence = [
                    1 if predicted == correct else 0 for predicted, correct in
                    zip(predicted_sequences, correct_sequences)
                ]
                f.write("".join([str(i) for i in zero_one_sequence]) + "\n")
                print("", approach, np.mean(zero_one_sequence))
Example #23
        if parameters["insert"]:
            if parameters["model_type"] == "combined":
                THRESHOLD = 0.99
            elif "softmax" in parameters["model_name"]:
                THRESHOLD = 0.95
            else:
                THRESHOLD = 0.1
        else:
            THRESHOLD = 0.01
    else:
        THRESHOLD = parameters["threshold"]

    corrupt_file = (BenchmarkFiles.DELETIONS if parameters["insert"]
                    else BenchmarkFiles.INSERTIONS)

    benchmark = Benchmark(parameters["benchmark"], Subset.DEVELOPMENT)

    corrector = GreedyCorrector(model,
                                insert=parameters["insert"],
                                delete=not parameters["insert"],
                                insertion_threshold=THRESHOLD,
                                deletion_threshold=THRESHOLD)

    fitter_holder = ThresholdFitterHolder(
        model_name=parameters["model_name"]
        if "model_name" in parameters else None,
        fwd_model_name=parameters["fwd_model_name"]
        if "fwd_model_name" in parameters else None,
        bwd_model_name=parameters["bwd_model_name"]
        if "bwd_model_name" in parameters else None,
        benchmark_name=parameters["benchmark"],
Example #24
    all_deletion_intervals = []

    for benchmark in BENCHMARKS:
        cases_path = paths.CASES_FILE_NOISY if benchmark.startswith("0.1") else paths.CASES_FILE_CLEAN
        cases_path = cases_path % (model_name, "wikipedia" if benchmark.startswith("0") else benchmark)

        sequence_cases = load_object(cases_path)

        print(len(sequence_cases))

        if labeling_model_name != "0":
            from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator
            labeling_model = BidirectionalLabelingEstimator()
            labeling_model.load(labeling_model_name)

        benchmark = Benchmark(benchmark, Subset.TUNING)
        case_db = []

        correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)
        corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)

        for s_i, (correct, corrupt) in enumerate(zip(correct_sequences, corrupt_sequences)):
            if s_i == n:
                break

            print(benchmark.name, s_i)
            cases = sequence_cases[s_i]
            case_db.append([])

            labeling_space_probs = labeling_model.predict(correct.replace(' ', ''))["probabilities"] if labeling \
                else None
Example #25
    for value in f1_list + acc_list:
        row += "&   %.2f " % (value * 100)
    return row


if __name__ == "__main__":
    benchmark_name = sys.argv[1]
    subset = SUBSETS[sys.argv[2]]
    predictions_file_name = sys.argv[3]
    n_sequences = int(sys.argv[4]) if len(sys.argv) > 4 else -1

    eval_all = benchmark_name == "all"
    if eval_all:
        benchmarks = all_benchmarks(subset)
    else:
        benchmarks = [Benchmark(benchmark_name, subset)]

    f1_list = []
    acc_list = []

    for benchmark in benchmarks:
        original_sequences = {
            Subset.TUNING: read_sequences(paths.WIKI_TUNING_SENTENCES),
            Subset.DEVELOPMENT: Wikipedia.development_sequences(),
            Subset.TEST: Wikipedia.test_sequences()
        }[subset]

        sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)
        if predictions_file_name == "corrupt.txt":
            predicted_sequences = [corrupt for _, corrupt in sequence_pairs]
        else: