# NOTE(review): whitespace-mangled fragment of a class body plus a script main.
# The enclosing class header (presumably PyEnchantTokenizationCorrector) and the
# "def __init__" line are outside this view; indentation below is reconstructed.
        # Tail of __init__: a PyEnchant-style SpellChecker for English.
        self.checker = SpellChecker("en")

    def correct(self, sequence: str) -> str:
        """Repair tokenization errors in *sequence* using spell-checker suggestions.

        For every misspelled word, accept the first suggestion that is a
        "split" of the word (per the is_split helper) and replace it.
        Returns the corrected text.
        """
        self.checker.set_text(sequence)
        for error in self.checker:
            for suggestion in self.checker.suggest():
                if is_split(error.word, suggestion):
                    self.checker.replace(suggestion)
                    break
        return self.checker.get_text()

if __name__ == "__main__":
    import sys
    # Run the corrector on each benchmark named on the command line and
    # write predictions (with per-sequence runtimes) to enchant.txt.
    corrector = PyEnchantTokenizationCorrector()
    for benchmark_name in sys.argv[1:]:
        benchmark = Benchmark(benchmark_name, Subset.DEVELOPMENT)
        print(benchmark.name)
        file_writer = PredictionsFileWriter(benchmark.get_results_directory() + "enchant.txt")
        for s_i, sequence in enumerate(
                benchmark.get_sequences(BenchmarkFiles.CORRUPT)):
            print(s_i)
            start_time = timestamp()
            predicted = corrector.correct(sequence)
            runtime = time_diff(start_time)
            file_writer.add(predicted, runtime)
        file_writer.save()
import sys
import project
from src.postprocessing.bigram import BigramPostprocessor
from src.benchmark.benchmark import Benchmark, Subset
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import time_diff, timestamp

if __name__ == "__main__":
    # Run the bigram postprocessor over an existing prediction file of a
    # development benchmark and store the postprocessed predictions.
    # Usage: <benchmark name> <input predictions file> <output file>
    benchmark_name = sys.argv[1]
    in_file_name = sys.argv[2]
    out_file_name = sys.argv[3]

    postprocessor = BigramPostprocessor()
    benchmark = Benchmark(benchmark_name, Subset.DEVELOPMENT)
    writer = PredictionsFileWriter(benchmark.get_results_directory() + out_file_name)

    for original in benchmark.get_predicted_sequences(in_file_name):
        t0 = timestamp()
        corrected = postprocessor.correct(original)
        elapsed = time_diff(t0)
        # Only echo sequences that the postprocessor actually changed.
        if corrected != original:
            print(corrected)
        writer.add(corrected, elapsed)
    writer.save()
# NOTE(review): fragment — a method of an unseen class (it references
# self.postprocessor) followed by a script main; the script is cut off after
# the except-clause, so later lines (e.g. writing predictions) are not visible.
    def correct(self, sequence: str) -> str:
        """Re-segment *sequence*: strip all spaces, run word segmentation,
        re-insert punctuation, then apply the postprocessor."""
        sequence = ''.join(sequence.split())
        segmented = ' '.join(segment(sequence))
        #print(segmented)
        predicted = reinsert_punctuation(segmented, sequence)
        #print(predicted)
        predicted = self.postprocessor.correct(predicted)
        return predicted

if __name__ == "__main__":
    # With arguments: run on a benchmark's corrupt sequences and prepare a
    # predictions writer. Without arguments: interactive mode, no writer.
    if len(sys.argv) > 1:
        benchmark_name = sys.argv[1]
        subset = SUBSETS[sys.argv[2]]
        benchmark = Benchmark(benchmark_name, subset)
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        writer = PredictionsFileWriter(benchmark.get_results_directory() + "wordsegment.txt")
    else:
        sequences = interactive_sequence_generator()
        writer = None
    segmenter = WordSegment()
    for s_i, sequence in enumerate(sequences):
        start_time = timestamp()
        try:
            predicted = segmenter.correct(sequence)
        except RecursionError:
            # Deep recursion in the segmenter: fall back to the input unchanged.
            predicted = sequence
# NOTE(review): fragment — the first token is the tail of a cut import line
# (presumably "from ... import NOISE_LEVELS, ERROR_PROBABILITIES"); the script
# is also truncated right after creating the Evaluator.
ERROR_PROBABILITIES
from src.evaluation.results_holder import ResultsHolder, Metric

if __name__ == "__main__":
    # Evaluate a predictions file across the noise-level / error-probability
    # grid of test benchmarks.
    file_name = sys.argv[1]
    approach_name = sys.argv[2]
    results_holder = ResultsHolder()
    for noise_level in NOISE_LEVELS:
        for p in ERROR_PROBABILITIES:
            benchmark_name = get_benchmark_name(noise_level, p)
            benchmark_subset = Subset.TEST
            print(benchmark_name)
            benchmark = Benchmark(benchmark_name, benchmark_subset)
            sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)
            if file_name == "corrupt.txt":
                # Baseline: the corrupt sequences themselves are the "prediction".
                predicted_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
                mean_runtime = 0
            else:
                try:
                    predicted_sequences = benchmark.get_predicted_sequences(file_name)[:len(sequence_pairs)]
                    mean_runtime = benchmark.get_mean_runtime(file_name)
                except FileNotFoundError:
                    # Missing predictions for this benchmark: skip via empty list.
                    predicted_sequences = []
                    mean_runtime = 0
            # Only evaluate when predictions cover the whole benchmark.
            if len(predicted_sequences) == len(sequence_pairs):
                evaluator = Evaluator()
sequence_cases = load_object(cases_path) # sequence_cases: List[Case] labeling = parameters["labeling"] != "0" if labeling: from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator labeling_model = BidirectionalLabelingEstimator() labeling_model.load(parameters["labeling"]) else: labeling_model = None insertion_cases = [] deletion_cases = [] benchmark = Benchmark(benchmark_name, Subset.TUNING) correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT) two_pass = sequence_file != "corrupt" if two_pass: input_sequences = benchmark.get_predicted_sequences(sequence_file) else: input_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT) n_sequences = 0 for s_i, correct, corrupt in izip(correct_sequences, input_sequences): n_sequences += 1 if s_i >= len(sequence_cases): break print(s_i) print(corrupt)
# NOTE(review): mid-script fragment — model_name and parameters come from
# above this view; the final else-branch is cut off after a commented line.
model = UnidirectionalModel(model_name)
backward = model.model.specification.backward
# "0" means: run without a labeling model.
if parameters["labeling_model"] == "0":
    labeling_model = None
else:
    labeling_model = BidirectionalLabelingEstimator()
    labeling_model.load(parameters["labeling_model"])
benchmark_name = parameters["benchmark"]
if benchmark_name == "0":
    # Interactive mode: read sequences from the user, write no file.
    sequences = interactive_sequence_generator()
    file_writer = None
else:
    benchmark = Benchmark(benchmark_name, get_subset(parameters["subset"]))
    if parameters["sequences"] == "corrupt":
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
    else:
        # Second pass: correct the predictions of a previous approach.
        sequences = benchmark.get_predicted_sequences(
            parameters["sequences"])
    file_writer = PredictionsFileWriter(benchmark.get_results_directory() + parameters["out_file"])
    if parameters["continue"]:
        # Resume: reload previously written predictions.
        file_writer.load()
penalties = parameters["penalties"]
if penalties == "0":
    insertion_penalty = deletion_penalty = 0
else:
    # penalty_holder = PenaltyHolder(two_pass=parameters["sequences"] != "corrupt")
# NOTE(review): fragment — begins inside a print(...) call of an evaluator
# reporting method whose header is outside this view; indentation reconstructed.
              (self.space_tp + self.char_tp,
               self.space_fp + self.char_fp,
               self.space_fn + self.char_fn))
        print(
            "(total) (all) %.4f F-score (%.4f precision, %.4f recall)" %
            (f1, precision, recall))
        print("(total) %.4f sequence accuracy (%i / %i correct)" %
              (sequence_accuracy, self.correct_sequences, self.n_sequences))
        print()

if __name__ == "__main__":
    # Evaluate a predictions file against Wikipedia development ground truth.
    benchmark_name = sys.argv[1]
    file_name = sys.argv[2]
    n_sequences = int(sys.argv[3])
    benchmark = Benchmark(benchmark_name, Subset.DEVELOPMENT)
    correct_sequences = Wikipedia.development_sequences()
    corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
    predicted_sequences = benchmark.get_predicted_sequences(file_name)
    # -1 means: evaluate all available predictions (minus the trailing one).
    n_sequences = (len(predicted_sequences) - 1) if n_sequences == -1 else n_sequences
    evaluator = SpellingEvaluator()
    # izip presumably yields (index, correct, corrupt, predicted) — TODO confirm.
    for s_i, correct, corrupt, predicted in izip(correct_sequences,
                                                 corrupt_sequences,
                                                 predicted_sequences):
        if s_i == n_sequences:
            break
        evaluator.evaluate_sequence(correct, corrupt, predicted)
import sys
from project import src
from src.datasets.wikipedia import Wikipedia
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.sequence.pdf2text_token_corruptor import Pdf2textTokenCorruptor
from src.helper.files import open_file

if __name__ == "__main__":
    # Build the "pdf2text" benchmark: corrupt Wikipedia sequences with
    # pdf2text-style token errors and write correct/corrupt file pairs
    # for both the development and the test subset.
    BENCHMARK_NAME = "pdf2text"

    dev_sequences = Wikipedia.development_sequences()
    tst_sequences = Wikipedia.test_sequences()
    dev_benchmark = Benchmark(BENCHMARK_NAME, Subset.DEVELOPMENT)
    tst_benchmark = Benchmark(BENCHMARK_NAME, Subset.TEST)
    corruptor = Pdf2textTokenCorruptor()

    for bench, originals in ((dev_benchmark, dev_sequences),
                             (tst_benchmark, tst_sequences)):
        out_correct = open_file(bench.get_file(BenchmarkFiles.CORRECT))
        out_corrupt = open_file(bench.get_file(BenchmarkFiles.CORRUPT))
        for original in originals:
            damaged = corruptor.corrupt(original)
            out_correct.write(original + '\n')
            out_corrupt.write(damaged + '\n')
        out_correct.close()
        out_corrupt.close()
# NOTE(review): fragment — starts inside get_arguments() (its header, earlier
# parsing, and the condition guarding exit(1) are outside this view); the main
# loop body is cut off after the "acl" comment-line skip. Indentation of the
# first statements is a best-effort reconstruction.
    exit(1)
    subset = SUBSETS[subset]
    file_name = sys.argv[3]
    return benchmark, subset, file_name

if __name__ == "__main__":
    # Print usage and stop when called without arguments or with a help flag.
    if len(
            sys.argv
    ) == 1 or "-h" in sys.argv or "-help" in sys.argv or "help" in sys.argv:
        print_help()
        exit(0)
    benchmark, subset, file_name = get_arguments()
    benchmark = Benchmark(benchmark, subset)
    correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)
    corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
    if file_name == "corrupt.txt":
        # Baseline: evaluate the corrupt sequences themselves.
        predicted_sequences = corrupt_sequences
    else:
        predicted_sequences = benchmark.get_predicted_sequences(file_name)
    original_sequences = correct_sequences
    evaluator = Evaluator()
    for seq_id, (original, correct, corrupt, predicted) in \
            enumerate(zip(original_sequences, correct_sequences,
                          corrupt_sequences, predicted_sequences)):
        # ACL benchmark: lines starting with "#" are headers, not text to score.
        if benchmark.name == "acl" and original.startswith("#"):
            print(original)
            continue
# NOTE(review): fragment — earlier imports and the parameters dict come from
# above this view; the script is cut off inside the interactive else-branch.
from src.helper.time import time_diff, timestamp

if __name__ == "__main__":
    # Set up a beam-search spelling corrector over the fwd1024 language model.
    model = UnidirectionalLMEstimator()
    model.load("fwd1024")
    corrector = SpellingBeamSearchCorrector(
        model,
        n_beams=parameters["n_beams"],
        branching_factor=parameters["n_beams"],
        consecutive_insertions=2,
        char_penalty=parameters["char_penalty"],
        space_penalty=parameters["space_penalty"])
    benchmark_name = parameters["benchmark"]
    # "0" means interactive mode instead of a benchmark run.
    if benchmark_name != "0":
        benchmark = Benchmark(benchmark_name, SUBSETS[parameters["subset"]])
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        n_sequences = parameters["n_sequences"]
        file_writer = PredictionsFileWriter(benchmark.get_results_directory() + parameters["out_file"])
        segmentation_file_writer = PredictionsFileWriter(
            benchmark.get_results_directory() + parameters["segmentation_file"])
        if parameters["continue"]:
            # Resume: reload both output files to skip finished sequences.
            file_writer.load()
            segmentation_file_writer.load()
        corrector.verbose = False
    else:
        sequences = interactive_sequence_generator()
        n_sequences = -1
        file_writer = None
from src.evaluation.predictions_file_writer import PredictionsFileWriter

if __name__ == "__main__":
    # Run the bigram dynamic corrector on every benchmark named on the
    # command line; with no arguments, start an interactive session.
    names = sys.argv[1:] if len(sys.argv) > 1 else [None]
    corrector = BigramDynamicCorrector()
    for name in names:
        if name is None:
            sequences = interactive_sequence_generator()
            writer = None
        else:
            bench = Benchmark(name, Subset.TEST)
            sequences = bench.get_sequences(BenchmarkFiles.CORRUPT)
            writer = PredictionsFileWriter(bench.get_results_directory() + "bigrams.txt")
        for sequence in sequences:
            t0 = timestamp()
            predicted = corrector.correct(sequence)
            elapsed = time_diff(t0)
            print(predicted)
            if writer is not None:
                writer.add(predicted, elapsed)
        if writer is not None:
            writer.save()
# NOTE(review): fragment — parameters and the imported names come from above
# this view; the script continues past file_writer.load() but is cut off here.
if __name__ == "__main__":
    model = load_default_char_lm(parameters["model"])
    space_index = model.get_encoder().encode_char(' ')
    window_size = 10
    benchmark_name = parameters["benchmark"]
    if benchmark_name == "0":
        # Interactive mode: no benchmark, no output file.
        sequences = interactive_sequence_generator()
        file_writer = None
        threshold_holder = ThresholdHolder(
            fitting_method=FittingMethod.SINGLE_RUN)
    else:
        subset = Subset.TEST if parameters["test"] else Subset.DEVELOPMENT
        # "0" means single-pass; otherwise the value names the first-pass approach.
        if parameters["two_pass"] == "0":
            benchmark = Benchmark(benchmark_name, subset=subset)
            threshold_holder = ThresholdHolder(
                fitting_method=FittingMethod.SINGLE_RUN)
        else:
            benchmark = TwoPassBenchmark(benchmark_name, parameters["two_pass"], subset)
            threshold_holder = ThresholdHolder(
                fitting_method=FittingMethod.TWO_PASS)
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        # Output name encodes the mode: "two_pass_<model>.txt" vs "<model>.txt".
        file_writer = PredictionsFileWriter(
            benchmark.get_results_directory()
            + ("two_pass_" if parameters["two_pass"] != "0" else "")
            + parameters["model"] + ".txt")
        if parameters["continue"]:
            file_writer.load()
# NOTE(review): fragment — cut off at the final dangling "if file_writer is not
# None:" (presumably followed by file_writer.save() outside this view).
from src.interactive.sequence_generator import interactive_sequence_generator
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import time_diff, timestamp

if __name__ == "__main__":
    # Run the maximum-matching corrector, either on a benchmark or interactively.
    n_tokens = parameters["n_tokens"]
    corrector = MaximumMatchingCorrector(n=n_tokens)
    benchmark_name = parameters["benchmark"]
    if benchmark_name == "0":
        sequences = interactive_sequence_generator()
        file_writer = None
    else:
        subset = Subset.TEST if parameters["test"] else Subset.DEVELOPMENT
        benchmark = Benchmark(benchmark_name, subset)
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        # Output name encodes the token-dictionary size.
        file = benchmark.get_results_directory(
        ) + "maximum_matching_%i.txt" % n_tokens
        file_writer = PredictionsFileWriter(file)
    for sequence in sequences:
        start_time = timestamp()
        predicted = corrector.correct(sequence)
        runtime = time_diff(start_time)
        print(predicted)
        print(runtime)
        if file_writer is not None:
            file_writer.add(predicted, runtime)
    if file_writer is not None:
if __name__ == "__main__":
    # Run the fuzzy greedy corrector on the given development benchmarks,
    # or interactively when no benchmark names are passed.
    corrector = FuzzyGreedyCorrector()
    names = sys.argv[1:] or [None]
    for name in names:
        if name is None:
            sequences = interactive_sequence_generator()
            writer = None
        else:
            bench = Benchmark(name, Subset.DEVELOPMENT)
            sequences = bench.get_sequences(BenchmarkFiles.CORRUPT)
            writer = PredictionsFileWriter(
                bench.get_results_directory() + "fuzzy_greedy.txt")
        for sequence in sequences:
            t0 = timestamp()
            predicted = corrector.correct(sequence)
            elapsed = time_diff(t0)
            print(predicted)
            if writer is not None:
                writer.add(predicted, elapsed)
        if writer is not None:
            writer.save()
# NOTE(review): fragment — cut off inside the TokenCorruptor(...) constructor
# call; remaining keyword arguments are outside this view.
from src.datasets.wikipedia import Wikipedia
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.noise.typo_noise_inducer import TypoNoiseInducer
from src.sequence.token_corruptor import TokenCorruptor
from src.settings import constants
from src.helper.files import open_file

if __name__ == "__main__":
    # Create a new benchmark: corrupt Wikipedia sequences with token errors
    # (probability p) and optional typo noise (probability p_noise).
    SEED = 3010  # fixed seed for reproducible noise
    p = parameters["corruption_probability"]
    # create empty benchmarks
    benchmark_name = parameters["name"]
    development_benchmark = Benchmark(benchmark_name, Subset.DEVELOPMENT)
    test_benchmark = Benchmark(benchmark_name, Subset.TEST)
    # read sequences
    development_sequences = Wikipedia.development_sequences()
    test_sequences = Wikipedia.test_sequences()
    # typo inserter
    p_noise = parameters["noise_probability"]
    typo_inducer = TypoNoiseInducer(p_noise, SEED) if p_noise > 0 else None
    # token corruptor
    corruptor = TokenCorruptor(
        p=p,
        positions_per_token=constants.POSITIONS_PER_TOKEN,
        token_pairs_per_token=constants.TOKEN_PAIRS_PER_TOKEN,
# NOTE(review): mid-script fragment — parameters and model_name are defined
# above this view; the per-sequence loop body is cut off after the reversal.
benchmark_name = parameters["benchmark"]
n_sequences = parameters["n_sequences"]
LOOKAHEAD = 2
model = UnidirectionalLMEstimator()
model.load(model_name)
space_label = model.encoder.encode_char(' ')
# Resuming from saved cases is disabled; kept for reference.
#if parameters["continue"]:
#    cases = load_object(path)
#else:
cases = []
benchmark = Benchmark(benchmark_name, Subset.TUNING)
sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)[:n_sequences]
for s_i, sequence in enumerate(sequences):
    # Skip sequences already processed (relevant when cases is preloaded).
    if s_i < len(cases):
        continue
    print("sequence %i" % s_i)
    print(sequence)
    cases.append([])
    encoded = model.encoder.encode_sequence(sequence)
    # Backward models consume the sequence right-to-left.
    if model.specification.backward:
        sequence = sequence[::-1]
        encoded = encoded[::-1]
# NOTE(review): mid-script fragment — p_ins, p_del, benchmark, test, out_file,
# approach and penalties are defined above this view; the loop is cut off.
print("p_ins:", p_ins)
print("p_del:", p_del)
print("benchmark:", benchmark)
print("test:", test)
print("out file:", out_file)
corrector = get_corrector(approach, penalties, p_ins, p_del)
# Penalties are stored negated; print them as positive values.
print("P_ins = %.2f" % -corrector.insertion_penalty)
print("P_del = %.2f" % -corrector.deletion_penalty)
if benchmark is None:
    sequences = interactive_sequence_generator()
    file_writer = None
else:
    subset = Subset.TEST if test else Subset.DEVELOPMENT
    benchmark = Benchmark(benchmark, subset)
    sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
    # NOTE(review): file_writer stays unassigned here when out_file is None;
    # all later uses are guarded by "out_file is not None", so this holds
    # together only as long as that guard is kept.
    if out_file is not None:
        file_writer = PredictionsFileWriter(
            benchmark.get_results_directory() + out_file)
for sequence in sequences:
    # Comment lines ("#...") are passed through unchanged with zero runtime.
    if sequence.startswith("#"):
        if out_file is not None:
            file_writer.add(sequence, 0)
        continue
    start_time = timestamp()
    predicted = corrector.correct(sequence)
    runtime = time_diff(start_time)
    print(predicted)
    if out_file is not None:
# NOTE(review): fragment — begins inside an unseen constructor/helper call
# (its opening and target are outside this view) and is cut off inside the
# resume check of the sequence loop. Indentation reconstructed.
        model_name=model_name,
        noise_type=parameters["noise_type"]
    )
# make greedy corrector:
corrector = GreedyCorrector(model,
                            insertion_threshold=insertion_threshold,
                            deletion_threshold=deletion_threshold)
# load benchmark sequences or interactive:
benchmark_name = parameters["benchmark"]
if benchmark_name == "0":
    benchmark = None
    sequences = interactive_sequence_generator()
else:
    # The "project" benchmark is evaluated on TEST, everything else on DEVELOPMENT.
    benchmark = Benchmark(benchmark_name,
                          subset=Subset.TEST if benchmark_name == "project"
                          else Subset.DEVELOPMENT)
    sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
# output file:
if benchmark is None:
    file_writer = None
else:
    file_writer = PredictionsFileWriter(benchmark.get_results_directory() + parameters["out_file"])
    if not parameters["initialize"]:
        # Resume a previous run by reloading written predictions.
        file_writer.load()
# iterate over sequences:
total_runtime = 0
n_sequences = 0
for s_i, sequence in enumerate(sequences):
    # Skip sequences already present in the output file (resume support).
    if file_writer is not None and s_i < file_writer.n_sequences():
def zero_one_sequence(a, b, originals=None):
    """Return a list with 1 where a[i] == b[i] and 0 otherwise.

    When *originals* is given, each pair is first normalized with
    tolerant_preprocess_sequences against originals[i] before comparing.
    """
    res = []
    for i, (aa, bb) in enumerate(zip(a, b)):
        if originals is not None:
            aa, _, bb = tolerant_preprocess_sequences(originals[i], aa, aa, bb)
        res.append(1 if aa == bb else 0)
    return res

# NOTE(review): the main block below is cut off inside the file-search loop.
if __name__ == "__main__":
    original_sequences = list(Wikipedia.test_sequences())
    for b in BENCHMARKS:
        print("**" + b + "**")
        # nastase-big only has a development subset; all others use test.
        benchmark = Benchmark(
            b, Subset.DEVELOPMENT if b == "nastase-big" else Subset.TEST)
        path = benchmark.get_results_directory()
        ground_truth_sequences = benchmark.get_sequences(
            BenchmarkFiles.CORRECT)
        with open(paths.DUMP_DIR + "zero_one_sequences/%s.txt" % b, "w") as f:
            for approach in APPROACHES:
                print(approach)
                if approach == "do nothing":
                    predicted_sequences = benchmark.get_sequences(
                        BenchmarkFiles.CORRUPT)
                else:
                    # Pick the first existing file name registered for this approach.
                    file = None
                    for ff in FILE_NAMES[approach]:
                        if os.path.exists(benchmark.get_results_directory() + ff):
                            file = ff
# NOTE(review): fragment — begins inside a parameter-declaration list (the
# opening "params = [" and earlier entries are outside this view) and is cut
# off inside the evaluator.evaluate(...) call.
          "str",
          help_message="Name of the file containing predicted sequences.")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Imports are deferred until after parameter parsing (original layout).
from src.benchmark.benchmark import Benchmark, BenchmarkFiles, SUBSETS
from src.evaluation.evaluator import Evaluator
from src.helper.data_structures import izip
from src.evaluation.print_methods import print_evaluator

if __name__ == "__main__":
    benchmark_name = parameters["benchmark"]
    benchmark_subset = SUBSETS[parameters["set"]]
    benchmark = Benchmark(benchmark_name, subset=benchmark_subset)
    sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)
    if parameters["file"] == "corrupt.txt":
        # Baseline: the corrupt sequences themselves are the "prediction".
        predicted_sequences = [corrupt for _, corrupt in sequence_pairs]
    else:
        predicted_sequences = benchmark.get_predicted_sequences(
            parameters["file"])
    evaluator = Evaluator()
    # izip presumably yields (index, pair, predicted) — TODO confirm against helper.
    for s_i, (correct, corrupt), predicted in izip(sequence_pairs,
                                                   predicted_sequences):
        if s_i == parameters["sequences"]:
            break
        evaluator.evaluate(benchmark,
# NOTE(review): fragment — cut off inside the plot_rate_histogram(...) call;
# whether save_histogram_data sits inside or after the subset loop is
# reconstructed (after-loop assumed) — TODO confirm.
from project import src
from src.evaluation.samples import get_space_corruptions
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.plot.histogram import plot_rate_histogram, plot_histogram, save_histogram_data

if __name__ == "__main__":
    # Collect per-sequence tokenization error counts and rates over both
    # subsets of the ACL benchmark and plot their distribution.
    exclude_zero = "no-zero" in sys.argv
    out_folder = "acl_error_distribution/"
    absolute_values = []
    error_rates = []
    for subset in (Subset.DEVELOPMENT, Subset.TEST):
        benchmark = Benchmark("ACL", subset)
        for correct, corrupt in benchmark.get_sequence_pairs(
                BenchmarkFiles.CORRUPT):
            print(corrupt)
            print(correct)
            edits = get_space_corruptions(correct, corrupt)
            n_edits = len(edits)
            n_chars = len(correct)
            # Error rate = space edits per character of the correct sequence.
            ratio = n_edits / n_chars
            absolute_values.append(n_edits)
            error_rates.append(ratio)
    save_histogram_data(error_rates,
                        out_folder + "tokenization_character_error_rates.txt")
    plot_rate_histogram(
        error_rates,
if __name__ == "__main__":
    # For every benchmark, write one line per approach marking which test
    # sequences were repaired exactly (1) or not (0), and print the mean.
    benchmark_names = [
        "ACL", "arXiv.OCR", "arXiv.pdftotext", "Wiki.no_spaces",
        "Wiki.spaces", "Wiki.typos.no_spaces", "Wiki.typos.spaces"
    ]
    approach_files = [
        "corrupt.txt", "bigrams.txt", "wordsegment.txt", "google_deduced.txt",
        "BS-fwd.txt", "BS-fwd-OCR.txt", "BS-bid.txt", "BS-bid-OCR.txt"
    ]
    out_dir = "/home/hertel/tokenization-repair-dumps/data/zero_one_sequences/"
    for name in benchmark_names:
        print(name)
        benchmark = Benchmark(name, Subset.TEST)
        corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)
        with open(out_dir + benchmark.name + ".txt", "w") as f:
            for approach in approach_files:
                f.write(approach + "\n")
                if approach == "corrupt.txt":
                    predicted_sequences = corrupt_sequences
                else:
                    predicted_sequences = benchmark.get_predicted_sequences(approach)
                zero_one = [
                    int(predicted == correct)
                    for predicted, correct in zip(predicted_sequences,
                                                  correct_sequences)
                ]
                f.write("".join(str(bit) for bit in zero_one) + "\n")
                print("", approach, np.mean(zero_one))
# NOTE(review): fragment — the first "if" nests under an unseen outer condition
# (the dangling "else:" below belongs to it, presumably a check whether a
# threshold parameter was supplied); cut off inside ThresholdFitterHolder(...).
    # Hand-picked default thresholds per model type / direction.
    if parameters["insert"]:
        if parameters["model_type"] == "combined":
            THRESHOLD = 0.99
        elif "softmax" in parameters["model_name"]:
            THRESHOLD = 0.95
        else:
            THRESHOLD = 0.1
    else:
        THRESHOLD = 0.01
else:
    THRESHOLD = parameters["threshold"]
# Insertion mode is fitted on deletion errors and vice versa.
corrupt_file = BenchmarkFiles.DELETIONS if parameters[
    "insert"] else BenchmarkFiles.INSERTIONS
benchmark = Benchmark(parameters["benchmark"], Subset.DEVELOPMENT)
corrector = GreedyCorrector(model,
                            insert=parameters["insert"],
                            delete=not parameters["insert"],
                            insertion_threshold=THRESHOLD,
                            deletion_threshold=THRESHOLD)
fitter_holder = ThresholdFitterHolder(
    model_name=parameters["model_name"] if "model_name" in parameters else None,
    fwd_model_name=parameters["fwd_model_name"] if "fwd_model_name" in parameters else None,
    bwd_model_name=parameters["bwd_model_name"] if "bwd_model_name" in parameters else None,
    benchmark_name=parameters["benchmark"],
# NOTE(review): mid-script fragment — model_name, labeling_model_name, labeling
# and n are defined above this view; the inner loop body is cut off.
all_deletion_intervals = []
for benchmark in BENCHMARKS:
    # Noisy benchmarks (names starting "0.1") use a different cases file; the
    # "0*" benchmarks map to the generic "wikipedia" slot of the path template.
    cases_path = paths.CASES_FILE_NOISY if benchmark.startswith("0.1") else paths.CASES_FILE_CLEAN
    cases_path = cases_path % (model_name,
                               "wikipedia" if benchmark.startswith("0") else benchmark)
    sequence_cases = load_object(cases_path)
    print(len(sequence_cases))
    if labeling_model_name != "0":
        from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator
        labeling_model = BidirectionalLabelingEstimator()
        labeling_model.load(labeling_model_name)
    benchmark = Benchmark(benchmark, Subset.TUNING)
    case_db = []
    correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)
    corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
    for s_i, (correct, corrupt) in enumerate(zip(correct_sequences,
                                                 corrupt_sequences)):
        if s_i == n:
            break
        print(benchmark.name, s_i)
        cases = sequence_cases[s_i]
        case_db.append([])
        # Space probabilities are predicted on the space-stripped correct text.
        labeling_space_probs = labeling_model.predict(correct.replace(' ', ''))["probabilities"] if labeling \
            else None
# NOTE(review): fragment — begins inside a row-building function (its header
# and the initialization of "row" are outside this view; it appears to format
# F1 and accuracy values as LaTeX table cells) and is cut off in the main
# block's else-branch.
    for value in f1_list + acc_list:
        # Percentages with two decimals, "& "-separated LaTeX columns.
        row += "& %.2f " % (value * 100)
    return row

if __name__ == "__main__":
    benchmark_name = sys.argv[1]
    subset = SUBSETS[sys.argv[2]]
    predictions_file_name = sys.argv[3]
    # Optional 4th argument limits the number of sequences; -1 = all.
    n_sequences = int(sys.argv[4]) if len(sys.argv) > 4 else -1
    # "all" evaluates every benchmark of the chosen subset.
    eval_all = benchmark_name == "all"
    if eval_all:
        benchmarks = all_benchmarks(subset)
    else:
        benchmarks = [Benchmark(benchmark_name, subset)]
    f1_list = []
    acc_list = []
    for benchmark in benchmarks:
        # Pick the ground-truth source matching the subset.
        original_sequences = {
            Subset.TUNING: read_sequences(paths.WIKI_TUNING_SENTENCES),
            Subset.DEVELOPMENT: Wikipedia.development_sequences(),
            Subset.TEST: Wikipedia.test_sequences()
        }[subset]
        sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)
        if predictions_file_name == "corrupt.txt":
            predicted_sequences = [corrupt for _, corrupt in sequence_pairs]
        else: