Code example #1
def initialize(self):
    directory = self.benchmark_dir()
    correct_sequences = read_lines(directory + "correct.txt")
    corrupt_sequences = read_lines(directory + "corrupt.txt")
    self.sequences.extend([
        BenchmarkSequence(correct_sequence, corrupt_sequence)
        for correct_sequence, corrupt_sequence in zip(correct_sequences, corrupt_sequences)
    ])
Code example #2
def main(args):
    corrector = HunspellSpellChecker()
    if args.input is None:
        sequences = interactive_sequences()
    else:
        sequences = read_lines(args.input)
    for sequence in sequences:
        corrected = corrector.correct(sequence)
        print(corrected)
Code example #3
def read_typos(typo_dict, test: bool):
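    # Each line holds a correct word followed by alternating (misspelling, frequency) pairs.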
    file_name = "typos_test.txt" if test else "typos_training.txt"
    for line in read_lines(TYPO_DIR + file_name):
        vals = line.split(" ")
        correct = vals[0]
        for i in range(1, len(vals), 2):
            misspelling = vals[i]
            frequency = int(vals[i + 1])
            add_typo(typo_dict, correct, misspelling, frequency)
Code example #4
def read_typos(path):
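    # Repeat each (correct, misspelling) pair according to its frequency.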
    typos = []
    for line in read_lines(path):
        vals = line.split(" ")
        correct = vals[0]
        for i in range(1, len(vals), 2):
            misspelling = vals[i]
            frequency = int(vals[i + 1])
            for _ in range(frequency):
                typos.append((correct, misspelling))
    return typos
Code example #5
def get_article_jsons(wiki_text_directory):
    """
    Reads all articles as jsons from an extracted Wikipedia dump.

    :param wiki_text_directory: Path to a directory created by the WikiExtractor script.
        Assumes the subdirectories contain files where each line is an article JSON.
    :return: iterator over article jsons
    """
    json_files = get_files_depth_two(wiki_text_directory)
    for file in json_files:
        lines = read_lines(file)
        for line in lines:
            article = json.loads(line)
            yield article
Code example #6
def read_error_dict(
        tsv_file: str,
        min_frequency: int = 1) -> Dict[str, List[Tuple[str, int]]]:
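    # Collect (wrong, frequency) pairs per correct string, keeping only entries with freq >= min_frequency,
    # correct strings of at most 3 characters, and no spaces on either side.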
    errors = {}
    for line in read_lines(tsv_file):
        wrong, correct, freq = line.split("\t")
        freq = int(freq)
        if freq >= min_frequency:
            if len(correct) <= 3 and " " not in wrong and " " not in correct:
                if correct not in errors:
                    errors[correct] = [(wrong, freq)]
                else:
                    errors[correct].append((wrong, freq))
    return errors
Code example #7
def main(args):
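    # Modes: report character-insertion statistics, measure corruption runtime,
    # corrupt an input file line by line, or corrupt interactive input.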
    inducer = ACLNoiseInducer(args.p, 0.2079, args.seed)

    if args.print_insertion_prob:
        error_dict = inducer.error_dict
        total_count = 0
        insertion_count = 0
        for correct in error_dict:
            for wrong, freq in error_dict[correct]:
                total_count += freq
                if correct == "":
                    insertion_count += freq
        insertion_prob = insertion_count / total_count
        print(len([e for e, f in error_dict[""] if f >= 0]), "insertions")
        print(
            f"{insertion_prob * 100:.2f}% char insertions ({insertion_count}/{total_count})"
        )

    if args.runtime:
        sequence = "Tokenization Repair in the Presence of Spelling Errors"
        start_time = timestamp()
        corrupt_sequences = []
        for _ in range(100):
            corrupt_sequences.append(inducer.induce_noise(sequence))
        runtime = time_diff(start_time)
        for s in corrupt_sequences:
            print(s)
        print(runtime)
    elif args.input_file:
        out_file = open(args.output_file, "w") if args.output_file else None
        lines = read_lines(args.input_file)
        for line in lines:
            corrupt = inducer.induce_noise(line)
            print(corrupt)
            if out_file is not None:
                out_file.write(corrupt + "\n")
        if out_file is not None:
            out_file.close()
    else:
        while True:
            sequence = input("> ")
            for _ in range(100):
                corrupt = inducer.induce_noise(sequence)
                print(corrupt)
Code example #8
def read_data(path: str) -> List[float]:
    lines = read_lines(path)
    data = [float(line) for line in lines]
    return data
Code example #9
        tokenization_error_rates = [
            rate for rate in tokenization_error_rates if rate > 0
        ]
    params_ocr = fit_distribution(ocr_error_rates, distribution, fscale)
    print("ocr", params_ocr)
    params_tokenization = fit_distribution(tokenization_error_rates,
                                           distribution, fscale)
    print("tokenization", params_tokenization)
    ocr_noise_inducer = ACLNoiseInducer(p=0, insertion_prob=0.2079, seed=seed)

    hyphenator = HyphenationIntroducer(hyphenation_rate)

    with open(out_dir + "/correct.txt",
              "w") as correct_file, open(out_dir + "/corrupt.txt",
                                         "w") as corrupt_file:
        for sequence in read_lines(in_file):
            print(sequence)
            spans = create_sequence_spans(sequence)
            misspelled_spans = []
            mistokenized_spans = []
            for span in spans:
                p_ocr = sample_distribution(params_ocr, distribution)
                ocr_noise_inducer.p = p_ocr
                p_space = sample_distribution(params_tokenization,
                                              distribution)
                misspelled = hyphenator.introduce_hyphens(span)
                if not (zero and flip_coin(random, ocr_p_zero)):
                    misspelled = ocr_noise_inducer.induce_noise(misspelled)
                if not (zero and flip_coin(random, tokenization_p_zero)):
                    mistokenized = corrupt_tokenization(misspelled, p_space)
                else:
Code example #10
    files = os.listdir(directory)

    prefix_input = "[OCR_toInput] "
    prefix_aligned = "[OCR_aligned] "
    prefix_truth = "[ GS_aligned] "

    NO_GS_SYMBOL = "#"
    ALIGNMENT_SYMBOL = "@"

    if out_dir is not None:
        corrupt_file = open(out_dir + "/corrupt.txt", "w")
        ground_truth_file = open(out_dir + "/spelling.txt", "w")

    for file in files:
        print(file)
        corrupt, aligned, truth = read_lines(directory + "/" + file)
        corrupt = corrupt[len(prefix_input):]
        aligned = aligned[len(prefix_aligned):]
        truth = truth[len(prefix_truth):]
        print(len(corrupt), len(aligned), len(truth))
        input_sequences, ground_truth_sequences = create_input(
            corrupt, aligned, truth)
        for s_in, s_true in zip(input_sequences, ground_truth_sequences):
            print(s_in)
            print(s_true)
            if out_dir is not None:
                corrupt_file.write(s_in + "\n")
                ground_truth_file.write(s_true + "\n")

    if out_dir is not None:
        corrupt_file.close()
        ground_truth_file.close()
Code example #11
    split = [val for val in split if len(val) > 0]
    split_pt = 0
    acc_len = 0
    while acc_len < len(input):
        acc_len += len(split[split_pt])
        split_pt += 1
    predicted = ' '.join(split[:split_pt])
    correct = ' '.join(split[(split_pt + 1):])
    return predicted, correct


if __name__ == "__main__":
    in_file = sys.argv[1]
    lines_per_case = int(sys.argv[2])
    mode = sys.argv[3]  # correct, corrupt, predicted
    lines = read_lines(in_file)
    lines = [line[:-1] if line[-1] == '\t' else line for line in lines]
    n_cases = len(lines) // lines_per_case
    for i in range(n_cases):
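        # 3-line cases carry the input sequence on their first line; otherwise recover the input from the result line.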
        if lines_per_case == 3:
            input = lines[lines_per_case * i]
            result = lines[lines_per_case * i + 1]
        else:
            result = lines[lines_per_case * i]
            input = ''.join(result.split(" vs ")[0].split(' ')[1:])
        if result.startswith(CORRECT_MARKER):
            correct = predicted = result[(len(CORRECT_MARKER) + 1):]
        elif result.startswith(FALSE_MARKER):
            predicted, correct = extract_sequences(result, input)
        else:
            raise Exception()
Code example #12
                                   positions_per_token=constants.POSITIONS_PER_TOKEN,
                                   token_pairs_per_token=constants.TOKEN_PAIRS_PER_TOKEN,
                                   seed=13052021)
        benchmark_name += ".spaces"
    else:
        corruptor = SpaceRemover()
        benchmark_name += ".no_spaces"

    out_dir = "/home/hertel/tokenization-repair-dumps/data/benchmarks/" + benchmark_name + "/"
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    i = 0
    for set in ("tuning", "development", "test"):
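        # Optionally inject typos before corrupting the spaces, using a different typo seed for each subset.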
        if typos:
            test = set == "test"
            typo_inducer = TypoNoiseInducer(0.1, seed=20210513 + i, test=test)
            i += 1
        subdir = out_dir + set + "/"
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        with open(subdir + "correct.txt", "w") as correct_file, open(subdir + "corrupt.txt", "w") as corrupt_file:
            sequences = read_lines(in_dir + set + ".txt")
            for correct in sequences:
                if typos:
                    correct = typo_inducer.corrupt(correct)
                corrupt = corruptor.corrupt(correct)
                print(corrupt)
                correct_file.write(correct + "\n")
                corrupt_file.write(corrupt + "\n")
Code example #13
import numpy as np

import sys

import project
from src.helper.files import read_lines, file_exists
from src.benchmark.benchmark import all_benchmarks, ERROR_PROBABILITIES, Subset, BenchmarkFiles

if __name__ == "__main__":
    file_name = sys.argv[1]
    per_chars = 1000

    t_mean = []
    t_normalized = []

    for benchmark in all_benchmarks(Subset.TEST):
        print("== %s ==" % benchmark.name)
        path = benchmark.get_results_directory() + file_name
        total_runtime = float(read_lines(path)[-1]) if file_exists(path) else 0
        mean_runtime = total_runtime / 10000
        t_mean.append(mean_runtime)
        print("mean = %.2f" % mean_runtime)
        n_chars = sum(len(sequence) for sequence in benchmark.get_sequences(BenchmarkFiles.CORRUPT))
        normalized_runtime = total_runtime / n_chars * per_chars
        t_normalized.append(normalized_runtime)
        print("normalized(%i chars) = %.2f" % (per_chars, normalized_runtime))

    print("== total ==")
    print("mean = %.2f" % np.mean(t_mean))
    print("normalized(%i chars) = %.2f" % (per_chars, np.mean(t_normalized)))
Code example #14
import random

import project
from src.helper.files import get_files, read_lines
from select_acl_articles import get_year


def preprocess(line: str):
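    # Collapse any whitespace runs into single spaces.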
    tokens = [token for token in line.split() if len(token) > 0]
    line = " ".join(tokens)
    return line


if __name__ == "__main__":
    random.seed(42)

    acl_dir = "/home/hertel/tokenization-repair-dumps/nastase/acl-201302_word-resegmented/raw/"

    files = sorted(get_files(acl_dir))

    files = [file for file in files if get_year(file) >= 2005]
    examples = []

    for file in files:
        lines = read_lines(acl_dir + file)
        lines = [preprocess(line) for line in lines]
        lines = [line for line in lines if len(line) > 0]
        examples.extend(lines)

    random.shuffle(examples)
    for line in examples:
        print(line)
Code example #15
import project
from src.helper.files import read_lines, write_lines
from src.settings import paths

if __name__ == "__main__":
    path = paths.BENCHMARKS_DIR + "doval/test/"
    corrupt_sequences = [
        line.replace(' ', '') for line in read_lines(path + "correct.txt")
    ]
    write_lines(path + "corrupt.txt", corrupt_sequences)
Code example #16
def load(self):
    lines = read_lines(self.file)
    self.predicted_sequences = lines[:-1]
    self.runtime = float(lines[-1])
Code example #17
import sys

import project
from src.helper.files import read_lines


if __name__ == "__main__":
    lines = read_lines(sys.argv[1])
    for line in lines:
        print(line.replace(" ", ""))
Code example #18
def is_word(token):
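    # A token counts as a word if the fraction of alphabetic characters is at least THRESHOLD.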
    n_letters = len([char for char in token if char.isalpha()])
    return len(token) > 0 and n_letters / len(token) >= THRESHOLD


def remove_symbols(word):
    return "".join(char for char in word if char.isalnum())


if __name__ == "__main__":
    input_file = sys.argv[1]
    tokenized_file = sys.argv[2]
    n = -1 if len(sys.argv) < 4 else int(sys.argv[3])

    input_lines = read_lines(input_file)
    tokenized_lines = read_lines(tokenized_file)
    if n > 0:
        input_lines = input_lines[:n]

    for sequence, tokenized in zip(input_lines, tokenized_lines):
        input_spaces = get_space_positions(sequence)
        tokenized_spaces = get_space_positions(tokenized)
        tokens = get_tokens(sequence, input_spaces, tokenized_spaces)
        postprocessed_tokens = []
        for token, space_removed in tokens:
            prefix, word, suffix = strip_token(token)
            if space_removed and is_word(word):
                removed = remove_symbols(word)
                postprocessed_tokens.append(prefix + removed + suffix)
            else:
Code example #19
            TokenErrorType.TOKENIZATION_ERROR, TokenErrorType.OCR_ERROR,
            TokenErrorType.MIXED
        }
        error_name_label = "Total error"
    elif analysis_type == "spelling":
        error_types = {TokenErrorType.OCR_ERROR, TokenErrorType.MIXED}
        error_name_label = "Spelling error"
    else:
        error_types = {TokenErrorType.TOKENIZATION_ERROR}
        error_name_label = "Tokenization error"

    absolute_values = []
    error_rates = []

    for s_i, (correct, corrupt) in enumerate(
            zip(read_lines(folder + "spelling.txt"),
                read_lines(folder + "corrupt.txt"))):
        token_errors = get_token_edit_labels(correct, corrupt)
        n_tokens = len(token_errors)
        if n_tokens < 30:  # or n_tokens > 40
            continue
        n_spelling_errors = len(
            [error for error in token_errors if error in error_types])
        error_rate = n_spelling_errors / n_tokens
        print(n_spelling_errors, n_tokens, error_rate)
        absolute_values.append(n_spelling_errors)
        error_rates.append(error_rate)
        if s_i + 1 == n:
            break

    subtitle = "ACL development set (%i sequences)" % n
Code example #20
    removed = ""
    while i < len(sequence):
        if i > 0 and i + 2 < len(sequence) and sequence[i - 1].isalpha() and sequence[i:(i + 2)] in ("- ", "‑ ") \
                and sequence[i + 2].isalpha():
            i += 2
        else:
            removed += sequence[i]
            i += 1
    return removed


def correct_ground_truth(sequence):
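    # Normalize long s and typographic ligatures, fix spacing around punctuation, and undo hyphenation.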
    sequence = sequence.replace("ſ", "s")
    sequence = sequence.replace("fi", "fi")
    sequence = sequence.replace("ff", "ff")
    sequence = sequence.replace("ffi", "ffi")
    for punctuation in ",;?!:)”":
        sequence = sequence.replace(" " + punctuation, punctuation)
    for punctuation in "(“":
        sequence = sequence.replace(punctuation + " ", punctuation)
    sequence = remove_hyphenation(sequence)
    return sequence


if __name__ == "__main__":
    in_file = sys.argv[1]

    for line in read_lines(in_file):
        correct = correct_ground_truth(line)
        print(correct)
Code example #21
import sys
import random

from project import src
from src.helper.files import read_lines, write_lines
from src.settings import paths
from src.arxiv.dataset import match_lines, to_input_file

if __name__ == "__main__":
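    # Build the arXiv benchmark: match ground-truth lines with PDF-extracted lines,
    # shuffle the pairs, and write correct/corrupt files.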
    test = "test" in sys.argv
    random.seed(20201026)

    files_file = paths.ARXIV_TEST_FILES if test else paths.ARXIV_DEVELOPMENT_FILES
    subset_name = "test" if test else "development"

    files = read_lines(files_file)
    pairs = []
    for file in files:
        true_path = paths.ARXIV_GROUND_TRUTH_DIR + file
        input_path = paths.PDF_EXTRACT_DIR + to_input_file(file)
        matched = match_lines(true_path, input_path)
        pairs.extend(matched)

    random.shuffle(pairs)

    path = paths.BENCHMARKS_DIR + "arxiv/" + subset_name + "/"
    correct_sequences = [correct for _, correct in pairs]
    corrupt_sequences = [corrupt for corrupt, _ in pairs]
    write_lines(path + "correct.txt", correct_sequences)
    write_lines(path + "corrupt.txt", corrupt_sequences)
Code example #22
import random

from project import src
from src.helper.files import read_lines

if __name__ == "__main__":
    random.seed(20201108)

    files_file = "/home/hertel/tokenization-repair-dumps/claudius/groundtruth-with-normalized-formulas/training.txt"
    base_dir = "/".join(files_file.split("/")[:-1]) + "/"
    print(base_dir)
    out_file = base_dir + "training_paragraphs.txt"

    files = read_lines(files_file)
    paragraphs = []

    print("reading...")

    for file in files:
        lines = read_lines(base_dir + file)
        lines = [" ".join(line.split()).strip() for line in lines]
        lines = [
            line for line in lines if len(line) > 0 and line != "[formula]"
        ]
        paragraphs.extend(lines)

    print("shuffling...")

    random.shuffle(paragraphs)

    print("writing...")
Code example #23
if __name__ == "__main__":
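    # Compare corrupt and ground-truth sequences of the ACL development benchmark
    # to collect per-span OCR and tokenization error rates.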
    path = "benchmarks/ACL/development/"
    corrupt_file = path + "corrupt.txt"
    ground_truth_file = path + "spelling.txt"

    out_path = "char_error_distributions/"
    out_file_ocr = out_path + "span_ocr_error_rates.txt"
    out_file_tokenization = out_path + "span_tokenization_error_rates.txt"
    out_file_spans = out_path + "spans.txt"
    out_file_ocr = open(out_file_ocr, "w")
    out_file_tokenization = open(out_file_tokenization, "w")
    out_file_spans = open(out_file_spans, "w")

    for i, (corrupt, correct) in enumerate(
            zip(read_lines(corrupt_file), read_lines(ground_truth_file))):
        #print(i)
        #print(corrupt)
        #print(correct)
        erroneous_spans = get_erroneous_spans(corrupt, correct)
        for corrupt_span, correct_span in erroneous_spans:
            if corrupt_span.replace(
                    "-", "") != correct_span and len(correct_span) > 0:
                n_tokens = correct_span.count(" ") + 1
                char_edits = get_ocr_character_edits(corrupt_span,
                                                     correct_span)
                n_chars = len(correct_span)
                n_space_edits = 0
                n_space_insertions = 0
                n_space_deletions = 0
                n_ocr_edits = 0
Code example #24
from acl_cleaned_analyse_ocr_errors import get_ocr_character_edits

if __name__ == "__main__":
    raw_file = sys.argv[1]
    clean_file = sys.argv[2]
    out_file = sys.argv[3] if len(sys.argv) > 3 else None

    hyphenator = Hyphenator()

    char_error_rates = []
    total_hyphen_edits = 0
    total_tokens = 0
    hyphenable_tokens = 0

    for i, (corrupt, correct) in enumerate(
            zip(read_lines(raw_file), read_lines(clean_file))):
        corrupt_tokens = corrupt.split()
        correct_tokens = correct.split()
        total_tokens += len(correct_tokens)
        for t in correct_tokens:
            try:
                if len(hyphenator.pairs(t)) > 0:
                    hyphenable_tokens += 1
            except IndexError:
                pass
        ocr_errors = get_ocr_errors(corrupt_tokens, correct_tokens)
        print(i + 1, ocr_errors)
        n_char_edits = 0
        n_hyphen_edits = 0
        for erroneous, corrected in ocr_errors:
            char_edits = get_ocr_character_edits(erroneous, corrected)
Code example #25
            else:
                deletion_positions.add(pos)
    deduced = ""
    for i, char in enumerate(corrupt):
        if i in insertion_positions:
            deduced += " "
        if i not in deletion_positions:
            deduced += char
    return deduced


if __name__ == "__main__":
    benchmark = sys.argv[1]
    subset = sys.argv[2]

    input_file = paths.BENCHMARKS_DIR + benchmark + "/" + subset + "/corrupt.txt"
    predicted_file = paths.DUMP_DIR + "spelling/" + benchmark + "/" + subset + "/google.txt"
    out_file = paths.RESULTS_DIR + benchmark + "/" + subset + "/google_deduced.txt"

    input_sequences = read_lines(input_file)
    predicted_sequences = read_lines(predicted_file)

    with open(out_file, "w") as f:
        for corrupt, predicted in zip(input_sequences, predicted_sequences):
            tokenized = deduce_tokenization(corrupt, predicted)
            if corrupt != predicted:
                print(corrupt)
                print(predicted)
                print(tokenized)
            f.write(tokenized + "\n")
Code example #26
            return True
    return False


def split_sentences(text):
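    # Re-merge sentences that sent_tokenize split at a wrong position.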
    remerged = []
    for sentence in sent_tokenize(text):
        if len(remerged) == 0 or not is_wrong_split(sentence):
            remerged.append(sentence)
        else:
            remerged[-1] = remerged[-1] + sentence
    return remerged


if __name__ == "__main__":
    training_files = read_lines(paths.ARXIV_TRAINING_FILES)
    training_lines = []
    for file in training_files[1:]:
        lines = read_training_lines(paths.ARXIV_GROUND_TRUTH_DIR + file)
        training_lines += lines

    training_lines = [
        line for line in training_lines
        if line not in ("=", "[formula]", ".125in") and ".25in" not in line
    ]

    print(len(training_lines), "lines")
    write_lines(paths.ARXIV_TRAINING_LINES, training_lines)

    print(sum(1 for line in training_lines if len(line) > 256), "length > 256")
Code example #27
import sys

import project
from src.helper.files import read_lines


if __name__ == "__main__":
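    # Arguments come in (file, weight) pairs; each file's OCR error frequencies are scaled by its weight and merged.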
    n_files = len(sys.argv) // 2
    weighted_files = [(sys.argv[2 * i + 1], int(sys.argv[2 * (i + 1)])) for i in range(n_files)]

    ocr_error_frequencies = {}

    for file, weight in weighted_files:
        for line in read_lines(file):
            corrupt, correct, frequency = line.split("\t")
            frequency = int(frequency) * weight
            pair = (corrupt, correct)
            if pair in ocr_error_frequencies:
                ocr_error_frequencies[pair] += frequency
            else:
                ocr_error_frequencies[pair] = frequency

    pairs = sorted(ocr_error_frequencies, key=lambda pair: ocr_error_frequencies[pair], reverse=True)
    for pair in pairs:
        print("\t".join([pair[0], pair[1], str(ocr_error_frequencies[pair])]))
Code example #28
    char_edits = []
    for gap_raw, gap_clean in gaps:
        err_raw = raw[gap_raw[0]:(gap_raw[1] + 1)]
        err_clean = cleaned[gap_clean[0]:(gap_clean[1] + 1)]
        char_edits.append((err_raw, err_clean))
    return char_edits


if __name__ == "__main__":
    #in_file = "/home/hertel/tokenization-repair-dumps/nastase/ocr_errors.txt"
    in_file = sys.argv[1]  # "icdar_ocr_errors.txt"
    print_readable = False

    error_frequencies = {}

    for l_i, line in enumerate(read_lines(in_file)):
        if "\t" in line:
            raw, cleaned = line.split("\t")
            char_edits = get_ocr_character_edits(raw, cleaned)
            for err_raw, err_clean in char_edits:
                #print(f"'{err_raw}' -> '{err_clean}'", gap_raw, gap_clean)
                if (err_raw, err_clean) not in error_frequencies:
                    error_frequencies[(err_raw, err_clean)] = 1
                else:
                    error_frequencies[(err_raw, err_clean)] += 1
        else:
            pass
            #print(line)

    errors = sorted(error_frequencies,
                    key=lambda x: error_frequencies[x],
Code example #29
from src.helper.files import read_lines

from acl_cleaned_get_ocr_errors import get_ocr_errors
from acl_cleaned_analyse_ocr_errors import get_ocr_character_edits


if __name__ == "__main__":
    raw_file = sys.argv[1]
    clean_file = sys.argv[2]
    out_file = sys.argv[3] if len(sys.argv) > 3 else None

    hyphenator = Hyphenator()

    error_frequencies = {}

    for i, (corrupt, correct) in enumerate(zip(read_lines(raw_file), read_lines(clean_file))):
        print(f"** SEQUENCE {i} **")
        corrupt_tokens = corrupt.split()
        correct_tokens = correct.split()
        ocr_errors = get_ocr_errors(corrupt_tokens, correct_tokens)
        for corrupt, correct in ocr_errors:
            corrupt_parts = corrupt.split(" ")
            correct_parts = correct.split(" ")
            if len(corrupt_parts) != len(correct_parts):
                continue
            for corrupt_part, correct_part in zip(corrupt_parts, correct_parts):
                edits = get_ocr_character_edits(correct_part, corrupt_part)
                edits = [e for e in edits if e != ("", "-")]
                if len(edits) > 0:
                    n_char_edits = []
                    token_len = len(correct_part)
Code example #30
    tuning_path = base_path + "tuning/"
    development_path = base_path + "development/"
    test_path = base_path + "test/"
    return tuning_path, development_path, test_path


def file_paths(dir_path: str):
    correct_path = dir_path + "correct.txt"
    corrupt_path = dir_path + "corrupt.txt"
    return correct_path, corrupt_path


if __name__ == "__main__":
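    # Create Wikipedia benchmarks for every combination of noise level and tokenization error probability.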
    from src.helper.files import read_lines

    tuning_sequences = read_lines(paths.WIKI_TUNING_SENTENCES)
    development_sequences = read_lines(paths.WIKI_DEVELOPMENT_SENTENCES)
    test_sequences = read_lines(paths.WIKI_TEST_SENTENCES)

    for noise_level in NOISE_LEVELS:
        tuning_ground_truth_sequences = insert_noise(tuning_sequences,
                                                     noise_level)
        development_ground_truth_sequences = insert_noise(
            development_sequences, noise_level)
        test_ground_truth_sequences = insert_noise(test_sequences, noise_level)

        for p in ERROR_PROBABILITIES:
            print(noise_level, p)
            tuning_corrupt_sequences = corrupt_tokenization(
                tuning_ground_truth_sequences, p)
            development_corrupt_sequences = corrupt_tokenization(