import sys
import random

import project
from src.helper.files import read_lines, write_lines
from src.settings import paths
from src.arxiv.dataset import match_lines, to_input_file

if __name__ == "__main__":
    test = "test" in sys.argv
    random.seed(20201026)

    files_file = paths.ARXIV_TEST_FILES if test else paths.ARXIV_DEVELOPMENT_FILES
    subset_name = "test" if test else "development"

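    # Match each ground-truth file against its PDF-extracted counterpart and
    # collect (corrupt, correct) line pairs.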
    files = read_lines(files_file)
    pairs = []
    for file in files:
        true_path = paths.ARXIV_GROUND_TRUTH_DIR + file
        input_path = paths.PDF_EXTRACT_DIR + to_input_file(file)
        matched = match_lines(true_path, input_path)
        pairs.extend(matched)

    random.shuffle(pairs)

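    # Write the correct and corrupt sequences to parallel benchmark files.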
    path = paths.BENCHMARKS_DIR + "arxiv/" + subset_name + "/"
    correct_sequences = [correct for _, correct in pairs]
    corrupt_sequences = [corrupt for corrupt, _ in pairs]
    write_lines(path + "correct.txt", correct_sequences)
    write_lines(path + "corrupt.txt", corrupt_sequences)
Example #2
def save(self):
    lines = self.predicted_sequences + [str(self.runtime)]
    write_lines(self.file, lines)
        test_ground_truth_sequences = insert_noise(test_sequences, noise_level)

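        # For each error probability, corrupt the ground-truth sequences with
        # tokenization errors and write them to the benchmark directories.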
        for p in ERROR_PROBABILITIES:
            print(noise_level, p)
            tuning_corrupt_sequences = corrupt_tokenization(
                tuning_ground_truth_sequences, p)
            development_corrupt_sequences = corrupt_tokenization(
                development_ground_truth_sequences, p)
            test_corrupt_sequences = corrupt_tokenization(
                test_ground_truth_sequences, p)

            tuning_path, development_path, test_path = benchmark_directories(
                noise_level, p)
            make_directory_recursive(tuning_path)
            make_directory_recursive(development_path)
            make_directory_recursive(test_path)
            tune_correct_path, tune_corrupt_path = file_paths(tuning_path)
            dev_correct_path, dev_corrupt_path = file_paths(development_path)
            test_correct_path, test_corrupt_path = file_paths(test_path)

            if TUNING:
                write_lines(tune_correct_path, tuning_ground_truth_sequences)
                write_lines(tune_corrupt_path, tuning_corrupt_sequences)
            if DEVELOPMENT:
                write_lines(dev_correct_path,
                            development_ground_truth_sequences)
                write_lines(dev_corrupt_path, development_corrupt_sequences)
            if TEST:
                write_lines(test_correct_path, test_ground_truth_sequences)
                write_lines(test_corrupt_path, test_corrupt_sequences)

if __name__ == "__main__":
    training_files = read_lines(paths.ARXIV_TRAINING_FILES)
    training_lines = []
    for file in training_files[1:]:
        lines = read_training_lines(paths.ARXIV_GROUND_TRUTH_DIR + file)
        training_lines += lines

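    # Filter out lines that are pure formatting artifacts ("=", "[formula]",
    # LaTeX spacing values).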
    training_lines = [
        line for line in training_lines
        if line not in ("=", "[formula]", ".125in") and ".25in" not in line
    ]

    print(len(training_lines), "lines")
    write_lines(paths.ARXIV_TRAINING_LINES, training_lines)

    print(sum(1 for line in training_lines if len(line) > 256), "length > 256")

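    # Split the ground-truth lines into sentences; these become the training
    # sequences.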
    training_sentences = []
    for line in training_lines:
        sentences = split_sentences(line)
        training_sentences.extend(sentences)

    print(len(training_sentences), "sentences")
    write_lines(paths.ARXIV_TRAINING_SEQUENCES, training_sentences)

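    # Count how often each character occurs in the training sentences.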
    char_frequencies = {}
    for sentence in training_sentences:
        for char in sentence:
            if char not in char_frequencies:
                char_frequencies[char] = 1
            else:
                char_frequencies[char] += 1
Example #5

if __name__ == "__main__":
    random.seed(20201026)

    files = get_files()

    matched_files = []
    unmatched_files = []

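    # Try to match each ground-truth file with its PDF-extracted counterpart.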
    for file in files:
        truth_file = paths.ARXIV_GROUND_TRUTH_DIR + file
        input_file = paths.PDF_EXTRACT_DIR + to_input_file(file)
        matched = match_lines(truth_file, input_file)
        print(truth_file, input_file, len(matched))
        if len(matched) > 0:
            matched_files.append(file)
        else:
            unmatched_files.append(file)

    print("%i matched" % len(matched_files))
    print("%i unmatched" % len(unmatched_files))

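    # Shuffle the matched files: 1000 become the development set, the next 1000
    # the test set, and the rest (plus the unmatched files) the training set.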
    random.shuffle(matched_files)

    write_lines(paths.ARXIV_DEVELOPMENT_FILES, sorted(matched_files[:1000]))
    write_lines(paths.ARXIV_TEST_FILES, sorted(matched_files[1000:2000]))

    training_files = sorted(matched_files[2000:] + unmatched_files)
    write_lines(paths.ARXIV_TRAINING_FILES, training_files)
        print("** " + file + " **")
        sequences = read_lines(in_dir + file)
        sequences = [s.strip() for s in sequences]
        repaired_sequences = []
        i = 0
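        # Collect consecutive lines ending with a hyphen into one batch so the
        # corrector sees the hyphenated word as a whole.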
        while i < len(sequences):
            batch_sequences = [sequences[i]]
            while batch_sequences[-1].endswith("-") and i + 1 < len(sequences):
                i += 1
                batch_sequences.append(sequences[i])
            i += 1
            batch = "".join(batch_sequences)
            predicted = corrector.correct(batch)
            if len(batch_sequences) == 1:
                repaired_sequences.append(predicted)
            else:
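                # Recover the original line boundaries in the corrected text by
                # counting non-space characters, which the corrector leaves unchanged.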
                split_positions = set(
                    np.cumsum([
                        len(seq.replace(" ", "")) for seq in batch_sequences
                    ]))
                start = 0
                nospace_chars = 0
                for pos in range(len(predicted)):
                    if predicted[pos] != " ":
                        nospace_chars += 1
                        if nospace_chars in split_positions:
                            seq = predicted[start:(pos + 1)].strip()
                            repaired_sequences.append(seq)
                            start = pos + 1
        write_lines(out_dir + file, repaired_sequences)
Example #7
if __name__ == "__main__":
    random.seed(1998)

    development_ids = set(load_object(paths.WIKI_DEVELOPMENT_ARTICLE_IDS))
    test_ids = set(load_object(paths.WIKI_TEST_ARTICLE_IDS))
    print(development_ids)
    print(test_ids)

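    # Select one random paragraph from every article that belongs to the
    # development or test split.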
    development_paragraphs = []
    test_paragraphs = []

    for article in get_article_jsons():
        id = article["id"]
        is_dev = id in development_ids
        is_test = (not is_dev) and id in test_ids
        if is_dev or is_test:
            paragraph = select_random_paragraph(article["text"])
            if is_dev:
                development_paragraphs.append(paragraph)
            elif is_test:
                test_paragraphs.append(paragraph)
            print("%i dev, %i test" %
                  (len(development_paragraphs), len(test_paragraphs)))

    random.shuffle(development_paragraphs)
    random.shuffle(test_paragraphs)

    write_lines(paths.WIKI_DEVELOPMENT_FILE, development_paragraphs)
    write_lines(paths.WIKI_TEST_FILE, test_paragraphs)
import random

import project
from src.helper.files import read_lines, write_lines
from src.settings import paths

if __name__ == "__main__":
    random.seed(42)
    print("reading...")
    lines = read_lines(paths.WIKI_TRAINING_SENTENCES)
    print("shuffling...")
    random.shuffle(lines)
    print("writing...")
    write_lines(paths.WIKI_TRAINING_SENTENCES_SHUFFLED, lines)
                    sentences = split_article(article["text"],
                                              sentence_splitter)
                    sentences = filter_sentences(sentences)
                    if len(sentences) > 0:
                        selected_sentence = random.choice(sentences)
                        selected_sentence = preprocess_sentence(
                            selected_sentence)
                        if id in tuning_ids:
                            tuning_sentences.append(selected_sentence)
                        elif id in development_ids:
                            development_sentences.append(selected_sentence)
                        else:
                            test_sentences.append(selected_sentence)
                elif TRAINING and id in training_ids:
                    sentences = split_article(article["text"],
                                              sentence_splitter)
                    sentences = filter_sentences(sentences)
                    sentences = [
                        preprocess_sentence(sentence) for sentence in sentences
                    ]
                    training_file.write('\n'.join(sentences + [""]))

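    # Training sentences are written while streaming over the articles; the
    # tuning, development and test sentences are shuffled and written at the end.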
    if TRAINING:
        training_file.close()
    else:
        for sentence_list in (tuning_sentences, development_sentences,
                              test_sentences):
            random.shuffle(sentence_list)
        write_lines(paths.WIKI_TUNING_SENTENCES, tuning_sentences)
        write_lines(paths.WIKI_DEVELOPMENT_SENTENCES, development_sentences)
        write_lines(paths.WIKI_TEST_SENTENCES, test_sentences)
    elif step == "lines":
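        # Concatenate the files of each corpus split, normalize whitespace, and
        # drop empty lines and lines with many "?" characters.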
        for split in ["training", "development", "test"]:
            path = paths.ACL_CORPUS_DIR + split + "/"
            lines = []
            for filename in sorted(get_files(path)):
                lines.extend(read_lines(path + filename))
            lines = [line.strip() for line in lines]
            lines = [line for line in lines if len(line) > 0]
            lines = [' '.join(line.split()) for line in lines]
            lines = [
                line for line in lines
                if sum(1 if c == "?" else 0 for c in line) < 4
            ]  # remove lines with many ?s
            print(len(lines), "lines")
            write_lines(paths.ACL_CORPUS_DIR + split + ".txt", lines)
            random.shuffle(lines)
            write_lines(paths.ACL_CORPUS_DIR + split + "_shuffled.txt", lines)

    elif step == "dict":
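        # Count character frequencies in the ACL training corpus.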
        char_frequencies = {}
        for line in read_lines(paths.ACL_CORPUS_TRAINING_FILE):
            for char in line:
                if char not in char_frequencies:
                    char_frequencies[char] = 1
                else:
                    char_frequencies[char] += 1
        print("== FREQUENCIES ==")
        for char in sorted(char_frequencies):
            print(char, char_frequencies[char])
        print("== ENCODER DICT ==")