def corrupt_dataset_single(p, splits, seed):
    """Creates a corrupt dataset in single-file format.

    :param p: corruption probability
    :param splits: subset of {training, development, test}, provided as a list
    :param seed: corruption random seed
    """
    corruptor = _corruptor(p, seed)
    benchmark_name = _benchmark_name(p)
    for split in splits:
        if split == "training":
            correct_sequences_path = paths.WIKI_TRAINING_SEQUENCES
        elif split == "development":
            correct_sequences_path = paths.WIKI_DEVELOPMENT_SEQUENCES
        else:
            correct_sequences_path = paths.WIKI_TEST_SEQUENCES
        correct_sequences = load_object(correct_sequences_path)
        corrupt_sequences = []
        byte_position = 0
        in_path = paths.WIKI_SINGLE_DIR + split + ".txt"
        out_path = paths.WIKI_SINGLE_DIR + "%s_%s.txt" % (benchmark_name, split)
        with open(out_path, 'wb') as out_file:
            for s_i, sequence in enumerate(read_sequences(in_path)):
                corrupt = corruptor.corrupt(sequence)
                s_id = correct_sequences[s_i].id
                bytes = (corrupt + '\n').encode("utf8")
                out_file.write(bytes)
                byte_len = out_file.tell() - byte_position
                char_len = len(corrupt)
                corrupt_sequences.append(Sequence(s_id, split, byte_position, byte_len, char_len))
                byte_position += byte_len
        corrupt_sequences_path = paths.WIKI_OUT_DIR + "%s_%s_sequences.pkl" % (benchmark_name, split)
        dump_object(corrupt_sequences, corrupt_sequences_path)
        corruptor.print_summary()
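
The byte_position/byte_len bookkeeping is what makes the single-file format randomly accessible: a reader can seek() to a sequence's byte offset and read exactly byte_len bytes instead of scanning the whole file. A minimal self-contained sketch of that idea (not the project's reader; SeqInfo below is a stand-in for the Sequence class):

import tempfile
from collections import namedtuple

# Stand-in for the project's Sequence class (not shown on this page).
SeqInfo = namedtuple("SeqInfo", ["id", "byte_offset", "byte_len"])

lines = ["first sequence", "second sequence", "third sequence"]
infos = []
with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f:
    offset = 0
    for i, line in enumerate(lines):
        data = (line + "\n").encode("utf8")
        f.write(data)
        infos.append(SeqInfo(i, offset, len(data)))
        offset += len(data)
    path = f.name

# Random access: jump straight to the second sequence without scanning the file.
with open(path, "rb") as f:
    f.seek(infos[1].byte_offset)
    print(f.read(infos[1].byte_len).decode("utf8").rstrip("\n"))  # -> "second sequence"
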
Example 2
def count_unigrams(n_sequences: int):
    total_start = timestamp()

    tokenizer = Tokenizer()
    counts_delim = {}
    counts_no_delim = {}

    tokenization_time = 0

    for s_i, sequence in enumerate(Wikipedia.training_sequences(n_sequences)):
        start = timestamp()
        tokens = tokenizer.tokenize(sequence)
        tokens[0].delimiter_before = True
        tokenization_time += time_diff(start)
        for token in tokens:
            counts = counts_delim if token.delimiter_before else counts_no_delim
            if token.text not in counts:
                counts[token.text] = 1
            else:
                counts[token.text] += 1
        if (s_i + 1) % K10 == 0:
            print("%ik sequences, %.2f s total time, %.2f s tokenization" %
                  ((s_i + 1) / K, time_diff(total_start), tokenization_time))
        if (s_i + 1) % M == 0:
            print("saving...")
            dump_object(counts_delim, paths.UNIGRAM_DELIM_FREQUENCY_DICT)
            dump_object(counts_no_delim, paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT)
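
counts_delim and counts_no_delim are plain frequency dictionaries keyed on whether a delimiter precedes the token. The same counting idiom can be written with collections.Counter; in this sketch a bare str.split() stands in for the project's Tokenizer, so every token trivially counts as delimiter-preceded:

from collections import Counter

counts_delim = Counter()
counts_no_delim = Counter()

for sequence in ["the cat sat", "the dog sat"]:
    for token in sequence.split():
        delimiter_before = True  # str.split() only yields whitespace-separated tokens
        counts = counts_delim if delimiter_before else counts_no_delim
        counts[token] += 1       # Counter returns 0 for missing keys, so no membership test is needed

print(counts_delim.most_common(2))  # [('the', 2), ('sat', 2)]
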
Example 3
 def _save_encoder(self):
     """
     Stores the encoder in the model directory.
     File name is encoder.pkl.
     :return:
     """
     make_directory(self.model_dir())
     dump_object(self.encoder, self.model_dir() + "/encoder.pkl")
Example 4
 def _save_specification(self):
     """
     Stores the specification in the model directory.
     File name is specification.pkl.
     :return:
     """
     make_directory(self.model_dir())
     dump_object(self.specification,
                 self.model_dir() + "/specification.pkl")
Example 5
def most_frequent_wiki_and_all_aspell_word_counts(k_most_frequent):
    path = paths.WIKI_AND_ASPELL_TOKEN_COUNTERS % k_most_frequent
    if file_exists(path):
        return load_object(path)
    words = most_frequent_tokens(k_most_frequent)
    wiki_word_counters = load_object(paths.WIKI_TOKEN_COUNTERS)
    with open(paths.ASPELL_WORD_FILE) as f:
        for line in f:
            word = line[:-1]
            if word not in words:
                words[word] = wiki_word_counters.get(word, 0)
    dump_object(words, path)
    return words
Example 6
def get_stump_dict(unigrams: UnigramHolder) -> Dict[str, Set[str]]:
    if file_exists(paths.STUMP_DICT):
        return load_object(paths.STUMP_DICT)
    else:
        stump_dict = {}
        for token in unigrams.frequencies:
            if not token.isalpha():
                continue
            if unigrams.get(token) < MIN_TOKEN_FREQUENCY:
                continue
            for stump in get_stumps(token):
                if stump not in stump_dict:
                    stump_dict[stump] = {token}
                else:
                    stump_dict[stump].add(token)
        dump_object(stump_dict, paths.STUMP_DICT)
    return stump_dict
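
get_stumps() is defined elsewhere in the project. Purely for illustration, the sketch below assumes a stump is the token with one character deleted (a hypothetical stand-in, not necessarily the project's definition) and builds the same kind of inverted index, using setdefault instead of the explicit membership test:

from typing import Dict, Iterable, Set


def get_stumps(token: str) -> Iterable[str]:
    # Hypothetical stand-in: the token with one character removed at each position.
    for i in range(len(token)):
        yield token[:i] + token[i + 1:]


def build_stump_dict(tokens: Iterable[str]) -> Dict[str, Set[str]]:
    stump_dict: Dict[str, Set[str]] = {}
    for token in tokens:
        for stump in get_stumps(token):
            stump_dict.setdefault(stump, set()).add(token)
    return stump_dict


print(build_stump_dict(["where", "there"]).get("here"))  # {'where', 'there'} (set order may vary)
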
Example 7
def load_most_frequent(n):
    path = None
    if n is not None:
        path = paths.MOST_FREQUENT_UNIGRAMS_DICT % n
        if file_exists(path):
            frequencies = load_object(path)
            return frequencies
    delim_frequencies = load_object(paths.UNIGRAM_DELIM_FREQUENCY_DICT)
    no_delim_frequencies = load_object(paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT)
    frequencies = delim_frequencies
    for token in no_delim_frequencies:
        if token not in frequencies:
            frequencies[token] = no_delim_frequencies[token]
        else:
            frequencies[token] += no_delim_frequencies[token]
    if n is not None:
        frequencies = select_most_frequent(frequencies, n)
        dump_object(frequencies, path)
    return frequencies
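
The merge loop adds the no-delimiter counts onto the delimiter counts key by key; collections.Counter addition does the same in one expression, and most_common(n) plays the role of select_most_frequent (defined elsewhere in the project). An equivalent sketch:

from collections import Counter

delim_frequencies = {"the": 10, "cat": 3}
no_delim_frequencies = {"the": 2, "-": 5}

# Counter addition sums the counts of shared keys (and drops non-positive counts).
merged = Counter(delim_frequencies) + Counter(no_delim_frequencies)
print(merged)                        # Counter({'the': 12, '-': 5, 'cat': 3})
print(dict(merged.most_common(2)))   # {'the': 12, '-': 5}, analogous to select_most_frequent(frequencies, 2)
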
Example 8

def split_training_set():
    """Splits the training set in single-file format into separate files, one per sequence length."""

    _remove_training_split_files()
    length_counts = {}
    with open(paths.WIKI_TRAINING_FILE, encoding="utf8") as training_file:
        for line in training_file:
            sequence = line[:-1]
            seq_len = len(sequence)
            if seq_len not in length_counts:
                length_counts[seq_len] = 0
            path = paths.WIKI_TRAINING_SPLIT_DIR + "%i.txt" % seq_len
            with open(path, 'a', encoding="utf8") as file:
                file.write(sequence + "\n")
            length_counts[seq_len] += 1
    dump_object(length_counts, paths.WIKI_TRAINING_SEQUENCE_COUNTS)
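
Writing one file per character length is what later allows padding-free batches: every line in a bucket file has the same length. A sketch of how such a bucket file might be consumed (illustration only, not the project's training loop):

def batches_from_bucket(split_dir: str, seq_len: int, batch_size: int):
    """Yields lists of equal-length sequences read from '<split_dir><seq_len>.txt'."""
    batch = []
    with open(split_dir + "%i.txt" % seq_len, encoding="utf8") as f:
        for line in f:
            batch.append(line[:-1])
            if len(batch) == batch_size:
                yield batch
                batch = []
    if batch:
        yield batch

# e.g. for batch in batches_from_bucket(paths.WIKI_TRAINING_SPLIT_DIR, 128, 32): ...
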
Example 9
def split_dataset(wiki_text_directory, out_directory, n_split):
    """
    Reads all article IDs from an extracted wikipedia dump, splits them into training, development and test sets and
    pickles the three sets as lists.
    :param wiki_text_directory: Link to a directory created by the WikiExtractor script.
        Assumes subdirectories to contain files where each line corresponds to an article json.
    :param out_directory: directory where the ID lists are stored
    :param n_split: number of articles of the training and test sets
    """
    training_ids, development_ids, test_ids = split_article_ids(
        wiki_text_directory, n_split)
    training_ids_path = out_directory + "training_article_ids.pkl"
    development_ids_path = out_directory + "development_article_ids.pkl"
    test_ids_path = out_directory + "test_article_ids.pkl"
    dump_object(training_ids, training_ids_path)
    dump_object(development_ids, development_ids_path)
    dump_object(test_ids, test_ids_path)
    print(
        "Split dataset into %i training articles, %i development articles, %i test articles."
        % (len(training_ids), len(development_ids), len(test_ids)))
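
split_article_ids is not shown on this page. One plausible scheme, sketched below purely as an assumption (the real function may assign the partitions differently), is a deterministic shuffle followed by slicing:

import random


def split_ids_sketch(article_ids, n_split, seed=42):
    # Hypothetical: shuffle deterministically, slice off two held-out sets, keep the rest for training.
    ids = sorted(article_ids)
    random.Random(seed).shuffle(ids)
    development_ids = ids[:n_split]
    test_ids = ids[n_split:2 * n_split]
    training_ids = ids[2 * n_split:]
    return training_ids, development_ids, test_ids
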
Example 10
 def save(self):
     dump_object(self.threshold_dict, self.file)
Example 11
def write_dataset_split_files(wiki_text_directory, dev_ids, test_ids):
    """
    Reads the articles from an extracted Wikipedia dump and writes three files, each containing the paragraphs
    of one partition.
    Also dumps three lists containing the sequences as Sequence objects.
    The output file names are defined in src.settings.paths.

    :param wiki_text_directory: directory of the Wikipedia dump, containing folders with files where each line
        is an article as JSON
    :param dev_ids: set of article IDs for the development set
    :param test_ids: set of article IDs for the test set
    :return:
    """
    articles = get_article_jsons(wiki_text_directory)

    training_file = open(paths.WIKI_TRAINING_FILE, 'wb')
    development_file = open(paths.WIKI_DEVELOPMENT_FILE, 'wb')
    test_file = open(paths.WIKI_TEST_FILE, 'wb')

    training_sequences = []
    development_sequences = []
    test_sequences = []

    sequence_id = 0
    for article in articles:
        article_id = article["id"]
        paragraphs = get_paragraphs(article)
        for paragraph in paragraphs:
            char_len = len(paragraph)
            bytes = (paragraph + '\n').encode(FILE_ENCODING)
            if article_id in dev_ids:
                byte_offset = development_file.tell()
                development_file.write(bytes)
                byte_len = development_file.tell() - byte_offset
                sequence = Sequence(sequence_id, DatasetSplit.DEVELOPMENT,
                                    byte_offset, byte_len, char_len)
                development_sequences.append(sequence)
            elif article_id in test_ids:
                byte_offset = test_file.tell()
                test_file.write(bytes)
                byte_len = test_file.tell() - byte_offset
                sequence = Sequence(sequence_id, DatasetSplit.TEST,
                                    byte_offset, byte_len, char_len)
                test_sequences.append(sequence)
            else:
                byte_offset = training_file.tell()
                training_file.write(bytes)
                byte_len = training_file.tell() - byte_offset
                sequence = Sequence(sequence_id, DatasetSplit.TRAINING,
                                    byte_offset, byte_len, char_len)
                training_sequences.append(sequence)
            sequence_id += 1

    training_file.close()
    development_file.close()
    test_file.close()
    del articles

    dump_object(training_sequences, paths.WIKI_TRAINING_SEQUENCES)
    dump_object(development_sequences, paths.WIKI_DEVELOPMENT_SEQUENCES)
    dump_object(test_sequences, paths.WIKI_TEST_SEQUENCES)
Example 12

    CANDIDATE_FILE = paths.DUMP_DIR + "tmp_punkt_candidates.pkl"
    COUNT_FILE = paths.DUMP_DIR + "tmp_punkt_counts.pkl"
    ABBREVIATIONS_FILE = paths.EXTENDED_PUNKT_ABBREVIATIONS

    if MODE == "candidates":
        all_tokens = set()
        for sequence in sequences():
            for token in sequence.split():
                all_tokens.add(token)
        candidates = set()
        for token in all_tokens:
            is_candidate, candidate = abbreviation_candidate(token)
            if is_candidate:
                candidates.add(candidate)
        dump_object(candidates, CANDIDATE_FILE)

    elif MODE == "print-candidates":
        candidates = load_object(CANDIDATE_FILE)
        for c in sorted(candidates):
            print(c)

    elif MODE == "count":
        counts = {
            candidate: [0, 0]
            for candidate in load_object(CANDIDATE_FILE)
        }
        for sequence in sequences():
            tokens = sequence.split()
            for token in tokens:
                if token in counts:
Example 13
            space_state = model.step(state, space_label, include_sequence=False)
            no_space_state = state
            p_after_space = []
            p_after_no_space = []
            for j, label in enumerate(next_labels):
                char = model.encoder.decode_label(label)
                space_p = space_state["probabilities"][label]
                no_space_p = no_space_state["probabilities"][label]
                #print("", j, label, char, space_p, no_space_p)
                p_after_space.append(space_p)
                p_after_no_space.append(no_space_p)
                if j < LOOKAHEAD:
                    space_state = model.step(space_state, label, include_sequence=False)
                    no_space_state = model.step(no_space_state, label, include_sequence=False)
            case = Case(sequence_index=s_i,
                        position=i,
                        true_space=is_space,
                        p_space=p_space,
                        p_after_space=p_after_space,
                        p_after_no_space=p_after_no_space)
            cases[-1].append(case)
        if model.specification.backward:
            cases[-1] = cases[-1][::-1]

        if (s_i + 1) % 1000 == 0:
            dump_object(cases, path)
            print("saved at", path)

    dump_object(cases, path)
    print("saved at", path)
Example 14
 def dump(self, fitter: ThresholdFitter):
     dump_object(fitter, self._file())
Example 15

            lines = [
                line for line in lines
                if sum(1 if c == "?" else 0 for c in line) < 4
            ]  # remove lines with many ?s
            print(len(lines), "lines")
            write_lines(paths.ACL_CORPUS_DIR + split + ".txt", lines)
            random.shuffle(lines)
            write_lines(paths.ACL_CORPUS_DIR + split + "_shuffled.txt", lines)

    elif step == "dict":
        char_frequencies = {}
        for line in read_lines(paths.ACL_CORPUS_TRAINING_FILE):
            for char in line:
                if char not in char_frequencies:
                    char_frequencies[char] = 1
                else:
                    char_frequencies[char] += 1
        print("== FREQUENCIES ==")
        for char in sorted(char_frequencies):
            print(char, char_frequencies[char])
        print("== ENCODER DICT ==")
        encoder_dict = {}
        for char in sorted(char_frequencies):
            if char_frequencies[char] > 10:
                encoder_dict[char] = len(encoder_dict)
        encoder_dict[symbols.SOS] = len(encoder_dict)
        encoder_dict[symbols.EOS] = len(encoder_dict)
        encoder_dict[symbols.UNKNOWN] = len(encoder_dict)
        print(encoder_dict)
        dump_object(encoder_dict, paths.ACL_ENCODER_DICT)
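
The resulting encoder dict maps each sufficiently frequent character to an integer id and appends ids for the SOS, EOS and UNKNOWN symbols; unseen characters then fall back to the UNKNOWN id when encoding. A self-contained sketch of that usage, with placeholder strings standing in for the project's symbols module:

# "<SOS>", "<EOS>", "<UNK>" are placeholders for symbols.SOS, symbols.EOS, symbols.UNKNOWN.
char_frequencies = {"a": 40, "b": 12, "q": 3}
encoder_dict = {}
for char in sorted(char_frequencies):
    if char_frequencies[char] > 10:
        encoder_dict[char] = len(encoder_dict)
for special in ("<SOS>", "<EOS>", "<UNK>"):
    encoder_dict[special] = len(encoder_dict)


def encode(text):
    unknown_id = encoder_dict["<UNK>"]
    return ([encoder_dict["<SOS>"]]
            + [encoder_dict.get(char, unknown_id) for char in text]
            + [encoder_dict["<EOS>"]])


print(encoder_dict)   # {'a': 0, 'b': 1, '<SOS>': 2, '<EOS>': 3, '<UNK>': 4}
print(encode("aqb"))  # [2, 0, 4, 1, 3]: 'q' is too rare and maps to <UNK>
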
Example 16

 def save(self):
     dump_object(self, paths.BIGRAM_HOLDER)
Example 17
 def save_training_results(self):
     dump_object(self.training_results,
                 self._path_to_file("training_results.pkl"))
Example 18

        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.ABBREV = 0.3
        trainer.train(text, verbose=True)
        del text

        print("building tokenizer...")
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        abbrevs = tokenizer._params.abbrev_types
        print(sorted(abbrevs))
        print("%i abbreviations" % len(abbrevs))

        target_abbrevs = [
            "i.e", "e.g", "prof", "dr", "m.sc", "no", "nos", "mr", "mrs", "ms",
            "seq", "o.r.s"
        ]
        for target in target_abbrevs:
            print(target, target in abbrevs, score(trainer, target))

        print("saving...")
        dump_object(tokenizer, PICKLE_FILE)

    elif MODE == "starters":
        tokenizer = load_object(PICKLE_FILE)
        for starter in sorted(tokenizer._params.sent_starters):
            print(starter)

    elif MODE == "starters-nltk":
        tokenizer = NLTKSentenceSplitter()
        for starter in sorted(tokenizer.tokenizer._params.sent_starters):
            print(starter)
Example 19

    print(len(training_lines), "lines")
    write_lines(paths.ARXIV_TRAINING_LINES, training_lines)

    print(sum(1 for line in training_lines if len(line) > 256), "length > 256")

    training_sentences = []
    for line in training_lines:
        sentences = split_sentences(line)
        training_sentences.extend(sentences)

    print(len(training_sentences), "sentences")
    write_lines(paths.ARXIV_TRAINING_SEQUENCES, training_sentences)

    char_frequencies = {}
    for sentence in training_sentences:
        for char in sentence:
            if char not in char_frequencies:
                char_frequencies[char] = 1
            else:
                char_frequencies[char] += 1
    encoder = {
        char: i
        for i, char in enumerate(
            sorted(select_most_frequent(char_frequencies, 200)))
    }
    encoder[symbols.SOS] = len(encoder)
    encoder[symbols.EOS] = len(encoder)
    encoder[symbols.UNKNOWN] = len(encoder)
    print(encoder)
    dump_object(encoder, paths.ARXIV_ENCODER_DICT)
Example 20
import project
from src.datasets.wikipedia import Wikipedia
from src.settings import paths
from src.helper.pickle import dump_object

K = 100000
M = 10 * K

if __name__ == "__main__":
    token_frequencies = {}
    for s_i, sequence in enumerate(Wikipedia.training_sequences()):
        if s_i % K == 0:
            print("%.1fM sequences, %.1fM tokens" %
                  (s_i / M, len(token_frequencies) / M))
        tokens = sequence.split()
        for token in tokens:
            if token not in token_frequencies:
                token_frequencies[token] = 1
            else:
                token_frequencies[token] += 1
    dump_object(token_frequencies, paths.TOKEN_FREQUENCY_DICT)
Example 21

import sys

import project
from src.helper.pickle import load_object, dump_object
from src.settings.paths import ESTIMATORS_DIR

if __name__ == "__main__":
    name = sys.argv[1]
    path = ESTIMATORS_DIR + name + "/specification.pkl"
    specification = load_object(path)
    specification.name = name
    dump_object(specification, path)
Example 22

 def save(self):
     dump_object(self.penalties, self.file)
Example 23
 def save(self):
     dump_object(self.results, paths.RESULTS_DICT)
Example 24

 def save(self):
     dump_object(self, self.benchmark_file())
Example 25
import sys

from project import src
from src.helper.files import read_sequences
from src.helper.data_structures import select_most_frequent
from src.settings import symbols
from src.helper.pickle import dump_object

if __name__ == "__main__":
    text_file = sys.argv[1]
    out_file = sys.argv[2]

    char_frequencies = {}

    for i, line in enumerate(read_sequences(text_file)):
        for char in line:
            if char not in char_frequencies:
                char_frequencies[char] = 1
            else:
                char_frequencies[char] += 1
        if (i + 1) % 100000 == 0:
            print(i + 1, "lines", len(char_frequencies), "unique characters")
        if (i + 1) == 10000000:
            break

    chars = select_most_frequent(char_frequencies, 200)
    symbs = [symbols.SOS, symbols.EOS, symbols.UNKNOWN]
    encoder = {symbol: i for i, symbol in enumerate(sorted(chars) + symbs)}

    dump_object(encoder, out_file)
Example 26
"""
Counts the occurrences of all characters in the Wikipedia training set and stores the counts on disk.
"""

import sys

from project import src
from src.data.wikipedia import Wikipedia
from src.helper.pickle import dump_object
from src.settings import paths


def count_chars(counters, sequence):
    for char in sequence:
        if char not in counters:
            counters[char] = 1
        else:
            counters[char] += 1


if __name__ == "__main__":
    n_sequences = int(sys.argv[1])
    char_counters = {}
    for i, sequence in enumerate(Wikipedia.training_sequences(n_sequences)):
        count_chars(char_counters, sequence)
        if (i + 1) % 100000 == 0:
            print("%i sequences processed" % (i + 1))
    print(char_counters)
    dump_object(char_counters, paths.CHARACTER_FREQUENCY_DICT)