Example 1
def get_encoder(n: int = 0) -> CharacterEncoder:
    if n > 0:
        frequencies = load_object(paths.CHARACTER_FREQUENCY_DICT)
        sorted_frequencies = sort_dict_by_value(frequencies)
        most_frequent_chars = [char for char, frequency in sorted_frequencies[:n]]
        code_symbols = most_frequent_chars + [symbols.SOS, symbols.EOS, symbols.UNKNOWN]
        encoder = {symbol: index for index, symbol in enumerate(code_symbols)}
    else:
        encoder = load_object(paths.WIKI_ENCODER_DICT)
    return CharacterEncoder(encoder)
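sort_dict_by_value is imported from src.helper.data_structures in Example 23 below but is not defined in these examples. Judging from its use here (the first n entries are the most frequent characters), it returns the dictionary items sorted by descending value; a minimal sketch under that assumption:

def sort_dict_by_value(d):
    # Return the items as (key, value) pairs, ordered from highest to lowest value.
    return sorted(d.items(), key=lambda item: item[1], reverse=True)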
def get_article_ids_split(out_directory: str) -> Tuple[List[int], List[int], List[int]]:
    """Reads the article IDs of the training, development and test partitions from disk.

    :param out_directory: directory where the IDs are stored
    :return: lists of training, development and test article IDs
    """
    training_ids_path = out_directory + "training_article_ids.pkl"
    development_ids_path = out_directory + "development_article_ids.pkl"
    test_ids_path = out_directory + "test_article_ids.pkl"
    training_ids = load_object(training_ids_path)
    development_ids = load_object(development_ids_path)
    test_ids = load_object(test_ids_path)
    return training_ids, development_ids, test_ids
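Nearly every example relies on the persistence helpers load_object, dump_object and file_exists, imported from src.helper.pickle and, presumably, src.helper.files in the scripts below. Their implementations are not shown; a minimal sketch, assuming they are thin wrappers around pickle and os.path, could look like this:

import os
import pickle


def load_object(path):
    # Deserialize a pickled object from disk.
    with open(path, "rb") as f:
        return pickle.load(f)


def dump_object(obj, path):
    # Serialize an object to disk with pickle.
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def file_exists(path):
    # True if the path points to an existing file.
    return os.path.isfile(path)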
Example 3
 def load(self, name):
     """
     Loads an exported model into the default session.
     Assumes the model is stored at paths.MODEL_DIR + name, including its specification and encoder.
     :param name: name of the model, defines model subfolder
     """
     self.specification = load_object(paths.MODEL_DIR + name +
                                      "/specification.pkl")
     self.encoder = load_object(paths.MODEL_DIR + name + "/encoder.pkl")
     self.estimator = self._make_estimator()
     self._update_saved_model()
     self.predict_fn = tf.contrib.predictor.from_saved_model(
         latest_saved_model_dir(self.model_dir()))
Example 4
def most_frequent_wiki_and_all_aspell_word_counts(k_most_frequent):
    path = paths.WIKI_AND_ASPELL_TOKEN_COUNTERS % k_most_frequent
    if file_exists(path):
        return load_object(path)
    words = most_frequent_tokens(k_most_frequent)
    wiki_word_counters = load_object(paths.WIKI_TOKEN_COUNTERS)
    with open(paths.ASPELL_WORD_FILE) as f:
        for line in f:
            word = line[:-1]
            if word not in words:
                words[word] = wiki_word_counters.get(word, 0)
    dump_object(words, path)
    return words
def corrupt_dataset_single(p, splits, seed):
    """Creates a corrupt dataset in single-file format.

    :param p: corruption probability
    :param splits: subset of {training, development, test}, provided as a list
    :param seed: corruption random seed
    """
    corruptor = _corruptor(p, seed)
    benchmark_name = _benchmark_name(p)
    for split in splits:
        if split == "training":
            correct_sequences_path = paths.WIKI_TRAINING_SEQUENCES
        elif split == "development":
            correct_sequences_path = paths.WIKI_DEVELOPMENT_SEQUENCES
        else:
            correct_sequences_path = paths.WIKI_TEST_SEQUENCES
        correct_sequences = load_object(correct_sequences_path)
        corrupt_sequences = []
        byte_position = 0
        in_path = paths.WIKI_SINGLE_DIR + split + ".txt"
        out_path = paths.WIKI_SINGLE_DIR + "%s_%s.txt" % (benchmark_name, split)
        with open(out_path, 'wb') as out_file:
            for s_i, sequence in enumerate(read_sequences(in_path)):
                corrupt = corruptor.corrupt(sequence)
                s_id = correct_sequences[s_i].id
                encoded = (corrupt + '\n').encode("utf8")
                out_file.write(encoded)
                byte_len = out_file.tell() - byte_position
                char_len = len(corrupt)
                corrupt_sequences.append(Sequence(s_id, split, byte_position, byte_len, char_len))
                byte_position += byte_len
        corrupt_sequences_path = paths.WIKI_OUT_DIR + "%s_%s_sequences.pkl" % (benchmark_name, split)
        dump_object(corrupt_sequences, corrupt_sequences_path)
        corruptor.print_summary()
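The Sequence records written above store byte offsets into the single-file benchmark, so an individual paragraph can be read back with a seek instead of scanning the whole file. A hypothetical helper illustrating the format (read_single_sequence is not part of the original code; the attribute names are assumed to follow the constructor call above):

def read_single_sequence(file_path, sequence):
    # Jump to the paragraph's byte offset and read exactly its byte length.
    with open(file_path, "rb") as f:
        f.seek(sequence.byte_position)
        raw = f.read(sequence.byte_len)
    # Strip the trailing newline that was written after each paragraph.
    return raw.decode("utf8").rstrip("\n")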
 def __init__(self,
              n,
              use_aspell=True,
              postprocessing_method=None,
              verbose=False):
     if use_aspell:
         self.token_counters = most_frequent_wiki_and_all_aspell_word_counts(
             n)
     else:
         self.token_counters = most_frequent_tokens(n)
     self.postprocessing_method = postprocessing_method
     if postprocessing_method == "SVM":
         self.split_model = load_object(paths.SVM_SPLIT_MODEL)
     elif postprocessing_method == "RF":
         self.split_model = load_object(paths.RF_SPLIT_MODEL)
     self.verbose = verbose
 def __init__(self, n=None):
     token_frequencies = load_object(paths.TOKEN_FREQUENCY_DICT)
     if n is None:
         self.tokens = set(token_frequencies)
     else:
         self.tokens = set(
             token
             for token, _ in sort_dict_by_value(token_frequencies)[:n])
     self.max_token_len = max(len(token) for token in self.tokens)
Example 8
def load_most_frequent(n):
    path = None
    if n is not None:
        path = paths.MOST_FREQUENT_UNIGRAMS_DICT % n
        if file_exists(path):
            frequencies = load_object(path)
            return frequencies
    delim_frequencies = load_object(paths.UNIGRAM_DELIM_FREQUENCY_DICT)
    no_delim_frequencies = load_object(paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT)
    frequencies = delim_frequencies
    for token in no_delim_frequencies:
        if token not in frequencies:
            frequencies[token] = no_delim_frequencies[token]
        else:
            frequencies[token] += no_delim_frequencies[token]
    if n is not None:
        frequencies = select_most_frequent(frequencies, n)
        dump_object(frequencies, path)
    return frequencies
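select_most_frequent is not defined in these examples; from its use it apparently keeps the n entries with the highest counts. A minimal sketch under that assumption:

def select_most_frequent(frequencies, n):
    # Keep only the n entries with the highest frequency.
    top = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)[:n]
    return dict(top)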
    def test_sequences(n_sequences: Optional[int] = None,
                       seed: Optional[int] = None):
        """Reads the correct test sequences stored in single-file format.

        :param n_sequences: number of sequences, set None to retrieve all
        :param seed: seed for shuffling, set None for unshuffled
        :return: iterator over paragraph texts
        """
        test_sequences = load_object(paths.WIKI_TEST_SEQUENCES)
        file = paths.WIKI_TEST_FILE
        return Wikipedia._read_sequences(file, test_sequences, seed,
                                         n_sequences)
 def get_evaluation_samples(file_name):
     file_name = paths.WIKI_EVALUATION_DIR + file_name
     original_lines = read_file(file_name +
                                file_names.ORIGINAL_SUFFIX).split('\n')
     original_lines = [line for line in original_lines if len(line) > 0]
     corrupt_lines = read_file(file_name +
                               file_names.CORRUPT_SUFFIX).split('\n')
     corrupt_lines = [line for line in corrupt_lines if len(line) > 0]
     corruptions = load_object(file_name + file_names.CORRUPTIONS_SUFFIX)
     assert (len(corrupt_lines) == len(original_lines))
     assert (len(corruptions) == len(original_lines))
     return list(zip(original_lines, corrupt_lines, corruptions))
Example 11
 def __init__(self,
              fitting_method: FittingMethod = FittingMethod.GREEDY,
              autosave: bool = True):
     self.file = THRESHOLD_FILES[fitting_method]
     if file_exists(self.file):
         self.threshold_dict = load_object(self.file)
     else:
         print(
             "WARNING: could not locate %s. A new, empty decision threshold dictionary was created instead."
             % self.file)
         self.threshold_dict = dict()
     self.autosave = autosave
Example 12
def most_frequent_tokens(n):
    if n is None:
        print("loading counters...")
        return load_object(paths.WIKI_TOKEN_COUNTERS)
    most_frequent_path = paths.WIKI_MOST_FREQUENT_TOKENS % n
    if file_exists(most_frequent_path):
        print("loading most frequent counters...")
        return load_object(most_frequent_path)
    sorted_token_counters_path = paths.WIKI_SORTED_TOKEN_COUNTERS
    if file_exists(sorted_token_counters_path):
        print("loading sorted counters...")
        sorted_token_counters = load_object(sorted_token_counters_path)
    else:
        print("loading counters...")
        token_counters = load_object(paths.WIKI_TOKEN_COUNTERS)
        print("sorting counters...")
        sorted_token_counters = sort_word_counters(token_counters)
        pickle_dump(sorted_token_counters, sorted_token_counters_path)
    most_frequent = sorted_word_counters_to_dict(sorted_token_counters[:n])
    if not file_exists(most_frequent_path):
        pickle_dump(most_frequent, most_frequent_path)
    return most_frequent
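sort_word_counters and sorted_word_counters_to_dict are further helpers that are not shown (pickle_dump presumably mirrors dump_object from the sketch above). Based on how they are used here, minimal sketches could be:

def sort_word_counters(token_counters):
    # (token, count) pairs, ordered from the most to the least frequent token.
    return sorted(token_counters.items(), key=lambda item: item[1], reverse=True)


def sorted_word_counters_to_dict(sorted_token_counters):
    # Turn a sorted list of (token, count) pairs back into a dictionary.
    return dict(sorted_token_counters)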
 def __init__(self,
              two_pass: bool = False,
              seq_acc: bool = False,
              autosave: bool = True):
     if seq_acc:
         self.file = paths.SEQ_ACC_BEAM_SEARCH_PENALTY_FILE
     elif two_pass:
         self.file = paths.TWO_PASS_BEAM_SEARCH_PENALTY_FILE
     else:
         self.file = paths.BEAM_SEARCH_PENALTY_FILE
     if file_exists(self.file):
         self.penalties = load_object(self.file)
     else:
         self.penalties = {}
     self.autosave = autosave
Example 14
def get_stump_dict(unigrams: UnigramHolder) -> Dict[str, Set[str]]:
    if file_exists(paths.STUMP_DICT):
        return load_object(paths.STUMP_DICT)
    else:
        stump_dict = {}
        for token in unigrams.frequencies:
            if not token.isalpha():
                continue
            if unigrams.get(token) < MIN_TOKEN_FREQUENCY:
                continue
            for stump in get_stumps(token):
                if stump not in stump_dict:
                    stump_dict[stump] = {token}
                else:
                    stump_dict[stump].add(token)
        dump_object(stump_dict, paths.STUMP_DICT)
    return stump_dict
    def __init__(self,
                 trained_abbreviations: bool = False,
                 extended_abbreviations: bool = True):
        """
        The default Punkt tokenizer with additional abbreviations.

        :param trained_abbreviations: Use the abbreviations of a Punkt tokenizer trained on Wikipedia.
        :param extended_abbreviations: Use the abbreviations determined by counting the frequencies of tokens with
        and without a trailing dot on Wikipedia.
        """
        self.tokenizer = load_default_nltk_tokenizer()
        if trained_abbreviations:
            wiki_tokenizer = self._load_trained_tokenizer()
            for abbr in wiki_tokenizer._params.abbrev_types:
                self.tokenizer._params.abbrev_types.add(abbr)
        if extended_abbreviations:
            for abbr in load_object(paths.EXTENDED_PUNKT_ABBREVIATIONS):
                self.tokenizer._params.abbrev_types.add(abbr.lower())
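A brief illustration of why the extra abbreviations matter, using NLTK's Punkt tokenizer directly rather than the wrapper class (whose name is not shown in the example):

from nltk.tokenize.punkt import PunktSentenceTokenizer

tokenizer = PunktSentenceTokenizer()
text = "She met Dr. Smith. He agreed."
print(tokenizer.tokenize(text))           # without the abbreviation, Punkt may split after "Dr."
tokenizer._params.abbrev_types.add("dr")
print(tokenizer.tokenize(text))           # "dr" is now treated as an abbreviation, so no split there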
Example 16
    def training_batches(batch_size: int, seed: int = 42) -> Iterator[Batch]:
        """Iterates over batches of correct training sequences stored in split format.

        Loads the number of sequences per sequence length from disk and splits the sequences of each length into
        batches, so every batch contains only sequences of the same length. The split files are read in shuffled
        order, but the batches themselves and the order of batches of the same sequence length are fixed. For
        training over multiple epochs, the sequences and batches should be shuffled as well.

        :param batch_size: maximum number of sequences per batch
        :param seed: seed to shuffle the files
        :return: iterator over batches
        """
        num_sequences = load_object(paths.WIKI_TRAINING_SEQUENCE_COUNTS)
        batches = []
        for seq_len in sorted(num_sequences):
            n_batches = math.ceil(num_sequences[seq_len] / batch_size)
            batches += [seq_len] * n_batches
        random.Random(seed).shuffle(batches)
        file_positions = {}
        for seq_len in batches:
            if seq_len in file_positions:
                start = file_positions[seq_len]
            else:
                start = 0
            path = paths.WIKI_TRAINING_SPLIT_DIR + "%i.txt" % seq_len
            with open(path) as file:
                batch_sequences = []
                file.seek(start)
                for _ in range(batch_size):
                    line = file.readline()
                    if line == "":
                        break
                    sequence = line[:-1]
                    batch_sequences.append(sequence)
                file_positions[seq_len] = file.tell()
                yield Batch(batch_sequences, len(batch_sequences), seq_len)
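A short usage sketch, assuming training_batches is a static method of the Wikipedia class referenced in the test_sequences example above (the attribute names of Batch are not shown, so the sketch only counts batches):

# Hypothetical usage: iterate once over the shuffled training batches.
n_batches = 0
for batch in Wikipedia.training_batches(batch_size=128, seed=42):
    n_batches += 1
print("read %i batches" % n_batches)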
import sys

from project import src
from src.helper.pickle import load_object
from src.settings import paths


if __name__ == "__main__":
    if "test" in sys.argv:
        article_ids = load_object(paths.WIKI_TEST_ARTICLE_IDS)
    else:
        article_ids = load_object(paths.WIKI_DEVELOPMENT_ARTICLE_IDS)
    article_ids = [int(i) for i in article_ids]
    for i in sorted(article_ids):
        print(i)
Example 18
    EPSILON = 1e-16
    lookahead = 2
    labeling = labeling_model_name != "0"
    title = "%s (%s)" % (approach, benchmark_name)

    #BENCHMARKS = ["0_0.1", "0.1_0.1", "arxiv-910k", "nastase-big"]
    BENCHMARKS = [benchmark_name]

    all_insertion_intervals = []
    all_deletion_intervals = []

    for benchmark in BENCHMARKS:
        cases_path = paths.CASES_FILE_NOISY if benchmark.startswith("0.1") else paths.CASES_FILE_CLEAN
        cases_path = cases_path % (model_name, "wikipedia" if benchmark.startswith("0") else benchmark)

        sequence_cases = load_object(cases_path)

        print(len(sequence_cases))

        if labeling_model_name != "0":
            from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator
            labeling_model = BidirectionalLabelingEstimator()
            labeling_model.load(labeling_model_name)

        benchmark = Benchmark(benchmark, Subset.TUNING)
        case_db = []

        correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)
        corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)

        for s_i, (correct, corrupt) in enumerate(zip(correct_sequences, corrupt_sequences)):
        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.ABBREV = 0.3
        trainer.train(text, verbose=True)
        del text

        print("building tokenizer...")
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        abbrevs = tokenizer._params.abbrev_types
        print(sorted(abbrevs))
        print("%i abbreviations" % len(abbrevs))

        target_abbrevs = [
            "i.e", "e.g", "prof", "dr", "m.sc", "no", "nos", "mr", "mrs", "ms",
            "seq", "o.r.s"
        ]
        for target in target_abbrevs:
            print(target, target in abbrevs, score(trainer, target))

        print("saving...")
        dump_object(tokenizer, PICKLE_FILE)

    elif MODE == "starters":
        tokenizer = load_object(PICKLE_FILE)
        for starter in sorted(tokenizer._params.sent_starters):
            print(starter)

    elif MODE == "starters-nltk":
        tokenizer = NLTKSentenceSplitter()
        for starter in sorted(tokenizer.tokenizer._params.sent_starters):
            print(starter)
Example 20
 def __init__(self):
     if file_exists(paths.RESULTS_DICT):
         self.results = load_object(paths.RESULTS_DICT)
     else:
         self.results = {}
import sys

import project
from src.helper.pickle import load_object, dump_object
from src.settings.paths import ESTIMATORS_DIR

if __name__ == "__main__":
    name = sys.argv[1]
    path = ESTIMATORS_DIR + name + "/specification.pkl"
    specification = load_object(path)
    specification.name = name
    dump_object(specification, path)
Example 22
import random

import project
from src.data.raw_wikipedia import get_article_jsons
from src.data.preprocessing import preprocess_sequence
from src.helper.pickle import load_object
from src.settings import paths


def select_random_paragraph(text: str) -> str:
    paragraphs = text.split('\n')
    paragraphs = [preprocess_sequence(paragraph) for paragraph in paragraphs]
    paragraphs = [paragraph for paragraph in paragraphs if len(paragraph) > 0]
    selected = random.choice(paragraphs)
    return selected


if __name__ == "__main__":
    random.seed(1998)

    development_ids = set(load_object(paths.WIKI_DEVELOPMENT_ARTICLE_IDS))
    test_ids = set(load_object(paths.WIKI_TEST_ARTICLE_IDS))
    print(development_ids)
    print(test_ids)

    development_paragraphs = []
    test_paragraphs = []

    for article in get_article_jsons():
        article_id = article["id"]
        is_dev = article_id in development_ids
        is_test = (not is_dev) and article_id in test_ids
        if is_dev or is_test:
            paragraph = select_random_paragraph(article["text"])
            if is_dev:
                development_paragraphs.append(paragraph)
Example 23
import sys

import project
from src.settings import paths
from src.helper.pickle import load_object
from src.helper.data_structures import sort_dict_by_value

if __name__ == "__main__":
    char_dict = load_object(paths.CHARACTER_FREQUENCY_DICT)

    for i, (char, frequency) in enumerate(sort_dict_by_value(char_dict)):
        print(i + 1, char, frequency)
Example 24
 def __init__(self, model, backward):
     self.model = model
     self.encoder = load_object(paths.WIKI_ENCODER_DICT)
     self.backward = backward
 def get_dictionaries():
     encoder = load_object(paths.WIKI_ENCODER_DICT)
     decoder = load_object(paths.WIKI_DECODER_DICT)
     return encoder, decoder
 def load():
     return load_object(paths.BIGRAM_HOLDER)
Example 27
    def get_character_counts() -> Dict[str, int]:
        """Loads the character counts from disk. Path is defined in src.settings.path.

        :return: dictionary char -> count
        """
        return load_object(paths.WIKI_CHARACTER_COUNT_DICT)
Example 28
def get_encoder_from_dict(path: str) -> CharacterEncoder:
    encoder_dict = load_object(path)
    return CharacterEncoder(encoder_dict)
Example 29
 def load(self):
     return load_object(self._file())
import project
from src.settings import paths
from src.helper.files import get_files
from src.helper.pickle import load_object

if __name__ == "__main__":
    for file in sorted(get_files(paths.THRESHOLD_FITTER_DIR)):
        fitter = load_object(paths.THRESHOLD_FITTER_DIR + file)
        print(file, fitter.n_sequences)