Example #1
def _remove_saved_models(self):
    # Delete every subdirectory of the model directory that contains an
    # exported SavedModel (identified by its saved_model.pb file).
    files = get_files(self.model_dir())
    saved_models = [
        f for f in files
        if file_exists(self.model_dir() + "/" + f + "/saved_model.pb")
    ]
    for model_name in saved_models:
        remove_dir(self.model_dir() + "/" + model_name)
Example #2
def __init__(self,
             fitting_method: FittingMethod = FittingMethod.GREEDY,
             autosave: bool = True):
    # Load the decision threshold dictionary for the given fitting method,
    # or start with an empty one if no threshold file exists yet.
    self.file = THRESHOLD_FILES[fitting_method]
    if file_exists(self.file):
        self.threshold_dict = load_object(self.file)
    else:
        print(
            "WARNING: could not locate %s. A new, empty decision threshold dictionary was created instead."
            % self.file)
        self.threshold_dict = dict()
    self.autosave = autosave
Example #3
def most_frequent_tokens(n):
    # Return a dict with the n most frequent Wikipedia tokens. The sorted
    # counters and the per-n selection are cached on disk, so repeated calls
    # can skip the expensive sorting step. For n = None, the full counter
    # dict is returned.
    if n is None:
        print("loading counters...")
        return load_object(paths.WIKI_TOKEN_COUNTERS)
    most_frequent_path = paths.WIKI_MOST_FREQUENT_TOKENS % n
    if file_exists(most_frequent_path):
        print("loading most frequent counters...")
        return load_object(most_frequent_path)
    sorted_token_counters_path = paths.WIKI_SORTED_TOKEN_COUNTERS
    if file_exists(sorted_token_counters_path):
        print("loading sorted counters...")
        sorted_token_counters = load_object(sorted_token_counters_path)
    else:
        print("loading counters...")
        token_counters = load_object(paths.WIKI_TOKEN_COUNTERS)
        print("sorting counters...")
        sorted_token_counters = sort_word_counters(token_counters)
        pickle_dump(sorted_token_counters, sorted_token_counters_path)
    most_frequent = sorted_word_counters_to_dict(sorted_token_counters[:n])
    if not file_exists(most_frequent_path):
        pickle_dump(most_frequent, most_frequent_path)
    return most_frequent
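
A minimal usage sketch for the caching helper above; the value passed for n is an arbitrary placeholder, not taken from the source:

# Hypothetical call: returns a dict with the 100,000 most frequent Wikipedia
# tokens, loading the cached pickle if it exists and building it otherwise.
top_tokens = most_frequent_tokens(100000)
print(len(top_tokens))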
Example #4
def most_frequent_wiki_and_all_aspell_word_counts(k_most_frequent):
    # Combine the k most frequent Wikipedia tokens with all aspell dictionary
    # words, counting each aspell word with its Wikipedia frequency (or 0 if
    # unseen), and cache the resulting dict on disk.
    path = paths.WIKI_AND_ASPELL_TOKEN_COUNTERS % k_most_frequent
    if file_exists(path):
        return load_object(path)
    words = most_frequent_tokens(k_most_frequent)
    wiki_word_counters = load_object(paths.WIKI_TOKEN_COUNTERS)
    with open(paths.ASPELL_WORD_FILE) as f:
        for line in f:
            word = line[:-1]  # strip the trailing newline
            if word not in words:
                words[word] = wiki_word_counters.get(word, 0)
    dump_object(words, path)
    return words
Example #5
def __init__(self,
             two_pass: bool = False,
             seq_acc: bool = False,
             autosave: bool = True):
    # Pick the penalty file matching the decoding mode and load the stored
    # beam search penalties, falling back to an empty dict.
    if seq_acc:
        self.file = paths.SEQ_ACC_BEAM_SEARCH_PENALTY_FILE
    elif two_pass:
        self.file = paths.TWO_PASS_BEAM_SEARCH_PENALTY_FILE
    else:
        self.file = paths.BEAM_SEARCH_PENALTY_FILE
    if file_exists(self.file):
        self.penalties = load_object(self.file)
    else:
        self.penalties = {}
    self.autosave = autosave
Example #6
def latest_saved_model_dir(path):
    """
    Finds the most recent timestamped subfolder at the given path.
    Assumes folders are named by a timestamp.
    :param path: path that may contain timestamped subfolders
    :return: string consisting of path/latest_timestamped_subfolder, or None if no saved model exists
    """
    files = get_files(path)
    saved_model_dirs = [
        f for f in files
        if "temp" not in f and file_exists(path + "/" + f + "/saved_model.pb")
    ]
    if len(saved_model_dirs) == 0:
        return None
    latest = max(int(model_dir) for model_dir in saved_model_dirs)
    return path + "/" + str(latest)
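
A short usage sketch for the helper above; the checkpoint directory name below is a hypothetical placeholder:

# Hypothetical usage: locate the newest exported SavedModel under a checkpoint root.
model_root = "checkpoints/my_model"  # placeholder path
latest = latest_saved_model_dir(model_root)
if latest is None:
    print("no SavedModel found under %s" % model_root)
else:
    print("loading SavedModel from %s" % latest)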
Example #7
def get_stump_dict(unigrams: UnigramHolder) -> Dict[str, Set[str]]:
    # Map each stump produced by get_stumps to the set of frequent alphabetic
    # tokens it comes from, caching the dictionary on disk.
    if file_exists(paths.STUMP_DICT):
        return load_object(paths.STUMP_DICT)
    else:
        stump_dict = {}
        for token in unigrams.frequencies:
            if not token.isalpha():
                continue
            if unigrams.get(token) < MIN_TOKEN_FREQUENCY:
                continue
            for stump in get_stumps(token):
                if stump not in stump_dict:
                    stump_dict[stump] = {token}
                else:
                    stump_dict[stump].add(token)
        dump_object(stump_dict, paths.STUMP_DICT)
    return stump_dict
Example #8
def load_most_frequent(n):
    # Merge the unigram frequency dicts with and without delimiters and,
    # if n is given, keep only the n most frequent entries (cached on disk).
    path = None
    if n is not None:
        path = paths.MOST_FREQUENT_UNIGRAMS_DICT % n
        if file_exists(path):
            frequencies = load_object(path)
            return frequencies
    delim_frequencies = load_object(paths.UNIGRAM_DELIM_FREQUENCY_DICT)
    no_delim_frequencies = load_object(paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT)
    frequencies = delim_frequencies
    for token in no_delim_frequencies:
        if token not in frequencies:
            frequencies[token] = no_delim_frequencies[token]
        else:
            frequencies[token] += no_delim_frequencies[token]
    if n is not None:
        frequencies = select_most_frequent(frequencies, n)
        dump_object(frequencies, path)
    return frequencies
Example #9
def __init__(self):
    # Load previously stored results, or start with an empty results dict.
    if file_exists(paths.RESULTS_DICT):
        self.results = load_object(paths.RESULTS_DICT)
    else:
        self.results = {}
Example #10
import sys

import numpy as np

import project
from src.helper.files import read_lines, file_exists
from src.benchmark.benchmark import all_benchmarks, ERROR_PROBABILITIES, Subset, BenchmarkFiles

if __name__ == "__main__":
    file_name = sys.argv[1]
    per_chars = 1000

    t_mean = []
    t_normalized = []

    for benchmark in all_benchmarks(Subset.TEST):
        print("== %s ==" % benchmark.name)
        path = benchmark.get_results_directory() + file_name
        total_runtime = float(read_lines(path)[-1]) if file_exists(path) else 0
        mean_runtime = total_runtime / 10000  # per-sequence mean, assuming 10,000 sequences per benchmark
        t_mean.append(mean_runtime)
        print("mean = %.2f" % mean_runtime)
        n_chars = sum(len(sequence) for sequence in benchmark.get_sequences(BenchmarkFiles.CORRUPT))
        normalized_runtime = total_runtime / n_chars * per_chars
        t_normalized.append(normalized_runtime)
        print("normalized(%i chars) = %.2f" % (per_chars, normalized_runtime))

    print("== total ==")
    print("mean = %.2f" % np.mean(t_mean))
    print("normalized(%i chars) = %.2f" % (per_chars, np.mean(t_normalized)))
Example #11
def _load_training_results(self):
    # Restore previously saved training results from disk, if present.
    path = self._path_to_file("training_results.pkl")
    if file_exists(path):
        self.training_results = load_object(path)