def _remove_saved_models(self):
    # Delete every subdirectory of the model directory that contains a saved model.
    files = get_files(self.model_dir())
    saved_models = [
        f for f in files
        if file_exists(self.model_dir() + "/" + f + "/saved_model.pb")
    ]
    for model_name in saved_models:
        remove_dir(self.model_dir() + "/" + model_name)

def __init__(self, fitting_method: FittingMethod = FittingMethod.GREEDY, autosave: bool = True):
    self.file = THRESHOLD_FILES[fitting_method]
    if file_exists(self.file):
        self.threshold_dict = load_object(self.file)
    else:
        print("WARNING: could not locate %s. A new, empty decision threshold dictionary was created instead."
              % self.file)
        self.threshold_dict = dict()
    self.autosave = autosave

def most_frequent_tokens(n):
    if n is None:
        print("loading counters...")
        return load_object(paths.WIKI_TOKEN_COUNTERS)
    most_frequent_path = paths.WIKI_MOST_FREQUENT_TOKENS % n
    if file_exists(most_frequent_path):
        print("loading most frequent counters...")
        return load_object(most_frequent_path)
    sorted_token_counters_path = paths.WIKI_SORTED_TOKEN_COUNTERS
    if file_exists(sorted_token_counters_path):
        print("loading sorted counters...")
        sorted_token_counters = load_object(sorted_token_counters_path)
    else:
        print("loading counters...")
        token_counters = load_object(paths.WIKI_TOKEN_COUNTERS)
        print("sorting counters...")
        sorted_token_counters = sort_word_counters(token_counters)
        pickle_dump(sorted_token_counters, sorted_token_counters_path)
    most_frequent = sorted_word_counters_to_dict(sorted_token_counters[:n])
    if not file_exists(most_frequent_path):
        pickle_dump(most_frequent, most_frequent_path)
    return most_frequent

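# Usage sketch (hypothetical call, assuming the Wikipedia counter files from
# paths exist on disk): fetch the 100,000 most frequent tokens, building and
# caching the sorted counter file on the first call and reusing it afterwards.
#
#     counters = most_frequent_tokens(100000)
#     print(len(counters))
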
def most_frequent_wiki_and_all_aspell_word_counts(k_most_frequent):
    # Combine the k most frequent Wikipedia tokens with all aspell words,
    # using the Wikipedia count for an aspell word when available and 0 otherwise.
    path = paths.WIKI_AND_ASPELL_TOKEN_COUNTERS % k_most_frequent
    if file_exists(path):
        return load_object(path)
    words = most_frequent_tokens(k_most_frequent)
    wiki_word_counters = load_object(paths.WIKI_TOKEN_COUNTERS)
    with open(paths.ASPELL_WORD_FILE) as f:
        for line in f:
            word = line[:-1]
            if word not in words:
                words[word] = wiki_word_counters[word] if word in wiki_word_counters else 0
    dump_object(words, path)
    return words

def __init__(self, two_pass: bool = False, seq_acc: bool = False, autosave: bool = True):
    if seq_acc:
        self.file = paths.SEQ_ACC_BEAM_SEARCH_PENALTY_FILE
    elif two_pass:
        self.file = paths.TWO_PASS_BEAM_SEARCH_PENALTY_FILE
    else:
        self.file = paths.BEAM_SEARCH_PENALTY_FILE
    if file_exists(self.file):
        self.penalties = load_object(self.file)
    else:
        self.penalties = {}
    self.autosave = autosave

def latest_saved_model_dir(path):
    """
    Finds the most recent timestamped subfolder at the given path.
    Assumes folders are named by a timestamp.

    :param path: path containing at least one timestamped subfolder
    :return: string consisting of path/latest_timestamped_subfolder, or None if no saved model exists
    """
    files = get_files(path)
    saved_model_dirs = [
        f for f in files
        if "temp" not in f and file_exists(path + "/" + f + "/saved_model.pb")
    ]
    if len(saved_model_dirs) == 0:
        return None
    latest = sorted([int(model_dir) for model_dir in saved_model_dirs])[-1]
    return path + "/" + str(latest)

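# Usage sketch (hypothetical directory layout): given a folder such as
# "models/fwd1024" containing timestamped subfolders like "1584026400", each
# holding a saved_model.pb, this returns the path to the newest one, or None
# when no such subfolder exists.
#
#     model_dir = latest_saved_model_dir("models/fwd1024")
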
def get_stump_dict(unigrams: UnigramHolder) -> Dict[str, Set[str]]:
    # Map each stump to the set of frequent alphabetic tokens it is derived from,
    # loading the mapping from disk if it was built before.
    if file_exists(paths.STUMP_DICT):
        return load_object(paths.STUMP_DICT)
    else:
        stump_dict = {}
        for token in unigrams.frequencies:
            if not token.isalpha():
                continue
            if unigrams.get(token) < MIN_TOKEN_FREQUENCY:
                continue
            for stump in get_stumps(token):
                if stump not in stump_dict:
                    stump_dict[stump] = {token}
                else:
                    stump_dict[stump].add(token)
        dump_object(stump_dict, paths.STUMP_DICT)
        return stump_dict

def load_most_frequent(n):
    path = None
    if n is not None:
        path = paths.MOST_FREQUENT_UNIGRAMS_DICT % n
        if file_exists(path):
            frequencies = load_object(path)
            return frequencies
    delim_frequencies = load_object(paths.UNIGRAM_DELIM_FREQUENCY_DICT)
    no_delim_frequencies = load_object(paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT)
    frequencies = delim_frequencies
    for token in no_delim_frequencies:
        if token not in frequencies:
            frequencies[token] = no_delim_frequencies[token]
        else:
            frequencies[token] += no_delim_frequencies[token]
    if n is not None:
        frequencies = select_most_frequent(frequencies, n)
        dump_object(frequencies, path)
    return frequencies

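# Illustration (hypothetical counts): if a token occurs 10 times in the delimiter
# dictionary and 7 times in the no-delimiter dictionary, the merged frequency is 17;
# a token present in only one dictionary keeps its single count. When n is given,
# only the n most frequent merged entries are kept and cached at path.
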
def __init__(self):
    if file_exists(paths.RESULTS_DICT):
        self.results = load_object(paths.RESULTS_DICT)
    else:
        self.results = {}

import sys

import numpy as np

import project
from src.helper.files import read_lines, file_exists
from src.benchmark.benchmark import all_benchmarks, ERROR_PROBABILITIES, Subset, BenchmarkFiles


if __name__ == "__main__":
    file_name = sys.argv[1]
    per_chars = 1000

    t_mean = []
    t_normalized = []
    for benchmark in all_benchmarks(Subset.TEST):
        print("== %s ==" % benchmark.name)
        path = benchmark.get_results_directory() + file_name
        total_runtime = float(read_lines(path)[-1]) if file_exists(path) else 0
        mean_runtime = total_runtime / 10000
        t_mean.append(mean_runtime)
        print("mean = %.2f" % mean_runtime)
        n_chars = sum(len(sequence) for sequence in benchmark.get_sequences(BenchmarkFiles.CORRUPT))
        normalized_runtime = total_runtime / n_chars * per_chars
        t_normalized.append(normalized_runtime)
        print("normalized(%i chars) = %.2f" % (per_chars, normalized_runtime))

    print("== total ==")
    print("mean = %.2f" % np.mean(t_mean))
    print("normalized(%i chars) = %.2f" % (per_chars, np.mean(t_normalized)))

def _load_training_results(self):
    path = self._path_to_file("training_results.pkl")
    if file_exists(path):
        self.training_results = load_object(path)