def _load_state_tying(self, reload=False):
  """
  Loads the state tying map from a file and stores it as self.state_tying,
  where state_tying.allo_map (allophone syntax -> int) is the important part.

  :param bool reload: whether the state tying file should be reloaded
  """
  from os.path import isfile
  from Log import log
  from LmDataset import StateTying
  if not isinstance(self.state_tying, StateTying):
    reload = True
  if reload:
    print("Loading state tying file:", self.state_tying_name)
    assert isfile(self.state_tying_name), "State tying file does not exist"
    log.initialize(verbosity=[5])
    self.state_tying = StateTying(self.state_tying_name)
    print("Finished state tying mapping:", len(self.state_tying.allo_map), "allos to int")
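# Usage sketch for the lazy loader above. The holder class is hypothetical;
# only the state_tying_name / state_tying attributes and the StateTying type
# are taken from the snippet, everything else is an assumption.
class _StateTyingHolder:
  def __init__(self, state_tying_name):
    self.state_tying_name = state_tying_name  # path to the state tying file
    self.state_tying = None  # not a StateTying instance yet -> forces a load

  _load_state_tying = _load_state_tying  # reuse the function above as a method


holder = _StateTyingHolder("state-tying.txt.gz")
holder._load_state_tying()  # first call loads the file
holder._load_state_tying()  # later calls are no-ops unless reload=True is passed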
def _main(argv):
  import better_exchook
  better_exchook.install()
  log.initialize(verbosity=[5])
  print("LmDataset demo startup")
  kwargs = eval(argv[0])
  print("Creating LmDataset with kwargs=%r ..." % kwargs)
  dataset = LmDataset(**kwargs)
  print("init_seq_order ...")
  dataset.init_seq_order(epoch=1)
  seq_idx = 0
  last_log_time = time.time()
  print("start iterating through seqs ...")
  while dataset.is_less_than_num_seqs(seq_idx):
    if seq_idx == 0:
      print("load_seqs with seq_idx=%i ...." % seq_idx)
    dataset.load_seqs(seq_idx, seq_idx + 1)
    if time.time() - last_log_time > 2.0:
      last_log_time = time.time()
      print(
        "Loading %s progress, %i/%i (%.0f%%) seqs loaded (%.0f%% skipped), (%.0f%% unknown) total syms %i ..." % (
          dataset.__class__.__name__, dataset.next_orth_idx, dataset.estimated_num_seqs,
          100.0 * dataset.next_orth_idx / dataset.estimated_num_seqs,
          100.0 * dataset.num_skipped / (dataset.next_orth_idx or 1),
          100.0 * dataset.num_unknown / dataset._num_timesteps_accumulated["data"],
          dataset._num_timesteps_accumulated["data"]))
    seq_idx += 1
  print("finished iterating, num seqs: %i" % seq_idx)
  print("dataset len:", dataset.len_info())
def init(config_filename, cmd_line_opts, dataset_config_str):
  """
  :param str config_filename: global config for CRNN
  :param list[str] cmd_line_opts: options for init_config method
  :param str dataset_config_str: dataset via init_dataset()
  """
  rnn.init_better_exchook()
  rnn.init_thread_join_hack()
  if config_filename:
    rnn.init_config(config_filename, cmd_line_opts)
    rnn.init_log()
  else:
    log.initialize(verbosity=[5])
  print("Returnn hdf_dump starting up.", file=log.v3)
  rnn.init_faulthandler()
  if config_filename:
    rnn.init_data()
    rnn.print_task_properties()
    assert isinstance(rnn.train_data, Dataset)
    dataset = rnn.train_data
  else:
    assert dataset_config_str
    dataset = init_dataset(dataset_config_str)
  print("Source dataset:", dataset.len_info(), file=log.v3)
  return dataset
def init(config_filename, cmd_line_opts, dataset_config_str):
  """
  :param str config_filename: global config for CRNN
  :param list[str] cmd_line_opts: options for initConfig method
  :param str dataset_config_str: dataset via init_dataset_via_str()
  """
  rnn.initBetterExchook()
  rnn.initThreadJoinHack()
  if config_filename:
    rnn.initConfig(config_filename, cmd_line_opts)
    rnn.initLog()
  else:
    log.initialize(verbosity=[5])
  print >> log.v3, "CRNN dump-dataset starting up."
  rnn.initFaulthandler()
  rnn.initConfigJsonNetwork()
  if config_filename:
    rnn.initData()
    rnn.printTaskProperties()
    assert isinstance(rnn.train_data, Dataset)
    return rnn.train_data
  else:
    assert dataset_config_str
    dataset = init_dataset_via_str(dataset_config_str)
    print >> log.v3, "Source dataset:", dataset.len_info()
    return dataset
def init(config_filename, cmd_line_opts, dataset_config_str):
  """
  :param str config_filename: global config for CRNN
  :param list[str] cmd_line_opts: options for initConfig method
  :param str dataset_config_str: dataset via init_dataset_via_str()
  """
  rnn.initBetterExchook()
  rnn.initThreadJoinHack()
  if config_filename:
    rnn.initConfig(config_filename, cmd_line_opts)
    rnn.initLog()
  else:
    log.initialize(verbosity=[5])
  print("Returnn hdf_dump starting up.", file=log.v3)
  rnn.initFaulthandler()
  if config_filename:
    rnn.initData()
    rnn.printTaskProperties()
    assert isinstance(rnn.train_data, Dataset)
    return rnn.train_data
  else:
    assert dataset_config_str
    dataset = init_dataset_via_str(dataset_config_str)
    print("Source dataset:", dataset.len_info(), file=log.v3)
    return dataset
def demo():
  import better_exchook
  better_exchook.install()
  import rnn
  import sys
  if len(sys.argv) <= 1:
    print("usage: python %s [config] [other options]" % __file__)
    print(
      ("example usage: "
       "python %s ++learning_rate_control newbob ++learning_rate_file newbob.data ++learning_rate 0.001") % __file__)
  rnn.initConfig(commandLineOptions=sys.argv[1:])
  rnn.config._hack_value_reading_debug()
  from Pretrain import pretrainFromConfig
  pretrain = pretrainFromConfig(rnn.config)
  first_non_pretrain_epoch = 1
  pretrain_learning_rate = None
  if pretrain:
    first_non_pretrain_epoch = pretrain.get_train_num_epochs() + 1
  log.initialize(verbosity=[5])
  control = loadLearningRateControlFromConfig(rnn.config)
  print("LearningRateControl: %r" % control)
  if not control.epochData:
    print("No epoch data so far.")
    return
  firstEpoch = min(control.epochData.keys())
  if firstEpoch != 1:
    print("Strange, first epoch from epoch data is %i." % firstEpoch)
    print("Error key: %s from %r" % (control.getErrorKey(epoch=firstEpoch), control.epochData[firstEpoch].error))
  if pretrain:
    pretrain_learning_rate = rnn.config.float('pretrain_learning_rate', control.defaultLearningRate)
  maxEpoch = max(control.epochData.keys())
  for epoch in range(1, maxEpoch + 2):  # all epochs [1..maxEpoch+1]
    oldLearningRate = None
    if epoch in control.epochData:
      oldLearningRate = control.epochData[epoch].learningRate
    if epoch < first_non_pretrain_epoch:
      learningRate = pretrain_learning_rate
      s = "Pretrain epoch %i, fixed learning rate: %s (was: %s)" % (epoch, learningRate, oldLearningRate)
    elif first_non_pretrain_epoch > 1 and epoch == first_non_pretrain_epoch:
      learningRate = control.defaultLearningRate
      s = "First epoch after pretrain, epoch %i, fixed learning rate: %s (was %s)" % (
        epoch, learningRate, oldLearningRate)
    else:
      learningRate = control.calcNewLearnignRateForEpoch(epoch)
      s = "Calculated learning rate for epoch %i: %s (was: %s)" % (epoch, learningRate, oldLearningRate)
    if learningRate < control.minLearningRate:
      learningRate = control.minLearningRate
      s += ", clipped to %s" % learningRate
    s += ", previous relative error: %s" % control.calcRelativeError(epoch - 2, epoch - 1)
    if hasattr(control, "_calcRecentMeanRelativeError"):
      s += ", previous mean relative error: %s" % control._calcRecentMeanRelativeError(epoch)
    print(s)
    # Overwrite new learning rate so that the calculation for further learning rates stays consistent.
    if epoch in control.epochData:
      control.epochData[epoch].learningRate = learningRate
    else:
      control.epochData[epoch] = control.EpochData(learningRate=learningRate)
  print("Finished, last stored epoch was %i." % maxEpoch)
def main():
  global LstmCellTypes
  print("Benchmarking LSTMs.")
  better_exchook.install()
  print("Args:", " ".join(sys.argv))
  arg_parser = ArgumentParser()
  arg_parser.add_argument("cfg", nargs="*", help="opt=value, opt in %r" % sorted(base_settings.keys()))
  arg_parser.add_argument("--no-cpu", action="store_true")
  arg_parser.add_argument("--no-gpu", action="store_true")
  arg_parser.add_argument("--selected", help="comma-separated list from %r" % LstmCellTypes)
  arg_parser.add_argument("--no-setup-tf-thread-pools", action="store_true")
  args = arg_parser.parse_args()
  for opt in args.cfg:
    key, value = opt.split("=", 1)
    assert key in base_settings
    value_type = type(base_settings[key])
    base_settings[key] = value_type(value)
  print("Settings:")
  pprint(base_settings)
  log.initialize(verbosity=[4])
  print("Returnn:", describe_returnn_version(), file=log.v3)
  print("TensorFlow:", describe_tensorflow_version(), file=log.v3)
  print("Python:", sys.version.replace("\n", ""), sys.platform)
  if not args.no_setup_tf_thread_pools:
    setup_tf_thread_pools(log_file=log.v2)
  else:
    print("Not setting up the TF thread pools. Will be done automatically by TF to number of CPU cores.")
  if args.no_gpu:
    print("GPU will not be used.")
  else:
    print("GPU available: %r" % is_gpu_available())
    print_available_devices()
  if args.selected:
    LstmCellTypes = args.selected.split(",")
  benchmarks = {}
  if not args.no_gpu and is_gpu_available():
    for lstm_unit in LstmCellTypes:
      benchmarks["GPU:" + lstm_unit] = benchmark(lstm_unit=lstm_unit, use_gpu=True)
  if not args.no_cpu:
    for lstm_unit in LstmCellTypes:
      if lstm_unit in GpuOnlyCellTypes:
        continue
      benchmarks["CPU:" + lstm_unit] = benchmark(lstm_unit=lstm_unit, use_gpu=False)
  print("-" * 20)
  print("Settings:")
  pprint(base_settings)
  print("Final results:")
  for t, lstm_unit in sorted([(t, lstm_unit) for (lstm_unit, t) in sorted(benchmarks.items())]):
    print("  %s: %s" % (lstm_unit, hms_fraction(t)))
  print("Done.")
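# Hedged command-line sketch for the benchmark above. The script name, the
# "n_time" setting key and the "NativeLstm2" cell type are assumptions; the
# flags come from the ArgumentParser defined in main().
#   python benchmark_lstm.py n_time=1000 --no-cpu --selected NativeLstm2
#   python benchmark_lstm.py --no-gpu  # CPU-only run over all non-GPU-only cell types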
def _init():
  settings_file = base_dir + "/settings.py"
  if os.path.exists(settings_file):
    from lib.utils import load_config_py, ObjAsDict
    load_config_py(settings_file, ObjAsDict(Settings))
  assert os.path.exists(Settings.returnn_path())
  sys.path.insert(0, Settings.returnn_path())  # so that we can import Config
  try:
    import returnn  # new-style RETURNN import
  except ImportError:
    pass
  # init log with default verbosity (3)
  from Log import log
  log.initialize()
def init(configFilename=None):
  rnn.init_better_exchook()
  rnn.init_thread_join_hack()
  if configFilename:
    rnn.init_config(configFilename, command_line_options=[])
    rnn.init_log()
  else:
    log.initialize()
  print("Returnn collect-words starting up.", file=log.v3)
  rnn.init_faulthandler()
  if configFilename:
    rnn.init_config_json_network()
    rnn.init_data()
    rnn.print_task_properties()
def load_state_tying(self, state_tying_name='state-tying.txt.gz'):
  """
  Loads the state tying map from a file and stores it as self.state_tying,
  where state_tying.allo_map is the important part.

  :param str state_tying_name: path and name of the state tying file
  """
  from LmDataset import StateTying
  from os.path import isfile
  from Log import log
  log.initialize(verbosity=[5])
  assert isfile(state_tying_name), "State tying file does not exist"
  self.state_tying = StateTying(state_tying_name)
def init(configFilename=None):
  rnn.initBetterExchook()
  rnn.initThreadJoinHack()
  if configFilename:
    rnn.initConfig(configFilename, commandLineOptions=[])
    rnn.initLog()
  else:
    log.initialize()
  print("CRNN collect-orth-symbols starting up.", file=log.v3)
  rnn.initFaulthandler()
  if configFilename:
    rnn.initConfigJsonNetwork()
    rnn.initData()
    rnn.printTaskProperties()
def _load_lexicon(self):
  """
  Loads the lexicon XML from a file and stores it as self.lexicon,
  where lexicon.lemmas and lexicon.phonemes are the important parts.
  """
  from os.path import isfile
  from Log import log
  from LmDataset import Lexicon
  assert isfile(self.lexicon_name), "Lexicon does not exist"
  log.initialize(verbosity=[5])
  self.lexicon = Lexicon(self.lexicon_name)
def demo():
  import better_exchook
  better_exchook.install()
  import rnn
  import sys
  if len(sys.argv) <= 1:
    print("usage: python %s [config] [other options]" % __file__)
    print(
      ("example usage: "
       "python %s ++learning_rate_control newbob ++learning_rate_file newbob.data ++learning_rate 0.001") % __file__)
  rnn.initConfig(commandLineOptions=sys.argv[1:])
  from Pretrain import pretrainFromConfig
  pretrain = pretrainFromConfig(rnn.config)
  first_non_pretrain_epoch = 1
  pretrain_learning_rate = None
  if pretrain:
    first_non_pretrain_epoch = pretrain.get_train_num_epochs() + 1
  rnn.config._hack_value_reading_debug()
  log.initialize(verbosity=[5])
  control = loadLearningRateControlFromConfig(rnn.config)
  print("LearningRateControl: %r" % control)
  if not control.epochData:
    print("No epoch data so far.")
    return
  if pretrain:
    pretrain_learning_rate = rnn.config.float('pretrain_learning_rate', control.defaultLearningRate)
  maxEpoch = max(control.epochData.keys())
  for epoch in range(1, maxEpoch + 2):  # all epochs [1..maxEpoch+1]
    oldLearningRate = None
    if epoch in control.epochData:
      oldLearningRate = control.epochData[epoch].learningRate
    if epoch < first_non_pretrain_epoch:
      learningRate = pretrain_learning_rate
      s = "Pretrain epoch %i, fixed learning rate: %s (was: %s)" % (epoch, learningRate, oldLearningRate)
    elif first_non_pretrain_epoch > 1 and epoch == first_non_pretrain_epoch:
      learningRate = control.defaultLearningRate
      s = "First epoch after pretrain, epoch %i, fixed learning rate: %s (was %s)" % (
        epoch, learningRate, oldLearningRate)
    else:
      learningRate = control.calcLearningRateForEpoch(epoch)
      s = "Calculated learning rate for epoch %i: %s (was: %s)" % (epoch, learningRate, oldLearningRate)
    if learningRate < control.minLearningRate:
      learningRate = control.minLearningRate
      s += ", clipped to %s" % learningRate
    s += ", previous relative error: %s" % control.calcRelativeError(epoch - 2, epoch - 1)
    print(s)
    # Overwrite new learning rate so that the calculation for further learning rates stays consistent.
    if epoch in control.epochData:
      control.epochData[epoch].learningRate = learningRate
    else:
      control.epochData[epoch] = control.EpochData(learningRate=learningRate)
  print("Finished, last stored epoch was %i." % maxEpoch)
def load_lexicon(self, lexicon_name='recog.150k.final.lex.gz'):
  """
  Loads the lexicon XML from a file and stores it as self.lexicon (a Lexicon),
  where lexicon.lemmas and lexicon.phonemes are the important parts.

  :param str lexicon_name: path and name of the lexicon file
  """
  from LmDataset import Lexicon
  from os.path import isfile
  from Log import log
  log.initialize(verbosity=[5])
  assert isfile(lexicon_name), "Lexicon file does not exist"
  self.lexicon = Lexicon(lexicon_name)
def _main():
  import better_exchook
  better_exchook.install()
  from argparse import ArgumentParser
  arg_parser = ArgumentParser()
  arg_parser.add_argument(
    "lm_dataset", help="Python eval string, should eval to dict" +
                       ", or otherwise filename, and will just dump")
  arg_parser.add_argument("--post_processor", nargs="*")
  args = arg_parser.parse_args()
  if not args.lm_dataset.startswith("{") and os.path.isfile(args.lm_dataset):
    callback = print
    if args.post_processor:
      pp = get_post_processor_function(args.post_processor)
      callback = lambda text: print(pp(text))
    iter_corpus(args.lm_dataset, callback)
    sys.exit(0)
  log.initialize(verbosity=[5])
  print("LmDataset demo startup")
  kwargs = eval(args.lm_dataset)
  assert isinstance(kwargs, dict), "arg should be str of dict: %s" % args.lm_dataset
  print("Creating LmDataset with kwargs=%r ..." % kwargs)
  dataset = LmDataset(**kwargs)
  print("init_seq_order ...")
  dataset.init_seq_order(epoch=1)
  seq_idx = 0
  last_log_time = time.time()
  print("start iterating through seqs ...")
  while dataset.is_less_than_num_seqs(seq_idx):
    if seq_idx == 0:
      print("load_seqs with seq_idx=%i ...." % seq_idx)
    dataset.load_seqs(seq_idx, seq_idx + 1)
    if time.time() - last_log_time > 2.0:
      last_log_time = time.time()
      print("Loading %s progress, %i/%i (%.0f%%) seqs loaded (%.0f%% skipped), (%.0f%% unknown) total syms %i ..." % (
        dataset.__class__.__name__, dataset.next_orth_idx, dataset.estimated_num_seqs,
        100.0 * dataset.next_orth_idx / dataset.estimated_num_seqs,
        100.0 * dataset.num_skipped / (dataset.next_orth_idx or 1),
        100.0 * dataset.num_unknown / dataset._num_timesteps_accumulated["data"],
        dataset._num_timesteps_accumulated["data"]))
    seq_idx += 1
  print("finished iterating, num seqs: %i" % seq_idx)
  print("dataset len:", dataset.len_info())
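# Hedged invocation sketch for _main() above. The script name and the kwargs
# (corpus_file, post processor name) are assumptions; the two branches mirror
# the code: a dict literal builds an LmDataset, a plain file name is just
# dumped through the optional post processor.
#   python LmDataset.py "{'corpus_file': 'corpus.txt.gz'}"
#   python LmDataset.py corpus.txt.gz --post_processor some_processor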
def __load_lexicon(lexFile):
  """
  Loads the lexicon XML from a file and returns its content,
  where lex.lemmas and lex.phonemes are the important parts.

  :param lexFile: lexicon file with xml structure
  :return lex: Lexicon object holding the xml content
  """
  from os.path import isfile
  from Log import log
  from LmDataset import Lexicon
  assert isfile(lexFile), "Lexicon does not exist"
  log.initialize(verbosity=[5])
  lex = Lexicon(lexFile)
  return lex
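# Usage sketch for the loader above (the lexicon file name is an assumption;
# lex.lemmas and lex.phonemes are the attributes named in the docstring):
lex = __load_lexicon("recog.150k.final.lex.gz")
print("num lemmas:", len(lex.lemmas))
print("num phonemes:", len(lex.phonemes))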
def _load_lexicon(self, reload=False):
  """
  Loads the lexicon XML from a file and stores it as self.lexicon,
  where lexicon.lemmas and lexicon.phonemes are the important parts.

  :param bool reload: whether the lexicon should be reloaded
  """
  from LmDataset import Lexicon
  if not isinstance(self.lexicon, Lexicon):
    reload = True
  if reload:
    from os.path import isfile
    from Log import log
    assert isfile(self.lexicon_name), "Lexicon does not exist"
    log.initialize(verbosity=[5])
    self.lexicon = Lexicon(self.lexicon_name)
def _main(argv):
  import better_exchook
  better_exchook.install()
  log.initialize(verbosity=[5])
  dataset = LmDataset(**eval(argv[0]))
  dataset.init_seq_order(epoch=1)
  seq_idx = 0
  last_log_time = time.time()
  while dataset.is_less_than_num_seqs(seq_idx):
    dataset.load_seqs(seq_idx, seq_idx + 1)
    if time.time() - last_log_time > 2.0:
      last_log_time = time.time()
      print >> log.v5, "Loading %s progress, %i/%i (%.0f%%) seqs loaded (%.0f%% skipped), total syms %i ..." % (
        dataset.__class__.__name__, dataset.next_orth_idx, dataset.estimated_num_seqs,
        100.0 * dataset.next_orth_idx / dataset.estimated_num_seqs,
        100.0 * dataset.num_skipped / (dataset.next_orth_idx or 1),
        dataset._num_timesteps_accumulated["data"])
    seq_idx += 1
  print >> log.v3, "dataset len:", dataset.len_info()
import NativeOp
import numpy as np
from numpy.testing.utils import assert_almost_equal
import theano.tensor as T
import TheanoUtil

f32 = "float32"

import better_exchook
from Log import log

better_exchook.replace_traceback_format_tb()
log.initialize()  # some code might need it


def test_sparse_to_dense():
  n_time = 3
  n_batch = 2
  n_dim = 5
  s0 = np.array([[0, 0], [0, 1], [1, 1], [1, 2], [1, 2], [2, 2], [2, 2]], dtype=f32)
  s1 = np.array([[1, 2], [2, 3], [1, 1], [2, 0], [4, 1], [3, 3], [4, 4]], dtype=f32)
  w = np.array([[1, 2], [2, 1], [1, 2], [3, 4], [5, 6], [7, 8], [9, 9]], dtype=f32)
  m = np.array([[1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 0]], dtype=f32)
  W = np.array([[[0, 1, 2, 0, 0], [0, 0, 2, 0, 0]],
                [[0, 1, 3, 0, 5], [0, 2, 0, 1, 0]],
                [[0, 0, 0, 7, 9], [4, 6, 0, 8, 0]]], dtype=f32)
  assert W.shape == (n_time, n_batch, n_dim)
sys.path += ["."] # Python 3 hack import NativeOp import numpy from numpy.testing.utils import assert_almost_equal import theano.tensor as T import TheanoUtil import sys f32 = "float32" import better_exchook from Log import log better_exchook.replace_traceback_format_tb() log.initialize() # some code might need it TheanoUtil.monkey_patches() chunk = NativeOp.chunk unchunk = NativeOp.unchunk naive_chunk_start_frames = NativeOp.Chunking.naive_chunk_start_frames def get_num_chunks(n_time, chunk_size, chunk_step): return len(naive_chunk_start_frames(n_time, chunk_size, chunk_step)) def naive_chunk(x, chunk_size, chunk_step): if x.ndim == 3: # (time,batch,dim) if x.shape[1] == 1:
def demo(): print("SprintDataset demo.") from argparse import ArgumentParser from Util import hms, progress_bar_with_time from Log import log from Config import Config from Dataset import init_dataset arg_parser = ArgumentParser() arg_parser.add_argument("--config", help="config with ExternSprintDataset", required=True) arg_parser.add_argument("--sprint_cache_dataset", help="kwargs dict for SprintCacheDataset", required=True) arg_parser.add_argument("--max_num_seqs", default=sys.maxint, type=int) arg_parser.add_argument("--action", default="compare", help="compare or benchmark") args = arg_parser.parse_args() log.initialize(verbosity=[4]) sprint_cache_dataset_kwargs = eval(args.sprint_cache_dataset) assert isinstance(sprint_cache_dataset_kwargs, dict) sprint_cache_dataset = SprintCacheDataset(**sprint_cache_dataset_kwargs) print("SprintCacheDataset: %r" % sprint_cache_dataset) config = Config() config.load_file(args.config) dataset = init_dataset(config.typed_value("train")) print("Dataset via config: %r" % dataset) assert sprint_cache_dataset.num_inputs == dataset.num_inputs assert tuple(sprint_cache_dataset.num_outputs["classes"]) == tuple( dataset.num_outputs["classes"]) sprint_cache_dataset.init_seq_order(epoch=1) if args.action == "compare": print("Iterating through dataset...") seq_idx = 0 dataset.init_seq_order(epoch=1) while seq_idx < args.max_num_seqs: if not dataset.is_less_than_num_seqs(seq_idx): break dataset.load_seqs(seq_idx, seq_idx + 1) tag = dataset.get_tag(seq_idx) assert not tag.startswith( "seq-"), "dataset does not provide tag-names for seqs" dataset_seq = sprint_cache_dataset.get_dataset_seq_for_name(tag) data = dataset.get_data(seq_idx, "data") targets = dataset.get_data(seq_idx, "classes") assert data.shape == dataset_seq.features.shape assert targets.shape == dataset_seq.targets["classes"].shape assert numpy.allclose(data, dataset_seq.features) assert numpy.allclose(targets, dataset_seq.targets["classes"]) seq_idx += 1 progress_bar_with_time(dataset.get_complete_frac(seq_idx)) print("Finished through dataset. Num seqs: %i" % seq_idx) print("SprintCacheDataset has num seqs: %i." % sprint_cache_dataset.num_seqs) elif args.action == "benchmark": print("Iterating through dataset...") start_time = time.time() seq_tags = [] seq_idx = 0 dataset.init_seq_order(epoch=1) while seq_idx < args.max_num_seqs: if not dataset.is_less_than_num_seqs(seq_idx): break dataset.load_seqs(seq_idx, seq_idx + 1) tag = dataset.get_tag(seq_idx) assert not tag.startswith( "seq-"), "dataset does not provide tag-names for seqs" seq_tags.append(tag) dataset.get_data(seq_idx, "data") dataset.get_data(seq_idx, "classes") seq_idx += 1 progress_bar_with_time(dataset.get_complete_frac(seq_idx)) print("Finished through dataset. Num seqs: %i, time: %f" % (seq_idx, time.time() - start_time)) print("SprintCacheDataset has num seqs: %i." % sprint_cache_dataset.num_seqs) if hasattr(dataset, "exit_handler"): dataset.exit_handler() else: print("No way to stop any background tasks.") del dataset start_time = time.time() print("Iterating through SprintCacheDataset...") for i, tag in enumerate(seq_tags): sprint_cache_dataset.get_dataset_seq_for_name(tag) progress_bar_with_time(float(i) / len(seq_tags)) print("Finished through SprintCacheDataset. time: %f" % (time.time() - start_time, )) else: raise Exception("invalid action: %r" % args.action)
def init_log(config):
  log_dir = config.dir("log_dir", "logs")
  model = config.unicode("model")
  filename = log_dir + model + ".log"
  verbosity = config.int("log_verbosity", 3)
  log.initialize([filename], [verbosity], [])
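# Minimal sketch of a config object that init_log() above would accept. The
# Cfg class and its return values are assumptions; only the dir/unicode/int
# accessor names and the log.initialize(logs, verbosity, formatter) call come
# from the snippet above. Note the log_dir value is assumed to carry a
# trailing slash, since filename is built by plain concatenation.
class Cfg:
  def dir(self, key, default):
    return "logs/"  # directory path, with trailing slash

  def unicode(self, key):
    return "my_model"

  def int(self, key, default):
    return 3


init_log(Cfg())  # would log to logs/my_model.log with verbosity 3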
def demo():
  """
  Demo run. Given some learning rate file (with scores / existing lrs),
  will calculate how lrs would have been set, given some config.
  """
  import better_exchook
  better_exchook.install()
  import rnn
  import sys
  if len(sys.argv) <= 1:
    print("usage: python %s [config] [other options] [++check_learning_rates 1]" % __file__)
    print(
      ("example usage: "
       "python %s ++learning_rate_control newbob ++learning_rate_file newbob.data ++learning_rate 0.001") % __file__)
  rnn.init_config(command_line_options=sys.argv[1:])
  # noinspection PyProtectedMember
  rnn.config._hack_value_reading_debug()
  rnn.config.update({"log": []})
  rnn.init_log()
  rnn.init_backend_engine()
  check_lr = rnn.config.bool("check_learning_rates", False)
  from Pretrain import pretrain_from_config
  pretrain = pretrain_from_config(rnn.config)
  first_non_pretrain_epoch = 1
  pretrain_learning_rate = None
  if pretrain:
    first_non_pretrain_epoch = pretrain.get_train_num_epochs() + 1
  log.initialize(verbosity=[5])
  control = load_learning_rate_control_from_config(rnn.config)
  print("LearningRateControl: %r" % control)
  if not control.epoch_data:
    print("No epoch data so far.")
    return
  first_epoch = min(control.epoch_data.keys())
  if first_epoch != 1:
    print("Strange, first epoch from epoch data is %i." % first_epoch)
    print("Error key: %s from %r" % (
      control.get_error_key(epoch=first_epoch), control.epoch_data[first_epoch].error))
  if pretrain:
    pretrain_learning_rate = rnn.config.float('pretrain_learning_rate', control.default_learning_rate)
  max_epoch = max(control.epoch_data.keys())
  for epoch in range(1, max_epoch + 2):  # all epochs [1..max_epoch+1]
    old_learning_rate = None
    if epoch in control.epoch_data:
      old_learning_rate = control.epoch_data[epoch].learning_rate
    if epoch < first_non_pretrain_epoch:
      learning_rate = pretrain_learning_rate
      s = "Pretrain epoch %i, fixed learning rate: %s (was: %s)" % (epoch, learning_rate, old_learning_rate)
    elif 1 < first_non_pretrain_epoch == epoch:
      learning_rate = control.default_learning_rate
      s = "First epoch after pretrain, epoch %i, fixed learning rate: %s (was %s)" % (
        epoch, learning_rate, old_learning_rate)
    else:
      learning_rate = control.calc_new_learning_rate_for_epoch(epoch)
      s = "Calculated learning rate for epoch %i: %s (was: %s)" % (epoch, learning_rate, old_learning_rate)
    if learning_rate < control.min_learning_rate:
      learning_rate = control.min_learning_rate
      s += ", clipped to %s" % learning_rate
    s += ", previous relative error: %s" % control.calc_relative_error(epoch - 2, epoch - 1)
    if hasattr(control, "_calc_recent_mean_relative_error"):
      # noinspection PyProtectedMember
      s += ", previous mean relative error: %s" % control._calc_recent_mean_relative_error(epoch)
    print(s)
    if check_lr and old_learning_rate is not None:
      if old_learning_rate != learning_rate:
        print("Learning rate is different in epoch %i!" % epoch)
        sys.exit(1)
    # Overwrite new learning rate so that the calculation for further learning rates stays consistent.
    if epoch in control.epoch_data:
      control.epoch_data[epoch].learning_rate = learning_rate
    else:
      control.epoch_data[epoch] = control.EpochData(learningRate=learning_rate)
  print("Finished, last stored epoch was %i." % max_epoch)
import contextlib
import unittest
import numpy.testing
from pprint import pprint
import better_exchook
better_exchook.replace_traceback_format_tb()

from Config import Config
from TFNetwork import *
from TFNetworkLayer import *
from TFEngine import *
from Log import log
import TFUtil
TFUtil.debug_register_better_repr()

log.initialize(verbosity=[5])


@contextlib.contextmanager
def make_scope():
  with tf.Graph().as_default() as graph:
    with tf.Session(graph=graph) as session:
      yield session


network = {}
_last = "data"


def build_resnet(conv_time_dim):
  # network
  # (also defined by num_inputs & num_outputs)
  dropout = 0
  L2 = 0.1
def initLog():
  logs = config.list('log', [])
  log_verbosity = config.int_list('log_verbosity', [])
  log_format = config.list('log_format', [])
  log.initialize(logs=logs, verbosity=log_verbosity, formatter=log_format)
import sys
sys.path += ["."]  # Python 3 hack

from TFEngine import *
import Util
import TFUtil
TFUtil.debugRegisterBetterRepr()
from Config import Config
from nose.tools import assert_equal, assert_is_instance
import numpy
import numpy.testing
import os
from pprint import pprint
import better_exchook
better_exchook.replace_traceback_format_tb()

from Log import log
log.initialize(verbosity=[5])

session = tf.InteractiveSession()


def test_DataProvider():
  from GeneratingDataset import DummyDataset
  seq_len = 5
  n_data_dim = 2
import theano.printing
from pprint import pprint
from GeneratingDataset import Task12AXDataset
from Updater import Updater
from Device import Device
from Util import NumbersDict
from Config import Config
from NetworkHiddenLayer import DumpLayer
import rnn
import EngineUtil
import Network
import better_exchook
from Log import log

better_exchook.replace_traceback_format_tb()
log.initialize()  # some code needs it

# Some code uses get_global_config().
# Not sure about the most clean solution.
rnn.config = Config()


class DummyDevice:
  """
  Behave like Device. Only needed for assign_dev_data.
  """
  blocking = True
  used_data_keys = ("data", "classes")
  targets = None
  output_index = None
from nose.tools import assert_equal, assert_is_instance, assert_in, assert_not_in, assert_true, assert_false
from GeneratingDataset import GeneratingDataset, DummyDataset
from EngineBatch import Batch
from Dataset import DatasetSeq
from Log import log
import numpy as np

log.initialize()


def test_generate_batches():
  dataset = DummyDataset(input_dim=2, output_dim=3, num_seqs=20)
  dataset.init_seq_order(1)
  batch_gen = dataset.generate_batches(recurrent_net=False, max_seqs=2, batch_size=5)
  while batch_gen.has_more():
    batch_gen.peek_next_n(1)
    batch_gen.advance(1)


def test_generate_batches_recurrent():
  dataset = DummyDataset(input_dim=2, output_dim=3, num_seqs=20)
  dataset.init_seq_order(1)
  batch_gen = dataset.generate_batches(recurrent_net=True, max_seqs=2, batch_size=5)
  while batch_gen.has_more():
    batch_gen.peek_next_n(1)
    batch_gen.advance(1)


def test_iterate_seqs_no_chunking_1():
  dataset = DummyDataset(input_dim=2, output_dim=3, num_seqs=2, seq_len=11)
  dataset.init_seq_order(1)
  seqs = list(dataset._iterate_seqs(chunk_size=0, chunk_step=0))
def demo():
  """
  Demo.
  """
  print("SprintDataset demo.")
  from argparse import ArgumentParser
  from Util import progress_bar_with_time
  from Log import log
  from Config import Config
  from Dataset import init_dataset
  arg_parser = ArgumentParser()
  arg_parser.add_argument("--config", help="config with ExternSprintDataset", required=True)
  arg_parser.add_argument("--sprint_cache_dataset", help="kwargs dict for SprintCacheDataset", required=True)
  arg_parser.add_argument("--max_num_seqs", default=sys.maxsize, type=int)
  arg_parser.add_argument("--action", default="compare", help="compare or benchmark")
  args = arg_parser.parse_args()
  log.initialize(verbosity=[4])
  sprint_cache_dataset_kwargs = eval(args.sprint_cache_dataset)
  assert isinstance(sprint_cache_dataset_kwargs, dict)
  sprint_cache_dataset = SprintCacheDataset(**sprint_cache_dataset_kwargs)
  print("SprintCacheDataset: %r" % sprint_cache_dataset)
  config = Config()
  config.load_file(args.config)
  dataset = init_dataset(config.typed_value("train"))
  print("Dataset via config: %r" % dataset)
  assert sprint_cache_dataset.num_inputs == dataset.num_inputs
  assert tuple(sprint_cache_dataset.num_outputs["classes"]) == tuple(dataset.num_outputs["classes"])
  sprint_cache_dataset.init_seq_order(epoch=1)
  if args.action == "compare":
    print("Iterating through dataset...")
    seq_idx = 0
    dataset.init_seq_order(epoch=1)
    while seq_idx < args.max_num_seqs:
      if not dataset.is_less_than_num_seqs(seq_idx):
        break
      dataset.load_seqs(seq_idx, seq_idx + 1)
      tag = dataset.get_tag(seq_idx)
      assert not tag.startswith("seq-"), "dataset does not provide tag-names for seqs"
      dataset_seq = sprint_cache_dataset.get_dataset_seq_for_name(tag)
      data = dataset.get_data(seq_idx, "data")
      targets = dataset.get_data(seq_idx, "classes")
      assert data.shape == dataset_seq.features["data"].shape
      assert targets.shape == dataset_seq.features["classes"].shape
      assert numpy.allclose(data, dataset_seq.features["data"])
      assert numpy.allclose(targets, dataset_seq.features["classes"])
      seq_idx += 1
      progress_bar_with_time(dataset.get_complete_frac(seq_idx))
    print("Finished through dataset. Num seqs: %i" % seq_idx)
    print("SprintCacheDataset has num seqs: %i." % sprint_cache_dataset.num_seqs)
  elif args.action == "benchmark":
    print("Iterating through dataset...")
    start_time = time.time()
    seq_tags = []
    seq_idx = 0
    dataset.init_seq_order(epoch=1)
    while seq_idx < args.max_num_seqs:
      if not dataset.is_less_than_num_seqs(seq_idx):
        break
      dataset.load_seqs(seq_idx, seq_idx + 1)
      tag = dataset.get_tag(seq_idx)
      assert not tag.startswith("seq-"), "dataset does not provide tag-names for seqs"
      seq_tags.append(tag)
      dataset.get_data(seq_idx, "data")
      dataset.get_data(seq_idx, "classes")
      seq_idx += 1
      progress_bar_with_time(dataset.get_complete_frac(seq_idx))
    print("Finished through dataset. Num seqs: %i, time: %f" % (seq_idx, time.time() - start_time))
    print("SprintCacheDataset has num seqs: %i." % sprint_cache_dataset.num_seqs)
    if hasattr(dataset, "exit_handler"):
      dataset.exit_handler()
    else:
      print("No way to stop any background tasks.")
    del dataset
    start_time = time.time()
    print("Iterating through SprintCacheDataset...")
    for i, tag in enumerate(seq_tags):
      sprint_cache_dataset.get_dataset_seq_for_name(tag)
      progress_bar_with_time(float(i) / len(seq_tags))
    print("Finished through SprintCacheDataset. time: %f" % (time.time() - start_time,))
  else:
    raise Exception("invalid action: %r" % args.action)
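# Hedged invocation sketch for demo() above. The script name, config file name
# and the SprintCacheDataset kwargs dict are assumptions; the flags come from
# the ArgumentParser defined in demo().
#   python SprintDataset.py --config train.config \
#     --sprint_cache_dataset "{'data': {'filenames': ['features.cache']}}" \
#     --action benchmark --max_num_seqs 100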