def eval_trained_dnn(main_dir, _iter, egs_dir, run_opts):
    """Launches background diagnostic jobs that compute the objective of the
    model at iteration '_iter' on the validation and train-subset examples.
    The logs go to {main_dir}/log/compute_prob_{valid,train_subset}.{iter}.log.
    """
    input_model_dir = "{dir}/model_{iter}".format(dir=main_dir, iter=_iter)

    # we assume that there is just one tar file for validation
    tar_file = "{0}/valid_egs.1.tar".format(egs_dir)
    _command = '{command} "{main_dir}/log/compute_prob_valid.{iter}.log" ' \
               'local/tf/eval_dnn.py ' \
               '--tar-file="{tar_file}" --use-gpu=no ' \
               '--log-file="{main_dir}/log/compute_prob_valid.{iter}.log" ' \
               '--input-dir="{input_model_dir}"'.format(
                   command=run_opts.command, main_dir=main_dir, iter=_iter,
                   tar_file=tar_file, input_model_dir=input_model_dir)
    utils.background_command(_command)

    # we assume that there is just one tar file for train diagnostics
    tar_file = "{0}/train_subset_egs.1.tar".format(egs_dir)
    _command = '{command} "{main_dir}/log/compute_prob_train_subset.{iter}.log" ' \
               'local/tf/eval_dnn.py ' \
               '--tar-file="{tar_file}" --use-gpu=no ' \
               '--log-file="{main_dir}/log/compute_prob_train_subset.{iter}.log" ' \
               '--input-dir="{input_model_dir}"'.format(
                   command=run_opts.command, main_dir=main_dir, iter=_iter,
                   tar_file=tar_file, input_model_dir=input_model_dir)
    utils.background_command(_command)
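
# Hedged usage sketch (an assumption, not part of this recipe): the outer
# training driver would typically call eval_trained_dnn() once per iteration
# so the diagnostic objectives appear alongside the training logs.  The
# '_ExampleRunOpts' class and the directory names below are hypothetical
# stand-ins for the options object built by the real top-level script.
def _example_run_diagnostics(main_dir, egs_dir, num_iters):
    class _ExampleRunOpts(object):
        # 'command' is the job-submission wrapper, e.g. run.pl or queue.pl
        command = "utils/run.pl"

    run_opts = _ExampleRunOpts()
    for it in range(num_iters):
        # after the models for iteration 'it' have been trained and combined,
        # kick off the background diagnostic jobs for that model
        eval_trained_dnn(main_dir, it, egs_dir, run_opts)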
def train_new_models(model_dir, _iter, random_seed, num_jobs,
                     num_archives_processed, num_archives,
                     learning_rate, shrinkage_value, dropout_proportion,
                     egs_dir, momentum, max_param_change, minibatch_size,
                     run_opts, feature_dim, archives_minibatch_count,
                     try_count=0, train_opts=""):
    """ Called from train_one_iteration(), this function does one iteration of
    training with 'num_jobs' jobs, and writes models in dirs like
    exp/tdnn_a/model_24.{1,2,3,..<num_jobs>}

    We cannot easily use a single parallel SGE job to do the main training,
    because the computation of which archive and which --frame option to use
    for each job is a little complex, so we spawn each one separately.
    """
    threads = []

    # The GPU timing info is only printed if we use the --verbose=1 flag; this
    # slows down the computation slightly, so don't accumulate it on every
    # iteration. Don't do it on iteration 0 either, because we use a smaller
    # than normal minibatch size, and people may get confused thinking it's
    # slower for iteration 0 because of the verbose option.
    verbose_opt = ("--verbose=1" if _iter % 20 == 0 and _iter > 0 else "")

    for job in range(1, num_jobs + 1):
        # k is a zero-based index that we will derive the other indexes from.
        k = num_archives_processed + job - 1
        # work out the 1-based archive index.
        archive_index = (k % num_archives) + 1
        minibatch_count = archives_minibatch_count[archive_index]

        if try_count > 0 and utils.is_correct_model_dir(
                '{0}/model_{1}.{2}'.format(model_dir, _iter + 1, job)):
            # On a retry, skip jobs that already produced a valid model.
            continue

        egs_rspecifier = \
            '--ranges-file="{egs_dir}/temp/ranges.{archive_index}" ' \
            '--scp-file="{egs_dir}/temp/feats.scp.{archive_index}" ' \
            '--shuffle=True --minibatch-size={minibatch_size}'.format(
                egs_dir=egs_dir, archive_index=archive_index,
                minibatch_size=minibatch_size)

        # Check whether the tar file exists; if it was already generated,
        # pass it to the script to speed up example loading.
        tar_file = '{egs_dir}/egs.{archive_index}.tar'.format(
            egs_dir=egs_dir, archive_index=archive_index)
        if os.path.exists(tar_file):
            egs_rspecifier = '--tar-file="{0}" {1}'.format(
                tar_file, egs_rspecifier)

        _command = '{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log ' \
                   'local/tf/train_dnn_one_iteration.py ' \
                   '{parallel_train_opts} ' \
                   '{verbose_opt} --print-interval=10 ' \
                   '--momentum={momentum} ' \
                   '--max-param-change={max_param_change} ' \
                   '--l2-regularize-factor={l2_regularize_factor} ' \
                   '--random-seed={random_seed} {train_opts} ' \
                   '--learning-rate={learning_rate} ' \
                   '--scale={shrinkage_value} ' \
                   '--minibatch-count={minibatch_count} ' \
                   '--feature-dim={feature_dim} ' \
                   '--dropout-proportion={dropout_proportion} ' \
                   '{egs_rspecifier} ' \
                   '--input-dir={dir}/model_{iter} ' \
                   '--output-dir={dir}/model_{next_iter}.{job}' \
            .format(command=run_opts.command,
                    train_queue_opt=run_opts.train_queue_opt,
                    dir=model_dir, iter=_iter, next_iter=_iter + 1,
                    random_seed=_iter + random_seed, job=job,
                    parallel_train_opts=run_opts.parallel_train_opts,
                    verbose_opt=verbose_opt, momentum=momentum,
                    max_param_change=max_param_change,
                    l2_regularize_factor=1.0 / num_jobs,
                    train_opts=train_opts, learning_rate=learning_rate,
                    shrinkage_value=shrinkage_value,
                    minibatch_count=minibatch_count,
                    feature_dim=feature_dim,
                    dropout_proportion=dropout_proportion,
                    egs_rspecifier=egs_rspecifier)

        thread = utils.background_command(_command, require_zero_status=False)
        threads.append(thread)

    for thread in threads:
        thread.join()
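
# Hedged illustration (assumption, not part of the recipe): the archive-index
# arithmetic in train_new_models() cycles the jobs through the archives across
# iterations, since k = num_archives_processed + job - 1 and
# archive_index = (k % num_archives) + 1.  The caller advancing
# num_archives_processed by num_jobs per iteration is assumed here.
def _example_archive_schedule(num_archives=4, num_jobs=2, num_iters=3):
    schedule = []
    num_archives_processed = 0
    for _ in range(num_iters):
        indices = []
        for job in range(1, num_jobs + 1):
            k = num_archives_processed + job - 1
            indices.append((k % num_archives) + 1)
        schedule.append(tuple(indices))
        num_archives_processed += num_jobs
    return schedule  # with the defaults: [(1, 2), (3, 4), (1, 2)]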