def compute_train_cv_probabilities(dir, iter, egs_dir, run_opts,
                                   get_raw_nnet_from_am=True,
                                   use_multitask_egs=False,
                                   compute_per_dim_accuracy=False):
    """Launch nnet3-compute-prob on the valid and train diagnostic egs.

    Two commands are handed to common_lib.background_command, one for
    'valid_diagnostic' and one for 'train_diagnostic'; their logs go to
    {dir}/log/compute_prob_valid.{iter}.log and
    {dir}/log/compute_prob_train.{iter}.log respectively.  The handles
    returned by background_command are not kept, so this function does not
    wait for the jobs itself.

    Args:
        dir: experiment directory containing the model and the log/ dir.
        iter: iteration label used to build the model and log file names.
        egs_dir: directory containing the diagnostic examples.
        run_opts: object whose 'command' attribute is the job launcher
            prefix placed at the start of each command line.
        get_raw_nnet_from_am: if true, read {dir}/{iter}.mdl through
            'nnet3-am-copy --raw=true'; otherwise read {dir}/{iter}.raw.
        use_multitask_egs: if true, the egs are read as scp files
            ('.scp' suffix) instead of ark files ('.egs' suffix).
        compute_per_dim_accuracy: if true, pass --compute-per-dim-accuracy
            to nnet3-compute-prob for both the valid and the train subset.
    """
    if get_raw_nnet_from_am:
        model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(
            dir=dir, iter=iter)
    else:
        model = "{dir}/{iter}.raw".format(dir=dir, iter=iter)

    scp_or_ark = "scp" if use_multitask_egs else "ark"
    egs_suffix = ".scp" if use_multitask_egs else ".egs"

    opts = []
    if compute_per_dim_accuracy:
        opts.append("--compute-per-dim-accuracy")

    # Both subsets go through the same command template so that the
    # options (in particular {opts}) can never diverge between them.
    for subset in ("valid", "train"):
        egs_rspecifier = "{0}:{1}/{2}_diagnostic{3}".format(
            scp_or_ark, egs_dir, subset, egs_suffix)

        multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
            egs_dir,
            egs_prefix="{0}_diagnostic.".format(subset),
            use_multitask_egs=use_multitask_egs)

        common_lib.background_command(
            """{command} {dir}/log/compute_prob_{subset}.{iter}.log \
                    nnet3-compute-prob {opts} "{model}" \
                    "ark,bg:nnet3-copy-egs {multitask_egs_opts} \
                        {egs_rspecifier} ark:- | \
                        nnet3-merge-egs --minibatch-size=1:64 ark:- \
                        ark:- |" """.format(
                            command=run_opts.command,
                            dir=dir,
                            subset=subset,
                            iter=iter,
                            egs_rspecifier=egs_rspecifier,
                            opts=' '.join(opts),
                            model=model,
                            multitask_egs_opts=multitask_egs_opts))
def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts,
                                   max_lda_jobs=None, rand_prune=4.0,
                                   lda_opts=None, use_multitask_egs=False):
    """Accumulate LDA-style stats over the egs and write {dir}/lda.mat.

    Steps, in order:
      1. run nnet3-acc-lda-stats as an array job (JOB=1:num_lda_jobs),
         writing {dir}/JOB.lda_stats;
      2. sum the per-job stats into {dir}/lda_stats with sum-lda-accs;
      3. delete the per-job stats files;
      4. run nnet-get-feature-transform to produce {dir}/lda.mat;
      5. symlink it as {dir}/configs/lda.mat.

    Args:
        dir: experiment directory (must contain init.raw).
        egs_dir: directory containing egs.JOB.{ark,scp}.
        num_lda_jobs: number of parallel stats jobs requested.
        run_opts: object whose 'command' attribute is the job launcher.
        max_lda_jobs: optional upper bound applied to num_lda_jobs.
        rand_prune: value passed as --rand-prune.
        lda_opts: extra options for nnet-get-feature-transform, or None.
        use_multitask_egs: if true, read the egs from scp instead of ark.

    Raises:
        OSError: re-raised (after logging) if a per-job stats file cannot
            be removed.
    """
    if max_lda_jobs is not None and num_lda_jobs > max_lda_jobs:
        num_lda_jobs = max_lda_jobs

    multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
        egs_dir,
        egs_prefix="egs.",
        archive_index="JOB",
        use_multitask_egs=use_multitask_egs)
    scp_or_ark = "scp" if use_multitask_egs else "ark"
    egs_rspecifier = (
        "ark:nnet3-copy-egs {multitask_egs_opts} "
        "{scp_or_ark}:{egs_dir}/egs.JOB.{scp_or_ark} ark:- |"
        "".format(egs_dir=egs_dir, scp_or_ark=scp_or_ark,
                  multitask_egs_opts=multitask_egs_opts))

    # Write stats with the same format as stats for LDA.
    stats_cmd = (
        """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \
                nnet3-acc-lda-stats --rand-prune={rand_prune} \
                {dir}/init.raw "{egs_rspecifier}" \
                {dir}/JOB.lda_stats""".format(
                    command=run_opts.command,
                    num_lda_jobs=num_lda_jobs,
                    dir=dir,
                    egs_rspecifier=egs_rspecifier,
                    rand_prune=rand_prune))
    common_lib.execute_command(stats_cmd)

    # the above command would have generated dir/{1..num_lda_jobs}.lda_stats
    lda_stat_files = ['{0}/{1}.lda_stats'.format(dir, job_id)
                      for job_id in range(1, num_lda_jobs + 1)]

    sum_cmd = (
        """{command} {dir}/log/sum_transform_stats.log \
                sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format(
                    command=run_opts.command, dir=dir,
                    lda_stat_files=" ".join(lda_stat_files)))
    common_lib.execute_command(sum_cmd)

    # The per-job stats are no longer needed once they have been summed.
    # The first failure is logged and re-raised, aborting the cleanup.
    try:
        for stats_path in lda_stat_files:
            os.remove(stats_path)
    except OSError:
        logger.error("There was error while trying to remove "
                     "lda stat files.")
        raise

    # this computes a fixed affine transform computed in the way we described
    # in Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled
    # variant of an LDA transform but without dimensionality reduction.
    transform_cmd = (
        """{command} {dir}/log/get_transform.log \
                nnet-get-feature-transform {lda_opts} {dir}/lda.mat \
                {dir}/lda_stats""".format(
                    command=run_opts.command, dir=dir,
                    lda_opts="" if lda_opts is None else lda_opts))
    common_lib.execute_command(transform_cmd)

    common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir))
def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize,
                                   xent_regularize, leaky_hmm_coefficient,
                                   run_opts, use_multitask_egs=False):
    """Launch nnet3-chain-compute-prob on the valid and train diagnostic egs.

    One command per subset is handed to common_lib.background_command
    (valid first, then train); logs are written to
    {dir}/log/compute_prob_{valid,train}.{iter}.log.  The handles returned
    by background_command are not kept here.

    Args:
        dir: experiment directory holding {iter}.mdl and den.fst.
        iter: iteration label used in the model and log file names.
        egs_dir: directory containing the diagnostic examples.
        l2_regularize: value passed as --l2-regularize.
        xent_regularize: value passed as --xent-regularize.
        leaky_hmm_coefficient: value passed as --leaky-hmm-coefficient.
        run_opts: object whose 'command' attribute is the job launcher.
        use_multitask_egs: if true, read egs from '.scp' files via scp:
            instead of '.cegs' archives via ark:.
    """
    model = '{0}/{1}.mdl'.format(dir, iter)

    if use_multitask_egs:
        scp_or_ark, egs_suffix = "scp", ".scp"
    else:
        scp_or_ark, egs_suffix = "ark", ".cegs"

    # The valid and train commands differ only in the subset name.
    for subset in ("valid", "train"):
        multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
            egs_dir,
            egs_prefix="{0}_diagnostic.".format(subset),
            use_multitask_egs=use_multitask_egs)

        common_lib.background_command(
            """{command} {dir}/log/compute_prob_{subset}.{iter}.log \
                    nnet3-chain-compute-prob --l2-regularize={l2} \
                    --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \
                    {model} {dir}/den.fst \
                    "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/{subset}_diagnostic{egs_suffix} \
                    ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \
            """.format(command=run_opts.command,
                       dir=dir,
                       subset=subset,
                       iter=iter,
                       model=model,
                       l2=l2_regularize,
                       leaky=leaky_hmm_coefficient,
                       xent_reg=xent_regularize,
                       egs_dir=egs_dir,
                       multitask_egs_opts=multitask_egs_opts,
                       scp_or_ark=scp_or_ark,
                       egs_suffix=egs_suffix))
def compute_progress(dir, iter, egs_dir, run_opts,
                     get_raw_nnet_from_am=True,
                     use_multitask_egs=False):
    """Launch a background job comparing model 'iter' with model 'iter - 1'.

    The command runs nnet3-info on the current model and then
    nnet3-show-progress (with --use-gpu=no) on the previous and current
    models over the train_diagnostic egs, logging to
    {dir}/log/progress.{iter}.log.

    Args:
        dir: experiment directory containing the models.
        iter: integer iteration index; iter - 1 names the previous model.
        egs_dir: directory containing the train_diagnostic examples.
        run_opts: object whose 'command' attribute is the job launcher.
        get_raw_nnet_from_am: if true, models are {n}.mdl read through
            'nnet3-am-copy --raw=true'; otherwise {n}.raw is read directly.
        use_multitask_egs: if true, read egs from '.scp' via scp: instead
            of '.egs' via ark:.
    """
    # One template serves both models; only the iteration number differs.
    if get_raw_nnet_from_am:
        model_template = "nnet3-am-copy --raw=true {0}/{1}.mdl - |"
    else:
        model_template = '{0}/{1}.raw'
    prev_model = model_template.format(dir, iter - 1)
    model = model_template.format(dir, iter)

    if use_multitask_egs:
        scp_or_ark, egs_suffix = "scp", ".scp"
    else:
        scp_or_ark, egs_suffix = "ark", ".egs"
    egs_rspecifier = "{0}:{1}/train_diagnostic{2}".format(
        scp_or_ark, egs_dir, egs_suffix)

    multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
        egs_dir,
        egs_prefix="train_diagnostic.",
        use_multitask_egs=use_multitask_egs)

    progress_cmd = (
        """{command} {dir}/log/progress.{iter}.log \
                    nnet3-info "{model}" '&&' \
                    nnet3-show-progress --use-gpu=no "{prev_model}" "{model}" \
                    "ark,bg:nnet3-copy-egs {multitask_egs_opts} \
                        {egs_rspecifier} ark:- | \
                        nnet3-merge-egs --minibatch-size=1:64 ark:- \
                        ark:- |" """.format(
                            command=run_opts.command,
                            dir=dir,
                            iter=iter,
                            egs_rspecifier=egs_rspecifier,
                            model=model,
                            prev_model=prev_model,
                            multitask_egs_opts=multitask_egs_opts))
    common_lib.background_command(progress_cmd)
def combine_models(dir, num_iters, models_to_combine,
                   num_chunk_per_minibatch_str,
                   egs_dir, leaky_hmm_coefficient, l2_regularize,
                   xent_regularize, run_opts,
                   max_objective_evaluations=30,
                   use_multitask_egs=False):
    """Combine several iterations' models into {dir}/final.mdl.

    Runs nnet3-chain-combine over the existing {dir}/{n}.mdl files for n in
    models_to_combine (plus num_iters), pipes the result into
    'nnet3-am-copy --set-raw-nnet=-' to write {dir}/final.mdl, and then
    calls compute_train_cv_probabilities for iter='final'.

    In the nnet3 setup, the logic for averaging subsets of the models when
    there are too many models to reliably estimate interpolation factors
    (max_models_combine) has moved into the combine binary itself.

    Args:
        dir: experiment directory.
        num_iters: index of the last iteration; it is added to
            models_to_combine (the set is modified in place).
        models_to_combine: set of iteration indexes to combine.
        num_chunk_per_minibatch_str: value for --minibatch-size.
        egs_dir: directory containing the combine examples.
        leaky_hmm_coefficient: value for --leaky-hmm-coefficient.
        l2_regularize: value for --l2-regularize.
        xent_regularize: forwarded to compute_train_cv_probabilities only.
        run_opts: provides 'command', 'combine_queue_opt' and
            'combine_gpu_opt'.
        max_objective_evaluations: value for --max-objective-evaluations.
        use_multitask_egs: if true, read egs from '.scp' via scp: instead
            of '.cegs' via ark:.
    """
    logger.info("Combining {0} models.".format(models_to_combine))
    models_to_combine.add(num_iters)

    existing_models = []
    for model_index in sorted(models_to_combine):
        model_file = '{0}/{1}.mdl'.format(dir, model_index)
        if not os.path.exists(model_file):
            # A missing model is only reported; combination continues
            # with whatever models are present.
            print("{0}: warning: model file {1} does not exist "
                  "(final combination)".format(sys.argv[0], model_file))
            continue
        # we used to copy them with nnet3-am-copy --raw=true, but now
        # the raw-model-reading code discards the other stuff itself.
        existing_models.append(model_file)

    if use_multitask_egs:
        scp_or_ark, egs_suffix = "scp", ".scp"
    else:
        scp_or_ark, egs_suffix = "ark", ".cegs"
    multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
        egs_dir,
        egs_prefix="combine.",
        use_multitask_egs=use_multitask_egs)

    # We reverse the order of the raw model strings so that the freshest one
    # goes first.  This is important for systems that include batch
    # normalization-- it means that the freshest batch-norm stats are used.
    # Since the batch-norm stats are not technically parameters, they are not
    # combined in the combination code, they are just obtained from the first
    # model.
    raw_model_strings = existing_models[::-1]

    combine_cmd = (
        """{command} {combine_queue_opt} {dir}/log/combine.log \
                nnet3-chain-combine \
                --max-objective-evaluations={max_objective_evaluations} \
                --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \
                --verbose=3 {combine_gpu_opt} {dir}/den.fst {raw_models} \
                "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/combine{egs_suffix} ark:- | \
                    nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \
                    ark:- ark:- |" - \| \
                nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \
                {dir}/final.mdl""".format(
                    command=run_opts.command,
                    combine_queue_opt=run_opts.combine_queue_opt,
                    combine_gpu_opt=run_opts.combine_gpu_opt,
                    max_objective_evaluations=max_objective_evaluations,
                    l2=l2_regularize,
                    leaky=leaky_hmm_coefficient,
                    dir=dir,
                    raw_models=" ".join(raw_model_strings),
                    num_chunk_per_mb=num_chunk_per_minibatch_str,
                    num_iters=num_iters,
                    egs_dir=egs_dir,
                    multitask_egs_opts=multitask_egs_opts,
                    scp_or_ark=scp_or_ark,
                    egs_suffix=egs_suffix))
    common_lib.execute_command(combine_cmd)

    # Compute the probability of the final, combined model with
    # the same subset we used for the previous compute_probs, as the
    # different subsets will lead to different probs.
    compute_train_cv_probabilities(
        dir=dir,
        iter='final',
        egs_dir=egs_dir,
        l2_regularize=l2_regularize,
        xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient,
        run_opts=run_opts,
        use_multitask_egs=use_multitask_egs)
def train_new_models(dir, iter, srand, num_jobs,
                     num_archives_processed, num_archives,
                     raw_model_string, egs_dir,
                     apply_deriv_weights,
                     min_deriv_time, max_deriv_time_relative,
                     l2_regularize, xent_regularize, leaky_hmm_coefficient,
                     momentum, max_param_change,
                     shuffle_buffer_size, num_chunk_per_minibatch_str,
                     frame_subsampling_factor, run_opts, train_opts,
                     backstitch_training_scale=0.0,
                     backstitch_training_interval=1,
                     use_multitask_egs=False):
    """Run one iteration of chain training as 'num_jobs' parallel jobs.

    Called from train_one_iteration(), this method trains new models
    with 'num_jobs' jobs, and writes files like
    exp/tdnn_a/24.{1,2,3,..<num_jobs>}.raw

    We cannot easily use a single parallel SGE job to do the main training,
    because the computation of which archive and which --frame-shift option
    to use for each job is a little complex, so we spawn each one
    separately.  Each job is started with common_lib.background_command
    (with require_zero_status=True) and this function joins all the
    returned threads before returning.

    Args:
        dir: experiment directory; logs go to {dir}/log/train.{iter}.{job}.log
            and models to {dir}/{iter+1}.{job}.raw.
        iter: integer index of the current iteration.
        srand: base random seed; iter + srand is passed as --srand.
        num_jobs: number of parallel training jobs.
        num_archives_processed: number of archives consumed so far; used to
            pick each job's archive and frame shift.
        num_archives: total number of egs archives.
        raw_model_string: rxfilename (possibly a pipe) of the input model.
        egs_dir: directory containing cegs.N.{ark,scp}.
        apply_deriv_weights: value for --apply-deriv-weights.
        min_deriv_time: value for --optimization.min-deriv-time, or None.
        max_deriv_time_relative: value (truncated to int) for
            --optimization.max-deriv-time-relative, or None.
        l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum,
        max_param_change: passed through to the matching nnet3-chain-train
            options.
        shuffle_buffer_size: --buffer-size for nnet3-chain-shuffle-egs.
        num_chunk_per_minibatch_str: --minibatch-size for
            nnet3-chain-merge-egs.
        frame_subsampling_factor: modulus used to derive --frame-shift.
        run_opts: provides 'command', 'train_queue_opt' and
            'parallel_train_opts'.
        train_opts: extra options appended to the nnet3-chain-train call.
        backstitch_training_scale, backstitch_training_interval: passed to
            the corresponding --backstitch-training-* options.
        use_multitask_egs: True, if different examples used to train
            multiple tasks or outputs, e.g. multilingual training.
            multilingual egs can be generated using get_egs.sh and
            steps/nnet3/multilingual/allocate_multilingual_examples.py,
            those are the top-level scripts.
    """
    deriv_time_opts = []
    if min_deriv_time is not None:
        deriv_time_opts.append("--optimization.min-deriv-time={0}".format(
            min_deriv_time))
    if max_deriv_time_relative is not None:
        deriv_time_opts.append(
            "--optimization.max-deriv-time-relative={0}".format(
                int(max_deriv_time_relative)))
    deriv_time_opts_str = " ".join(deriv_time_opts)

    # the GPU timing info is only printed if we use the --verbose=1 flag; this
    # slows down the computation slightly, so don't accumulate it on every
    # iteration.  Don't do it on iteration 0 either, because we use a smaller
    # than normal minibatch size, and people may get confused thinking it's
    # slower for iteration 0 because of the verbose option.
    verbose_opt = ("--verbose=1" if iter % 20 == 0 and iter > 0 else "")

    # Loop-invariant pieces of the command line.
    scp_or_ark = "scp" if use_multitask_egs else "ark"
    read_cache_opt = ("--read-cache={dir}/cache.{iter}".format(dir=dir,
                                                              iter=iter)
                      if iter > 0 else "")

    threads = []
    for job in range(1, num_jobs + 1):
        # k is a zero-based index that we will derive the other indexes from.
        k = num_archives_processed + job - 1
        # work out the 1-based archive index.
        archive_index = (k % num_archives) + 1
        # previous : frame_shift = (k/num_archives) % frame_subsampling_factor
        # Floor division is required here: with true division ('/' under
        # Python 3 or 'from __future__ import division') the result would be
        # a float and --frame-shift would receive a non-integer value.
        frame_shift = ((archive_index + k // num_archives)
                       % frame_subsampling_factor)

        multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
            egs_dir,
            egs_prefix="cegs.",
            archive_index=archive_index,
            use_multitask_egs=use_multitask_egs)

        # Only the first job writes the cache for the next iteration.
        cache_io_opts = (read_cache_opt
                         + (" --write-cache={0}/cache.{1}".format(dir,
                                                                  iter + 1)
                            if job == 1 else ""))

        thread = common_lib.background_command(
            """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \
                    nnet3-chain-train {parallel_train_opts} {verbose_opt} \
                    --apply-deriv-weights={app_deriv_wts} \
                    --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \
                    {cache_io_opts} --xent-regularize={xent_reg} \
                    {deriv_time_opts} \
                    --print-interval=10 --momentum={momentum} \
                    --max-param-change={max_param_change} \
                    --backstitch-training-scale={backstitch_training_scale} \
                    --backstitch-training-interval={backstitch_training_interval} \
                    --l2-regularize-factor={l2_regularize_factor} {train_opts} \
                    --srand={srand} \
                    "{raw_model}" {dir}/den.fst \
                    "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} \
                        --frame-shift={fr_shft} \
                        {scp_or_ark}:{egs_dir}/cegs.{archive_index}.{scp_or_ark} ark:- | \
                        nnet3-chain-shuffle-egs --buffer-size={buf_size} \
                        --srand={srand} ark:- ark:- | nnet3-chain-merge-egs \
                        --minibatch-size={num_chunk_per_mb} ark:- ark:- |" \
                    {dir}/{next_iter}.{job}.raw""".format(
                        command=run_opts.command,
                        train_queue_opt=run_opts.train_queue_opt,
                        dir=dir, iter=iter, srand=iter + srand,
                        next_iter=iter + 1, job=job,
                        deriv_time_opts=deriv_time_opts_str,
                        app_deriv_wts=apply_deriv_weights,
                        fr_shft=frame_shift, l2=l2_regularize,
                        train_opts=train_opts,
                        xent_reg=xent_regularize,
                        leaky=leaky_hmm_coefficient,
                        cache_io_opts=cache_io_opts,
                        parallel_train_opts=run_opts.parallel_train_opts,
                        verbose_opt=verbose_opt,
                        momentum=momentum,
                        max_param_change=max_param_change,
                        backstitch_training_scale=backstitch_training_scale,
                        backstitch_training_interval=(
                            backstitch_training_interval),
                        l2_regularize_factor=1.0 / num_jobs,
                        raw_model=raw_model_string,
                        egs_dir=egs_dir, archive_index=archive_index,
                        buf_size=shuffle_buffer_size,
                        num_chunk_per_mb=num_chunk_per_minibatch_str,
                        multitask_egs_opts=multitask_egs_opts,
                        scp_or_ark=scp_or_ark),
            require_zero_status=True)

        threads.append(thread)

    # Block until every training job of this iteration has finished.
    for thread in threads:
        thread.join()
def combine_models(dir, num_iters, models_to_combine, egs_dir,
                   minibatch_size_str,
                   run_opts,
                   chunk_width=None, get_raw_nnet_from_am=True,
                   sum_to_one_penalty=0.0,
                   use_multitask_egs=False):
    """Combine several iterations' models with nnet3-combine.

    In the nnet3 setup, the logic for averaging subsets of the models when
    there are too many models to reliably estimate interpolation factors
    (max_models_combine) has moved into nnet3-combine itself.

    When get_raw_nnet_from_am is true, the inputs are {dir}/{n}.mdl read
    through 'nnet3-am-copy --raw=true' and the output is piped into
    'nnet3-am-copy --set-raw-nnet=-' to write {dir}/combined.mdl; otherwise
    the inputs are {dir}/{n}.raw and the output is {dir}/final.raw.
    Afterwards compute_train_cv_probabilities is called on the result.

    Args:
        dir: experiment directory.
        num_iters: index of the last iteration; it is added to
            models_to_combine (the set is modified in place).
        models_to_combine: set of iteration indexes to combine.
        egs_dir: directory containing the combine examples.
        minibatch_size_str: value for --minibatch-size of nnet3-merge-egs.
        run_opts: provides 'command' and 'combine_queue_opt'.
        chunk_width: accepted but not used by this function.
        get_raw_nnet_from_am: selects .mdl versus .raw handling (see above).
        sum_to_one_penalty: value for --sum-to-one-penalty; when it is
            <= 0, --enforce-sum-to-one is set to True.
        use_multitask_egs: if true, read egs from '.scp' via scp: instead
            of '.egs' via ark:.

    Raises:
        Exception: if any of the model files to combine does not exist.
    """
    logger.info("Combining {0} models.".format(models_to_combine))
    models_to_combine.add(num_iters)

    # Pick the file naming and the rxfilename wrapper once per call-mode.
    if get_raw_nnet_from_am:
        file_template = '{0}/{1}.mdl'
        rxfilename_template = '"nnet3-am-copy --raw=true {0} -|"'
    else:
        file_template = '{0}/{1}.raw'
        rxfilename_template = '{0}'

    collected = []
    for model_index in sorted(models_to_combine):
        model_file = file_template.format(dir, model_index)
        if not os.path.exists(model_file):
            raise Exception('Model file {0} missing'.format(model_file))
        collected.append(rxfilename_template.format(model_file))

    if get_raw_nnet_from_am:
        out_model = ("| nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl "
                     "{dir}/combined.mdl".format(dir=dir,
                                                 num_iters=num_iters))
    else:
        out_model = '{dir}/final.raw'.format(dir=dir)

    # We reverse the order of the raw model strings so that the freshest one
    # goes first.  This is important for systems that include batch
    # normalization-- it means that the freshest batch-norm stats are used.
    # Since the batch-norm stats are not technically parameters, they are not
    # combined in the combination code, they are just obtained from the first
    # model.
    raw_model_strings = collected[::-1]

    if use_multitask_egs:
        scp_or_ark, egs_suffix = "scp", ".scp"
    else:
        scp_or_ark, egs_suffix = "ark", ".egs"
    egs_rspecifier = "{0}:{1}/combine{2}".format(scp_or_ark,
                                                 egs_dir, egs_suffix)

    multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
        egs_dir,
        egs_prefix="combine.",
        use_multitask_egs=use_multitask_egs)

    combine_cmd = (
        """{command} {combine_queue_opt} {dir}/log/combine.log \
                nnet3-combine --num-iters=80 \
                --enforce-sum-to-one={hard_enforce} \
                --sum-to-one-penalty={penalty} \
                --enforce-positive-weights=true \
                --verbose=3 {raw_models} \
                "ark,bg:nnet3-copy-egs {multitask_egs_opts} \
                    {egs_rspecifier} ark:- | \
                      nnet3-merge-egs --minibatch-size={mbsize} ark:- ark:- |" \
                "{out_model}" """.format(
                    command=run_opts.command,
                    combine_queue_opt=run_opts.combine_queue_opt,
                    dir=dir, raw_models=" ".join(raw_model_strings),
                    egs_rspecifier=egs_rspecifier,
                    hard_enforce=(sum_to_one_penalty <= 0),
                    penalty=sum_to_one_penalty,
                    mbsize=minibatch_size_str,
                    out_model=out_model,
                    multitask_egs_opts=multitask_egs_opts))
    common_lib.execute_command(combine_cmd)

    # Compute the probability of the final, combined model with
    # the same subset we used for the previous compute_probs, as the
    # different subsets will lead to different probs.
    diag_kwargs = dict(dir=dir, egs_dir=egs_dir, run_opts=run_opts,
                       use_multitask_egs=use_multitask_egs)
    if get_raw_nnet_from_am:
        diag_kwargs["iter"] = 'combined'
    else:
        diag_kwargs["iter"] = 'final'
        diag_kwargs["get_raw_nnet_from_am"] = False
    compute_train_cv_probabilities(**diag_kwargs)
def train_new_models(dir, iter, srand, num_jobs,
                     num_archives_processed, num_archives,
                     raw_model_string, egs_dir,
                     momentum, max_param_change,
                     shuffle_buffer_size, minibatch_size_str,
                     image_augmentation_opts,
                     run_opts, frames_per_eg=-1,
                     min_deriv_time=None, max_deriv_time_relative=None,
                     use_multitask_egs=False,
                     backstitch_training_scale=0.0,
                     backstitch_training_interval=1):
    """Run one iteration of nnet3 training as 'num_jobs' parallel jobs.

    Called from train_one_iteration(), this method does one iteration of
    training with 'num_jobs' jobs, and writes files like
    exp/tdnn_a/24.{1,2,3,..<num_jobs>}.raw

    We cannot easily use a single parallel SGE job to do the main training,
    because the computation of which archive and which --frame option
    to use for each job is a little complex, so we spawn each one
    separately.  This is no longer true for RNNs as we do not use the
    --frame option, but we use the same script for consistency with the
    FF-DNN code.  Each job is started with common_lib.background_command
    (with require_zero_status=True) and this function joins all the
    returned threads before returning.

    Selected args:
        frames_per_eg:
            The default value -1 implies chunk-level training, which is
            particularly applicable to RNN training.  If it is > 0, it
            implies frame-level training, which is applicable for DNN
            training: each job then selects one frame in
            0..frames_per_eg-1 via the --frame option of nnet3-copy-egs.
        image_augmentation_opts:
            if non-empty, nnet3-egs-augment-image is inserted into the egs
            pipeline with these options and --srand=k+srand.
        use_multitask_egs : True, if different examples used to train
            multiple tasks or outputs, e.g. multilingual training.
            multilingual egs can be generated using get_egs.sh and
            steps/nnet3/multilingual/allocate_multilingual_examples.py,
            those are the top-level scripts.
    """
    chunk_level_training = not frames_per_eg > 0

    deriv_time_opts = []
    if min_deriv_time is not None:
        deriv_time_opts.append("--optimization.min-deriv-time={0}".format(
            min_deriv_time))
    if max_deriv_time_relative is not None:
        deriv_time_opts.append(
            "--optimization.max-deriv-time-relative={0}".format(
                max_deriv_time_relative))
    deriv_time_opts_str = " ".join(deriv_time_opts)

    # the GPU timing info is only printed if we use the --verbose=1 flag; this
    # slows down the computation slightly, so don't accumulate it on every
    # iteration.  Don't do it on iteration 0 either, because we use a smaller
    # than normal minibatch size, and people may get confused thinking it's
    # slower for iteration 0 because of the verbose option.
    verbose_opt = ("--verbose=1" if iter % 20 == 0 and iter > 0 else "")

    # Loop-invariant pieces of the command line.
    scp_or_ark = "scp" if use_multitask_egs else "ark"
    read_cache_opt = ("--read-cache={dir}/cache.{iter}".format(dir=dir,
                                                              iter=iter)
                      if iter > 0 else "")

    threads = []
    for job in range(1, num_jobs + 1):
        # k is a zero-based index that we will derive the other indexes from.
        k = num_archives_processed + job - 1

        # work out the 1-based archive index.
        archive_index = (k % num_archives) + 1

        if chunk_level_training:
            frame_opts = ""
        else:
            # Floor division is required: with true division ('/' under
            # Python 3 or 'from __future__ import division') the frame
            # index would be a float and --frame would get e.g. '2.5'.
            frame = (k // num_archives + archive_index) % frames_per_eg
            frame_opts = "--frame={0}".format(frame)

        # Only the first job writes the cache for the next iteration.
        cache_io_opts = (read_cache_opt
                         + (" --write-cache={0}/cache.{1}".format(dir,
                                                                  iter + 1)
                            if job == 1 else ""))

        if image_augmentation_opts:
            image_augmentation_cmd = (
                'nnet3-egs-augment-image --srand={srand} {aug_opts} '
                'ark:- ark:- |'.format(
                    srand=k + srand,
                    aug_opts=image_augmentation_opts))
        else:
            image_augmentation_cmd = ''

        multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
            egs_dir,
            egs_prefix="egs.",
            archive_index=archive_index,
            use_multitask_egs=use_multitask_egs)

        egs_rspecifier = (
            """ark,bg:nnet3-copy-egs {frame_opts} {multitask_egs_opts} \
            {scp_or_ark}:{egs_dir}/egs.{archive_index}.{scp_or_ark} ark:- | \
            nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} \
            --srand={srand} ark:- ark:- | {aug_cmd} \
            nnet3-merge-egs --minibatch-size={minibatch_size} ark:- ark:- |""".format(
                frame_opts=frame_opts,
                egs_dir=egs_dir, archive_index=archive_index,
                shuffle_buffer_size=shuffle_buffer_size,
                minibatch_size=minibatch_size_str,
                aug_cmd=image_augmentation_cmd,
                srand=iter + srand,
                scp_or_ark=scp_or_ark,
                multitask_egs_opts=multitask_egs_opts))

        # note: the thread waits on that process's completion.
        thread = common_lib.background_command(
            """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \
                    nnet3-train {parallel_train_opts} {cache_io_opts} \
                     {verbose_opt} --print-interval=10 \
                    --momentum={momentum} \
                    --max-param-change={max_param_change} \
                    --backstitch-training-scale={backstitch_training_scale} \
                    --backstitch-training-interval={backstitch_training_interval} \
                    --srand={srand} \
                    {deriv_time_opts} "{raw_model}" "{egs_rspecifier}" \
                    {dir}/{next_iter}.{job}.raw""".format(
                        command=run_opts.command,
                        train_queue_opt=run_opts.train_queue_opt,
                        dir=dir, iter=iter,
                        next_iter=iter + 1, srand=iter + srand,
                        job=job,
                        parallel_train_opts=run_opts.parallel_train_opts,
                        cache_io_opts=cache_io_opts,
                        verbose_opt=verbose_opt,
                        momentum=momentum,
                        max_param_change=max_param_change,
                        backstitch_training_scale=backstitch_training_scale,
                        backstitch_training_interval=(
                            backstitch_training_interval),
                        deriv_time_opts=deriv_time_opts_str,
                        raw_model=raw_model_string,
                        egs_rspecifier=egs_rspecifier),
            require_zero_status=True)

        threads.append(thread)

    # Block until every training job of this iteration has finished.
    for thread in threads:
        thread.join()
def combine_models(dir, num_iters, models_to_combine, egs_dir,
                   minibatch_size_str,
                   run_opts,
                   chunk_width=None, get_raw_nnet_from_am=True,
                   sum_to_one_penalty=0.0,
                   use_multitask_egs=False,
                   compute_per_dim_accuracy=False):
    """Combine several iterations' models with nnet3-combine.

    In the nnet3 setup, the logic for averaging subsets of the models when
    there are too many models to reliably estimate interpolation factors
    (max_models_combine) has moved into nnet3-combine itself.

    The inputs are {dir}/{n}.mdl (get_raw_nnet_from_am true) or
    {dir}/{n}.raw, passed to nnet3-combine as plain file names.  The output
    is either piped into 'nnet3-am-copy --set-raw-nnet=-' to write
    {dir}/combined.mdl, or written directly to {dir}/final.raw.  Afterwards
    compute_train_cv_probabilities is called on the result.

    Args:
        dir: experiment directory.
        num_iters: index of the last iteration; it is added to
            models_to_combine (the set is modified in place).
        models_to_combine: set of iteration indexes to combine.
        egs_dir: directory containing the combine examples.
        minibatch_size_str: value for --minibatch-size of nnet3-merge-egs.
        run_opts: provides 'command' and 'combine_queue_opt'.
        chunk_width: accepted but not used by this function.
        get_raw_nnet_from_am: selects .mdl versus .raw handling (see above).
        sum_to_one_penalty: value for --sum-to-one-penalty; when it is
            <= 0, --enforce-sum-to-one is set to True.
        use_multitask_egs: if true, read egs from '.scp' via scp: instead
            of '.egs' via ark:.
        compute_per_dim_accuracy: forwarded to
            compute_train_cv_probabilities.

    Raises:
        Exception: if any of the model files to combine does not exist.
    """
    logger.info("Combining {0} models.".format(models_to_combine))
    models_to_combine.add(num_iters)

    suffix = "mdl" if get_raw_nnet_from_am else "raw"
    found_models = []
    for model_index in sorted(models_to_combine):
        model_file = '{0}/{1}.{2}'.format(dir, model_index, suffix)
        if not os.path.exists(model_file):
            raise Exception('Model file {0} missing'.format(model_file))
        found_models.append(model_file)

    if get_raw_nnet_from_am:
        out_model = ("| nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl "
                     "{dir}/combined.mdl".format(dir=dir,
                                                 num_iters=num_iters))
    else:
        out_model = '{dir}/final.raw'.format(dir=dir)

    # We reverse the order of the raw model strings so that the freshest one
    # goes first.  This is important for systems that include batch
    # normalization-- it means that the freshest batch-norm stats are used.
    # Since the batch-norm stats are not technically parameters, they are not
    # combined in the combination code, they are just obtained from the first
    # model.
    raw_model_strings = found_models[::-1]

    if use_multitask_egs:
        scp_or_ark, egs_suffix = "scp", ".scp"
    else:
        scp_or_ark, egs_suffix = "ark", ".egs"
    egs_rspecifier = "{0}:{1}/combine{2}".format(scp_or_ark,
                                                 egs_dir, egs_suffix)

    multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
        egs_dir,
        egs_prefix="combine.",
        use_multitask_egs=use_multitask_egs)

    combine_cmd = (
        """{command} {combine_queue_opt} {dir}/log/combine.log \
                nnet3-combine --num-iters=80 \
                --enforce-sum-to-one={hard_enforce} \
                --sum-to-one-penalty={penalty} \
                --enforce-positive-weights=true \
                --verbose=3 {raw_models} \
                "ark,bg:nnet3-copy-egs {multitask_egs_opts} \
                    {egs_rspecifier} ark:- | \
                      nnet3-merge-egs --minibatch-size={mbsize} ark:- ark:- |" \
                "{out_model}" """.format(
                    command=run_opts.command,
                    combine_queue_opt=run_opts.combine_queue_opt,
                    dir=dir, raw_models=" ".join(raw_model_strings),
                    egs_rspecifier=egs_rspecifier,
                    hard_enforce=(sum_to_one_penalty <= 0),
                    penalty=sum_to_one_penalty,
                    mbsize=minibatch_size_str,
                    out_model=out_model,
                    multitask_egs_opts=multitask_egs_opts))
    common_lib.execute_command(combine_cmd)

    # Compute the probability of the final, combined model with
    # the same subset we used for the previous compute_probs, as the
    # different subsets will lead to different probs.
    diag_kwargs = dict(dir=dir, egs_dir=egs_dir, run_opts=run_opts,
                       use_multitask_egs=use_multitask_egs,
                       compute_per_dim_accuracy=compute_per_dim_accuracy)
    if get_raw_nnet_from_am:
        diag_kwargs["iter"] = 'combined'
    else:
        diag_kwargs["iter"] = 'final'
        diag_kwargs["get_raw_nnet_from_am"] = False
    compute_train_cv_probabilities(**diag_kwargs)
def compute_train_cv_probabilities(dir, iter, egs_dir, run_opts,
                                   get_raw_nnet_from_am=True,
                                   use_multitask_egs=False,
                                   compute_per_dim_accuracy=False):
    """Launch nnet3-compute-prob on the valid and train diagnostic egs.

    Two commands are handed to common_lib.background_command, one for
    'valid_diagnostic' and one for 'train_diagnostic'; their logs go to
    {dir}/log/compute_prob_valid.{iter}.log and
    {dir}/log/compute_prob_train.{iter}.log respectively.  The handles
    returned by background_command are not kept, so this function does not
    wait for the jobs itself.

    Args:
        dir: experiment directory containing the model and the log/ dir.
        iter: iteration label used to build the model and log file names.
        egs_dir: directory containing the diagnostic examples.
        run_opts: object whose 'command' attribute is the job launcher
            prefix placed at the start of each command line.
        get_raw_nnet_from_am: if true, read {dir}/{iter}.mdl through
            'nnet3-am-copy --raw=true'; otherwise read {dir}/{iter}.raw.
        use_multitask_egs: if true, the egs are read as scp files
            ('.scp' suffix) instead of ark files ('.egs' suffix).
        compute_per_dim_accuracy: if true, pass --compute-per-dim-accuracy
            to nnet3-compute-prob for both the valid and the train subset.
    """
    if get_raw_nnet_from_am:
        model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(
            dir=dir, iter=iter)
    else:
        model = "{dir}/{iter}.raw".format(dir=dir, iter=iter)

    scp_or_ark = "scp" if use_multitask_egs else "ark"
    egs_suffix = ".scp" if use_multitask_egs else ".egs"

    opts = []
    if compute_per_dim_accuracy:
        opts.append("--compute-per-dim-accuracy")
    opts_str = ' '.join(opts)

    # Both subsets go through the same command template so that the
    # options (in particular {opts}) can never diverge between them.
    for subset in ("valid", "train"):
        egs_rspecifier = "{0}:{1}/{2}_diagnostic{3}".format(
            scp_or_ark, egs_dir, subset, egs_suffix)

        multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
            egs_dir,
            egs_prefix="{0}_diagnostic.".format(subset),
            use_multitask_egs=use_multitask_egs)

        common_lib.background_command(
            """{command} {dir}/log/compute_prob_{subset}.{iter}.log \
                    nnet3-compute-prob {opts} "{model}" \
                    "ark,bg:nnet3-copy-egs {multitask_egs_opts} \
                        {egs_rspecifier} ark:- | \
                        nnet3-merge-egs --minibatch-size=1:64 ark:- \
                        ark:- |" """.format(
                            command=run_opts.command,
                            dir=dir,
                            subset=subset,
                            iter=iter,
                            egs_rspecifier=egs_rspecifier,
                            opts=opts_str,
                            model=model,
                            multitask_egs_opts=multitask_egs_opts))
def train_new_models(dir, iter, srand, num_jobs,
                     num_archives_processed, num_archives,
                     raw_model_string, egs_dir,
                     momentum, max_param_change,
                     shuffle_buffer_size, minibatch_size_str,
                     image_augmentation_opts,
                     run_opts, frames_per_eg=-1,
                     min_deriv_time=None, max_deriv_time_relative=None,
                     use_multitask_egs=False,
                     backstitch_training_scale=0.0,
                     backstitch_training_interval=1):
    """Run one iteration of nnet3 training as 'num_jobs' parallel jobs.

    Called from train_one_iteration(), this method does one iteration of
    training with 'num_jobs' jobs, and writes files like
    exp/tdnn_a/24.{1,2,3,..<num_jobs>}.raw

    We cannot easily use a single parallel SGE job to do the main training,
    because the computation of which archive and which --frame option
    to use for each job is a little complex, so we spawn each one
    separately.  This is no longer true for RNNs as we do not use the
    --frame option, but we use the same script for consistency with the
    FF-DNN code.  Each job is started with common_lib.background_command
    (with require_zero_status=True) and this function joins all the
    returned threads before returning.

    Selected args:
        frames_per_eg:
            The frames_per_eg, in the context of (non-chain) nnet3 training,
            is normally the number of output (supervised) frames in each
            training example.  However, the frames_per_eg argument to this
            function should only be set to that number (greater than zero)
            if you intend to train on a single frame of each example, on
            each minibatch.  If you provide this argument >0, then for each
            training job a different frame from the dumped example is
            selected to train on, based on the option --frame=n to
            nnet3-copy-egs.
            If you leave frames_per_eg at its default value (-1), then the
            entire sequence of frames is used for supervision.  This is
            suitable for RNN training, where it helps to amortize the cost
            of computing the activations for the frames of context needed
            for the recurrence.
        image_augmentation_opts:
            if non-empty, nnet3-egs-augment-image is inserted into the egs
            pipeline with these options and --srand=k+srand.
        use_multitask_egs : True, if different examples used to train
            multiple tasks or outputs, e.g. multilingual training.
            multilingual egs can be generated using get_egs.sh and
            steps/nnet3/multilingual/allocate_multilingual_examples.py,
            those are the top-level scripts.
    """
    chunk_level_training = not frames_per_eg > 0

    deriv_time_opts = []
    if min_deriv_time is not None:
        deriv_time_opts.append(
            "--optimization.min-deriv-time={0}".format(min_deriv_time))
    if max_deriv_time_relative is not None:
        deriv_time_opts.append(
            "--optimization.max-deriv-time-relative={0}".format(
                max_deriv_time_relative))
    deriv_time_opts_str = " ".join(deriv_time_opts)

    # the GPU timing info is only printed if we use the --verbose=1 flag; this
    # slows down the computation slightly, so don't accumulate it on every
    # iteration.  Don't do it on iteration 0 either, because we use a smaller
    # than normal minibatch size, and people may get confused thinking it's
    # slower for iteration 0 because of the verbose option.
    verbose_opt = ("--verbose=1" if iter % 20 == 0 and iter > 0 else "")

    # Loop-invariant pieces of the command line.
    scp_or_ark = "scp" if use_multitask_egs else "ark"
    read_cache_opt = ("--read-cache={dir}/cache.{iter}".format(dir=dir,
                                                              iter=iter)
                      if iter > 0 else "")

    threads = []
    for job in range(1, num_jobs + 1):
        # k is a zero-based index that we will derive the other indexes from.
        k = num_archives_processed + job - 1

        # work out the 1-based archive index.
        archive_index = (k % num_archives) + 1

        if chunk_level_training:
            frame_opts = ""
        else:
            # Floor division is required: with true division ('/' under
            # Python 3 or 'from __future__ import division') the frame
            # index would be a float and --frame would get e.g. '2.5'.
            frame = (k // num_archives + archive_index) % frames_per_eg
            frame_opts = "--frame={0}".format(frame)

        # Only the first job writes the cache for the next iteration.
        cache_io_opts = (read_cache_opt
                         + (" --write-cache={0}/cache.{1}".format(dir,
                                                                  iter + 1)
                            if job == 1 else ""))

        if image_augmentation_opts:
            image_augmentation_cmd = (
                'nnet3-egs-augment-image --srand={srand} {aug_opts} '
                'ark:- ark:- |'.format(
                    srand=k + srand,
                    aug_opts=image_augmentation_opts))
        else:
            image_augmentation_cmd = ''

        multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
            egs_dir,
            egs_prefix="egs.",
            archive_index=archive_index,
            use_multitask_egs=use_multitask_egs)

        egs_rspecifier = (
            """ark,bg:nnet3-copy-egs {frame_opts} {multitask_egs_opts} \
            {scp_or_ark}:{egs_dir}/egs.{archive_index}.{scp_or_ark} ark:- | \
            nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} \
            --srand={srand} ark:- ark:- | {aug_cmd} \
            nnet3-merge-egs --minibatch-size={minibatch_size} ark:- ark:- |""".format(
                frame_opts=frame_opts,
                egs_dir=egs_dir, archive_index=archive_index,
                shuffle_buffer_size=shuffle_buffer_size,
                minibatch_size=minibatch_size_str,
                aug_cmd=image_augmentation_cmd,
                srand=iter + srand,
                scp_or_ark=scp_or_ark,
                multitask_egs_opts=multitask_egs_opts))

        # note: the thread waits on that process's completion.
        thread = common_lib.background_command(
            """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \
                    nnet3-train {parallel_train_opts} {cache_io_opts} \
                     {verbose_opt} --print-interval=10 \
                    --momentum={momentum} \
                    --max-param-change={max_param_change} \
                    --backstitch-training-scale={backstitch_training_scale} \
                    --l2-regularize-factor={l2_regularize_factor} \
                    --backstitch-training-interval={backstitch_training_interval} \
                    --srand={srand} \
                    {deriv_time_opts} "{raw_model}" "{egs_rspecifier}" \
                    {dir}/{next_iter}.{job}.raw""".format(
                        command=run_opts.command,
                        train_queue_opt=run_opts.train_queue_opt,
                        dir=dir, iter=iter,
                        next_iter=iter + 1, srand=iter + srand,
                        job=job,
                        parallel_train_opts=run_opts.parallel_train_opts,
                        cache_io_opts=cache_io_opts,
                        verbose_opt=verbose_opt,
                        momentum=momentum,
                        max_param_change=max_param_change,
                        l2_regularize_factor=1.0 / num_jobs,
                        backstitch_training_scale=backstitch_training_scale,
                        backstitch_training_interval=(
                            backstitch_training_interval),
                        deriv_time_opts=deriv_time_opts_str,
                        raw_model=raw_model_string,
                        egs_rspecifier=egs_rspecifier),
            require_zero_status=True)

        threads.append(thread)

    # Block until every training job of this iteration has finished.
    for thread in threads:
        thread.join()
def compute_preconditioning_matrix(
    dir,
    egs_dir,
    num_lda_jobs,
    run_opts,
    max_lda_jobs=None,
    rand_prune=4.0,
    lda_opts=None,
    use_multitask_egs=False,
):
    """Accumulate LDA-style statistics over the egs and write `lda.mat`.

    The steps, in order, are:

    1. run `nnet3-acc-lda-stats` as jobs JOB=1..num_lda_jobs on
       `{dir}/init.raw`, each writing `{dir}/JOB.lda_stats`;
    2. sum the per-job statistics into `{dir}/lda_stats` with
       `sum-lda-accs`;
    3. delete the per-job statistics files;
    4. run `nnet-get-feature-transform` to write `{dir}/lda.mat`;
    5. call `common_lib.force_symlink("../lda.mat", "{dir}/configs/lda.mat")`.

    Args:
        dir: experiment directory; must contain `init.raw`, and receives
            the logs, statistics and `lda.mat`.
        egs_dir: directory holding `egs.JOB.ark` (or `egs.JOB.scp` when
            `use_multitask_egs` is true).
        num_lda_jobs: number of statistics-accumulation jobs requested.
        run_opts: object whose `command` attribute is the job-launching
            command prefix.
        max_lda_jobs: optional upper bound applied to `num_lda_jobs`.
        rand_prune: value passed as `--rand-prune` to
            `nnet3-acc-lda-stats`.
        lda_opts: extra options string for `nnet-get-feature-transform`;
            `None` means no extra options.
        use_multitask_egs: selects scp instead of ark input and is
            forwarded to `common_train_lib.get_multitask_egs_opts`.

    Raises:
        OSError: re-raised (after logging) if a per-job statistics file
            cannot be removed.
    """
    # Never launch more accumulation jobs than the caller allows.
    if max_lda_jobs is not None and num_lda_jobs > max_lda_jobs:
        num_lda_jobs = max_lda_jobs

    egs_opts = common_train_lib.get_multitask_egs_opts(
        egs_dir,
        egs_prefix="egs.",
        archive_index="JOB",
        use_multitask_egs=use_multitask_egs,
    )
    egs_format = "scp" if use_multitask_egs else "ark"

    rspecifier_template = (
        "ark:nnet3-copy-egs {multitask_egs_opts} "
        "{scp_or_ark}:{egs_dir}/egs.JOB.{scp_or_ark} ark:- |"
    )
    egs_rspecifier = rspecifier_template.format(
        multitask_egs_opts=egs_opts,
        scp_or_ark=egs_format,
        egs_dir=egs_dir,
    )

    # Write stats with the same format as stats for LDA.
    acc_stats_cmd = """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \
                nnet3-acc-lda-stats --rand-prune={rand_prune} \
                {dir}/init.raw "{egs_rspecifier}" \
                {dir}/JOB.lda_stats""".format(
        command=run_opts.command,
        num_lda_jobs=num_lda_jobs,
        dir=dir,
        egs_rspecifier=egs_rspecifier,
        rand_prune=rand_prune,
    )
    common_lib.execute_command(acc_stats_cmd)

    # The command above generates dir/{1..num_lda_jobs}.lda_stats.
    stats_path_pattern = "{0}/{1}.lda_stats"
    per_job_stats = [
        stats_path_pattern.format(dir, job_id)
        for job_id in range(1, num_lda_jobs + 1)
    ]

    sum_stats_cmd = """{command} {dir}/log/sum_transform_stats.log \
                sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format(
        command=run_opts.command,
        dir=dir,
        lda_stat_files=" ".join(per_job_stats),
    )
    common_lib.execute_command(sum_stats_cmd)

    # The per-job files are no longer needed once they have been summed.
    for stats_path in per_job_stats:
        try:
            os.remove(stats_path)
        except OSError:
            logger.error("There was error while trying to remove "
                         "lda stat files.")
            raise

    # This computes a fixed affine transform in the way described in
    # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled
    # variant of an LDA transform but without dimensionality reduction.
    transform_opts = "" if lda_opts is None else lda_opts
    get_transform_cmd = """{command} {dir}/log/get_transform.log \
                nnet-get-feature-transform {lda_opts} {dir}/lda.mat \
                {dir}/lda_stats""".format(
        command=run_opts.command,
        dir=dir,
        lda_opts=transform_opts,
    )
    common_lib.execute_command(get_transform_cmd)

    common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir))
def combine_models(dir, num_iters, models_to_combine,
                   num_chunk_per_minibatch_str,
                   egs_dir, leaky_hmm_coefficient, l2_regularize,
                   xent_regularize, run_opts,
                   max_objective_evaluations=30,
                   use_multitask_egs=False):
    """Combine several iterations' models into `{dir}/final.mdl`.

    In the nnet3 setup, the logic for doing averaging of subsets of the
    models in the case where there are too many models to reliably estimate
    interpolation factors (max_models_combine) is moved into the
    nnet3-combine.

    The model of iteration `num_iters` is always included.  Iterations
    whose `{dir}/<iter>.mdl` file does not exist are skipped with a
    warning.  After combination, `compute_train_cv_probabilities` is
    called for the 'final' model.

    Args:
        dir: experiment directory holding `<iter>.mdl`, `den.fst` and
            `log/`; `final.mdl` is written here.
        num_iters: index of the last iteration; `{dir}/{num_iters}.mdl` is
            also the model into which `nnet3-am-copy` sets the combined
            raw nnet.
        models_to_combine: iterable of iteration indices to combine.  It
            is not modified.
        num_chunk_per_minibatch_str: value passed as `--minibatch-size` to
            `nnet3-chain-merge-egs`.
        egs_dir: directory holding `combine.cegs` (or `combine.scp` when
            `use_multitask_egs` is true).
        leaky_hmm_coefficient: passed as `--leaky-hmm-coefficient` and
            forwarded to `compute_train_cv_probabilities`.
        l2_regularize: passed as `--l2-regularize` and forwarded to
            `compute_train_cv_probabilities`.
        xent_regularize: only forwarded to
            `compute_train_cv_probabilities`.
        run_opts: object providing `command`, `combine_queue_opt` and
            `combine_gpu_opt`.
        max_objective_evaluations: passed as
            `--max-objective-evaluations` to `nnet3-chain-combine`.
        use_multitask_egs: selects scp instead of ark input and is
            forwarded to `common_train_lib.get_multitask_egs_opts`.
    """
    logger.info("Combining {0} models.".format(models_to_combine))

    # Work on a copy so that the caller's collection is left untouched (and
    # so that any iterable, not only a set, is accepted).
    iters_to_combine = set(models_to_combine)
    iters_to_combine.add(num_iters)

    # Visit the iterations newest-first so that the freshest model goes
    # first.  This is important for systems that include batch
    # normalization -- it means that the freshest batch-norm stats are used.
    # Since the batch-norm stats are not technically parameters, they are
    # not combined in the combination code, they are just obtained from the
    # first model.
    raw_model_strings = []
    for model_iter in sorted(iters_to_combine, reverse=True):
        model_file = '{0}/{1}.mdl'.format(dir, model_iter)
        if os.path.exists(model_file):
            # we used to copy them with nnet3-am-copy --raw=true, but now
            # the raw-model-reading code discards the other stuff itself.
            raw_model_strings.append(model_file)
        else:
            logger.warning("Model file {0} does not exist "
                           "(final combination)".format(model_file))

    scp_or_ark = "scp" if use_multitask_egs else "ark"
    egs_suffix = ".scp" if use_multitask_egs else ".cegs"

    multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
        egs_dir,
        egs_prefix="combine.",
        use_multitask_egs=use_multitask_egs)

    # The pipe before nnet3-am-copy must reach the command as a literal
    # backslash followed by '|'; it is written as an escaped backslash
    # because a bare backslash-pipe is not a valid Python escape sequence.
    common_lib.execute_command(
        """{command} {combine_queue_opt} {dir}/log/combine.log \
                nnet3-chain-combine \
                --max-objective-evaluations={max_objective_evaluations} \
                --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \
                --verbose=3 {combine_gpu_opt} {dir}/den.fst {raw_models} \
                "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/combine{egs_suffix} ark:- | \
                    nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \
                    ark:- ark:- |" - \\| \
                nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \
                {dir}/final.mdl""".format(
            command=run_opts.command,
            combine_queue_opt=run_opts.combine_queue_opt,
            combine_gpu_opt=run_opts.combine_gpu_opt,
            max_objective_evaluations=max_objective_evaluations,
            l2=l2_regularize, leaky=leaky_hmm_coefficient,
            dir=dir, raw_models=" ".join(raw_model_strings),
            num_chunk_per_mb=num_chunk_per_minibatch_str,
            num_iters=num_iters,
            egs_dir=egs_dir,
            multitask_egs_opts=multitask_egs_opts,
            scp_or_ark=scp_or_ark, egs_suffix=egs_suffix))

    # Compute the probability of the final, combined model with
    # the same subset we used for the previous compute_probs, as the
    # different subsets will lead to different probs.
    # NOTE(review): this relies on the compute_train_cv_probabilities in
    # scope accepting l2_regularize / xent_regularize /
    # leaky_hmm_coefficient -- confirm against the definition actually used.
    compute_train_cv_probabilities(
        dir=dir, iter='final', egs_dir=egs_dir,
        l2_regularize=l2_regularize, xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient,
        run_opts=run_opts,
        use_multitask_egs=use_multitask_egs)
def train_new_models(dir, iter, srand, num_jobs,
                     num_archives_processed, num_archives,
                     raw_model_string, egs_dir,
                     apply_deriv_weights,
                     min_deriv_time, max_deriv_time_relative,
                     l2_regularize, xent_regularize, leaky_hmm_coefficient,
                     momentum, max_param_change,
                     shuffle_buffer_size, num_chunk_per_minibatch_str,
                     frame_subsampling_factor, run_opts, train_opts,
                     backstitch_training_scale=0.0,
                     backstitch_training_interval=1,
                     use_multitask_egs=False):
    """Launch `num_jobs` nnet3-chain-train jobs for one iteration and wait.

    Called from train_one_iteration(); job number `job` writes
    `{dir}/{iter + 1}.{job}.raw`, i.e. files like
    exp/tdnn_a/24.{1,2,3,..<num_jobs>}.raw

    Each job is started separately as a background command, because the
    archive index and the frame shift differ from job to job; the function
    returns only after every started job's thread has been joined.

    use_multitask_egs : True, if different examples used to train multiple
                        tasks or outputs, e.g. multilingual training.
                        multilingual egs can be generated using get_egs.sh
                        and
                        steps/nnet3/multilingual/allocate_multilingual_examples.py,
                        those are the top-level scripts.
    """
    optimization_opts = []
    if min_deriv_time is not None:
        optimization_opts.append(
            "--optimization.min-deriv-time={0}".format(min_deriv_time))
    if max_deriv_time_relative is not None:
        optimization_opts.append(
            "--optimization.max-deriv-time-relative={0}".format(
                int(max_deriv_time_relative)))
    deriv_time_opts_str = " ".join(optimization_opts)

    # The GPU timing info is only printed with --verbose=1, which slows the
    # computation down slightly, so it is requested only on every 20th
    # iteration.  Iteration 0 is excluded as well: it uses a smaller than
    # normal minibatch size, and people may get confused thinking it's
    # slower because of the verbose option.
    verbose_opt = ""
    if iter % 20 == 0 and iter > 0:
        verbose_opt = "--verbose=1"

    egs_format = "scp" if use_multitask_egs else "ark"

    # Every job of an iteration after the first reads the same cache file.
    read_cache_opt = ""
    if iter > 0:
        read_cache_opt = "--read-cache={dir}/cache.{iter}".format(
            dir=dir, iter=iter)

    train_cmd_template = """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \
                    nnet3-chain-train {parallel_train_opts} {verbose_opt} \
                    --apply-deriv-weights={app_deriv_wts} \
                    --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \
                    {cache_io_opts} --xent-regularize={xent_reg} \
                    {deriv_time_opts} \
                    --print-interval=10 --momentum={momentum} \
                    --max-param-change={max_param_change} \
                    --backstitch-training-scale={backstitch_training_scale} \
                    --backstitch-training-interval={backstitch_training_interval} \
                    --l2-regularize-factor={l2_regularize_factor} {train_opts} \
                    --srand={srand} \
                    "{raw_model}" {dir}/den.fst \
                    "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} \
                        --frame-shift={fr_shft} \
                        {scp_or_ark}:{egs_dir}/cegs.{archive_index}.{scp_or_ark} ark:- | \
                        nnet3-chain-shuffle-egs --buffer-size={buf_size} \
                        --srand={srand} ark:- ark:- | nnet3-chain-merge-egs \
                        --minibatch-size={num_chunk_per_mb} ark:- ark:- |" \
                    {dir}/{next_iter}.{job}.raw"""

    def launch_job(job):
        """Start the training command for one job; return its thread."""
        # Zero-based position from which the other indexes are derived.
        position = num_archives_processed + job - 1
        # 1-based index of the archive this job trains on.
        archive_index = (position % num_archives) + 1
        # previous : frame_shift = (k/num_archives) % frame_subsampling_factor
        frame_shift = ((archive_index + position // num_archives)
                       % frame_subsampling_factor)

        multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
            egs_dir,
            egs_prefix="cegs.",
            archive_index=archive_index,
            use_multitask_egs=use_multitask_egs)

        # Only job 1 writes the cache for the next iteration.
        cache_io_opts = read_cache_opt
        if job == 1:
            cache_io_opts += " --write-cache={0}/cache.{1}".format(
                dir, iter + 1)

        train_cmd = train_cmd_template.format(
            command=run_opts.command,
            train_queue_opt=run_opts.train_queue_opt,
            dir=dir, iter=iter, srand=iter + srand,
            next_iter=iter + 1, job=job,
            deriv_time_opts=deriv_time_opts_str,
            app_deriv_wts=apply_deriv_weights,
            fr_shft=frame_shift, l2=l2_regularize,
            train_opts=train_opts,
            xent_reg=xent_regularize, leaky=leaky_hmm_coefficient,
            cache_io_opts=cache_io_opts,
            parallel_train_opts=run_opts.parallel_train_opts,
            verbose_opt=verbose_opt,
            momentum=momentum, max_param_change=max_param_change,
            backstitch_training_scale=backstitch_training_scale,
            backstitch_training_interval=backstitch_training_interval,
            l2_regularize_factor=1.0 / num_jobs,
            raw_model=raw_model_string,
            egs_dir=egs_dir, archive_index=archive_index,
            buf_size=shuffle_buffer_size,
            num_chunk_per_mb=num_chunk_per_minibatch_str,
            multitask_egs_opts=multitask_egs_opts,
            scp_or_ark=egs_format)
        return common_lib.background_command(train_cmd,
                                             require_zero_status=True)

    job_threads = [launch_job(job) for job in range(1, num_jobs + 1)]

    # Block until every training job has finished.
    for job_thread in job_threads:
        job_thread.join()