Example #1
0
def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, shrinkage_value,
                        num_chunk_per_minibatch,
                        num_hidden_layers, add_layers_period,
                        left_context, right_context,
                        apply_deriv_weights, min_deriv_time,
                        max_deriv_time,
                        l2_regularize, xent_regularize,
                        leaky_hmm_coefficient,
                        momentum, max_param_change, shuffle_buffer_size,
                        frame_subsampling_factor, truncate_deriv_weights,
                        run_opts,
                        dropout_edit_string="",
                        background_process_handler=None):
    """ Called from steps/nnet3/chain/train.py for one iteration for
    neural network training with LF-MMI objective

    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    logger.info("Training neural net (pass {0})".format(iter))

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        left_context=left_context, right_context=right_context,
        l2_regularize=l2_regularize, xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts,
        background_process_handler=background_process_handler)

    if iter > 0:
        # Runs in the background
        compute_progress(dir, iter, run_opts,
                         background_process_handler=background_process_handler)

    if (iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period)
            and iter % add_layers_period == 0):

        # if we've just added a new hidden layer, don't do averaging but take
        # the best.
        do_average = False

        cur_num_hidden_layers = 1 + iter // add_layers_period
        config_file = "{0}/configs/layer{1}.config".format(
            dir, cur_num_hidden_layers)
        raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={lr} "
                            "{dir}/{iter}.mdl - | nnet3-init --srand={srand} "
                            "- {config} - |".format(lr=learning_rate, dir=dir,
                                                    iter=iter,
                                                    srand=iter + srand,
                                                    config=config_file))
        cache_io_opts = ""
    else:
        do_average = True
        if iter == 0:
            # on iteration 0, pick the best, don't average.
            do_average = False
        raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                            "{1}/{2}.mdl - |".format(learning_rate, dir, iter))
        cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir=dir,
                                                                 iter=iter)

    if do_average:
        cur_num_chunk_per_minibatch = num_chunk_per_minibatch
        cur_max_param_change = max_param_change
    else:
        # on iteration zero or when we just added a layer, use a smaller
        # minibatch size (and we will later choose the output of just one of
        # the jobs): the model-averaging isn't always helpful when the model is
        # changing too fast (i.e. it can worsen the objective function), and
        # the smaller minibatch size will help to keep the update stable.
        cur_num_chunk_per_minibatch = num_chunk_per_minibatch // 2
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    raw_model_string = '{0} {1}'.format(raw_model_string, dropout_edit_string)

    shrink_info_str = ''
    if shrinkage_value != 1.0:
        shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value)

    logger.info("On iteration {0}, learning rate is {1}"
                "{shrink_info}.".format(
                    iter, learning_rate,
                    shrink_info=shrink_info_str))

    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string,
                     egs_dir=egs_dir,
                     left_context=left_context, right_context=right_context,
                     apply_deriv_weights=apply_deriv_weights,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time=max_deriv_time,
                     l2_regularize=l2_regularize,
                     xent_regularize=xent_regularize,
                     leaky_hmm_coefficient=leaky_hmm_coefficient,
                     momentum=momentum,
                     max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     num_chunk_per_minibatch=cur_num_chunk_per_minibatch,
                     frame_subsampling_factor=frame_subsampling_factor,
                     truncate_deriv_weights=truncate_deriv_weights,
                     cache_io_opts=cache_io_opts, run_opts=run_opts)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
         num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            shrink=shrinkage_value)

    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts,
            shrink=shrinkage_value)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        raise Exception("Error while trying to delete the raw models")

    new_model = "{0}/{1}.mdl".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))
Example #2
0
def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, minibatch_size_str,
                        momentum, max_param_change, shuffle_buffer_size,
                        run_opts, image_augmentation_opts=None,
                        frames_per_eg=-1,
                        min_deriv_time=None, max_deriv_time_relative=None,
                        shrinkage_value=1.0, dropout_edit_string="",
                        get_raw_nnet_from_am=True,
                        use_multitask_egs=False,
                        backstitch_training_scale=0.0, backstitch_training_interval=1):
    """ Called from steps/nnet3/train_*.py scripts for one iteration of neural
    network training

    Selected args:
        frames_per_eg: The default value -1 implies chunk-level training, which
            is particularly applicable to RNN training. If it is > 0, it
            implies frame-level training, which is applicable to DNN training;
            in that case each parallel SGE job created uses a different frame
            numbered 0..frames_per_eg-1.
        shrinkage_value: If value is 1.0, no shrinkage is done; otherwise
            parameter values are scaled by this value.
        get_raw_nnet_from_am: If True, the network is read and stored as an
            acoustic model, i.e. together with the transition model (e.g.
            10.mdl); if False, it is read and stored as a raw network (e.g.
            10.raw).
    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    logger.info("Training neural net (pass {0})".format(iter))

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        run_opts=run_opts,
        get_raw_nnet_from_am=get_raw_nnet_from_am,
        use_multitask_egs=use_multitask_egs)

    if iter > 0:
        # Runs in the background
        compute_progress(dir=dir, iter=iter, egs_dir=egs_dir,
                         run_opts=run_opts,
                         get_raw_nnet_from_am=get_raw_nnet_from_am,
                         use_multitask_egs=use_multitask_egs)

    do_average = (iter > 0)

    if get_raw_nnet_from_am:
        raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                            "--scale={1} {2}/{3}.mdl - |".format(
                                learning_rate, shrinkage_value,
                                dir, iter))

    else:
        raw_model_string = ("nnet3-copy --learning-rate={lr} --scale={s} "
                            "{dir}/{iter}.raw - |".format(
                                lr=learning_rate, s=shrinkage_value,
                                dir=dir, iter=iter))

    raw_model_string = raw_model_string + dropout_edit_string

    if do_average:
        cur_minibatch_size_str = minibatch_size_str
        cur_max_param_change = max_param_change
    else:
        # on iteration zero, use a smaller minibatch size (and we will later
        # choose the output of just one of the jobs): the model-averaging isn't
        # always helpful when the model is changing too fast (i.e. it can worsen
        # the objective function), and the smaller minibatch size will help to
        # keep the update stable.
        cur_minibatch_size_str = common_train_lib.halve_minibatch_size_str(minibatch_size_str)
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    shrink_info_str = ''
    if shrinkage_value != 1.0:
        shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value)

    logger.info("On iteration {0}, learning rate is {1}"
                "{shrink_info}.".format(
                    iter, learning_rate,
                    shrink_info=shrink_info_str))

    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string, egs_dir=egs_dir,
                     momentum=momentum, max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     minibatch_size_str=cur_minibatch_size_str,
                     run_opts=run_opts,
                     frames_per_eg=frames_per_eg,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time_relative=max_deriv_time_relative,
                     image_augmentation_opts=image_augmentation_opts,
                     use_multitask_egs=use_multitask_egs,
                     backstitch_training_scale=backstitch_training_scale,
                     backstitch_training_interval=backstitch_training_interval)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
         num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am)

    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        logger.error("Error while trying to delete the raw models")
        raise

    if get_raw_nnet_from_am:
        new_model = "{0}/{1}.mdl".format(dir, iter + 1)
    else:
        new_model = "{0}/{1}.raw".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))
Example #3
0
def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, shrinkage_value,
                        num_chunk_per_minibatch_str,
                        apply_deriv_weights, min_deriv_time,
                        max_deriv_time_relative,
                        l2_regularize, xent_regularize,
                        leaky_hmm_coefficient,
                        momentum, max_param_change, shuffle_buffer_size,
                        frame_subsampling_factor,
                        run_opts, dropout_edit_string="", train_opts="",
                        backstitch_training_scale=0.0, backstitch_training_interval=1,
                        use_multitask_egs=False):
    """ Called from steps/nnet3/chain/train.py for one iteration for
    neural network training with LF-MMI objective

    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        l2_regularize=l2_regularize, xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts,
        use_multitask_egs=use_multitask_egs)

    if iter > 0:
        # Runs in the background
        compute_progress(dir, iter, run_opts)

    do_average = (iter > 0)

    raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                        "--scale={1} {2}/{3}.mdl - |".format(
                            learning_rate, shrinkage_value, dir, iter))

    if do_average:
        cur_num_chunk_per_minibatch_str = num_chunk_per_minibatch_str
        cur_max_param_change = max_param_change
    else:
        # on iteration zero, use a smaller minibatch size (and we will later
        # choose the output of just one of the jobs): the model-averaging isn't
        # always helpful when the model is changing too fast (i.e. it can worsen
        # the objective function), and the smaller minibatch size will help to
        # keep the update stable.
        cur_num_chunk_per_minibatch_str = common_train_lib.halve_minibatch_size_str(
            num_chunk_per_minibatch_str)
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    raw_model_string = raw_model_string + dropout_edit_string
    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string,
                     egs_dir=egs_dir,
                     apply_deriv_weights=apply_deriv_weights,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time_relative=max_deriv_time_relative,
                     l2_regularize=l2_regularize,
                     xent_regularize=xent_regularize,
                     leaky_hmm_coefficient=leaky_hmm_coefficient,
                     momentum=momentum,
                     max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str,
                     frame_subsampling_factor=frame_subsampling_factor,
                     run_opts=run_opts, train_opts=train_opts,
                     # linearly increase backstitch_training_scale during the
                     # first few iterations (hard-coded as 15)
                     backstitch_training_scale=(backstitch_training_scale *
                         iter / 15 if iter < 15 else backstitch_training_scale),
                     backstitch_training_interval=backstitch_training_interval,
                     use_multitask_egs=use_multitask_egs)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
         num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts)

    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        raise Exception("Error while trying to delete the raw models")

    new_model = "{0}/{1}.mdl".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))
Example #4
0
def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, minibatch_size_str,
                        momentum, max_param_change, shuffle_buffer_size,
                        run_opts, image_augmentation_opts=None,
                        frames_per_eg=-1,
                        min_deriv_time=None, max_deriv_time_relative=None,
                        shrinkage_value=1.0, dropout_edit_string="",
                        get_raw_nnet_from_am=True,
                        use_multitask_egs=False,
                        backstitch_training_scale=0.0, backstitch_training_interval=1,
                        compute_per_dim_accuracy=False):
    """ Called from steps/nnet3/train_*.py scripts for one iteration of neural
    network training

    Selected args:
        frames_per_eg: The default value -1 implies chunk-level training, which
            is particularly applicable to RNN training. If it is > 0, it
            implies frame-level training, which is applicable to DNN training;
            in that case each parallel SGE job created uses a different frame
            numbered 0..frames_per_eg-1.
        shrinkage_value: If value is 1.0, no shrinkage is done; otherwise
            parameter values are scaled by this value.
        get_raw_nnet_from_am: If True, the network is read and stored as an
            acoustic model, i.e. together with the transition model (e.g.
            10.mdl); if False, it is read and stored as a raw network (e.g.
            10.raw).
    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    logger.info("Training neural net (pass {0})".format(iter))

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        run_opts=run_opts,
        get_raw_nnet_from_am=get_raw_nnet_from_am,
        use_multitask_egs=use_multitask_egs,
        compute_per_dim_accuracy=compute_per_dim_accuracy)

    if iter > 0:
        # Runs in the background
        compute_progress(dir=dir, iter=iter, egs_dir=egs_dir,
                         run_opts=run_opts,
                         get_raw_nnet_from_am=get_raw_nnet_from_am,
                         use_multitask_egs=use_multitask_egs)

    do_average = (iter > 0)

    if get_raw_nnet_from_am:
        raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                            "--scale={1} {2}/{3}.mdl - |".format(
                                learning_rate, shrinkage_value,
                                dir, iter))

    else:
        raw_model_string = ("nnet3-copy --learning-rate={lr} --scale={s} "
                            "{dir}/{iter}.raw - |".format(
                                lr=learning_rate, s=shrinkage_value,
                                dir=dir, iter=iter))

    raw_model_string = raw_model_string + dropout_edit_string

    if do_average:
        cur_minibatch_size_str = minibatch_size_str
        cur_max_param_change = max_param_change
    else:
        # on iteration zero, use a smaller minibatch size (and we will later
        # choose the output of just one of the jobs): the model-averaging isn't
        # always helpful when the model is changing too fast (i.e. it can worsen
        # the objective function), and the smaller minibatch size will help to
        # keep the update stable.
        cur_minibatch_size_str = common_train_lib.halve_minibatch_size_str(minibatch_size_str)
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    shrink_info_str = ''
    if shrinkage_value != 1.0:
        shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value)

    logger.info("On iteration {0}, learning rate is {1}"
                "{shrink_info}.".format(
                    iter, learning_rate,
                    shrink_info=shrink_info_str))

    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string, egs_dir=egs_dir,
                     momentum=momentum, max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     minibatch_size_str=cur_minibatch_size_str,
                     run_opts=run_opts,
                     frames_per_eg=frames_per_eg,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time_relative=max_deriv_time_relative,
                     image_augmentation_opts=image_augmentation_opts,
                     use_multitask_egs=use_multitask_egs,
                     backstitch_training_scale=backstitch_training_scale,
                     backstitch_training_interval=backstitch_training_interval)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
         num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am)

    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        logger.error("Error while trying to delete the raw models")
        raise

    if get_raw_nnet_from_am:
        new_model = "{0}/{1}.mdl".format(dir, iter + 1)
    else:
        new_model = "{0}/{1}.raw".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))
Example #5
0
def train_one_iteration(dir,
                        iter,
                        srand,
                        egs_dir,
                        num_jobs,
                        num_archives_processed,
                        num_archives,
                        learning_rate,
                        minibatch_size,
                        num_hidden_layers,
                        add_layers_period,
                        left_context,
                        right_context,
                        momentum,
                        max_param_change,
                        shuffle_buffer_size,
                        run_opts,
                        cv_minibatch_size=256,
                        frames_per_eg=-1,
                        min_deriv_time=None,
                        max_deriv_time=None,
                        shrinkage_value=1.0,
                        dropout_edit_string="",
                        get_raw_nnet_from_am=True,
                        background_process_handler=None):
    """ Called from steps/nnet3/train_*.py scripts for one iteration of neural
    network training

    Args:
        frames_per_eg: The default value -1 implies chunk-level training, which
            is particularly applicable to RNN training. If it is > 0, it
            implies frame-level training, which is applicable to DNN training;
            in that case each parallel SGE job created uses a different frame
            numbered 0..frames_per_eg-1.
        min_deriv_time: Applicable for RNN training. A default value of None
            implies a min_deriv_time of 0 is used. During RNN training, its
            value is set to chunk_width - num_bptt_steps in the training
            script.
        shrinkage_value: If value is 1.0, no shrinkage is done; otherwise
            parameter values are scaled by this value.
        get_raw_nnet_from_am: If True, the network is read and stored as an
            acoustic model, i.e. together with the transition model (e.g.
            10.mdl); if False, it is read and stored as a raw network (e.g.
            10.raw).
    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    logger.info("Training neural net (pass {0})".format(iter))

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir,
        iter=iter,
        egs_dir=egs_dir,
        left_context=left_context,
        right_context=right_context,
        run_opts=run_opts,
        mb_size=cv_minibatch_size,
        get_raw_nnet_from_am=get_raw_nnet_from_am,
        wait=False,
        background_process_handler=background_process_handler)

    if iter > 0:
        # Runs in the background
        compute_progress(dir=dir,
                         iter=iter,
                         egs_dir=egs_dir,
                         left_context=left_context,
                         right_context=right_context,
                         run_opts=run_opts,
                         mb_size=cv_minibatch_size,
                         wait=False,
                         get_raw_nnet_from_am=get_raw_nnet_from_am,
                         background_process_handler=background_process_handler)

    # an option for writing cache (storing pairs of nnet-computations
    # and computation-requests) during training.
    cache_read_opt = ""
    if (iter > 0 and (iter <= (num_hidden_layers - 1) * add_layers_period)
            and iter % add_layers_period == 0):

        # if we've just added a new hidden layer, don't do averaging but take
        # the best.
        do_average = False

        cur_num_hidden_layers = 1 + iter // add_layers_period
        config_file = "{0}/configs/layer{1}.config".format(
            dir, cur_num_hidden_layers)
        if get_raw_nnet_from_am:
            raw_model_string = ("nnet3-am-copy --raw=true "
                                "--learning-rate={lr} {dir}/{iter}.mdl - | "
                                "nnet3-init --srand={srand} - "
                                "{config} - |".format(lr=learning_rate,
                                                      dir=dir,
                                                      iter=iter,
                                                      srand=iter + srand,
                                                      config=config_file))
        else:
            raw_model_string = ("nnet3-copy --learning-rate={lr} "
                                "{dir}/{iter}.raw - | "
                                "nnet3-init --srand={srand} - "
                                "{config} - |".format(lr=learning_rate,
                                                      dir=dir,
                                                      iter=iter,
                                                      srand=iter + srand,
                                                      config=config_file))
    else:
        do_average = True
        if iter == 0:
            # on iteration 0, pick the best, don't average.
            do_average = False
        else:
            cache_read_opt = "--read-cache={dir}/cache.{iter}".format(
                dir=dir, iter=iter)
        if get_raw_nnet_from_am:
            raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                                "{1}/{2}.mdl - |".format(
                                    learning_rate, dir, iter))
        else:
            raw_model_string = ("nnet3-copy --learning-rate={lr} "
                                "{dir}/{iter}.raw - |".format(lr=learning_rate,
                                                              dir=dir,
                                                              iter=iter))

    raw_model_string = raw_model_string + dropout_edit_string

    if do_average:
        cur_minibatch_size = minibatch_size
        cur_max_param_change = max_param_change
    else:
        # on iteration zero or when we just added a layer, use a smaller
        # minibatch size (and we will later choose the output of just one of
        # the jobs): the model-averaging isn't always helpful when the model is
        # changing too fast (i.e. it can worsen the objective function), and
        # the smaller minibatch size will help to keep the update stable.
        cur_minibatch_size = minibatch_size // 2
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    try:
        os.remove("{0}/.error".format(dir))
    except OSError:
        pass

    shrink_info_str = ''
    if shrinkage_value != 1.0:
        shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value)

    logger.info("On iteration {0}, learning rate is {1}"
                "{shrink_info}.".format(iter,
                                        learning_rate,
                                        shrink_info=shrink_info_str))

    train_new_models(dir=dir,
                     iter=iter,
                     srand=srand,
                     num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string,
                     egs_dir=egs_dir,
                     left_context=left_context,
                     right_context=right_context,
                     momentum=momentum,
                     max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     minibatch_size=cur_minibatch_size,
                     cache_read_opt=cache_read_opt,
                     run_opts=run_opts,
                     frames_per_eg=frames_per_eg,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time=max_deriv_time)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir,
            iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am,
            shrink=shrinkage_value)

    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir,
            iter=iter,
            best_model_index=best_model,
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am,
            shrink=shrinkage_value)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        logger.error("Error while trying to delete the raw models")
        raise

    if get_raw_nnet_from_am:
        new_model = "{0}/{1}.mdl".format(dir, iter + 1)
    else:
        new_model = "{0}/{1}.raw".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))
Example #6
0
def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, minibatch_size,
                        num_hidden_layers, add_layers_period,
                        left_context, right_context,
                        momentum, max_param_change, shuffle_buffer_size,
                        run_opts,
                        cv_minibatch_size=256, frames_per_eg=-1,
                        min_deriv_time=None, max_deriv_time=None,
                        shrinkage_value=1.0,
                        get_raw_nnet_from_am=True,
                        background_process_handler=None):
    """ Called from steps/nnet3/train_*.py scripts for one iteration of neural
    network training

    Args:
        frames_per_eg: The default value -1 implies chunk-level training, which
            is particularly applicable to RNN training. If it is > 0, it
            implies frame-level training, which is applicable to DNN training;
            in that case each parallel SGE job created uses a different frame
            numbered 0..frames_per_eg-1.
        min_deriv_time: Applicable for RNN training. A default value of None
            implies a min_deriv_time of 0 is used. During RNN training, its
            value is set to chunk_width - num_bptt_steps in the training
            script.
        shrinkage_value: If value is 1.0, no shrinkage is done; otherwise
            parameter values are scaled by this value.
        get_raw_nnet_from_am: If True, the network is read and stored as an
            acoustic model, i.e. together with the transition model (e.g.
            10.mdl); if False, it is read and stored as a raw network (e.g.
            10.raw).
    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    logger.info("Training neural net (pass {0})".format(iter))

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError) as e:
            raise Exception("Exception while reading the random seed "
                            "for training: {0}".format(e.str()))
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        left_context=left_context, right_context=right_context,
        run_opts=run_opts,
        mb_size=cv_minibatch_size,
        get_raw_nnet_from_am=get_raw_nnet_from_am, wait=False,
        background_process_handler=background_process_handler)

    if iter > 0:
        # Runs in the background
        compute_progress(dir=dir, iter=iter, egs_dir=egs_dir,
                         left_context=left_context,
                         right_context=right_context,
                         run_opts=run_opts,
                         mb_size=cv_minibatch_size, wait=False,
                         get_raw_nnet_from_am=get_raw_nnet_from_am,
                         background_process_handler=background_process_handler)

    # an option for writing cache (storing pairs of nnet-computations
    # and computation-requests) during training.
    cache_read_opt = ""
    if (iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period)
            and iter % add_layers_period == 0):

        # if we've just added a new hidden layer, don't do averaging but take
        # the best.
        do_average = False

        cur_num_hidden_layers = 1 + iter // add_layers_period
        config_file = "{0}/configs/layer{1}.config".format(
            dir, cur_num_hidden_layers)
        if get_raw_nnet_from_am:
            raw_model_string = ("nnet3-am-copy --raw=true "
                                "--learning-rate={lr} {dir}/{iter}.mdl - | "
                                "nnet3-init --srand={srand} - "
                                "{config} - |".format(
                                    lr=learning_rate, dir=dir, iter=iter,
                                    srand=iter + srand, config=config_file))
        else:
            raw_model_string = ("nnet3-copy --learning-rate={lr} "
                                "{dir}/{iter}.raw - | "
                                "nnet3-init --srand={srand} - "
                                "{config} - |".format(
                                    lr=learning_rate, dir=dir, iter=iter,
                                    srand=iter + srand, config=config_file))
    else:
        do_average = True
        if iter == 0:
            # on iteration 0, pick the best, don't average.
            do_average = False
        else:
            cache_read_opt = "--read-cache={dir}/cache.{iter}".format(
                dir=dir, iter=iter)
        if get_raw_nnet_from_am:
            raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                                "{1}/{2}.mdl - |".format(learning_rate,
                                                         dir, iter))
        else:
            raw_model_string = ("nnet3-copy --learning-rate={lr} "
                                "{dir}/{iter}.raw - |".format(
                                    lr=learning_rate, dir=dir, iter=iter))

    if do_average:
        cur_minibatch_size = minibatch_size
        cur_max_param_change = max_param_change
    else:
        # on iteration zero or when we just added a layer, use a smaller
        # minibatch size (and we will later choose the output of just one of
        # the jobs): the model-averaging isn't always helpful when the model is
        # changing too fast (i.e. it can worsen the objective function), and
        # the smaller minibatch size will help to keep the update stable.
        cur_minibatch_size = minibatch_size // 2
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    try:
        os.remove("{0}/.error".format(dir))
    except OSError:
        pass

    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string, egs_dir=egs_dir,
                     left_context=left_context, right_context=right_context,
                     momentum=momentum, max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     minibatch_size=cur_minibatch_size,
                     cache_read_opt=cache_read_opt, run_opts=run_opts,
                     frames_per_eg=frames_per_eg,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time=max_deriv_time)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
         num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am,
            shrink=shrinkage_value)

    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am,
            shrink=shrinkage_value)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        raise Exception("Error while trying to delete the raw models")

    if get_raw_nnet_from_am:
        new_model = "{0}/{1}.mdl".format(dir, iter + 1)
    else:
        new_model = "{0}/{1}.raw".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))
Example #7
0
def train_one_iteration(dir,
                        iter,
                        srand,
                        egs_dir,
                        num_jobs,
                        num_archives_processed,
                        num_archives,
                        learning_rate,
                        shrinkage_value,
                        num_chunk_per_minibatch_str,
                        apply_deriv_weights,
                        min_deriv_time,
                        max_deriv_time_relative,
                        l2_regularize,
                        xent_regularize,
                        leaky_hmm_coefficient,
                        momentum,
                        max_param_change,
                        shuffle_buffer_size,
                        frame_subsampling_factor,
                        run_opts,
                        dropout_edit_string="",
                        train_opts="",
                        chain_opts="",
                        backstitch_training_scale=0.0,
                        backstitch_training_interval=1,
                        use_multitask_egs=False):
    """ Called from steps/nnet3/chain/train.py for one iteration for
    neural network training with LF-MMI objective

    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(dir=dir,
                                   iter=iter,
                                   egs_dir=egs_dir,
                                   l2_regularize=l2_regularize,
                                   xent_regularize=xent_regularize,
                                   leaky_hmm_coefficient=leaky_hmm_coefficient,
                                   run_opts=run_opts,
                                   use_multitask_egs=use_multitask_egs,
                                   chain_opts=chain_opts)

    if iter > 0:
        # Runs in the background
        compute_progress(dir, iter, run_opts)

    do_average = (iter > 0)

    raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                        "--scale={1} {2}/{3}.mdl - |".format(
                            learning_rate, shrinkage_value, dir, iter))

    if do_average:
        cur_num_chunk_per_minibatch_str = num_chunk_per_minibatch_str
        cur_max_param_change = max_param_change
    else:
        # on iteration zero, use a smaller minibatch size (and we will later
        # choose the output of just one of the jobs): the model-averaging isn't
        # always helpful when the model is changing too fast (i.e. it can worsen
        # the objective function), and the smaller minibatch size will help to
        # keep the update stable.
        cur_num_chunk_per_minibatch_str = common_train_lib.halve_minibatch_size_str(
            num_chunk_per_minibatch_str)
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    raw_model_string = raw_model_string + dropout_edit_string
    train_new_models(
        dir=dir,
        iter=iter,
        srand=srand,
        num_jobs=num_jobs,
        num_archives_processed=num_archives_processed,
        num_archives=num_archives,
        raw_model_string=raw_model_string,
        egs_dir=egs_dir,
        apply_deriv_weights=apply_deriv_weights,
        min_deriv_time=min_deriv_time,
        max_deriv_time_relative=max_deriv_time_relative,
        l2_regularize=l2_regularize,
        xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient,
        momentum=momentum,
        max_param_change=cur_max_param_change,
        shuffle_buffer_size=shuffle_buffer_size,
        num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str,
        frame_subsampling_factor=frame_subsampling_factor,
        run_opts=run_opts,
        train_opts=train_opts,
        chain_opts=chain_opts,
        # linearly increase backstitch_training_scale during the
        # first few iterations (hard-coded as 15)
        backstitch_training_scale=(backstitch_training_scale * iter / 15 if
                                   iter < 15 else backstitch_training_scale),
        backstitch_training_interval=backstitch_training_interval,
        use_multitask_egs=use_multitask_egs)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir,
            iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts)

    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(dir=dir,
                                             iter=iter,
                                             best_model_index=best_model,
                                             run_opts=run_opts)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        raise Exception("Error while trying to delete the raw models")

    new_model = "{0}/{1}.mdl".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))
Example #8
0
def train_one_iteration(dir,
                        iter,
                        srand,
                        egs_dir,
                        num_jobs,
                        num_archives_processed,
                        num_archives,
                        learning_rate,
                        shrinkage_value,
                        num_chunk_per_minibatch,
                        num_hidden_layers,
                        add_layers_period,
                        left_context,
                        right_context,
                        apply_deriv_weights,
                        min_deriv_time,
                        max_deriv_time,
                        l2_regularize,
                        xent_regularize,
                        leaky_hmm_coefficient,
                        momentum,
                        max_param_change,
                        shuffle_buffer_size,
                        frame_subsampling_factor,
                        truncate_deriv_weights,
                        run_opts,
                        background_process_handler=None):
    """ Called from steps/nnet3/chain/train.py for one iteration for
    neural network training with LF-MMI objective

    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    logger.info("Training neural net (pass {0})".format(iter))

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError) as e:
            raise Exception("Exception while reading the random seed "
                            "for training: {0}".format(e.str()))
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir,
        iter=iter,
        egs_dir=egs_dir,
        left_context=left_context,
        right_context=right_context,
        l2_regularize=l2_regularize,
        xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient,
        run_opts=run_opts,
        background_process_handler=background_process_handler)

    if iter > 0:
        # Runs in the background
        compute_progress(dir,
                         iter,
                         run_opts,
                         background_process_handler=background_process_handler)

    if (iter > 0 and (iter <= (num_hidden_layers - 1) * add_layers_period)
            and iter % add_layers_period == 0):

        # if we've just added a new hidden layer, don't do averaging but take
        # the best.
        do_average = False

        cur_num_hidden_layers = 1 + iter // add_layers_period
        config_file = "{0}/configs/layer{1}.config".format(
            dir, cur_num_hidden_layers)
        raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={lr} "
                            "{dir}/{iter}.mdl - | nnet3-init --srand={srand} "
                            "- {config} - |".format(lr=learning_rate,
                                                    dir=dir,
                                                    iter=iter,
                                                    srand=iter + srand,
                                                    config=config_file))
        cache_io_opts = ""
    else:
        do_average = True
        if iter == 0:
            # on iteration 0, pick the best, don't average.
            do_average = False
        raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                            "{1}/{2}.mdl - |".format(learning_rate, dir, iter))
        cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir=dir,
                                                                 iter=iter)

    if do_average:
        cur_num_chunk_per_minibatch = num_chunk_per_minibatch
        cur_max_param_change = max_param_change
    else:
        # on iteration zero or when we just added a layer, use a smaller
        # minibatch size (and we will later choose the output of just one of
        # the jobs): the model-averaging isn't always helpful when the model is
        # changing too fast (i.e. it can worsen the objective function), and
        # the smaller minibatch size will help to keep the update stable.
        cur_num_chunk_per_minibatch = num_chunk_per_minibatch // 2
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    train_new_models(dir=dir,
                     iter=iter,
                     srand=srand,
                     num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string,
                     egs_dir=egs_dir,
                     left_context=left_context,
                     right_context=right_context,
                     apply_deriv_weights=apply_deriv_weights,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time=max_deriv_time,
                     l2_regularize=l2_regularize,
                     xent_regularize=xent_regularize,
                     leaky_hmm_coefficient=leaky_hmm_coefficient,
                     momentum=momentum,
                     max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     num_chunk_per_minibatch=cur_num_chunk_per_minibatch,
                     frame_subsampling_factor=frame_subsampling_factor,
                     truncate_deriv_weights=truncate_deriv_weights,
                     cache_io_opts=cache_io_opts,
                     run_opts=run_opts)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir,
            iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            shrink=shrinkage_value)

    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(dir=dir,
                                             iter=iter,
                                             best_model_index=best_model,
                                             run_opts=run_opts,
                                             shrink=shrinkage_value)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        raise Exception("Error while trying to delete the raw models")

    new_model = "{0}/{1}.mdl".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))