Example #1
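 # Fragment of NNAcousticModel.__init__ (shown in full in Example #6): load the stream layout,
 # build the HTS label expander and MLPG parameter generator, find the silence question indices,
 # and cache per-stream output standard deviations for later MLPG smoothing.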
 def __init__(self, model_dir, question_file_name, silence_pattern='/2:sil/'):  ## TODO: where to handle silence pattern? Currently fragile
     super(NNAcousticModel, self).__init__(model_dir)
     self.load_stream_info()
     self.label_expander = HTSLabelNormalisation(question_file_name=question_file_name)
     self.param_generator = MLParameterGenerationFast() # ParameterGeneration()
     self.silent_feature_indices = self.get_silent_feature_indices(question_file_name, silence_pattern) 
     
     std = self.output_std
     m = numpy.shape(std)
     
     std = std.reshape((1,self.outdim))
     
     self.stream_std = self.split_into_streams(std)
Example #2
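# Rebuild each model's pickle file name from its config and load it; the assert below expects
# exactly two models (one per config in cfg_list).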
def load_nnets_models(cfg_list):

    nnets_model_list = []
    for i, cfg in enumerate(cfg_list):  #load two nnets_models into memory
        model_dir = os.path.join(cfg.work_dir, 'nnets_model')
        hidden_layer_size = cfg.hyper_params['hidden_layer_size']
        combined_model_arch = str(len(hidden_layer_size))
        for hid_size in hidden_layer_size:
            combined_model_arch += '_' + str(hid_size)
        label_normaliser = HTSLabelNormalisation(
            question_file_name=cfg.question_file_name,
            add_frame_features=cfg.add_frame_features,
            subphone_feats=cfg.subphone_feats)
        add_feat_dim = sum(cfg.additional_features.values())
        lab_dim = label_normaliser.dimension + add_feat_dim + cfg.appended_input_dim

        nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.%f.rnn.model' \
             %(model_dir, cfg.combined_model_name, cfg.combined_feature_name, int(cfg.multistream_switch),
            combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number, cfg.hyper_params['learning_rate'])

        with open(nnets_file_name, 'rb') as fid:
            nnets_model = cPickle.load(fid)
        print(nnets_file_name)
        print("__________________________")
        nnets_model_list.append(nnets_model)
    assert (len(nnets_model_list) == 2)
    return nnets_model_list
Example #3
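# Locate a trained DNN by reconstructing its file name from the config (run_dnn naming scheme),
# optionally overridden by model_pickle_file, then hand it to store_network for export.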
def main_function(cfg, outdir, model_pickle_file=None):

    hidden_layer_size = cfg.hyper_params['hidden_layer_size']
    data_dir = cfg.data_dir
    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    #     norm_info_file = os.path.join(data_dir, 'norm_info' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(
            question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim
        print('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    elif cfg.label_style == 'HTS_duration':
        label_normaliser = HTSDurationLabelNormalisation(
            question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension  ## + cfg.appended_input_dim
        print('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix = 'composed'

    combined_model_arch = str(len(hidden_layer_size))
    for hid_size in hidden_layer_size:
        combined_model_arch += '_' + str(hid_size)

    ## if made with run_lstm:--
    '''
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.%f.rnn.model' \
                      %(model_dir, cfg.combined_model_name, cfg.combined_feature_name, int(cfg.multistream_switch),
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number, cfg.hyper_params['learning_rate'])
    '''

    ## if made with run_dnn:--
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.model' \
                      %(model_dir, cfg.model_type, cfg.combined_feature_name, int(cfg.multistream_switch),
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number)

    ## override the name computed from config variables if model_pickle_file specified:
    if model_pickle_file is not None:
        nnets_file_name = model_pickle_file

    print('store DNN')

    store_network(nnets_file_name, outdir)
Example #4
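# Same idea as Example #3, but for models saved with the run_merlin naming scheme (learning rate
# and '.rnn.model' suffix in the file name); store_network here also takes the label dimension.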
def main_function(cfg, outdir, model_pickle_file=None):    
    
    hidden_layer_size = cfg.hyper_params['hidden_layer_size']
    data_dir = cfg.data_dir
    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
#     norm_info_file = os.path.join(data_dir, 'norm_info' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')
	
    ### normalise input full context label
    assert cfg.label_style == 'HTS'
    #
    label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name, add_frame_features=cfg.add_frame_features, subphone_feats=cfg.subphone_feats)
    lab_dim = label_normaliser.dimension + cfg.appended_input_dim
    print('Input label dimension is %d' % lab_dim)
    suffix=str(lab_dim)


    combined_model_arch = str(len(hidden_layer_size))
    for hid_size in hidden_layer_size:
        combined_model_arch += '_' + str(hid_size)
    
    ## if made with run_lstm:--
    '''
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.%f.rnn.model' \
                      %(model_dir, cfg.combined_model_name, cfg.combined_feature_name, int(cfg.multistream_switch), 
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number, cfg.hyper_params['learning_rate'])
        
    ## if made with run_dnn:--
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.model' \
                      %(model_dir, cfg.model_type, cfg.combined_feature_name, int(cfg.multistream_switch), 
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number)
    '''

    ## if made with run_merlin:--
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.%f.rnn.model' \
                      %(model_dir, cfg.combined_model_name, cfg.combined_feature_name, int(cfg.multistream_switch), 
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number, cfg.hyper_params['learning_rate'])


    ## override the name computed from config variables if model_pickle_file specified:
    if model_pickle_file is not None:
        nnets_file_name = model_pickle_file

    print('store DNN')


    store_network(nnets_file_name, lab_dim, outdir)
Example #5
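# End-to-end training/synthesis recipe: normalise input labels, compose and normalise acoustic
# features, train the DNN, generate and denormalise parameters, run MLPG, synthesise waveforms,
# and compute MCD/BAP/F0/VUV distortion measures.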
def main_function(cfg):
    # get a logger for this main function
    logger = logging.getLogger("main")

    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")

    # later, we might do this via a handler that is created, attached and configured
    # using the standard config mechanism of the logging module
    # but for now we need to do it manually
    plotlogger.set_plot_path(cfg.plot_dir)

    #### parameter setting########
    hidden_layers_sizes = cfg.hyper_params['hidden_layer_size']

    ####prepare environment

    try:
        file_id_list = read_file_list(cfg.file_id_scp)
        logger.debug('Loaded file id list from %s' % cfg.file_id_scp)
    except IOError:
        # this means that open(...) threw an error
        logger.critical('Could not load file id list from %s' % cfg.file_id_scp)
        raise

    ###total file number including training, development, and testing
    total_file_number = len(file_id_list)

    data_dir = cfg.data_dir

    nn_cmp_dir = os.path.join(data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    nn_cmp_nosil_dir = os.path.join(data_dir, 'nn_nosil' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    nn_cmp_norm_dir = os.path.join(data_dir, 'nn_norm' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))

    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    gen_dir = os.path.join(cfg.work_dir, 'gen')

    in_file_list_dict = {}

    for feature_name in list(cfg.in_dir_dict.keys()):
        in_file_list_dict[feature_name] = prepare_file_path_list(file_id_list, cfg.in_dir_dict[feature_name],
                                                                 cfg.file_extension_dict[feature_name], False)

    nn_cmp_file_list = prepare_file_path_list(file_id_list, nn_cmp_dir, cfg.cmp_ext)
    nn_cmp_nosil_file_list = prepare_file_path_list(file_id_list, nn_cmp_nosil_dir, cfg.cmp_ext)
    nn_cmp_norm_file_list = prepare_file_path_list(file_id_list, nn_cmp_norm_dir, cfg.cmp_ext)

    ###normalisation information
    norm_info_file = os.path.join(data_dir, 'norm_info' + cfg.combined_feature_name + '_' + str(
        cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label

    # currently supporting two different forms of linguistic features
    # later, we should generalise this

    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim
        logger.info('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    elif cfg.label_style == 'HTS_duration':
        label_normaliser = HTSDurationLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension  ## + cfg.appended_input_dim
        logger.info('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix = 'composed'

    if cfg.process_labels_in_work_dir:
        label_data_dir = cfg.work_dir
    else:
        label_data_dir = data_dir

    # the number can be removed
    binary_label_dir = os.path.join(label_data_dir, 'binary_label_' + suffix)
    nn_label_dir = os.path.join(label_data_dir, 'nn_no_silence_lab_' + suffix)
    nn_label_norm_dir = os.path.join(label_data_dir, 'nn_no_silence_lab_norm_' + suffix)
    #    nn_label_norm_mvn_dir = os.path.join(data_dir, 'nn_no_silence_lab_norm_'+suffix)

    in_label_align_file_list = prepare_file_path_list(file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)
    binary_label_file_list = prepare_file_path_list(file_id_list, binary_label_dir, cfg.lab_ext)
    nn_label_file_list = prepare_file_path_list(file_id_list, nn_label_dir, cfg.lab_ext)
    nn_label_norm_file_list = prepare_file_path_list(file_id_list, nn_label_norm_dir, cfg.lab_ext)

    # to do - sanity check the label dimension here?



    min_max_normaliser = None
    label_norm_file = 'label_norm_%s.dat' % (cfg.label_style)
    label_norm_file = os.path.join(label_data_dir, label_norm_file)

    normaliser = math_statis.Statis(feature_dimension=lab_dim, read_func=file_util.load_binary_file_frame,
                                    writer_func=file_util.array_to_binary_file, min_value=0.01, max_value=0.99)
    if cfg.NORMLAB and (cfg.label_style in ['HTS', 'HTS_duration']):
        # simple HTS labels
        logger.info('preparing label data (input) using standard HTS style labels')
        label_normaliser.perform_normalisation(in_label_align_file_list, binary_label_file_list)

        if cfg.label_style == 'HTS':
            remover = SilenceRemover(n_cmp=lab_dim, silence_pattern=cfg.silence_pattern)
            remover.remove_silence(binary_label_file_list, in_label_align_file_list, nn_label_file_list)
        elif cfg.label_style == 'HTS_duration':
            ## don't remove silences for duration
            nn_label_file_list = binary_label_file_list
        ###use only training data to find min-max information, then apply on the whole dataset
        normaliser.find_min_max_values(nn_label_file_list[0:cfg.train_file_number])
        normaliser.normalise_data(nn_label_file_list, nn_label_norm_file_list)

    if cfg.NORMLAB and (cfg.label_style == 'composed'):
        # new flexible label preprocessor

        logger.info('preparing label data (input) using "composed" style labels')
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)

        logger.info('Loaded label configuration')
        # logger.info('%s' % label_composer.configuration.labels )

        lab_dim = label_composer.compute_label_dimension()
        logger.info('label dimension will be %d' % lab_dim)

        if cfg.precompile_xpaths:
            label_composer.precompile_xpaths()

        # there are now a set of parallel input label files (e.g, one set of HTS and another set of Ossian trees)
        # create all the lists of these, ready to pass to the label composer

        in_label_align_file_list = {}
        for label_style, label_style_required in label_composer.label_styles.items():
            if label_style_required:
                logger.info('labels of style %s are required - constructing file paths for them' % label_style)
                if label_style == 'xpath':
                    in_label_align_file_list['xpath'] = prepare_file_path_list(file_id_list, cfg.xpath_label_align_dir,
                                                                               cfg.utt_ext, False)
                elif label_style == 'hts':
                    in_label_align_file_list['hts'] = prepare_file_path_list(file_id_list, cfg.hts_label_align_dir,
                                                                             cfg.lab_ext, False)
                else:
                    logger.critical('unsupported label style %s specified in label configuration' % label_style)
                    raise Exception

            # now iterate through the files, one at a time, constructing the labels for them
            num_files = len(file_id_list)
            logger.info('the label styles required are %s' % label_composer.label_styles)

            for i in range(num_files):
                logger.info('making input label features for %4d of %4d' % (i + 1, num_files))

                # iterate through the required label styles and open each corresponding label file

                # a dictionary of file descriptors, pointing at the required files
                required_labels = {}

                for label_style, label_style_required in label_composer.label_styles.items():

                    # the files will be a parallel set of files for a single utterance
                    # e.g., the XML tree and an HTS label file
                    if label_style_required:
                        required_labels[label_style] = open(in_label_align_file_list[label_style][i], 'r')
                        logger.debug(' opening label file %s' % in_label_align_file_list[label_style][i])

                logger.debug('label styles with open files: %s' % required_labels)
                label_composer.make_labels(required_labels, out_file_name=binary_label_file_list[i],
                                           fill_missing_values=cfg.fill_missing_values,
                                           iterate_over_frames=cfg.iterate_over_frames)

                # now close all opened files
                for fd in required_labels.values():
                    fd.close()

        # silence removal
        if cfg.remove_silence_using_binary_labels:
            silence_feature = 0  ## use first feature in label -- hardcoded for now
            logger.info('Silence removal from label using silence feature: %s' % (
            label_composer.configuration.labels[silence_feature]))
            logger.info('Silence will be removed from CMP files in same way')
            ## Binary labels have 2 roles: both the thing trimmed and the instructions for trimming:
            trim_silence(binary_label_file_list, nn_label_file_list, lab_dim, \
                         binary_label_file_list, lab_dim, silence_feature, percent_to_keep=5)
        else:
            logger.info('No silence removal done')
            # start from the labels we have just produced, not trimmed versions
            nn_label_file_list = binary_label_file_list

        ###use only training data to find min-max information, then apply on the whole dataset
        normaliser.find_min_max_values(nn_label_file_list[0:cfg.train_file_number])
        normaliser.normalise_data(nn_label_file_list, nn_label_norm_file_list)

    if normaliser is not None:
        ### save label normalisation information for unseen testing labels
        label_min_vector = normaliser.min_vector
        label_max_vector = normaliser.max_vector
        label_norm_info = numpy.concatenate((label_min_vector, label_max_vector), axis=0)

        label_norm_info = numpy.array(label_norm_info, 'float32')
        fid = open(label_norm_file, 'wb')
        label_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' % (label_min_vector.size, label_norm_file))

    ### make output acoustic data
    if cfg.MAKECMP:
        logger.info('creating acoustic (output) features')
        delta_win = cfg.delta_win  # [-0.5, 0.0, 0.5]
        acc_win = cfg.acc_win  # [1.0, -2.0, 1.0]

        acoustic_worker = AcousticComposition(delta_win=delta_win, acc_win=acc_win)
        acoustic_worker.prepare_nn_data(in_file_list_dict, nn_cmp_file_list, cfg.in_dimension_dict,
                                        cfg.out_dimension_dict)

        if cfg.label_style == 'HTS':

            if cfg.remove_silence_using_binary_labels:
                ## do this to get lab_dim:
                label_composer = LabelComposer()
                label_composer.load_label_configuration(cfg.label_config_file)
                lab_dim = label_composer.compute_label_dimension()

                silence_feature = 0  ## use first feature in label -- hardcoded for now
                logger.info('Silence removal from CMP using binary label file')

                ## overwrite the untrimmed audio with the trimmed version:
                trim_silence(nn_cmp_file_list, nn_cmp_nosil_file_list, cfg.cmp_dim,
                             binary_label_file_list, lab_dim, silence_feature)

            else:  ## back off to previous method using HTS labels:
                remover = SilenceRemover(n_cmp=cfg.cmp_dim, silence_pattern=cfg.silence_pattern)
                remover.remove_silence(nn_cmp_file_list[0:cfg.train_file_number + cfg.valid_file_number],
                                       in_label_align_file_list[0:cfg.train_file_number + cfg.valid_file_number],
                                       nn_cmp_nosil_file_list[
                                       0:cfg.train_file_number + cfg.valid_file_number])  # save to itself


        elif cfg.label_style == 'HTS_duration':
            ## don't remove silences for duration
            nn_cmp_nosil_file_list = nn_cmp_file_list
            pass

    ### save acoustic normalisation information for normalising the features back
    var_dir = os.path.join(data_dir, 'var')
    if not os.path.exists(var_dir):
        os.makedirs(var_dir)

    var_file_dict = {}
    for feature_name in list(cfg.out_dimension_dict.keys()):
        var_file_dict[feature_name] = os.path.join(var_dir,
                                                   feature_name + '_' + str(cfg.out_dimension_dict[feature_name]))

    ### normalise output acoustic data
    if cfg.NORMCMP:
        logger.info('normalising acoustic (output) features using method %s' % cfg.output_feature_normalisation)
        cmp_norm_info = None
        normaliser = math_statis.Statis(feature_dimension=cfg.cmp_dim, read_func=file_util.load_binary_file_frame,
                                        writer_func=file_util.array_to_binary_file)
        if cfg.output_feature_normalisation == 'MVN':

            ###calculate mean and std vectors on the training data, and apply on the whole dataset
            global_mean_vector = normaliser.compute_mean(nn_cmp_nosil_file_list[0:cfg.train_file_number], 0,
                                                         cfg.cmp_dim)
            global_std_vector = normaliser.compute_std(nn_cmp_nosil_file_list[0:cfg.train_file_number],
                                                       global_mean_vector, 0, cfg.cmp_dim)

            normaliser.feature_normalisation(nn_cmp_nosil_file_list[0:cfg.train_file_number + cfg.valid_file_number],
                                             nn_cmp_norm_file_list[0:cfg.train_file_number + cfg.valid_file_number])
            cmp_norm_info = numpy.concatenate((global_mean_vector, global_std_vector), axis=0)

        elif cfg.output_feature_normalisation == 'MINMAX':

            global_mean_vector = normaliser.compute_mean(nn_cmp_nosil_file_list[0:cfg.train_file_number])
            global_std_vector = normaliser.compute_std(nn_cmp_nosil_file_list[0:cfg.train_file_number],
                                                               global_mean_vector)
            normaliser = math_statis.Statis(feature_dimension=cfg.cmp_dim, read_func=file_util.load_binary_file_frame,
                                            writer_func=file_util.array_to_binary_file, min_value=0.01, max_value=0.99)
            # min_max_normaliser = MinMaxNormalisation(feature_dimension=cfg.cmp_dim, min_value=0.01, max_value=0.99)
            normaliser.find_min_max_values(nn_cmp_nosil_file_list[0:cfg.train_file_number])
            normaliser.normalise_data(nn_cmp_nosil_file_list, nn_cmp_norm_file_list)

            cmp_min_vector = normaliser.min_vector
            cmp_max_vector = normaliser.max_vector
            cmp_norm_info = numpy.concatenate((cmp_min_vector, cmp_max_vector), axis=0)

        else:
            logger.critical('Normalisation type %s is not supported!\n' % (cfg.output_feature_normalisation))
            raise

        cmp_norm_info = numpy.array(cmp_norm_info, 'float32')
        fid = open(norm_info_file, 'wb')
        cmp_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' % (cfg.output_feature_normalisation, norm_info_file))
        # logger.debug(' value was\n%s' % cmp_norm_info)

        feature_index = 0
        for feature_name in list(cfg.out_dimension_dict.keys()):
            feature_std_vector = numpy.array(
                global_std_vector[:, feature_index:feature_index + cfg.out_dimension_dict[feature_name]], 'float32')

            fid = open(var_file_dict[feature_name], 'w')
            feature_std_vector.tofile(fid)
            fid.close()

            logger.info('saved %s variance vector to %s' % (feature_name, var_file_dict[feature_name]))
            # logger.debug(' value was\n%s' % feature_std_vector)

            feature_index += cfg.out_dimension_dict[feature_name]

    train_x_file_list = nn_label_norm_file_list[0:cfg.train_file_number]
    train_y_file_list = nn_cmp_norm_file_list[0:cfg.train_file_number]
    valid_x_file_list = nn_label_norm_file_list[cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number]
    valid_y_file_list = nn_cmp_norm_file_list[cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number]
    test_x_file_list = nn_label_norm_file_list[
                       cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
    test_y_file_list = nn_cmp_norm_file_list[
                       cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]

    # we need to know the label dimension before training the DNN
    # computing that requires us to look at the labels
    #
    # currently, there are two ways to do this
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim

    elif cfg.label_style == 'composed':
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)
        lab_dim = label_composer.compute_label_dimension()

    logger.info('label dimension is %d' % lab_dim)

    combined_model_arch = str(len(hidden_layers_sizes))
    for hid_size in hidden_layers_sizes:
        combined_model_arch += '_' + str(hid_size)

    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.model' \
                      % (model_dir, cfg.model_type, cfg.combined_feature_name, int(cfg.multistream_switch),
                         combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number)

    ### DNN model training
    if cfg.TRAINDNN:

        logger.info('training DNN')

        try:
            os.makedirs(model_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create model directory %s' % model_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        try:
            # print   'start DNN'
            train_DNN(train_xy_file_list=(train_x_file_list, train_y_file_list), \
                      valid_xy_file_list=(valid_x_file_list, valid_y_file_list), \
                      nnets_file_name=nnets_file_name, \
                      n_ins=lab_dim, n_outs=cfg.cmp_dim, ms_outs=cfg.multistream_outs, \
                      hyper_params=cfg.hyper_params, buffer_size=cfg.buffer_size, plot=cfg.plot)
        except KeyboardInterrupt:
            logger.critical('train_DNN interrupted via keyboard')
            # Could 'raise' the exception further, but that causes a deep traceback to be printed
            # which we don't care about for a keyboard interrupt. So, just bail out immediately
            sys.exit(1)
        except:
            logger.critical('train_DNN threw an exception')
            raise

    ### generate parameters from DNN
    temp_dir_name = '%s_%s_%d_%d_%d_%d_%d_%d' \
                    % (cfg.model_type, cfg.combined_feature_name, int(cfg.do_post_filtering), \
                       cfg.train_file_number, lab_dim, cfg.cmp_dim, \
                       len(hidden_layers_sizes), hidden_layers_sizes[0])
    gen_dir = os.path.join(gen_dir, temp_dir_name)

    gen_file_id_list = file_id_list[
                       cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
    test_x_file_list = nn_label_norm_file_list[
                       cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]

    if cfg.DNNGEN:
        logger.info('generating from DNN')

        try:
            os.makedirs(gen_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create generation directory %s' % gen_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        gen_file_list = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.cmp_ext)

        #        dnn_generation(valid_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)
        dnn_generation(test_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)

        logger.debug('denormalising generated output using method %s' % cfg.output_feature_normalisation)

        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        cmp_min_vector = cmp_min_max[0,]
        cmp_max_vector = cmp_min_max[1,]
        denormaliser = math_statis.Statis(feature_dimension=cfg.cmp_dim, read_func=file_util.load_binary_file_frame,
                                          writer_func=file_util.array_to_binary_file)
        if cfg.output_feature_normalisation == 'MVN':
            denormaliser.feature_denormalisation(gen_file_list, gen_file_list, cmp_min_vector, cmp_max_vector)

        elif cfg.output_feature_normalisation == 'MINMAX':
            denormaliser = math_statis.Statis(feature_dimension=cfg.cmp_dim, read_func=file_util.load_binary_file_frame,
                                              writer_func=file_util.array_to_binary_file,
                                              min_value=0.01, max_value=0.99, min_vector=cmp_min_vector,
                                              max_vector=cmp_max_vector)
            denormaliser.denormalise_data(gen_file_list, gen_file_list)
        else:
            logger.critical('denormalising method %s is not supported!\n' % (cfg.output_feature_normalisation))
            raise

        ## perform MLPG to smooth parameter trajectories
        ## if lf0 is included, the output features must have vuv.
        generator = ParameterGeneration(gen_wav_features=cfg.gen_wav_features)
        generator.acoustic_decomposition(gen_file_list, cfg.cmp_dim, cfg.out_dimension_dict, cfg.file_extension_dict,
                                         var_file_dict)

    ### generate wav
    if cfg.GENWAV:
        logger.info('reconstructing waveform(s)')
        generate_wav(gen_dir, gen_file_id_list, cfg)  # generated speech
    #       generate_wav(nn_cmp_dir, gen_file_id_list)  # reference copy synthesis speech

    ### evaluation: calculate distortion
    if cfg.CALMCD:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(data_dir, 'ref_data')

        ref_mgc_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.mgc_ext)
        ref_bap_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.bap_ext)
        ref_lf0_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.lf0_ext)

        in_gen_label_align_file_list = in_label_align_file_list[
                                       cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
        calculator = IndividualDistortionComp()

        spectral_distortion = 0.0
        bap_mse = 0.0
        f0_mse = 0.0
        vuv_error = 0.0

        valid_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number]
        test_file_id_list = file_id_list[
                            cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            ## get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim = label_composer.compute_label_dimension()

            ## use first feature in label -- hardcoded for now
            silence_feature = 0

            ## Use these to trim silence:
            untrimmed_test_labels = binary_label_file_list[
                                    cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]

        if 'mgc' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['mgc'][
                                           cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_mgc_list, cfg.mgc_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.mgc_dim, silence_pattern=cfg.silence_pattern)
                remover.remove_silence(in_file_list_dict['mgc'][
                                       cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number],
                                       in_gen_label_align_file_list, ref_mgc_list)
            valid_spectral_distortion = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir,
                                                                      cfg.mgc_ext, cfg.mgc_dim)
            test_spectral_distortion = calculator.compute_distortion(test_file_id_list, ref_data_dir, gen_dir,
                                                                     cfg.mgc_ext, cfg.mgc_dim)
            valid_spectral_distortion *= (10 / numpy.log(10)) * numpy.sqrt(2.0)  ##MCD
            test_spectral_distortion *= (10 / numpy.log(10)) * numpy.sqrt(2.0)  ##MCD

        if 'bap' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['bap'][
                                           cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_bap_list, cfg.bap_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.bap_dim, silence_pattern=cfg.silence_pattern)
                remover.remove_silence(in_file_list_dict['bap'][
                                       cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number],
                                       in_gen_label_align_file_list, ref_bap_list)
            valid_bap_mse = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.bap_ext,
                                                          cfg.bap_dim)
            test_bap_mse = calculator.compute_distortion(test_file_id_list, ref_data_dir, gen_dir, cfg.bap_ext,
                                                         cfg.bap_dim)
            valid_bap_mse = valid_bap_mse / 10.0
            test_bap_mse = test_bap_mse / 10.0

        if 'lf0' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['lf0'][
                                           cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_lf0_list, cfg.lf0_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.lf0_dim, silence_pattern=['*-#+*'])
                remover.remove_silence(in_file_list_dict['lf0'][
                                       cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number],
                                       in_gen_label_align_file_list, ref_lf0_list)
            valid_f0_mse, valid_f0_corr, valid_vuv_error = calculator.compute_distortion(valid_file_id_list,
                                                                                         ref_data_dir, gen_dir,
                                                                                         cfg.lf0_ext, cfg.lf0_dim)
            test_f0_mse, test_f0_corr, test_vuv_error = calculator.compute_distortion(test_file_id_list, ref_data_dir,
                                                                                      gen_dir, cfg.lf0_ext, cfg.lf0_dim)

        logger.info('Develop: DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
                    % (valid_spectral_distortion, valid_bap_mse, valid_f0_mse, valid_f0_corr, valid_vuv_error * 100.))
        logger.info('Test   : DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
                    % (test_spectral_distortion, test_bap_mse, test_f0_mse, test_f0_corr, test_vuv_error * 100.))
Example #6
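# Full acoustic-model wrapper around a trained network: expands HTS labels, predicts, splits the
# output into streams, applies MLPG and GV-style variance scaling, and can resynthesise via world_resynth.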
class NNAcousticModel(NN):
    ## add speech specific stuff, like splitting into streams and param gen
    def __init__(
        self,
        model_dir,
        question_file_name,
        silence_pattern='/2:sil/'
    ):  ## TODO: where to handle silence pattern? Currently fragile
        super(NNAcousticModel, self).__init__(model_dir)
        self.load_stream_info()
        self.label_expander = HTSLabelNormalisation(
            question_file_name=question_file_name)
        self.param_generator = MLParameterGenerationFast(
        )  # ParameterGeneration()
        self.silent_feature_indices = self.get_silent_feature_indices(
            question_file_name, silence_pattern)

        std = self.output_std
        m = numpy.shape(std)

        std = std.reshape((1, self.outdim))

        self.stream_std = self.split_into_streams(std)

    def get_silent_feature_indices(self, question_file_name, silence_pattern):
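        ## return the indices of (non-CQS) questions whose text contains the silence pattern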
        print('get_silent_feature_indices')
        indices = []
        questions = [q for q in readlist(question_file_name) if q != '']
        questions = [q for q in questions if 'CQS' not in q]
        for (i, question) in enumerate(questions):
            if silence_pattern in question:
                indices.append(i)
                print('silence question found:')
                print(question)
        return indices

    def load_stream_info(self):
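        ## stream_info.txt holds 4 space-separated lines: input stream names, input dims,
        ## output stream names, output dims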
        stream_info_fname = os.path.join(self.model_dir, 'stream_info.txt')
        assert os.path.isfile(stream_info_fname)
        stream_data = readlist(stream_info_fname)
        stream_data = [line.split(' ') for line in stream_data]
        assert len(stream_data) == 4
        (self.instreams, indims, self.outstreams, outdims) = stream_data
        indims = [int(val) for val in indims]
        outdims = [int(val) for val in outdims]

        ## note that indims are not network input, but input to acoustic preprocessing of data!
        assert self.outdim == sum(outdims)
        self.indims = dict(zip(self.instreams, indims))
        self.outdims = dict(zip(self.outstreams, outdims))

    ## FOR DEBUGGING:-
    def generate_from_norm_binary_lab(self,
                                      bin_label_file,
                                      labdim,
                                      outwave,
                                      enforce_silence=False,
                                      mlpg=True,
                                      vuv_thresh=0.5,
                                      fzero_scale=1.0):

        input = get_speech(bin_label_file, labdim)

        #input = input[:500,:]
        output = self.predict(input, input_normalisation=True)
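        ## DEBUG: dump the un-denormalised network output to a hard-coded path and stop;
        ## everything below is unreachable while this sys.exit() remains in place.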

        put_speech(
            output,
            '/afs/inf.ed.ac.uk/user/o/owatts/temp/cpu_gen/undenorm_66_015_from_norm_lab.cmp'
        )
        sys.exit('vliadnviadnvdvn stoped early')

        streams = self.split_into_streams(output)

        if mlpg:
            mlpged = {}
            for (stream, data) in streams.items():
                if stream in self.indims:
                    mlpg_data = self.param_generator.generation(
                        data, self.stream_std[stream], self.indims[stream])
                else:
                    mlpg_data = data
                mlpged[stream] = mlpg_data
            streams = mlpged

        else:
            # take statics only!
            statics = {}
            for (stream, data) in streams.items():
                if stream in self.indims:
                    statics[stream] = data[:, :self.indims[stream]]
                else:  ## for e.g. vuv
                    statics[stream] = data
            streams = statics

        if enforce_silence:
            for (stream, data) in streams.items():
                print(input[:, self.silent_feature_indices])
                sys.exit('ntfbdfbsfrbsfbr')
                silent_frames = numpy.sum(input[:,
                                                self.silent_feature_indices],
                                          axis=1)
                data[silent_frames == 1.0, :] = 0.0
                streams[stream] = data

        if 'lf0' in streams:
            fzero = numpy.exp(streams['lf0'])

            if 'vuv' in streams:
                vuv = streams['vuv']
                lf0 = streams['lf0']
                fzero[vuv <= vuv_thresh] = 0.0

            fzero *= fzero_scale

            streams['lf0'] = fzero

        self.world_resynth(streams, outwave)



    def generate(self, htk_label_file, enforce_silence=True, mlpg=True, fill_unvoiced_gaps=0, \
                                 variance_expansion=1.0, vuv_thresh=0.5, fzero_scale=1.0):

        input = self.label_expander.load_labels_with_state_alignment(
            htk_label_file)
        output = self.predict(input)
        streams = self.split_into_streams(output)
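        ## with mlpg=True, smooth each stream that has a known static dimension; otherwise keep only statics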

        if mlpg:
            mlpged = {}
            for (stream, data) in streams.items():
                if stream in self.indims:
                    mlpg_data = self.param_generator.generation(
                        data, self.stream_std[stream], self.indims[stream])
                else:
                    mlpg_data = data
                mlpged[stream] = mlpg_data
            streams = mlpged

        else:
            # take statics only!
            statics = {}
            for (stream, data) in streams.items():
                if stream in self.indims:
                    statics[stream] = data[:, :self.indims[stream]]
                else:  ## for e.g. vuv
                    statics[stream] = data
            streams = statics

        ## TODO: handle F0 separately
        if variance_expansion > 0.0:
            new_streams = {}
            for (stream, data) in streams.items():
                new_streams[stream] = self.simple_scale_variance_wrapper_p0(
                    streams[stream], stream)
            streams = new_streams

        # impose 0 ceiling on baps, else we get artifacts:-
        # (I think this was the problem I was trying to fix by not scaling f0 and energy previously)
        streams['bap'] = np.minimum(streams['bap'],
                                    np.zeros(np.shape(streams['bap'])))

        #         if fill_unvoiced_gaps > 0:
        #             vuv = streams['vuv']
        #             ## turn from soft to binary:
        #             binary_vuv = np.zeros(np.shape(vuv))
        #             binary_vuv[vuv > vuv_thresh] = 1
        #             vuv = binary_vuv
        #             gaplength = fill_unvoiced_gaps
        #             vuv = fill_short_unvoiced_gaps(vuv, gaplength)
        #             print vuv
        #             streams['vuv'] = vuv
        #

        if enforce_silence:
            for (stream, data) in streams.items():
                silent_frames = numpy.sum(input[:,
                                                self.silent_feature_indices],
                                          axis=1)
                data[silent_frames == 1.0, :] = 0.0
                streams[stream] = data

        if 'lf0' in streams:
            fzero = numpy.exp(streams['lf0'])

            if 'vuv' in streams:
                vuv = streams['vuv']
                lf0 = streams['lf0']
                fzero[vuv <= vuv_thresh] = 0.0

            fzero *= fzero_scale

            streams['lf0'] = fzero

        #self.world_resynth(streams, outwave)
        return streams

    def split_into_streams(self, input):
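        ## slice the (nframe x outdim) output matrix into per-stream blocks, in self.outstreams order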
        nframe, ndim = numpy.shape(input)
        assert ndim == self.outdim, (ndim, self.outdim)

        start = 0
        outputs = {}
        for stream in self.outstreams:
            end = start + self.outdims[stream]
            print(stream)
            outputs[stream] = input[:, start:end]
            start = end

        return outputs

    #def enforce_silence(streams):


#    def expand_label():
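    ## the simple_scale_variance_wrapper_* variants below apply simple_scale_variance with different
    ## GV weights per stream (cepstra vs. energy, lf0, bap); generate() uses the _p0 variant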

    def simple_scale_variance_wrapper_0(self, speech, stream):

        return speech

    def simple_scale_variance_wrapper_p0(self, speech, stream):

        if stream == 'mgc':
            cep_speech = self.simple_scale_variance(speech,
                                                    stream,
                                                    gv_weight=1.0)
            ene_speech = self.simple_scale_variance(speech,
                                                    stream,
                                                    gv_weight=0.0)
            scaled_speech = np.hstack([ene_speech[:, :1], cep_speech[:, 1:]])
        else:
            scaled_speech = speech
        return scaled_speech

    def simple_scale_variance_wrapper_p2(self, speech, stream):

        if stream == 'mgc':
            cep_speech = self.simple_scale_variance(speech,
                                                    stream,
                                                    gv_weight=1.0)
            ene_speech = self.simple_scale_variance(speech,
                                                    stream,
                                                    gv_weight=0.2)
            scaled_speech = np.hstack([ene_speech[:, :1], cep_speech[:, 1:]])
        elif stream == 'lf0':
            scaled_speech = self.simple_scale_variance(speech,
                                                       stream,
                                                       gv_weight=0.2)
        else:
            scaled_speech = speech
        return scaled_speech

    def simple_scale_variance_wrapper_p5(self, speech, stream):

        if stream == 'mgc':
            cep_speech = self.simple_scale_variance(speech,
                                                    stream,
                                                    gv_weight=1.0)
            ene_speech = self.simple_scale_variance(speech,
                                                    stream,
                                                    gv_weight=0.5)
            scaled_speech = np.hstack([ene_speech[:, :1], cep_speech[:, 1:]])
        elif stream == 'lf0':
            scaled_speech = self.simple_scale_variance(speech,
                                                       stream,
                                                       gv_weight=0.5)
        else:
            scaled_speech = speech
        return scaled_speech

    def simple_scale_variance_wrapper_1(self, speech, stream):

        if stream == 'mgc':
            cep_speech = self.simple_scale_variance(speech,
                                                    stream,
                                                    gv_weight=1.0)
            ene_speech = self.simple_scale_variance(speech,
                                                    stream,
                                                    gv_weight=1.0)
            scaled_speech = np.hstack([ene_speech[:, :1], cep_speech[:, 1:]])
        elif stream == 'lf0':
            scaled_speech = self.simple_scale_variance(speech,
                                                       stream,
                                                       gv_weight=1.0)
        else:
            scaled_speech = speech
        return scaled_speech

    def simple_scale_variance_wrapper_m2(self, speech, stream):

        if stream == 'mgc':
            cep_speech = self.simple_scale_variance(speech,
                                                    stream,
                                                    gv_weight=1.0)
            ene_speech = self.simple_scale_variance(speech,
                                                    stream,
                                                    gv_weight=0.0)
            scaled_speech = np.hstack([ene_speech[:, :1], cep_speech[:, 1:]])
        elif stream == 'lf0':
            scaled_speech = self.simple_scale_variance(speech,
                                                       stream,
                                                       gv_weight=0.2)
        elif stream == 'bap':
            scaled_speech = self.simple_scale_variance(speech,
                                                       stream,
                                                       gv_weight=1.0)

        else:
            scaled_speech = speech
        return scaled_speech

    def simple_scale_variance_wrapper_n2(self, speech, stream):

        if stream == 'mgc':
            cep_speech = self.simple_scale_variance(speech,
                                                    stream,
                                                    gv_weight=1.0)
            ene_speech = self.simple_scale_variance(speech,
                                                    stream,
                                                    gv_weight=0.2)
            scaled_speech = np.hstack([ene_speech[:, :1], cep_speech[:, 1:]])
        else:
            scaled_speech = speech
        return scaled_speech

    def simple_scale_variance_wrapper_nfull(self, speech, stream):

        if stream == 'mgc':
            scaled_speech = self.simple_scale_variance(speech,
                                                       stream,
                                                       gv_weight=1.0)
        else:
            scaled_speech = speech
        return scaled_speech

    def simple_scale_variance(self, speech, stream, gv_weight=1.0):
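        ## GV-style scaling: blend the utterance std with the global static std by gv_weight,
        ## then rescale each dimension around the utterance mean by the resulting ratio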

        stream_std = self.stream_std[stream][0, :]
        static_std = stream_std[:self.indims[stream]]

        assert gv_weight <= 1.0 and gv_weight >= 0.0
        local_weight = 1.0 - gv_weight

        utt_mean = numpy.mean(speech, axis=0)
        utt_std = numpy.std(speech, axis=0)

        global_std = numpy.transpose(static_std)
        weighted_global_std = (gv_weight * global_std) + (local_weight *
                                                          utt_std)
        std_ratio = weighted_global_std / utt_std

        nframes, ndim = numpy.shape(speech)
        utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
        std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))

        scaled_speech = (
            (speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix

        return scaled_speech
Example #7
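# Newer variant of the same recipe: paths come from a FilePaths helper, extra label features can be
# merged in, and GenTestList mode reuses saved normalisation statistics at synthesis time.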
def main_function(cfg):
    file_paths = FilePaths(cfg)

    # get a logger for this main function
    logger = logging.getLogger("main")

    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")

    # later, we might do this via a handler that is created, attached and configured
    # using the standard config mechanism of the logging module
    # but for now we need to do it manually
    plotlogger.set_plot_path(cfg.plot_dir)

    # create plot dir if set to True
    if not os.path.exists(cfg.plot_dir) and cfg.plot:
        os.makedirs(cfg.plot_dir)

    #### parameter setting########
    hidden_layer_size = cfg.hyper_params['hidden_layer_size']

    ####prepare environment
    try:
        file_id_list = read_file_list(cfg.file_id_scp)
        logger.debug('Loaded file id list from %s' % cfg.file_id_scp)
    except IOError:
        # this means that open(...) threw an error
        logger.critical('Could not load file id list from %s' % cfg.file_id_scp)
        raise

    ###total file number including training, development, and testing
    total_file_number = len(file_id_list)
    assert cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number == total_file_number, 'check train, valid, test file number'

    data_dir = cfg.data_dir

    inter_data_dir = cfg.inter_data_dir
    nn_cmp_dir       = file_paths.nn_cmp_dir
    nn_cmp_norm_dir   = file_paths.nn_cmp_norm_dir
    model_dir = file_paths.model_dir
    gen_dir   = file_paths.gen_dir

    in_file_list_dict = {}

    for feature_name in list(cfg.in_dir_dict.keys()):
        in_file_list_dict[feature_name] = prepare_file_path_list(file_id_list, cfg.in_dir_dict[feature_name], cfg.file_extension_dict[feature_name], False)

    nn_cmp_file_list         = file_paths.get_nn_cmp_file_list()
    nn_cmp_norm_file_list    = file_paths.get_nn_cmp_norm_file_list()

    ###normalisation information
    norm_info_file = file_paths.norm_info_file

    ### normalise input full context label
    # currently supporting two different forms of linguistic features
    # later, we should generalise this

    assert cfg.label_style == 'HTS', 'Only HTS-style labels are now supported as input to Merlin'

    label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name, add_frame_features=cfg.add_frame_features, subphone_feats=cfg.subphone_feats)
    add_feat_dim = sum(cfg.additional_features.values())
    lab_dim = label_normaliser.dimension + add_feat_dim + cfg.appended_input_dim
    if cfg.VoiceConversion:
        lab_dim = cfg.cmp_dim
    logger.info('Input label dimension is %d' % lab_dim)
    suffix=str(lab_dim)


    if cfg.process_labels_in_work_dir:
        inter_data_dir = cfg.work_dir

    # the number can be removed
    file_paths.set_label_dir(label_normaliser.dimension, suffix, lab_dim)
    file_paths.set_label_file_list()

    binary_label_dir      = file_paths.binary_label_dir
    nn_label_dir          = file_paths.nn_label_dir
    nn_label_norm_dir     = file_paths.nn_label_norm_dir

    in_label_align_file_list = file_paths.in_label_align_file_list
    binary_label_file_list   = file_paths.binary_label_file_list
    nn_label_file_list       = file_paths.nn_label_file_list
    nn_label_norm_file_list  = file_paths.nn_label_norm_file_list

    min_max_normaliser = None

    label_norm_file = file_paths.label_norm_file

    test_id_list = file_paths.test_id_list

    if cfg.NORMLAB:
        # simple HTS labels
        logger.info('preparing label data (input) using standard HTS style labels')
        label_normaliser.perform_normalisation(in_label_align_file_list, binary_label_file_list, label_type=cfg.label_type)

        if cfg.additional_features:
            out_feat_file_list = file_paths.out_feat_file_list
            in_dim = label_normaliser.dimension

            for new_feature, new_feature_dim in cfg.additional_features.items():
                new_feat_dir  = os.path.join(data_dir, new_feature)
                new_feat_file_list = prepare_file_path_list(file_id_list, new_feat_dir, '.'+new_feature)

                merger = MergeFeat(lab_dim = in_dim, feat_dim = new_feature_dim)
                merger.merge_data(binary_label_file_list, new_feat_file_list, out_feat_file_list)
                in_dim += new_feature_dim

                binary_label_file_list = out_feat_file_list

        remover = SilenceRemover(n_cmp = lab_dim, silence_pattern = cfg.silence_pattern, label_type=cfg.label_type, remove_frame_features = cfg.add_frame_features, subphone_feats = cfg.subphone_feats)
        remover.remove_silence(binary_label_file_list, in_label_align_file_list, nn_label_file_list)

        min_max_normaliser = MinMaxNormalisation(feature_dimension = lab_dim, min_value = 0.01, max_value = 0.99)

        ###use only training data to find min-max information, then apply on the whole dataset
        if cfg.GenTestList:
            min_max_normaliser.load_min_max_values(label_norm_file)
        else:
            min_max_normaliser.find_min_max_values(nn_label_file_list[0:cfg.train_file_number])

        ### enforce silence such that the normalization runs without removing silence: only for final synthesis
        if cfg.GenTestList and cfg.enforce_silence:
            min_max_normaliser.normalise_data(binary_label_file_list, nn_label_norm_file_list)
        else:
            min_max_normaliser.normalise_data(nn_label_file_list, nn_label_norm_file_list)



    if min_max_normaliser is not None and not cfg.GenTestList:
        ### save label normalisation information for unseen testing labels
        label_min_vector = min_max_normaliser.min_vector
        label_max_vector = min_max_normaliser.max_vector
        label_norm_info = numpy.concatenate((label_min_vector, label_max_vector), axis=0)

        label_norm_info = numpy.array(label_norm_info, 'float32')
        fid = open(label_norm_file, 'wb')
        label_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' %(label_min_vector.size, label_norm_file))

    ### make output duration data
    if cfg.MAKEDUR:
        logger.info('creating duration (output) features')
        label_normaliser.prepare_dur_data(in_label_align_file_list, file_paths.dur_file_list, cfg.label_type, cfg.dur_feature_type)

    ### make output acoustic data
    if cfg.MAKECMP:
        logger.info('creating acoustic (output) features')
        delta_win = cfg.delta_win #[-0.5, 0.0, 0.5]
        acc_win = cfg.acc_win     #[1.0, -2.0, 1.0]

        if cfg.GenTestList:
            for feature_name in list(cfg.in_dir_dict.keys()):
                in_file_list_dict[feature_name] = prepare_file_path_list(test_id_list, cfg.in_dir_dict[feature_name], cfg.file_extension_dict[feature_name], False)
            nn_cmp_file_list      = prepare_file_path_list(test_id_list, nn_cmp_dir, cfg.cmp_ext)
            nn_cmp_norm_file_list = prepare_file_path_list(test_id_list, nn_cmp_norm_dir, cfg.cmp_ext)
        
        acoustic_worker = AcousticComposition(delta_win = delta_win, acc_win = acc_win)

        if 'dur' in list(cfg.in_dir_dict.keys()) and cfg.AcousticModel:
            lf0_file_list = file_paths.get_lf0_file_list()
            acoustic_worker.make_equal_frames(dur_file_list, lf0_file_list, cfg.in_dimension_dict)

        acoustic_worker.prepare_nn_data(in_file_list_dict, nn_cmp_file_list, cfg.in_dimension_dict, cfg.out_dimension_dict)
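        # AcousticComposition stacks each acoustic stream with its delta and delta-delta
        # features (computed with delta_win and acc_win) into one cmp vector per frame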

        if cfg.remove_silence_using_binary_labels:
            ## do this to get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim=label_composer.compute_label_dimension()

            silence_feature = 0 ## use first feature in label -- hardcoded for now
            logger.info('Silence removal from CMP using binary label file')

            ## overwrite the untrimmed audio with the trimmed version:
            trim_silence(nn_cmp_file_list, nn_cmp_file_list, cfg.cmp_dim,
                                binary_label_file_list, lab_dim, silence_feature)

        elif cfg.remove_silence_using_hts_labels: 
            ## back off to previous method using HTS labels:
            remover = SilenceRemover(n_cmp = cfg.cmp_dim, silence_pattern = cfg.silence_pattern, label_type=cfg.label_type, remove_frame_features = cfg.add_frame_features, subphone_feats = cfg.subphone_feats)
            remover.remove_silence(nn_cmp_file_list, in_label_align_file_list, nn_cmp_file_list) # save to itself

    ### save acoustic normalisation information for normalising the features back
    var_dir  = file_paths.var_dir
    var_file_dict = file_paths.get_var_dic()

    ### normalise output acoustic data
    if cfg.NORMCMP:
        logger.info('normalising acoustic (output) features using method %s' % cfg.output_feature_normalisation)
        cmp_norm_info = None
        if cfg.output_feature_normalisation == 'MVN':
            normaliser = MeanVarianceNorm(feature_dimension=cfg.cmp_dim)
            if cfg.GenTestList:
                # load mean std values
                global_mean_vector, global_std_vector = normaliser.load_mean_std_values(norm_info_file)
            else:
                ### calculate mean and std vectors on the training data, then apply them to the whole dataset
                global_mean_vector = normaliser.compute_mean(nn_cmp_file_list[0:cfg.train_file_number], 0, cfg.cmp_dim)
                global_std_vector = normaliser.compute_std(nn_cmp_file_list[0:cfg.train_file_number], global_mean_vector, 0, cfg.cmp_dim)
            normaliser.feature_normalisation(nn_cmp_file_list, nn_cmp_norm_file_list)
            cmp_norm_info = numpy.concatenate((global_mean_vector, global_std_vector), axis=0)

        elif cfg.output_feature_normalisation == 'MINMAX':
            min_max_normaliser = MinMaxNormalisation(feature_dimension = cfg.cmp_dim, min_value = 0.01, max_value = 0.99)
            if cfg.GenTestList:
                min_max_normaliser.load_min_max_values(norm_info_file)
            else:
                min_max_normaliser.find_min_max_values(nn_cmp_file_list[0:cfg.train_file_number])
            min_max_normaliser.normalise_data(nn_cmp_file_list, nn_cmp_norm_file_list)

            cmp_min_vector = min_max_normaliser.min_vector
            cmp_max_vector = min_max_normaliser.max_vector
            cmp_norm_info = numpy.concatenate((cmp_min_vector, cmp_max_vector), axis=0)

        else:
            logger.critical('Normalisation type %s is not supported!\n' %(cfg.output_feature_normalisation))
            raise

        if not cfg.GenTestList:
            cmp_norm_info = numpy.array(cmp_norm_info, 'float32')
            fid = open(norm_info_file, 'wb')
            cmp_norm_info.tofile(fid)
            fid.close()
            logger.info('saved %s vectors to %s' %(cfg.output_feature_normalisation, norm_info_file))

            feature_index = 0
            for feature_name in list(cfg.out_dimension_dict.keys()):
                feature_std_vector = numpy.array(global_std_vector[:,feature_index:feature_index+cfg.out_dimension_dict[feature_name]], 'float32')

                fid = open(var_file_dict[feature_name], 'w')
                feature_var_vector = feature_std_vector**2
                feature_var_vector.tofile(fid)
                fid.close()

                logger.info('saved %s variance vector to %s' %(feature_name, var_file_dict[feature_name]))

                feature_index += cfg.out_dimension_dict[feature_name]
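            # norm_info_file now holds the mean/std (MVN) or min/max (MINMAX) vectors used later
            # for denormalisation; the per-stream var files hold std**2 and are read back by
            # load_covariance before training and used for MLPG at generation time. Note that
            # this variance-saving loop relies on global_std_vector, which is only defined in
            # the MVN branch above.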

    train_x_file_list, train_y_file_list = file_paths.get_train_list_x_y()
    valid_x_file_list, valid_y_file_list = file_paths.get_valid_list_x_y()
    test_x_file_list, test_y_file_list = file_paths.get_test_list_x_y()

    # we need to know the label dimension before training the DNN
    # computing that requires us to look at the labels
    #
    label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name, add_frame_features=cfg.add_frame_features, subphone_feats=cfg.subphone_feats)
    add_feat_dim = sum(cfg.additional_features.values())
    lab_dim = label_normaliser.dimension + add_feat_dim + cfg.appended_input_dim
    if cfg.VoiceConversion:
        lab_dim = cfg.cmp_dim

    logger.info('label dimension is %d' % lab_dim)

    combined_model_arch = str(len(hidden_layer_size))
    for hid_size in hidden_layer_size:
        combined_model_arch += '_' + str(hid_size)

    nnets_file_name = file_paths.get_nnets_file_name()
    temp_dir_name = file_paths.get_temp_nn_dir_name()

    gen_dir = os.path.join(gen_dir, temp_dir_name)

    if cfg.switch_to_keras:
        ### set configuration variables ###
        cfg.inp_dim = lab_dim
        cfg.out_dim = cfg.cmp_dim

        cfg.inp_feat_dir  = nn_label_norm_dir
        cfg.out_feat_dir  = nn_cmp_norm_dir
        cfg.pred_feat_dir = gen_dir

        if cfg.GenTestList and cfg.test_synth_dir!="None":
            cfg.inp_feat_dir  = cfg.test_synth_dir
            cfg.pred_feat_dir = cfg.test_synth_dir
        
        ### call kerasclass and use an instance ###
        keras_instance = KerasClass(cfg)

    ### DNN model training
    if cfg.TRAINDNN:

        var_dict = load_covariance(var_file_dict, cfg.out_dimension_dict)

        logger.info('training DNN')

        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        cmp_mean_vector = cmp_min_max[0, ]
        cmp_std_vector  = cmp_min_max[1, ]
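        # cmp_min_max holds two rows of length cmp_dim: for MVN normalisation these are the
        # global mean and standard deviation saved above; for MINMAX they would be the min and
        # max vectors (the variable names here assume MVN)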


        try:
            os.makedirs(model_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create model directory %s' % model_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        try:
            if cfg.switch_to_keras:
                keras_instance.train_keras_model()
            else:
                train_DNN(train_xy_file_list = (train_x_file_list, train_y_file_list), \
                      valid_xy_file_list = (valid_x_file_list, valid_y_file_list), \
                      nnets_file_name = nnets_file_name, \
                      n_ins = lab_dim, n_outs = cfg.cmp_dim, ms_outs = cfg.multistream_outs, \
                      hyper_params = cfg.hyper_params, buffer_size = cfg.buffer_size, plot = cfg.plot, var_dict = var_dict,
                      cmp_mean_vector = cmp_mean_vector, cmp_std_vector = cmp_std_vector)
        except KeyboardInterrupt:
            logger.critical('train_DNN interrupted via keyboard')
            # Could 'raise' the exception further, but that causes a deep traceback to be printed
            # which we don't care about for a keyboard interrupt. So, just bail out immediately
            sys.exit(1)
        except:
            logger.critical('train_DNN threw an exception')
            raise



    if cfg.GENBNFEA:
        # Please only tune on this step when you want to generate bottleneck features from DNN
        gen_dir = file_paths.bottleneck_features

        bottleneck_size = min(hidden_layer_size)
        bottleneck_index = 0
        for i in range(len(hidden_layer_size)):
            if hidden_layer_size[i] == bottleneck_size:
                bottleneck_index = i
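        # the bottleneck is taken to be the narrowest hidden layer; if several layers share
        # the minimum size, the loop above leaves bottleneck_index at the last of them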

        logger.info('generating bottleneck features from DNN')

        try:
            os.makedirs(gen_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create generation directory %s' % gen_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        gen_file_id_list = file_id_list[0:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
        test_x_file_list = nn_label_norm_file_list[0:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

        gen_file_list = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.cmp_ext)

        dnn_hidden_generation(test_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list, bottleneck_index)

    ### generate parameters from DNN
    gen_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
    test_x_file_list  = nn_label_norm_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

    if cfg.GenTestList:
        gen_file_id_list = test_id_list
        test_x_file_list = nn_label_norm_file_list
        if cfg.test_synth_dir!="None":
            gen_dir = cfg.test_synth_dir

    if cfg.DNNGEN:
        logger.info('generating from DNN')

        try:
            os.makedirs(gen_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create generation directory %s' % gen_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        gen_file_list = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.cmp_ext)


        if cfg.switch_to_keras:
            keras_instance.test_keras_model()
        else:
            reshape_io = True if cfg.rnn_batch_training else False
            dnn_generation(test_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list, reshape_io)

        logger.debug('denormalising generated output using method %s' % cfg.output_feature_normalisation)

        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        cmp_min_vector = cmp_min_max[0, ]
        cmp_max_vector = cmp_min_max[1, ]

        if cfg.output_feature_normalisation == 'MVN':
            denormaliser = MeanVarianceNorm(feature_dimension = cfg.cmp_dim)
            denormaliser.feature_denormalisation(gen_file_list, gen_file_list, cmp_min_vector, cmp_max_vector)

        elif cfg.output_feature_normalisation == 'MINMAX':
            denormaliser = MinMaxNormalisation(cfg.cmp_dim, min_value = 0.01, max_value = 0.99, min_vector = cmp_min_vector, max_vector = cmp_max_vector)
            denormaliser.denormalise_data(gen_file_list, gen_file_list)
        else:
            logger.critical('denormalising method %s is not supported!\n' %(cfg.output_feature_normalisation))
            raise

        if cfg.AcousticModel:
            ##perform MLPG to smooth parameter trajectory
            ## if lf0 is included, the output features must have vuv.
            generator = ParameterGeneration(gen_wav_features = cfg.gen_wav_features, enforce_silence = cfg.enforce_silence)
            generator.acoustic_decomposition(gen_file_list, cfg.cmp_dim, cfg.out_dimension_dict, cfg.file_extension_dict, var_file_dict, do_MLPG=cfg.do_MLPG, cfg=cfg)
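            ## acoustic_decomposition splits the generated cmp vectors back into the individual
            ## streams (mgc, lf0, bap, ...) defined by out_dimension_dict and, when do_MLPG is
            ## set, smooths each trajectory with maximum-likelihood parameter generation using
            ## the per-stream variances in var_file_dict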

        if cfg.DurationModel:
            ### Perform duration normalisation (minimum state duration set to 1) ###
            gen_dur_list   = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.dur_ext)
            gen_label_list = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.lab_ext)
            in_gen_label_align_file_list = prepare_file_path_list(gen_file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)

            generator = ParameterGeneration(gen_wav_features = cfg.gen_wav_features)
            generator.duration_decomposition(gen_file_list, cfg.cmp_dim, cfg.out_dimension_dict, cfg.file_extension_dict)

            label_modifier = HTSLabelModification(silence_pattern = cfg.silence_pattern, label_type = cfg.label_type)
            label_modifier.modify_duration_labels(in_gen_label_align_file_list, gen_dur_list, gen_label_list)
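            ## duration_decomposition extracts the predicted durations from the generated cmp
            ## files; modify_duration_labels then rewrites the timings in the input label
            ## alignments so synthesis can run on the predicted durations (with the minimum
            ## state duration of 1 noted above)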


    ### generate wav
    if cfg.GENWAV:
        logger.info('reconstructing waveform(s)')
        generate_wav(gen_dir, gen_file_id_list, cfg)     # generated speech
#       generate_wav(nn_cmp_dir, gen_file_id_list, cfg)  # reference copy synthesis speech

    ### setting back to original conditions before calculating objective scores ###
    if cfg.GenTestList:
        in_label_align_file_list = prepare_file_path_list(file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)
        binary_label_file_list   = prepare_file_path_list(file_id_list, binary_label_dir, cfg.lab_ext)
        gen_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

    ### evaluation: RMSE and CORR for duration
    if cfg.CALMCD and cfg.DurationModel:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(inter_data_dir, 'ref_data')

        ref_dur_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.dur_ext)

        in_gen_label_align_file_list = in_label_align_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
        calculator = IndividualDistortionComp()

        valid_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number]
        test_file_id_list  = file_id_list[cfg.train_file_number+cfg.valid_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            untrimmed_reference_data = in_file_list_dict['dur'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
            trim_silence(untrimmed_reference_data, ref_dur_list, cfg.dur_dim, \
                                untrimmed_test_labels, lab_dim, silence_feature)
        else:
            remover = SilenceRemover(n_cmp = cfg.dur_dim, silence_pattern = cfg.silence_pattern, label_type=cfg.label_type, remove_frame_features = cfg.add_frame_features)
            remover.remove_silence(in_file_list_dict['dur'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_dur_list)

        valid_dur_rmse, valid_dur_corr = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.dur_ext, cfg.dur_dim)
        test_dur_rmse, test_dur_corr = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.dur_ext, cfg.dur_dim)

        logger.info('Develop: DNN -- RMSE: %.3f frames/phoneme; CORR: %.3f; ' \
                    %(valid_dur_rmse, valid_dur_corr))
        logger.info('Test: DNN -- RMSE: %.3f frames/phoneme; CORR: %.3f; ' \
                    %(test_dur_rmse, test_dur_corr))

    ### evaluation: calculate distortion
    if cfg.CALMCD and cfg.AcousticModel:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(inter_data_dir, 'ref_data')

        ref_mgc_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.mgc_ext)
        ref_bap_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.bap_ext)
        ref_lf0_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.lf0_ext)

        in_gen_label_align_file_list = in_label_align_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
        calculator = IndividualDistortionComp()

        spectral_distortion = 0.0
        bap_mse             = 0.0
        f0_mse              = 0.0
        vuv_error           = 0.0

        valid_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number]
        test_file_id_list  = file_id_list[cfg.train_file_number+cfg.valid_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            ## get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim=label_composer.compute_label_dimension()

            ## use first feature in label -- hardcoded for now
            silence_feature = 0

            ## Use these to trim silence:
            untrimmed_test_labels = binary_label_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]


        if 'mgc' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['mgc'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_mgc_list, cfg.mgc_dim, \
                                    untrimmed_test_labels, lab_dim, silence_feature)
            elif cfg.remove_silence_using_hts_labels:
                remover = SilenceRemover(n_cmp = cfg.mgc_dim, silence_pattern = cfg.silence_pattern, label_type=cfg.label_type)
                remover.remove_silence(in_file_list_dict['mgc'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_mgc_list)
            else:
                ref_data_dir = os.path.join(data_dir, 'mgc')
            valid_spectral_distortion = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.mgc_ext, cfg.mgc_dim)
            test_spectral_distortion  = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.mgc_ext, cfg.mgc_dim)
            valid_spectral_distortion *= (10 /numpy.log(10)) * numpy.sqrt(2.0)    ##MCD
            test_spectral_distortion  *= (10 /numpy.log(10)) * numpy.sqrt(2.0)    ##MCD
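            ## the (10 / ln 10) * sqrt(2) factor converts the Euclidean distance between
            ## mel-cepstra into mel-cepstral distortion in dB, following the standard definition
            ##   MCD = (10 / ln 10) * sqrt(2 * sum_d (c_d - c_hat_d)^2)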


        if 'bap' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['bap'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_bap_list, cfg.bap_dim, \
                                    untrimmed_test_labels, lab_dim, silence_feature)
            elif cfg.remove_silence_using_hts_labels:
                remover = SilenceRemover(n_cmp = cfg.bap_dim, silence_pattern = cfg.silence_pattern, label_type=cfg.label_type)
                remover.remove_silence(in_file_list_dict['bap'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_bap_list)
            else:
                ref_data_dir = os.path.join(data_dir, 'bap')
            valid_bap_mse = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.bap_ext, cfg.bap_dim)
            test_bap_mse  = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.bap_ext, cfg.bap_dim)
            valid_bap_mse = valid_bap_mse / 10.0    ## Cassia's bap is computed from 10*log|S(w)|; for HTS/SPTK-style bap, apply the same scaling as for MGC
            test_bap_mse  = test_bap_mse / 10.0    ## Cassia's bap is computed from 10*log|S(w)|; for HTS/SPTK-style bap, apply the same scaling as for MGC

        if 'lf0' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['lf0'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_lf0_list, cfg.lf0_dim, \
                                    untrimmed_test_labels, lab_dim, silence_feature)
            elif cfg.remove_silence_using_hts_labels:
                remover = SilenceRemover(n_cmp = cfg.lf0_dim, silence_pattern = cfg.silence_pattern, label_type=cfg.label_type)
                remover.remove_silence(in_file_list_dict['lf0'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_lf0_list)
            else:
                ref_data_dir = os.path.join(data_dir, 'lf0')
            valid_f0_mse, valid_f0_corr, valid_vuv_error   = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.lf0_ext, cfg.lf0_dim)
            test_f0_mse , test_f0_corr, test_vuv_error    = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.lf0_ext, cfg.lf0_dim)

        logger.info('Develop: DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
                    %(valid_spectral_distortion, valid_bap_mse, valid_f0_mse, valid_f0_corr, valid_vuv_error*100.))
        logger.info('Test   : DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
                    %(test_spectral_distortion , test_bap_mse , test_f0_mse , test_f0_corr, test_vuv_error*100.))
Example #8
0
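    # `p` is assumed to be an argparse.ArgumentParser created earlier in this script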
    p.add_argument('-s', '--senlst', dest='senlst', required=True)
    p.add_argument('-c', '--config', dest='config', required=True)
    a = p.parse_args()

    load_config(a.config)
    from __init__ import *

    with open(a.senlst) as f:
        sentences = [l.rstrip() for l in f if l]

    hts2 = [path.join(HTS2DIR, s + '.lab') for s in sentences]
    lab1 = [path.join(LAB1DIR, s + '.lab') for s in sentences]
    lab2 = [path.join(LAB2DIR, s + '.lab') for s in sentences]
    lab3 = [path.join(LAB3DIR, s + '.lab') for s in sentences]

    binarizer = HTSLabelNormalisation(
        question_file_name=path.join(RESDIR, '600.hed'))
    binarizer.perform_normalisation(hts2, lab1)

    remover = SilenceRemover(n_cmp=binarizer.dimension,
                             silence_pattern=['*-#+*'])
    remover.remove_silence(lab1, hts2, lab2)

    normalizer = MinMaxNormalisation(feature_dimension=binarizer.dimension,
                                     min_value=0.01,
                                     max_value=0.99)
    normalizer.find_min_max_values(lab2)
    print1(normalizer.min_vector)
    print1(normalizer.max_vector)
    lu.write_binfile(normalizer.min_vector, path.join(LABSDIR, 'min'))
    lu.write_binfile(normalizer.max_vector, path.join(LABSDIR, 'max'))
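    # the 'min' and 'max' files written to LABSDIR hold the per-dimension extrema of the
    # silence-trimmed binarised labels, i.e. the statistics MinMaxNormalisation needs to
    # scale labels (e.g. unseen ones at synthesis time) into the same [0.01, 0.99] range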
Example #9
0
def main_function(cfg):


    # get a logger for this main function
    logger = logging.getLogger("main")

    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")

    # later, we might do this via a handler that is created, attached and configured
    # using the standard config mechanism of the logging module
    # but for now we need to do it manually
    plotlogger.set_plot_path(cfg.plot_dir)

    #### parameter setting########
    hidden_layer_size = cfg.hyper_params['hidden_layer_size']


    ####prepare environment

    try:
        file_id_list = read_file_list(cfg.file_id_scp)
        logger.debug('Loaded file id list from %s' % cfg.file_id_scp)
    except IOError:
        # this means that open(...) threw an error
        logger.critical('Could not load file id list from %s' % cfg.file_id_scp)
        raise

    ###total file number including training, development, and testing
    total_file_number = len(file_id_list)

    data_dir = cfg.data_dir

    nn_cmp_dir       = os.path.join(data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    nn_cmp_norm_dir   = os.path.join(data_dir, 'nn_norm'  + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))

    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    gen_dir   = os.path.join(cfg.work_dir, 'gen')

    in_file_list_dict = {}

    for feature_name in list(cfg.in_dir_dict.keys()):
        in_file_list_dict[feature_name] = prepare_file_path_list(file_id_list, cfg.in_dir_dict[feature_name], cfg.file_extension_dict[feature_name], False)

    nn_cmp_file_list         = prepare_file_path_list(file_id_list, nn_cmp_dir, cfg.cmp_ext)
    nn_cmp_norm_file_list    = prepare_file_path_list(file_id_list, nn_cmp_norm_dir, cfg.cmp_ext)

    ###normalisation information
    norm_info_file = os.path.join(data_dir, 'norm_info' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label
    # currently supporting two different forms of linguistic features
    # later, we should generalise this

    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim
        logger.info('Input label dimension is %d' % lab_dim)
        suffix=str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix='composed'

    if cfg.process_labels_in_work_dir:
        label_data_dir = cfg.work_dir
    else:
        label_data_dir = data_dir

    # the number can be removed
    binary_label_dir      = os.path.join(label_data_dir, 'binary_label_'+suffix)
    nn_label_dir          = os.path.join(label_data_dir, 'nn_no_silence_lab_'+suffix)
    nn_label_norm_dir     = os.path.join(label_data_dir, 'nn_no_silence_lab_norm_'+suffix)

    in_label_align_file_list = prepare_file_path_list(file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)
    binary_label_file_list   = prepare_file_path_list(file_id_list, binary_label_dir, cfg.lab_ext)
    nn_label_file_list       = prepare_file_path_list(file_id_list, nn_label_dir, cfg.lab_ext)
    nn_label_norm_file_list  = prepare_file_path_list(file_id_list, nn_label_norm_dir, cfg.lab_ext)

    # to do - sanity check the label dimension here?



    min_max_normaliser = None
    label_norm_file = 'label_norm_%s.dat' %(cfg.label_style)
    label_norm_file = os.path.join(label_data_dir, label_norm_file)

    if cfg.NORMLAB and (cfg.label_style == 'HTS'):
        # simple HTS labels
        logger.info('preparing label data (input) using standard HTS style labels')
        label_normaliser.perform_normalisation(in_label_align_file_list, binary_label_file_list)

        remover = SilenceRemover(n_cmp = lab_dim, silence_pattern = cfg.silence_pattern)
        remover.remove_silence(binary_label_file_list, in_label_align_file_list, nn_label_file_list)

        min_max_normaliser = MinMaxNormalisation(feature_dimension = lab_dim, min_value = 0.01, max_value = 0.99)
        ### use only training data to find min-max information, then apply it to the whole dataset
        min_max_normaliser.find_min_max_values(nn_label_file_list[0:cfg.train_file_number])
        min_max_normaliser.normalise_data(nn_label_file_list, nn_label_norm_file_list)


    if cfg.NORMLAB and (cfg.label_style == 'composed'):
        # new flexible label preprocessor

        logger.info('preparing label data (input) using "composed" style labels')
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)

        logger.info('Loaded label configuration')
        # logger.info('%s' % label_composer.configuration.labels )

        lab_dim=label_composer.compute_label_dimension()
        logger.info('label dimension will be %d' % lab_dim)

        if cfg.precompile_xpaths:
            label_composer.precompile_xpaths()

        # there are now a set of parallel input label files (e.g, one set of HTS and another set of Ossian trees)
        # create all the lists of these, ready to pass to the label composer

        in_label_align_file_list = {}
        for label_style, label_style_required in label_composer.label_styles.items():
            if label_style_required:
                logger.info('labels of style %s are required - constructing file paths for them' % label_style)
                if label_style == 'xpath':
                    in_label_align_file_list['xpath'] = prepare_file_path_list(file_id_list, cfg.xpath_label_align_dir, cfg.utt_ext, False)
                elif label_style == 'hts':
                    in_label_align_file_list['hts'] = prepare_file_path_list(file_id_list, cfg.hts_label_align_dir, cfg.lab_ext, False)
                else:
                    logger.critical('unsupported label style %s specified in label configuration' % label_style)
                    raise Exception

            # now iterate through the files, one at a time, constructing the labels for them
            num_files=len(file_id_list)
            logger.info('the label styles required are %s' % label_composer.label_styles)

            for i in range(num_files):
                logger.info('making input label features for %4d of %4d' % (i+1,num_files))

                # iterate through the required label styles and open each corresponding label file

                # a dictionary of file descriptors, pointing at the required files
                required_labels={}

                for label_style, label_style_required in label_composer.label_styles.items():

                    # the files will be a parallel set of files for a single utterance
                    # e.g., the XML tree and an HTS label file
                    if label_style_required:
                        required_labels[label_style] = open(in_label_align_file_list[label_style][i] , 'r')
                        logger.debug(' opening label file %s' % in_label_align_file_list[label_style][i])

                logger.debug('label styles with open files: %s' % required_labels)
                label_composer.make_labels(required_labels,out_file_name=binary_label_file_list[i],fill_missing_values=cfg.fill_missing_values,iterate_over_frames=cfg.iterate_over_frames)

                # now close all opened files
                for fd in required_labels.values():
                    fd.close()


        # silence removal
        if cfg.remove_silence_using_binary_labels:
            silence_feature = 0 ## use first feature in label -- hardcoded for now
            logger.info('Silence removal from label using silence feature: %s'%(label_composer.configuration.labels[silence_feature]))
            logger.info('Silence will be removed from CMP files in same way')
            ## Binary labels have 2 roles: both the thing trimmed and the instructions for trimming:
            trim_silence(binary_label_file_list, nn_label_file_list, lab_dim, \
                                binary_label_file_list, lab_dim, silence_feature)
        else:
            logger.info('No silence removal done')
            # start from the labels we have just produced, not trimmed versions
            nn_label_file_list = binary_label_file_list

        min_max_normaliser = MinMaxNormalisation(feature_dimension = lab_dim, min_value = 0.01, max_value = 0.99)
        ### use only training data to find min-max information, then apply it to the whole dataset
        min_max_normaliser.find_min_max_values(nn_label_file_list[0:cfg.train_file_number])
        min_max_normaliser.normalise_data(nn_label_file_list, nn_label_norm_file_list)

    if min_max_normaliser != None:
        ### save label normalisation information for unseen testing labels
        label_min_vector = min_max_normaliser.min_vector
        label_max_vector = min_max_normaliser.max_vector
        label_norm_info = numpy.concatenate((label_min_vector, label_max_vector), axis=0)

        label_norm_info = numpy.array(label_norm_info, 'float32')
        fid = open(label_norm_file, 'wb')
        label_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' %(label_min_vector.size, label_norm_file))




    ### make output acoustic data
    if cfg.MAKECMP:
        logger.info('creating acoustic (output) features')
        delta_win = cfg.delta_win #[-0.5, 0.0, 0.5]
        acc_win = cfg.acc_win     #[1.0, -2.0, 1.0]

        acoustic_worker = AcousticComposition(delta_win = delta_win, acc_win = acc_win)
        acoustic_worker.prepare_nn_data(in_file_list_dict, nn_cmp_file_list, cfg.in_dimension_dict, cfg.out_dimension_dict)

        if cfg.remove_silence_using_binary_labels:
            ## do this to get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim=label_composer.compute_label_dimension()

            silence_feature = 0 ## use first feature in label -- hardcoded for now
            logger.info('Silence removal from CMP using binary label file')

            ## overwrite the untrimmed audio with the trimmed version:
            trim_silence(nn_cmp_file_list, nn_cmp_file_list, cfg.cmp_dim,
                                binary_label_file_list, lab_dim, silence_feature)

        else: ## back off to previous method using HTS labels:
            remover = SilenceRemover(n_cmp = cfg.cmp_dim, silence_pattern = cfg.silence_pattern)
            remover.remove_silence(nn_cmp_file_list[0:cfg.train_file_number+cfg.valid_file_number],
                                   in_label_align_file_list[0:cfg.train_file_number+cfg.valid_file_number],
                                   nn_cmp_file_list[0:cfg.train_file_number+cfg.valid_file_number]) # save to itself

    ### save acoustic normalisation information for normalising the features back
    var_dir   = os.path.join(data_dir, 'var')
    if not os.path.exists(var_dir):
        os.makedirs(var_dir)

    var_file_dict = {}
    for feature_name in list(cfg.out_dimension_dict.keys()):
        var_file_dict[feature_name] = os.path.join(var_dir, feature_name + '_' + str(cfg.out_dimension_dict[feature_name]))

    ### normalise output acoustic data
    if cfg.NORMCMP:
        logger.info('normalising acoustic (output) features using method %s' % cfg.output_feature_normalisation)
        cmp_norm_info = None
        if cfg.output_feature_normalisation == 'MVN':
            normaliser = MeanVarianceNorm(feature_dimension=cfg.cmp_dim)
            ### calculate mean and std vectors on the training data, then apply them to the whole dataset
            global_mean_vector = normaliser.compute_mean(nn_cmp_file_list[0:cfg.train_file_number], 0, cfg.cmp_dim)
            global_std_vector = normaliser.compute_std(nn_cmp_file_list[0:cfg.train_file_number], global_mean_vector, 0, cfg.cmp_dim)

            normaliser.feature_normalisation(nn_cmp_file_list[0:cfg.train_file_number+cfg.valid_file_number],
                                             nn_cmp_norm_file_list[0:cfg.train_file_number+cfg.valid_file_number])
            cmp_norm_info = numpy.concatenate((global_mean_vector, global_std_vector), axis=0)

        elif cfg.output_feature_normalisation == 'MINMAX':
            min_max_normaliser = MinMaxNormalisation(feature_dimension = cfg.cmp_dim)
            global_mean_vector = min_max_normaliser.compute_mean(nn_cmp_file_list[0:cfg.train_file_number])
            global_std_vector = min_max_normaliser.compute_std(nn_cmp_file_list[0:cfg.train_file_number], global_mean_vector)

            min_max_normaliser = MinMaxNormalisation(feature_dimension = cfg.cmp_dim, min_value = 0.01, max_value = 0.99)
            min_max_normaliser.find_min_max_values(nn_cmp_file_list[0:cfg.train_file_number])
            min_max_normaliser.normalise_data(nn_cmp_file_list, nn_cmp_norm_file_list)

            cmp_min_vector = min_max_normaliser.min_vector
            cmp_max_vector = min_max_normaliser.max_vector
            cmp_norm_info = numpy.concatenate((cmp_min_vector, cmp_max_vector), axis=0)

        else:
            logger.critical('Normalisation type %s is not supported!\n' %(cfg.output_feature_normalisation))
            raise

        cmp_norm_info = numpy.array(cmp_norm_info, 'float32')
        fid = open(norm_info_file, 'wb')
        cmp_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' %(cfg.output_feature_normalisation, norm_info_file))

        feature_index = 0
        for feature_name in list(cfg.out_dimension_dict.keys()):
            feature_std_vector = numpy.array(global_std_vector[:,feature_index:feature_index+cfg.out_dimension_dict[feature_name]], 'float32')

            fid = open(var_file_dict[feature_name], 'w')
            feature_std_vector.tofile(fid)
            fid.close()

            logger.info('saved %s variance vector to %s' %(feature_name, var_file_dict[feature_name]))

            feature_index += cfg.out_dimension_dict[feature_name]

    train_x_file_list = nn_label_norm_file_list[0:cfg.train_file_number]
    train_y_file_list = nn_cmp_norm_file_list[0:cfg.train_file_number]
    valid_x_file_list = nn_label_norm_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number]
    valid_y_file_list = nn_cmp_norm_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number]
    test_x_file_list  = nn_label_norm_file_list[cfg.train_file_number+cfg.valid_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
    test_y_file_list  = nn_cmp_norm_file_list[cfg.train_file_number+cfg.valid_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]


    # we need to know the label dimension before training the DNN
    # computing that requires us to look at the labels
    #
    # currently, there are two ways to do this
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim

    elif cfg.label_style == 'composed':
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)
        lab_dim=label_composer.compute_label_dimension()

    logger.info('label dimension is %d' % lab_dim)

    combined_model_arch = str(len(hidden_layer_size))
    for hid_size in hidden_layer_size:
        combined_model_arch += '_' + str(hid_size)

    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.%f.nn.model' \
                      %(model_dir, cfg.combined_model_name, cfg.combined_feature_name, int(cfg.multistream_switch),
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number, cfg.hyper_params['learning_rate'])
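    # the model filename encodes the model and feature names, the multistream switch, the
    # layer architecture, the input and output dimensions, the number of training files and
    # the learning rate, so the matching trained network can be located again at generation time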


    ### DNN model training
    if cfg.TRAINDNN:

        var_dict = load_covariance(var_file_dict, cfg.out_dimension_dict)

        logger.info('training DNN')

        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        cmp_mean_vector = cmp_min_max[0, ]
        cmp_std_vector  = cmp_min_max[1, ]


        try:
            os.makedirs(model_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create model directory %s' % model_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        try:
            train_DNN(train_xy_file_list = (train_x_file_list, train_y_file_list), \
                      valid_xy_file_list = (valid_x_file_list, valid_y_file_list), \
                      nnets_file_name = nnets_file_name, \
                      n_ins = lab_dim, n_outs = cfg.cmp_dim, ms_outs = cfg.multistream_outs, \
                      hyper_params = cfg.hyper_params, buffer_size = cfg.buffer_size, plot = cfg.plot, var_dict = var_dict,
                      cmp_mean_vector = cmp_mean_vector, cmp_std_vector = cmp_std_vector, init_dnn_model_file = cfg.start_from_trained_model)
        except KeyboardInterrupt:
            logger.critical('train_DNN interrupted via keyboard')
            # Could 'raise' the exception further, but that causes a deep traceback to be printed
            # which we don't care about for a keyboard interrupt. So, just bail out immediately
            sys.exit(1)
        except:
            logger.critical('train_DNN threw an exception')
            raise

    ### generate parameters from DNN
    temp_dir_name = '%s_%s_%d_%d_%d_%d_%d_%d' \
                    %(cfg.combined_model_name, cfg.combined_feature_name, int(cfg.do_post_filtering), \
                      cfg.train_file_number, lab_dim, cfg.cmp_dim, \
                      len(hidden_layer_size), hidden_layer_size[0])
    gen_dir = os.path.join(gen_dir, temp_dir_name)

    gen_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
    test_x_file_list  = nn_label_norm_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

    if cfg.DNNGEN:
        logger.info('generating from DNN')

        try:
            os.makedirs(gen_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create generation directory %s' % gen_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        gen_file_list = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.cmp_ext)
        dnn_generation(test_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)

        logger.debug('denormalising generated output using method %s' % cfg.output_feature_normalisation)

        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        cmp_min_vector = cmp_min_max[0, ]
        cmp_max_vector = cmp_min_max[1, ]

        if cfg.output_feature_normalisation == 'MVN':
            denormaliser = MeanVarianceNorm(feature_dimension = cfg.cmp_dim)
            denormaliser.feature_denormalisation(gen_file_list, gen_file_list, cmp_min_vector, cmp_max_vector)

        elif cfg.output_feature_normalisation == 'MINMAX':
            denormaliser = MinMaxNormalisation(cfg.cmp_dim, min_value = 0.01, max_value = 0.99, min_vector = cmp_min_vector, max_vector = cmp_max_vector)
            denormaliser.denormalise_data(gen_file_list, gen_file_list)
        else:
            logger.critical('denormalising method %s is not supported!\n' %(cfg.output_feature_normalisation))
            raise

        ##perform MLPG to smooth parameter trajectory
        ## if lf0 is included, the output features must have vuv.
        generator = ParameterGeneration(gen_wav_features = cfg.gen_wav_features)
        generator.acoustic_decomposition(gen_file_list, cfg.cmp_dim, cfg.out_dimension_dict, cfg.file_extension_dict, var_file_dict)


    ### generate wav
    if cfg.GENWAV:
        logger.info('reconstructing waveform(s)')
        print(len(gen_file_id_list))
        generate_wav(gen_dir, gen_file_id_list[cfg.valid_file_number:cfg.valid_file_number+cfg.test_file_number], cfg)     # generated speech
#       generate_wav(nn_cmp_dir, gen_file_id_list)  # reference copy synthesis speech

    ### evaluation: calculate distortion
    if cfg.CALMCD:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(data_dir, 'ref_data')

        ref_mgc_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.mgc_ext)
        ref_bap_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.bap_ext)
        ref_lf0_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.lf0_ext)

        in_gen_label_align_file_list = in_label_align_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
        calculator = IndividualDistortionComp()

        spectral_distortion = 0.0
        bap_mse             = 0.0
        f0_mse              = 0.0
        vuv_error           = 0.0

        valid_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number]
        test_file_id_list  = file_id_list[cfg.train_file_number+cfg.valid_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            ## get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim=label_composer.compute_label_dimension()

            ## use first feature in label -- hardcoded for now
            silence_feature = 0

            ## Use these to trim silence:
            untrimmed_test_labels = binary_label_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]


        if 'mgc' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['mgc'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_mgc_list, cfg.mgc_dim, \
                                    untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp = cfg.mgc_dim, silence_pattern = cfg.silence_pattern)
                remover.remove_silence(in_file_list_dict['mgc'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_mgc_list)
            valid_spectral_distortion = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.mgc_ext, cfg.mgc_dim)
            test_spectral_distortion  = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.mgc_ext, cfg.mgc_dim)
            valid_spectral_distortion *= (10 /numpy.log(10)) * numpy.sqrt(2.0)    ##MCD
            test_spectral_distortion  *= (10 /numpy.log(10)) * numpy.sqrt(2.0)    ##MCD


        if 'bap' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['bap'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_bap_list, cfg.bap_dim, \
                                    untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp = cfg.bap_dim, silence_pattern = cfg.silence_pattern)
                remover.remove_silence(in_file_list_dict['bap'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_bap_list)
            valid_bap_mse        = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.bap_ext, cfg.bap_dim)
            test_bap_mse         = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.bap_ext, cfg.bap_dim)
            valid_bap_mse = valid_bap_mse / 10.0    ## Cassia's bap is computed from 10*log|S(w)|; for HTS/SPTK-style bap, apply the same scaling as for MGC
            test_bap_mse  = test_bap_mse / 10.0    ## Cassia's bap is computed from 10*log|S(w)|; for HTS/SPTK-style bap, apply the same scaling as for MGC

        if 'lf0' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['lf0'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_lf0_list, cfg.lf0_dim, \
                                    untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp = cfg.lf0_dim, silence_pattern = cfg.silence_pattern)
                remover.remove_silence(in_file_list_dict['lf0'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_lf0_list)
            valid_f0_mse, valid_vuv_error   = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.lf0_ext, cfg.lf0_dim)
            test_f0_mse , test_vuv_error    = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.lf0_ext, cfg.lf0_dim)

        logger.info('Develop: DNN -- MCD: %.3f dB; BAP: %.3f dB; F0: %.3f Hz; VUV: %.3f%%' \
                    %(valid_spectral_distortion, valid_bap_mse, valid_f0_mse, valid_vuv_error*100.))
        logger.info('Test   : DNN -- MCD: %.3f dB; BAP: %.3f dB; F0: %.3f Hz; VUV: %.3f%%' \
                    %(test_spectral_distortion , test_bap_mse , test_f0_mse , test_vuv_error*100.))
Example #10
0
def main_function(cfg, in_dir, out_dir):

    # get a logger for this main function
    logger = logging.getLogger("main")

    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")

    #### parameter setting########
    hidden_layers_sizes = cfg.hyper_params['hidden_layer_size']

    file_id_list = []

    if cfg.label_style == 'HTS':
        ext = '.lab'
    else:
        ext = '.utt'

    synth_utts = glob.glob(in_dir + '/*' + ext)
    for fname in synth_utts:
        junk, name = os.path.split(fname)
        file_id_list.append(name.replace(ext, ''))

    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    ###total file number including training, development, and testing
    #total_file_number = len(file_id_list)

    data_dir = cfg.data_dir

    #nn_cmp_dir       = os.path.join(data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    #nn_cmp_norm_dir   = os.path.join(data_dir, 'nn_norm'  + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))

    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    gen_dir = os.path.join(out_dir, 'gen')

    #in_file_list_dict = {}

    #for feature_name in cfg.in_dir_dict.keys():
    #    in_file_list_dict[feature_name] = prepare_file_path_list(file_id_list, cfg.in_dir_dict[feature_name], cfg.file_extension_dict[feature_name], False)

    #nn_cmp_file_list         = prepare_file_path_list(file_id_list, nn_cmp_dir, cfg.cmp_ext)
    #nn_cmp_norm_file_list    = prepare_file_path_list(file_id_list, nn_cmp_norm_dir, cfg.cmp_ext)

    ###normalisation information
    norm_info_file = os.path.join(
        data_dir, 'norm_info' + cfg.combined_feature_name + '_' +
        str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label

    # currently supporting two different forms of linguistic features
    # later, we should generalise this

    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(
            question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension
        logger.info('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix = 'composed'

    # the number can be removed
    binary_label_dir = os.path.join(out_dir, 'lab_bin')
    nn_label_norm_dir = os.path.join(out_dir, 'lab_bin_norm')

    in_label_align_file_list = prepare_file_path_list(file_id_list, in_dir,
                                                      cfg.lab_ext)
    binary_label_file_list = prepare_file_path_list(file_id_list,
                                                    binary_label_dir,
                                                    cfg.lab_ext)
    nn_label_norm_file_list = prepare_file_path_list(file_id_list,
                                                     nn_label_norm_dir,
                                                     cfg.lab_ext)

    ## need this to find normalisation info:
    if cfg.process_labels_in_work_dir:
        label_data_dir = cfg.work_dir
    else:
        label_data_dir = data_dir

    min_max_normaliser = None
    label_norm_file = 'label_norm_%s.dat' % (cfg.label_style)
    label_norm_file = os.path.join(label_data_dir, label_norm_file)

    if cfg.label_style == 'HTS':
        # simple HTS labels
        logger.info(
            'preparing label data (input) using standard HTS style labels')
        label_normaliser.perform_normalisation(in_label_align_file_list,
                                               binary_label_file_list)

    else:

        logger.info(
            'preparing label data (input) using "composed" style labels')
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)

        logger.info('Loaded label configuration')
        # logger.info('%s' % label_composer.configuration.labels )

        lab_dim = label_composer.compute_label_dimension()
        logger.info('label dimension will be %d' % lab_dim)

        if cfg.precompile_xpaths:
            label_composer.precompile_xpaths()

        # there are now a set of parallel input label files (e.g, one set of HTS and another set of Ossian trees)
        # create all the lists of these, ready to pass to the label composer

        in_label_align_file_list = {}
        for label_style, label_style_required in label_composer.label_styles.iteritems(
        ):
            if label_style_required:
                logger.info(
                    'labels of style %s are required - constructing file paths for them'
                    % label_style)
                if label_style == 'xpath':
                    in_label_align_file_list['xpath'] = prepare_file_path_list(
                        file_id_list, in_dir, cfg.utt_ext, False)
                elif label_style == 'hts':
                    logger.critical('script not tested with HTS labels')
                else:
                    logger.critical(
                        'unsupported label style %s specified in label configuration'
                        % label_style)
                    raise Exception

            # now iterate through the files, one at a time, constructing the labels for them
            num_files = len(file_id_list)
            logger.info('the label styles required are %s' %
                        label_composer.label_styles)

            for i in xrange(num_files):
                logger.info('making input label features for %4d of %4d' %
                            (i + 1, num_files))

                # iterate through the required label styles and open each corresponding label file

                # a dictionary of file descriptors, pointing at the required files
                required_labels = {}

                for label_style, label_style_required in label_composer.label_styles.iteritems(
                ):

                    # the files will be a parallel set of files for a single utterance
                    # e.g., the XML tree and an HTS label file
                    if label_style_required:
                        required_labels[label_style] = open(
                            in_label_align_file_list[label_style][i], 'r')
                        logger.debug(' opening label file %s' %
                                     in_label_align_file_list[label_style][i])

                logger.debug('label styles with open files: %s' %
                             required_labels)
                label_composer.make_labels(
                    required_labels,
                    out_file_name=binary_label_file_list[i],
                    fill_missing_values=cfg.fill_missing_values,
                    iterate_over_frames=cfg.iterate_over_frames)

                # now close all opened files
                for fd in required_labels.itervalues():
                    fd.close()

    # no silence removal for synthesis ...

    ## minmax norm:
    min_max_normaliser = MinMaxNormalisation(feature_dimension=lab_dim,
                                             min_value=0.01,
                                             max_value=0.99)

    # reload stored minmax values: (TODO -- move reading and writing into MinMaxNormalisation class)
    fid = open(label_norm_file, 'rb')

    ## This doesn't work -- numpy.fromfile without an explicit dtype assumes float64 and misreads the float32 data
    #label_norm_info = numpy.fromfile(fid)  ## label_norm_info = numpy.array(label_norm_info, 'float32')

    ## use struct to enforce float32:
    nbytes = os.stat(label_norm_file)[6]  # length in bytes
    data = fid.read(nbytes)  # = read until bytes run out
    fid.close()
    m = nbytes / 4  ## number of 32-bit floats
    format = str(m) + "f"
    label_norm_info = struct.unpack(format, data)
    label_norm_info = numpy.array(label_norm_info)

    min_max_normaliser.min_vector = label_norm_info[:m / 2]
    min_max_normaliser.max_vector = label_norm_info[m / 2:]
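    ## NOTE (hedged alternative, not part of the original script): the same reload
    ## could be done without the struct module by giving numpy.fromfile an explicit
    ## dtype -- the "doesn't work" remark above applies only to fromfile's default
    ## float64 dtype:
    ##
    ##     label_norm_info = numpy.fromfile(label_norm_file, dtype=numpy.float32)
    ##     half = label_norm_info.size // 2
    ##     min_max_normaliser.min_vector = label_norm_info[:half]
    ##     min_max_normaliser.max_vector = label_norm_info[half:]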

    ###  apply precomputed min-max to the whole dataset
    min_max_normaliser.normalise_data(binary_label_file_list,
                                      nn_label_norm_file_list)

    ### make output acoustic data
    #    if cfg.MAKECMP:

    ### retrieve acoustic normalisation information for normalising the features back
    var_dir = os.path.join(data_dir, 'var')
    var_file_dict = {}
    for feature_name in cfg.out_dimension_dict.keys():
        var_file_dict[feature_name] = os.path.join(
            var_dir,
            feature_name + '_' + str(cfg.out_dimension_dict[feature_name]))

    ### normalise output acoustic data


#    if cfg.NORMCMP:

    combined_model_arch = str(len(hidden_layers_sizes))
    for hid_size in hidden_layers_sizes:
        combined_model_arch += '_' + str(hid_size)
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.model' \
                      %(model_dir, cfg.model_type, cfg.combined_feature_name, int(cfg.multistream_switch),
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number)

    ### DNN model training
    #    if cfg.TRAINDNN:

    ##if cfg.DNNGEN:
    logger.info('generating from DNN')

    try:
        os.makedirs(gen_dir)
    except OSError as e:
        if e.errno == errno.EEXIST:
            # not an error - just means directory already exists
            pass
        else:
            logger.critical('Failed to create generation directory %s' %
                            gen_dir)
            logger.critical(' OS error was: %s' % e.strerror)
            raise

    gen_file_list = prepare_file_path_list(file_id_list, gen_dir, cfg.cmp_ext)

    dnn_generation(nn_label_norm_file_list, nnets_file_name, lab_dim,
                   cfg.cmp_dim, gen_file_list)

    logger.debug('denormalising generated output using method %s' %
                 cfg.output_feature_normalisation)

    fid = open(norm_info_file, 'rb')
    cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
    fid.close()
    cmp_min_max = cmp_min_max.reshape((2, -1))
    cmp_min_vector = cmp_min_max[0, ]
    cmp_max_vector = cmp_min_max[1, ]
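    ## NOTE (illustrative): norm_info_file is assumed to hold a flat float32 array of
    ## length 2*cmp_dim -- the first cmp_dim values are the min (or mean) vector and
    ## the last cmp_dim values the max (or std) vector -- so reshape((2, -1)) splits
    ## it back into the two rows used below.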

    if cfg.output_feature_normalisation == 'MVN':
        denormaliser = MeanVarianceNorm(feature_dimension=cfg.cmp_dim)
        denormaliser.feature_denormalisation(gen_file_list, gen_file_list,
                                             cmp_min_vector, cmp_max_vector)

    elif cfg.output_feature_normalisation == 'MINMAX':
        denormaliser = MinMaxNormalisation(cfg.cmp_dim,
                                           min_value=0.01,
                                           max_value=0.99,
                                           min_vector=cmp_min_vector,
                                           max_vector=cmp_max_vector)
        denormaliser.denormalise_data(gen_file_list, gen_file_list)
    else:
        logger.critical('denormalising method %s is not supported!\n' %
                        (cfg.output_feature_normalisation))
        raise

    ##perform MLPG to smooth parameter trajectory
    ## if lf0 is included, the output features must have vuv.
    generator = ParameterGeneration(gen_wav_features=cfg.gen_wav_features)
    generator.acoustic_decomposition(gen_file_list, cfg.cmp_dim,
                                     cfg.out_dimension_dict,
                                     cfg.file_extension_dict, var_file_dict)

    logger.info('Simple variance expansion')
    test_var_scaling = False
    scaled_dir = gen_dir + '_scaled'
    if test_var_scaling:
        file_id_list = simple_scale_variance_CONTINUUM(gen_dir, scaled_dir,
                                                       var_file_dict,
                                                       cfg.out_dimension_dict,
                                                       file_id_list)
    else:
        simple_scale_variance(gen_dir,
                              scaled_dir,
                              var_file_dict,
                              cfg.out_dimension_dict,
                              file_id_list,
                              gv_weight=1.0)  ## gv_weight hard coded here!

    ### generate wav ----
    #if cfg.GENWAV:
    logger.info('reconstructing waveform(s)')
    #generate_wav_glottHMM(scaled_dir, file_id_list)
    generate_wav(scaled_dir, file_id_list, cfg)
Beispiel #11
0
def main_function_synth(cfg, dnn_model):

    # get a logger for this main function
    logger = logging.getLogger("main")

    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")

    # later, we might do this via a handler that is created, attached and configured
    # using the standard config mechanism of the logging module
    # but for now we need to do it manually
    plotlogger.set_plot_path(cfg.plot_dir)

    #### parameter setting########
    hidden_layer_size = cfg.hyper_params['hidden_layer_size']

    ####prepare environment

    try:
        file_id_list = read_file_list(cfg.file_id_scp)
        logger.debug('Loaded file id list from %s' % cfg.file_id_scp)
    except IOError:
        # this means that open(...) threw an error
        logger.critical('Could not load file id list from %s' %
                        cfg.file_id_scp)
        raise

    ###total file number including training, development, and testing
    total_file_number = len(file_id_list)

    data_dir = cfg.data_dir

    nn_cmp_dir = os.path.join(
        data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    nn_cmp_norm_dir = os.path.join(
        data_dir,
        'nn_norm' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))

    #model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    gen_dir = os.path.join(cfg.work_dir, 'gen')

    in_file_list_dict = {}

    for feature_name in cfg.in_dir_dict.keys():
        in_file_list_dict[feature_name] = prepare_file_path_list(
            file_id_list, cfg.in_dir_dict[feature_name],
            cfg.file_extension_dict[feature_name], False)

    nn_cmp_file_list = prepare_file_path_list(file_id_list, nn_cmp_dir,
                                              cfg.cmp_ext)
    nn_cmp_norm_file_list = prepare_file_path_list(file_id_list,
                                                   nn_cmp_norm_dir,
                                                   cfg.cmp_ext)

    ###normalisation information
    norm_info_file = os.path.join(
        data_dir, 'norm_info' + cfg.combined_feature_name + '_' +
        str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label
    # currently supporting two different forms of linguistic features
    # later, we should generalise this

    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(
            question_file_name=cfg.question_file_name,
            add_frame_features=cfg.add_frame_features,
            subphone_feats=cfg.subphone_feats)
        add_feat_dim = sum(cfg.additional_features.values())
        lab_dim = label_normaliser.dimension + add_feat_dim + cfg.appended_input_dim
        logger.info('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix = 'composed'
    if cfg.process_labels_in_work_dir:
        label_data_dir = cfg.work_dir
    else:
        label_data_dir = data_dir

    # the dimension number appended to the directory name could be removed
    binary_label_dir = os.path.join(
        label_data_dir, 'binary_label_' + str(label_normaliser.dimension))
    nn_label_dir = os.path.join(label_data_dir, 'nn_no_silence_lab_' + suffix)
    nn_label_norm_dir = os.path.join(label_data_dir,
                                     'nn_no_silence_lab_norm_' + suffix)

    in_label_align_file_list = prepare_file_path_list(file_id_list,
                                                      cfg.in_label_align_dir,
                                                      cfg.lab_ext, False)
    binary_label_file_list = prepare_file_path_list(file_id_list,
                                                    binary_label_dir,
                                                    cfg.lab_ext)
    nn_label_file_list = prepare_file_path_list(file_id_list, nn_label_dir,
                                                cfg.lab_ext)
    nn_label_norm_file_list = prepare_file_path_list(file_id_list,
                                                     nn_label_norm_dir,
                                                     cfg.lab_ext)
    dur_file_list = prepare_file_path_list(file_id_list, cfg.in_dur_dir,
                                           cfg.dur_ext)
    lf0_file_list = prepare_file_path_list(file_id_list, cfg.in_lf0_dir,
                                           cfg.lf0_ext)

    # to do - sanity check the label dimension here?

    min_max_normaliser = None
    label_norm_file = 'label_norm_%s_%d.dat' % (cfg.label_style, lab_dim)
    label_norm_file = os.path.join(label_data_dir, label_norm_file)

    if cfg.GenTestList:
        try:
            test_id_list = read_file_list(cfg.test_id_scp)
            logger.debug('Loaded file id list from %s' % cfg.test_id_scp)
        except IOError:
            # this means that open(...) threw an error
            logger.critical('Could not load file id list from %s' %
                            cfg.test_id_scp)
            raise

        in_label_align_file_list = prepare_file_path_list(
            test_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)
        binary_label_file_list = prepare_file_path_list(
            test_id_list, binary_label_dir, cfg.lab_ext)
        nn_label_file_list = prepare_file_path_list(test_id_list, nn_label_dir,
                                                    cfg.lab_ext)
        nn_label_norm_file_list = prepare_file_path_list(
            test_id_list, nn_label_norm_dir, cfg.lab_ext)

    if cfg.NORMLAB and (cfg.label_style == 'HTS'):
        # simple HTS labels
        logger.info(
            'preparing label data (input) using standard HTS style labels')
        label_normaliser.perform_normalisation(in_label_align_file_list,
                                               binary_label_file_list,
                                               label_type=cfg.label_type)

        if cfg.additional_features:
            out_feat_dir = os.path.join(data_dir, 'binary_label_' + suffix)
            out_feat_file_list = prepare_file_path_list(
                file_id_list, out_feat_dir, cfg.lab_ext)
            in_dim = label_normaliser.dimension
            for new_feature, new_feature_dim in cfg.additional_features.iteritems(
            ):
                new_feat_dir = os.path.join(data_dir, new_feature)
                new_feat_file_list = prepare_file_path_list(
                    file_id_list, new_feat_dir, '.' + new_feature)

                merger = MergeFeat(lab_dim=in_dim, feat_dim=new_feature_dim)
                merger.merge_data(binary_label_file_list, new_feat_file_list,
                                  out_feat_file_list)
                in_dim += new_feature_dim

                binary_label_file_list = out_feat_file_list

        remover = SilenceRemover(n_cmp=lab_dim,
                                 silence_pattern=cfg.silence_pattern,
                                 label_type=cfg.label_type,
                                 remove_frame_features=cfg.add_frame_features,
                                 subphone_feats=cfg.subphone_feats)
        remover.remove_silence(binary_label_file_list,
                               in_label_align_file_list, nn_label_file_list)

        min_max_normaliser = MinMaxNormalisation(feature_dimension=lab_dim,
                                                 min_value=0.01,
                                                 max_value=0.99)
        ###use only training data to find min-max information, then apply on the whole dataset
        if cfg.GenTestList:
            min_max_normaliser.load_min_max_values(label_norm_file)
        else:
            min_max_normaliser.find_min_max_values(
                nn_label_file_list[0:cfg.train_file_number])
        ### when enforcing silence (final synthesis only), normalise the un-trimmed binary labels so that silence frames are kept
        if cfg.GenTestList and cfg.enforce_silence:
            min_max_normaliser.normalise_data(binary_label_file_list,
                                              nn_label_norm_file_list)
        else:
            min_max_normaliser.normalise_data(nn_label_file_list,
                                              nn_label_norm_file_list)

    if cfg.NORMLAB and (cfg.label_style == 'composed'):
        # new flexible label preprocessor

        logger.info(
            'preparing label data (input) using "composed" style labels')
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)

        logger.info('Loaded label configuration')
        # logger.info('%s' % label_composer.configuration.labels )

        lab_dim = label_composer.compute_label_dimension()
        logger.info('label dimension will be %d' % lab_dim)

        if cfg.precompile_xpaths:
            label_composer.precompile_xpaths()

        # there is now a set of parallel input label files (e.g., one set of HTS labels and another set of Ossian trees)
        # create all the lists of these, ready to pass to the label composer

        in_label_align_file_list = {}
        for label_style, label_style_required in label_composer.label_styles.iteritems(
        ):
            if label_style_required:
                logger.info(
                    'labels of style %s are required - constructing file paths for them'
                    % label_style)
                if label_style == 'xpath':
                    in_label_align_file_list['xpath'] = prepare_file_path_list(
                        file_id_list, cfg.xpath_label_align_dir, cfg.utt_ext,
                        False)
                elif label_style == 'hts':
                    in_label_align_file_list['hts'] = prepare_file_path_list(
                        file_id_list, cfg.hts_label_align_dir, cfg.lab_ext,
                        False)
                else:
                    logger.critical(
                        'unsupported label style %s specified in label configuration'
                        % label_style)
                    raise Exception

            # now iterate through the files, one at a time, constructing the labels for them
            num_files = len(file_id_list)
            logger.info('the label styles required are %s' %
                        label_composer.label_styles)

            for i in xrange(num_files):
                logger.info('making input label features for %4d of %4d' %
                            (i + 1, num_files))

                # iterate through the required label styles and open each corresponding label file

                # a dictionary of file descriptors, pointing at the required files
                required_labels = {}

                for label_style, label_style_required in label_composer.label_styles.iteritems(
                ):

                    # the files will be a parallel set of files for a single utterance
                    # e.g., the XML tree and an HTS label file
                    if label_style_required:
                        required_labels[label_style] = open(
                            in_label_align_file_list[label_style][i], 'r')
                        logger.debug(' opening label file %s' %
                                     in_label_align_file_list[label_style][i])

                logger.debug('label styles with open files: %s' %
                             required_labels)
                label_composer.make_labels(
                    required_labels,
                    out_file_name=binary_label_file_list[i],
                    fill_missing_values=cfg.fill_missing_values,
                    iterate_over_frames=cfg.iterate_over_frames)

                # now close all opened files
                for fd in required_labels.itervalues():
                    fd.close()

        # silence removal
        if cfg.remove_silence_using_binary_labels:
            silence_feature = 0  ## use first feature in label -- hardcoded for now
            logger.info(
                'Silence removal from label using silence feature: %s' %
                (label_composer.configuration.labels[silence_feature]))
            logger.info('Silence will be removed from CMP files in same way')
            ## Binary labels have 2 roles: both the thing trimmed and the instructions for trimming:
            trim_silence(binary_label_file_list, nn_label_file_list, lab_dim, \
                 binary_label_file_list, lab_dim, silence_feature)
        else:
            logger.info('No silence removal done')
            # start from the labels we have just produced, not trimmed versions
            nn_label_file_list = binary_label_file_list

        min_max_normaliser = MinMaxNormalisation(feature_dimension=lab_dim,
                                                 min_value=0.01,
                                                 max_value=0.99)
        ###use only training data to find min-max information, then apply on the whole dataset
        min_max_normaliser.find_min_max_values(
            nn_label_file_list[0:cfg.train_file_number])
        min_max_normaliser.normalise_data(nn_label_file_list,
                                          nn_label_norm_file_list)

    if min_max_normaliser != None and not cfg.GenTestList:
        ### save label normalisation information for unseen testing labels
        label_min_vector = min_max_normaliser.min_vector
        label_max_vector = min_max_normaliser.max_vector
        label_norm_info = numpy.concatenate(
            (label_min_vector, label_max_vector), axis=0)

        label_norm_info = numpy.array(label_norm_info, 'float32')
        fid = open(label_norm_file, 'wb')
        label_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' %
                    (label_min_vector.size, label_norm_file))

    ### make output duration data
    if cfg.MAKEDUR:
        logger.info('creating duration (output) features')
        label_type = cfg.label_type
        feature_type = cfg.dur_feature_type
        label_normaliser.prepare_dur_data(in_label_align_file_list,
                                          dur_file_list, label_type,
                                          feature_type)

    ### make output acoustic data
    if cfg.MAKECMP:
        logger.info('creating acoustic (output) features')
        delta_win = cfg.delta_win  #[-0.5, 0.0, 0.5]
        acc_win = cfg.acc_win  #[1.0, -2.0, 1.0]

        acoustic_worker = AcousticComposition(delta_win=delta_win,
                                              acc_win=acc_win)
        if 'dur' in cfg.in_dir_dict.keys() and cfg.AcousticModel:
            acoustic_worker.make_equal_frames(dur_file_list, lf0_file_list,
                                              cfg.in_dimension_dict)
        acoustic_worker.prepare_nn_data(in_file_list_dict, nn_cmp_file_list,
                                        cfg.in_dimension_dict,
                                        cfg.out_dimension_dict)

        if cfg.remove_silence_using_binary_labels:
            ## do this to get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim = label_composer.compute_label_dimension()

            silence_feature = 0  ## use first feature in label -- hardcoded for now
            logger.info('Silence removal from CMP using binary label file')

            ## overwrite the untrimmed audio with the trimmed version:
            trim_silence(nn_cmp_file_list, nn_cmp_file_list, cfg.cmp_dim,
                         binary_label_file_list, lab_dim, silence_feature)

        else:  ## back off to previous method using HTS labels:
            remover = SilenceRemover(
                n_cmp=cfg.cmp_dim,
                silence_pattern=cfg.silence_pattern,
                label_type=cfg.label_type,
                remove_frame_features=cfg.add_frame_features,
                subphone_feats=cfg.subphone_feats)
            remover.remove_silence(
                nn_cmp_file_list[0:cfg.train_file_number +
                                 cfg.valid_file_number],
                in_label_align_file_list[0:cfg.train_file_number +
                                         cfg.valid_file_number],
                nn_cmp_file_list[0:cfg.train_file_number +
                                 cfg.valid_file_number])  # save to itself

    ### save acoustic normalisation information for normalising the features back
    var_dir = os.path.join(data_dir, 'var')
    if not os.path.exists(var_dir):
        os.makedirs(var_dir)

    var_file_dict = {}
    for feature_name in cfg.out_dimension_dict.keys():
        var_file_dict[feature_name] = os.path.join(
            var_dir,
            feature_name + '_' + str(cfg.out_dimension_dict[feature_name]))

    ### normalise output acoustic data
    if cfg.NORMCMP:
        logger.info('normalising acoustic (output) features using method %s' %
                    cfg.output_feature_normalisation)
        cmp_norm_info = None
        if cfg.output_feature_normalisation == 'MVN':
            normaliser = MeanVarianceNorm(feature_dimension=cfg.cmp_dim)
            ###calculate mean and std vectors on the training data, and apply on the whole dataset
            global_mean_vector = normaliser.compute_mean(
                nn_cmp_file_list[0:cfg.train_file_number], 0, cfg.cmp_dim)
            global_std_vector = normaliser.compute_std(
                nn_cmp_file_list[0:cfg.train_file_number], global_mean_vector,
                0, cfg.cmp_dim)

            normaliser.feature_normalisation(
                nn_cmp_file_list[0:cfg.train_file_number +
                                 cfg.valid_file_number],
                nn_cmp_norm_file_list[0:cfg.train_file_number +
                                      cfg.valid_file_number])
            cmp_norm_info = numpy.concatenate(
                (global_mean_vector, global_std_vector), axis=0)

        elif cfg.output_feature_normalisation == 'MINMAX':
            min_max_normaliser = MinMaxNormalisation(
                feature_dimension=cfg.cmp_dim)
            global_mean_vector = min_max_normaliser.compute_mean(
                nn_cmp_file_list[0:cfg.train_file_number])
            global_std_vector = min_max_normaliser.compute_std(
                nn_cmp_file_list[0:cfg.train_file_number], global_mean_vector)

            min_max_normaliser = MinMaxNormalisation(
                feature_dimension=cfg.cmp_dim, min_value=0.01, max_value=0.99)
            min_max_normaliser.find_min_max_values(
                nn_cmp_file_list[0:cfg.train_file_number])
            min_max_normaliser.normalise_data(nn_cmp_file_list,
                                              nn_cmp_norm_file_list)

            cmp_min_vector = min_max_normaliser.min_vector
            cmp_max_vector = min_max_normaliser.max_vector
            cmp_norm_info = numpy.concatenate((cmp_min_vector, cmp_max_vector),
                                              axis=0)

        else:
            logger.critical('Normalisation type %s is not supported!\n' %
                            (cfg.output_feature_normalisation))
            raise

        cmp_norm_info = numpy.array(cmp_norm_info, 'float32')
        fid = open(norm_info_file, 'wb')
        cmp_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' %
                    (cfg.output_feature_normalisation, norm_info_file))

        feature_index = 0
        for feature_name in cfg.out_dimension_dict.keys():
            feature_std_vector = numpy.array(
                global_std_vector[:, feature_index:feature_index +
                                  cfg.out_dimension_dict[feature_name]],
                'float32')

            fid = open(var_file_dict[feature_name], 'w')
            feature_var_vector = feature_std_vector**2
            feature_var_vector.tofile(fid)
            fid.close()

            logger.info('saved %s variance vector to %s' %
                        (feature_name, var_file_dict[feature_name]))

            feature_index += cfg.out_dimension_dict[feature_name]
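        ## NOTE (background, hedged): each var file written above holds the per-dimension
        ## variance (std**2) of one output stream over the training data; these files form
        ## the var_file_dict later handed to ParameterGeneration.acoustic_decomposition,
        ## where the variances are assumed to drive MLPG smoothing and variance scaling.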

    train_x_file_list = nn_label_norm_file_list[0:cfg.train_file_number]
    train_y_file_list = nn_cmp_norm_file_list[0:cfg.train_file_number]
    valid_x_file_list = nn_label_norm_file_list[cfg.train_file_number:cfg.
                                                train_file_number +
                                                cfg.valid_file_number]
    valid_y_file_list = nn_cmp_norm_file_list[cfg.train_file_number:cfg.
                                              train_file_number +
                                              cfg.valid_file_number]
    test_x_file_list = nn_label_norm_file_list[
        cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number +
        cfg.valid_file_number + cfg.test_file_number]
    test_y_file_list = nn_cmp_norm_file_list[
        cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number +
        cfg.valid_file_number + cfg.test_file_number]

    ### generate parameters from DNN
    temp_dir_name = '%s_%s_%d_%d_%d_%d_%d_%d_%d' \
        %(cfg.combined_model_name, cfg.combined_feature_name, int(cfg.do_post_filtering), \
          cfg.train_file_number, lab_dim, cfg.cmp_dim, \
          len(hidden_layer_size), hidden_layer_size[0], hidden_layer_size[-1])
    gen_dir = os.path.join(gen_dir, temp_dir_name)

    gen_file_id_list = file_id_list[cfg.
                                    train_file_number:cfg.train_file_number +
                                    cfg.valid_file_number +
                                    cfg.test_file_number]
    test_x_file_list = nn_label_norm_file_list[cfg.train_file_number:cfg.
                                               train_file_number +
                                               cfg.valid_file_number +
                                               cfg.test_file_number]

    if cfg.GenTestList:
        gen_file_id_list = test_id_list
        test_x_file_list = nn_label_norm_file_list
        ### comment out the line below if you don't want the files in a separate folder
        gen_dir = cfg.test_synth_dir

    if cfg.DNNGEN:
        logger.info('generating from DNN')

        try:
            os.makedirs(gen_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create generation directory %s' %
                                gen_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        gen_file_list = prepare_file_path_list(gen_file_id_list, gen_dir,
                                               cfg.cmp_ext)
        #dnn_generation(test_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)
        dnn_generation_yuhao(test_x_file_list, dnn_model, lab_dim, cfg.cmp_dim,
                             gen_file_list)

        logger.debug('denormalising generated output using method %s' %
                     cfg.output_feature_normalisation)

        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        cmp_min_vector = cmp_min_max[0, ]
        cmp_max_vector = cmp_min_max[1, ]

        if cfg.output_feature_normalisation == 'MVN':
            denormaliser = MeanVarianceNorm(feature_dimension=cfg.cmp_dim)
            denormaliser.feature_denormalisation(gen_file_list, gen_file_list,
                                                 cmp_min_vector,
                                                 cmp_max_vector)

        elif cfg.output_feature_normalisation == 'MINMAX':
            denormaliser = MinMaxNormalisation(cfg.cmp_dim,
                                               min_value=0.01,
                                               max_value=0.99,
                                               min_vector=cmp_min_vector,
                                               max_vector=cmp_max_vector)
            denormaliser.denormalise_data(gen_file_list, gen_file_list)
        else:
            logger.critical('denormalising method %s is not supported!\n' %
                            (cfg.output_feature_normalisation))
            raise

        if cfg.AcousticModel:
            ##perform MLPG to smooth parameter trajectory
            ## if lf0 is included, the output features must have vuv.
            generator = ParameterGeneration(
                gen_wav_features=cfg.gen_wav_features,
                enforce_silence=cfg.enforce_silence)
            generator.acoustic_decomposition(gen_file_list,
                                             cfg.cmp_dim,
                                             cfg.out_dimension_dict,
                                             cfg.file_extension_dict,
                                             var_file_dict,
                                             do_MLPG=cfg.do_MLPG,
                                             cfg=cfg)
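            ## NOTE (background, hedged): MLPG takes the generated static+delta+delta-delta
            ## means together with the per-stream variances in var_file_dict and solves for
            ## the static parameter trajectory with maximum likelihood, which smooths the
            ## frame-to-frame output before waveform generation.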

        if cfg.DurationModel:
            ### Perform duration normalization(min. state dur set to 1) ###
            gen_dur_list = prepare_file_path_list(gen_file_id_list, gen_dir,
                                                  cfg.dur_ext)
            gen_label_list = prepare_file_path_list(gen_file_id_list, gen_dir,
                                                    cfg.lab_ext)
            in_gen_label_align_file_list = prepare_file_path_list(
                gen_file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)

            generator = ParameterGeneration(
                gen_wav_features=cfg.gen_wav_features)
            generator.duration_decomposition(gen_file_list, cfg.cmp_dim,
                                             cfg.out_dimension_dict,
                                             cfg.file_extension_dict)

            label_modifier = HTSLabelModification(
                silence_pattern=cfg.silence_pattern, label_type=cfg.label_type)
            label_modifier.modify_duration_labels(in_gen_label_align_file_list,
                                                  gen_dur_list, gen_label_list)

    ### generate wav
    if cfg.GENWAV:
        logger.info('reconstructing waveform(s)')
        generate_wav(gen_dir, gen_file_id_list, cfg)  # generated speech
    #    	generate_wav(nn_cmp_dir, gen_file_id_list, cfg)  # reference copy synthesis speech

    ### setting back to original conditions before calculating objective scores ###
    if cfg.GenTestList:
        in_label_align_file_list = prepare_file_path_list(
            file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)
        binary_label_file_list = prepare_file_path_list(
            file_id_list, binary_label_dir, cfg.lab_ext)
        gen_file_id_list = file_id_list[cfg.train_file_number:cfg.
                                        train_file_number +
                                        cfg.valid_file_number +
                                        cfg.test_file_number]

    ### evaluation: RMSE and CORR for duration
    if cfg.CALMCD and cfg.DurationModel:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(data_dir, 'ref_data')

        ref_dur_list = prepare_file_path_list(gen_file_id_list, ref_data_dir,
                                              cfg.dur_ext)

        in_gen_label_align_file_list = in_label_align_file_list[
            cfg.train_file_number:cfg.train_file_number +
            cfg.valid_file_number + cfg.test_file_number]
        calculator = IndividualDistortionComp()

        valid_file_id_list = file_id_list[cfg.train_file_number:cfg.
                                          train_file_number +
                                          cfg.valid_file_number]
        test_file_id_list = file_id_list[cfg.train_file_number +
                                         cfg.valid_file_number:cfg.
                                         train_file_number +
                                         cfg.valid_file_number +
                                         cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            untrimmed_reference_data = in_file_list_dict[
                'dur'][cfg.train_file_number:cfg.train_file_number +
                       cfg.valid_file_number + cfg.test_file_number]
            trim_silence(untrimmed_reference_data, ref_dur_list, cfg.dur_dim, \
                 untrimmed_test_labels, lab_dim, silence_feature)
        else:
            remover = SilenceRemover(
                n_cmp=cfg.dur_dim,
                silence_pattern=cfg.silence_pattern,
                label_type=cfg.label_type,
                remove_frame_features=cfg.add_frame_features)
            remover.remove_silence(
                in_file_list_dict['dur']
                [cfg.train_file_number:cfg.train_file_number +
                 cfg.valid_file_number + cfg.test_file_number],
                in_gen_label_align_file_list, ref_dur_list)

        valid_dur_rmse, valid_dur_corr = calculator.compute_distortion(
            valid_file_id_list, ref_data_dir, gen_dir, cfg.dur_ext,
            cfg.dur_dim)
        test_dur_rmse, test_dur_corr = calculator.compute_distortion(
            test_file_id_list, ref_data_dir, gen_dir, cfg.dur_ext, cfg.dur_dim)

        logger.info('Develop: DNN -- RMSE: %.3f frames/phoneme; CORR: %.3f; ' \
           %(valid_dur_rmse, valid_dur_corr))
        logger.info('Test: DNN -- RMSE: %.3f frames/phoneme; CORR: %.3f; ' \
           %(test_dur_rmse, test_dur_corr))

    ### evaluation: calculate distortion
    if cfg.CALMCD and cfg.AcousticModel:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(data_dir, 'ref_data')

        ref_mgc_list = prepare_file_path_list(gen_file_id_list, ref_data_dir,
                                              cfg.mgc_ext)
        ref_bap_list = prepare_file_path_list(gen_file_id_list, ref_data_dir,
                                              cfg.bap_ext)
        ref_lf0_list = prepare_file_path_list(gen_file_id_list, ref_data_dir,
                                              cfg.lf0_ext)

        in_gen_label_align_file_list = in_label_align_file_list[
            cfg.train_file_number:cfg.train_file_number +
            cfg.valid_file_number + cfg.test_file_number]
        calculator = IndividualDistortionComp()

        spectral_distortion = 0.0
        bap_mse = 0.0
        f0_mse = 0.0
        vuv_error = 0.0

        valid_file_id_list = file_id_list[cfg.train_file_number:cfg.
                                          train_file_number +
                                          cfg.valid_file_number]
        test_file_id_list = file_id_list[cfg.train_file_number +
                                         cfg.valid_file_number:cfg.
                                         train_file_number +
                                         cfg.valid_file_number +
                                         cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            ## get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim = label_composer.compute_label_dimension()

            ## use first feature in label -- hardcoded for now
            silence_feature = 0

            ## Use these to trim silence:
            untrimmed_test_labels = binary_label_file_list[
                cfg.train_file_number:cfg.train_file_number +
                cfg.valid_file_number + cfg.test_file_number]

        if cfg.in_dimension_dict.has_key('mgc'):
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict[
                    'mgc'][cfg.train_file_number:cfg.train_file_number +
                           cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_mgc_list, cfg.mgc_dim, \
                     untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.mgc_dim,
                                         silence_pattern=cfg.silence_pattern,
                                         label_type=cfg.label_type)
                remover.remove_silence(
                    in_file_list_dict['mgc']
                    [cfg.train_file_number:cfg.train_file_number +
                     cfg.valid_file_number + cfg.test_file_number],
                    in_gen_label_align_file_list, ref_mgc_list)
            valid_spectral_distortion = calculator.compute_distortion(
                valid_file_id_list, ref_data_dir, gen_dir, cfg.mgc_ext,
                cfg.mgc_dim)
            test_spectral_distortion = calculator.compute_distortion(
                test_file_id_list, ref_data_dir, gen_dir, cfg.mgc_ext,
                cfg.mgc_dim)
            valid_spectral_distortion *= (10 / numpy.log(10)) * numpy.sqrt(
                2.0)  ##MCD
            test_spectral_distortion *= (10 / numpy.log(10)) * numpy.sqrt(
                2.0)  ##MCD
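            ## NOTE (hedged explanation): the (10 / ln 10) * sqrt(2) factor converts the raw
            ## Euclidean cepstral distance returned by compute_distortion into mel-cepstral
            ## distortion in dB, i.e. MCD = (10 / ln 10) * sqrt( 2 * sum_d (c_d - c'_d)^2 ).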

        if cfg.in_dimension_dict.has_key('bap'):
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict[
                    'bap'][cfg.train_file_number:cfg.train_file_number +
                           cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_bap_list, cfg.bap_dim, \
                     untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.bap_dim,
                                         silence_pattern=cfg.silence_pattern,
                                         label_type=cfg.label_type)
                remover.remove_silence(
                    in_file_list_dict['bap']
                    [cfg.train_file_number:cfg.train_file_number +
                     cfg.valid_file_number + cfg.test_file_number],
                    in_gen_label_align_file_list, ref_bap_list)
            valid_bap_mse = calculator.compute_distortion(
                valid_file_id_list, ref_data_dir, gen_dir, cfg.bap_ext,
                cfg.bap_dim)
            test_bap_mse = calculator.compute_distortion(
                test_file_id_list, ref_data_dir, gen_dir, cfg.bap_ext,
                cfg.bap_dim)
            valid_bap_mse = valid_bap_mse / 10.0  ##Cassia's bap is computed from 10*log|S(w)|. if use HTS/SPTK style, do the same as MGC
            test_bap_mse = test_bap_mse / 10.0  ##Cassia's bap is computed from 10*log|S(w)|. if use HTS/SPTK style, do the same as MGC

        if cfg.in_dimension_dict.has_key('lf0'):
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict[
                    'lf0'][cfg.train_file_number:cfg.train_file_number +
                           cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_lf0_list, cfg.lf0_dim, \
                     untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.lf0_dim,
                                         silence_pattern=cfg.silence_pattern,
                                         label_type=cfg.label_type)
                remover.remove_silence(
                    in_file_list_dict['lf0']
                    [cfg.train_file_number:cfg.train_file_number +
                     cfg.valid_file_number + cfg.test_file_number],
                    in_gen_label_align_file_list, ref_lf0_list)
            valid_f0_mse, valid_f0_corr, valid_vuv_error = calculator.compute_distortion(
                valid_file_id_list, ref_data_dir, gen_dir, cfg.lf0_ext,
                cfg.lf0_dim)
            test_f0_mse, test_f0_corr, test_vuv_error = calculator.compute_distortion(
                test_file_id_list, ref_data_dir, gen_dir, cfg.lf0_ext,
                cfg.lf0_dim)

        logger.info('Develop: DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
           %(valid_spectral_distortion, valid_bap_mse, valid_f0_mse, valid_f0_corr, valid_vuv_error*100.))
        logger.info('Test   : DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
           %(test_spectral_distortion , test_bap_mse , test_f0_mse , test_f0_corr, test_vuv_error*100.))
Beispiel #12
0
def main_function(cfg, in_dir, out_dir):    
    
    
    # get a logger for this main function
    logger = logging.getLogger("main")
    
    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")
    
    
    #### parameter setting########
    hidden_layers_sizes = cfg.hyper_params['hidden_layer_size']
    
    file_id_list = []
    
    if cfg.label_style == 'HTS':
        ext = '.lab'
    else:
        ext = '.utt'
        
    synth_utts = glob.glob(in_dir + '/*' + ext)
    for fname in synth_utts:
        junk,name = os.path.split(fname)
        file_id_list.append(name.replace(ext,''))

    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    ###total file number including training, development, and testing
    #total_file_number = len(file_id_list)
    
    data_dir = cfg.data_dir

    #nn_cmp_dir       = os.path.join(data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    #nn_cmp_norm_dir   = os.path.join(data_dir, 'nn_norm'  + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    
    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    gen_dir   = os.path.join(out_dir, 'gen')    

    #in_file_list_dict = {}

    #for feature_name in cfg.in_dir_dict.keys():
    #    in_file_list_dict[feature_name] = prepare_file_path_list(file_id_list, cfg.in_dir_dict[feature_name], cfg.file_extension_dict[feature_name], False)

    #nn_cmp_file_list         = prepare_file_path_list(file_id_list, nn_cmp_dir, cfg.cmp_ext)
    #nn_cmp_norm_file_list    = prepare_file_path_list(file_id_list, nn_cmp_norm_dir, cfg.cmp_ext)

    ###normalisation information
    norm_info_file = os.path.join(data_dir, 'norm_info' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')
    
    ### normalise input full context label

    # currently supporting two different forms of linguistic features
    # later, we should generalise this 

    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension
        logger.info('Input label dimension is %d' % lab_dim)
        suffix=str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix='composed'

    # the dimension number appended to the directory name could be removed
    binary_label_dir      = os.path.join(out_dir, 'lab_bin')
    nn_label_norm_dir     = os.path.join(out_dir, 'lab_bin_norm')


    in_label_align_file_list = prepare_file_path_list(file_id_list, in_dir, cfg.lab_ext)
    binary_label_file_list   = prepare_file_path_list(file_id_list, binary_label_dir, cfg.lab_ext)
    nn_label_norm_file_list  = prepare_file_path_list(file_id_list, nn_label_norm_dir, cfg.lab_ext)

    ## need this to find normalisation info:
    if cfg.process_labels_in_work_dir:
        label_data_dir = cfg.work_dir
    else:
        label_data_dir = data_dir
    
    min_max_normaliser = None
    label_norm_file = 'label_norm_%s.dat' %(cfg.label_style)
    label_norm_file = os.path.join(label_data_dir, label_norm_file)
    
    if cfg.label_style == 'HTS':
        # simple HTS labels 
        logger.info('preparing label data (input) using standard HTS style labels')
        label_normaliser.perform_normalisation(in_label_align_file_list, binary_label_file_list) 

    else:

        logger.info('preparing label data (input) using "composed" style labels')
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)

        logger.info('Loaded label configuration')
        # logger.info('%s' % label_composer.configuration.labels )

        lab_dim=label_composer.compute_label_dimension()
        logger.info('label dimension will be %d' % lab_dim)
    
        if cfg.precompile_xpaths:
            label_composer.precompile_xpaths()
    
        # there is now a set of parallel input label files (e.g., one set of HTS labels and another set of Ossian trees)
        # create all the lists of these, ready to pass to the label composer

        in_label_align_file_list = {}
        for label_style, label_style_required in label_composer.label_styles.iteritems():
            if label_style_required:
                logger.info('labels of style %s are required - constructing file paths for them' % label_style)
                if label_style == 'xpath':
                    in_label_align_file_list['xpath'] = prepare_file_path_list(file_id_list, in_dir, cfg.utt_ext, False)
                elif label_style == 'hts':
                    logger.critical('script not tested with HTS labels')        
                else:
                    logger.critical('unsupported label style %s specified in label configuration' % label_style)
                    raise Exception
    
            # now iterate through the files, one at a time, constructing the labels for them 
            num_files=len(file_id_list)
            logger.info('the label styles required are %s' % label_composer.label_styles)
        
            for i in xrange(num_files):
                logger.info('making input label features for %4d of %4d' % (i+1,num_files))

                # iterate through the required label styles and open each corresponding label file

                # a dictionary of file descriptors, pointing at the required files
                required_labels={}
            
                for label_style, label_style_required in label_composer.label_styles.iteritems():
                
                    # the files will be a parallel set of files for a single utterance
                    # e.g., the XML tree and an HTS label file
                    if label_style_required:
                        required_labels[label_style] = open(in_label_align_file_list[label_style][i] , 'r')
                        logger.debug(' opening label file %s' % in_label_align_file_list[label_style][i])

                logger.debug('label styles with open files: %s' % required_labels)
                label_composer.make_labels(required_labels,out_file_name=binary_label_file_list[i],fill_missing_values=cfg.fill_missing_values,iterate_over_frames=cfg.iterate_over_frames)
                
                # now close all opened files
                for fd in required_labels.itervalues():
                    fd.close()
    
    
    # no silence removal for synthesis ...

    ## minmax norm:
    min_max_normaliser = MinMaxNormalisation(feature_dimension = lab_dim, min_value = 0.01, max_value = 0.99)

    # reload stored minmax values: (TODO -- move reading and writing into MinMaxNormalisation class)
    fid = open(label_norm_file, 'rb')
    
    ## This doesn't work -- precision is lost -- reads in as float64
    #label_norm_info = numpy.fromfile(fid)  ## label_norm_info = numpy.array(label_norm_info, 'float32')

    ## use struct to enforce float32:
    nbytes = os.stat(label_norm_file)[6]  # length in bytes
    data = fid.read(nbytes)               # = read until bytes run out 
    fid.close()
    m = nbytes / 4  ## number of 32-bit floats
    format = str(m)+"f"
    label_norm_info = struct.unpack(format, data)
    label_norm_info = numpy.array(label_norm_info)

    min_max_normaliser.min_vector = label_norm_info[:m/2]
    min_max_normaliser.max_vector = label_norm_info[m/2:]         

    ###  apply precomputed min-max to the whole dataset
    min_max_normaliser.normalise_data(binary_label_file_list, nn_label_norm_file_list)



    ### make output acoustic data
#    if cfg.MAKECMP:
   
    ### retrieve acoustic normalisation information for normalising the features back
    var_dir   = os.path.join(data_dir, 'var')
    var_file_dict = {}
    for feature_name in cfg.out_dimension_dict.keys():
        var_file_dict[feature_name] = os.path.join(var_dir, feature_name + '_' + str(cfg.out_dimension_dict[feature_name]))
        
        
    ### normalise output acoustic data
#    if cfg.NORMCMP:

    combined_model_arch = str(len(hidden_layers_sizes))
    for hid_size in hidden_layers_sizes:
        combined_model_arch += '_' + str(hid_size)
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.model' \
                      %(model_dir, cfg.model_type, cfg.combined_feature_name, int(cfg.multistream_switch), 
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number)
 
    ### DNN model training
#    if cfg.TRAINDNN:

    ##if cfg.DNNGEN:
    logger.info('generating from DNN')

    try:
        os.makedirs(gen_dir)
    except OSError as e:
        if e.errno == errno.EEXIST:
            # not an error - just means directory already exists
            pass
        else:
            logger.critical('Failed to create generation directory %s' % gen_dir)
            logger.critical(' OS error was: %s' % e.strerror)
            raise

    gen_file_list = prepare_file_path_list(file_id_list, gen_dir, cfg.cmp_ext)

    dnn_generation(nn_label_norm_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)

    logger.debug('denormalising generated output using method %s' % cfg.output_feature_normalisation)

    fid = open(norm_info_file, 'rb')
    cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
    fid.close()
    cmp_min_max = cmp_min_max.reshape((2, -1))
    cmp_min_vector = cmp_min_max[0, ] 
    cmp_max_vector = cmp_min_max[1, ]

    if cfg.output_feature_normalisation == 'MVN':
        denormaliser = MeanVarianceNorm(feature_dimension = cfg.cmp_dim)
        denormaliser.feature_denormalisation(gen_file_list, gen_file_list, cmp_min_vector, cmp_max_vector)
    
    elif cfg.output_feature_normalisation == 'MINMAX':
        denormaliser = MinMaxNormalisation(cfg.cmp_dim, min_value = 0.01, max_value = 0.99, min_vector = cmp_min_vector, max_vector = cmp_max_vector)
        denormaliser.denormalise_data(gen_file_list, gen_file_list)
    else:
        logger.critical('denormalising method %s is not supported!\n' %(cfg.output_feature_normalisation))
        raise

    ##perform MLPG to smooth parameter trajectory
    ## if lf0 is included, the output features must have vuv. 
    generator = ParameterGeneration(gen_wav_features = cfg.gen_wav_features)
    generator.acoustic_decomposition(gen_file_list, cfg.cmp_dim, cfg.out_dimension_dict, cfg.file_extension_dict, var_file_dict)    

    
    logger.info('Simple variance expansion')
    test_var_scaling=False
    scaled_dir = gen_dir + '_scaled'
    if test_var_scaling:
        file_id_list = simple_scale_variance_CONTINUUM(gen_dir, scaled_dir, var_file_dict, cfg.out_dimension_dict, file_id_list)
    else:
        simple_scale_variance(gen_dir, scaled_dir, var_file_dict, cfg.out_dimension_dict, file_id_list, gv_weight=1.0)  ## gv_weight hard coded here!

    ### generate wav ----
    #if cfg.GENWAV:
    logger.info('reconstructing waveform(s)')
    #generate_wav_glottHMM(scaled_dir, file_id_list)   
    generate_wav(scaled_dir, file_id_list, cfg)   
    def __init__(self, cfg):

        # model type (duration or acoustic)
        self.model_output_type = cfg.model_output_type

        # ----------------------------------------------------
        # ------------------- Input-Output -------------------
        # ----------------------------------------------------

        self.label_type = cfg.label_type
        self.cmp_ext = cfg.cmp_ext
        inp_file_ext = cfg.inp_file_ext
        out_file_ext = cfg.out_file_ext
        self.label_normaliser = HTSLabelNormalisation(
            question_file_name=cfg.question_file_name,
            add_frame_features=cfg.add_frame_features ==
            'True',  # must be bool
            subphone_feats=cfg.subphone_feats)

        # Create stream info file (it stores the dimension dictionaries' data for use at synthesis time)
        in_streams = sorted(cfg.in_dimension_dict.keys())
        indims = [str(cfg.in_dimension_dict[s]) for s in in_streams]
        self.out_streams = sorted(cfg.out_dimension_dict.keys())
        self.outdims = [
            str(cfg.out_dimension_dict[s]) for s in self.out_streams
        ]

        with open(os.path.join(cfg.model_dir, 'stream_info.txt'), 'w') as f:
            f.write(' '.join(in_streams) + '\n')
            f.write(' '.join(indims) + '\n')
            f.write(' '.join(self.out_streams) + '\n')
            f.write(' '.join(self.outdims) + '\n')
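        # Illustrative stream_info.txt contents (stream names and dimensions are
        # assumptions, not taken from any particular config):
        #
        #     bap lf0 mgc
        #     1 1 60
        #     bap lf0 mgc vuv
        #     3 3 180 1
        #
        # i.e. input stream names, input dims, output stream names, output dims.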

        # Input output dimensions
        self.inp_dim = cfg.inp_dim
        if self.model_output_type == 'duration':
            self.out_dim = cfg.dur_dim
        elif self.model_output_type == 'acoustic':
            self.out_dim = cfg.cmp_dim

        # Data normalization method
        self.inp_norm = cfg.inp_norm
        self.out_norm = cfg.out_norm

        # Norm stats files
        self.inp_stats_file = cfg.inp_stats_file
        self.out_stats_file_list = cfg.out_stats_file_list
        self.speaker_id = cfg.speaker_id
        self.shared_layer_flag = cfg.shared_layer_flag

        self.inp_scaler = None
        self.out_scaler = None

        # ---------------------------------------------------
        # ------------------- Directories -------------------
        # ---------------------------------------------------

        self.plot_dir = os.path.join(cfg.plot_dir, cfg.nnets_file_name)
        # Select data directories based on model input-output type
        if self.model_output_type == 'duration':

            # Input
            self.inp_feat_dir = cfg.inp_feat_dir_dur
            self.bin_lab_dir = cfg.bin_lab_dir_dur
            self.bin_lab_dir_nosilence = cfg.bin_lab_dir_dur_nosilence
            self.bin_lab_dir_nosilence_norm = cfg.bin_lab_dir_dur_nosilence_norm

            # Output
            self.out_feat_dir = cfg.out_feat_dir_dur
            self.out_feat_dir_norm = cfg.out_feat_dir_dur_norm

        elif self.model_output_type == 'acoustic':

            # Input
            self.inp_feat_dir = cfg.inp_feat_dir_cmp
            self.bin_lab_dir = cfg.bin_lab_dir_cmp
            self.bin_lab_dir_nosilence = cfg.bin_lab_dir_cmp_nosilence
            self.bin_lab_dir_nosilence_norm = cfg.bin_lab_dir_cmp_nosilence_norm

            # Output
            self.out_feat_dir = cfg.nn_cmp_dir
            self.out_feat_dir_norm = cfg.nn_cmp_norm_dir

        else:
            print("invalid model output type")
            raise

        # --------------------------------------------------------
        # ------------------- Model Parameters -------------------
        # --------------------------------------------------------

        self.sequential_training = cfg.sequential_training
        self.stateful = cfg.stateful

        self.json_model_file = cfg.json_model_file
        self.h5_model_file = cfg.h5_model_file
        self.model_params_file = cfg.model_params_file

        # -----------------------------------------------------------
        # ------------------- Generate file lists -------------------
        # -----------------------------------------------------------

        train_file_number = cfg.train_file_number
        valid_file_number = cfg.valid_file_number
        test_file_number = cfg.test_file_number

        # List of file ids
        self.file_id_scp = cfg.file_id_scp

        # Create train, valid and test file lists
        self.file_id_list = data_utils.read_file_list(self.file_id_scp)
        if cfg.shuffle_data:
            random.seed(1)
            random.shuffle(self.file_id_list
                           )  # Shuffle to get random valid and test utterances
        self.train_id_list = self.file_id_list[0:train_file_number]
        self.valid_id_list = self.file_id_list[
            train_file_number:train_file_number + valid_file_number]
        self.test_id_list = self.file_id_list[
            train_file_number + valid_file_number:train_file_number +
            valid_file_number + test_file_number]

        # Intermediate file lists
        self.inp_feat_file_list = data_utils.prepare_file_path_list(
            self.file_id_list, self.inp_feat_dir, inp_file_ext)
        self.bin_lab_file_list = data_utils.prepare_file_path_list(
            self.file_id_list, self.bin_lab_dir, inp_file_ext)
        self.bin_lab_nosilence_file_list = data_utils.prepare_file_path_list(
            self.file_id_list, self.bin_lab_dir_nosilence, inp_file_ext)

        # Train, test, validation file lists
        self.inp_train_file_list = data_utils.prepare_file_path_list(
            self.train_id_list, self.bin_lab_dir_nosilence, inp_file_ext)
        self.out_train_file_list = data_utils.prepare_file_path_list(
            self.train_id_list, self.out_feat_dir, out_file_ext)
        self.inp_valid_file_list = data_utils.prepare_file_path_list(
            self.valid_id_list, self.bin_lab_dir_nosilence, inp_file_ext)
        self.out_valid_file_list = data_utils.prepare_file_path_list(
            self.valid_id_list, self.out_feat_dir, out_file_ext)
        self.inp_test_file_list = data_utils.prepare_file_path_list(
            self.test_id_list, self.bin_lab_dir_nosilence, inp_file_ext)
        self.out_test_file_list = data_utils.prepare_file_path_list(
            self.test_id_list, self.out_feat_dir, out_file_ext)

        # For cmp files generated as targets (applies to acoustic model only)
        self.nn_cmp_file_list = []
        self.nn_cmp_norm_file_list = []

        self.in_file_list_dict = {}
        for feature_name in list(cfg.in_dir_dict.keys()):
            self.in_file_list_dict[
                feature_name] = data_utils.prepare_file_path_list(
                    self.file_id_list, cfg.in_dir_dict[feature_name],
                    cfg.file_extension_dict[feature_name], False)

        # self.gen_test_file_list = data_utils.prepare_file_path_list(self.test_id_list, pred_feat_dir, out_file_ext)

        # if self.GenTestList:
        #     test_id_list = data_utils.read_file_list(test_id_scp)
        #     self.inp_test_file_list = data_utils.prepare_file_path_list(test_id_list, inp_feat_dir, inp_file_ext)
        #     self.gen_test_file_list = data_utils.prepare_file_path_list(test_id_list, pred_feat_dir, out_file_ext)

        # ------------------------------------------------------
        # ------------------- Main Processes -------------------
        # ------------------------------------------------------

        self.MAKELAB = cfg.MAKELAB  # make binary labels (required step before normalization and training)
        self.MAKECMP = cfg.MAKECMP
        self.NORMDATA = cfg.NORMDATA  # normalizes input and output data, creates data scaling objects
        self.TRAINDNN = cfg.TRAINDNN  # train the Keras model
        self.TESTDNN = cfg.TESTDNN  # test the Keras model

        # ----------------------------------------------------------
        # ------------------- Define Keras Model -------------------
        # ----------------------------------------------------------

        self.batch_size = cfg.batch_size
        model_params = {
            'inp_dim': self.inp_dim,
            'hidden_layer_size': cfg.hidden_layer_size,
            'shared_layer_flag': cfg.shared_layer_flag,
            'speaker_id': cfg.speaker_id,
            'out_dim': self.out_dim,
            'hidden_layer_type': cfg.hidden_layer_type,
            'output_layer_type': cfg.output_layer_type,
            'dropout_rate': cfg.dropout_rate,
            'loss_function': cfg.loss_function,
            'optimizer': cfg.optimizer,
            'l1': cfg.l1_reg,
            'l2': cfg.l2_reg,
            'gpu_num': cfg.gpu_num
        }

        rnn_params = {
            'merge_size': cfg.merge_size,
            'seq_length': cfg.seq_length,
            'bucket_range': cfg.bucket_range,
            'stateful': cfg.stateful,
            'training_algo': cfg.training_algo
        }

        training_params = {
            'batch_size': cfg.batch_size,
            'num_of_epochs': cfg.num_of_epochs,
            'shuffle_data': cfg.shuffle_data,
            'tensorboard_dir': os.path.join(cfg.plot_dir, cfg.nnets_file_name),
            'stopping_patience': cfg.stopping_patience,
            'restore_best_weights': cfg.restore_best_weights
        }

        self.keras_models = TrainKerasModels(model_params=model_params,
                                             rnn_params=rnn_params,
                                             training_params=training_params)
class KerasClass(object):
    def __init__(self, cfg):

        # model type (duration or acoustic)
        self.model_output_type = cfg.model_output_type

        # ----------------------------------------------------
        # ------------------- Input-Output -------------------
        # ----------------------------------------------------

        self.label_type = cfg.label_type
        self.cmp_ext = cfg.cmp_ext
        inp_file_ext = cfg.inp_file_ext
        out_file_ext = cfg.out_file_ext
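        # The HTS label normaliser converts full-context HTS labels into binary/numerical
        # input feature vectors, based on the question file.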
        self.label_normaliser = HTSLabelNormalisation(
            question_file_name=cfg.question_file_name,
            add_frame_features=cfg.add_frame_features == 'True',  # config value is a string; must be a bool
            subphone_feats=cfg.subphone_feats)

        # Create streams files (they store data from dimension dictionaries for synthesis)
        in_streams = sorted(cfg.in_dimension_dict.keys())
        indims = [str(cfg.in_dimension_dict[s]) for s in in_streams]
        self.out_streams = sorted(cfg.out_dimension_dict.keys())
        self.outdims = [
            str(cfg.out_dimension_dict[s]) for s in self.out_streams
        ]

        with open(os.path.join(cfg.model_dir, 'stream_info.txt'), 'w') as f:
            f.write(' '.join(in_streams) + '\n')
            f.write(' '.join(indims) + '\n')
            f.write(' '.join(self.out_streams) + '\n')
            f.write(' '.join(self.outdims) + '\n')

        # Input output dimensions
        self.inp_dim = cfg.inp_dim
        if self.model_output_type == 'duration':
            self.out_dim = cfg.dur_dim
        elif self.model_output_type == 'acoustic':
            self.out_dim = cfg.cmp_dim

        # Data normalization method
        self.inp_norm = cfg.inp_norm
        self.out_norm = cfg.out_norm

        # Norm stats files
        self.inp_stats_file = cfg.inp_stats_file
        self.out_stats_file_list = cfg.out_stats_file_list
        self.speaker_id = cfg.speaker_id
        self.shared_layer_flag = cfg.shared_layer_flag

        self.inp_scaler = None
        self.out_scaler = None
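        # The actual input/output scaler objects are created or loaded later, in normalize_data().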

        # ---------------------------------------------------
        # ------------------- Directories -------------------
        # ---------------------------------------------------

        self.plot_dir = os.path.join(cfg.plot_dir, cfg.nnets_file_name)
        # Select data directories based on model input-output type
        if self.model_output_type == 'duration':

            # Input
            self.inp_feat_dir = cfg.inp_feat_dir_dur
            self.bin_lab_dir = cfg.bin_lab_dir_dur
            self.bin_lab_dir_nosilence = cfg.bin_lab_dir_dur_nosilence
            self.bin_lab_dir_nosilence_norm = cfg.bin_lab_dir_dur_nosilence_norm

            # Output
            self.out_feat_dir = cfg.out_feat_dir_dur
            self.out_feat_dir_norm = cfg.out_feat_dir_dur_norm

        elif self.model_output_type == 'acoustic':

            # Input
            self.inp_feat_dir = cfg.inp_feat_dir_cmp
            self.bin_lab_dir = cfg.bin_lab_dir_cmp
            self.bin_lab_dir_nosilence = cfg.bin_lab_dir_cmp_nosilence
            self.bin_lab_dir_nosilence_norm = cfg.bin_lab_dir_cmp_nosilence_norm

            # Output
            self.out_feat_dir = cfg.nn_cmp_dir
            self.out_feat_dir_norm = cfg.nn_cmp_norm_dir

        else:
            raise ValueError('Invalid model output type: %s' %
                             self.model_output_type)

        # --------------------------------------------------------
        # ------------------- Model Parameters -------------------
        # --------------------------------------------------------

        self.sequential_training = cfg.sequential_training
        self.stateful = cfg.stateful
        self.seq_length = cfg.seq_length  # needed when defining the stateful model

        self.json_model_file = cfg.json_model_file
        self.h5_model_file = cfg.h5_model_file
        self.model_params_file = cfg.model_params_file

        # -----------------------------------------------------------
        # ------------------- Generate file lists -------------------
        # -----------------------------------------------------------

        train_file_number = cfg.train_file_number
        valid_file_number = cfg.valid_file_number
        test_file_number = cfg.test_file_number

        # List of file ids
        self.file_id_scp = cfg.file_id_scp

        # Create train, valid and test file lists
        self.file_id_list = data_utils.read_file_list(self.file_id_scp)
        if cfg.shuffle_data:
            random.seed(1)
            # Shuffle to get random valid and test utterances
            random.shuffle(self.file_id_list)
        self.train_id_list = self.file_id_list[0:train_file_number]
        self.valid_id_list = self.file_id_list[
            train_file_number:train_file_number + valid_file_number]
        self.test_id_list = self.file_id_list[
            train_file_number + valid_file_number:train_file_number +
            valid_file_number + test_file_number]
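        # e.g. with train_file_number=50, valid_file_number=5 and test_file_number=5,
        # the slices are [0:50], [50:55] and [55:60]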

        # Intermediate file lists
        self.inp_feat_file_list = data_utils.prepare_file_path_list(
            self.file_id_list, self.inp_feat_dir, inp_file_ext)
        self.bin_lab_file_list = data_utils.prepare_file_path_list(
            self.file_id_list, self.bin_lab_dir, inp_file_ext)
        self.bin_lab_nosilence_file_list = data_utils.prepare_file_path_list(
            self.file_id_list, self.bin_lab_dir_nosilence, inp_file_ext)

        # Train, test, validation file lists
        self.inp_train_file_list = data_utils.prepare_file_path_list(
            self.train_id_list, self.bin_lab_dir_nosilence, inp_file_ext)
        self.out_train_file_list = data_utils.prepare_file_path_list(
            self.train_id_list, self.out_feat_dir, out_file_ext)
        self.inp_valid_file_list = data_utils.prepare_file_path_list(
            self.valid_id_list, self.bin_lab_dir_nosilence, inp_file_ext)
        self.out_valid_file_list = data_utils.prepare_file_path_list(
            self.valid_id_list, self.out_feat_dir, out_file_ext)
        self.inp_test_file_list = data_utils.prepare_file_path_list(
            self.test_id_list, self.bin_lab_dir_nosilence, inp_file_ext)
        self.out_test_file_list = data_utils.prepare_file_path_list(
            self.test_id_list, self.out_feat_dir, out_file_ext)

        # For cmp files generated as targets (applies to acoustic model only)
        self.nn_cmp_file_list = []
        self.nn_cmp_norm_file_list = []

        self.in_file_list_dict = {}
        for feature_name in list(cfg.in_dir_dict.keys()):
            self.in_file_list_dict[
                feature_name] = data_utils.prepare_file_path_list(
                    self.file_id_list, cfg.in_dir_dict[feature_name],
                    cfg.file_extension_dict[feature_name], False)

        # self.gen_test_file_list = data_utils.prepare_file_path_list(self.test_id_list, pred_feat_dir, out_file_ext)

        # if self.GenTestList:
        #     test_id_list = data_utils.read_file_list(test_id_scp)
        #     self.inp_test_file_list = data_utils.prepare_file_path_list(test_id_list, inp_feat_dir, inp_file_ext)
        #     self.gen_test_file_list = data_utils.prepare_file_path_list(test_id_list, pred_feat_dir, out_file_ext)

        # ------------------------------------------------------
        # ------------------- Main Processes -------------------
        # ------------------------------------------------------

        self.MAKELAB = cfg.MAKELAB  # make binary labels (required step before normalization and training)
        self.MAKECMP = cfg.MAKECMP
        self.NORMDATA = cfg.NORMDATA  # normalizes input and output data, creates data scaling objects
        self.TRAINDNN = cfg.TRAINDNN  # train the Keras model
        self.TESTDNN = cfg.TESTDNN  # test the Keras model

        # ----------------------------------------------------------
        # ------------------- Define Keras Model -------------------
        # ----------------------------------------------------------

        self.batch_size = cfg.batch_size
        model_params = {
            'inp_dim': self.inp_dim,
            'hidden_layer_size': cfg.hidden_layer_size,
            'shared_layer_flag': cfg.shared_layer_flag,
            'speaker_id': cfg.speaker_id,
            'out_dim': self.out_dim,
            'hidden_layer_type': cfg.hidden_layer_type,
            'output_layer_type': cfg.output_layer_type,
            'dropout_rate': cfg.dropout_rate,
            'loss_function': cfg.loss_function,
            'optimizer': cfg.optimizer,
            'l1': cfg.l1_reg,
            'l2': cfg.l2_reg,
            'gpu_num': cfg.gpu_num
        }

        rnn_params = {
            'merge_size': cfg.merge_size,
            'seq_length': cfg.seq_length,
            'bucket_range': cfg.bucket_range,
            'stateful': cfg.stateful,
            'training_algo': cfg.training_algo
        }

        training_params = {
            'batch_size': cfg.batch_size,
            'num_of_epochs': cfg.num_of_epochs,
            'shuffle_data': cfg.shuffle_data,
            'tensorboard_dir': os.path.join(cfg.plot_dir, cfg.nnets_file_name),
            'stopping_patience': cfg.stopping_patience,
            'restore_best_weights': cfg.restore_best_weights
        }

        self.keras_models = TrainKerasModels(model_params=model_params,
                                             rnn_params=rnn_params,
                                             training_params=training_params)

    def make_labels(self):

        # simple HTS labels
        print('preparing label data (input) using standard HTS style labels')

        if not os.path.isfile(self.bin_lab_file_list[-1]):
            # This does not normalize the data as the name suggests, rather translates it to binary
            self.label_normaliser.perform_normalisation(
                self.inp_feat_file_list,
                self.bin_lab_file_list,
                label_type=self.label_type)

        # TODO: Additional features may be added in the future... parts of speech?  Some context for intonation?
        # if cfg.additional_features:
        #     out_feat_dir = os.path.join(cfg.data_dir, 'binary_label_%s_%s' % (cfg.label_type, str(self.inp_dim)))
        #     out_feat_file_list = data_utils.prepare_file_path_list(file_id_list, out_feat_dir, cfg.lab_ext)
        #     in_dim = self.label_normaliser.dimension
        #     for new_feature, new_feature_dim in cfg.additional_features.items():
        #         new_feat_dir = os.path.join(cfg.data_dir, new_feature)
        #         new_feat_file_list = data_utils.prepare_file_path_list(file_id_list, new_feat_dir, '.' + new_feature)
        #
        #         merger = MergeFeat(lab_dim=in_dim, feat_dim=new_feature_dim)
        #         merger.merge_data(binary_label_file_list, new_feat_file_list, out_feat_file_list)
        #         in_dim += new_feature_dim
        #
        #         binary_label_file_list = out_feat_file_list

        # This silence remover has little to no effect, no change in file 1
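        # Note: `cfg` below refers to the module-level configuration object; it is assumed
        # to be in scope, since the instance does not store it.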
        if not os.path.isfile(self.bin_lab_nosilence_file_list[-1]):
            remover = SilenceRemover(
                n_cmp=self.inp_dim,
                silence_pattern=cfg.silence_pattern,
                label_type=cfg.label_type,
                remove_frame_features=cfg.add_frame_features,
                subphone_feats=cfg.subphone_feats)
            remover.remove_silence(self.bin_lab_file_list,
                                   self.inp_feat_file_list,
                                   self.bin_lab_nosilence_file_list)

    def make_cmp(self):

        # File lists for the final cmp files (these are re-generated below to fit a precise numpy data array)
        self.nn_cmp_file_list = data_utils.prepare_file_path_list(
            self.file_id_list, self.out_feat_dir, self.cmp_ext)
        # self.nn_cmp_norm_file_list = data_utils.prepare_file_path_list(self.file_id_list, self.out_feat_dir_norm,
        #                                                                self.cmp_ext)
        # TODO: Get the delta and acceleration windows from the recipe file.
        acoustic_worker = AcousticComposition(delta_win=[-0.5, 0.0, 0.5],
                                              acc_win=[1.0, -2.0, 1.0])
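        # The delta and acceleration windows approximate the first and second time
        # derivatives of each feature stream when composing the cmp targets.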

        # TODO: Let's try this at some point
        # if 'dur' in list(cfg.in_dir_dict.keys()) and cfg.AcousticModel:
        #     acoustic_worker.make_equal_frames(dur_file_list, lf0_file_list, cfg.in_dimension_dict)
        acoustic_worker.prepare_nn_data(self.in_file_list_dict,
                                        self.nn_cmp_file_list,
                                        cfg.in_dimension_dict,
                                        cfg.out_dimension_dict)

        remover = SilenceRemover(n_cmp=cfg.cmp_dim,
                                 silence_pattern=cfg.silence_pattern,
                                 label_type=cfg.label_type,
                                 remove_frame_features=cfg.add_frame_features,
                                 subphone_feats=cfg.subphone_feats)
        remover.remove_silence(
            self.nn_cmp_file_list[0:cfg.train_file_number +
                                  cfg.valid_file_number],
            self.inp_feat_file_list[0:cfg.train_file_number +
                                    cfg.valid_file_number],
            self.nn_cmp_file_list[0:cfg.train_file_number +
                                  cfg.valid_file_number])  # save to itself

    def normalize_data(self):

        # What type of normalization? -- its given as "method" in compute_norm_stats

        # Check if normalization stat files already exist
        if os.path.isfile(self.inp_stats_file) and os.path.isfile(
                self.out_stats_file_list[0]):
            self.inp_scaler = data_utils.load_norm_stats(self.inp_stats_file,
                                                         self.inp_dim,
                                                         method=self.inp_norm)
            self.out_scaler_list = []
            for speaker_norm_file in self.out_stats_file_list:
                self.out_scaler_list.append(
                    data_utils.load_norm_stats(speaker_norm_file,
                                               self.out_dim,
                                               method=self.out_norm))

        else:  # Create the scaler objects
            # Data must be in a numpy array for normalization, therefore set sequential_training to false
            print(
                'preparing train_x, train_y from input and output feature files...'
            )
            if len(self.speaker_id) > 1:
                train_x, train_y_list, train_flen = data_utils.read_data_from_file_list_shared_2(
                    self.speaker_id,
                    self.inp_train_file_list,
                    self.out_train_file_list,
                    self.inp_dim,
                    self.out_dim,
                    sequential_training=False)
            else:
                train_x, train_y_list, train_flen = data_utils.read_data_from_file_list(
                    self.inp_train_file_list,
                    self.out_train_file_list,
                    self.inp_dim,
                    self.out_dim,
                    sequential_training=False)

            print('computing norm stats for train_x...')
            # I have removed scaling from binary variables (discrete_dict columns are all binary)
            ind = [int(i) for i in self.label_normaliser.discrete_dict.keys()]
            self.inp_scaler = data_utils.compute_norm_stats(
                train_x,
                self.inp_stats_file,
                method=self.inp_norm,
                no_scaling_ind=ind)

            # The output values should all be continuous except vuv (in acoustic model)
            print('computing norm stats for train_y...')
            if self.model_output_type == 'acoustic':
                vuv_index = self.out_streams.index('vuv')
                index = [sum([int(num) for num in self.outdims[0:vuv_index]])]
            else:
                index = []

            if type(train_y_list) != list:
                train_y_list = [train_y_list]

            self.out_scaler_list = []
            for train_y, speaker in zip(train_y_list, self.speaker_id):
                ind = np.where([
                    speaker in file_name
                    for file_name in self.out_stats_file_list
                ])[0][0]
                out_scaler = data_utils.compute_norm_stats(
                    train_y,
                    self.out_stats_file_list[ind],
                    method=self.out_norm,
                    no_scaling_ind=index)  # exclude the binary vuv column from scaling
                self.out_scaler_list.append(out_scaler)

    def train_keras_model(self):

        # TODO: for large datasets, I might have to batch load the data to memory... I will cross that bridge when it comes
        #### load the data ####
        print(
            'preparing train_x, train_y from input and output feature files...'
        )
        train_x, train_y, train_flen = data_utils.read_data_from_file_list(
            self.inp_train_file_list,
            self.out_train_file_list,
            self.inp_dim,
            self.out_dim,
            sequential_training=self.sequential_training)
        print(
            'preparing valid_x, valid_y from input and output feature files...'
        )
        valid_x, valid_y, valid_flen = data_utils.read_data_from_file_list(
            self.inp_valid_file_list,
            self.out_valid_file_list,
            self.inp_dim,
            self.out_dim,
            sequential_training=self.sequential_training)

        #### normalize the data (the input and output scalers need to be already created) ####
        train_x = data_utils.norm_data(
            train_x,
            self.inp_scaler,
            sequential_training=self.sequential_training)
        valid_x = data_utils.norm_data(
            valid_x,
            self.inp_scaler,
            sequential_training=self.sequential_training)
        # For each speaker:
        if self.sequential_training:
            # Cycle through all utterances once
            for utt_key in train_y.keys():
                i = np.where(
                    [speaker in utt_key for speaker in self.speaker_id])[0][0]
                # Sequential training false because we are normalizing one utterance at a time
                train_y[utt_key] = data_utils.norm_data(
                    train_y[utt_key],
                    self.out_scaler_list[i],
                    sequential_training=False)
            for utt_key in valid_y.keys():
                i = np.where(
                    [speaker in utt_key for speaker in self.speaker_id])[0][0]
                valid_y[utt_key] = data_utils.norm_data(
                    valid_y[utt_key],
                    self.out_scaler_list[i],
                    sequential_training=False)
        else:
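            # Non-sequential case: each scaler in out_scaler_list is applied to the full
            # array in turn; with a single speaker this simply applies that speaker's scaler once.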

            for i, scaler in enumerate(self.out_scaler_list):
                train_y = data_utils.norm_data(train_y,
                                               scaler,
                                               sequential_training=False)
                valid_y = data_utils.norm_data(valid_y,
                                               scaler,
                                               sequential_training=False)

        #### define the model ####
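        # The topology is chosen from the training settings: plain feedforward, sequence
        # (RNN) model, shared-layer multi-speaker model, or stateful RNN.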
        if not self.sequential_training:
            self.keras_models.define_feedforward_model()
        elif self.sequential_training and not self.stateful and sum(
                self.shared_layer_flag) == 0:
            self.keras_models.define_sequence_model()
        elif self.sequential_training and not self.stateful and sum(
                self.shared_layer_flag) > 0:
            self.keras_models.define_shared_model()
        elif self.sequential_training and self.stateful and sum(
                self.shared_layer_flag) == 0:
            self.keras_models.define_stateful_model(batch_size=self.batch_size,
                                                    seq_length=self.seq_length)

        else:
            raise Exception('Model can not be defined with given settings.')

        #### train the model ####
        print('training...')
        shared = sum(self.shared_layer_flag)
        if not self.sequential_training:
            # Train feedforward model
            self.keras_models.train_feedforward_model(train_x, train_y,
                                                      valid_x, valid_y)
            self.keras_models.save_model(self.json_model_file,
                                         self.h5_model_file,
                                         self.model_params_file)

        elif self.sequential_training and self.batch_size == 1 and sum(
                self.shared_layer_flag) == 0:
            # Train recurrent model of batch size one
            self.keras_models.train_recurrent_model_batchsize_one(
                train_x, train_y, valid_x, valid_y)
            self.keras_models.save_model(self.json_model_file,
                                         self.h5_model_file,
                                         self.model_params_file)

        elif self.sequential_training and self.batch_size == 1 and sum(
                self.shared_layer_flag) > 0:
            self.keras_models.train_shared_model(train_x, train_y, valid_x,
                                                 valid_y)
            self.keras_models.save_models(self.json_model_file,
                                          self.h5_model_file,
                                          self.model_params_file)

        elif self.sequential_training and self.stateful:
            # Train recurrent model of many batches, should it be stateful?
            self.keras_models.train_recurrent_model(train_x,
                                                    train_y,
                                                    valid_x,
                                                    valid_y,
                                                    train_flen,
                                                    training_algo=1)
            self.keras_models.save_model(self.json_model_file,
                                         self.h5_model_file,
                                         self.model_params_file)

    def test_keras_model(self):

        # TODO: Overhaul this function

        #### load the model ####
        self.keras_models.load_model(self.json_model_file, self.h5_model_file)

        #### load the data ####
        print('preparing test_x from input feature files...')
        test_x, test_flen = data_utils.read_test_data_from_file_list(
            self.inp_test_file_list, self.inp_dim)

        #### normalize the data ####
        test_x = data_utils.norm_data(test_x, self.inp_scaler)

        #### compute predictions ####
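        # Note: predict() expects self.out_scaler and self.gen_test_file_list, which the
        # current __init__ leaves unset (see the TODO above); they must be provided elsewhere.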
        self.keras_models.predict(test_x, self.out_scaler,
                                  self.gen_test_file_list,
                                  self.sequential_training)

    def main_function(self):

        ### Implement each module ###
        if self.MAKELAB:
            self.make_labels()

        if self.MAKECMP:
            self.make_cmp()

        if self.NORMDATA:
            self.normalize_data()

        if self.TRAINDNN:
            self.train_keras_model()

        if self.TESTDNN:
            self.test_keras_model()
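
# A minimal usage sketch (hypothetical driver, not part of the original example): it assumes
# `cfg` is an already-parsed configuration object exposing the attributes read in __init__.
#
#     keras_class = KerasClass(cfg)
#     keras_class.main_function()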
Beispiel #15
0
    def user_configuration(self, configFile=None):

        # get a logger
        logger = logging.getLogger("configuration")

        # load and parse the provided configFile, if provided
        if not configFile:
            logger.warn(
                'no user configuration file provided; using only built-in default settings'
            )
            return

        # load the config file
        try:
            cfgparser = configparser.ConfigParser()
            cfgparser.read_file(open(configFile))  # read_file() supersedes the deprecated readfp()
            logger.debug(
                'successfully read and parsed user configuration file %s' %
                configFile)
        except:
            logger.fatal('error reading user configuration file %s' %
                         configFile)
            raise

        #work_dir must be provided before initialising other directories
        try:
            self.work_dir = cfgparser.get('Paths', 'work')
            self.data_dir = cfgparser.get('Paths', 'data')
            self.plot_dir = cfgparser.get('Paths', 'plot')
            self.model_output_type = cfgparser.get('Input-Output',
                                                   'model_output_type')

        except (configparser.NoSectionError, configparser.NoOptionError):
            self.work_dir = None
            self.data_dir = None
            self.plot_dir = None
            logger.critical('Paths:work, Paths:data, Paths:plot or Input-Output:model_output_type has no value!')
            raise Exception

        # The model must be placed in the processors folder which is copied to the voice folder for synthesis
        if self.model_output_type == 'duration':
            self.model_dir = os.path.join(self.data_dir, 'processors',
                                          'duration_predictor')
        elif self.model_output_type == 'acoustic':
            self.model_dir = os.path.join(self.data_dir, 'processors',
                                          'acoustic_predictor')

        # default place for some data
        self.keras_dir = os.path.join(self.work_dir, 'keras')
        self.gen_dir = os.path.join(self.keras_dir, 'gen')
        self.stats_dir = os.path.join(self.keras_dir, 'stats')

        self.question_file_name = cfgparser.get('Labels', 'question_file_name')
        self.add_frame_features = cfgparser.get('Labels', 'add_frame_features')
        self.subphone_feats = cfgparser.get('Labels', 'subphone_feats')

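        # NOTE: the following line re-reads the 'subphone_feats' option; a dedicated
        # 'model_type' option is presumably what was intended.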
        self.model_type = cfgparser.get('Labels', 'subphone_feats')

        # TODO: the configuration is inflexible, it has hard coded elements and is designed only for the acoustic model
        # TODO: improve flexibility and incorporate duration model elements
        # TODO: I am going to perform all data normalization tasks in KerasClass

        # Set up file paths to ossian defaults
        label_normaliser = HTSLabelNormalisation(
            question_file_name=self.question_file_name,
            add_frame_features=self.add_frame_features == 'True',  # config value is a string; must be a bool
            subphone_feats=self.subphone_feats)
        self.inp_dim = label_normaliser.dimension
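        # The input dimension equals the number of label features produced by the question
        # set (plus any subphone/frame features, when enabled).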
        # lab_dim = label_normaliser.dimension
        # logger.info('Input label dimension is %d' % lab_dim)
        # suffix = str(lab_dim)

        # the number can be removed
        # binary_label_dir = os.path.join(self.work_dir, 'binary_label_' + str(label_normaliser.dimension))
        # nn_label_dir = os.path.join(self.work_dir, 'nn_no_silence_lab_' + suffix)
        # self.def_inp_dir = os.path.join(self.work_dir, 'nn_no_silence_lab_norm_' + suffix)

        # self.def_inp_dir = os.path.join(self.work_dir, 'nn_no_silence_lab_norm_%s' % 1)
        # self.def_out_dir = os.path.join(self.work_dir, 'nn_norm_mgc_lf0_vuv_bap_%s' % 1)

        # ---------------------------------------------------
        # ------------------- Output data -------------------
        # ---------------------------------------------------
        # Binary data (already generated by ossian)
        self.out_feat_dir_dur = os.path.join(self.data_dir, 'dur')
        self.out_feat_dir_cmp = os.path.join(self.data_dir, 'cmp')

        self.out_feat_dir_dur_norm = os.path.join(self.data_dir, 'dur_norm')
        self.out_feat_dir_cmp_norm = os.path.join(self.data_dir, 'cmp_norm')

        self.nn_cmp_dir = os.path.join(self.data_dir, 'nn_cmp')
        self.nn_cmp_norm_dir = os.path.join(self.data_dir, 'nn_norm_cmp')

        # ---------------------------------------------------
        # ------------------- Input data -------------------
        # ---------------------------------------------------
        # Raw text data
        self.inp_feat_dir_dur = os.path.join(self.data_dir, 'lab_dur')
        self.inp_feat_dir_cmp = os.path.join(self.data_dir, 'lab_dnn')

        # Binary data
        self.bin_lab_dir_dur = os.path.join(
            self.data_dir, 'bin_lab_phone_%s' % str(self.inp_dim))
        self.bin_lab_dir_cmp = os.path.join(
            self.data_dir, 'bin_lab_state_%s' % str(self.inp_dim))

        # Binary data silence removed
        self.bin_lab_dir_dur_nosilence = os.path.join(
            self.data_dir, 'bin_lab_phone_no_sil_%s' % str(self.inp_dim))
        self.bin_lab_dir_cmp_nosilence = os.path.join(
            self.data_dir, 'bin_lab_state_no_sil_%s' % str(self.inp_dim))

        # Binary data silence removed and normalized
        self.bin_lab_dir_dur_nosilence_norm = os.path.join(
            self.data_dir, 'bin_lab_phone_no_sil_norm_%s' % str(self.inp_dim))
        self.bin_lab_dir_cmp_nosilence_norm = os.path.join(
            self.data_dir, 'bin_lab_state_no_sil_norm_%s' % str(self.inp_dim))

        # self.inter_data_dir = os.path.join(self.work_dir, 'inter_module')
        # self.def_inp_dir    = os.path.join(self.inter_data_dir, 'nn_no_silence_lab_norm_425')
        # self.def_out_dir    = os.path.join(self.inter_data_dir, 'nn_norm_mgc_lf0_vuv_bap_187')

        impossible_int = int(-99999)
        impossible_float = float(-99999.0)

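        # Each entry below is (attribute name, default value, config section, config option).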
        user_options = [

            # General paths
            ('work_dir', self.work_dir, 'Paths', 'work'),
            ('data_dir', self.data_dir, 'Paths', 'data'),
            ('plot_dir', self.plot_dir, 'Paths', 'plot'),
            ('model_dir', self.model_dir, 'Paths', 'models'),
            ('stats_dir', self.stats_dir, 'Paths', 'stats'),
            ('gen_dir', self.gen_dir, 'Paths', 'gen'),

            # Output data paths
            ('out_feat_dir_dur', self.out_feat_dir_dur, 'Paths', 'out_feat'),
            ('out_feat_dir_cmp', self.out_feat_dir_cmp, 'Paths', 'out_feat'),
            ('out_feat_dir_dur_norm', self.out_feat_dir_dur_norm, 'Paths',
             'out_feat'),
            ('out_feat_dir_cmp_norm', self.out_feat_dir_cmp_norm, 'Paths',
             'out_feat'),

            # Input data paths
            ('inp_feat_dir_dur', self.inp_feat_dir_dur, 'Paths', 'inp_feat'),
            ('inp_feat_dir_cmp', self.inp_feat_dir_cmp, 'Paths', 'inp_feat'),
            ('bin_lab_dir_dur', self.bin_lab_dir_dur, 'Paths', 'inp_feat'),
            ('bin_lab_dir_cmp', self.bin_lab_dir_cmp, 'Paths', 'inp_feat'),
            ('bin_lab_dir_dur_nosilence', self.bin_lab_dir_dur_nosilence,
             'Paths', 'inp_feat'),
            ('bin_lab_dir_cmp_nosilence', self.bin_lab_dir_cmp_nosilence,
             'Paths', 'inp_feat'),
            ('bin_lab_dir_dur_nosilence_norm',
             self.bin_lab_dir_dur_nosilence_norm, 'Paths', 'inp_feat'),
            ('bin_lab_dir_cmp_nosilence_norm',
             self.bin_lab_dir_cmp_nosilence_norm, 'Paths', 'inp_feat'),

            # TODO: Where is the actual file list? Fix these variables -- I believe this is fixed
            ('file_id_scp',
             os.path.join(self.data_dir,
                          'processors/duration_predictor/filelist.txt'),
             'Paths', 'file_id_list'),
            # ('test_id_scp', os.path.join(self.data_dir, 'test_id_list.scp'), 'Paths', 'test_id_list'),

            # Labels
            ('label_type', 'phone_align', 'Labels', 'label_type'),
            ('silence_pattern', ['*-#+*'], 'Labels', 'silence_pattern'),

            # Input-Output
            # TODO: I can add 'dur' to this list to combine duration and acoustic modeling
            ('output_features', ['mgc', 'lf0', 'vuv',
                                 'bap'], 'Input-Output', 'output_features'),
            ('model_output_type', 'acoustic', 'Input-Output',
             'model_output_type'),
            ('inp_dim', self.inp_dim, 'Input-Output', 'inp_dim'),
            # ('out_dim', 187, 'Input-Output', 'out_dim'),
            ('mgc_dim', 60, 'Input-Output', 'mgc'),
            ('lf0_dim', 1, 'Input-Output', 'lf0'),
            ('bap_dim', 5, 'Input-Output', 'bap'),
            ('dmgc_dim', 180, 'Input-Output', 'mgc'),
            ('dlf0_dim', 3, 'Input-Output', 'lf0'),
            ('dbap_dim', 15, 'Input-Output', 'bap'),
            ('dur_dim', 5, 'Input-Output', 'cmp'),
            ('cmp_dim', 60 * 3 + 1 * 3 + 5 * 3, 'Input-Output', 'cmp'),
            ('inp_file_ext', '.lab', 'Input-Output', 'inp_file_ext'),
            ('out_file_ext', '.cmp', 'Input-Output', 'out_file_ext'),
            ('mgc_ext', '.mgc', 'Input-Output', 'mgc_ext'),
            ('bap_ext', '.bap', 'Input-Output', 'bap_ext'),
            ('lf0_ext', '.lf0', 'Input-Output', 'lf0_ext'),
            ('cmp_ext', '.cmp', 'Input-Output', 'cmp_ext'),
            ('lab_ext', '.lab', 'Input-Output', 'lab_ext'),
            ('utt_ext', '.utt', 'Input-Output', 'utt_ext'),
            ('stepw_ext', '.stepw', 'Input-Output', 'stepw_ext'),
            ('sp_ext', '.sp', 'Input-Output', 'sp_ext'),
            ('dur_ext', '.dur', 'Input-Output', 'dur_ext'),
            ('inp_norm', 'MVN', 'Input-Output', 'inp_norm'),
            ('out_norm', 'MVN', 'Input-Output', 'out_norm'),

            # Architecture
            ('hidden_layer_type', ['tanh', 'tanh', 'tanh', 'tanh'],
             'Architecture', 'hidden_layer_type'),
            ('hidden_layer_size', [1024, 1024, 1024,
                                   1024], 'Architecture', 'hidden_layer_size'),
            ('shared_layer_flag', [0, 0, 0,
                                   0], 'Architecture', 'shared_layer_flag'),
            ('speaker_id', ['placeholder'], 'Architecture', 'speaker_id'),
            ('batch_size', 256, 'Architecture', 'batch_size'),
            ('num_of_epochs', 1, 'Architecture', 'training_epochs'),
            ('stopping_patience', 10, 'Architecture', 'stopping_patience'),
            ('restore_best_weights', True, 'Architecture',
             'restore_best_weights'),
            ('dropout_rate', 0.0, 'Architecture', 'dropout_rate'),
            ('l1_reg', 0.0, 'Architecture', 'l1_reg'),
            ('l2_reg', 0.0, 'Architecture', 'l2_reg'),
            ('output_layer_type', 'linear', 'Architecture',
             'output_layer_type'),
            ('optimizer', 'adam', 'Architecture', 'optimizer'),
            ('loss_function', 'mse', 'Architecture', 'loss_function'),

            # RNN
            ('sequential_training', False, 'Architecture',
             'sequential_training'),
            ('stateful', False, 'Architecture', 'stateful'),
            ('use_high_batch_size', False, 'Architecture',
             'use_high_batch_size'),
            ('training_algo', 1, 'Architecture', 'training_algo'),
            ('merge_size', 1, 'Architecture', 'merge_size'),
            ('seq_length', 200, 'Architecture', 'seq_length'),
            ('bucket_range', 100, 'Architecture', 'bucket_range'),
            ('gpu_num', 1, 'Architecture', 'gpu_num'),

            # Data
            ('shuffle_data', True, 'Data', 'shuffle_data'),
            ('train_file_number', impossible_int, 'Data', 'train_file_number'),
            ('valid_file_number', impossible_int, 'Data', 'valid_file_number'),
            ('test_file_number', impossible_int, 'Data', 'test_file_number'),

            # Processes
            ('GenTestList', False, 'Processes', 'GenTestList'),
            ('NORMDATA', False, 'Processes', 'NORMDATA'),
            ('TRAINDNN', False, 'Processes', 'TRAINDNN'),
            ('TESTDNN', False, 'Processes', 'TESTDNN'),
            ('MAKELAB', False, 'Processes', 'MAKELAB'),
            ('MAKECMP', False, 'Processes', 'MAKECMP'),
        ]

        # this uses exec(...) which is potentially dangerous since arbitrary code could be executed
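        # A safer alternative (sketch only, not used here) would avoid exec() by parsing the
        # raw string and assigning with setattr, e.g.:
        #     parsed = value if isinstance(value, type(default)) else ast.literal_eval(value)
        #     setattr(self, variable, parsed)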
        for (variable, default, section, option) in user_options:
            # default value
            value = None

            try:
                # first, look for a user-set value for this variable in the config file
                value = cfgparser.get(section, option)
                user_or_default = 'user'

            except (configparser.NoSectionError, configparser.NoOptionError):
                # use default value, if there is one
                if (default == None) or \
                   (default == '')   or \
                   ((type(default) == int) and (default == impossible_int)) or \
                   ((type(default) == float) and (default == impossible_float))  :
                    logger.critical('%20s has no value!' %
                                    (section + ":" + option))
                    raise Exception
                else:
                    value = default
                    user_or_default = 'default'

            if type(default) == str:
                exec('self.%s = "%s"' % (variable, value))
            elif type(default) == int:
                exec('self.%s = int(%s)' % (variable, value))
            elif type(default) == float:
                exec('self.%s = float(%s)' % (variable, value))
            elif type(default) == bool:
                exec('self.%s = bool(%s)' % (variable, value))
            elif type(default) == list:
                exec('self.%s = list(%s)' % (variable, value))
            elif type(default) == dict:
                exec('self.%s = dict(%s)' % (variable, value))
            else:
                logger.critical(
                    'Variable %s has default value of unsupported type %s',
                    variable, type(default))
                raise Exception(
                    'Internal error in configuration settings: unsupported default type'
                )

            logger.info('%20s has %7s value %s' %
                        (section + ":" + option, user_or_default, value))
Beispiel #16
0
    # Configuration object cfg from config argument
    cfg = configuration.cfg
    cfg.configure(sys.argv[1])

    # Get training file id list
    file_id_list = read_file_list(cfg.file_id_scp)

    # Y data file lists
    # nn_cmp_dir = os.path.join(cfg.data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    nn_cmp_norm_dir = os.path.join(cfg.data_dir, 'nn_norm' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    # nn_cmp_file_list = prepare_file_path_list(file_id_list, nn_cmp_dir, cfg.cmp_ext)
    # nn_cmp_norm_file_list = prepare_file_path_list(file_id_list, nn_cmp_norm_dir, cfg.cmp_ext)

    # Get label dimensions
    label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name,
                                             add_frame_features=cfg.add_frame_features,
                                             subphone_feats=cfg.subphone_feats)
    add_feat_dim = sum(cfg.additional_features.values())
    inp_dim = label_normaliser.dimension + add_feat_dim + cfg.appended_input_dim
    out_dim = cfg.cmp_dim

    # X data file lists
    # binary_label_dir = os.path.join(cfg.data_dir, 'binary_label_'+str(label_normaliser.dimension))
    # nn_label_dir = os.path.join(cfg.data_dir, 'nn_no_silence_lab_'+str(lab_dim))
    nn_label_norm_dir = os.path.join(cfg.data_dir, 'nn_no_silence_lab_norm_'+str(inp_dim))
    # binary_label_file_list = prepare_file_path_list(file_id_list, binary_label_dir, cfg.lab_ext)
    # nn_label_file_list = prepare_file_path_list(file_id_list, nn_label_dir, cfg.lab_ext)
    # nn_label_norm_file_list = prepare_file_path_list(file_id_list, nn_label_norm_dir, cfg.lab_ext)

    # Split files into train and test sets
    train_file_number = cfg.train_file_number


def main_function(cfg):

    logger = logging.getLogger("main")
    plotlogger = logging.getLogger("plotting")
    plotlogger.set_plot_path(cfg.plot_dir)

    data_dir = cfg.data_dir

    label_normaliser = HTSLabelNormalisation(
        question_file_name=cfg.question_file_name,
        add_frame_features=cfg.add_frame_features,
        subphone_feats=cfg.subphone_feats)
    add_feat_dim = sum(cfg.additional_features.values())
    lab_dim = label_normaliser.dimension + add_feat_dim + cfg.appended_input_dim
    logger.info('Input label dimension is %d' % lab_dim)
    suffix = str(lab_dim)
    label_data_dir = cfg.work_dir

    binary_label_dir = os.path.join(
        label_data_dir, 'binary_label_' + str(label_normaliser.dimension))
    nn_label_dir = os.path.join(label_data_dir, 'nn_no_silence_lab_' + suffix)
    nn_label_norm_dir = os.path.join(label_data_dir, 'output')

    min_max_normaliser = None
    label_norm_file = 'label_norm_%s_%d.dat' % (cfg.label_style, lab_dim)
    label_norm_file = os.path.join(data_dir, label_norm_file)

    try:
        test_id_list = read_file_list(cfg.test_id_scp)
        logger.debug('Loaded file id list from %s' % cfg.test_id_scp)
    except IOError:
        logger.critical('Could not load file id list from %s' %
                        cfg.test_id_scp)
        raise

    in_label_align_file_list = prepare_file_path_list(test_id_list,
                                                      cfg.in_label_align_dir,
                                                      cfg.lab_ext, False)
    binary_label_file_list = prepare_file_path_list(test_id_list,
                                                    binary_label_dir,
                                                    cfg.lab_ext)
    nn_label_file_list = prepare_file_path_list(test_id_list, nn_label_dir,
                                                cfg.lab_ext)
    nn_label_norm_file_list = prepare_file_path_list(test_id_list,
                                                     nn_label_norm_dir,
                                                     cfg.lab_ext)

    logger.info('preparing label data (input) using standard HTS style labels')
    label_normaliser.perform_normalisation(in_label_align_file_list,
                                           binary_label_file_list,
                                           label_type=cfg.label_type)
    remover = SilenceRemover(n_cmp=lab_dim,
                             silence_pattern=cfg.silence_pattern,
                             label_type=cfg.label_type,
                             remove_frame_features=cfg.add_frame_features,
                             subphone_feats=cfg.subphone_feats)
    remover.remove_silence(binary_label_file_list, in_label_align_file_list,
                           nn_label_file_list)

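    # Reuse the min/max values stored at training time so synthesis-time label normalisation
    # matches the scaling seen during training.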
    min_max_normaliser = MinMaxNormalisation(feature_dimension=lab_dim,
                                             min_value=0.01,
                                             max_value=0.99)
    min_max_normaliser.load_min_max_values(label_norm_file)
    min_max_normaliser.normalise_data(binary_label_file_list,
                                      nn_label_norm_file_list)