Esempio n. 1
0
class NNDurationModel(NN):
    ## assume single stream, with 1 output = with n elements for n states of 1 phone
    def __init__(self, model_dir, question_file_name):
        super(NNDurationModel, self).__init__(model_dir)
        #self.label_expander = HTSLabelNormalisation(question_file_name=question_file_name, add_frame_features=False, subphone_feats='none') # , label_type='phone_align')
        self.label_expander = HTSDurationLabelNormalisation(
            question_file_name=question_file_name)

    def generate(self,
                 htk_label_file,
                 enforce_silence=False,
                 mlpg=True,
                 vuv_thresh=0.5,
                 fzero_scale=1.0):

        input = self.label_expander.load_labels_with_state_alignment(
            htk_label_file)
        output = self.predict(input)

        ## as these are state durations in frames, enforce integer valued output, values greater than 0:
        output = numpy.round(output)
        output[output < 1] = 1
        output = numpy.array(output, dtype='int')

        return output
Esempio n. 2
0
def main_function(cfg, outdir, model_pickle_file=None):

    hidden_layer_size = cfg.hyper_params['hidden_layer_size']
    data_dir = cfg.data_dir
    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    #     norm_info_file = os.path.join(data_dir, 'norm_info' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(
            question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim
        print(('Input label dimension is %d' % lab_dim))
        suffix = str(lab_dim)
    elif cfg.label_style == 'HTS_duration':
        label_normaliser = HTSDurationLabelNormalisation(
            question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension  ## + cfg.appended_input_dim
        print(('Input label dimension is %d' % lab_dim))
        suffix = str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix = 'composed'

    combined_model_arch = str(len(hidden_layer_size))
    for hid_size in hidden_layer_size:
        combined_model_arch += '_' + str(hid_size)

    ## if made with run_lstm:--
    '''
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.%f.rnn.model' \
                      %(model_dir, cfg.combined_model_name, cfg.combined_feature_name, int(cfg.multistream_switch),
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number, cfg.hyper_params['learning_rate'])
    '''

    ## if made with run_dnn:--
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.model' \
                      %(model_dir, cfg.model_type, cfg.combined_feature_name, int(cfg.multistream_switch),
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number)

    ## override the name computed from config variables if model_pickle_file specified:
    if model_pickle_file != None:
        nnets_file_name = model_pickle_file

    print('store DNN')

    store_network(nnets_file_name, outdir)
Esempio n. 3
0
def main_function(cfg):
    # get a logger for this main function
    logger = logging.getLogger("main")

    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")

    # later, we might do this via a handler that is created, attached and configured
    # using the standard config mechanism of the logging module
    # but for now we need to do it manually
    plotlogger.set_plot_path(cfg.plot_dir)

    #### parameter setting########
    hidden_layers_sizes = cfg.hyper_params['hidden_layer_size']

    ####prepare environment

    try:
        file_id_list = read_file_list(cfg.file_id_scp)
        logger.debug('Loaded file id list from %s' % cfg.file_id_scp)
    except IOError:
        # this means that open(...) threw an error
        logger.critical('Could not load file id list from %s' % cfg.file_id_scp)
        raise

    ###total file number including training, development, and testing
    total_file_number = len(file_id_list)

    data_dir = cfg.data_dir

    nn_cmp_dir = os.path.join(data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    nn_cmp_nosil_dir = os.path.join(data_dir, 'nn_nosil' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    nn_cmp_norm_dir = os.path.join(data_dir, 'nn_norm' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))

    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    gen_dir = os.path.join(cfg.work_dir, 'gen')

    in_file_list_dict = {}

    for feature_name in list(cfg.in_dir_dict.keys()):
        in_file_list_dict[feature_name] = prepare_file_path_list(file_id_list, cfg.in_dir_dict[feature_name],
                                                                 cfg.file_extension_dict[feature_name], False)

    nn_cmp_file_list = prepare_file_path_list(file_id_list, nn_cmp_dir, cfg.cmp_ext)
    nn_cmp_nosil_file_list = prepare_file_path_list(file_id_list, nn_cmp_nosil_dir, cfg.cmp_ext)
    nn_cmp_norm_file_list = prepare_file_path_list(file_id_list, nn_cmp_norm_dir, cfg.cmp_ext)

    ###normalisation information
    norm_info_file = os.path.join(data_dir, 'norm_info' + cfg.combined_feature_name + '_' + str(
        cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label

    # currently supporting two different forms of lingustic features
    # later, we should generalise this

    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim
        logger.info('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    elif cfg.label_style == 'HTS_duration':
        label_normaliser = HTSDurationLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension  ## + cfg.appended_input_dim
        logger.info('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix = 'composed'

    if cfg.process_labels_in_work_dir:
        label_data_dir = cfg.work_dir
    else:
        label_data_dir = data_dir

    # the number can be removed
    binary_label_dir = os.path.join(label_data_dir, 'binary_label_' + suffix)
    nn_label_dir = os.path.join(label_data_dir, 'nn_no_silence_lab_' + suffix)
    nn_label_norm_dir = os.path.join(label_data_dir, 'nn_no_silence_lab_norm_' + suffix)
    #    nn_label_norm_mvn_dir = os.path.join(data_dir, 'nn_no_silence_lab_norm_'+suffix)

    in_label_align_file_list = prepare_file_path_list(file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)
    binary_label_file_list = prepare_file_path_list(file_id_list, binary_label_dir, cfg.lab_ext)
    nn_label_file_list = prepare_file_path_list(file_id_list, nn_label_dir, cfg.lab_ext)
    nn_label_norm_file_list = prepare_file_path_list(file_id_list, nn_label_norm_dir, cfg.lab_ext)

    # to do - sanity check the label dimension here?



    min_max_normaliser = None
    label_norm_file = 'label_norm_%s.dat' % (cfg.label_style)
    label_norm_file = os.path.join(label_data_dir, label_norm_file)

    normaliser = math_statis.Statis(feature_dimension=lab_dim, read_func=file_util.load_binary_file_frame,
                                    writer_func=file_util.array_to_binary_file, min_value=0.01, max_value=0.99)
    if cfg.NORMLAB and (cfg.label_style in ['HTS', 'HTS_duration']):
        # simple HTS labels
        logger.info('preparing label data (input) using standard HTS style labels')
        label_normaliser.perform_normalisation(in_label_align_file_list, binary_label_file_list)

        if cfg.label_style == 'HTS':
            remover = SilenceRemover(n_cmp=lab_dim, silence_pattern=cfg.silence_pattern)
            remover.remove_silence(binary_label_file_list, in_label_align_file_list, nn_label_file_list)
        elif cfg.label_style == 'HTS_duration':
            ## don't remove silences for duration
            nn_label_file_list = binary_label_file_list
        ###use only training data to find min-max information, then apply on the whole dataset
        normaliser.find_min_max_values(nn_label_file_list[0:cfg.train_file_number])
        normaliser.normalise_data(nn_label_file_list, nn_label_norm_file_list)

    if cfg.NORMLAB and (cfg.label_style == 'composed'):
        # new flexible label preprocessor

        logger.info('preparing label data (input) using "composed" style labels')
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)

        logger.info('Loaded label configuration')
        # logger.info('%s' % label_composer.configuration.labels )

        lab_dim = label_composer.compute_label_dimension()
        logger.info('label dimension will be %d' % lab_dim)

        if cfg.precompile_xpaths:
            label_composer.precompile_xpaths()

        # there are now a set of parallel input label files (e.g, one set of HTS and another set of Ossian trees)
        # create all the lists of these, ready to pass to the label composer

        in_label_align_file_list = {}
        for label_style, label_style_required in label_composer.label_styles.items():
            if label_style_required:
                logger.info('labels of style %s are required - constructing file paths for them' % label_style)
                if label_style == 'xpath':
                    in_label_align_file_list['xpath'] = prepare_file_path_list(file_id_list, cfg.xpath_label_align_dir,
                                                                               cfg.utt_ext, False)
                elif label_style == 'hts':
                    in_label_align_file_list['hts'] = prepare_file_path_list(file_id_list, cfg.hts_label_align_dir,
                                                                             cfg.lab_ext, False)
                else:
                    logger.critical('unsupported label style %s specified in label configuration' % label_style)
                    raise Exception

            # now iterate through the files, one at a time, constructing the labels for them
            num_files = len(file_id_list)
            logger.info('the label styles required are %s' % label_composer.label_styles)

            for i in range(num_files):
                logger.info('making input label features for %4d of %4d' % (i + 1, num_files))

                # iterate through the required label styles and open each corresponding label file

                # a dictionary of file descriptors, pointing at the required files
                required_labels = {}

                for label_style, label_style_required in label_composer.label_styles.items():

                    # the files will be a parallel set of files for a single utterance
                    # e.g., the XML tree and an HTS label file
                    if label_style_required:
                        required_labels[label_style] = open(in_label_align_file_list[label_style][i], 'r')
                        logger.debug(' opening label file %s' % in_label_align_file_list[label_style][i])

                logger.debug('label styles with open files: %s' % required_labels)
                label_composer.make_labels(required_labels, out_file_name=binary_label_file_list[i],
                                           fill_missing_values=cfg.fill_missing_values,
                                           iterate_over_frames=cfg.iterate_over_frames)

                # now close all opened files
                for fd in required_labels.values():
                    fd.close()

        # silence removal
        if cfg.remove_silence_using_binary_labels:
            silence_feature = 0  ## use first feature in label -- hardcoded for now
            logger.info('Silence removal from label using silence feature: %s' % (
            label_composer.configuration.labels[silence_feature]))
            logger.info('Silence will be removed from CMP files in same way')
            ## Binary labels have 2 roles: both the thing trimmed and the instructions for trimming:
            trim_silence(binary_label_file_list, nn_label_file_list, lab_dim, \
                         binary_label_file_list, lab_dim, silence_feature, percent_to_keep=5)
        else:
            logger.info('No silence removal done')
            # start from the labels we have just produced, not trimmed versions
            nn_label_file_list = binary_label_file_list

        ###use only training data to find min-max information, then apply on the whole dataset
        normaliser.find_min_max_values(nn_label_file_list[0:cfg.train_file_number])
        normaliser.normalise_data(nn_label_file_list, nn_label_norm_file_list)

    if normaliser != None:
        ### save label normalisation information for unseen testing labels
        label_min_vector = normaliser.min_vector
        label_max_vector = normaliser.max_vector
        label_norm_info = numpy.concatenate((label_min_vector, label_max_vector), axis=0)

        label_norm_info = numpy.array(label_norm_info, 'float32')
        fid = open(label_norm_file, 'wb')
        label_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' % (label_min_vector.size, label_norm_file))

    ### make output acoustic data
    if cfg.MAKECMP:
        logger.info('creating acoustic (output) features')
        delta_win = cfg.delta_win  # [-0.5, 0.0, 0.5]
        acc_win = cfg.acc_win  # [1.0, -2.0, 1.0]

        acoustic_worker = AcousticComposition(delta_win=delta_win, acc_win=acc_win)
        acoustic_worker.prepare_nn_data(in_file_list_dict, nn_cmp_file_list, cfg.in_dimension_dict,
                                        cfg.out_dimension_dict)

        if cfg.label_style == 'HTS':

            if cfg.remove_silence_using_binary_labels:
                ## do this to get lab_dim:
                label_composer = LabelComposer()
                label_composer.load_label_configuration(cfg.label_config_file)
                lab_dim = label_composer.compute_label_dimension()

                silence_feature = 0  ## use first feature in label -- hardcoded for now
                logger.info('Silence removal from CMP using binary label file')

                ## overwrite the untrimmed audio with the trimmed version:
                trim_silence(nn_cmp_file_list, nn_cmp_nosil_file_list, cfg.cmp_dim,
                             binary_label_file_list, lab_dim, silence_feature)

            else:  ## back off to previous method using HTS labels:
                remover = SilenceRemover(n_cmp=cfg.cmp_dim, silence_pattern=cfg.silence_pattern)
                remover.remove_silence(nn_cmp_file_list[0:cfg.train_file_number + cfg.valid_file_number],
                                       in_label_align_file_list[0:cfg.train_file_number + cfg.valid_file_number],
                                       nn_cmp_nosil_file_list[
                                       0:cfg.train_file_number + cfg.valid_file_number])  # save to itself


        elif cfg.label_style == 'HTS_duration':
            ## don't remove silences for duration
            nn_cmp_nosil_file_list = nn_cmp_file_list
            pass

    ### save acoustic normalisation information for normalising the features back
    var_dir = os.path.join(data_dir, 'var')
    if not os.path.exists(var_dir):
        os.makedirs(var_dir)

    var_file_dict = {}
    for feature_name in list(cfg.out_dimension_dict.keys()):
        var_file_dict[feature_name] = os.path.join(var_dir,
                                                   feature_name + '_' + str(cfg.out_dimension_dict[feature_name]))

    ### normalise output acoustic data
    if cfg.NORMCMP:
        logger.info('normalising acoustic (output) features using method %s' % cfg.output_feature_normalisation)
        cmp_norm_info = None
        normaliser = math_statis.Statis(feature_dimension=cfg.cmp_dim, read_func=file_util.load_binary_file_frame,
                                        writer_func=file_util.array_to_binary_file)
        if cfg.output_feature_normalisation == 'MVN':

            ###calculate mean and std vectors on the training data, and apply on the whole dataset
            global_mean_vector = normaliser.compute_mean(nn_cmp_nosil_file_list[0:cfg.train_file_number], 0,
                                                         cfg.cmp_dim)
            global_std_vector = normaliser.compute_std(nn_cmp_nosil_file_list[0:cfg.train_file_number],
                                                       global_mean_vector, 0, cfg.cmp_dim)

            normaliser.feature_normalisation(nn_cmp_nosil_file_list[0:cfg.train_file_number + cfg.valid_file_number],
                                             nn_cmp_norm_file_list[0:cfg.train_file_number + cfg.valid_file_number])
            cmp_norm_info = numpy.concatenate((global_mean_vector, global_std_vector), axis=0)

        elif cfg.output_feature_normalisation == 'MINMAX':

            global_mean_vector = normaliser.compute_mean(nn_cmp_nosil_file_list[0:cfg.train_file_number])
            global_std_vector = normaliser.compute_std(nn_cmp_nosil_file_list[0:cfg.train_file_number],
                                                               global_mean_vector)
            normaliser = math_statis.Statis(feature_dimension=cfg.cmp_dim, read_func=file_util.load_binary_file_frame,
                                            writer_func=file_util.array_to_binary_file, min_value=0.01, max_value=0.99)
            # min_max_normaliser = MinMaxNormalisation(feature_dimension=cfg.cmp_dim, min_value=0.01, max_value=0.99)
            normaliser.find_min_max_values(nn_cmp_nosil_file_list[0:cfg.train_file_number])
            normaliser.normalise_data(nn_cmp_nosil_file_list, nn_cmp_norm_file_list)

            cmp_min_vector = normaliser.min_vector
            cmp_max_vector = normaliser.max_vector
            cmp_norm_info = numpy.concatenate((cmp_min_vector, cmp_max_vector), axis=0)

        else:
            logger.critical('Normalisation type %s is not supported!\n' % (cfg.output_feature_normalisation))
            raise

        cmp_norm_info = numpy.array(cmp_norm_info, 'float32')
        fid = open(norm_info_file, 'wb')
        cmp_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' % (cfg.output_feature_normalisation, norm_info_file))
        # logger.debug(' value was\n%s' % cmp_norm_info)

        feature_index = 0
        for feature_name in list(cfg.out_dimension_dict.keys()):
            feature_std_vector = numpy.array(
                global_std_vector[:, feature_index:feature_index + cfg.out_dimension_dict[feature_name]], 'float32')

            fid = open(var_file_dict[feature_name], 'w')
            feature_std_vector.tofile(fid)
            fid.close()

            logger.info('saved %s variance vector to %s' % (feature_name, var_file_dict[feature_name]))
            # logger.debug(' value was\n%s' % feature_std_vector)

            feature_index += cfg.out_dimension_dict[feature_name]

    train_x_file_list = nn_label_norm_file_list[0:cfg.train_file_number]
    train_y_file_list = nn_cmp_norm_file_list[0:cfg.train_file_number]
    valid_x_file_list = nn_label_norm_file_list[cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number]
    valid_y_file_list = nn_cmp_norm_file_list[cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number]
    test_x_file_list = nn_label_norm_file_list[
                       cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
    test_y_file_list = nn_cmp_norm_file_list[
                       cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]

    # we need to know the label dimension before training the DNN
    # computing that requires us to look at the labels
    #
    # currently, there are two ways to do this
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim

    elif cfg.label_style == 'composed':
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)
        lab_dim = label_composer.compute_label_dimension()

    logger.info('label dimension is %d' % lab_dim)

    combined_model_arch = str(len(hidden_layers_sizes))
    for hid_size in hidden_layers_sizes:
        combined_model_arch += '_' + str(hid_size)

    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.model' \
                      % (model_dir, cfg.model_type, cfg.combined_feature_name, int(cfg.multistream_switch),
                         combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number)

    ### DNN model training
    if cfg.TRAINDNN:

        logger.info('training DNN')

        try:
            os.makedirs(model_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create model directory %s' % model_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        try:
            # print   'start DNN'
            train_DNN(train_xy_file_list=(train_x_file_list, train_y_file_list), \
                      valid_xy_file_list=(valid_x_file_list, valid_y_file_list), \
                      nnets_file_name=nnets_file_name, \
                      n_ins=lab_dim, n_outs=cfg.cmp_dim, ms_outs=cfg.multistream_outs, \
                      hyper_params=cfg.hyper_params, buffer_size=cfg.buffer_size, plot=cfg.plot)
        except KeyboardInterrupt:
            logger.critical('train_DNN interrupted via keyboard')
            # Could 'raise' the exception further, but that causes a deep traceback to be printed
            # which we don't care about for a keyboard interrupt. So, just bail out immediately
            sys.exit(1)
        except:
            logger.critical('train_DNN threw an exception')
            raise

    ### generate parameters from DNN
    temp_dir_name = '%s_%s_%d_%d_%d_%d_%d_%d' \
                    % (cfg.model_type, cfg.combined_feature_name, int(cfg.do_post_filtering), \
                       cfg.train_file_number, lab_dim, cfg.cmp_dim, \
                       len(hidden_layers_sizes), hidden_layers_sizes[0])
    gen_dir = os.path.join(gen_dir, temp_dir_name)

    gen_file_id_list = file_id_list[
                       cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
    test_x_file_list = nn_label_norm_file_list[
                       cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]

    if cfg.DNNGEN:
        logger.info('generating from DNN')

        try:
            os.makedirs(gen_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create generation directory %s' % gen_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        gen_file_list = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.cmp_ext)

        #        dnn_generation(valid_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)
        dnn_generation(test_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)

        logger.debug('denormalising generated output using method %s' % cfg.output_feature_normalisation)

        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        cmp_min_vector = cmp_min_max[0,]
        cmp_max_vector = cmp_min_max[1,]
        denormaliser = math_statis.Statis(feature_dimension=cfg.cmp_dim, read_func=file_util.load_binary_file_frame,
                                          writer_func=file_util.array_to_binary_file)
        if cfg.output_feature_normalisation == 'MVN'
            denormaliser.feature_denormalisation(gen_file_list, gen_file_list, cmp_min_vector, cmp_max_vector)

        elif cfg.output_feature_normalisation == 'MINMAX':
            denormaliser = math_statis.Statis(feature_dimension=cfg.cmp_dim, read_func=file_util.load_binary_file_frame,
                                              writer_func=file_util.array_to_binary_file,
                                              min_value=0.01, max_value=0.99, min_vector=cmp_min_vector,
                                              max_vector=cmp_max_vector)
            denormaliser.denormalise_data(gen_file_list, gen_file_list)
        else:
            logger.critical('denormalising method %s is not supported!\n' % (cfg.output_feature_normalisation))
            raise

        ##perform MLPG to smooth parameter trajectory
        ## lf0 is included, the output features much have vuv.
        generator = ParameterGeneration(gen_wav_features=cfg.gen_wav_features)
        generator.acoustic_decomposition(gen_file_list, cfg.cmp_dim, cfg.out_dimension_dict, cfg.file_extension_dict,
                                         var_file_dict)

    ### generate wav
    if cfg.GENWAV:
        logger.info('reconstructing waveform(s)')
        generate_wav(gen_dir, gen_file_id_list, cfg)  # generated speech
    #       generate_wav(nn_cmp_dir, gen_file_id_list)  # reference copy synthesis speech

    ### evaluation: calculate distortion
    if cfg.CALMCD:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(data_dir, 'ref_data')

        ref_mgc_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.mgc_ext)
        ref_bap_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.bap_ext)
        ref_lf0_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.lf0_ext)

        in_gen_label_align_file_list = in_label_align_file_list[
                                       cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
        calculator = IndividualDistortionComp()

        spectral_distortion = 0.0
        bap_mse = 0.0
        f0_mse = 0.0
        vuv_error = 0.0

        valid_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number]
        test_file_id_list = file_id_list[
                            cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            ## get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim = label_composer.compute_label_dimension()

            ## use first feature in label -- hardcoded for now
            silence_feature = 0

            ## Use these to trim silence:
            untrimmed_test_labels = binary_label_file_list[
                                    cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]

        if 'mgc' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['mgc'][
                                           cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_mgc_list, cfg.mgc_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.mgc_dim, silence_pattern=cfg.silence_pattern)
                remover.remove_silence(in_file_list_dict['mgc'][
                                       cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number],
                                       in_gen_label_align_file_list, ref_mgc_list)
            valid_spectral_distortion = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir,
                                                                      cfg.mgc_ext, cfg.mgc_dim)
            test_spectral_distortion = calculator.compute_distortion(test_file_id_list, ref_data_dir, gen_dir,
                                                                     cfg.mgc_ext, cfg.mgc_dim)
            valid_spectral_distortion *= (10 / numpy.log(10)) * numpy.sqrt(2.0)  ##MCD
            test_spectral_distortion *= (10 / numpy.log(10)) * numpy.sqrt(2.0)  ##MCD

        if 'bap' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['bap'][
                                           cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_bap_list, cfg.bap_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.bap_dim, silence_pattern=cfg.silence_pattern)
                remover.remove_silence(in_file_list_dict['bap'][
                                       cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number],
                                       in_gen_label_align_file_list, ref_bap_list)
            valid_bap_mse = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.bap_ext,
                                                          cfg.bap_dim)
            test_bap_mse = calculator.compute_distortion(test_file_id_list, ref_data_dir, gen_dir, cfg.bap_ext,
                                                         cfg.bap_dim)
            valid_bap_mse = valid_bap_mse / 10.0
            test_bap_mse = test_bap_mse / 10.0

        if 'lf0' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['lf0'][
                                           cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_lf0_list, cfg.lf0_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.lf0_dim, silence_pattern=['*-#+*'])
                remover.remove_silence(in_file_list_dict['lf0'][
                                       cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number],
                                       in_gen_label_align_file_list, ref_lf0_list)
            valid_f0_mse, valid_f0_corr, valid_vuv_error = calculator.compute_distortion(valid_file_id_list,
                                                                                         ref_data_dir, gen_dir,
                                                                                         cfg.lf0_ext, cfg.lf0_dim)
            test_f0_mse, test_f0_corr, test_vuv_error = calculator.compute_distortion(test_file_id_list, ref_data_dir,
                                                                                      gen_dir, cfg.lf0_ext, cfg.lf0_dim)

        logger.info('Develop: DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
                    % (valid_spectral_distortion, valid_bap_mse, valid_f0_mse, valid_f0_corr, valid_vuv_error * 100.))
        logger.info('Test   : DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
                    % (test_spectral_distortion, test_bap_mse, test_f0_mse, test_f0_corr, test_vuv_error * 100.))
Esempio n. 4
0
 def __init__(self, model_dir, question_file_name):
     super(NNDurationModel, self).__init__(model_dir)
     #self.label_expander = HTSLabelNormalisation(question_file_name=question_file_name, add_frame_features=False, subphone_feats='none') # , label_type='phone_align')
     self.label_expander = HTSDurationLabelNormalisation(
         question_file_name=question_file_name)