Example no. 1
0
                                             feat_len_kv=list_klen)

    group = model.in_size // data_iterator_source.get_feat_dim()

    sorted_data_idx = np.argsort(
        data_iterator_infer.get_feat_length())[::-1].tolist()
    data_rr = iter_minibucket(sorted_data_idx,
                              opts['chunk'],
                              shuffle=False,
                              excludes=[])
    list_saved = []
    for rr in tqdm(list(data_rr), ascii=True, ncols=50):
        curr_feat_list = data_iterator_infer.get_feat_by_index(rr)
        feat_mat, feat_len = batch_speech(opts['gpu'],
                                          curr_feat_list,
                                          feat_sil=feat_sil,
                                          group=group,
                                          start_sil=0,
                                          end_sil=0)
        pred_out = invert_mel_to_linear(model,
                                        Variable(feat_mat),
                                        feat_len,
                                        group=group)
        # convert to cpu & numpy #
        pred_out = pred_out.data.cpu().numpy()
        pred_out = [pred_out[ii, 0:feat_len[ii]] for ii in range(len(rr))]
        # save results
        for ii in range(len(rr)):
            _info = {}
            key_ii = data_iterator_infer.get_key_by_index(rr[ii])
            path_feat_ii = os.path.join(tmpdir, '{}.npz'.format(key_ii))
            feat_ii = pred_out[ii][0:feat_len[ii]]
Example no. 2
0
        for set_name, set_rr, set_train_mode in [('train', train_rr, True),
                                                 ('dev', dev_rr, False),
                                                 ('test', test_rr, False)]:
            for rr in tqdm_wrapper(set_rr):
                tic = timeit.default_timer()
                curr_key_list = feat_in_iterator[set_name].get_key_by_index(rr)
                curr_feat_in_list = feat_in_iterator[set_name].get_feat_by_key(
                    curr_key_list)
                curr_feat_out_list = feat_out_iterator[
                    set_name].get_feat_by_key(curr_key_list)
                # print(1, timeit.default_timer() - tic); tic = timeit.default_timer()
                feat_in_mat, feat_in_len = batch_speech(
                    opts['gpu'],
                    curr_feat_in_list,
                    feat_sil=feat_in_sil,
                    group=group,
                    start_sil=0,
                    end_sil=opts['pad_sil'])
                feat_out_mat, feat_out_len = batch_speech(
                    opts['gpu'],
                    curr_feat_out_list,
                    feat_sil=feat_out_sil,
                    group=group,
                    start_sil=0,
                    end_sil=opts['pad_sil'])
                # print(2, timeit.default_timer() - tic); tic = timeit.default_timer()
                _tmp_loss = fn_batch(feat_in_mat,
                                     feat_in_len,
                                     feat_out_mat,
                                     feat_out_len,
Example no. 3
0
    for rr in tqdm(list(data_rr), ascii=True, ncols=50):
        curr_key_list = [list_key[rrii] for rrii in rr]
        curr_feat_list = [list_feat[rrii]() for rrii in rr]  # lazy load call
        if args.mode == 'pred':
            pass
        elif args.mode == 'tf':
            curr_text_list = text_iterator.get_text_by_key(curr_key_list)
            text_mat, text_len = batch_text(args.gpu, curr_text_list)
        else:
            raise ValueError
        curr_feat_list = [
            np.hstack(x) if isinstance(x, (list, tuple)) else x
            for x in curr_feat_list
        ]
        feat_mat, feat_len = batch_speech(args.gpu,
                                          curr_feat_list,
                                          feat_sil=None)

        if args.mode == 'pred':
            if args.search == 'greedy':
                curr_best_hypothesis, _, curr_best_att = greedy_search(
                    model, feat_mat, feat_len, map_text2idx, args.max_target)
            elif args.search == 'beam':
                assert args.kbeam is not None, "kbeam must be specified"
                curr_best_hypothesis, _, curr_best_att = beam_search(
                    model, feat_mat, feat_len, map_text2idx, args.max_target,
                    args.kbeam, args.coeff_lp)
            else:
                raise ValueError('search method is not defined')
        elif args.mode == 'tf':
            _, _, curr_best_att = teacher_forcing(model,
        def iter_cycle_asr2tts(set_name, set_train_mode, set_rr):
            """Run one ASR->TTS cycle-consistency step on index batch ``set_rr``.

            Pipeline: real speech -> ASR-generated text hypotheses ->
            quality filtering -> TTS reconstruction of the original speech
            from the generated text.  Accumulates statistics into the
            enclosing ``m_asr_gen_info`` / ``m_tts_*`` meters; ``fn_batch_tts``
            is called with ``train_step=set_train_mode`` so this both trains
            (True) and evaluates (False).  Returns None early when no
            generated hypothesis survives the quality filter.
            """
            rr = set_rr
            # presumably re-orders indices by descending utterance length
            # for batch packing -- TODO confirm sort_reverse semantics
            rr = sort_reverse(rr, feat_len[set_name])
            rr_key = feat_iterator[set_name].get_key_by_index(rr)
            curr_feat_list = feat_iterator[set_name].get_feat_by_key(rr_key)
            # pad the utterances into one batch matrix with TTS-style
            # silence padding: 1 frame prepended, opts['tts_pad_sil'] appended
            curr_feat_mat, curr_feat_len = batch_speech(
                opts['gpu'],
                curr_feat_list,
                feat_sil=feat_sil,
                group=opts['tts_group'],
                start_sil=1,
                end_sil=opts['tts_pad_sil'])
            # modified feature for ASR: strip the TTS start/end silence and
            # reshape grouped frames back to per-frame width NDIM_FEAT #
            curr_feat_mat_for_asr = curr_feat_mat[:, 1:-opts[
                'tts_pad_sil']].contiguous().view(len(set_rr), -1, NDIM_FEAT)
            # ASR-side lengths are the original (unpadded) utterance lengths
            curr_feat_len_for_asr = [len(x) for x in curr_feat_list]

            if opts['asr_gen_search']['type'] == 'greedy':
                curr_pred_text_list, curr_pred_text_len, curr_pred_att_mat = generator_text.greedy_search(
                    model_asr,
                    curr_feat_mat_for_asr,
                    curr_feat_len_for_asr,
                    map_text2idx=map_text2idx,
                    max_target=opts['asr_gen_cutoff'])
            elif opts['asr_gen_search']['type'] == 'beam':
                # beam search is memory-heavy: decode the batch in chunks of
                # opts['asr_gen_search']['chunk'] utterances at a time
                curr_pred_text_list, curr_pred_text_len = [], []
                for ii in range(0, len(rr), opts['asr_gen_search']['chunk']):
                    _start_ii = ii
                    _end_ii = min(ii + opts['asr_gen_search']['chunk'],
                                  len(rr))
                    curr_pred_text_list_ii, curr_pred_text_len_ii, _ = generator_text.beam_search(
                        model_asr,
                        curr_feat_mat_for_asr[_start_ii:_end_ii],
                        curr_feat_len_for_asr[_start_ii:_end_ii],
                        map_text2idx=map_text2idx,
                        max_target=opts['asr_gen_cutoff'],
                        kbeam=opts['asr_gen_search']['kbeam'])
                    curr_pred_text_list.extend(curr_pred_text_list_ii)
                    curr_pred_text_len.extend(curr_pred_text_len_ii)
            # NOTE(review): any other search type leaves curr_pred_text_*
            # unbound below -- confirm opts are validated upstream
            if model_tts.TYPE == TacotronType.MULTI_SPEAKER:
                curr_spkvec_list = feat_spkvec_iterator.get_feat_by_key(rr_key)

            # TODO: filter bad text #
            # quality check currently only looks at hypothesis lengths
            # (first and third arguments are passed as None)
            curr_pred_quality = generator_text.eval_gen_text_quality(
                None, curr_pred_text_len, None)
            # positions of hypotheses judged usable (quality flag == 1)
            curr_pred_valid_idx = [
                x for x, y in enumerate(curr_pred_quality) if y == 1
            ]

            m_asr_gen_info['total'] += len(rr)
            m_asr_gen_info['valid'] += len(curr_pred_valid_idx)
            if len(curr_pred_valid_idx) == 0:
                # nothing usable in this batch -> skip the TTS step entirely
                return None

            # keep only the valid hypotheses and their aligned tensors
            curr_pred_text_list = batch_select(curr_pred_text_list,
                                               curr_pred_valid_idx)
            if model_tts.TYPE == TacotronType.MULTI_SPEAKER:
                curr_spkvec_list = batch_select(curr_spkvec_list,
                                                curr_pred_valid_idx)
            curr_pred_text_len = batch_select(curr_pred_text_len,
                                              curr_pred_valid_idx)

            curr_feat_mat = batch_select(curr_feat_mat, curr_pred_valid_idx)
            curr_feat_len = batch_select(curr_feat_len, curr_pred_valid_idx)

            # zip & sort dec: re-sort every aligned structure by generated
            # text length; the key list itself must be sorted last #
            curr_pred_text_list = batch_sorter(curr_pred_text_list,
                                               curr_pred_text_len)
            if model_tts.TYPE == TacotronType.MULTI_SPEAKER:
                curr_spkvec_list = batch_sorter(curr_spkvec_list,
                                                curr_pred_text_len)
            curr_feat_mat = batch_sorter(curr_feat_mat, curr_pred_text_len)
            curr_feat_len = batch_sorter(curr_feat_len, curr_pred_text_len)
            curr_pred_text_len = batch_sorter(
                curr_pred_text_len,
                curr_pred_text_len)  # sort key must be on the last step

            # pad the surviving text hypotheses into a batch matrix
            curr_pred_text_mat, curr_pred_text_len = batch_text(
                opts['gpu'],
                curr_pred_text_list,
            )

            if model_tts.TYPE == TacotronType.MULTI_SPEAKER:
                curr_aux_info = {'speaker_vector': curr_spkvec_list}
            else:
                curr_aux_info = None
            # TTS step: reconstruct the original speech from ASR-generated
            # text; the loss is scaled by coeff_unpair inside fn_batch_tts
            _loss, _loss_feat, _loss_bce_fend, _loss_spk_emb, _acc_fend = fn_batch_tts(
                model_tts,
                curr_pred_text_mat,
                curr_pred_text_len,
                curr_feat_mat,
                curr_feat_len,
                aux_info=curr_aux_info,
                train_step=set_train_mode,
                coeff_loss=opts['coeff_unpair'])
            # undo the coeff_unpair scaling so the logged loss is comparable
            _loss /= opts['coeff_unpair']

            assert_nan(_loss)
            # accumulate meters weighted by the (filtered) batch size
            _count = len(curr_pred_text_list)
            m_tts_loss[set_name] += _loss * _count
            m_tts_loss_feat[set_name] += _loss_feat * _count
            m_tts_loss_bce[set_name] += _loss_bce_fend * _count
            m_tts_loss_spk_emb[set_name] += _loss_spk_emb * _count
            m_tts_acc[set_name] += _acc_fend * _count
            m_tts_count[set_name] += _count

            if tf_writer is not None:
                auto_writer_info_tts(set_name, _loss, _loss_feat,
                                     _loss_bce_fend, _loss_spk_emb, _acc_fend)
        def iter_cycle_tts2asr(set_name, set_train_mode, set_rr):
            """Run one TTS->ASR cycle-consistency step on index batch ``set_rr``.

            Pipeline: real text -> TTS-generated speech (greedy decode) ->
            quality filtering -> ASR recognition loss against the original
            text.  Accumulates statistics into the enclosing
            ``m_tts_gen_info`` / ``m_asr_*`` meters; ``fn_batch_asr`` is
            called with ``train_step=set_train_mode``.  Returns None early
            when no generated utterance survives the quality filter.
            """
            rr = set_rr
            # presumably re-orders indices by descending text length for
            # batch packing -- TODO confirm sort_reverse semantics
            rr = sort_reverse(rr, text_len[set_name])
            rr_key = text_iterator[set_name].get_key_by_index(rr)
            curr_text_list = text_iterator[set_name].get_text_by_key(rr_key)
            if model_tts.TYPE == TacotronType.MULTI_SPEAKER:
                # choose speaker embeddings: either the utterance's own
                # speaker, or a uniformly re-sampled one
                if opts['tts_spk_sample'] is None:
                    curr_spkvec_list = feat_spkvec_iterator.get_feat_by_key(
                        rr_key)
                elif opts['tts_spk_sample'] == 'uniform':
                    # sample without replacement from the set's feature keys
                    _sample_rr_key = random.sample(feat_iterator[set_name].key,
                                                   k=len(set_rr))
                    curr_spkvec_list = feat_spkvec_iterator.get_feat_by_key(
                        _sample_rr_key)
                else:
                    raise NotImplementedError()
                curr_aux_info = {'speaker_vector': curr_spkvec_list}
            else:
                curr_aux_info = None
            curr_text_mat, curr_text_len = batch_text(opts['gpu'],
                                                      curr_text_list)

            # greedy TTS decode; max_target is divided by the frame group
            # size because the decoder emits grouped frames
            curr_pred_feat_list, curr_pred_feat_len, curr_pred_att_mat = generator_speech.decode_greedy_pred(
                model_tts,
                curr_text_mat,
                curr_text_len,
                group=opts['tts_group'],
                feat_sil=feat_sil,
                aux_info=curr_aux_info,
                max_target=opts['tts_gen_cutoff'] // opts['tts_group'])

            # filter bad speech #
            # NOTE(review): reuses the text-quality helper on feature
            # lengths -- only length-based checks apply here; confirm intent
            curr_pred_quality = generator_text.eval_gen_text_quality(
                None, curr_pred_feat_len, None)
            # positions of utterances judged usable (quality flag == 1)
            curr_pred_valid_idx = [
                x for x, y in enumerate(curr_pred_quality) if y == 1
            ]

            m_tts_gen_info['total'] += len(rr)
            m_tts_gen_info['valid'] += len(curr_pred_valid_idx)
            if len(curr_pred_valid_idx) == 0:
                # nothing usable in this batch -> skip the ASR step entirely
                return None

            # keep only the valid generated utterances and aligned tensors
            curr_pred_feat_list = batch_select(curr_pred_feat_list,
                                               curr_pred_valid_idx)
            curr_pred_feat_len = batch_select(curr_pred_feat_len,
                                              curr_pred_valid_idx)

            curr_text_mat = batch_select(curr_text_mat, curr_pred_valid_idx)
            curr_text_len = batch_select(curr_text_len, curr_pred_valid_idx)

            # zip & sort dec: re-sort every aligned structure by generated
            # feature length; the key list itself must be sorted last #
            curr_pred_feat_list = batch_sorter(curr_pred_feat_list,
                                               curr_pred_feat_len)
            curr_text_mat = batch_sorter(curr_text_mat, curr_pred_feat_len)
            curr_text_len = batch_sorter(curr_text_len, curr_pred_feat_len)
            curr_pred_feat_len = batch_sorter(
                curr_pred_feat_len,
                curr_pred_feat_len)  # sort key must be on the last step

            # pad the surviving generated speech into a batch matrix
            curr_pred_feat_mat, curr_pred_feat_len = batch_speech(
                opts['gpu'], curr_pred_feat_list)
            # if sorted(curr_pred_feat_len, reverse=True) != curr_pred_feat_len :
            # import ipdb; ipdb.set_trace()
            # ASR step: recognize the generated speech against the original
            # text; loss scaled by coeff_unpair inside fn_batch_asr
            _loss, _acc = fn_batch_asr(model_asr,
                                       curr_pred_feat_mat,
                                       curr_pred_feat_len,
                                       curr_text_mat,
                                       curr_text_len,
                                       train_step=set_train_mode,
                                       coeff_loss=opts['coeff_unpair'])
            # undo the coeff_unpair scaling so the logged loss is comparable
            _loss /= opts['coeff_unpair']
            assert_nan(_loss)
            # NOTE(review): counts the UNFILTERED batch (len(rr)), unlike
            # iter_cycle_asr2tts which counts only filtered items -- confirm
            # whether this asymmetry is intentional
            _count = len(rr)
            m_asr_loss[set_name] += _loss * _count
            m_asr_acc[set_name] += _acc * _count
            m_asr_count[set_name] += _count
            if tf_writer is not None:
                auto_writer_info_asr(set_name, _loss, _acc)
Example no. 6
0
    all_embed = []
    all_speaker_id = []
    all_key = []

    for rr in tqdm(list(
            iter_minibucket(sorted_feat_idx, args.batchsize, shuffle=False)),
                   ncols=60):
        curr_feat_list = feat_iterator.get_feat_by_index(rr)
        curr_key_list = feat_iterator.get_key_by_index(rr)
        curr_speaker_list = [map_key2spk[x] for x in curr_key_list]
        curr_speaker_list_id = [map_spk2id[x] for x in curr_speaker_list]
        all_speaker_id.extend(curr_speaker_list_id)
        all_key.extend(curr_key_list)
        feat_mat, feat_len = batch_speech(args.gpu,
                                          curr_feat_list,
                                          feat_sil=feat_sil,
                                          group=1,
                                          start_sil=1,
                                          end_sil=1)

        res_embed = model(Variable(feat_mat), feat_len)
        res_embed = res_embed.cpu().data.numpy()
        all_embed.append(res_embed)
        pass

    all_embed = np.concatenate(all_embed).astype(np.float64)
    all_speaker_id = np.array(all_speaker_id, dtype=np.int32)

    # save speaker vector
    if args.path is None:
        args.path = tempfile.mkdtemp()
        print('Create temporary dir: {}'.format(args.path), file=sys.stderr)