Example #1
    def __init__(self, wspecifier, filetype='mat',
                 write_num_frames=None, compress=False, compression_method=2):
        self.writer_scp = None
        # Used for writing scp
        self.filename = None
        self.filetype = filetype
        self.kwargs = {}

        if filetype == 'mat':
            if compress:
                self.writer = kaldiio.WriteHelper(
                    wspecifier, compression_method=compression_method)
            else:
                self.writer = kaldiio.WriteHelper(wspecifier)

        elif filetype in ['hdf5', 'sound.hdf5']:
            # ark,scp:out.ark,out.scp -> {'ark': 'out.ark', 'scp': 'out.scp'}
            ark_scp, filepath = wspecifier.split(':', 1)
            if ark_scp not in ['ark', 'scp,ark', 'ark,scp']:
                raise ValueError(
                    '{} is not allowed: {}'.format(ark_scp, wspecifier))
            ark_scps = ark_scp.split(',')
            filepaths = filepath.split(',')
            if len(ark_scps) != len(filepaths):
                raise ValueError(
                    'Mismatch: {} and {}'.format(ark_scp, filepath))
            spec_dict = dict(zip(ark_scps, filepaths))
            if filetype == 'sound.hdf5':
                self.writer = SoundHDF5File(spec_dict['ark'], 'w')
            else:
                self.writer = h5py.File(spec_dict['ark'], 'w')
            self.filename = spec_dict['ark']
            if 'scp' in spec_dict:
                self.writer_scp = io.open(
                    spec_dict['scp'], 'w', encoding='utf-8')

        else:
            raise ValueError('Not supporting: filetype={}'.format(filetype))

        if write_num_frames is not None:
            if ':' not in write_num_frames:
                raise ValueError('Must include ":", write_num_frames={}'
                                 .format(write_num_frames))

            nframes_type, nframes_file = write_num_frames.split(':', 1)
            if nframes_type != 'ark,t':
                raise ValueError(
                    'Only supporting text mode. '
                    'e.g. --write-num-frames=ark,t:foo.txt :'
                    '{}'.format(nframes_type))

            self.writer_nframe = io.open(nframes_file, 'w', encoding='utf-8')
        else:
            self.writer_nframe = None
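For reference, a minimal sketch of the wspecifier strings the 'mat' branch above passes straight to kaldiio (file names are placeholders):

import numpy as np
import kaldiio

feats = {'utt1': np.zeros((10, 40), dtype=np.float32)}

# Plain ark, as accepted by kaldiio.WriteHelper:
with kaldiio.WriteHelper('ark:out.ark') as writer:
    for key, mat in feats.items():
        writer[key] = mat

# Paired ark+scp with compression, matching the compress=True branch above:
with kaldiio.WriteHelper('ark,scp:out.ark,out.scp', compression_method=2) as writer:
    for key, mat in feats.items():
        writer[key] = mat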
Example #2
    def __init__(
        self, wspecifier, write_num_frames=None, compress=False, compression_method=2
    ):
        if compress:
            self.writer = kaldiio.WriteHelper(
                wspecifier, compression_method=compression_method
            )
        else:
            self.writer = kaldiio.WriteHelper(wspecifier)
        self.writer_scp = None
        if write_num_frames is not None:
            self.writer_nframe = get_num_frames_writer(write_num_frames)
        else:
            self.writer_nframe = None
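get_num_frames_writer is not shown in this example. A plausible sketch of such a helper, assuming it factors out the write_num_frames validation done inline in Example #1 (the name and behavior are inferred, not confirmed):

import io

def get_num_frames_writer(write_num_frames):
    # Accept only 'ark,t:<path>' text specifiers, as Example #1 does inline.
    if ':' not in write_num_frames:
        raise ValueError('Must include ":", write_num_frames={}'
                         .format(write_num_frames))
    nframes_type, nframes_file = write_num_frames.split(':', 1)
    if nframes_type != 'ark,t':
        raise ValueError('Only supporting text mode. '
                         'e.g. --write-num-frames=ark,t:foo.txt : '
                         '{}'.format(nframes_type))
    return io.open(nframes_file, 'w', encoding='utf-8')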
Example #3
def test_load_inputs_and_targets_legacy_format(tmpdir):
    # batch = [("F01_050C0101_PED_REAL",
    #          {"input": [{"feat": "some/path.ark:123"}],
    #           "output": [{"tokenid": "1 2 3 4"}],
    ark = str(tmpdir.join('test.ark'))
    scp = str(tmpdir.join('test.scp'))

    desire_xs = []
    desire_ys = []
    with kaldiio.WriteHelper('ark,scp:{},{}'.format(ark, scp)) as f:
        for i in range(10):
            x = np.random.random((100, 100)).astype(np.float32)
            uttid = 'uttid{}'.format(i)
            f[uttid] = x
            desire_xs.append(x)
            desire_ys.append(np.array([1, 2, 3, 4]))

    batch = []
    with open(scp, 'r') as f:
        for line in f:
            uttid, path = line.strip().split()
            batch.append((uttid,
                          {'input': [{'feat': path,
                                      'name': 'input1'}],
                           'output': [{'tokenid': '1 2 3 4',
                                       'name': 'target1'}]}))

    load_inputs_and_targets = LoadInputsAndTargets()
    xs, ys = load_inputs_and_targets(batch)
    for x, xd in zip(xs, desire_xs):
        np.testing.assert_array_equal(x, xd)
    for y, yd in zip(ys, desire_ys):
        np.testing.assert_array_equal(y, yd)
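Each scp line produced by 'ark,scp:...' points at a byte offset inside the ark ('test.ark:123'), and kaldiio can load a single matrix straight from such a specifier. A small sketch of reading the entries back:

import kaldiio

scp = 'test.scp'  # the scp written above
with open(scp) as f:
    for line in f:
        uttid, path = line.strip().split(maxsplit=1)
        mat = kaldiio.load_mat(path)  # loads the matrix stored at 'test.ark:<offset>'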
Example #4
def feats_scp(tmp_path):
    p = tmp_path / "feats.scp"
    p2 = tmp_path / "feats.ark"
    with kaldiio.WriteHelper(f"ark,scp:{p2},{p}") as w:
        w["a"] = np.random.randn(100, 80)
        w["b"] = np.random.randn(150, 80)
    return str(p)
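This reads like a pytest fixture with the @pytest.fixture decorator omitted by the snippet. Assuming that, a sketch of a consuming test, using kaldiio.load_scp to read the data back lazily (the test body is illustrative):

import kaldiio

def test_feats_scp(feats_scp):
    # feats_scp is the fixture above, decorated with @pytest.fixture
    feats = kaldiio.load_scp(feats_scp)  # dict-like; matrices load on access
    assert feats['a'].shape == (100, 80)
    assert feats['b'].shape == (150, 80)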
Example #5
def generate_json_data(config, mode, nexamples):
    """Generate Json data for test."""

    # pylint: disable=too-many-locals
    tmpdir = Path(tempfile.mkdtemp())
    ark = str(tmpdir.joinpath('test.ark'))
    scp = str(tmpdir.joinpath('test.scp'))
    ilens = 100
    nfeat = 40
    desire_xs = []
    desire_ilens = []
    desire_ys = []
    desire_olens = []
    with kaldiio.WriteHelper('ark,scp:{},{}'.format(ark, scp)) as out_f:
        for i in range(nexamples):
            # pylint: disable=invalid-name
            x = np.random.random((ilens, nfeat)).astype(np.float32)
            uttid = 'uttid{}'.format(i)
            out_f[uttid] = x
            desire_xs.append(x)
            desire_ilens.append(ilens)
            desire_ys.append(np.array([1, 2, 3, 10]))
            desire_olens.append(4)

    dummy_json = {}
    dummy_json['utts'] = {}
    with open(scp, 'r') as in_f:
        for line in in_f:
            uttid, path = line.strip().split()
            dummy_json['utts'][uttid] = {
                'input': [{
                    'feat': path,
                    'name': 'input1',
                    'shape': [ilens, nfeat]
                }],
                'output': [{
                    'tokenid': '1 2 3 10',
                    'name': 'output1',
                    'shape': [4, 10]
                }]
            }

    path = tmpdir.joinpath('{}.json'.format(mode))
    path.touch(exist_ok=True)
    path = str(path.resolve())
    with open(path, 'w') as out_f:
        json.dump(dummy_json, out_f, cls=JsonNumpyEncoder)
        config['data'][mode]['paths'] = [path]

    return desire_xs, desire_ilens, desire_ys, desire_olens
Example #6
def main(args):
    if args['--datadir']:
        data_dir = args['--datadir']
    else:
        data_dir = hp.data.eval_path
    device = torch.device(hp.device)
    print('[INFO] device: %s' % device)
    dataset_name = os.path.basename(os.path.normpath(data_dir))
    print('[INFO] dataset: %s' % dataset_name)

    # Load model
    embed_net = SpeechEmbedder().to(device)
    embed_net.load_state_dict(torch.load(hp.model.model_path))
    embed_net.eval()
    # Features
    eval_gen = DL.ARKUtteranceGenerator(data_dir, apply_vad=True)
    eval_loader = DataLoader(eval_gen,
                             batch_size=hp.test.M,
                             shuffle=False,
                             num_workers=hp.test.num_workers,
                             drop_last=False)
    dwriter = kaldiio.WriteHelper('ark,scp:%s_dvecs.ark,%s_dvecs.scp' %
                                  (dataset_name, dataset_name))

    cnt = 0
    processed = []
    for key_bt, feat_bt in eval_loader:
        feat_bt = feat_bt.to(device)
        t_start = time.time()
        # feat dim [M_files, n_chunks_in_file, frames, n_mels]
        mf, nchunks, frames, nmels = feat_bt.shape
        print(feat_bt.shape)
        stack_shape = (mf * nchunks, frames, nmels)

        feat_stack = torch.reshape(feat_bt, stack_shape)
        dvec_stack = embed_net(feat_stack)
        dvec_bt = torch.reshape(
            dvec_stack, (mf, dvec_stack.size(0) // mf, dvec_stack.size(1)))

        for key, dvec in zip(key_bt, dvec_bt):
            mean_dvec = torch.mean(dvec, dim=0).detach()
            mean_dvec = mean_dvec.cpu().numpy()
            dwriter(key, mean_dvec)
            processed.append(key)
            print('%d. Processed: %s' % (cnt, key))
            cnt += 1
        t_end = time.time()
        print('Elapsed: %.4f' % (t_end - t_start))

    # Flush ark/scp to disk; the writer was opened outside a `with` block.
    dwriter.close()
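Alternatively, opening the writer in a with block, as the test examples above do, guarantees the flush even if the loop raises (a sketch reusing the names from this example):

with kaldiio.WriteHelper('ark,scp:%s_dvecs.ark,%s_dvecs.scp' %
                         (dataset_name, dataset_name)) as dwriter:
    for key_bt, feat_bt in eval_loader:
        ...  # same batch loop as above, ending in dwriter(key, mean_dvec)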
Example #7
    def predict(self, input_source, input_reference):
        """Compute prediction"""
        # inference
        out_dir = Path(tempfile.mkdtemp())
        out_path = out_dir / Path(
            os.path.basename(str(input_source)).split(".")[0] + "_converted_gen.wav"
        )
        src_wav_path = input_source
        ref_wav_path = input_reference
        feat_writer = kaldiio.WriteHelper(
            "ark,scp:{o}.ark,{o}.scp".format(o=str(out_dir) + "/feats.1")
        )
        src_mel, src_lf0 = extract_logmel(src_wav_path, self.mean, self.std)
        ref_mel, _ = extract_logmel(ref_wav_path, self.mean, self.std)

        src_mel = torch.FloatTensor(src_mel.T).unsqueeze(0).to(self.device)
        src_lf0 = torch.FloatTensor(src_lf0).unsqueeze(0).to(self.device)
        ref_mel = torch.FloatTensor(ref_mel.T).unsqueeze(0).to(self.device)
        out_filename = os.path.basename(src_wav_path).split(".")[0]

        with torch.no_grad():
            z, _, _, _ = self.encoder.encode(src_mel)
            lf0_embs = self.encoder_lf0(src_lf0)
            spk_emb = self.encoder_spk(ref_mel)
            output = self.decoder(z, lf0_embs, spk_emb)

            feat_writer[out_filename + "_converted"] = output.squeeze(0).cpu().numpy()
            feat_writer[out_filename + "_source"] = src_mel.squeeze(0).cpu().numpy().T
            feat_writer[out_filename + "_reference"] = (
                ref_mel.squeeze(0).cpu().numpy().T
            )

        feat_writer.close()

        print("synthesize waveform...")
        cmd = [
            "parallel-wavegan-decode",
            "--checkpoint",
            "./vocoder/checkpoint-3000000steps.pkl",
            "--feats-scp",
            f"{str(out_dir)}/feats.1.scp",
            "--outdir",
            str(out_dir),
        ]
        subprocess.call(cmd)

        return out_path
Example #8
    def __init__(
        self,
        storage_path: Pathlike,
        compression_method: Optional[int] = None,
        *args,
        **kwargs,
    ):
        if not is_module_available("kaldiio"):
            raise ValueError(
                "To read Kaldi feats.scp, please 'pip install kaldiio' first.")
        import kaldiio

        super().__init__()
        self.storage_dir = Path(storage_path)
        self.storage_dir.mkdir(parents=True, exist_ok=True)
        self.storage_path_ = str(self.storage_dir / "feats.scp")
        self.storage = kaldiio.WriteHelper(
            f"ark,scp:{self.storage_dir}/feats.ark,{self.storage_dir}/feats.scp",
            compression_method=compression_method,
        )
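is_module_available comes from the surrounding project rather than from kaldiio. Assuming it simply probes importability, a minimal sketch:

import importlib.util

def is_module_available(name):
    # True if `name` can be imported in the current environment.
    return importlib.util.find_spec(name) is not None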
Example #9
def main(config, args):
    device = set_device(1)

    # g-vector extractor
    model = get_instance(module_model, config['model'])
    chkpt = torch.load(args.resume)
    try:
        model.load_state_dict(chkpt['model'])
    except (KeyError, TypeError):
        # checkpoint may be a raw state_dict rather than {'model': ...}
        model.load_state_dict(chkpt)
    model = model.to(device)

    config['dataset']['args']['wav_scp'] = os.path.join(args.data, 'wav.scp')
    config['dataset']['args']['utt2spk'] = None
    testset = get_instance(dataset, config['dataset'])
    testloader = DataLoader(testset,
                            batch_size=1,
                            shuffle=False,
                            num_workers=4,
                            drop_last=False)

    model.eval()
    utt2embd = {}
    for i, (utt, data) in enumerate(tqdm(testloader, ncols=80)):
        utt = utt[0]
        data = data.float().to(device)
        with torch.no_grad():
            embd = model.extractor(data)
        embd = embd.squeeze(0).cpu().numpy()
        utt2embd[utt] = embd

    embd_wfile = 'ark,scp:{0}/embedding.ark,{0}/embedding.scp'.format(
        args.data)
    with kaldiio.WriteHelper(embd_wfile) as writer:
        for utt, embd in utt2embd.items():
            writer(utt, embd)
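Note that kaldiio.WriteHelper accepts both the call form used here, writer(utt, embd), and the mapping form used in most other examples; the two are interchangeable. A self-contained equivalent:

import numpy as np
import kaldiio

utt2embd = {'utt1': np.zeros(256, dtype=np.float32)}
with kaldiio.WriteHelper('ark,scp:embedding.ark,embedding.scp') as writer:
    for utt, embd in utt2embd.items():
        writer[utt] = embd  # same effect as writer(utt, embd)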
Example #10
def recog(args):
    """Decode with the given args.

    Args:
        args (namespace): The program arguments.
    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.recog_args = args

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)


    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']

    load_inputs_and_targets = LoadInputsAndTargets(
        mode='asr', load_output=True, sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={'train': False})
    import kaldiio
    import time
    with torch.no_grad(), \
            kaldiio.WriteHelper('ark,scp:{o}.ark,{o}.scp'.format(o=args.out)) as f:
        ys = []
        xs = []
        for idx, utt_id in enumerate(js.keys()):
            logging.info('(%d/%d) decoding ' + utt_id, idx, len(js.keys()))
            batch = [(utt_id, js[utt_id])]
            data = load_inputs_and_targets(batch)
            feat = data[0][0]
            ys.append(data[1][0])
            # x = torch.LongTensor(x).to(device)

            # decode and write
            start_time = time.time()
            # include the inference here
            # have the layer specification here
            # skeleton model.inference(x, args, layer)
            scores, outs = model.inference(feat, ys, args, train_args.char_list)
            xs.append(scores)
            logging.info("inference speed = %s msec / frame." % (
                (time.time() - start_time) / (int(outs.size(0)) * 1000)))
            logging.warning("output length reaches maximum length (%s)." % utt_id)
            logging.info('(%d/%d) %s (size:%d->%d)' % (
                idx + 1, len(js.keys()), utt_id, len(feat), outs.size(0)))
            f[utt_id] = outs.cpu().numpy()
        from espnet.nets.pytorch_backend.nets_utils import th_accuracy
        preds = torch.stack(xs).view(len(xs), -1)
        labels = torch.LongTensor(ys).view(len(xs), 1)
        acc = th_accuracy(preds, labels, -1)
        logging.warning("Final acc is (%.2f)" % (acc * 100))
Example #11
def decode(args):
    """Decode with the given args

    :param Namespace args: The program arguments
    """
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info('args: ' + key + ': ' + str(vars(args)[key]))

    # define model
    tacotron2 = Tacotron2(idim, odim, train_args)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    torch_load(args.model, tacotron2)
    tacotron2.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    load_inputs_and_targets = LoadInputsAndTargets(
        mode='tts',
        load_input=False,
        sort_in_input_length=False,
        use_speaker_embedding=train_args.use_speaker_embedding,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf)

    with torch.no_grad(), kaldiio.WriteHelper(
            'ark,scp:{o}.ark,{o}.scp'.format(o=args.out)) as f:
        for idx, utt_id in enumerate(js.keys()):
            batch = [(utt_id, js[utt_id])]
            with using_transform_config({'train': False}):
                data = load_inputs_and_targets(batch)
            if train_args.use_speaker_embedding:
                spemb = data[1][0]
                spemb = torch.FloatTensor(spemb).to(device)
            else:
                spemb = None
            x = data[0][0]
            x = torch.LongTensor(x).to(device)

            # decode and write
            outs, _, _ = tacotron2.inference(x, args, spemb)
            if outs.size(0) == x.size(0) * args.maxlenratio:
                logging.warning("output length reaches maximum length (%s)." %
                                utt_id)
            logging.info(
                '(%d/%d) %s (size:%d->%d)' %
                (idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0)))
            f[utt_id] = outs.cpu().numpy()
Example #12
def main(argv):
    """Load the model, generate kernel and bandpass plots."""
    parser = get_parser()
    args = parser.parse_args(argv)

    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    if torch.cuda.is_available() and ("cuda" in args.device):
        device = args.device
    else:
        device = "cpu"

    if args.toolkit == "speechbrain":
        from speechbrain.dataio.preprocess import AudioNormalizer
        from speechbrain.pretrained import EncoderClassifier

        # Prepare spk2utt for mean x-vector
        spk2utt = dict()
        with open(os.path.join(args.in_folder, "spk2utt"), "r") as reader:
            for line in reader:
                details = line.split()
                spk2utt[details[0]] = details[1:]

        # TODO(nelson): The model inference can be moved into a function.
        classifier = EncoderClassifier.from_hparams(
            source=args.pretrained_model, run_opts={"device": device})
        audio_norm = AudioNormalizer()

        wav_scp = SoundScpReader(os.path.join(args.in_folder, "wav.scp"))
        os.makedirs(args.out_folder, exist_ok=True)
        writer_utt = kaldiio.WriteHelper(
            "ark,scp:{0}/xvector.ark,{0}/xvector.scp".format(args.out_folder))
        writer_spk = kaldiio.WriteHelper(
            "ark,scp:{0}/spk_xvector.ark,{0}/spk_xvector.scp".format(
                args.out_folder))

        for speaker in tqdm(spk2utt):
            xvectors = list()
            for utt in spk2utt[speaker]:
                in_sr, wav = wav_scp[utt]
                # Amp Normalization -1 ~ 1
                amax = np.amax(np.absolute(wav))
                wav = wav.astype(np.float32) / amax
                # Freq Norm
                wav = audio_norm(torch.from_numpy(wav), in_sr).to(device)
                # X-vector Embedding
                embeds = classifier.encode_batch(wav).detach().cpu().numpy()[0]
                writer_utt[utt] = np.squeeze(embeds)
                xvectors.append(embeds)

            # Speaker Normalization
            embeds = np.mean(np.stack(xvectors, 0), 0)
            writer_spk[speaker] = embeds
        writer_utt.close()
        writer_spk.close()

    elif args.toolkit == "espnet":
        raise NotImplementedError(
            "Follow details at: https://github.com/espnet/espnet/issues/3040")
    else:
        raise ValueError(
            "Unknown type of toolkit. Only supported: speechbrain, espnet, kaldi"
        )
Example #13
def decode(args):
    """Decode with E2E-TTS model."""
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info('args: ' + key + ': ' + str(vars(args)[key]))

    # define model
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    assert isinstance(model, TTSInterface)
    logging.info(model)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    torch_load(args.model, model)
    model.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    load_inputs_and_targets = LoadInputsAndTargets(
        mode='tts', load_input=False, sort_in_input_length=False,
        use_speaker_embedding=train_args.use_speaker_embedding,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={'train': False}  # Switch the mode of preprocessing
    )

    # define function for plot prob and att_ws
    def _plot_and_save(array, figname, figsize=(6, 4), dpi=150):
        import matplotlib.pyplot as plt
        shape = array.shape
        if len(shape) == 1:
            # for eos probability
            plt.figure(figsize=figsize, dpi=dpi)
            plt.plot(array)
            plt.xlabel("Frame")
            plt.ylabel("Probability")
            plt.ylim([0, 1])
        elif len(shape) == 2:
            # for tacotron 2 attention weights, whose shape is (out_length, in_length)
            plt.figure(figsize=figsize, dpi=dpi)
            plt.imshow(array, aspect="auto")
            plt.xlabel("Input")
            plt.ylabel("Output")
        elif len(shape) == 4:
            # for transformer attention weights, whose shape is (#layers, #heads, out_length, in_length)
            plt.figure(figsize=(figsize[0] * shape[0], figsize[1] * shape[1]), dpi=dpi)
            for idx1, xs in enumerate(array):
                for idx2, x in enumerate(xs, 1):
                    plt.subplot(shape[0], shape[1], idx1 * shape[1] + idx2)
                    plt.imshow(x, aspect="auto")
                    plt.xlabel("Input")
                    plt.ylabel("Output")
        else:
            raise NotImplementedError("Support only from 1D to 4D array.")
        plt.tight_layout()
        if not os.path.exists(os.path.dirname(figname)):
            # NOTE: exist_ok = True is needed for parallel process decoding
            os.makedirs(os.path.dirname(figname), exist_ok=True)
        plt.savefig(figname)
        plt.close()

    with torch.no_grad(), \
            kaldiio.WriteHelper('ark,scp:{o}.ark,{o}.scp'.format(o=args.out)) as f:

        for idx, utt_id in enumerate(js.keys()):
            batch = [(utt_id, js[utt_id])]
            data = load_inputs_and_targets(batch)
            if train_args.use_speaker_embedding:
                spemb = data[1][0]
                spemb = torch.FloatTensor(spemb).to(device)
            else:
                spemb = None
            x = data[0][0]
            x = torch.LongTensor(x).to(device)

            # decode and write
            start_time = time.time()
            outs, probs, att_ws = model.inference(x, args, spemb=spemb)
            logging.info("inference speed = %s msec / frame." % (
                (time.time() - start_time) / (int(outs.size(0)) * 1000)))
            if outs.size(0) == x.size(0) * args.maxlenratio:
                logging.warning("output length reaches maximum length (%s)." % utt_id)
            logging.info('(%d/%d) %s (size:%d->%d)' % (
                idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0)))
            f[utt_id] = outs.cpu().numpy()

            # plot prob and att_ws
            if probs is not None:
                _plot_and_save(probs.cpu().numpy(), os.path.dirname(args.out) + "/probs/%s_prob.png" % utt_id)
            if att_ws is not None:
                _plot_and_save(att_ws.cpu().numpy(), os.path.dirname(args.out) + "/att_ws/%s_att_ws.png" % utt_id)
"""Example of loading a pre-trained APC model."""

import torch

from apc_model import APCModel
from utils import PrenetConfig, RNNConfig
# added by Sameer
import kaldiio
import sys

feats_scp = sys.argv[1]
segments = sys.argv[2]
scp_file = sys.argv[3]

ark_file = scp_file.replace('.scp', '.ark')
writer = kaldiio.WriteHelper('ark,scp:%s,%s' % (ark_file, scp_file))

if segments:
    reader = kaldiio.ReadHelper('scp:%s' % feats_scp, segments=segments)
else:
    reader = kaldiio.ReadHelper('scp:%s' % feats_scp)


def main():
    prenet_config = None
    rnn_config = RNNConfig(input_size=80,
                           hidden_size=512,
                           num_layers=3,
                           dropout=0.,
                           residual=True)  # Sameer Added residual=True
    pretrained_apc = APCModel(mel_dim=80,
Example #15
def decode(args, teacher_args):
    """Decode with E2E-TTS model."""
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info("args: " + key + ": " + str(vars(args)[key]))
    
    train_args.encoder_resume = None
    # define model
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args, args, teacher_args)
    
    assert isinstance(model, TTSInterface)
    logging.info(model)

    # load trained model parameters
    logging.info("reading model parameters from " + args.model)
    if args.use_amp:
        checkpoint = torch.load(args.model)
        model.load_state_dict(checkpoint['model'])
    else:
        torch_load(args.model, model)
    model.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # read json data
    with open(args.json, "rb") as f:
        js = json.load(f)["utts"]

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    from io_utils_fcl import LoadInputsAndTargets
        
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="tts",
        load_input=False,
        sort_in_input_length=False,
        use_second_target=False,
        use_speaker_embedding=train_args.use_speaker_embedding,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
        pad_eos=args.pad_eos,
    )

    os.makedirs(os.path.dirname(args.out), exist_ok=True)
    # define writer instances
    feat_writer = kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(o=args.out))
    inference_speeds = []
    # start decoding
    for idx, utt_id in enumerate(js.keys()):
        # setup inputs
        batch = [(utt_id, js[utt_id])]
        data = load_inputs_and_targets(batch)
        # print(data)
        x = torch.LongTensor(data[0][0]).to(device)
        spemb = None
        if train_args.use_speaker_embedding:
            spemb = torch.FloatTensor(data[1][0]).to(device)

        # decode and write
        start_time = time.time()
        outs = model.inference(x, args, spemb=spemb) 
        inference_speed = int(outs.size(0)) / (time.time() - start_time)
        inference_speeds.append(inference_speed)
        logging.info(
            "inference speed = %.1f frames / sec."
            % (inference_speed)
        )
        feat_writer[utt_id] = outs.cpu().numpy()
                               
    avg_infer_speed = sum(inference_speeds) / (idx + 1)
    logging.info("average inference speed = %.1f frames / sec." % avg_infer_speed)
    exp_name = args.model.split('/')[-3]
    with open(f'{exp_name}.txt', 'w') as fp:
        fp.write(str(avg_infer_speed))
    # close file object
    feat_writer.close()
Example #16
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    threshold: float,
    minlenratio: float,
    maxlenratio: float,
    use_att_constraint: bool,
    backward_window: int,
    forward_window: int,
    allow_variable_data_keys: bool,
    vocoder_conf: dict,
):
    """Perform TTS model decoding."""
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build model
    model, train_args = TTSTask.build_model_from_file(train_config, model_file,
                                                      device)
    model.to(dtype=getattr(torch, dtype)).eval()
    tts = model.tts
    normalize = model.normalize
    logging.info(f"Normalization:\n{normalize}")
    logging.info(f"TTS:\n{tts}")

    # 3. Build data-iterator
    loader = TTSTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=TTSTask.build_preprocess_fn(train_args, False),
        collate_fn=TTSTask.build_collate_fn(train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Build converter from spectrogram to waveform
    if model.feats_extract is not None:
        vocoder_conf.update(model.feats_extract.get_parameters())
    if "n_fft" in vocoder_conf and "n_shift" in vocoder_conf and "fs" in vocoder_conf:
        spc2wav = Spectrogram2Waveform(**vocoder_conf)
        logging.info(f"Vocoder: {spc2wav}")
    else:
        spc2wav = None
        logging.info(
            "Vocoder is not used because vocoder_conf is not sufficient")

    # 5. Start for-loop
    output_dir = Path(output_dir)
    (output_dir / "norm").mkdir(parents=True, exist_ok=True)
    (output_dir / "denorm").mkdir(parents=True, exist_ok=True)
    (output_dir / "wav").mkdir(parents=True, exist_ok=True)

    # FIXME(kamo): I think we shouldn't depend on kaldi-format any more.
    #  How about numpy or HDF5?
    #  >>> with NpyScpWriter() as f:
    with kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(
            o=output_dir / "norm/feats")) as f, kaldiio.WriteHelper(
                "ark,scp:{o}.ark,{o}.scp".format(o=output_dir /
                                                 "denorm/feats")) as g:
        for idx, (keys, batch) in enumerate(loader, 1):
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            batch = to_device(batch, device)

            key = keys[0]
            # Change to single sequence and remove *_length
            # because inference() requires 1-seq, not mini-batch.
            _data = {
                k: v[0]
                for k, v in batch.items() if not k.endswith("_lengths")
            }
            start_time = time.perf_counter()

            # TODO(kamo): Now att_ws is not used.
            outs, probs, att_ws = tts.inference(
                **_data,
                threshold=threshold,
                maxlenratio=maxlenratio,
                minlenratio=minlenratio,
            )
            outs_denorm = normalize.inverse(outs[None])[0][0]
            insize = next(iter(_data.values())).size(0)
            logging.info("inference speed = {} msec / frame.".format(
                (time.perf_counter() - start_time) /
                (int(outs.size(0)) * 1000)))
            logging.info(f"{key} (size:{insize}->{outs.size(0)})")
            if outs.size(0) == insize * maxlenratio:
                logging.warning(
                    f"output length reaches maximum length ({key}).")
            f[key] = outs.cpu().numpy()
            g[key] = outs_denorm.cpu().numpy()

            # TODO(kamo): Write scp
            if spc2wav is not None:
                wav = spc2wav(outs_denorm.cpu().numpy())
                sf.write(f"{output_dir}/wav/{key}.wav", wav, spc2wav.fs,
                         "PCM_16")
Example #17
def convert(args):
    src_wav_path = args.source_wav
    ref_wav_path = args.reference_wav

    out_dir = args.converted_wav_path
    os.makedirs(out_dir, exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(in_channels=80,
                      channels=512,
                      n_embeddings=512,
                      z_dim=64,
                      c_dim=256)
    encoder_lf0 = Encoder_lf0()
    encoder_spk = Encoder_spk()
    decoder = Decoder_ac(dim_neck=64)
    encoder.to(device)
    encoder_lf0.to(device)
    encoder_spk.to(device)
    decoder.to(device)

    checkpoint_path = args.model_path
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    encoder_spk.load_state_dict(checkpoint["encoder_spk"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    encoder_spk.eval()
    decoder.eval()

    mel_stats = np.load('./mel_stats/stats.npy')
    mean = mel_stats[0]
    std = mel_stats[1]
    feat_writer = kaldiio.WriteHelper(
        "ark,scp:{o}.ark,{o}.scp".format(o=str(out_dir) + '/feats.1'))
    src_mel, src_lf0 = extract_logmel(src_wav_path, mean, std)
    ref_mel, _ = extract_logmel(ref_wav_path, mean, std)
    src_mel = torch.FloatTensor(src_mel.T).unsqueeze(0).to(device)
    src_lf0 = torch.FloatTensor(src_lf0).unsqueeze(0).to(device)
    ref_mel = torch.FloatTensor(ref_mel.T).unsqueeze(0).to(device)
    out_filename = os.path.basename(src_wav_path).split('.')[0]
    with torch.no_grad():
        z, _, _, _ = encoder.encode(src_mel)
        lf0_embs = encoder_lf0(src_lf0)
        spk_emb = encoder_spk(ref_mel)
        output = decoder(z, lf0_embs, spk_emb)

        feat_writer[out_filename +
                    '_converted'] = output.squeeze(0).cpu().numpy()
        feat_writer[out_filename +
                    '_source'] = src_mel.squeeze(0).cpu().numpy().T
        feat_writer[out_filename +
                    '_reference'] = ref_mel.squeeze(0).cpu().numpy().T

    feat_writer.close()
    print('synthesize waveform...')
    cmd = ['parallel-wavegan-decode', '--checkpoint', \
           './vocoder/checkpoint-3000000steps.pkl', \
           '--feats-scp', f'{str(out_dir)}/feats.1.scp', '--outdir', str(out_dir)]
    subprocess.call(cmd)
Example #18
def convert(cfg):
    src_wav_paths = glob(
        '/Dataset/VCTK-Corpus/wav48_silence_trimmed/p225/*mic1.flac'
    )  # modified to absolute wavs path, can select any unseen speakers
    src_wav_paths = select_wavs(src_wav_paths)

    tar1_wav_paths = glob(
        '/Dataset/VCTK-Corpus/wav48_silence_trimmed/p231/*mic1.flac'
    )  # can select any unseen speakers
    tar2_wav_paths = glob(
        '/Dataset/VCTK-Corpus/wav48_silence_trimmed/p243/*mic1.flac'
    )  # can select any unseen speakers
    # tar1_wav_paths = select_wavs(tar1_wav_paths)
    # tar2_wav_paths = select_wavs(tar2_wav_paths)
    tar1_wav_paths = [sorted(tar1_wav_paths)[0]]
    tar2_wav_paths = [sorted(tar2_wav_paths)[0]]

    print('len(src):', len(src_wav_paths), 'len(tar1):', len(tar1_wav_paths),
          'len(tar2):', len(tar2_wav_paths))

    tmp = cfg.checkpoint.split('/')
    steps = tmp[-1].split('-')[-1].split('.')[0]
    out_dir = f'test/{tmp[-3]}-{tmp[-2]}-{steps}'
    out_dir = Path(utils.to_absolute_path(out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    encoder_lf0 = Encoder_lf0()
    encoder_spk = Encoder_spk()
    decoder = Decoder_ac(dim_neck=64)
    encoder.to(device)
    encoder_lf0.to(device)
    encoder_spk.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    encoder_spk.load_state_dict(checkpoint["encoder_spk"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    encoder_spk.eval()
    decoder.eval()

    mel_stats = np.load('./data/mel_stats.npy')
    mean = mel_stats[0]
    std = mel_stats[1]
    feat_writer = kaldiio.WriteHelper(
        "ark,scp:{o}.ark,{o}.scp".format(o=str(out_dir) + '/feats.1'))
    for i, src_wav_path in tqdm(enumerate(src_wav_paths, 1)):
        if i > 10:
            break
        mel, lf0 = extract_logmel(src_wav_path, mean, std)
        if i % 2 == 1:
            ref_wav_path = random.choice(tar2_wav_paths)
            tar = 'tarMale_'
        else:
            ref_wav_path = random.choice(tar1_wav_paths)
            tar = 'tarFemale_'
        ref_mel, _ = extract_logmel(ref_wav_path, mean, std)

        mel = torch.FloatTensor(mel.T).unsqueeze(0).to(device)
        lf0 = torch.FloatTensor(lf0).unsqueeze(0).to(device)
        ref_mel = torch.FloatTensor(ref_mel.T).unsqueeze(0).to(device)

        out_filename = os.path.basename(src_wav_path).split('.')[0]
        with torch.no_grad():
            z, _, _, _ = encoder.encode(mel)
            lf0_embs = encoder_lf0(lf0)
            spk_embs = encoder_spk(ref_mel)
            output = decoder(z, lf0_embs, spk_embs)

            logmel = output.squeeze(0).cpu().numpy()
            feat_writer[out_filename] = logmel
            feat_writer[out_filename + '_src'] = mel.squeeze(0).cpu().numpy().T
            feat_writer[out_filename +
                        '_ref'] = ref_mel.squeeze(0).cpu().numpy().T

        subprocess.call(['cp', src_wav_path, out_dir])

    feat_writer.close()
    print('synthesize waveform...')
    cmd = ['parallel-wavegan-decode', '--checkpoint', \
           '/vocoder/checkpoint-3000000steps.pkl', \
           '--feats-scp', f'{str(out_dir)}/feats.1.scp', '--outdir', str(out_dir)]
    subprocess.call(cmd)
Example #19
def test_load_inputs_and_targets_legacy_format_multi_inputs(tmpdir):
    # batch = [("F01_050C0101_PED_REAL",
    #          {"input": [{"feat": "some/path1.ark:123",
    #                      "name": "input1"}
    #                     {"feat": "some/path2.ark:123"
    #                      "name": "input2"}],
    #           "output": [{"tokenid": "1 2 3 4"}],
    ark_1 = str(tmpdir.join("test_1.ark"))
    scp_1 = str(tmpdir.join("test_1.scp"))

    ark_2 = str(tmpdir.join("test_2.ark"))
    scp_2 = str(tmpdir.join("test_2.scp"))

    desire_xs_1 = []
    desire_xs_2 = []
    desire_ys = []
    with kaldiio.WriteHelper("ark,scp:{},{}".format(ark_1, scp_1)) as f:
        for i in range(10):
            x = np.random.random((100, 100)).astype(np.float32)
            uttid = "uttid{}".format(i)
            f[uttid] = x
            desire_xs_1.append(x)
            desire_ys.append(np.array([1, 2, 3, 4]))

    with kaldiio.WriteHelper("ark,scp:{},{}".format(ark_2, scp_2)) as f:
        for i in range(10):
            x = np.random.random((100, 100)).astype(np.float32)
            uttid = "uttid{}".format(i)
            f[uttid] = x
            desire_xs_2.append(x)
            desire_ys.append(np.array([1, 2, 3, 4]))

    batch = []
    with open(scp_1, "r") as f:
        lines_1 = f.readlines()
    with open(scp_2, "r") as f:
        lines_2 = f.readlines()

    for line_1, line_2 in zip(lines_1, lines_2):
        uttid, path_1 = line_1.strip().split()
        uttid, path_2 = line_2.strip().split()
        batch.append((
            uttid,
            {
                "input": [
                    {
                        "feat": path_1,
                        "name": "input1"
                    },
                    {
                        "feat": path_2,
                        "name": "input2"
                    },
                ],
                "output": [{
                    "tokenid": "1 2 3 4",
                    "name": "target1"
                }],
            },
        ))

    load_inputs_and_targets = LoadInputsAndTargets()
    xs_1, xs_2, ys = load_inputs_and_targets(batch)
    for x, xd in zip(xs_1, desire_xs_1):
        np.testing.assert_array_equal(x, xd)
    for x, xd in zip(xs_2, desire_xs_2):
        np.testing.assert_array_equal(x, xd)
    for y, yd in zip(ys, desire_ys):
        np.testing.assert_array_equal(y, yd)
Example #20
def gta_inference(args):
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info("args: " + key + ": " + str(vars(args)[key]))

    # define model
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    assert isinstance(model, TTSInterface)
    logging.info(model)

    # load trained model parameters
    logging.info("reading model parameters from " + args.model)
    torch_load(args.model, model)
    model.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # read json data
    with open(args.json, "rb") as f:
        js = json.load(f)["utts"]

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    use_sortagrad = train_args.sortagrad == -1 or train_args.sortagrad > 0
    if use_sortagrad:
        train_args.batch_sort_key = "input"

    if args.batch_size is not None:
        assert args.batch_size > 0
        batch_size = args.batch_size
    else:
        # fall back to the training-time batch size (assumed present on
        # train_args) rather than passing None to make_batchset
        batch_size = train_args.batch_size

    # make minibatch list (variable length)
    train_batchset = make_batchset(
        js,
        batch_size,
        train_args.maxlen_in,
        train_args.maxlen_out,
        train_args.minibatches,
        batch_sort_key=train_args.batch_sort_key,
        min_batch_size=train_args.ngpu if train_args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=train_args.batch_count,
        batch_bins=train_args.batch_bins,
        batch_frames_in=train_args.batch_frames_in,
        batch_frames_out=train_args.batch_frames_out,
        batch_frames_inout=train_args.batch_frames_inout,
        swap_io=True,
        iaxis=0,
        oaxis=0,
    )
    load_tr = LoadInputsAndTargets(
        mode="tts",
        use_speaker_embedding=train_args.use_speaker_embedding,
        use_second_target=train_args.use_second_target,
        use_character_embedding=train_args.use_character_embedding,
        use_intonation_type=train_args.use_intonation_type,
        preprocess_conf=train_args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
        keep_all_data_on_mem=train_args.keep_all_data_on_mem,
    )

    converter = CustomConverter()

    # hack to make the batch size argument 1;
    # the actual batch is wrapped in a list
    def transform(data, loader, converter):
        batch, utt_list = loader(data, return_uttid=True)
        batch = converter([batch])
        return batch, utt_list

    train_dataset = TransformDataset(
        train_batchset, lambda data: transform(data, load_tr, converter))

    feat_writer = kaldiio.WriteHelper(
        "ark,scp:{o}.ark,{o}.scp".format(o=args.out))

    for batch, utt_list in train_dataset:
        x = batch
        for key in x.keys():
            x[key] = x[key].to(device)

        outputs = model.gta_inference(**x)
        olens = x['olens']

        batch_size = olens.shape[0]
        for i in range(batch_size):
            utt_id = utt_list[i]
            mlspec = outputs[i]
            ol = olens[i]
            feat_writer[utt_id] = mlspec[:ol].cpu().numpy()

    feat_writer.close()
Example #21
    def __init__(self, wspecifier, filetype='mat',
                 write_num_frames=None, compress=False, compression_method=2,
                 pcm_format='wav'):
        self.writer_scp = None
        # Used for writing scp
        self.filename = None
        self.filetype = filetype
        # Used for filetype='sound' or 'sound.hdf5'
        self.pcm_format = pcm_format
        self.kwargs = {}

        if filetype == 'mat':
            if compress:
                self.writer = kaldiio.WriteHelper(
                    wspecifier, compression_method=compression_method)
            else:
                self.writer = kaldiio.WriteHelper(wspecifier)

        elif filetype in ['hdf5', 'sound.hdf5', 'sound']:
            # 1. Create spec_dict

            # e.g.
            #   ark,scp:out.ark,out.scp -> {'ark': 'out.ark', 'scp': 'out.scp'}
            ark_scp, filepath = wspecifier.split(':', 1)
            if ark_scp not in ['ark', 'scp,ark', 'ark,scp']:
                raise ValueError(
                    '{} is not allowed: {}'.format(ark_scp, wspecifier))
            ark_scps = ark_scp.split(',')
            filepaths = filepath.split(',')
            if len(ark_scps) != len(filepaths):
                raise ValueError(
                    'Mismatch: {} and {}'.format(ark_scp, filepath))
            spec_dict = dict(zip(ark_scps, filepaths))

            # 2. Set writer
            self.filename = spec_dict['ark']

            if filetype == 'sound.hdf5':
                self.writer = SoundHDF5File(spec_dict['ark'], 'w',
                                            format=self.pcm_format)

            elif filetype == 'hdf5':
                self.writer = h5py.File(spec_dict['ark'], 'w')

            elif filetype == 'sound':
                # Use "ark" value as directory to save wav files
                # e.g. ark,scp:dirname,wav.scp
                # -> The wave files are found in dirname/*.wav
                wavdir = spec_dict['ark']
                if not os.path.exists(wavdir):
                    os.makedirs(wavdir)
                self.writer = None
            else:
                # Cannot reach
                raise RuntimeError

            # 3. Set writer_scp
            if 'scp' in spec_dict:
                self.writer_scp = io.open(
                    spec_dict['scp'], 'w', encoding='utf-8')

        else:
            raise ValueError('Not supporting: filetype={}'.format(filetype))

        if write_num_frames is not None:
            if ':' not in write_num_frames:
                raise ValueError('Must include ":", write_num_frames={}'
                                 .format(write_num_frames))

            nframes_type, nframes_file = write_num_frames.split(':', 1)
            if nframes_type != 'ark,t':
                raise ValueError(
                    'Only supporting text mode. '
                    'e.g. --write-num-frames=ark,t:foo.txt :'
                    '{}'.format(nframes_type))

            self.writer_nframe = io.open(nframes_file, 'w', encoding='utf-8')
        else:
            self.writer_nframe = None
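The corresponding write path for filetype='sound' is not shown in this snippet; since the writer is None and the 'ark' entry is used as a directory, a plausible sketch of that branch (assuming the soundfile package and the attributes set in __init__; the method name is illustrative):

import os
import soundfile

def write_sound(self, key, rate_and_array):
    # One <pcm_format> file per utterance under the 'ark' directory,
    # plus an scp line mapping the key to that file.
    rate, array = rate_and_array
    wavfile = os.path.join(self.filename, key + '.' + self.pcm_format)
    soundfile.write(wavfile, array, rate)
    if self.writer_scp is not None:
        self.writer_scp.write('{} {}\n'.format(key, wavfile))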
Example #22
# Read features from a feats.scp and write one filler vector per utterance
# to a new ark/scp pair at the target path.

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "feats_file",
        help="path to the feats.scp we're going to use as example")
    parser.add_argument("target_file",
                        help="path to target file (without the scp extension)")
    parser.add_argument("--filler",
                        type=int,
                        default=1,
                        help="value to fille the matrix with (1 or 0)")

    args, leftovers = parser.parse_known_args()

    with kaldiio.ReadHelper('scp:{}'.format(args.feats_file)) as reader:
        feats = {}
        for key, numpy_array in reader:
            feats[key] = numpy_array

    with kaldiio.WriteHelper('ark,scp:{}.ark,{}.scp'.format(
            args.target_file, args.target_file)) as writer:
        for key, value in feats.items():
            vec = np.full(len(value), args.filler, dtype=np.float32)
            writer(key, vec)
Example #23
def decode(args):
    """Decode with E2E-TTS model."""
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info("args: " + key + ": " + str(vars(args)[key]))

    # define model
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    assert isinstance(model, TTSInterface)
    logging.info(model)

    # load trained model parameters
    logging.info("reading model parameters from " + args.model)
    torch_load(args.model, model)
    model.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # read json data
    with open(args.json, "rb") as f:
        js = json.load(f)["utts"]

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    load_inputs_and_targets = LoadInputsAndTargets(
        mode="tts",
        load_input=False,
        sort_in_input_length=False,
        use_speaker_embedding=train_args.use_speaker_embedding,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
    )

    # define function for plot prob and att_ws
    def _plot_and_save(array, figname, figsize=(6, 4), dpi=150):
        import matplotlib.pyplot as plt

        shape = array.shape
        if len(shape) == 1:
            # for eos probability
            plt.figure(figsize=figsize, dpi=dpi)
            plt.plot(array)
            plt.xlabel("Frame")
            plt.ylabel("Probability")
            plt.ylim([0, 1])
        elif len(shape) == 2:
            # for tacotron 2 attention weights, whose shape is (out_length, in_length)
            plt.figure(figsize=figsize, dpi=dpi)
            plt.imshow(array, aspect="auto")
            plt.xlabel("Input")
            plt.ylabel("Output")
        elif len(shape) == 4:
            # for transformer attention weights,
            # whose shape is (#layers, #heads, out_length, in_length)
            plt.figure(figsize=(figsize[0] * shape[0], figsize[1] * shape[1]),
                       dpi=dpi)
            for idx1, xs in enumerate(array):
                for idx2, x in enumerate(xs, 1):
                    plt.subplot(shape[0], shape[1], idx1 * shape[1] + idx2)
                    plt.imshow(x, aspect="auto")
                    plt.xlabel("Input")
                    plt.ylabel("Output")
        else:
            raise NotImplementedError("Support only from 1D to 4D array.")
        plt.tight_layout()
        if not os.path.exists(os.path.dirname(figname)):
            # NOTE: exist_ok = True is needed for parallel process decoding
            os.makedirs(os.path.dirname(figname), exist_ok=True)
        plt.savefig(figname)
        plt.close()

    # define function to calculate focus rate
    # (see section 3.3 in https://arxiv.org/abs/1905.09263)
    def _calculate_focus_rate(att_ws):
        if att_ws is None:
            # fastspeech case -> None
            return 1.0
        elif len(att_ws.shape) == 2:
            # tacotron 2 case -> (L, T)
            return float(att_ws.max(dim=-1)[0].mean())
        elif len(att_ws.shape) == 4:
            # transformer case -> (#layers, #heads, L, T)
            return float(att_ws.max(dim=-1)[0].mean(dim=-1).max())
        else:
            raise ValueError("att_ws should be 2 or 4 dimensional tensor.")

    # define function to convert attention to duration
    def _convert_att_to_duration(att_ws):
        if len(att_ws.shape) == 2:
            # tacotron 2 case -> (L, T)
            pass
        elif len(att_ws.shape) == 4:
            # transformer case -> (#layers, #heads, L, T)
            # get the most diagonal head according to focus rate
            att_ws = torch.cat([att_w for att_w in att_ws],
                               dim=0)  # (#heads * #layers, L, T)
            diagonal_scores = att_ws.max(dim=-1)[0].mean(
                dim=-1)  # (#heads * #layers,)
            diagonal_head_idx = diagonal_scores.argmax()
            att_ws = att_ws[diagonal_head_idx]  # (L, T)
        else:
            raise ValueError("att_ws should be 2 or 4 dimensional tensor.")
        # calculate duration from 2d attention weight
        durations = torch.stack(
            [att_ws.argmax(-1).eq(i).sum() for i in range(att_ws.shape[1])])
        return durations.view(-1, 1).float()

    # define writer instances
    feat_writer = kaldiio.WriteHelper(
        "ark,scp:{o}.ark,{o}.scp".format(o=args.out))
    if args.save_durations:
        dur_writer = kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(
            o=args.out.replace("feats", "durations")))
    if args.save_focus_rates:
        fr_writer = kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(
            o=args.out.replace("feats", "focus_rates")))

    # start decoding
    for idx, utt_id in enumerate(js.keys()):
        # setup inputs
        batch = [(utt_id, js[utt_id])]
        data = load_inputs_and_targets(batch)
        x = torch.LongTensor(data[0][0]).to(device)
        spemb = None
        if train_args.use_speaker_embedding:
            spemb = torch.FloatTensor(data[1][0]).to(device)

        # decode and write
        start_time = time.time()
        outs, probs, att_ws = model.inference(x, args, spemb=spemb)
        logging.info("inference speed = %.1f frames / sec." %
                     (int(outs.size(0)) / (time.time() - start_time)))
        if outs.size(0) == x.size(0) * args.maxlenratio:
            logging.warning("output length reaches maximum length (%s)." %
                            utt_id)
        focus_rate = _calculate_focus_rate(att_ws)
        logging.info("(%d/%d) %s (size: %d->%d, focus rate: %.3f)" %
                     (idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0),
                      focus_rate))
        feat_writer[utt_id] = outs.cpu().numpy()
        if args.save_durations:
            ds = _convert_att_to_duration(att_ws)
            dur_writer[utt_id] = ds.cpu().numpy()
        if args.save_focus_rates:
            fr_writer[utt_id] = np.array(focus_rate).reshape(1, 1)

        # plot and save prob and att_ws
        if probs is not None:
            _plot_and_save(
                probs.cpu().numpy(),
                os.path.dirname(args.out) + "/probs/%s_prob.png" % utt_id,
            )
        if att_ws is not None:
            _plot_and_save(
                att_ws.cpu().numpy(),
                os.path.dirname(args.out) + "/att_ws/%s_att_ws.png" % utt_id,
            )

    # close file object
    feat_writer.close()
    if args.save_durations:
        dur_writer.close()
    if args.save_focus_rates:
        fr_writer.close()
Example #24
def main(cmd_args):
    parser = get_parser()
    args, _ = parser.parse_known_args(cmd_args)

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format=
            '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s')
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s')
        logging.warning('Skip DEBUG/INFO messages')

    # display PYTHONPATH
    logging.info('python path = ' + os.environ.get('PYTHONPATH', '(None)'))

    # set random seed
    logging.info('random seed = %d' % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    set_deterministic_pytorch(args)

    logging.info("total speaker is %d" % args.nClasses)
    spk_model = SpeakerNet(nClasses=args.nClasses,
                           nPerSpeaker=args.nPerSpeaker,
                           trainfunc=args.trainfunc,
                           nOut=512)

    if args.spk_model is not None:
        # only load parameters when a checkpoint is given; setting the model
        # to None here would crash on the eval() call below
        spk_model.loadParameters(args.spk_model)
    spk_model.eval()
    mean = np.array([[
        -1.7101e+08, -1.727767e+08, -1.654258e+08, -1.568423e+08, -1.47768e+08,
        -1.355978e+08, -1.337955e+08, -1.290715e+08, -1.292888e+08,
        -1.333105e+08, -1.380836e+08, -1.388845e+08, -1.445241e+08,
        -1.438754e+08, -1.428372e+08, -1.428697e+08, -1.417773e+08,
        -1.400568e+08, -1.448087e+08, -1.459874e+08, -1.47229e+08,
        -1.490556e+08, -1.499799e+08, -1.522063e+08, -1.590756e+08,
        -1.618226e+08, -1.651485e+08, -1.684847e+08, -1.692581e+08,
        -1.714363e+08, -1.763494e+08, -1.776152e+08, -1.789162e+08,
        -1.805202e+08, -1.798933e+08, -1.818852e+08, -1.852947e+08,
        -1.860893e+08, -1.873477e+08, -1.889484e+08, -1.873008e+08,
        -1.891793e+08, -1.917609e+08, -1.932594e+08, -1.934982e+08,
        -1.90069e+08, -1.967007e+08, -1.955583e+08, -1.932292e+08,
        -2.001965e+08, -1.926799e+08, -2.013976e+08, -1.932717e+08,
        -1.997551e+08, -1.955731e+08, -1.958617e+08, -1.967825e+08,
        -1.952326e+08, -1.931164e+08, -1.947601e+08, -1.94064e+08,
        -1.937533e+08, -1.93948e+08, -1.940927e+08, -1.945755e+08,
        -1.955468e+08, -1.96344e+08, -1.963595e+08, -1.971519e+08,
        -1.991344e+08, -1.989762e+08, -2.000582e+08, -2.019397e+08,
        -2.019519e+08, -2.024301e+08, -2.031892e+08, -2.029932e+08,
        -2.029679e+08, -2.033156e+08, -2.033823e+08, -2.03208e+08,
        -2.036384e+08, -2.03879e+08, -2.04647e+08, -2.06028e+08, -2.060116e+08,
        -2.070609e+08, -2.071168e+08, -2.083309e+08, -2.092469e+08,
        -2.103796e+08, -2.122868e+08, -2.135678e+08, -2.144521e+08,
        -2.158103e+08, -2.171439e+08, -2.176665e+08, -2.191257e+08,
        -2.193856e+08, -2.21079e+08, -2.226874e+08, -2.247855e+08,
        -2.267768e+08, -2.286809e+08, -2.311216e+08, -2.33142e+08,
        -2.352095e+08, -2.373178e+08, -2.393992e+08, -2.415607e+08,
        -2.436022e+08, -2.450806e+08, -2.462217e+08, -2.47608e+08,
        -2.483978e+08, -2.495429e+08, -2.495807e+08, -2.501201e+08,
        -2.504308e+08, -2.506836e+08, -2.518955e+08, -2.528667e+08,
        -2.538843e+08, -2.553601e+08, -2.571577e+08, -2.592016e+08,
        -2.737314e+08, -3.25694e+08
    ]])
    var = np.array([[
        3.875797e+08, 3.972777e+08, 3.76892e+08, 3.590407e+08, 3.36797e+08,
        2.982351e+08, 2.993923e+08, 2.900205e+08, 2.903182e+08, 3.00258e+08,
        3.139445e+08, 3.133095e+08, 3.316776e+08, 3.290742e+08, 3.259625e+08,
        3.292938e+08, 3.253266e+08, 3.20113e+08, 3.353506e+08, 3.40549e+08,
        3.424283e+08, 3.454718e+08, 3.482779e+08, 3.577333e+08, 3.827005e+08,
        3.899876e+08, 4.01662e+08, 4.141465e+08, 4.154033e+08, 4.238292e+08,
        4.437099e+08, 4.463138e+08, 4.495017e+08, 4.545714e+08, 4.517053e+08,
        4.601415e+08, 4.730579e+08, 4.755685e+08, 4.813327e+08, 4.884872e+08,
        4.809006e+08, 4.883675e+08, 5.00223e+08, 5.064776e+08, 5.080264e+08,
        4.91717e+08, 5.215152e+08, 5.169479e+08, 5.060737e+08, 5.381505e+08,
        5.023963e+08, 5.430141e+08, 5.040811e+08, 5.339064e+08, 5.142676e+08,
        5.158492e+08, 5.202875e+08, 5.131353e+08, 5.043084e+08, 5.129934e+08,
        5.087678e+08, 5.064136e+08, 5.083315e+08, 5.083852e+08, 5.09834e+08,
        5.150194e+08, 5.177091e+08, 5.167306e+08, 5.197394e+08, 5.282414e+08,
        5.270312e+08, 5.324564e+08, 5.408028e+08, 5.407178e+08, 5.426285e+08,
        5.456758e+08, 5.454526e+08, 5.462478e+08, 5.481372e+08, 5.508704e+08,
        5.496423e+08, 5.518889e+08, 5.532486e+08, 5.56079e+08, 5.627578e+08,
        5.617894e+08, 5.666932e+08, 5.67652e+08, 5.73079e+08, 5.768822e+08,
        5.817027e+08, 5.912957e+08, 5.977753e+08, 6.0268e+08, 6.094717e+08,
        6.166043e+08, 6.196362e+08, 6.269311e+08, 6.276106e+08, 6.369116e+08,
        6.44361e+08, 6.551513e+08, 6.656342e+08, 6.762929e+08, 6.899264e+08,
        7.008929e+08, 7.117181e+08, 7.238042e+08, 7.350025e+08, 7.47482e+08,
        7.59422e+08, 7.681328e+08, 7.75756e+08, 7.834833e+08, 7.868992e+08,
        7.938968e+08, 7.929719e+08, 7.966068e+08, 7.983973e+08, 7.993377e+08,
        8.061261e+08, 8.111478e+08, 8.169364e+08, 8.25449e+08, 8.366562e+08,
        8.486715e+08, 9.377093e+08, 1.289456e+09
    ]])
    num_sum = 8.478675e+07
    with kaldiio.ReadHelper("scp:%s" %
                            args.read_file) as reader, kaldiio.WriteHelper(
                                'ark,scp:%s.ark,%s.scp' %
                                (args.write_file, args.write_file)) as writer:
        for key, numpy_array in reader:
            with torch.no_grad():
                length = len(numpy_array)
                numpy_array = numpy_array[20:-20]
                # numpy_array = numpy_array[20:]
                # numpy_array = numpy_array[:-20]
                # numpy_array = numpy_array - mean/num_sum
                # numpy_array = numpy_array / ( var/num_sum - (mean/num_sum)**2)
                torch_array = torch.from_numpy(numpy_array).unsqueeze(
                    0).float()

                logging.info(torch_array.size())
                writer[key] = spk_model(torch_array).squeeze(0).numpy()
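Whatever any of these examples wrote can be read back with kaldiio's mirror-image reader; a minimal sketch (the scp path is a placeholder):

import kaldiio

with kaldiio.ReadHelper('scp:embedding.scp') as reader:
    for key, array in reader:
        print(key, array.shape)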