Example #1
def decode(args):
    '''RUN DECODING'''
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # define model
    tacotron2 = Tacotron2(idim, odim, train_args)
    eos = str(tacotron2.idim - 1)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    torch_load(args.model, tacotron2)
    tacotron2.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.out, args.out)
    with torch.no_grad(), kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
        for idx, utt_id in enumerate(js.keys()):
            x = js[utt_id]['output'][0]['tokenid'].split() + [eos]
            x = np.fromiter(map(int, x), dtype=np.int64)
            x = torch.LongTensor(x).to(device)

            # get speaker embedding
            if train_args.use_speaker_embedding:
                spemb = kaldi_io_py.read_vec_flt(
                    js[utt_id]['input'][1]['feat'])
                spemb = torch.FloatTensor(spemb).to(device)
            else:
                spemb = None

            # decode and write
            outs, _, _ = tacotron2.inference(x, args, spemb)
            if outs.size(0) == x.size(0) * args.maxlenratio:
                logging.warning("output length reaches maximum length (%s)." %
                                utt_id)
            logging.info(
                '(%d/%d) %s (size:%d->%d)' %
                (idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0)))
            kaldi_io_py.write_mat(f, outs.cpu().numpy(), utt_id)
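A minimal sketch of how decode() above might be driven. The attribute names below (model, model_conf, json, out, ngpu and the inference length/stop options) are taken from what the snippet reads off args; all paths are hypothetical, and the real recipe builds this namespace with argparse in its decoding script.

import argparse

args = argparse.Namespace(
    model='exp/tacotron2/results/model.loss.best',  # hypothetical checkpoint
    model_conf='exp/tacotron2/results/model.json',  # hypothetical config
    json='dump/eval/data.json',                     # hypothetical data json
    out='exp/tacotron2/outputs/feats',              # prefix for .ark/.scp
    ngpu=0,
    threshold=0.5,     # assumed inference options; args is passed through to
    minlenratio=0.0,   # tacotron2.inference(x, args, spemb)
    maxlenratio=10.0,
)
decode(args)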
Example #2
def decode(args):
    '''RUN DECODING'''
    # read training config
    with open(args.model_conf, 'rb') as f:
        logging.info('reading a model config file from ' + args.model_conf)
        idim, odim, train_args = pickle.load(f)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # define output activation function
    if hasattr(train_args, 'output_activation'):
        if train_args.output_activation is None:
            output_activation_fn = None
        elif hasattr(torch.nn.functional, train_args.output_activation):
            output_activation_fn = getattr(torch.nn.functional,
                                           train_args.output_activation)
        else:
            raise ValueError('there is no such activation function (%s)' %
                             train_args.output_activation)
    else:
        output_activation_fn = None

    # define model
    tacotron2 = Tacotron2(
        idim=idim,
        odim=odim,
        spk_embed_dim=train_args.spk_embed_dim if hasattr(
            train_args, "spk_embed_dim") else None,
        embed_dim=train_args.embed_dim,
        elayers=train_args.elayers,
        eunits=train_args.eunits,
        econv_layers=train_args.econv_layers,
        econv_chans=train_args.econv_chans,
        econv_filts=train_args.econv_filts,
        dlayers=train_args.dlayers,
        dunits=train_args.dunits,
        prenet_layers=train_args.prenet_layers,
        prenet_units=train_args.prenet_units,
        postnet_layers=train_args.postnet_layers,
        postnet_chans=train_args.postnet_chans,
        postnet_filts=train_args.postnet_filts,
        adim=train_args.adim,
        aconv_chans=train_args.aconv_chans,
        aconv_filts=train_args.aconv_filts,
        output_activation_fn=output_activation_fn,
        cumulate_att_w=train_args.cumulate_att_w,
        use_batch_norm=train_args.use_batch_norm,
        use_concate=train_args.use_concate,
        dropout=train_args.dropout_rate,
        zoneout=train_args.zoneout_rate,
        threshold=args.threshold,
        maxlenratio=args.maxlenratio,
        minlenratio=args.minlenratio,
    )
    eos = str(tacotron2.idim - 1)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    tacotron2.load_state_dict(
        torch.load(args.model, map_location=lambda storage, loc: storage))
    tacotron2.eval()

    # Set gpu
    ngpu = args.ngpu
    if ngpu >= 1:
        gpu_id = range(ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        tacotron2.cuda()
    else:
        gpu_id = [-1]

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    # check the use of embedding
    # TODO(kan-bayashi): need to remove in the future
    if hasattr(train_args, "spk_embed_dim"):
        if train_args.spk_embed_dim is not None:
            train_args.use_speaker_embedding = True
        else:
            train_args.use_speaker_embedding = False
    else:
        train_args.use_speaker_embedding = False

    # TODO(kan-bayashi): need to be fixed in pytorch v4
    if not torch_is_old:
        torch.set_grad_enabled(False)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.out, args.out)
    with kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
        for idx, utt_id in enumerate(js.keys()):
            x = js[utt_id]['output'][0]['tokenid'].split() + [eos]
            x = np.fromiter(map(int, x), dtype=np.int64)
            x = torch.from_numpy(x)
            if args.ngpu > 0:
                x = x.cuda()

            # TODO(kan-bayashi): need to be fixed in pytorch v4
            if torch_is_old:
                x = Variable(x, volatile=True)

            # get speaker embedding
            if train_args.use_speaker_embedding:
                spemb = kaldi_io_py.read_vec_flt(
                    js[utt_id]['input'][1]['feat'])
                spemb = torch.from_numpy(spemb)
                # TODO(kan-bayashi): need to be fixed in pytorch v4
                if torch_is_old:
                    spemb = Variable(spemb, volatile=True)
                if args.ngpu > 0:
                    spemb = spemb.cuda()
            else:
                spemb = None

            # decode and write
            outs, _, _ = tacotron2.inference(x, spemb)
            if outs.size(0) == x.size(0) * args.maxlenratio:
                logging.warning("output length reaches maximum length (%s)." %
                                utt_id)
            logging.info(
                '(%d/%d) %s (size:%d->%d)' %
                (idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0)))
            kaldi_io_py.write_mat(f, outs.data.cpu().numpy(), utt_id)
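Example #2 resolves the output activation by name with getattr on torch.nn.functional. The same lookup pattern, isolated as a tiny runnable sketch:

import torch
import torch.nn.functional as F

def get_activation(name):
    # resolve an activation name such as 'relu' or 'tanh' to a callable
    if name is None:
        return None
    if hasattr(F, name):
        return getattr(F, name)
    raise ValueError('there is no such activation function (%s)' % name)

act = get_activation('relu')
print(act(torch.tensor([-1.0, 2.0])))  # tensor([0., 2.])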
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--fs',
                        type=int,
                        default=22050,
                        help='Sampling frequency')
    parser.add_argument('--fmax',
                        type=int,
                        default=None,
                        nargs='?',
                        help='Maximum frequency')
    parser.add_argument('--fmin',
                        type=int,
                        default=None,
                        nargs='?',
                        help='Minimum frequency')
    parser.add_argument('--n_mels',
                        type=int,
                        default=80,
                        help='Number of mel basis')
    parser.add_argument('--n_fft',
                        type=int,
                        default=1024,
                        help='FFT length in point')
    parser.add_argument('--n_shift',
                        type=int,
                        default=512,
                        help='Shift length in point')
    parser.add_argument('--window',
                        type=str,
                        default='hann',
                        choices=['hann', 'hamming'],
                        help='Type of window')
    parser.add_argument('scp', type=str, help='WAV scp files')
    parser.add_argument('out', type=str, help='Output file id')
    args = parser.parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    )

    # load scp
    with open(args.scp, 'r') as f:
        scp = [x.replace('\n', '').split() for x in f.readlines()]

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.out, args.out)

    # extract feature and then write as ark with scp format
    with kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
        for idx, (utt_id, path) in enumerate(scp, 1):
            x, fs = librosa.core.load(path, sr=None)
            assert fs == args.fs
            lmspc = logmelspectrogram(x=x,
                                      fs=args.fs,
                                      n_mels=args.n_mels,
                                      n_fft=args.n_fft,
                                      n_shift=args.n_shift,
                                      window=args.window,
                                      fmin=args.fmin,
                                      fmax=args.fmax)
            logging.info("(%d/%d) %s" % (idx, len(scp), utt_id))
            kaldi_io_py.write_mat(f, lmspc, utt_id)
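The snippet above calls a logmelspectrogram helper that is defined elsewhere in the repository. A minimal sketch of what such a helper could look like with librosa, assuming it returns a (frames, n_mels) log-mel matrix compatible with kaldi_io_py.write_mat:

import librosa
import numpy as np

def logmelspectrogram(x, fs, n_mels, n_fft, n_shift, window='hann',
                      fmin=None, fmax=None, eps=1e-10):
    # magnitude STFT: (1 + n_fft / 2, frames)
    spc = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=n_shift,
                              window=window))
    # mel filterbank: (n_mels, 1 + n_fft / 2)
    mel_basis = librosa.filters.mel(sr=fs, n_fft=n_fft, n_mels=n_mels,
                                    fmin=fmin if fmin is not None else 0,
                                    fmax=fmax)
    # log10 mel spectrogram transposed to (frames, n_mels)
    return np.log10(np.maximum(eps, np.dot(mel_basis, spc))).T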
Example #4
def encode(args):
    '''Get ASR encoded representations...probably for xvectors'''
    # seed setting
    torch.manual_seed(args.seed)

    # read training config
    idim, odim, odim_adv, train_args = get_model_conf(args.model,
                                                      args.model_conf)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args, odim_adv=odim_adv)
    model = Loss(e2e, train_args.mtlalpha)
    if train_args.rnnlm is not None:
        # set rnnlm. external rnnlm is used for recognition.
        model.predictor.rnnlm = rnnlm
    torch_load(args.model, model)
    e2e.recog_args = args

    # gpu
    if args.ngpu == 1:
        gpu_id = range(args.ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()

    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.feats_out, args.feats_out)

    if args.batchsize == 0:
        with torch.no_grad():
            with kaldi_io_py.open_or_fd(arkscp,
                                        'wb') as f, open(args.feats_in,
                                                         'rb') as f2:
                lines = f2.read().splitlines()
                for idx, line in enumerate(lines, 1):
                    line = line.strip().split()
                    name = line[0]
                    logging.info('(%d/%d) decoding %s', idx, len(lines), name)
                    feat = kaldi_io_py.read_mat(line[1])
                    rep = e2e.erep(feat)
                    logging.info('Rep shape: %s', rep.shape)
                    kaldi_io_py.write_mat(f, rep, name)
    else:
        try:
            from itertools import zip_longest as zip_longest
        except Exception:
            from itertools import izip_longest as zip_longest

        def grouper(n, iterable, fillvalue=None):
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # Create json object for batch processing
        logging.info("Creating json for batch processing...")
        js = {}
        with open(args.feats_in, 'rb') as f:
            lines = f.read().splitlines()
            for line in lines:
                line = line.strip().split()
                name = line[0]
                featpath = line[1]
                feat_shape = kaldi_io_py.read_mat(featpath).shape
                js[name] = {'feat': featpath, 'shape': feat_shape}

        # sort data
        logging.info("Sorting data for batch processing...")
        keys = list(js.keys())
        feat_lens = [js[key]['shape'][0] for key in keys]
        sorted_index = sorted(range(len(feat_lens)),
                              key=lambda i: -feat_lens[i])
        keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            with kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
                for names in grouper(args.batchsize, keys, None):
                    names = [name for name in names if name]
                    feats = [
                        kaldi_io_py.read_mat(js[name]['feat'])
                        for name in names
                    ]
                    reps, replens = e2e.erep_batch(feats)
                    print(reps.shape, replens)
                    for i, rep in enumerate(reps):
                        name = names[i]
                        kaldi_io_py.write_mat(f, rep, name)
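The batch branch above cuts the length-sorted utterance keys into fixed-size chunks with zip_longest and drops the None padding of the last chunk. The grouper pattern in isolation:

from itertools import zip_longest

def grouper(n, iterable, fillvalue=None):
    kargs = [iter(iterable)] * n
    return zip_longest(*kargs, fillvalue=fillvalue)

keys = ['utt1', 'utt2', 'utt3', 'utt4', 'utt5']
for names in grouper(2, keys, None):
    names = [name for name in names if name]  # drop the padding
    print(names)
# ['utt1', 'utt2']
# ['utt3', 'utt4']
# ['utt5']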
Example #5
def tts_decode(args):
    '''RUN DECODING'''
    # read training config
    # idim, odim, train_args = get_model_conf(args.model, args.model_conf)
    # seed setting
    torch.manual_seed(args.seed)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # read training config
    with open(args.model_conf, "rb") as f:
        logging.info('reading a model config file from ' + args.model_conf)
        idim_asr, odim_asr, train_args = pickle.load(f)


    # specify model architecture
    logging.info('reading model parameters from ' + args.model)
    e2e_asr = E2E(idim_asr, odim_asr, train_args)
    logging.info(e2e_asr)
    asr_loss = Loss(e2e_asr, train_args.mtlalpha)

    # specify model architecture for TTS
    # reverse input and output dimension
    tts_loss = setup_tts_loss(odim_asr, idim_asr - 3, train_args)
    logging.info(tts_loss)

    # define loss
    model = ASRTTSLoss(asr_loss, tts_loss, train_args)

    def cpu_loader(storage, location):
        return storage

    def remove_dataparallel(state_dict):
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if k.startswith("module."):
                k = k[7:]
            new_state_dict[k] = v
        return new_state_dict

    model.load_state_dict(
        remove_dataparallel(torch.load(args.model, map_location=cpu_loader)))

    # define model
    tacotron2 = Tacotron2(idim, odim, train_args)
    eos = str(tacotron2.idim - 1)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    torch_load(args.model, tacotron2)
    tacotron2.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.out, args.out)
    with torch.no_grad(), kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
        for idx, utt_id in enumerate(js.keys()):
            x = js[utt_id]['output'][0]['tokenid'].split() + [eos]
            x = np.fromiter(map(int, x), dtype=np.int64)
            x = torch.LongTensor(x).to(device)

            # get speaker embedding
            if train_args.use_speaker_embedding:
                spemb = kaldi_io_py.read_vec_flt(
                    js[utt_id]['input'][1]['feat'])
                spemb = torch.FloatTensor(spemb).to(device)
            else:
                spemb = None

            # decode and write
            outs, _, _ = tacotron2.inference(x, args, spemb)
            if outs.size(0) == x.size(0) * args.maxlenratio:
                logging.warning("output length reaches maximum length (%s)." %
                                utt_id)
            logging.info(
                '(%d/%d) %s (size:%d->%d)' %
                (idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0)))
            kaldi_io_py.write_mat(f, outs.cpu().numpy(), utt_id)
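All of the decoding examples pipe their output through copy-feats into a .ark/.scp pair. A quick sanity check reads the written features back with the same kaldi_io_py module, assuming it exposes the kaldi-io-for-python read_mat_scp generator; the .scp path below is hypothetical and corresponds to args.out + '.scp':

import logging
import kaldi_io_py

# iterate the generated scp and report each utterance's feature matrix shape
for utt_id, mat in kaldi_io_py.read_mat_scp('exp/tacotron2/outputs/feats.scp'):
    logging.info('%s: %s', utt_id, mat.shape)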
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--fs', type=int, help='Sampling frequency')
    parser.add_argument('--n_fft',
                        type=int,
                        default=1024,
                        help='FFT length in point')
    parser.add_argument('--n_shift',
                        type=int,
                        default=512,
                        help='Shift length in point')
    parser.add_argument('--win_length',
                        type=int,
                        default=None,
                        nargs='?',
                        help='Analysis window length in point')
    parser.add_argument('--window',
                        type=str,
                        default='hann',
                        choices=['hann', 'hamming'],
                        help='Type of window')
    parser.add_argument('--write_utt2num_frames',
                        type=strtobool,
                        default=True,
                        help='Whether to write utt2num_frames file')
    parser.add_argument('scp', type=str, help='WAV scp files')
    parser.add_argument('out', type=str, help='Output file id')
    args = parser.parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    )

    # load scp
    with open(args.scp, 'r') as f:
        scp = [x.replace('\n', '').split() for x in f.readlines()]
    if len(scp[0]) != 2:
        utt_ids = [scp_[0] for scp_ in scp]
        paths = [scp_[-2] for scp_ in scp]
        scp = [[utt_id, path] for utt_id, path in zip(utt_ids, paths)]

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    if args.write_utt2num_frames:
        job_id = "." + args.out.split(".")[-1] if args.out.split(
            ".")[-1].isdigit() else ""
        arkscp = (
            'ark:| copy-feats --print-args=false --write-num-frames=ark,t:%s '
            'ark:- ark,scp:%s.ark,%s.scp') % (os.path.dirname(
                args.out) + "/utt2num_frames" + job_id, args.out, args.out)
    else:
        arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
            args.out, args.out)

    # extract feature and then write as ark with scp format
    with kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
        for idx, (utt_id, path) in enumerate(scp, 1):
            x, fs = sf.read(path)
            assert fs == args.fs
            spc = spectrogram(x=x,
                              fs=args.fs,
                              n_fft=args.n_fft,
                              n_shift=args.n_shift,
                              win_length=args.win_length,
                              window=args.window)
            logging.info("(%d/%d) %s" % (idx, len(scp), utt_id))
            kaldi_io_py.write_mat(f, spc, utt_id)
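Example #6 relies on a spectrogram helper that is not shown. A minimal sketch of a magnitude-STFT implementation with librosa, again returning a (frames, bins) matrix for kaldi_io_py.write_mat (the real helper may differ):

import librosa
import numpy as np

def spectrogram(x, fs, n_fft, n_shift, win_length=None, window='hann'):
    # fs is kept only to mirror the call above; the STFT itself does not need it
    return np.abs(librosa.stft(x, n_fft=n_fft, hop_length=n_shift,
                               win_length=win_length, window=window)).T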