Ejemplo n.º 1
0
def test_tacotron2_multi_gpu_trainable(model_dict):
    """Run one optimizer step with Tacotron2 wrapped in ``DataParallel``.

    ``model_dict`` holds keyword overrides forwarded to ``make_model_args``.
    Two CUDA devices (ids 0 and 1) are hard-coded.
    """
    # problem sizes
    n_devices = 2
    batch_size = 10
    max_input_len = 10
    max_output_len = 10
    idim = 5
    odim = 10

    # arguments and a random batch
    model_args = make_model_args(**model_dict)
    loss_args = make_loss_args()
    cpu_batch = prepare_inputs(
        batch_size, idim, odim, max_input_len, max_output_len,
        model_args['spk_embed_dim'], model_args['spc_dim'])
    # entries are moved to the GPU lazily, when the batch is unpacked into
    # the loss call below; entries that are None are passed through as None
    gpu_batch = (None if item is None else item.cuda() for item in cpu_batch)

    # define model
    net = Tacotron2(idim, odim, Namespace(**model_args))
    net = torch.nn.DataParallel(net, list(range(n_devices)))
    criterion = Tacotron2Loss(net, **loss_args)
    adam = torch.optim.Adam(criterion.parameters())
    criterion.cuda()

    # trainable
    loss = criterion(*gpu_batch)
    adam.zero_grad()
    loss.backward()
    adam.step()
Ejemplo n.º 2
0
def test_tacotron2_multi_gpu_trainable():
    """Run one optimizer step with Tacotron2 wrapped in ``DataParallel``.

    Two CUDA devices (ids 0 and 1) are hard-coded.
    """
    # problem sizes
    n_devices = 2
    batch_size = 2
    max_input_len = 10
    max_output_len = 10
    idim = 5
    odim = 10

    # random batch, every entry moved to the GPU
    cpu_batch = prepare_inputs(batch_size, idim, odim,
                               max_input_len, max_output_len)
    xs, ilens, ys, labels, olens = [item.cuda() for item in cpu_batch]

    # define model
    model_args = make_model_args()
    loss_args = make_loss_args()
    net = Tacotron2(idim, odim, Namespace(**model_args))
    net = torch.nn.DataParallel(net, list(range(n_devices)))
    criterion = Tacotron2Loss(net, **loss_args)
    adam = torch.optim.Adam(criterion.parameters())
    criterion.cuda()

    # trainable
    loss = criterion(xs, ilens, ys, labels, olens)
    adam.zero_grad()
    loss.backward()
    adam.step()
Ejemplo n.º 3
0
def decode(args):
    '''Decode every utterance listed in ``args.json`` with a trained model.

    The model configuration is read with ``get_model_conf``, the parameters
    are loaded from ``args.model``, and the output of
    ``Tacotron2.inference`` for each utterance is written through a
    ``copy-feats`` pipe to ``<args.out>.ark`` / ``<args.out>.scp``.

    :param args: decoding options; this function reads ``model``,
        ``model_conf``, ``ngpu``, ``json``, ``out`` and ``maxlenratio`` and
        hands the whole namespace to ``Tacotron2.inference``
    '''
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key, value in sorted(vars(args).items()):
        logging.info('ARGS: ' + key + ': ' + str(value))

    # define model
    tacotron2 = Tacotron2(idim, odim, train_args)
    # the last input index is appended to every token sequence as <eos>
    eos = str(tacotron2.idim - 1)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    torch_load(args.model, tacotron2)
    tacotron2.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if outdir and not os.path.exists(outdir):
        os.makedirs(outdir)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.out, args.out)
    n_utts = len(js)  # loop invariant, only used for the progress message
    with torch.no_grad(), kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
        for idx, (utt_id, utt) in enumerate(js.items(), 1):
            tokens = utt['output'][0]['tokenid'].split() + [eos]
            x = np.fromiter(map(int, tokens), dtype=np.int64)
            x = torch.LongTensor(x).to(device)

            # get speaker embedding
            if train_args.use_speaker_embedding:
                spemb = kaldi_io_py.read_vec_flt(utt['input'][1]['feat'])
                spemb = torch.FloatTensor(spemb).to(device)
            else:
                spemb = None

            # decode and write
            outs, _, _ = tacotron2.inference(x, args, spemb)
            # NOTE(review): this is an exact comparison against a possibly
            # non-integer product; if inference truncates the limit to an
            # int the warning can be missed -- confirm against Tacotron2.
            if outs.size(0) == x.size(0) * args.maxlenratio:
                # logging.warn is a deprecated alias of logging.warning
                logging.warning("output length reaches maximum length (%s)." %
                                utt_id)
            logging.info(
                '(%d/%d) %s (size:%d->%d)' %
                (idx, n_utts, utt_id, x.size(0), outs.size(0)))
            kaldi_io_py.write_mat(f, outs.cpu().numpy(), utt_id)
Ejemplo n.º 4
0
def test_tacotron2_with_speaker_embedding_trainable_and_decodable(
        model_dict, loss_dict):
    """Check one training step and decoding with speaker embeddings.

    ``model_dict`` and ``loss_dict`` are keyword overrides forwarded to
    ``make_model_args`` and ``make_loss_args``.  The global autograd mode
    is restored before returning so that later tests can still call
    ``backward``.
    """
    # setup batch
    bs = 2
    maxin_len = 10
    maxout_len = 10
    idim = 5
    odim = 10
    spk_embed_dim = 128
    # lengths sorted in descending order
    ilens = np.sort(np.random.randint(1, maxin_len, bs))[::-1].tolist()
    olens = np.sort(np.random.randint(1, maxout_len, bs))[::-1].tolist()
    xs = pad_ndarray_list(
        [np.random.randint(0, idim, ilen) for ilen in ilens], 0)
    ys = pad_ndarray_list([np.random.randn(olen, odim) for olen in olens], 0)
    xs = torch.from_numpy(xs).long()
    ys = torch.from_numpy(ys).float()
    spembs = torch.from_numpy(np.random.randn(bs, spk_embed_dim)).float()
    # TODO(kan-bayashi): need to be modified in pytorch v4
    # label is 1 from the last valid output frame onwards, 0 before it
    labels = ys.new(ys.size(0), ys.size(1)).zero_()
    for i, olen in enumerate(olens):
        labels[i, olen - 1:] = 1
    if torch_is_old:
        xs = Variable(xs)
        ys = Variable(ys)
        spembs = Variable(spembs)
        labels = Variable(labels)

    # define model
    model_args = make_model_args(spk_embed_dim=spk_embed_dim, **model_dict)
    loss_args = make_loss_args(**loss_dict)
    model = Tacotron2(idim, odim, **model_args)
    criterion = Tacotron2Loss(model, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())

    # trainable
    # the plain forward pass is exercised too; it must yield three outputs
    _, _, _ = model(xs, ilens, ys, spembs)
    loss = criterion(xs, ilens, ys, labels, olens, spembs)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # decodable
    if torch_is_old:
        xs.volatile = True
        ys.volatile = True
        previous_grad_mode = None
    else:
        # set_grad_enabled changes a global switch; remember the old value
        # so that it can be put back once decoding is done
        previous_grad_mode = torch.is_grad_enabled()
        torch.set_grad_enabled(False)
    try:
        model.eval()
        _, _, _ = model.inference(xs[0][:ilens[0]], spembs[0])
        att_ws = model.calculate_all_attentions(xs, ilens, ys, spembs)
    finally:
        if previous_grad_mode is not None:
            torch.set_grad_enabled(previous_grad_mode)
    assert att_ws.shape[0] == bs
    assert att_ws.shape[1] == max(olens)
    assert att_ws.shape[2] == max(ilens)
Ejemplo n.º 5
0
def test_tacotron2_trainable_and_decodable(model_dict, loss_dict):
    """Check one training step, inference and attention extraction.

    ``model_dict`` and ``loss_dict`` are keyword overrides forwarded to
    ``make_model_args`` and ``make_loss_args``.
    """
    # setup batch
    batch_size = 2
    max_input_len, max_output_len = 10, 10
    idim, odim = 5, 10
    xs, ilens, ys, labels, olens = prepare_inputs(
        batch_size, idim, odim, max_input_len, max_output_len)

    # define model
    model_args = make_model_args(**model_dict)
    loss_args = make_loss_args(**loss_dict)
    inference_args = make_inference_args()
    model = Tacotron2(idim, odim, Namespace(**model_args))
    criterion = Tacotron2Loss(model, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())

    # random speaker embeddings, only when the model is configured for them
    spembs = None
    if model_args['spk_embed_dim'] is not None:
        random_embeddings = np.random.randn(batch_size,
                                            model_args['spk_embed_dim'])
        spembs = torch.from_numpy(random_embeddings).float()

    # trainable
    loss = criterion(xs, ilens, ys, labels, olens, spembs)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # decodable
    model.eval()
    with torch.no_grad():
        if model_args['spk_embed_dim'] is None:
            first_spemb = None
        else:
            first_spemb = spembs[0]
        first_input = xs[0][:ilens[0]]
        # inference must return three values; they are not inspected here
        _, _, _ = model.inference(first_input,
                                  Namespace(**inference_args),
                                  first_spemb)
        att_ws = model.calculate_all_attentions(xs, ilens, ys, spembs)
    expected_dims = (batch_size, max(olens), max(ilens))
    assert att_ws.shape[0] == expected_dims[0]
    assert att_ws.shape[1] == expected_dims[1]
    assert att_ws.shape[2] == expected_dims[2]
def setup_tts_loss(odim_asr, idim_tts, args):
    """Create the Tacotron2 loss module from ``tts_*`` options.

    The ``tts_*`` attributes of ``args`` are copied into a fresh namespace
    under the option names handed to ``Tacotron2``.  The model is built
    with ``idim=odim_asr`` and ``odim=idim_tts`` and wrapped in
    ``Tacotron2Loss``.

    :param odim_asr: passed to ``Tacotron2`` as ``idim``
    :param idim_tts: passed to ``Tacotron2`` as ``odim``
    :param args: namespace holding the ``tts_*`` options
    :return: the ``Tacotron2Loss`` instance
    :raises ValueError: if ``args.tts_output_activation`` is not None and
        does not name an attribute of ``torch.nn.functional``
    """
    from argparse import Namespace

    # resolve the output activation function by name
    functional = torch.nn.functional
    activation_name = args.tts_output_activation
    if activation_name is not None and not hasattr(functional,
                                                   activation_name):
        raise ValueError('there is no such an activation function. (%s)' %
                         activation_name)
    if activation_name is None:
        activation = None
    else:
        activation = getattr(functional, activation_name)

    # translate the tts_* options into the names Tacotron2 receives
    options = Namespace(
        spk_embed_dim=args.tts_spk_embed_dim,
        embed_dim=args.tts_embed_dim,
        elayers=args.tts_elayers,
        eunits=args.tts_eunits,
        econv_layers=args.tts_econv_layers,
        econv_chans=args.tts_econv_chans,
        econv_filts=args.tts_econv_filts,
        dlayers=args.tts_dlayers,
        dunits=args.tts_dunits,
        prenet_layers=args.tts_prenet_layers,
        prenet_units=args.tts_prenet_units,
        postnet_layers=args.tts_postnet_layers,
        postnet_chans=args.tts_postnet_chans,
        postnet_filts=args.tts_postnet_filts,
        output_activation=activation,
        adim=args.tts_adim,
        aconv_chans=args.tts_aconv_chans,
        aconv_filts=args.tts_aconv_filts,
        cumulate_att_w=args.tts_cumulate_att_w,
        use_batch_norm=args.tts_use_batch_norm,
        use_concate=args.tts_use_concate,
        dropout=args.tts_dropout_rate,
        zoneout=args.tts_zoneout_rate,
        monotonic=args.tts_monotonic,
    )
    net = Tacotron2(idim=odim_asr, odim=idim_tts, args=options)
    return Tacotron2Loss(model=net,
                         use_masking=args.tts_use_masking,
                         bce_pos_weight=args.tts_bce_pos_weight)
Ejemplo n.º 7
0
def test_tacotron2_trainable_and_decodable(model_dict, loss_dict):
    """Check one training step, inference and attention extraction.

    ``model_dict`` and ``loss_dict`` are keyword overrides forwarded to
    ``make_model_args`` and ``make_loss_args``.  When the resulting model
    arguments enable CBHG or speaker embeddings, the matching dimensions
    are filled in before the batch is generated.
    """
    # make args
    model_args = make_model_args(**model_dict)
    loss_args = make_loss_args(**loss_dict)
    inference_args = make_inference_args()

    # setup batch
    batch_size = 2
    max_input_len, max_output_len = 10, 10
    idim, odim = 5, 10
    if model_args['use_cbhg']:
        model_args.update(spc_dim=129)
    if model_args['use_speaker_embedding']:
        model_args.update(spk_embed_dim=128)
    xs, ilens, ys, labels, olens, spembs, spcs = prepare_inputs(
        batch_size, idim, odim, max_input_len, max_output_len,
        model_args['spk_embed_dim'], model_args['spc_dim'])

    # define model
    model = Tacotron2(idim, odim, Namespace(**model_args))
    criterion = Tacotron2Loss(model, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())

    # trainable
    loss = criterion(xs, ilens, ys, labels, olens, spembs, spcs)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # decodable
    model.eval()
    with torch.no_grad():
        if model_args['spk_embed_dim'] is None:
            first_spemb = None
        else:
            first_spemb = spembs[0]
        first_input = xs[0][:ilens[0]]
        model.inference(first_input, Namespace(**inference_args), first_spemb)
        att_ws = model.calculate_all_attentions(xs, ilens, ys, spembs)
    expected_dims = (batch_size, max(olens), max(ilens))
    assert att_ws.shape[0] == expected_dims[0]
    assert att_ws.shape[1] == expected_dims[1]
    assert att_ws.shape[2] == expected_dims[2]
Ejemplo n.º 8
0
def train(args):
    '''Train a Tacotron2 model with a chainer ``Trainer`` loop.

    Input/output dimensions are taken from the first utterance of the
    validation JSON, the configuration is written to
    ``<args.outdir>/model.json``, and the trainer is run with evaluation,
    snapshot, best-model, attention-plot and reporting extensions.

    Side effects on ``args``: ``spk_embed_dim`` is always set, ``spc_dim``
    is set when ``args.use_cbhg`` is true, and ``batch_size`` is multiplied
    by ``args.ngpu`` when more than one GPU is used.

    :param args: training options (namespace); see the attribute reads
        below for the fields that are required
    '''
    # seed setting
    torch.manual_seed(args.seed)

    # use deterministic computation or not
    if args.debugmode < 1:
        torch.backends.cudnn.deterministic = False
        logging.info('torch cudnn deterministic is disabled')
    else:
        torch.backends.cudnn.deterministic = True

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')

    # get input and output dimension info
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())
    first_utt = valid_json[utts[0]]

    # reverse input and output dimension
    # (the model input size comes from the json 'output' entry and the
    # model output size from the json 'input' entry)
    idim = int(first_utt['output'][0]['shape'][1])
    odim = int(first_utt['input'][0]['shape'][1])
    if args.use_cbhg:
        args.spc_dim = int(first_utt['input'][1]['shape'][1])
    if args.use_speaker_embedding:
        args.spk_embed_dim = int(first_utt['input'][1]['shape'][0])
    else:
        args.spk_embed_dim = None
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.json'
    with open(model_conf, 'wb') as f:
        # the original message had no space between "to" and the path
        logging.info('writing a model config file to ' + model_conf)
        f.write(
            json.dumps((idim, odim, vars(args)), indent=4,
                       sort_keys=True).encode('utf_8'))
    for key, value in sorted(vars(args).items()):
        logging.info('ARGS: ' + key + ': ' + str(value))

    # specify model architecture
    tacotron2 = Tacotron2(idim, odim, args)
    logging.info(tacotron2)

    # check the use of multi-gpu
    if args.ngpu > 1:
        tacotron2 = torch.nn.DataParallel(tacotron2,
                                          device_ids=list(range(args.ngpu)))
        logging.info('batch size is automatically increased (%d -> %d)' %
                     (args.batch_size, args.batch_size * args.ngpu))
        args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)

    # define loss
    model = Tacotron2Loss(tacotron2, args.use_masking, args.bce_pos_weight)
    reporter = model.reporter

    # Setup an optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 args.lr,
                                 eps=args.eps,
                                 weight_decay=args.weight_decay)

    # FIXME: TOO DIRTY HACK
    # attach the reporter to the torch optimizer under the attribute names
    # 'target' and 'serialize'
    setattr(optimizer, 'target', reporter)
    setattr(optimizer, 'serialize', lambda s: reporter.serialize(s))

    # Setup a converter
    converter = CustomConverter(True, args.use_speaker_embedding,
                                args.use_cbhg)

    # read json data
    # (valid_json was already loaded above and has not been modified, so
    # only the training set has to be read here)
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']

    # make minibatch list (variable length)
    train_batchset = make_batchset(train_json, args.batch_size, args.maxlen_in,
                                   args.maxlen_out, args.minibatches,
                                   args.batch_sort_key)
    valid_batchset = make_batchset(valid_json, args.batch_size, args.maxlen_in,
                                   args.maxlen_out, args.minibatches,
                                   args.batch_sort_key)
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    train_iter = chainer.iterators.MultiprocessIterator(
        TransformDataset(train_batchset, converter.transform),
        batch_size=1,
        n_processes=2,
        n_prefetch=8,
        maxtasksperchild=20)
    valid_iter = chainer.iterators.MultiprocessIterator(
        TransformDataset(valid_batchset, converter.transform),
        batch_size=1,
        repeat=False,
        shuffle=False,
        n_processes=2,
        n_prefetch=8,
        maxtasksperchild=20)

    # Set up a trainer
    updater = CustomUpdater(model, args.grad_clip, train_iter, optimizer,
                            converter, device)
    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        logging.info('resumed from %s' % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        CustomEvaluator(model, valid_iter, reporter, converter, device))

    # Save snapshot for each epoch
    trainer.extend(torch_snapshot(), trigger=(1, 'epoch'))

    # Save best models
    trainer.extend(
        extensions.snapshot_object(tacotron2,
                                   'model.loss.best',
                                   savefun=torch_save),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))

    # Save attention figure for each epoch
    if args.num_save_attention > 0:
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        # a DataParallel wrapper keeps the real model under ``.module``
        if hasattr(tacotron2, "module"):
            att_vis_fn = tacotron2.module.calculate_all_attentions
        else:
            att_vis_fn = tacotron2.calculate_all_attentions
        trainer.extend(PlotAttentionReport(att_vis_fn,
                                           data,
                                           args.outdir + '/att_ws',
                                           converter=CustomConverter(
                                               False,
                                               args.use_speaker_embedding),
                                           device=device,
                                           reverse=True),
                       trigger=(1, 'epoch'))

    # Make a plot for training and validation values
    plot_keys = [
        'main/loss', 'validation/main/loss', 'main/l1_loss',
        'validation/main/l1_loss', 'main/mse_loss', 'validation/main/mse_loss',
        'main/bce_loss', 'validation/main/bce_loss'
    ]
    trainer.extend(
        extensions.PlotReport(['main/l1_loss', 'validation/main/l1_loss'],
                              'epoch',
                              file_name='l1_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/mse_loss', 'validation/main/mse_loss'],
                              'epoch',
                              file_name='mse_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/bce_loss', 'validation/main/bce_loss'],
                              'epoch',
                              file_name='bce_loss.png'))
    if args.use_cbhg:
        plot_keys += [
            'main/cbhg_l1_loss', 'validation/main/cbhg_l1_loss',
            'main/cbhg_mse_loss', 'validation/main/cbhg_mse_loss'
        ]
        trainer.extend(
            extensions.PlotReport(
                ['main/cbhg_l1_loss', 'validation/main/cbhg_l1_loss'],
                'epoch',
                file_name='cbhg_l1_loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/cbhg_mse_loss', 'validation/main/cbhg_mse_loss'],
                'epoch',
                file_name='cbhg_mse_loss.png'))
    # the combined plot is registered last so that it also contains the
    # CBHG keys appended above
    trainer.extend(
        extensions.PlotReport(plot_keys, 'epoch', file_name='loss.png'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(trigger=(REPORT_INTERVAL,
                                                 'iteration')))
    report_keys = plot_keys[:]
    report_keys[0:0] = ['epoch', 'iteration', 'elapsed_time']
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(REPORT_INTERVAL, 'iteration'))
    trainer.extend(extensions.ProgressBar(update_interval=REPORT_INTERVAL))

    # Run the training
    trainer.run()
Ejemplo n.º 9
0
def decode(args):
    '''Decode every utterance listed in ``args.json`` with a trained model.

    The training configuration is unpickled from ``args.model_conf``, the
    parameters are loaded from ``args.model``, and the output of
    ``Tacotron2.inference`` for each utterance is written through a
    ``copy-feats`` pipe to ``<args.out>.ark`` / ``<args.out>.scp``.

    On the newer torch code path gradient computation is switched off
    globally and is left off when the function returns.

    :param args: decoding options; this function reads ``model_conf``,
        ``model``, ``ngpu``, ``json``, ``out``, ``threshold``,
        ``maxlenratio`` and ``minlenratio``
    :raises ValueError: if the stored ``output_activation`` is not None and
        does not name an attribute of ``torch.nn.functional``
    '''
    # read training config
    # NOTE(review): pickle.load can execute arbitrary code; only use model
    # config files that come from a trusted source.
    with open(args.model_conf, 'rb') as f:
        logging.info('reading a model config file from ' + args.model_conf)
        idim, odim, train_args = pickle.load(f)

    # show arguments
    for key, value in sorted(vars(args).items()):
        logging.info('ARGS: ' + key + ': ' + str(value))

    # define output activation function
    # (configs without the attribute are treated as "no activation")
    activation_name = getattr(train_args, 'output_activation', None)
    if activation_name is None:
        output_activation_fn = None
    elif hasattr(torch.nn.functional, activation_name):
        output_activation_fn = getattr(torch.nn.functional, activation_name)
    else:
        raise ValueError('there is no such an activation function. (%s)' %
                         activation_name)

    # define model
    tacotron2 = Tacotron2(
        idim=idim,
        odim=odim,
        spk_embed_dim=getattr(train_args, 'spk_embed_dim', None),
        embed_dim=train_args.embed_dim,
        elayers=train_args.elayers,
        eunits=train_args.eunits,
        econv_layers=train_args.econv_layers,
        econv_chans=train_args.econv_chans,
        econv_filts=train_args.econv_filts,
        dlayers=train_args.dlayers,
        dunits=train_args.dunits,
        prenet_layers=train_args.prenet_layers,
        prenet_units=train_args.prenet_units,
        postnet_layers=train_args.postnet_layers,
        postnet_chans=train_args.postnet_chans,
        postnet_filts=train_args.postnet_filts,
        adim=train_args.adim,
        aconv_chans=train_args.aconv_chans,
        aconv_filts=train_args.aconv_filts,
        output_activation_fn=output_activation_fn,
        cumulate_att_w=train_args.cumulate_att_w,
        use_batch_norm=train_args.use_batch_norm,
        use_concate=train_args.use_concate,
        dropout=train_args.dropout_rate,
        zoneout=train_args.zoneout_rate,
        threshold=args.threshold,
        maxlenratio=args.maxlenratio,
        minlenratio=args.minlenratio,
    )
    # the last input index is appended to every token sequence as <eos>
    eos = str(tacotron2.idim - 1)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    tacotron2.load_state_dict(
        torch.load(args.model, map_location=lambda storage, loc: storage))
    tacotron2.eval()

    # Set gpu
    if args.ngpu >= 1:
        logging.info('gpu id: ' + str(range(args.ngpu)))
        tacotron2.cuda()

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if outdir and not os.path.exists(outdir):
        os.makedirs(outdir)

    # check the use of embedding
    # TODO(kan-bayashi): need to remove in the future
    train_args.use_speaker_embedding = getattr(
        train_args, 'spk_embed_dim', None) is not None

    # TODO(kan-bayashi): need to be fixed in pytorch v4
    if not torch_is_old:
        torch.set_grad_enabled(False)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.out, args.out)
    n_utts = len(js)  # loop invariant, only used for the progress message
    with kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
        for idx, (utt_id, utt) in enumerate(js.items(), 1):
            tokens = utt['output'][0]['tokenid'].split() + [eos]
            x = np.fromiter(map(int, tokens), dtype=np.int64)
            x = torch.from_numpy(x)
            if args.ngpu > 0:
                x = x.cuda()

            # TODO(kan-bayashi): need to be fixed in pytorch v4
            if torch_is_old:
                x = Variable(x, volatile=True)

            # get speaker embedding
            if train_args.use_speaker_embedding:
                spemb = kaldi_io_py.read_vec_flt(utt['input'][1]['feat'])
                spemb = torch.from_numpy(spemb)
                # TODO(kan-bayashi): need to be fixed in pytorch v4
                if torch_is_old:
                    spemb = Variable(spemb, volatile=True)
                if args.ngpu > 0:
                    spemb = spemb.cuda()
            else:
                spemb = None

            # decode and write
            outs, _, _ = tacotron2.inference(x, spemb)
            # NOTE(review): this is an exact comparison against a possibly
            # non-integer product; if inference truncates the limit to an
            # int the warning can be missed -- confirm against Tacotron2.
            if outs.size(0) == x.size(0) * args.maxlenratio:
                # logging.warn is a deprecated alias of logging.warning
                logging.warning("output length reaches maximum length (%s)." %
                                utt_id)
            logging.info(
                '(%d/%d) %s (size:%d->%d)' %
                (idx, n_utts, utt_id, x.size(0), outs.size(0)))
            kaldi_io_py.write_mat(f, outs.data.cpu().numpy(), utt_id)
Ejemplo n.º 10
0
def train(args):
    '''Train a Tacotron2 model with a chainer ``Trainer`` loop.

    Input/output dimensions are taken from the first utterance of the
    validation JSON, the configuration is pickled to
    ``<args.outdir>/model.conf``, and the trainer is run with evaluation,
    snapshot, attention-plot, model-saving and reporting extensions.

    Side effects on ``args``: ``spk_embed_dim`` is always set and
    ``batch_size`` is multiplied by ``args.ngpu`` when more than one GPU is
    used.

    :param args: training options (namespace); see the attribute reads
        below for the fields that are required
    :raises ValueError: if ``args.output_activation`` is not None and does
        not name an attribute of ``torch.nn.functional``
    '''
    # seed setting
    torch.manual_seed(args.seed)

    # use deterministic computation or not
    if args.debugmode < 1:
        torch.backends.cudnn.deterministic = False
        logging.info('torch cudnn deterministic is disabled')
    else:
        torch.backends.cudnn.deterministic = True

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')

    # get input and output dimension info
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())
    first_utt = valid_json[utts[0]]

    # reverse input and output dimension
    # (the model input size comes from the json 'output' entry and the
    # model output size from the json 'input' entry)
    idim = int(first_utt['output'][0]['shape'][1])
    odim = int(first_utt['input'][0]['shape'][1])
    if args.use_speaker_embedding:
        args.spk_embed_dim = int(first_utt['input'][1]['shape'][0])
    else:
        args.spk_embed_dim = None
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.conf'
    with open(model_conf, 'wb') as f:
        # the original message had no space between "to" and the path
        logging.info('writing a model config file to ' + model_conf)
        pickle.dump((idim, odim, args), f)
    for key, value in sorted(vars(args).items()):
        logging.info('ARGS: ' + key + ': ' + str(value))

    # define output activation function
    if args.output_activation is None:
        output_activation_fn = None
    elif hasattr(torch.nn.functional, args.output_activation):
        output_activation_fn = getattr(torch.nn.functional,
                                       args.output_activation)
    else:
        raise ValueError('there is no such an activation function. (%s)' %
                         args.output_activation)

    # specify model architecture
    tacotron2 = Tacotron2(idim=idim,
                          odim=odim,
                          spk_embed_dim=args.spk_embed_dim,
                          embed_dim=args.embed_dim,
                          elayers=args.elayers,
                          eunits=args.eunits,
                          econv_layers=args.econv_layers,
                          econv_chans=args.econv_chans,
                          econv_filts=args.econv_filts,
                          dlayers=args.dlayers,
                          dunits=args.dunits,
                          prenet_layers=args.prenet_layers,
                          prenet_units=args.prenet_units,
                          postnet_layers=args.postnet_layers,
                          postnet_chans=args.postnet_chans,
                          postnet_filts=args.postnet_filts,
                          output_activation_fn=output_activation_fn,
                          adim=args.adim,
                          aconv_chans=args.aconv_chans,
                          aconv_filts=args.aconv_filts,
                          cumulate_att_w=args.cumulate_att_w,
                          use_batch_norm=args.use_batch_norm,
                          use_concate=args.use_concate,
                          dropout=args.dropout_rate,
                          zoneout=args.zoneout_rate)
    logging.info(tacotron2)

    # Set gpu
    # gpu_id is later handed to CustomConverter; [-1] is used when no GPU
    # is requested
    ngpu = args.ngpu
    if ngpu == 1:
        gpu_id = range(ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        tacotron2.cuda()
    elif ngpu > 1:
        gpu_id = range(ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        tacotron2 = torch.nn.DataParallel(tacotron2, device_ids=gpu_id)
        tacotron2.cuda()
        logging.info('batch size is automatically increased (%d -> %d)' %
                     (args.batch_size, args.batch_size * ngpu))
        args.batch_size *= ngpu
    else:
        gpu_id = [-1]

    # define loss
    model = Tacotron2Loss(model=tacotron2,
                          use_masking=args.use_masking,
                          bce_pos_weight=args.bce_pos_weight)
    reporter = model.reporter

    # Setup an optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 args.lr,
                                 eps=args.eps,
                                 weight_decay=args.weight_decay)

    # FIXME: TOO DIRTY HACK
    # attach the reporter to the torch optimizer under the attribute names
    # 'target' and 'serialize'
    setattr(optimizer, 'target', reporter)
    setattr(optimizer, 'serialize', lambda s: reporter.serialize(s))

    # read json data
    # (valid_json was already loaded above and has not been modified, so
    # only the training set has to be read here)
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']

    # make minibatch list (variable length)
    train_batchset = make_batchset(train_json, args.batch_size, args.maxlen_in,
                                   args.maxlen_out, args.minibatches,
                                   args.batch_sort_key)
    valid_batchset = make_batchset(valid_json, args.batch_size, args.maxlen_in,
                                   args.maxlen_out, args.minibatches,
                                   args.batch_sort_key)
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    train_iter = chainer.iterators.SerialIterator(train_batchset, 1)
    valid_iter = chainer.iterators.SerialIterator(valid_batchset,
                                                  1,
                                                  repeat=False,
                                                  shuffle=False)

    # Set up a trainer
    converter = CustomConverter(gpu_id, True, args.use_speaker_embedding)
    updater = CustomUpdater(model, args.grad_clip, train_iter, optimizer,
                            converter)
    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        logging.info('restored from %s' % args.resume)
        chainer.serializers.load_npz(args.resume, trainer)
        # the torch parameters are reloaded from the per-epoch model file
        # that matches the epoch stored in the restored trainer
        torch_load(args.outdir + '/model.ep.%d' % trainer.updater.epoch,
                   tacotron2)
        model = trainer.updater.model

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(CustomEvaluator(model, valid_iter, reporter, converter))

    # Take a snapshot for each specified epoch
    trainer.extend(
        extensions.snapshot(filename='snapshot.ep.{.updater.epoch}'),
        trigger=(1, 'epoch'))

    # Save attention figure for each epoch
    if args.num_save_attention > 0:
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        trainer.extend(PlotAttentionReport(
            tacotron2, data, args.outdir + '/att_ws',
            CustomConverter(gpu_id, False, args.use_speaker_embedding), True),
                       trigger=(1, 'epoch'))

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport([
            'main/loss', 'validation/main/loss', 'main/l1_loss',
            'validation/main/l1_loss', 'main/mse_loss',
            'validation/main/mse_loss', 'main/bce_loss',
            'validation/main/bce_loss'
        ],
                              'epoch',
                              file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/l1_loss', 'validation/main/l1_loss'],
                              'epoch',
                              file_name='l1_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/mse_loss', 'validation/main/mse_loss'],
                              'epoch',
                              file_name='mse_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/bce_loss', 'validation/main/bce_loss'],
                              'epoch',
                              file_name='bce_loss.png'))

    # Save model for each epoch
    trainer.extend(extensions.snapshot_object(tacotron2,
                                              'model.ep.{.updater.epoch}',
                                              savefun=torch_save),
                   trigger=(1, 'epoch'))

    # Save best models
    trainer.extend(
        extensions.snapshot_object(tacotron2,
                                   'model.loss.best',
                                   savefun=torch_save),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(trigger=(100, 'iteration')))
    report_keys = [
        'epoch', 'iteration', 'elapsed_time', 'main/loss', 'main/l1_loss',
        'main/mse_loss', 'main/bce_loss', 'validation/main/loss',
        'validation/main/l1_loss', 'validation/main/mse_loss',
        'validation/main/bce_loss'
    ]
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(100, 'iteration'))
    trainer.extend(extensions.ProgressBar())

    # Run the training
    trainer.run()
Ejemplo n.º 11
0
def tts_decode(args):
    '''Run Tacotron2 decoding and write the generated features as ark/scp.

    The function restores the joint ASR-TTS loss model from ``args.model``
    (built from the pickled training config in ``args.model_conf``), then
    constructs a Tacotron2 model, loads ``args.model`` into it, runs
    ``inference`` for every utterance listed in ``args.json`` and pipes each
    output matrix to ``copy-feats`` so that ``<args.out>.ark`` and
    ``<args.out>.scp`` are produced.

    Args:
        args: Namespace-like object. This function reads ``seed``,
            ``model_conf``, ``model``, ``ngpu``, ``json``, ``out`` and
            ``maxlenratio``; the whole object is also forwarded to
            ``Tacotron2.inference``.
    '''
    # read training config
    # idim, odim, train_args = get_model_conf(args.model, args.model_conf)
    # seed setting
    torch.manual_seed(args.seed)

    # show arguments (once; lazy %-formatting yields the same text as before)
    for key, value in sorted(vars(args).items()):
        logging.info('ARGS: %s: %s', key, value)

    # read training config
    # NOTE: pickle.load executes arbitrary code embedded in the file; only
    # point args.model_conf at trusted configuration files.
    with open(args.model_conf, "rb") as f:
        logging.info('reading a model config file from %s', args.model_conf)
        idim_asr, odim_asr, train_args = pickle.load(f)

    # specify model architecture
    logging.info('reading model parameters from %s', args.model)
    e2e_asr = E2E(idim_asr, odim_asr, train_args)
    logging.info(e2e_asr)
    asr_loss = Loss(e2e_asr, train_args.mtlalpha)

    # specify model architecture for TTS
    # reverse input and output dimension
    tts_loss = setup_tts_loss(odim_asr, idim_asr - 3, train_args)
    logging.info(tts_loss)

    # define loss
    model = ASRTTSLoss(asr_loss, tts_loss, train_args)

    def cpu_loader(storage, location):
        # Returning the storage unchanged keeps every tensor on the CPU.
        return storage

    def remove_dataparallel(state_dict):
        # Drop the "module." prefix from parameter names that carry it.
        from collections import OrderedDict
        prefix = "module."
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if k.startswith(prefix):
                k = k[len(prefix):]
            new_state_dict[k] = v
        return new_state_dict

    model.load_state_dict(
        remove_dataparallel(torch.load(args.model, map_location=cpu_loader)))

    # define model
    # NOTE(review): ``idim`` and ``odim`` are not assigned anywhere in this
    # function (the ``get_model_conf`` call that would provide them is
    # commented out above), so this line raises NameError unless they exist at
    # module level. The TTS loss above is built with (odim_asr, idim_asr - 3);
    # confirm whether those are the dimensions intended here.
    tacotron2 = Tacotron2(idim, odim, train_args)
    # The last input index is appended to every token sequence as <eos>.
    eos = str(tacotron2.idim - 1)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    torch_load(args.model, tacotron2)
    tacotron2.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if outdir and not os.path.exists(outdir):
        os.makedirs(outdir)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.out, args.out)
    num_utts = len(js)
    with torch.no_grad(), kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
        for idx, utt_id in enumerate(js):
            x = js[utt_id]['output'][0]['tokenid'].split() + [eos]
            x = np.fromiter(map(int, x), dtype=np.int64)
            x = torch.LongTensor(x).to(device)

            # get speaker embedding
            if train_args.use_speaker_embedding:
                spemb = kaldi_io_py.read_vec_flt(
                    js[utt_id]['input'][1]['feat'])
                spemb = torch.FloatTensor(spemb).to(device)
            else:
                spemb = None

            # decode and write
            outs, _, _ = tacotron2.inference(x, args, spemb)
            # NOTE(review): exact equality against a float product; confirm
            # this matches how ``inference`` caps its output length.
            if outs.size(0) == x.size(0) * args.maxlenratio:
                logging.warning("output length reaches maximum length (%s)." %
                                utt_id)
            logging.info(
                '(%d/%d) %s (size:%d->%d)' %
                (idx + 1, num_utts, utt_id, x.size(0), outs.size(0)))
            kaldi_io_py.write_mat(f, outs.cpu().numpy(), utt_id)