Example 1
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--dataset", required=True, help='dataset name')
parser.add_argument("--warmstart",
                    help='Warmstart (transfer learn) from a pre-trained model')
args = parser.parse_args()

use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
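    # Let cuDNN auto-tune and cache the fastest convolution algorithms;
    # this pays off when input shapes stay fixed across iterations.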
    torch.backends.cudnn.benchmark = True

if args.dataset not in ['ljspeech', 'mbspeech']:
    from datasets.generic import vocab, Generic as SpeechDataset
    train_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(
        ['texts', 'mels', 'mel_gates'], args.dataset),
                                           batch_size=64,
                                           mode='train')
    valid_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(
        ['texts', 'mels', 'mel_gates'], args.dataset),
                                           batch_size=64,
                                           mode='valid')
else:
    if args.dataset == 'ljspeech':
        from datasets.lj_speech import vocab, LJSpeech as SpeechDataset
    elif args.dataset == 'mbspeech':
        from datasets.mb_speech import vocab, MBSpeech as SpeechDataset
    train_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(
        ['texts', 'mels', 'mel_gates']),
                                           batch_size=64,
                                           mode='train')
    valid_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(
        ['texts', 'mels', 'mel_gates']),
                                           batch_size=64,
                                           mode='valid')
Example 2
def valid_dataloader(self, dataset, batch_size=32,
                     num_workers=0 if sys.platform.startswith('win') else 8):
    return Text2MelDataLoader(dataset,
                              batch_size=batch_size,
                              mode='valid',
                              num_workers=num_workers)
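Aside: the num_workers default above sidesteps a real pitfall. On Windows, DataLoader worker processes are spawned rather than forked, so multi-worker loading starts slowly and can misbehave. Text2MelDataLoader presumably wraps torch's DataLoader; below is a standalone sketch of the same platform-dependent pattern using plain torch.utils.data (the toy dataset is illustrative, not part of the example):

import sys

import torch
from torch.utils.data import DataLoader, TensorDataset

# Fall back to single-process loading on Windows, where worker
# processes are spawned (not forked) and carry high startup cost.
num_workers = 0 if sys.platform.startswith('win') else 8

toy_dataset = TensorDataset(torch.arange(100).float())  # illustrative stand-in
loader = DataLoader(toy_dataset, batch_size=32, num_workers=num_workers)

for (batch,) in loader:
    print(batch.shape)  # torch.Size([32]) for full batches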
Example 3
if args.dataset == 'swara':
    from datasets.swara import vocab, SWARA as SpeechDataset
elif args.dataset == 'swara_test':
    from datasets.swara_test import vocab, SWARA as SpeechDataset
else:
    print('No such dataset')
    sys.exit(1)

# os.environ["CUDA_VISIBLE_DEVICES"]="3"

use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    torch.backends.cudnn.benchmark = True

train_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(
    ['texts', 'mels', 'mel_gates', 'speakers', 'filenames']),
                                       batch_size=16,
                                       mode='train')
valid_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(
    ['texts', 'mels', 'mel_gates', 'speakers', 'filenames']),
                                       batch_size=16,
                                       mode='valid')

text2mel = Text2Mel(vocab).cuda()  # note: .cuda() fails on CPU-only machines despite the use_gpu check

optimizer = torch.optim.Adam(text2mel.parameters(), lr=hp.text2mel_lr)

start_timestamp = int(time.time() * 1000)
start_epoch = 0
global_step = 0

logger = Logger(args.dataset, 'text2mel')
Example 4
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--voice", default='Keira', help='voice name')
parser.add_argument("--script",
                    default='Keira_all.csv',
                    help='script filename')
args = parser.parse_args()

use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    torch.backends.cudnn.benchmark = True

Speech.load(['texts', 'mels', 'mel_gates'], args.voice, args.script)

train_data_loader = Text2MelDataLoader(Speech,
                                       batch_size=hp.text2mel_batch_size,
                                       mode='train')
valid_data_loader = Text2MelDataLoader(Speech,
                                       batch_size=hp.text2mel_batch_size,
                                       mode='valid')

text2mel = Text2Mel(vocab).cuda()

optimizer = torch.optim.Adam(text2mel.parameters(), lr=hp.text2mel_lr)

start_timestamp = int(time.time() * 1000)
start_epoch = 0
global_step = 0

logger = Logger(
    f'{args.voice}-{args.script}-{hp.d}-{hp.text2mel_lr}-{hp.text2mel_batch_size}',
    'text2mel')  # second argument assumed by analogy with Example 3; the listing is truncated here
Example 5
    else:
        index = 'cpu'
    device = select_device(index)

    hparams = (HParam(args.config) if args.config
               else HParam(osp.join(osp.abspath(os.getcwd()), 'config', 'default.yaml')))

    checkpoint = args.checkpoint or get_last_chkpt_path(
        osp.join(hparams.trainer.logdir, f"{hparams.data.dataset}-{args.name}"))

    extractor = DurationTrainer(hparams, device=device).load_checkpoint(checkpoint).model
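    # train(False) is equivalent to eval(): it disables dropout and puts
    # batch-norm layers into inference mode for deterministic extraction.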
    extractor.train(False)

    dataset_root = osp.join(hparams.data.datasets_path, hparams.data.dataset_dir)
    dataset = SpeechDataset(['mels', 'mlens', 'texts', 'tlens', 'files'], dataset_root, hparams.text)
    dataloader = Text2MelDataLoader(dataset, args.batch_size, mode='whole')
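    # MinMaxNorm presumably rescales mel values from [spec_min, spec_max]
    # into a normalized range before they are fed to the extractor.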
    normalizer = MinMaxNorm(hparams.audio.spec_min, hparams.audio.spec_max)

    pbar = tqdm(dataloader, unit="audios", unit_scale=dataloader.batch_size,
                disable=hparams.trainer.disable_progress_bar)
    with open(osp.join(dataset.path, 'duration.txt'), 'w', encoding='utf-8') as fw:
        for it, batch in enumerate(pbar, start=1):
            mels, mlens, texts, tlens = \
                batch['mels'], batch['mlens'].squeeze(1), batch['texts'].long(), batch['tlens'].squeeze(1)
            mels, mlens, texts, tlens = \
                mels.to(device), mlens.to(device), texts.to(device), tlens.to(device)
            
            mels = normalizer(mels)

            with torch.no_grad():
                melspecs, attns = extractor((texts, tlens, mels, True))
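            # The listing is truncated here. What follows is an illustrative
            # continuation, not the original code: assuming attns has shape
            # (batch, text_len, mel_len), count how many mel frames attend
            # most strongly to each text position and record the durations.
            for fname, attn, tlen, mlen in zip(batch['files'], attns, tlens, mlens):
                t, m = int(tlen), int(mlen)
                alignment = attn[:t, :m].argmax(dim=0)  # winning text position per frame
                durations = torch.bincount(alignment, minlength=t)
                fw.write(f"{fname}|{' '.join(map(str, durations.tolist()))}\n")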