Example #1
def train():

    category = pd.read_pickle('category.pkl')
    dilations = [2**i for i in range(10)] * 5
    receptive_field = calculate_receptive_field(dilations, 2, 32)
    wavenet = WaveNet(dilations=dilations,
                      use_glob_cond=True,
                      glob_cls_num=len(category),
                      glob_embed_dim=5)
    optimizer = tf.keras.optimizers.Adam(1e-3)
    # load dataset
    trainset = tf.data.TFRecordDataset(join(
        'dataset', 'trainset.tfrecord')).repeat(-1).map(
            parse_function_generator()).batch(batch_size).prefetch(
                tf.data.experimental.AUTOTUNE)
    # restore from existing checkpoint
    if not exists('checkpoints'): mkdir('checkpoints')
    checkpoint = tf.train.Checkpoint(model=wavenet, optimizer=optimizer)
    checkpoint.restore(tf.train.latest_checkpoint('checkpoints'))
    # create log
    log = tf.summary.create_file_writer('checkpoints')
    # train model
    avg_loss = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
    for audios, person_id in trainset:
        inputs = audios[:, :-1, :]
        # inputs.shape = (batch, receptive_field + audio_length - 1, 1)
        target = audios[:, receptive_field:, :]
        # target.shape = (batch, audio_length, 1)
        with tf.GradientTape() as tape:
            outputs = wavenet([inputs, person_id])
            # outputs.shape = (batch, audio_length, 1)
            loss = tf.keras.losses.SparseCategoricalCrossentropy()(target,
                                                                   outputs)
        avg_loss.update_state(loss)
        # write log
        if tf.equal(optimizer.iterations % 100, 0):
            with log.as_default():
                tf.summary.scalar('loss',
                                  avg_loss.result(),
                                  step=optimizer.iterations)
            print('Step #%d Loss: %.6f' %
                  (optimizer.iterations, avg_loss.result()))
            if avg_loss.result() < 0.01: break
            avg_loss.reset_states()
        grads = tape.gradient(loss, wavenet.trainable_variables)
        optimizer.apply_gradients(zip(grads, wavenet.trainable_variables))
    # save the network structure with weights
    if not exists('model'): mkdir('model')
    wavenet.save(join('model', 'wavenet.h5'))
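
Example #1 calls a calculate_receptive_field helper that is not shown in the snippet. Below is a minimal sketch of how such a helper is commonly written for a stack of dilated causal convolutions; the parameter names and the exact formula are assumptions, so the helper in the original repository may differ.

def calculate_receptive_field(dilations, kernel_size, initial_filter_width):
    # Each dilated causal convolution widens the receptive field by
    # (kernel_size - 1) * dilation; the initial causal convolution adds its width.
    return (kernel_size - 1) * sum(dilations) + initial_filter_width

Under this assumption, the dilations above (five blocks of 1, 2, ..., 512, summing to 5115), kernel size 2, and initial filter width 32 give a receptive field of 5115 + 32 = 5147 samples.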
Example #2
def train(num_gpus, rank, group_name, output_directory, tensorboard_directory,
          ckpt_iter, n_iters, iters_per_ckpt, iters_per_logging,
          learning_rate, batch_size_per_gpu):
    """
    Train the WaveNet model on the LJSpeech dataset

    Parameters:
    num_gpus, rank, group_name:     parameters for distributed training
    output_directory (str):         save model checkpoints to this path
    tensorboard_directory (str):    save tensorboard events to this path
    ckpt_iter (int or 'max'):       the pretrained checkpoint to be loaded;
                                    'max' automatically selects the latest saved iteration
    n_iters (int):                  number of iterations to train, default is 1M
    iters_per_ckpt (int):           number of iterations between checkpoints,
                                    default is 10k; for models with residual_channel=64 this number can be larger
    iters_per_logging (int):        number of iterations between training-log entries, default is 100
    learning_rate (float):          learning rate
    batch_size_per_gpu (int):       batch size per GPU, default is 2 so the total batch size is 16 with 8 GPUs
    """

    # generate experiment (local) path
    local_path = "ch{}_T{}_betaT{}".format(wavenet_config["res_channels"], 
                                           diffusion_config["T"], 
                                           diffusion_config["beta_T"])
    # Create tensorboard logger.
    if rank == 0:
        tb = SummaryWriter(os.path.join('exp', local_path, tensorboard_directory))

    # distributed running initialization
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)

    # Get shared output_directory ready
    output_directory = os.path.join('exp', local_path, output_directory)
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory, flush=True)

    # map diffusion hyperparameters to gpu
    for key in diffusion_hyperparams:
        if key != "T":
            diffusion_hyperparams[key] = diffusion_hyperparams[key].cuda()

    # load training data
    trainloader = load_LJSpeech(trainset_config=trainset_config, 
                                batch_size=batch_size_per_gpu, 
                                num_gpus=num_gpus)
    print('Data loaded')
    
    # predefine model
    net = WaveNet(**wavenet_config).cuda()
    print_size(net)

    # apply gradient all reduce
    if num_gpus > 1:
        net = apply_gradient_allreduce(net)

    # define optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    # load checkpoint
    if ckpt_iter == 'max':
        ckpt_iter = find_max_epoch(output_directory)
    if ckpt_iter >= 0:
        try:
            # load checkpoint file
            model_path = os.path.join(output_directory, '{}.pkl'.format(ckpt_iter))
            checkpoint = torch.load(model_path, map_location='cpu')
            
            # feed model dict and optimizer state
            net.load_state_dict(checkpoint['model_state_dict'])
            if 'optimizer_state_dict' in checkpoint:
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

            print('Successfully loaded model at iteration {}'.format(ckpt_iter))
        except Exception:
            ckpt_iter = -1
            print('No valid checkpoint model found, start training from initialization.')
    else:
        ckpt_iter = -1
        print('No valid checkpoint model found, start training from initialization.')

    # training
    n_iter = ckpt_iter + 1
    while n_iter < n_iters + 1:
        for mel_spectrogram, audio in trainloader: 
            # load audio and mel spectrogram
            mel_spectrogram = mel_spectrogram.cuda()
            audio = audio.unsqueeze(1).cuda()
            
            # back-propagation
            optimizer.zero_grad()
            X = (mel_spectrogram, audio)
            loss = training_loss(net, nn.MSELoss(), X, diffusion_hyperparams)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            loss.backward()
            optimizer.step()

            # output to log
            # note, only do this on the first gpu
            if n_iter % iters_per_logging == 0 and rank == 0:
                # save training loss to tensorboard
                print("iteration: {} \treduced loss: {} \tloss: {}".format(n_iter, reduced_loss, loss.item()))
                tb.add_scalar("Log-Train-Loss", torch.log(loss).item(), n_iter)
                tb.add_scalar("Log-Train-Reduced-Loss", np.log(reduced_loss), n_iter)

            # save checkpoint
            if n_iter > 0 and n_iter % iters_per_ckpt == 0 and rank == 0:
                checkpoint_name = '{}.pkl'.format(n_iter)
                torch.save({'model_state_dict': net.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict()}, 
                           os.path.join(output_directory, checkpoint_name))
                print('model at iteration %s is saved' % n_iter)
            
            n_iter += 1

    # Close TensorBoard.
    if rank == 0:
        tb.close()
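
Examples #2 and #6 resolve ckpt_iter == 'max' with a find_max_epoch helper that is not shown. Here is a minimal sketch of one way to implement it, assuming checkpoints are stored as '<iteration>.pkl' as in the saving code above; the helper in the original repository may differ.

import os
import re

def find_max_epoch(path):
    # Scan the checkpoint directory for files named '<iteration>.pkl' and return
    # the largest iteration number, or -1 if none exist (matching the ckpt_iter >= 0 check).
    max_iter = -1
    if os.path.isdir(path):
        for name in os.listdir(path):
            match = re.fullmatch(r'(\d+)\.pkl', name)
            if match:
                max_iter = max(max_iter, int(match.group(1)))
    return max_iter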
Example #3
# make directory of results
result = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
os.mkdir(result)
shutil.copy(__file__, os.path.join(result, __file__))
shutil.copy('utils.py', os.path.join(result, 'utils.py'))
shutil.copy('params.py', os.path.join(result, 'params.py'))
shutil.copy('generate.py', os.path.join(result, 'generate.py'))
shutil.copy('net.py', os.path.join(result, 'net.py'))
shutil.copytree('WaveNet', os.path.join(result, 'WaveNet'))

# Model
encoder = UpsampleNet(params.upsample_factors)
decoder = WaveNet(params.n_loop, params.n_layer, params.filter_size,
                  params.residual_channels, params.dilated_channels,
                  params.skip_channels, params.output_dim, params.quantize,
                  params.log_scale_min, params.condition_dim,
                  params.dropout_zero_rate)

if params.distribution_type == 'gaussian':
    loss_fun = decoder.calculate_gaussian_loss
    acc_fun = None
elif params.distribution_type == 'logistic':
    loss_fun = decoder.calculate_logistic_loss
    acc_fun = None
elif params.distribution_type == 'softmax':
    loss_fun = chainer.functions.softmax_cross_entropy
    acc_fun = chainer.functions.accuracy
model = EncoderDecoderModel(encoder, decoder, loss_fun, acc_fun)

# Optimizer
 maes = []
 mases = []
 hitss = []
 for r in random_seeds:
     train_a = np.expand_dims(train[:, 0], 1)
     test_a = np.expand_dims(test[:, 0], 1)
     MIMO = (s == 'MIMO')
     wavenet = WaveNet(forecast_horizon=1,
                       log_difference=True,
                       initial_filter_width=48,
                       filter_width=2,
                       residual_channels=64,
                       dilation_channels=64,
                       skip_channels=64,
                       use_biases=True,
                       use_batch_norm=False,
                       dilations=[1, 2, 4, 8, 16, 32],
                       random_seed=r,
                       MIMO=MIMO)
     if s == 'auto_regressive':
         mae, mase, hits = wavenet.train_and_predict(
             train_a,
             test_a,
             batch_size=128,
             max_epochs=10,
             plot=False,
             train_fraction=0.8)
     else:
         mae, mase, hits = wavenet.train_and_predict(
inputs = Preprocess(params.sr, params.n_fft, params.hop_length, params.n_mels,
                    params.top_db, None, params.categorical_output_dim)(path)

_, condition, _ = inputs
if params.categorical_output_dim is False or params.categorical_output_dim is None:
    input_dim = 1
else:
    input_dim = params.categorical_output_dim  # bare categorical_output_dim is undefined here
x = numpy.zeros([n, input_dim, 1, 1], dtype=numpy.float32)
condition = numpy.expand_dims(condition, axis=0)

# make model
encoder = UpsampleNet(params.upsample_factors)
decoder = WaveNet(params.n_loop, params.n_layer, params.filter_size,
                  params.residual_channels, params.dilated_channels,
                  params.skip_channels, params.output_dim, params.quantize,
                  params.log_scale_min, params.condition_dim,
                  params.dropout_zero_rate)

# load trained parameter
chainer.serializers.load_npz(args.model, encoder,
                             'updater/model:main/encoder/')
chainer.serializers.load_npz(args.model, decoder,
                             'updater/model:main/decoder/')

if args.gpu >= 0:
    use_gpu = True
    chainer.cuda.get_device_from_id(args.gpu).use()
else:
    use_gpu = False
Example #6
def generate(output_directory, tensorboard_directory,
             num_samples,
             ckpt_path, ckpt_iter):
    """
    Generate audio based on ground truth mel spectrogram

    Parameters:
    output_directory (str):         save generated speeches to this path
    tensorboard_directory (str):    save tensorboard events to this path
    num_samples (int):              number of samples to generate, default is 4
    ckpt_path (str):                checkpoint path
    ckpt_iter (int or 'max'):       the pretrained checkpoint to be loaded;
                                    'max' automatically selects the latest saved iteration
    """

    # generate experiment (local) path
    local_path = "ch{}_T{}_betaT{}".format(wavenet_config["res_channels"], 
                                           diffusion_config["T"], 
                                           diffusion_config["beta_T"])
    
    # Get shared output_directory ready
    output_directory = os.path.join('exp', local_path, output_directory)
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
    print("output directory", output_directory, flush=True)

    # map diffusion hyperparameters to gpu
    for key in diffusion_hyperparams:
        if key != "T":
            diffusion_hyperparams[key] = diffusion_hyperparams[key].cuda()

    # predefine model
    net = WaveNet(**wavenet_config).cuda()
    print_size(net)

    # load checkpoint
    ckpt_path = os.path.join('exp', local_path, ckpt_path)
    if ckpt_iter == 'max':
        ckpt_iter = find_max_epoch(ckpt_path)
    model_path = os.path.join(ckpt_path, '{}.pkl'.format(ckpt_iter))
    try:
        checkpoint = torch.load(model_path, map_location='cpu')
        net.load_state_dict(checkpoint['model_state_dict'])
        print('Successfully loaded model at iteration {}'.format(ckpt_iter))
    except Exception:
        raise Exception('No valid model found')

    # predefine audio shape
    audio_length = trainset_config["segment_length"]  # 16000
    print('begin generating audio of length %s' % audio_length)

    # inference
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()

    generated_audio = sampling(net, (num_samples,1,audio_length), 
                               diffusion_hyperparams)
    
    end.record()
    torch.cuda.synchronize()
    print('generated {} utterances of random_digit at iteration {} in {} seconds'.format(num_samples,
                                                                               ckpt_iter, 
                                                                               int(start.elapsed_time(end)/1000)))

    # save audio to .wav
    for i in range(num_samples):
        outfile = '{}_{}_{}k_{}.wav'.format(wavenet_config["res_channels"], 
                                        diffusion_config["T"], 
                                        ckpt_iter // 1000, 
                                        i)
        wavwrite(os.path.join(output_directory, outfile), 
                    trainset_config["sampling_rate"],
                    generated_audio[i].squeeze().cpu().numpy())

        # save audio to tensorboard
        tb = SummaryWriter(os.path.join('exp', local_path, tensorboard_directory))
        tb.add_audio(tag=outfile, snd_tensor=generated_audio[i], sample_rate=trainset_config["sampling_rate"])
        tb.close()

    print('saved generated samples at iteration %s' % ckpt_iter)
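
A usage note on the wavwrite call above: if wavwrite is scipy.io.wavfile.write, the array's dtype determines the on-disk format, so a float waveform in [-1, 1] is often converted to 16-bit PCM before saving. A hedged sketch of that variant (the variable names mirror the example, not the original code):

import os
import numpy as np
from scipy.io.wavfile import write as wavwrite

audio = generated_audio[i].squeeze().cpu().numpy()           # float waveform in [-1, 1]
pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)   # convert to 16-bit PCM
wavwrite(os.path.join(output_directory, outfile),
         trainset_config["sampling_rate"], pcm)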
# make directory of results
result = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
os.mkdir(result)
shutil.copy(__file__, os.path.join(result, __file__))
shutil.copy('utils.py', os.path.join(result, 'utils.py'))
shutil.copy('params.py', os.path.join(result, 'params.py'))
shutil.copy('generate.py', os.path.join(result, 'generate.py'))
shutil.copy('net.py', os.path.join(result, 'net.py'))
shutil.copytree('WaveNet', os.path.join(result, 'WaveNet'))

# Model
encoder = UpsampleNet(params.channels, params.upsample_factors)
wavenet = WaveNet(params.n_loop, params.n_layer, params.filter_size,
                  params.input_dim, params.residual_channels,
                  params.dilated_channels, params.skip_channels,
                  params.quantize, params.use_logistic, params.n_mixture,
                  params.log_scale_min, params.condition_dim,
                  params.dropout_zero_rate)

if params.ema_mu < 1:
    decoder = ExponentialMovingAverage(wavenet, params.ema_mu)
else:
    decoder = wavenet

if params.use_logistic:
    loss_fun = wavenet.calculate_logistic_loss
    acc_fun = None
else:
    loss_fun = chainer.functions.softmax_cross_entropy
    acc_fun = chainer.functions.accuracy
model = EncoderDecoderModel(encoder, decoder, loss_fun, acc_fun)
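
In the snippet above, ExponentialMovingAverage wraps the WaveNet when ema_mu < 1 so that decoding uses smoothed weights. The Chainer implementation is not shown; as a rough, framework-agnostic illustration of the idea (not the actual class), an EMA wrapper keeps shadow copies of the parameters and blends them after every update:

import numpy as np

class SimpleEMA:
    """Toy illustration of an exponential moving average over parameters."""
    def __init__(self, params, mu):
        # params: dict mapping parameter name -> numpy array
        self.mu = mu
        self.shadow = {name: value.copy() for name, value in params.items()}

    def update(self, params):
        # shadow <- mu * shadow + (1 - mu) * current value
        for name, value in params.items():
            self.shadow[name] = self.mu * self.shadow[name] + (1.0 - self.mu) * value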
Example #8
# tensorboard --host 127.0.0.1 --logdir=D:\Projects\AI\Summary\Wavenet\

gen = MarketDataGenerator(train_ratio, seq_length, output_count,
                          batch_size, ["EURGBP", "EURUSD", "GBPUSD"],
                          datetime(2019, 4, 20), 90000)
# gen = FXTMEncoded(train_ratio, seq_length, output_count, batch_size)

trainX = gen.trainX
trainY = gen.trainY
validX = gen.validX
validY = gen.validY
testX = gen.testX
testY = gen.testY

# test = WaveNetMK0(n_filter, n_fc, n_layer)
test = WaveNet(n_filter, n_pp, n_fc, n_layer)
test.compile(gen.input_dim,
             gen.output_dim,
             optimizer=Adam(lr=0.001, decay=0.01),
             mode=0,
             default_loss='mse')
test.model_train.fit([trainX, trainY],
                     trainY,
                     batch_size=batch_size,
                     epochs=epoch,
                     callbacks=[model_saver],
                     validation_data=([validX, validY], validY))
test.model_train.load_weights(mfile)

# input_dim = [16, 3, 1]
# encoder_layers = [400]
             Remember the "p" in ARIMA(p, d, q) is how far back in the past the algorithm looks to predict the future.
             The "p" for WaveNet is equal to filter_width*sum(dilation_factors) + initial_filter_width
             We want to make this on the order of 200 to capture the weekly dependencies. Also, make sure dilation_factors
             start from 1 and go up in powers of 2. 
 random_seed: random seed from which the weights are drawn; ensures you get the same results each run
 MIMO: (multi-input multi-output) If you pass multiple time series into the network and MIMO is False, the network adjusts its topology
                                 to condition on the extra series and predict the first time series in the list.
                                 If MIMO is True, the network adjusts its topology
                                 to predict all time series simultaneously.
 '''
 wavenet = WaveNet(forecast_horizon=1,
                   log_difference=False,
                   initial_filter_width=2,
                   filter_width=2,
                   residual_channels=32,
                   dilation_channels=32,
                   skip_channels=256,
                   use_biases=True,
                   use_batch_norm=True,
                   dilations=[1, 2, 4, 8, 16, 32, 64, 128],
                   random_seed=1234,
                   MIMO=False)
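
 Plugging this configuration into the "p" formula stated in the docstring above (filter_width * sum(dilation_factors) + initial_filter_width) gives a quick sanity check of the receptive field:

 filter_width = 2
 initial_filter_width = 2
 dilations = [1, 2, 4, 8, 16, 32, 64, 128]
 p = filter_width * sum(dilations) + initial_filter_width  # 2 * 255 + 2 = 512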
 '''
 The first two elements are your train and test set
 batch_size: how many sequences are passed to the GPU at once; larger values train faster, but too large and you will run out of memory
 max_epochs: maximum number of epochs to train, 10-15 should do 
 plot: leave False
 train_fraction: what portion of the training set to use for training, the rest will be used for validation
 '''
 wavenet.train_and_predict(center_node_train,
                           center_node_test,
                           batch_size=128,