Example No. 1
def generate_audio():
	# compute receptive field width
	learnable_steps = 1
	batch_size = 1
	num_layers = len(params.residual_conv_channels)
	receptive_steps_per_unit = params.residual_conv_filter_width ** num_layers
	receptive_steps = (receptive_steps_per_unit - 1) * params.residual_num_blocks + 1
	target_width = learnable_steps
	input_width = receptive_steps
	# to compute all learnable targets
	input_width += learnable_steps - 1
	## padding for causal conv block
	input_width += len(params.causal_conv_channels)

	# quantized signals generated by WaveNet
	generated_quantized_audio = np.zeros((input_width, ), dtype=np.int32)

	start = time.time()
	for time_step in xrange(9):
		# quantized signals in receptive field
		padded_quantized_x_batch = generated_quantized_audio[-input_width:].reshape((1, -1))

		# convert to image
		padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantization_steps=params.quantization_steps)

		# generate next signal
		softmax = wavenet.forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
		softmax = softmax[0, :, 0, -1]
		generated_quantized_signal = np.argmax(softmax)
		generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)
		print generated_quantized_signal,

	print generated_quantized_audio
	print time.time() - start

	wavenet.prev_causal_outputs = None
	wavenet.prev_residual_outputs_out = None
	wavenet.prev_residual_outputs_z = None
	generated_quantized_audio = np.zeros((input_width, ), dtype=np.int32)

	start = time.time()
	for time_step in xrange(9):
		# quantized signals in receptive field
		padded_quantized_x_batch = generated_quantized_audio[-input_width:].reshape((1, -1))

		# convert to image
		padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantization_steps=params.quantization_steps)

		# generate next signal
		softmax = wavenet._forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
		softmax = softmax[0, :, 0, -1]
		generated_quantized_signal = np.argmax(softmax)
		generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)
		print generated_quantized_signal,

	print generated_quantized_audio
	print time.time() - start
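
A quick check of the receptive-field arithmetic used at the top of the example above. The parameter values below are illustrative assumptions (filter width 2, 3 residual layers, 2 blocks, 2 causal layers), not the repository defaults.

# worked example of the input_width computation, with assumed values
residual_conv_filter_width = 2  # assumed
num_layers = 3                  # assumed: len(params.residual_conv_channels)
residual_num_blocks = 2         # assumed
num_causal_layers = 2           # assumed: len(params.causal_conv_channels)
learnable_steps = 1

receptive_steps_per_unit = residual_conv_filter_width ** num_layers         # 2 ** 3 = 8
receptive_steps = (receptive_steps_per_unit - 1) * residual_num_blocks + 1  # (8 - 1) * 2 + 1 = 15
input_width = receptive_steps + (learnable_steps - 1) + num_causal_layers   # 15 + 0 + 2 = 17
print input_width  # 17
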
Example No. 2
def generate_audio():
	receptive_field_width_steps = 5

	batch_size = 1
	max_dilation = max(params.residual_conv_dilations)
	target_width = receptive_field_width_steps
	padded_input_width = receptive_field_width_steps + max_dilation

	# quantized signals generated by WaveNet
	generated_quantized_audio = np.mod(np.arange(1, padded_input_width + 1), 6).astype(np.int32)

	start = time.time()
	for time_step in xrange(500):
		# quantized signals in receptive field
		padded_quantized_x_batch = generated_quantized_audio[-padded_input_width:].reshape((1, -1))

		# convert to image
		padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantized_channels=params.audio_channels)

		# generate next signal
		softmax = wavenet.forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
		softmax = softmax[0, :, 0, -1]
		generated_quantized_signal = np.argmax(softmax)
		generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)

	print generated_quantized_audio
	print time.time() - start

	wavenet.prev_causal_outputs = None
	wavenet.prev_residual_outputs_out = None
	wavenet.prev_residual_outputs_z = None
	generated_quantized_audio = np.mod(np.arange(1, padded_input_width + 1), 6).astype(np.int32)

	start = time.time()
	for time_step in xrange(500):
		# quantized signals in receptive field
		padded_quantized_x_batch = generated_quantized_audio[-padded_input_width:].reshape((1, -1))

		# convert to image
		padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantized_channels=params.audio_channels)

		# generate next signal
		softmax = wavenet._forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
		softmax = softmax[0, :, 0, -1]
		generated_quantized_signal = np.argmax(softmax)
		generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)

	print generated_quantized_audio
	print time.time() - start
Example No. 3
def generate_audio():
    receptive_field_width_steps = 5

    batch_size = 1
    max_dilation = max(params.residual_conv_dilations)
    target_width = receptive_field_width_steps
    padded_input_width = receptive_field_width_steps + max_dilation

    # quantized signals generated by WaveNet
    generated_quantized_audio = np.zeros((padded_input_width, ),
                                         dtype=np.int32)

    for time_step in xrange(200):
        # quantized signals in receptive field
        padded_quantized_x_batch = generated_quantized_audio[
            -padded_input_width:].reshape((1, -1))

        # convert to image
        padded_x_batch = data.onehot_pixel_image(
            padded_quantized_x_batch, quantized_channels=params.audio_channels)

        # generate next signal
        softmax = wavenet.forward_one_step(padded_x_batch,
                                           softmax=True,
                                           return_numpy=True)
        softmax = softmax[0, :, 0, -1]
        generated_quantized_signal = np.random.choice(
            np.arange(params.audio_channels), p=softmax)
        generated_quantized_audio = np.append(generated_quantized_audio,
                                              [generated_quantized_signal],
                                              axis=0)

    print generated_quantized_audio
Example No. 4
def train_audio():

	target_width = 5
	padded_input_width = 9
	batch_size = 5

	# quantized_signal = np.mod(np.arange(1, 100), 6)
	quantized_signal = np.repeat(np.arange(0, 10), 100, axis=0)
	print quantized_signal

	for epoch in xrange(30):
		for step in xrange(100):
			padded_signal_batch, target_batch = create_batch(quantized_signal, batch_size, padded_input_width, target_width)
			
			padded_onehot_batch = data.onehot_pixel_image(padded_signal_batch, quantized_channels=params.quantization_steps)

			# print padded_signal_batch[0, -1]
			# print padded_onehot_batch[0, :, 0, -1]
			# print target_batch[0, -1]

			loss = wavenet.loss(padded_onehot_batch, target_batch)
			wavenet.backprop(loss)

		loss = float(loss.data)
		if loss > 0.3:
			print padded_signal_batch
			print target_batch
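
create_batch is called here (and in the later training examples) but not shown. A minimal sketch of what such a helper could look like, assuming it samples random windows of padded_input_width from the signal and returns, for each window, the target_width next-step targets aligned with the last target_width inputs; the repository's actual implementation may differ.

import numpy as np

def create_batch(quantized_signal, batch_size, padded_input_width, target_width):
	# hypothetical sketch: random windows plus their next-step targets
	inputs = np.empty((batch_size, padded_input_width), dtype=np.int32)
	targets = np.empty((batch_size, target_width), dtype=np.int32)
	max_start = quantized_signal.size - padded_input_width - 1
	for b in xrange(batch_size):
		start = np.random.randint(0, max_start + 1)
		inputs[b] = quantized_signal[start:start + padded_input_width]
		# the target at step t is the signal at step t + 1
		targets[b] = quantized_signal[start + padded_input_width - target_width + 1:start + padded_input_width + 1]
	return inputs, targets
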
Example No. 5
def main():
    # compute required input width
    num_layers = len(params.residual_conv_channels)
    receptive_steps_per_unit = params.residual_conv_filter_width**num_layers
    receptive_steps = (receptive_steps_per_unit -
                       1) * params.residual_num_blocks + 1
    input_width = receptive_steps
    # padding for causal conv block
    input_width += len(params.causal_conv_channels)

    # quantized signals generated by WaveNet
    generated_quantized_audio = np.zeros((input_width, ), dtype=np.int32)

    for time_step in xrange(1000):
        # quantized signals in receptive field
        padded_quantized_x_batch = generated_quantized_audio[
            -input_width:].reshape((1, -1))

        # convert to image
        padded_x_batch = data.onehot_pixel_image(
            padded_quantized_x_batch,
            quantization_steps=params.quantization_steps)

        # generate next signal
        softmax = wavenet.forward_one_step(padded_x_batch,
                                           apply_softmax=True,
                                           as_numpy=True)
        softmax = softmax[0, :, 0, -1]
        generated_quantized_signal = np.random.choice(
            np.arange(params.quantization_steps), p=softmax)
        generated_quantized_audio = np.append(generated_quantized_audio,
                                              [generated_quantized_signal],
                                              axis=0)
        print generated_quantized_signal,
Example No. 6
def train_audio():

    target_width = 5
    padded_input_width = 9
    batch_size = 2

    quantized_signal = np.mod(
        np.arange(1, padded_input_width * batch_size * 4), 6)
    print quantized_signal

    for rep in xrange(30):
        for pos in xrange(quantized_signal.size //
                          (padded_input_width * batch_size)):
            for shift in xrange(padded_input_width):
                if (pos + 1) * padded_input_width * batch_size + shift + 1 < quantized_signal.size:
                    padded_signal_batch, target_batch = create_padded_batch(
                        quantized_signal, batch_size, pos, shift, target_width,
                        padded_input_width)

                    padded_onehot_batch = data.onehot_pixel_image(
                        padded_signal_batch,
                        quantized_channels=params.audio_channels)

                    # print padded_signal_batch[0, -1]
                    # print padded_onehot_batch[0, :, 0, -1]
                    # print target_batch[0, -1]

                    loss = wavenet.loss(padded_onehot_batch, target_batch)
                    wavenet.backprop(loss)

        print float(loss.data)

    wavenet.save(args.model_dir)
Example No. 7
def generate_audio(sampling_rate=48000, generate_sec=1, remove_silence_frames=False):
	batch_size = 1

	# compute required input width
	num_layers = len(params.residual_conv_channels)
	receptive_steps_per_unit = params.residual_conv_filter_width ** num_layers
	receptive_steps = (receptive_steps_per_unit - 1) * params.residual_num_blocks + 1
	input_width = receptive_steps
	# padding for causal conv block
	input_width += len(params.causal_conv_channels)

	# quantized signals generated by WaveNet
	generated_quantized_audio = np.zeros((input_width, ), dtype=np.int32)

	start_time = time.time()
	for time_step in xrange(1, int(sampling_rate * generate_sec)):
		# quantized signals in receptive field
		padded_quantized_x_batch = generated_quantized_audio[-input_width:].reshape((1, -1))

		# convert to image
		padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantization_steps=params.quantization_steps)

		# generate next signal
		if args.use_faster_wavenet:
			softmax = wavenet._forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
		else:
			softmax = wavenet.forward_one_step(padded_x_batch, softmax=True, return_numpy=True)

		softmax = softmax[0, :, 0, -1]
		generated_quantized_signal = np.random.choice(np.arange(params.quantization_steps), p=softmax)

		if generated_quantized_signal == 0 and remove_silence_frames:
			pass
		else:
			generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)

		if time_step % 10 == 0:
			sys.stdout.write("\rgenerating {:.2f} msec / {:.2f} msec".format(time_step * 1000.0 / sampling_rate, generate_sec * 1000.0))
			sys.stdout.flush()

	print "\ndone in {:.3f} sec".format(time.time() - start_time)

	# remove zero paddings
	generated_quantized_audio = generated_quantized_audio[input_width:]

	try:
		os.mkdir(args.generate_dir)
	except:
		pass

	filename = "{}/generated.wav".format(args.generate_dir)
	data.save_audio_file(filename, generated_quantized_audio, params.quantization_steps, format="16bit_pcm", sampling_rate=sampling_rate)
Example No. 8
def generate_audio(sampling_rate=48000, generate_sec=1, remove_silence_frames=False):
	# compute required input width
	num_layers = len(params.residual_conv_channels)
	receptive_steps_per_unit = params.residual_conv_filter_width ** num_layers
	receptive_steps = (receptive_steps_per_unit - 1) * params.residual_num_blocks + 1
	input_width = receptive_steps
	# add paddings of causal conv block
	input_width += len(params.causal_conv_channels)

	# pad with silence signals
	generated_signals = np.full((input_width, ), 127, dtype=np.int32)

	start_time = time.time()
	for time_step in xrange(1, int(sampling_rate * generate_sec)):
		# signals in receptive field
		input_signals = generated_signals[-input_width:].reshape((1, -1))

		# convert to image
		input_signals = data.onehot_pixel_image(input_signals, quantization_steps=params.quantization_steps)

		# generate next signal
		if args.fast:
			softmax = wavenet._forward_one_step(input_signals, apply_softmax=True, as_numpy=True)
		else:
			softmax = wavenet.forward_one_step(input_signals, apply_softmax=True, as_numpy=True)


		softmax = softmax[0, :, 0, -1]
		signal = np.random.choice(np.arange(params.quantization_steps), p=softmax)

		if signal == 127 and remove_silence_frames:
			pass
		else:
			generated_signals = np.append(generated_signals, [signal], axis=0)

		if time_step % 10 == 0:
			sys.stdout.write("\rgenerating {:.2f} msec / {:.2f} msec".format(time_step * 1000.0 / sampling_rate, generate_sec * 1000.0))
			sys.stdout.flush()

	print "\ndone in {:.3f} sec".format(time.time() - start_time)

	# remove paddings
	generated_signals = generated_signals[input_width:]

	try:
		os.mkdir(args.output_dir)
	except:
		pass

	filename = "{}/generated.wav".format(args.output_dir)
	data.save_audio_file(filename, generated_signals, params.quantization_steps, format="16bit_pcm", sampling_rate=sampling_rate)
Example No. 9
def train_audio():

	# compute receptive field width
	learnable_steps = 1
	batch_size = 1
	num_layers = len(params.residual_conv_channels)
	receptive_steps_per_unit = params.residual_conv_filter_width ** num_layers
	receptive_steps = (receptive_steps_per_unit - 1) * params.residual_num_blocks + 1
	target_width = learnable_steps
	input_width = receptive_steps
	# to compute all learnable targets
	input_width += learnable_steps - 1
	## padding for causal conv block
	input_width += len(params.causal_conv_channels)

	quantized_signal = np.mod(np.arange(1, input_width * 10), params.quantization_steps)
	print quantized_signal

	for rep in xrange(300):
		sum_loss = 0
		for train in xrange(50):
			# create batch
			input_batch, target_batch = create_batch(quantized_signal, batch_size, input_width, target_width)

			# convert to 1xW image whose #channels is equal to the quantization steps of audio
			# input_batch.shape = (BATCHSIZE, CHANNELS(=quantization_steps), HEIGHT(=1), WIDTH(=input_width))
			input_batch = data.onehot_pixel_image(input_batch, quantization_steps=params.quantization_steps)

			# training
			## causal block
			output = wavenet.forward_causal_block(input_batch)
			## remove causal padding
			output = wavenet.slice_1d(output, len(params.causal_conv_channels))
			## residual dilated conv block
			output, sum_skip_connections = wavenet.forward_residual_block(output)
			## remove unnecessary elements
			sum_skip_connections = wavenet.slice_1d(sum_skip_connections, sum_skip_connections.data.shape[3] - target_width)
			## softmax block
			## Note: do not apply F.softmax
			output = wavenet.forward_softmax_block(sum_skip_connections, softmax=False)
			## compute cross entropy
			loss = wavenet.cross_entropy(output, target_batch)
			## update weights
			wavenet.backprop(loss)

			sum_loss += float(loss.data)

		print sum_loss / 50.0
		wavenet.save(args.model_dir)
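
data.onehot_pixel_image is used throughout these examples but never shown. A minimal sketch of the conversion described in the comments above, assuming it one-hot encodes each quantized sample along a channel axis and returns a (batch, quantization_steps, 1, width) float array; the actual function in the repository may differ (for instance it may return a GPU array or accept the quantized_channels keyword seen in other examples).

import numpy as np

def onehot_pixel_image(quantized_batch, quantization_steps=256):
	# hypothetical sketch: (batch, width) int32 -> (batch, quantization_steps, 1, width) float32 one-hot
	batch_size, width = quantized_batch.shape
	onehot = np.zeros((batch_size, quantization_steps, 1, width), dtype=np.float32)
	for b in xrange(batch_size):
		onehot[b, quantized_batch[b], 0, np.arange(width)] = 1.0
	return onehot
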
Example No. 10
def generate_audio(receptive_field_width_ms=25, sampling_rate=48000, generate_duration_sec=1):
	# e.g.
	# 48000 Hz * 0.25 sec = 12000 time steps (= 250 millisecond receptive field)
	receptive_steps = int(sampling_rate * receptive_field_width_ms / 1000.0)

	# compute required input width
	batch_size = 1
	max_dilation = max(params.residual_conv_dilations)
	target_width = receptive_steps
	padded_input_width = receptive_steps + max_dilation * (params.residual_conv_kernel_width - 1)

	# quantized signals generated by WaveNet
	generated_quantized_audio = np.zeros((padded_input_width, ), dtype=np.int32)

	start_time = time.time()

	for time_step in xrange(1, int(sampling_rate * generate_duration_sec)):
		# quantized signals in receptive field
		padded_quantized_x_batch = generated_quantized_audio[-padded_input_width:].reshape((1, -1))

		# convert to image
		padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantized_channels=params.audio_channels)

		# generate next signal
		if args.use_faster_wavenet:
			softmax = wavenet._forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
		else:
			softmax = wavenet.forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
		softmax = softmax[0, :, 0, -1]
		generated_quantized_signal = np.random.choice(np.arange(params.audio_channels), p=softmax)
		generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)

		if time_step % 10 == 0:
			sys.stdout.write("\rgenerating {:.2f} msec / {:.2f} msec".format(time_step * 1000.0 / sampling_rate, generate_duration_sec * 1000.0))
			sys.stdout.flush()

	print "\ndone in {:.3f} sec".format(time.time() - start_time)

	# remove zero paddings
	generated_quantized_audio = generated_quantized_audio[padded_input_width:]

	try:
		os.mkdir(args.generate_dir)
	except:
		pass

	filename = "{}/generated.wav".format(args.generate_dir)
	data.save_audio_file(filename, generated_quantized_audio, params.audio_channels, format="16bit_pcm", sampling_rate=sampling_rate)
Example No. 11
def train_audio():

    target_width = 4
    padded_input_width = 8 + 3 + 1
    batch_size = 8

    quantized_signal = np.mod(
        np.arange(1, padded_input_width * batch_size * 4), 6)
    # pad the beginning with ones
    quantized_signal = np.insert(quantized_signal,
                                 0,
                                 np.ones((padded_input_width, ),
                                         dtype=np.int32),
                                 axis=0)
    print quantized_signal

    for rep in xrange(50):
        for step in xrange(10):
            padded_signal_batch, target_batch = create_batch(
                quantized_signal, batch_size, padded_input_width, target_width)

            padded_onehot_batch = data.onehot_pixel_image(
                padded_signal_batch,
                quantized_channels=params.quantization_steps)

            # print padded_signal_batch[0, -1]
            # print padded_onehot_batch[0, :, 0, -1]
            # print target_batch[0, -1]

            output = wavenet.forward_causal_block(padded_onehot_batch)
            output = wavenet.slice_1d(output, 1)
            output, sum_skip_connections = wavenet.forward_residual_block(
                output)
            sum_skip_connections = wavenet.slice_1d(
                sum_skip_connections, output.data.shape[3] - target_width)
            output = wavenet.forward_softmax_block(sum_skip_connections,
                                                   softmax=False)
            loss = wavenet.cross_entropy(output, target_batch)
            wavenet.backprop(loss)

        loss = float(loss.data)
        print loss

    wavenet.save(args.model_dir)
Example No. 12
def train_audio(filename,
                batch_size=10,
                save_per_update=500,
                log_per_update=50,
                epochs=100):
    quantized_signal, sampling_rate = data.load_audio_file(
        filename, quantized_channels=params.audio_channels)

    # receptive field width for the top residual dilated conv layer
    # (the receptive field width follows automatically from the depth of the residual dilated conv block)
    receptive_steps = params.residual_conv_dilations[-1] * (
        params.residual_conv_kernel_width - 1)
    receptive_msec = int(receptive_steps * 1000.0 / sampling_rate)

    print "training", filename
    print "	sampling rate:", sampling_rate, "[Hz]"
    print "	receptive field width:", receptive_msec, "[millisecond]"
    print "	receptive field width:", receptive_steps, "[time step]"
    print "	batch_size:", batch_size
    print "	learning_rate:", params.learning_rate

    # compute required input width
    max_dilation = max(params.residual_conv_dilations)
    target_width = receptive_steps
    padded_input_width = receptive_steps + max_dilation * (
        params.residual_conv_kernel_width - 1)

    num_updates = 0
    total_updates = 0
    sum_loss = 0

    if padded_input_width * batch_size + 1 > quantized_signal.size:
        raise Exception("batch_size too large")

    # pad with zero
    quantized_signal = np.insert(quantized_signal,
                                 0,
                                 np.zeros((padded_input_width, ),
                                          dtype=np.int32),
                                 axis=0)

    max_batches = int(
        (quantized_signal.size - padded_input_width) / float(batch_size))

    for epoch in xrange(1, epochs + 1):
        print "epoch: {}/{}".format(epoch, epochs)
        for batch_index in xrange(1, max_batches + 1):
            # create batch
            padded_input_batch, target_batch = create_batch(
                quantized_signal, batch_size, padded_input_width, target_width)

            # convert to a 1xW image whose #channels equals the number of quantized audio channels
            # padded_x_batch.shape = (BATCHSIZE, CHANNELS(=audio channels), HEIGHT(=1), WIDTH(=receptive field))
            padded_x_batch = data.onehot_pixel_image(
                padded_input_batch, quantized_channels=params.audio_channels)

            # update weights
            loss = wavenet.loss(padded_x_batch, target_batch)
            wavenet.backprop(loss)

            # logging
            sum_loss += float(loss.data)
            total_updates += 1
            if batch_index % log_per_update == 0:
                print "	batch: {}/{} loss: {:.6f}".format(
                    batch_index, max_batches, sum_loss / float(log_per_update))
                sum_loss = 0

            # save the model
            if total_updates % save_per_update == 0:
                wavenet.save(dir=args.model_dir)

        wavenet.save(dir=args.model_dir)
    wavenet.save(dir=args.model_dir)
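
For reference, the padding arithmetic used in this example, worked through with an assumed dilation schedule. The dilations and kernel width below are illustrative assumptions, not the repository defaults.

# worked example of the padded_input_width computation, with assumed values
residual_conv_dilations = [1, 2, 4, 8]  # assumed
residual_conv_kernel_width = 2          # assumed

receptive_steps = residual_conv_dilations[-1] * (residual_conv_kernel_width - 1)        # 8 * 1 = 8
max_dilation = max(residual_conv_dilations)                                             # 8
padded_input_width = receptive_steps + max_dilation * (residual_conv_kernel_width - 1)  # 8 + 8 * 1 = 16
print receptive_steps, padded_input_width  # 8 16
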
Example No. 13
def train_audio(
    filename,
    batch_size=16,
    learnable_steps=16,
    save_per_update=500,
    train_steps_ratio=0.05,
):

    # load audio data
    path_to_file = args.wav_dir + "/" + filename
    quantized_signal, sampling_rate = data.load_audio_file(
        path_to_file, quantization_steps=params.quantization_steps)

    # compute receptive field width
    num_layers = len(params.residual_conv_channels)
    receptive_steps_per_unit = params.residual_conv_filter_width**num_layers
    receptive_steps = (receptive_steps_per_unit -
                       1) * params.residual_num_blocks + 1
    receptive_msec = int(receptive_steps * 1000.0 / sampling_rate)
    target_width = learnable_steps
    input_width = receptive_steps
    # to compute all learnable targets
    input_width += learnable_steps - 1
    ## padding for causal conv block
    input_width += len(params.causal_conv_channels)

    # for logging
    num_updates = 0
    total_updates = 0
    sum_loss_epoch = 0
    sum_loss = 0
    start_time = time.time()
    prev_average_loss = None
    max_batches = max(
        int((quantized_signal.size - input_width) / float(batch_size) *
            train_steps_ratio), 1)

    # print "training", filename
    # print "	sampling rate:", sampling_rate, "[Hz]"
    # print "	length:", quantized_signal.size, "[step]"
    # print "	batch_size:", batch_size
    # print "	learnable_steps:", learnable_steps

    # pad with zero
    quantized_signal = np.insert(quantized_signal,
                                 0,
                                 np.zeros((input_width, ), dtype=np.int32),
                                 axis=0)

    sum_loss_epoch = 0
    sum_loss = 0
    start_time = time.time()
    for batch_index in xrange(1, max_batches + 1):
        # create batch
        input_batch, target_batch = create_batch(quantized_signal, batch_size,
                                                 input_width, target_width)

        # convert to 1xW image whose #channels is equal to the quantization steps of audio
        # input_batch.shape = (BATCHSIZE, CHANNELS(=quantization_steps), HEIGHT(=1), WIDTH(=input_width))
        input_batch = data.onehot_pixel_image(
            input_batch, quantization_steps=params.quantization_steps)

        # training
        ## causal block
        output = wavenet.forward_causal_block(input_batch)
        ## remove causal padding
        output = wavenet.slice_1d(output, len(params.causal_conv_channels))
        ## residual dilated conv block
        output, sum_skip_connections = wavenet.forward_residual_block(output)
        ## remove unnecessary elements
        sum_skip_connections = wavenet.slice_1d(
            sum_skip_connections,
            sum_skip_connections.data.shape[3] - target_width)
        ## softmax block
        ## Note: do not apply F.softmax
        output = wavenet.forward_softmax_block(sum_skip_connections,
                                               softmax=False)
        ## compute cross entropy
        loss = wavenet.cross_entropy(output, target_batch)
        ## update weights
        wavenet.backprop(loss)

        # logging
        loss = float(loss.data)
        sum_loss_epoch += loss
        sum_loss += loss
        total_updates += 1

        # save the model
        if total_updates % save_per_update == 0:
            wavenet.save(dir=args.model_dir)

    wavenet.save(dir=args.model_dir)
    average_loss = sum_loss / float(max_batches)
    sys.stdout.flush()

    return average_loss
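
The step-by-step forward pass above (causal block, residual block, softmax block, cross entropy) is presumably what the single wavenet.loss(...) call in the other training examples wraps. A hypothetical wrapper, assuming the same method names, for comparison:

def compute_loss(wavenet, input_batch, target_batch, causal_padding, target_width):
    # hypothetical wrapper around the step-by-step pass used in this example
    out = wavenet.forward_causal_block(input_batch)
    out = wavenet.slice_1d(out, causal_padding)  # remove causal padding
    out, skips = wavenet.forward_residual_block(out)
    skips = wavenet.slice_1d(skips, skips.data.shape[3] - target_width)  # keep the last target_width steps
    out = wavenet.forward_softmax_block(skips, softmax=False)  # logits only; no F.softmax
    return wavenet.cross_entropy(out, target_batch)
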
Example No. 14
def train_audio():
    # compute required input width
    num_layers = len(params.residual_conv_channels)
    receptive_width_per_unit = params.residual_conv_filter_width**num_layers
    receptive_width = (receptive_width_per_unit -
                       1) * params.residual_num_blocks + 1
    # padding for causal conv block
    causal_padding = len(params.causal_conv_channels)

    # quantized_signal = np.mod(np.arange(1, 100), 6)
    quantized_signal = np.repeat(np.arange(0, 10), 100, axis=0)
    # quantized_signal = np.random.randint(0, params.quantization_steps, 1000)
    original_signal_width = quantized_signal.size
    quantized_signal = np.insert(quantized_signal,
                                 0,
                                 np.full((receptive_width + causal_padding, ),
                                         0,
                                         dtype=np.int32),
                                 axis=0)

    target_width = original_signal_width // 20
    batch_size = 2

    for epoch in xrange(100):
        sum_loss = 0
        for step in xrange(500):
            input_batch, target_batch = create_batch(
                quantized_signal, batch_size, receptive_width + causal_padding,
                target_width)

            # convert to 1xW image whose #channels is equal to the quantization steps of audio
            # input_batch.shape = (BATCHSIZE, CHANNELS(=quantization_steps), HEIGHT(=1), WIDTH(=input_width))
            input_batch = data.onehot_pixel_image(
                input_batch, quantization_steps=params.quantization_steps)

            # training
            ## causal block
            output = wavenet.forward_causal_block(input_batch)
            ## remove causal padding
            # output = wavenet.slice_1d(output, len(params.causal_conv_channels))
            ## residual dilated conv block
            output, sum_skip_connections = wavenet.forward_residual_block(
                output)
            ## remove unnecessary elements
            sum_skip_connections = wavenet.slice_1d(
                sum_skip_connections,
                sum_skip_connections.data.shape[3] - target_width)
            ## softmax block
            ## Note: do not apply F.softmax
            output = wavenet.forward_softmax_block(sum_skip_connections,
                                                   apply_softmax=False)

            ## compute cross entropy
            loss = wavenet.cross_entropy(output, target_batch)
            ## update weights
            wavenet.backprop(loss)
            sum_loss += float(loss.data)
        print epoch, sum_loss
        wavenet.save(args.model_dir)
Example No. 15
def train_audio(filename, batch_size=16, train_width=16, repeat=1000):
    # load audio data
    path_to_file = args.wav_dir + "/" + filename
    signals, sampling_rate = data.load_audio_file(
        path_to_file, quantization_steps=params.quantization_steps)

    # calculate receptive width
    num_layers = len(params.residual_conv_channels)
    receptive_width_per_unit = params.residual_conv_filter_width**num_layers
    receptive_width = (receptive_width_per_unit -
                       1) * params.residual_num_blocks + 1
    receptive_msec = int(receptive_width * 1000.0 / sampling_rate)

    # calculate required width
    input_width = receptive_width
    # add paddings of causal conv block
    input_width += len(params.causal_conv_channels)

    # for logging
    num_updates = 0
    total_updates = 0
    sum_loss = 0
    prev_average_loss = None

    # pad with silence signals
    signals = np.insert(signals,
                        0,
                        np.full((input_width, ), 127, dtype=np.int32),
                        axis=0)

    for batch_index in xrange(0, repeat):
        # create batch
        input_batch, target_batch = create_batch(signals, batch_size,
                                                 input_width, train_width)

        # convert to 1xW image whose #channels is equal to the quantization steps of audio
        # input_batch.shape = (BATCHSIZE, CHANNELS(=quantization_steps), HEIGHT(=1), WIDTH(=input_width))
        input_batch = data.onehot_pixel_image(
            input_batch, quantization_steps=params.quantization_steps)

        # training
        output = wavenet.forward_causal_block(input_batch)
        output, sum_skip_connections = wavenet.forward_residual_block(output)
        # remove unnecessary elements
        sum_skip_connections = wavenet.slice_1d(
            sum_skip_connections, sum_skip_connections.shape[3] - train_width)
        output = wavenet.forward_softmax_block(
            sum_skip_connections, apply_softmax=False)  # do not apply F.softmax
        loss = wavenet.compute_cross_entropy(output, target_batch)
        wavenet.backprop(loss)

        # logging
        sum_loss += float(loss.data)
        total_updates += 1

        if batch_index % 10 == 0:
            sys.stdout.write("\r	{} - {} width; {}/{}".format(
                stdout.BOLD + filename + stdout.END, signals.size, batch_index,
                repeat))
            sys.stdout.flush()

    wavenet.save(args.model_dir)
    return sum_loss