def generate_audio():
    # compute receptive field width
    learnable_steps = 1
    batch_size = 1
    num_layers = len(params.residual_conv_channels)
    receptive_steps_per_unit = params.residual_conv_filter_width ** num_layers
    receptive_steps = (receptive_steps_per_unit - 1) * params.residual_num_blocks + 1
    target_width = learnable_steps
    input_width = receptive_steps
    # to compute all learnable targets
    input_width += learnable_steps - 1
    ## padding for causal conv block
    input_width += len(params.causal_conv_channels)

    # quantized signals generated by WaveNet
    generated_quantized_audio = np.zeros((input_width, ), dtype=np.int32)

    # first pass: the standard forward implementation
    start = time.time()
    for time_step in xrange(9):
        # quantized signals in receptive field
        padded_quantized_x_batch = generated_quantized_audio[-input_width:].reshape((1, -1))
        # convert to image
        padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantization_steps=params.quantization_steps)
        # generate next signal
        softmax = wavenet.forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
        softmax = softmax[0, :, 0, -1]
        generated_quantized_signal = np.argmax(softmax)
        generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)
        print generated_quantized_signal,
    print generated_quantized_audio
    print time.time() - start

    # reset the caches used by the faster incremental implementation
    wavenet.prev_causal_outputs = None
    wavenet.prev_residual_outputs_out = None
    wavenet.prev_residual_outputs_z = None

    # second pass: the faster incremental forward implementation
    generated_quantized_audio = np.zeros((input_width, ), dtype=np.int32)
    start = time.time()
    for time_step in xrange(9):
        # quantized signals in receptive field
        padded_quantized_x_batch = generated_quantized_audio[-input_width:].reshape((1, -1))
        # convert to image
        padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantization_steps=params.quantization_steps)
        # generate next signal
        softmax = wavenet._forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
        softmax = softmax[0, :, 0, -1]
        generated_quantized_signal = np.argmax(softmax)
        generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)
        print generated_quantized_signal,
    print generated_quantized_audio
    print time.time() - start
def generate_audio():
    receptive_field_width_steps = 5
    batch_size = 1
    max_dilation = max(params.residual_conv_dilations)
    target_width = receptive_field_width_steps
    padded_input_width = receptive_field_width_steps + max_dilation

    # quantized signals generated by WaveNet
    generated_quantized_audio = np.mod(np.arange(1, padded_input_width + 1), 6).astype(np.int32)

    # first pass: the standard forward implementation
    start = time.time()
    for time_step in xrange(500):
        # quantized signals in receptive field
        padded_quantized_x_batch = generated_quantized_audio[-padded_input_width:].reshape((1, -1))
        # convert to image
        padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantized_channels=params.audio_channels)
        # generate next signal
        softmax = wavenet.forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
        softmax = softmax[0, :, 0, -1]
        generated_quantized_signal = np.argmax(softmax)
        generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)
    print generated_quantized_audio
    print time.time() - start

    # reset the caches used by the faster incremental implementation
    wavenet.prev_causal_outputs = None
    wavenet.prev_residual_outputs_out = None
    wavenet.prev_residual_outputs_z = None

    # second pass: the faster incremental forward implementation
    generated_quantized_audio = np.mod(np.arange(1, padded_input_width + 1), 6).astype(np.int32)
    start = time.time()
    for time_step in xrange(500):
        # quantized signals in receptive field
        padded_quantized_x_batch = generated_quantized_audio[-padded_input_width:].reshape((1, -1))
        # convert to image
        padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantized_channels=params.audio_channels)
        # generate next signal
        softmax = wavenet._forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
        softmax = softmax[0, :, 0, -1]
        generated_quantized_signal = np.argmax(softmax)
        generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)
    print generated_quantized_audio
    print time.time() - start
def generate_audio():
    receptive_field_width_steps = 5
    batch_size = 1
    max_dilation = max(params.residual_conv_dilations)
    target_width = receptive_field_width_steps
    padded_input_width = receptive_field_width_steps + max_dilation

    # quantized signals generated by WaveNet
    generated_quantized_audio = np.zeros((padded_input_width, ), dtype=np.int32)

    for time_step in xrange(200):
        # quantized signals in receptive field
        padded_quantized_x_batch = generated_quantized_audio[-padded_input_width:].reshape((1, -1))
        # convert to image
        padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantized_channels=params.audio_channels)
        # generate next signal
        softmax = wavenet.forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
        softmax = softmax[0, :, 0, -1]
        # sample from the softmax distribution instead of taking the argmax
        generated_quantized_signal = np.random.choice(np.arange(params.audio_channels), p=softmax)
        generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)
    print generated_quantized_audio
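# NOTE: `data.onehot_pixel_image` is called by every snippet here but its body is
# not shown. The following is a minimal sketch of the assumed behaviour, inferred
# from the shape comments in the training snippets below (input: an int array of
# shape (batch_size, width); output: a one-hot float array of shape
# (batch_size, quantization_steps, 1, width)). It is an illustration, not the
# repo's actual implementation.
def onehot_pixel_image_sketch(quantized_x_batch, quantization_steps=256):
    batch_size, width = quantized_x_batch.shape
    image = np.zeros((batch_size, quantization_steps, 1, width), dtype=np.float32)
    for b in xrange(batch_size):
        # set a 1 in the channel corresponding to each quantized value
        image[b, quantized_x_batch[b], 0, np.arange(width)] = 1.0
    return image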
def train_audio():
    target_width = 5
    padded_input_width = 9
    batch_size = 5
    # quantized_signal = np.mod(np.arange(1, 100), 6)
    quantized_signal = np.repeat(np.arange(0, 10), 100, axis=0)
    print quantized_signal
    for epoch in xrange(30):
        for step in xrange(100):
            padded_signal_batch, target_batch = create_batch(quantized_signal, batch_size, padded_input_width, target_width)
            padded_onehot_batch = data.onehot_pixel_image(padded_signal_batch, quantized_channels=params.quantization_steps)
            # print padded_signal_batch[0, -1]
            # print padded_onehot_batch[0, :, 0, -1]
            # print target_batch[0, -1]
            loss = wavenet.loss(padded_onehot_batch, target_batch)
            wavenet.backprop(loss)
            loss = float(loss.data)
            # dump the batch when the loss is suspiciously high
            if loss > 0.3:
                print padded_signal_batch
                print target_batch
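# NOTE: `create_batch` is used throughout but not defined in these snippets. A
# hypothetical sketch under the following assumption: each batch row is a random
# window of `input_width` consecutive samples, and the target row holds the next
# sample for each of the last `target_width` input positions (the standard
# next-step prediction setup). Names and behaviour are assumptions.
def create_batch_sketch(quantized_signal, batch_size, input_width, target_width):
    inputs = np.empty((batch_size, input_width), dtype=np.int32)
    targets = np.empty((batch_size, target_width), dtype=np.int32)
    for b in xrange(batch_size):
        start = np.random.randint(0, quantized_signal.size - input_width - 1)
        inputs[b] = quantized_signal[start:start + input_width]
        # the target for input position t is the sample at t + 1
        targets[b] = quantized_signal[start + input_width - target_width + 1:start + input_width + 1]
    return inputs, targets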
def main():
    # compute required input width
    num_layers = len(params.residual_conv_channels)
    receptive_steps_per_unit = params.residual_conv_filter_width ** num_layers
    receptive_steps = (receptive_steps_per_unit - 1) * params.residual_num_blocks + 1
    input_width = receptive_steps
    # padding for causal conv block
    input_width += len(params.causal_conv_channels)

    # quantized signals generated by WaveNet
    generated_quantized_audio = np.zeros((input_width, ), dtype=np.int32)

    for time_step in xrange(1000):
        # quantized signals in receptive field
        padded_quantized_x_batch = generated_quantized_audio[-input_width:].reshape((1, -1))
        # convert to image
        padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantization_steps=params.quantization_steps)
        # generate next signal
        softmax = wavenet.forward_one_step(padded_x_batch, apply_softmax=True, as_numpy=True)
        softmax = softmax[0, :, 0, -1]
        generated_quantized_signal = np.random.choice(np.arange(params.quantization_steps), p=softmax)
        generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)
        print generated_quantized_signal,
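# Worked example of the receptive-field arithmetic used above (the numbers are
# illustrative, not read from params): with residual_conv_filter_width = 2 and
# 10 residual layers per block, one block covers 2 ** 10 = 1024 steps; with
# residual_num_blocks = 2 the receptive field is (1024 - 1) * 2 + 1 = 2047
# steps, i.e. roughly 42.6 msec at a 48000 Hz sampling rate.
filter_width = 2
num_layers = 10
num_blocks = 2
receptive_steps_per_unit = filter_width ** num_layers              # 1024
receptive_steps = (receptive_steps_per_unit - 1) * num_blocks + 1  # 2047
print receptive_steps * 1000.0 / 48000                             # ~42.6 [msec]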
def train_audio():
    target_width = 5
    padded_input_width = 9
    batch_size = 2
    quantized_signal = np.mod(np.arange(1, padded_input_width * batch_size * 4), 6)
    print quantized_signal
    for rep in xrange(30):
        for pos in xrange(quantized_signal.size // (padded_input_width * batch_size)):
            for shift in xrange(padded_input_width):
                if (pos + 1) * padded_input_width * batch_size + shift + 1 < quantized_signal.size:
                    padded_signal_batch, target_batch = create_padded_batch(quantized_signal, batch_size, pos, shift, target_width, padded_input_width)
                    padded_onehot_batch = data.onehot_pixel_image(padded_signal_batch, quantized_channels=params.audio_channels)
                    # print padded_signal_batch[0, -1]
                    # print padded_onehot_batch[0, :, 0, -1]
                    # print target_batch[0, -1]
                    loss = wavenet.loss(padded_onehot_batch, target_batch)
                    wavenet.backprop(loss)
                    print float(loss.data)
    wavenet.save(args.model_dir)
def generate_audio(sampling_rate=48000, generate_sec=1, remove_silence_frames=False):
    batch_size = 1
    # compute required input width
    num_layers = len(params.residual_conv_channels)
    receptive_steps_per_unit = params.residual_conv_filter_width ** num_layers
    receptive_steps = (receptive_steps_per_unit - 1) * params.residual_num_blocks + 1
    input_width = receptive_steps
    # padding for causal conv block
    input_width += len(params.causal_conv_channels)

    # quantized signals generated by WaveNet
    generated_quantized_audio = np.zeros((input_width, ), dtype=np.int32)

    start_time = time.time()
    for time_step in xrange(1, int(sampling_rate * generate_sec)):
        # quantized signals in receptive field
        padded_quantized_x_batch = generated_quantized_audio[-input_width:].reshape((1, -1))
        # convert to image
        padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantization_steps=params.quantization_steps)
        # generate next signal
        if args.use_faster_wavenet:
            softmax = wavenet._forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
        else:
            softmax = wavenet.forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
        softmax = softmax[0, :, 0, -1]
        generated_quantized_signal = np.random.choice(np.arange(params.quantization_steps), p=softmax)
        if generated_quantized_signal == 0 and remove_silence_frames:
            pass
        else:
            generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)
        if time_step % 10 == 0:
            sys.stdout.write("\rgenerating {:.2f} msec / {:.2f} msec".format(time_step * 1000.0 / sampling_rate, generate_sec * 1000.0))
            sys.stdout.flush()
    print "\ndone in {:.3f} sec".format(time.time() - start_time)

    # remove zero paddings
    generated_quantized_audio = generated_quantized_audio[input_width:]

    try:
        os.mkdir(args.generate_dir)
    except:
        pass
    filename = "{}/generated.wav".format(args.generate_dir)
    data.save_audio_file(filename, generated_quantized_audio, params.quantization_steps, format="16bit_pcm", sampling_rate=sampling_rate)
def generate_audio(sampling_rate=48000, generate_sec=1, remove_silence_frames=False):
    # compute required input width
    num_layers = len(params.residual_conv_channels)
    receptive_steps_per_unit = params.residual_conv_filter_width ** num_layers
    receptive_steps = (receptive_steps_per_unit - 1) * params.residual_num_blocks + 1
    input_width = receptive_steps
    # add paddings of causal conv block
    input_width += len(params.causal_conv_channels)

    # pad with silence signals
    generated_signals = np.full((input_width, ), 127, dtype=np.int32)

    start_time = time.time()
    for time_step in xrange(1, int(sampling_rate * generate_sec)):
        # signals in receptive field
        input_signals = generated_signals[-input_width:].reshape((1, -1))
        # convert to image
        input_signals = data.onehot_pixel_image(input_signals, quantization_steps=params.quantization_steps)
        # generate next signal
        if args.fast:
            softmax = wavenet._forward_one_step(input_signals, apply_softmax=True, as_numpy=True)
        else:
            softmax = wavenet.forward_one_step(input_signals, apply_softmax=True, as_numpy=True)
        softmax = softmax[0, :, 0, -1]
        signal = np.random.choice(np.arange(params.quantization_steps), p=softmax)
        if signal == 127 and remove_silence_frames:
            pass
        else:
            generated_signals = np.append(generated_signals, [signal], axis=0)
        if time_step % 10 == 0:
            sys.stdout.write("\rgenerating {:.2f} msec / {:.2f} msec".format(time_step * 1000.0 / sampling_rate, generate_sec * 1000.0))
            sys.stdout.flush()
    print "\ndone in {:.3f} sec".format(time.time() - start_time)

    # remove paddings
    generated_signals = generated_signals[input_width:]

    try:
        os.mkdir(args.output_dir)
    except:
        pass
    filename = "{}/generated.wav".format(args.output_dir)
    data.save_audio_file(filename, generated_signals, params.quantization_steps, format="16bit_pcm", sampling_rate=sampling_rate)
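# NOTE: `data.save_audio_file` is not shown in these snippets. The sketch below
# is a guess at the decode it performs before writing 16-bit PCM: an inverse
# mu-law expansion of the quantized values. The function name, signature and the
# exact companding used are assumptions, not the repo's actual API.
import wave

def save_audio_file_sketch(filename, quantized_signals, quantization_steps=256, sampling_rate=48000):
    # map quantized values in [0, quantization_steps) back to [-1, 1]
    y = 2.0 * quantized_signals / (quantization_steps - 1.0) - 1.0
    # inverse mu-law: x = sign(y) * ((1 + mu)^|y| - 1) / mu
    mu = quantization_steps - 1.0
    waveform = np.sign(y) * ((1.0 + mu) ** np.abs(y) - 1.0) / mu
    pcm = (waveform * 32767).astype(np.int16)
    w = wave.open(filename, "wb")
    w.setnchannels(1)
    w.setsampwidth(2)  # 16-bit
    w.setframerate(sampling_rate)
    w.writeframes(pcm.tostring())
    w.close()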
def train_audio():
    # compute receptive field width
    learnable_steps = 1
    batch_size = 1
    num_layers = len(params.residual_conv_channels)
    receptive_steps_per_unit = params.residual_conv_filter_width ** num_layers
    receptive_steps = (receptive_steps_per_unit - 1) * params.residual_num_blocks + 1
    target_width = learnable_steps
    input_width = receptive_steps
    # to compute all learnable targets
    input_width += learnable_steps - 1
    ## padding for causal conv block
    input_width += len(params.causal_conv_channels)

    quantized_signal = np.mod(np.arange(1, input_width * 10), params.quantization_steps)
    print quantized_signal

    for rep in xrange(300):
        sum_loss = 0
        for train in xrange(50):
            # create batch
            input_batch, target_batch = create_batch(quantized_signal, batch_size, input_width, target_width)

            # convert to 1xW image whose #channels is equal to the quantization steps of audio
            # input_batch.shape = (BATCHSIZE, CHANNELS(=quantization_steps), HEIGHT(=1), WIDTH(=input_width))
            input_batch = data.onehot_pixel_image(input_batch, quantization_steps=params.quantization_steps)

            # training
            ## causal block
            output = wavenet.forward_causal_block(input_batch)
            ## remove causal padding
            output = wavenet.slice_1d(output, len(params.causal_conv_channels))
            ## residual dilated conv block
            output, sum_skip_connections = wavenet.forward_residual_block(output)
            ## remove unnecessary elements
            sum_skip_connections = wavenet.slice_1d(sum_skip_connections, sum_skip_connections.data.shape[3] - target_width)
            ## softmax block
            ## Note: do not apply F.softmax
            output = wavenet.forward_softmax_block(sum_skip_connections, softmax=False)
            ## compute cross entropy
            loss = wavenet.cross_entropy(output, target_batch)
            ## update weights
            wavenet.backprop(loss)

            sum_loss += float(loss.data)
        print sum_loss / 50.0
        wavenet.save(args.model_dir)
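# NOTE: `wavenet.slice_1d(x, cut)` appears above wherever leading elements must
# be dropped along the width axis (removing the causal padding, or keeping only
# the last target_width positions). A minimal sketch of the assumed numpy-level
# semantics; the real method presumably operates on Chainer variables:
def slice_1d_sketch(x, cut):
    # x.shape = (batch, channels, 1, width); drop the first `cut` columns
    if cut == 0:
        return x
    return x[:, :, :, cut:]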
def generate_audio(receptive_field_width_ms=25, sampling_rate=48000, generate_duration_sec=1):
    # e.g. 48000 Hz * 0.025 sec = 1200 time steps (= 25 millisecond receptive field)
    receptive_steps = int(sampling_rate * receptive_field_width_ms / 1000.0)

    # compute required input width
    batch_size = 1
    max_dilation = max(params.residual_conv_dilations)
    target_width = receptive_steps
    padded_input_width = receptive_steps + max_dilation * (params.residual_conv_kernel_width - 1)

    # quantized signals generated by WaveNet
    generated_quantized_audio = np.zeros((padded_input_width, ), dtype=np.int32)

    start_time = time.time()
    for time_step in xrange(1, int(sampling_rate * generate_duration_sec)):
        # quantized signals in receptive field
        padded_quantized_x_batch = generated_quantized_audio[-padded_input_width:].reshape((1, -1))
        # convert to image
        padded_x_batch = data.onehot_pixel_image(padded_quantized_x_batch, quantized_channels=params.audio_channels)
        # generate next signal
        if args.use_faster_wavenet:
            softmax = wavenet._forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
        else:
            softmax = wavenet.forward_one_step(padded_x_batch, softmax=True, return_numpy=True)
        softmax = softmax[0, :, 0, -1]
        generated_quantized_signal = np.random.choice(np.arange(params.audio_channels), p=softmax)
        generated_quantized_audio = np.append(generated_quantized_audio, [generated_quantized_signal], axis=0)
        if time_step % 10 == 0:
            sys.stdout.write("\rgenerating {:.2f} msec / {:.2f} msec".format(time_step * 1000.0 / sampling_rate, generate_duration_sec * 1000.0))
            sys.stdout.flush()
    print "\ndone in {:.3f} sec".format(time.time() - start_time)

    # remove zero paddings
    generated_quantized_audio = generated_quantized_audio[padded_input_width:]

    try:
        os.mkdir(args.generate_dir)
    except:
        pass
    filename = "{}/generated.wav".format(args.generate_dir)
    data.save_audio_file(filename, generated_quantized_audio, params.audio_channels, format="16bit_pcm", sampling_rate=sampling_rate)
def train_audio():
    target_width = 4
    padded_input_width = 8 + 3 + 1
    batch_size = 8
    quantized_signal = np.mod(np.arange(1, padded_input_width * batch_size * 4), 6)
    # pad the beginning with ones
    quantized_signal = np.insert(quantized_signal, 0, np.ones((padded_input_width, ), dtype=np.int32), axis=0)
    print quantized_signal
    for rep in xrange(50):
        for step in xrange(10):
            padded_signal_batch, target_batch = create_batch(quantized_signal, batch_size, padded_input_width, target_width)
            padded_onehot_batch = data.onehot_pixel_image(padded_signal_batch, quantized_channels=params.quantization_steps)
            # print padded_signal_batch[0, -1]
            # print padded_onehot_batch[0, :, 0, -1]
            # print target_batch[0, -1]
            output = wavenet.forward_causal_block(padded_onehot_batch)
            output = wavenet.slice_1d(output, 1)
            output, sum_skip_connections = wavenet.forward_residual_block(output)
            sum_skip_connections = wavenet.slice_1d(sum_skip_connections, output.data.shape[3] - target_width)
            output = wavenet.forward_softmax_block(sum_skip_connections, softmax=False)
            loss = wavenet.cross_entropy(output, target_batch)
            wavenet.backprop(loss)
            loss = float(loss.data)
            print loss
    wavenet.save(args.model_dir)
def train_audio(filename, batch_size=10, save_per_update=500, log_per_update=50, epochs=100):
    quantized_signal, sampling_rate = data.load_audio_file(filename, quantized_channels=params.audio_channels)

    # receptive field width for the top residual dilated conv layer
    # it is determined automatically by the depth of the residual dilated conv block
    receptive_steps = params.residual_conv_dilations[-1] * (params.residual_conv_kernel_width - 1)
    receptive_msec = int(receptive_steps * 1000.0 / sampling_rate)

    print "training", filename
    print " sampling rate:", sampling_rate, "[Hz]"
    print " receptive field width:", receptive_msec, "[millisecond]"
    print " receptive field width:", receptive_steps, "[time step]"
    print " batch_size:", batch_size
    print " learning_rate:", params.learning_rate

    # compute required input width
    max_dilation = max(params.residual_conv_dilations)
    target_width = receptive_steps
    padded_input_width = receptive_steps + max_dilation * (params.residual_conv_kernel_width - 1)

    num_updates = 0
    total_updates = 0
    sum_loss = 0

    if padded_input_width * batch_size + 1 > quantized_signal.size:
        raise Exception("batch_size too large")

    # pad with zero
    quantized_signal = np.insert(quantized_signal, 0, np.zeros((padded_input_width, ), dtype=np.int32), axis=0)

    max_batches = int((quantized_signal.size - padded_input_width) / float(batch_size))
    for epoch in xrange(1, epochs + 1):
        print "epoch: {}/{}".format(epoch, epochs)
        for batch_index in xrange(1, max_batches + 1):
            # create batch
            padded_input_batch, target_batch = create_batch(quantized_signal, batch_size, padded_input_width, target_width)

            # convert to a 1xW image whose #channels equals the quantized audio_channels
            # padded_x_batch.shape = (BATCHSIZE, CHANNELS(=audio channels), HEIGHT(=1), WIDTH(=receptive field))
            padded_x_batch = data.onehot_pixel_image(padded_input_batch, quantized_channels=params.audio_channels)

            # update weights
            loss = wavenet.loss(padded_x_batch, target_batch)
            wavenet.backprop(loss)

            # logging
            sum_loss += float(loss.data)
            total_updates += 1
            if batch_index % log_per_update == 0:
                print " batch: {}/{} loss: {:.6f}".format(batch_index, max_batches, sum_loss / float(log_per_update))
                sum_loss = 0

            # save the model
            if total_updates % save_per_update == 0:
                wavenet.save(dir=args.model_dir)
        wavenet.save(dir=args.model_dir)
    wavenet.save(dir=args.model_dir)
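# Worked example of the dilation-based receptive-field arithmetic above (the
# dilation schedule is illustrative, not read from params): with
# residual_conv_dilations = [1, 2, 4, ..., 512] and a kernel width of 2,
# receptive_steps = 512 * (2 - 1) = 512 steps, about 10.7 msec at 48000 Hz,
# and padded_input_width = 512 + 512 * (2 - 1) = 1024 steps.
dilations = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
kernel_width = 2
receptive_steps = dilations[-1] * (kernel_width - 1)                        # 512
padded_input_width = receptive_steps + max(dilations) * (kernel_width - 1)  # 1024
print receptive_steps * 1000.0 / 48000                                      # ~10.7 [msec]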
def train_audio(filename, batch_size=16, learnable_steps=16, save_per_update=500, train_steps_ratio=0.05):
    # load audio data
    path_to_file = args.wav_dir + "/" + filename
    quantized_signal, sampling_rate = data.load_audio_file(path_to_file, quantization_steps=params.quantization_steps)

    # compute receptive field width
    num_layers = len(params.residual_conv_channels)
    receptive_steps_per_unit = params.residual_conv_filter_width ** num_layers
    receptive_steps = (receptive_steps_per_unit - 1) * params.residual_num_blocks + 1
    receptive_msec = int(receptive_steps * 1000.0 / sampling_rate)

    target_width = learnable_steps
    input_width = receptive_steps
    # to compute all learnable targets
    input_width += learnable_steps - 1
    ## padding for causal conv block
    input_width += len(params.causal_conv_channels)

    # for logging
    num_updates = 0
    total_updates = 0
    sum_loss_epoch = 0
    sum_loss = 0
    start_time = time.time()
    prev_average_loss = None

    max_batches = max(int((quantized_signal.size - input_width) / float(batch_size) * train_steps_ratio), 1)

    # print "training", filename
    # print " sampling rate:", sampling_rate, "[Hz]"
    # print " length:", quantized_signal.size, "[step]"
    # print " batch_size:", batch_size
    # print " learnable_steps:", learnable_steps

    # pad with zero
    quantized_signal = np.insert(quantized_signal, 0, np.zeros((input_width, ), dtype=np.int32), axis=0)

    for batch_index in xrange(1, max_batches + 1):
        # create batch
        input_batch, target_batch = create_batch(quantized_signal, batch_size, input_width, target_width)

        # convert to 1xW image whose #channels is equal to the quantization steps of audio
        # input_batch.shape = (BATCHSIZE, CHANNELS(=quantization_steps), HEIGHT(=1), WIDTH(=input_width))
        input_batch = data.onehot_pixel_image(input_batch, quantization_steps=params.quantization_steps)

        # training
        ## causal block
        output = wavenet.forward_causal_block(input_batch)
        ## remove causal padding
        output = wavenet.slice_1d(output, len(params.causal_conv_channels))
        ## residual dilated conv block
        output, sum_skip_connections = wavenet.forward_residual_block(output)
        ## remove unnecessary elements
        sum_skip_connections = wavenet.slice_1d(sum_skip_connections, sum_skip_connections.data.shape[3] - target_width)
        ## softmax block
        ## Note: do not apply F.softmax
        output = wavenet.forward_softmax_block(sum_skip_connections, softmax=False)
        ## compute cross entropy
        loss = wavenet.cross_entropy(output, target_batch)
        ## update weights
        wavenet.backprop(loss)

        # logging
        loss = float(loss.data)
        sum_loss_epoch += loss
        sum_loss += loss
        total_updates += 1

        # save the model
        if total_updates % save_per_update == 0:
            wavenet.save(dir=args.model_dir)

    wavenet.save(dir=args.model_dir)

    average_loss = sum_loss / float(max_batches)
    sys.stdout.flush()
    return average_loss
def train_audio():
    # compute required input width
    num_layers = len(params.residual_conv_channels)
    receptive_width_per_unit = params.residual_conv_filter_width ** num_layers
    receptive_width = (receptive_width_per_unit - 1) * params.residual_num_blocks + 1
    # padding for causal conv block
    causal_padding = len(params.causal_conv_channels)

    # quantized_signal = np.mod(np.arange(1, 100), 6)
    quantized_signal = np.repeat(np.arange(0, 10), 100, axis=0)
    # quantized_signal = np.random.randint(0, params.quantization_steps, 1000)
    original_signal_width = quantized_signal.size
    quantized_signal = np.insert(quantized_signal, 0, np.full((receptive_width + causal_padding, ), 0, dtype=np.int32), axis=0)
    target_width = original_signal_width // 20
    batch_size = 2

    for epoch in xrange(100):
        sum_loss = 0
        for step in xrange(500):
            input_batch, target_batch = create_batch(quantized_signal, batch_size, receptive_width + causal_padding, target_width)

            # convert to 1xW image whose #channels is equal to the quantization steps of audio
            # input_batch.shape = (BATCHSIZE, CHANNELS(=quantization_steps), HEIGHT(=1), WIDTH(=input_width))
            input_batch = data.onehot_pixel_image(input_batch, quantization_steps=params.quantization_steps)

            # training
            ## causal block
            output = wavenet.forward_causal_block(input_batch)
            ## remove causal padding
            # output = wavenet.slice_1d(output, len(params.causal_conv_channels))
            ## residual dilated conv block
            output, sum_skip_connections = wavenet.forward_residual_block(output)
            ## remove unnecessary elements
            sum_skip_connections = wavenet.slice_1d(sum_skip_connections, sum_skip_connections.data.shape[3] - target_width)
            ## softmax block
            ## Note: do not apply F.softmax
            output = wavenet.forward_softmax_block(sum_skip_connections, apply_softmax=False)
            ## compute cross entropy
            loss = wavenet.cross_entropy(output, target_batch)
            ## update weights
            wavenet.backprop(loss)

            sum_loss += float(loss.data)
        print epoch, sum_loss
        wavenet.save(args.model_dir)
def train_audio(filename, batch_size=16, train_width=16, repeat=1000):
    # load audio data
    path_to_file = args.wav_dir + "/" + filename
    signals, sampling_rate = data.load_audio_file(path_to_file, quantization_steps=params.quantization_steps)

    # calculate receptive width
    num_layers = len(params.residual_conv_channels)
    receptive_width_per_unit = params.residual_conv_filter_width ** num_layers
    receptive_width = (receptive_width_per_unit - 1) * params.residual_num_blocks + 1
    receptive_msec = int(receptive_width * 1000.0 / sampling_rate)

    # calculate required width
    input_width = receptive_width
    # add paddings of causal conv block
    input_width += len(params.causal_conv_channels)

    # for logging
    num_updates = 0
    total_updates = 0
    sum_loss = 0
    prev_average_loss = None

    # pad with silence signals
    signals = np.insert(signals, 0, np.full((input_width, ), 127, dtype=np.int32), axis=0)

    for batch_index in xrange(0, repeat):
        # create batch
        input_batch, target_batch = create_batch(signals, batch_size, input_width, train_width)

        # convert to 1xW image whose #channels is equal to the quantization steps of audio
        # input_batch.shape = (BATCHSIZE, CHANNELS(=quantization_steps), HEIGHT(=1), WIDTH(=input_width))
        input_batch = data.onehot_pixel_image(input_batch, quantization_steps=params.quantization_steps)

        # training
        output = wavenet.forward_causal_block(input_batch)
        output, sum_skip_connections = wavenet.forward_residual_block(output)
        # remove unnecessary elements
        sum_skip_connections = wavenet.slice_1d(sum_skip_connections, sum_skip_connections.shape[3] - train_width)
        # do not apply F.softmax
        output = wavenet.forward_softmax_block(sum_skip_connections, apply_softmax=False)
        loss = wavenet.compute_cross_entropy(output, target_batch)
        wavenet.backprop(loss)

        # logging
        sum_loss += float(loss.data)
        total_updates += 1
        if batch_index % 10 == 0:
            sys.stdout.write("\r {} - {} width; {}/{}".format(stdout.BOLD + filename + stdout.END, signals.size, batch_index, repeat))
            sys.stdout.flush()

    wavenet.save(args.model_dir)
    return sum_loss
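# A hypothetical driver for the trainer above, assuming args.wav_dir holds the
# training wav files. The loop structure, epoch count and log format below are
# assumptions for illustration, not repo code:
def run_training_sketch(epochs=100):
    files = [f for f in os.listdir(args.wav_dir) if f.endswith(".wav")]
    for epoch in xrange(1, epochs + 1):
        for filename in files:
            loss = train_audio(filename, batch_size=16, train_width=16, repeat=1000)
            print "\nepoch {} - {} - loss: {:.6f}".format(epoch, filename, loss)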