def load_labels(filelist, predictions, fps, datadir):
    labels = []
    for fn in filelist:
        ffn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(ffn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(predictions[fn])) / float(fps)
        labels.append(create_aligned_targets(segments, timestamps, np.bool))
    return labels
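# Illustrative sketch only (not the repo's implementation): the .lab files
# read above contain one "start end label" triple per line, and
# create_aligned_targets presumably turns those segments into one boolean
# target per spectrogram frame. A minimal stand-in with that behaviour:
import numpy as np

def sketch_aligned_targets(segments, timestamps, dtype=bool):
    # one target per frame timestamp, default False (no singing)
    targets = np.zeros(len(timestamps), dtype=dtype)
    for start, end, is_singing in segments:
        if is_singing:
            # mark all frames whose timestamp falls inside a 'sing' segment
            targets[(timestamps >= start) & (timestamps < end)] = True
    return targets

# example: a 3-second file at fps=70 with singing from 1.0 s to 2.5 s
# sketch_aligned_targets([(0.0, 1.0, False), (1.0, 2.5, True),
#                         (2.5, 3.0, False)], np.arange(3 * 70) / 70.0)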
def prepare_audio_batches(self, sample_rate, frame_len, fps, blocklen,
                          batchsize, batch_data=True):
    spects = []
    for fn in progress(self.filelist, 'File'):
        cache_fn = (self.featuredir
                    and os.path.join(self.featuredir, fn + '.npy'))
        spects.append(
            cached(cache_fn, audio.read_ffmpeg,
                   os.path.join(self.datadir, 'audio', fn), sample_rate))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(self.filelist, spects):
        fn = os.path.join(self.datadir, 'labels',
                          fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(sample_rate)
        labels.append(create_aligned_targets(segments, timestamps, np.bool))

    if batch_data:
        batches = augment.grab_random_audio_excerpts(
            spects, labels, batchsize, sample_rate, frame_len, fps, blocklen)
        return batches
    else:
        return spects, labels
def prepare_batches(self, sample_rate, frame_len, fps, mel_bands, mel_min,
                    mel_max, blocklen, batchsize, batch_data=True):
    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    spects = []
    for fn in progress(self.filelist, 'File'):
        cache_fn = (self.featuredir
                    and os.path.join(self.featuredir, fn + '.npy'))
        spects.append(
            cached(cache_fn, audio.extract_spect,
                   os.path.join(self.datadir, 'audio', fn),
                   sample_rate, frame_len, fps))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(self.filelist, spects):
        fn = os.path.join(self.datadir, 'labels',
                          fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        labels.append(create_aligned_targets(segments, timestamps, np.bool))

    if self.input_type == 'stft':
        print('Create dataset with stft output')
        if batch_data:
            batches = augment.grab_random_excerpts(spects, labels, batchsize,
                                                   blocklen)
            return batches
        else:
            return spects, labels

    if self.input_type == 'mel_spects' or self.input_type == 'mel_spects_norm':
        # - prepare mel filterbank
        filterbank = audio.create_mel_filterbank(sample_rate, frame_len,
                                                 mel_bands, mel_min, mel_max)
        filterbank = filterbank[:bin_mel_max].astype(floatX)

        # - precompute mel spectra, if needed, otherwise just define a generator
        mel_spects = (np.log(np.maximum(np.dot(spect[:, :bin_mel_max],
                                               filterbank), 1e-7))
                      for spect in spects)
        if not self.augment:
            mel_spects = list(mel_spects)
            del spects

        # - load mean/std or compute it, if not computed yet
        meanstd_file = os.path.join(os.path.dirname(__file__),
                                    '%s_meanstd.npz' % self.dataset)
        try:
            with np.load(meanstd_file) as f:
                mean = f['mean']
                std = f['std']
        except (IOError, KeyError):
            print("Computing mean and standard deviation...")
            mean, std = znorm.compute_mean_std(mel_spects)
            np.savez(meanstd_file, mean=mean, std=std)
        mean = mean.astype(floatX)
        istd = np.reciprocal(std).astype(floatX)

        # - prepare training data generator
        print("Preparing training data feed...")
        if not self.augment:
            # Without augmentation, we just precompute the normalized mel
            # spectra and create a generator that returns mini-batches of
            # random excerpts
            if self.input_type == 'mel_spects':
                print('Creating batches of mel spects without znorm')
                if batch_data:
                    batches = augment.grab_random_excerpts(
                        mel_spects, labels, batchsize, blocklen)
                else:
                    pad = np.tile(np.log(1e-7).astype(floatX),
                                  (blocklen // 2, 80))
                    mel_spects = (np.concatenate((pad, spect, pad), axis=0)
                                  for spect in mel_spects)
                    mel_spects = list(mel_spects)
                    return mel_spects, labels
            elif self.input_type == 'mel_spects_norm':
                print('Creating batches of mel spects with znorm')
                if batch_data:
                    mel_spects = [(spect - mean) * istd
                                  for spect in mel_spects]
                    batches = augment.grab_random_excerpts(
                        mel_spects, labels, batchsize, blocklen)
                else:
                    pad = np.tile(np.log(1e-7).astype(floatX),
                                  (blocklen // 2, 80))
                    mel_spects = (np.concatenate((pad, spect, pad), axis=0)
                                  for spect in mel_spects)
                    mel_spects = [(spect - mean) * istd
                                  for spect in mel_spects]
                    mel_spects = list(mel_spects)
                    return mel_spects, labels
            else:
                print('Creating batches of stfts')
                batches = augment.grab_random_excerpts(spects, labels,
                                                       batchsize, blocklen)
        else:
            # For time stretching and pitch shifting, it pays off to preapply
            # the spline filter to each input spectrogram, so it does not
            # need to be applied to each mini-batch later.
            spline_order = cfg['spline_order']
            if spline_order > 1:
                from scipy.ndimage import spline_filter
                spects = [spline_filter(spect, spline_order).astype(floatX)
                          for spect in spects]

            # We define a function to create the mini-batch generator. This
            # allows us to easily create multiple generators for
            # multithreading if needed.
            def create_datafeed(spects, labels):
                # With augmentation, as we want to apply random
                # time-stretching, we request longer excerpts than we
                # finally need to return.
                max_stretch = cfg['max_stretch']
                batches = augment.grab_random_excerpts(
                    spects, labels, batchsize=batchsize,
                    frames=int(blocklen / (1 - max_stretch)))

                # We wrap the generator in another one that applies random
                # time stretching and pitch shifting, keeping a given number
                # of frames and bins only.
                max_shift = cfg['max_shift']
                batches = augment.apply_random_stretch_shift(
                    batches, max_stretch, max_shift, keep_frames=blocklen,
                    keep_bins=bin_mel_max, order=spline_order,
                    prefiltered=True)

                # We transform the excerpts to mel frequency and log magnitude.
                batches = augment.apply_filterbank(batches, filterbank)
                batches = augment.apply_logarithm(batches)

                # We apply random frequency filters
                max_db = cfg['max_db']
                batches = augment.apply_random_filters(batches, filterbank,
                                                       mel_max, max_db=max_db)

                # We apply normalization
                batches = augment.apply_znorm(batches, mean, istd)
                return batches

            # We start the mini-batch generator and augmenter in one or more
            # background threads or processes (unless disabled).
            bg_threads = cfg['bg_threads']
            bg_processes = cfg['bg_processes']
            if not bg_threads and not bg_processes:
                # no background processing: just create a single generator
                batches = create_datafeed(spects, labels)
            elif bg_threads:
                # multithreading: create a separate generator per thread
                batches = augment.generate_in_background(
                    [create_datafeed(spects, labels)
                     for _ in range(bg_threads)],
                    num_cached=bg_threads * 5)
            elif bg_processes:
                # multiprocessing: single generator is forked along with
                # the processes
                batches = augment.generate_in_background(
                    [create_datafeed(spects, labels)] * bg_processes,
                    num_cached=bg_processes * 25, in_processes=True)

    return batches
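# Illustrative sketch only: augment.grab_random_excerpts is assumed (from how
# it is used above and in the validation code below) to yield an endless
# stream of (batchsize, blocklen, bins) excerpt arrays paired with the label
# of each excerpt's center frame. A minimal stand-in with that interface,
# assuming every file is longer than blocklen frames:
import numpy as np

def sketch_grab_random_excerpts(spects, labels, batchsize, blocklen):
    rng = np.random.RandomState(42)
    while True:
        data, targets = [], []
        for _ in range(batchsize):
            idx = rng.randint(len(spects))                   # pick a file
            pos = rng.randint(len(spects[idx]) - blocklen)   # pick an excerpt
            data.append(spects[idx][pos:pos + blocklen])
            targets.append(labels[idx][pos + blocklen // 2])  # center label
        yield np.stack(data), np.asarray(targets)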
def main():
    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile

    sample_rate = 22050
    frame_len = 1024
    fps = 70
    mel_bands = 80
    mel_min = 27.5
    mel_max = 8000
    blocklen = 115
    batchsize = 32

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'train')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]

    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        cache_fn = (options.cache_spectra and
                    os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(cached(cache_fn, audio.extract_spect,
                             os.path.join(datadir, 'audio', fn),
                             sample_rate, frame_len, fps))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        labels.append(create_aligned_targets(segments, timestamps, np.bool))

    # - prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len,
                                             mel_bands, mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    # - precompute mel spectra, if needed, otherwise just define a generator
    mel_spects = (np.log(np.maximum(np.dot(spect[:, :bin_mel_max],
                                           filterbank), 1e-7))
                  for spect in spects)
    if not options.augment:
        mel_spects = list(mel_spects)
        del spects

    # - load mean/std or compute it, if not computed yet
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    try:
        with np.load(meanstd_file) as f:
            mean = f['mean']
            std = f['std']
    except (IOError, KeyError):
        print("Computing mean and standard deviation...")
        mean, std = znorm.compute_mean_std(mel_spects)
        np.savez(meanstd_file, mean=mean, std=std)
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    # - prepare training data generator
    print("Preparing training data feed...")
    if not options.augment:
        # Without augmentation, we just precompute the normalized mel spectra
        # and create a generator that returns mini-batches of random excerpts
        mel_spects = [(spect - mean) * istd for spect in mel_spects]
        batches = augment.grab_random_excerpts(
            mel_spects, labels, batchsize, blocklen)
    else:
        # For time stretching and pitch shifting, it pays off to preapply the
        # spline filter to each input spectrogram, so it does not need to be
        # applied to each mini-batch later.
        spline_order = 2
        if spline_order > 1:
            from scipy.ndimage import spline_filter
            spects = [spline_filter(spect, spline_order).astype(floatX)
                      for spect in spects]

        # We define a function to create the mini-batch generator. This allows
        # us to easily create multiple generators for multithreading if needed.
        def create_datafeed(spects, labels):
            # With augmentation, as we want to apply random time-stretching,
            # we request longer excerpts than we finally need to return.
            max_stretch = .3
            batches = augment.grab_random_excerpts(
                spects, labels, batchsize=batchsize,
                frames=int(blocklen / (1 - max_stretch)))

            # We wrap the generator in another one that applies random time
            # stretching and pitch shifting, keeping a given number of frames
            # and bins only.
            max_shift = .3
            batches = augment.apply_random_stretch_shift(
                batches, max_stretch, max_shift, keep_frames=blocklen,
                keep_bins=bin_mel_max, order=spline_order, prefiltered=True)

            # We transform the excerpts to mel frequency and log magnitude.
            batches = augment.apply_filterbank(batches, filterbank)
            batches = augment.apply_logarithm(batches)

            # We apply random frequency filters
            batches = augment.apply_random_filters(batches, filterbank,
                                                   mel_max, max_db=10)

            # We apply normalization
            batches = augment.apply_znorm(batches, mean, istd)
            return batches

        # We start the mini-batch generator and augmenter in one or more
        # background threads or processes (unless disabled).
        bg_threads = 3
        bg_processes = 0
        if not bg_threads and not bg_processes:
            # no background processing: just create a single generator
            batches = create_datafeed(spects, labels)
        elif bg_threads:
            # multithreading: create a separate generator per thread
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels) for _ in range(bg_threads)],
                num_cached=bg_threads * 5)
        elif bg_processes:
            # multiprocessing: single generator is forked along with processes
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels)] * bg_processes,
                num_cached=bg_processes * 25, in_processes=True)

    print("Preparing training function...")
    # instantiate neural network
    input_var = T.tensor3('input')
    inputs = input_var.dimshuffle(0, 'x', 1, 2)  # insert "channels" dimension
    network = model.architecture(inputs, (None, 1, blocklen, mel_bands))

    # create cost expression
    target_var = T.vector('targets')
    targets = (0.02 + 0.96 * target_var)  # map 0 -> 0.02, 1 -> 0.98
    targets = targets.dimshuffle(0, 'x')  # turn into column vector
    outputs = lasagne.layers.get_output(network, deterministic=False)
    cost = T.mean(lasagne.objectives.binary_crossentropy(outputs, targets))

    # prepare and compile training function
    params = lasagne.layers.get_all_params(network, trainable=True)
    initial_eta = 0.01
    eta_decay = 0.85
    momentum = 0.95
    eta = theano.shared(lasagne.utils.floatX(initial_eta))
    updates = lasagne.updates.nesterov_momentum(cost, params, eta, momentum)
    print("Compiling training function...")
    train_fn = theano.function([input_var, target_var], cost, updates=updates)

    # run training loop
    print("Training:")
    epochs = 20
    epochsize = 2000
    batches = iter(batches)
    for epoch in range(epochs):
        err = 0
        for batch in progress(
                range(epochsize), min_delay=.5,
                desc='Epoch %d/%d: Batch ' % (epoch + 1, epochs)):
            err += train_fn(*next(batches))
            if not np.isfinite(err):
                print("\nEncountered NaN loss in training. Aborting.")
                sys.exit(1)
        print("Train loss: %.3f" % (err / epochsize))
        eta.set_value(eta.get_value() * lasagne.utils.floatX(eta_decay))

    # save final network
    print("Saving final model")
    np.savez(modelfile, **{'param%d' % i: p for i, p in enumerate(
            lasagne.layers.get_all_param_values(network))})
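# Worked numbers for the hard-coded constants used above (orientation only):
#   bin_nyquist = 1024 // 2 + 1 = 513 non-redundant STFT bins (up to 11025 Hz)
#   bin_mel_max = 513 * 2 * 8000 // 22050 = 372 bins kept (up to ~8 kHz)
#   with max_stretch = .3, excerpts of int(115 / 0.7) = 164 frames are grabbed,
#   so a blocklen of 115 frames still survives the strongest time-stretch.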
def main():
    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    outdir = options.outdir
    if options.load_spectra != 'memory' and not options.cache_spectra:
        parser.error('option --load-spectra=%s requires --cache-spectra' %
                     options.load_spectra)

    # read configuration files and immediate settings
    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))

    # read some settings into local variables
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    filelist = []
    ranges = {}
    for part in 'train', 'valid', 'test':
        a = len(filelist)
        with io.open(os.path.join(datadir, 'filelists',
                                  cfg.get('filelist.%s' % part, part))) as f:
            filelist.extend(l.rstrip() for l in f if l.rstrip())
        ranges[part] = slice(a, len(filelist))

    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        cache_fn = (options.cache_spectra and
                    os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(cached(cache_fn, audio.extract_spect,
                             os.path.join(datadir, 'audio', fn),
                             sample_rate, frame_len, fps,
                             loading_mode=options.load_spectra))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        labels.append(create_aligned_targets(segments, timestamps, np.bool))

    # compute and save different variants of summarized magnitudes
    print("Saving files...")
    # - ground truth
    outfile = os.path.join(outdir, '%s_gt.pkl' % options.dataset)
    print(outfile)
    with io.open(outfile, 'wb') as f:
        pickle.dump({'labels': labels, 'splits': ranges}, f, protocol=-1)
    # - summarized spectra
    save_spectral_sums(
        os.path.join(outdir, '%s_spect_sum.pkl' % options.dataset), spects)
    # - summarized mel spectra
    bank = audio.create_mel_filterbank(sample_rate, frame_len, mel_bands,
                                       mel_min, mel_max).astype(np.float32)
    spects = [np.dot(spect[:, ], bank) for spect in spects]
    save_spectral_sums(
        os.path.join(outdir, '%s_spect_mel_sum.pkl' % options.dataset), spects)
    # - summarized log-mel spectra
    spects = [np.log(np.maximum(1e-7, spect)) for spect in spects]
    save_spectral_sums(
        os.path.join(outdir, '%s_spect_mel_log_sum.pkl' % options.dataset),
        spects)
    # - summarized standardized log-mel spectra
    m, s = znorm.compute_mean_std(spects[ranges['train']], axis=0)
    spects = [((spect - m) / s).astype(np.float32) for spect in spects]
    save_spectral_sums(
        os.path.join(outdir, '%s_spect_mel_log_std_sum.pkl' % options.dataset),
        spects)
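# Illustrative sketch only: save_spectral_sums() is not shown in this section;
# judging from its name and the "summarized magnitudes" comment above, it is
# assumed to reduce each spectrogram over time (one summed magnitude per
# frequency or mel bin) and pickle the resulting list next to the ground truth.
import io
import pickle
import numpy as np

def sketch_save_spectral_sums(outfile, spects):
    # sum magnitudes over the frame axis, one vector per file
    sums = [np.asarray(spect).sum(axis=0) for spect in spects]
    with io.open(outfile, 'wb') as f:
        pickle.dump(sums, f, protocol=-1)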
def main():
    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile
    if options.load_spectra != 'memory' and not options.cache_spectra:
        parser.error('option --load-spectra=%s requires --cache-spectra' %
                     options.load_spectra)

    # read configuration files and immediate settings
    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))

    # read some settings into local variables
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']

    bin_nyquist = frame_len // 2 + 1
    if cfg['filterbank'] == 'mel_learn':
        bin_mel_max = bin_nyquist
    else:
        bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists',
                              cfg.get('filelist.train', 'train'))) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    if options.validate:
        with io.open(os.path.join(datadir, 'filelists',
                                  cfg.get('filelist.valid', 'valid'))) as f:
            filelist_val = [l.rstrip() for l in f if l.rstrip()]
        filelist.extend(filelist_val)
    else:
        filelist_val = []

    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        cache_fn = (options.cache_spectra and
                    os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(cached(cache_fn, audio.extract_spect,
                             os.path.join(datadir, 'audio', fn),
                             sample_rate, frame_len, fps,
                             loading_mode=options.load_spectra))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        labels.append(create_aligned_targets(segments, timestamps, np.bool))

    # - split off validation data, if needed
    if options.validate:
        spects_val = spects[-len(filelist_val):]
        spects = spects[:-len(filelist_val)]
        labels_val = labels[-len(filelist_val):]
        labels = labels[:-len(filelist_val)]

    # - prepare training data generator
    print("Preparing training data feed...")
    if not options.augment:
        # Without augmentation, we just create a generator that returns
        # mini-batches of random excerpts
        batches = augment.grab_random_excerpts(spects, labels, batchsize,
                                               blocklen, bin_mel_max)
        batches = augment.generate_in_background([batches], num_cached=15)
    else:
        # For time stretching and pitch shifting, it pays off to preapply the
        # spline filter to each input spectrogram, so it does not need to be
        # applied to each mini-batch later.
        spline_order = cfg['spline_order']
        if spline_order > 1 and options.load_spectra == 'memory':
            from scipy.ndimage import spline_filter
            spects = [spline_filter(spect, spline_order).astype(floatX)
                      for spect in spects]
            prefiltered = True
        else:
            prefiltered = False

        # We define a function to create the mini-batch generator. This allows
        # us to easily create multiple generators for multithreading if needed.
        def create_datafeed(spects, labels):
            # With augmentation, as we want to apply random time-stretching,
            # we request longer excerpts than we finally need to return.
            max_stretch = cfg['max_stretch']
            batches = augment.grab_random_excerpts(
                spects, labels, batchsize=batchsize,
                frames=int(blocklen / (1 - max_stretch)))

            # We wrap the generator in another one that applies random time
            # stretching and pitch shifting, keeping a given number of frames
            # and bins only.
            max_shift = cfg['max_shift']
            batches = augment.apply_random_stretch_shift(
                batches, max_stretch, max_shift, keep_frames=blocklen,
                keep_bins=bin_mel_max, order=spline_order,
                prefiltered=prefiltered)

            # We apply random frequency filters
            max_db = cfg['max_db']
            batches = augment.apply_random_filters(batches, mel_max, max_db)

            # We apply random loudness changes
            max_loudness = cfg['max_loudness']
            if max_loudness:
                batches = augment.apply_random_loudness(batches, max_loudness)
            return batches

        # We start the mini-batch generator and augmenter in one or more
        # background threads or processes (unless disabled).
        bg_threads = cfg['bg_threads']
        bg_processes = cfg['bg_processes']
        if not bg_threads and not bg_processes:
            # no background processing: just create a single generator
            batches = create_datafeed(spects, labels)
        elif bg_threads:
            # multithreading: create a separate generator per thread
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels) for _ in range(bg_threads)],
                num_cached=bg_threads * 5)
        elif bg_processes:
            # multiprocessing: single generator is forked along with processes
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels)] * bg_processes,
                num_cached=bg_processes * 25, in_processes=True)

    print("Preparing training function...")
    # instantiate neural network
    input_var = T.tensor3('input')
    inputs = input_var.dimshuffle(0, 'x', 1, 2)  # insert "channels" dimension
    network = model.architecture(inputs, (None, 1, blocklen, bin_mel_max), cfg)
    print("- %d layers (%d with weights), %f mio params" %
          (len(lasagne.layers.get_all_layers(network)),
           sum(hasattr(l, 'W')
               for l in lasagne.layers.get_all_layers(network)),
           lasagne.layers.count_params(network, trainable=True) / 1e6))
    print("- weight shapes: %r" % [
            l.W.get_value().shape
            for l in lasagne.layers.get_all_layers(network)
            if hasattr(l, 'W') and hasattr(l.W, 'get_value')])

    # create cost expression
    target_var = T.vector('targets')
    targets = (0.02 + 0.96 * target_var)  # map 0 -> 0.02, 1 -> 0.98
    targets = targets.dimshuffle(0, 'x')  # turn into column vector
    outputs = lasagne.layers.get_output(network, deterministic=False)
    cost = T.mean(lasagne.objectives.binary_crossentropy(outputs, targets))
    if cfg.get('l2_decay', 0):
        cost_l2 = lasagne.regularization.regularize_network_params(
                network, lasagne.regularization.l2) * cfg['l2_decay']
    else:
        cost_l2 = 0

    # prepare and compile training function
    params = lasagne.layers.get_all_params(network, trainable=True)
    initial_eta = cfg['initial_eta']
    eta_decay = cfg['eta_decay']
    eta_decay_every = cfg.get('eta_decay_every', 1)
    patience = cfg.get('patience', 0)
    trials_of_patience = cfg.get('trials_of_patience', 1)
    patience_criterion = cfg.get(
            'patience_criterion',
            'valid_loss' if options.validate else 'train_loss')
    momentum = cfg['momentum']
    first_params = params[:cfg['first_params']]
    first_params_eta_scale = cfg['first_params_eta_scale']
    if cfg['learn_scheme'] == 'nesterov':
        learn_scheme = lasagne.updates.nesterov_momentum
    elif cfg['learn_scheme'] == 'momentum':
        learn_scheme = lasagne.updates.momentum
    elif cfg['learn_scheme'] == 'adam':
        learn_scheme = lasagne.updates.adam
    else:
        raise ValueError('Unknown learn_scheme=%s' % cfg['learn_scheme'])
    eta = theano.shared(lasagne.utils.floatX(initial_eta))
    if not first_params or first_params_eta_scale == 1:
        updates = learn_scheme(cost + cost_l2, params, eta, momentum)
    else:
        grads = theano.grad(cost + cost_l2, params)
        updates = learn_scheme(grads[len(first_params):],
                               params[len(first_params):], eta, momentum)
        if first_params_eta_scale > 0:
            updates.update(learn_scheme(grads[:len(first_params)],
                                        first_params,
                                        eta * first_params_eta_scale,
                                        momentum))
    print("Compiling training function...")
    train_fn = theano.function([input_var, target_var], cost, updates=updates)

    # prepare and compile validation function, if requested
    if options.validate:
        print("Compiling validation function...")
        import model_to_fcn
        network_test = model_to_fcn.model_to_fcn(network, allow_unlink=False)
        outputs_test = lasagne.layers.get_output(network_test,
                                                 deterministic=True)
        cost_test = T.mean(
                lasagne.objectives.binary_crossentropy(outputs_test, targets))
        val_fn = theano.function([input_var, target_var],
                                 [cost_test, outputs_test])

    # run training loop
    print("Training:")
    epochs = cfg['epochs']
    epochsize = cfg['epochsize']
    batches = iter(batches)
    if options.save_errors:
        errors = []
    if first_params and cfg['first_params_log']:
        first_params_hist = []
    if patience > 0:
        best_error = np.inf
        best_state = get_state(network, updates)
    for epoch in range(epochs):
        # actual training
        err = 0
        for batch in progress(
                range(epochsize), min_delay=.5,
                desc='Epoch %d/%d: Batch ' % (epoch + 1, epochs)):
            err += train_fn(*next(batches))
            if not np.isfinite(err):
                print("\nEncountered NaN loss in training. Aborting.")
                sys.exit(1)
            if first_params and cfg['first_params_log'] and (
                    batch % cfg['first_params_log'] == 0):
                first_params_hist.append(tuple(param.get_value()
                                               for param in first_params))
                np.savez(modelfile[:-4] + '.hist.npz',
                         **{'param%d' % i: param
                            for i, param in enumerate(
                                zip(*first_params_hist))})

        # report training loss
        print("Train loss: %.3f" % (err / epochsize))
        if options.save_errors:
            errors.append(err / epochsize)

        # compute and report validation loss, if requested
        if options.validate:
            val_err = 0
            preds = []
            max_len = int(fps * cfg.get('val.max_len', 30))
            for spect, label in zip(spects_val, labels_val):
                # pick excerpt of val.max_len seconds in center of file
                excerpt = slice(max(0, (len(spect) - max_len) // 2),
                                (len(spect) + max_len) // 2)
                # crop to maximum length and required spectral bins
                spect = spect[None, excerpt, :bin_mel_max]
                # crop to maximum length and remove edges lost in the network
                label = label[excerpt][blocklen // 2:-(blocklen // 2)]
                e, pred = val_fn(spect, label)
                val_err += e
                preds.append((pred[:, 0], label))
            print("Validation loss: %.3f" % (val_err / len(filelist_val)))
            from eval import evaluate
            _, results = evaluate(*zip(*preds))
            print("Validation error: %.3f" % (1 - results['accuracy']))
            if options.save_errors:
                errors.append(val_err / len(filelist_val))
                errors.append(1 - results['accuracy'])

        # update learning rate and/or apply early stopping, if needed
        if patience > 0:
            if patience_criterion == 'train_loss':
                cur_error = err / epochsize
            elif patience_criterion == 'valid_loss':
                cur_error = val_err / len(filelist_val)
            elif patience_criterion == 'valid_error':
                cur_error = 1 - results['accuracy']
            if cur_error <= best_error:
                best_error = cur_error
                best_state = get_state(network, updates)
                patience = cfg['patience']
            else:
                patience -= 1
                if patience == 0:
                    if eta_decay_every == 'trial_of_patience' and \
                            eta_decay != 1:
                        eta.set_value(eta.get_value() *
                                      lasagne.utils.floatX(eta_decay))
                    restore_state(network, updates, best_state)
                    patience = cfg['patience']
                    trials_of_patience -= 1
                    print("Lost patience (%d remaining trials)." %
                          trials_of_patience)
                    if trials_of_patience == 0:
                        break
        if eta_decay_every != 'trial_of_patience' and eta_decay != 1 and \
                (epoch + 1) % eta_decay_every == 0:
            eta.set_value(eta.get_value() * lasagne.utils.floatX(eta_decay))

    # save final network
    print("Saving final model")
    np.savez(modelfile, **{'param%d' % i: p for i, p in enumerate(
            lasagne.layers.get_all_param_values(network))})
    with io.open(modelfile + '.vars', 'wb') as f:
        f.writelines('%s=%s\n' % kv for kv in cfg.items())
    if options.save_errors:
        np.savez(modelfile[:-len('.npz')] + '.err.npz',
                 np.asarray(errors).reshape(epoch + 1, -1))
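# Illustrative sketch only: get_state()/restore_state() used for early
# stopping above are not shown in this section. They are assumed to snapshot
# and restore the shared variables touched by training (parameters plus
# momenta or other optimizer state). Minimal hypothetical stand-ins:
def sketch_get_state(network, updates):
    # the updates OrderedDict maps each updated shared variable (parameters,
    # momenta, ...) to its update expression; copying the keys' values covers
    # the full training state (the network argument could additionally cover
    # non-trainable parameters)
    return [var.get_value() for var in updates.keys()]

def sketch_restore_state(network, updates, state):
    for var, value in zip(updates.keys(), state):
        var.set_value(value)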
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile

    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))

    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'train')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    if options.validate:
        with io.open(os.path.join(datadir, 'filelists', 'valid')) as f:
            filelist_val = [l.strip() for l in f if l.strip()]
        filelist.extend(filelist_val)
    else:
        filelist_val = []

    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        cache_fn = (options.cache_spectra and
                    os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(cached(cache_fn, audio.extract_spect,
                             os.path.join(datadir, 'audio', fn),
                             sample_rate, frame_len, fps))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        labels.append(create_aligned_targets(segments, timestamps, np.bool))

    # - prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len, mel_bands,
                                             mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    # - split off validation data, if needed
    if options.validate:
        spects_val = spects[-len(filelist_val):]
        spects = spects[:-len(filelist_val)]
        labels_val = labels[-len(filelist_val):]
        labels = labels[:-len(filelist_val)]

    # - precompute mel spectra, if needed, otherwise just define a generator
    mel_spects = (np.log(np.maximum(np.dot(spect[:, :bin_mel_max],
                                           filterbank), 1e-7))
                  for spect in spects)
    if not options.augment:
        mel_spects = list(mel_spects)
        del spects

    # - load mean/std or compute it, if not computed yet
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    try:
        with np.load(meanstd_file) as f:
            mean = f['mean']
            std = f['std']
    except (IOError, KeyError):
        print("Computing mean and standard deviation...")
        mean, std = znorm.compute_mean_std(mel_spects)
        np.savez(meanstd_file, mean=mean, std=std)
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    # - prepare training data generator
    print("Preparing training data feed...")
    if not options.augment:
        # Without augmentation, we just precompute the normalized mel spectra
        # and create a generator that returns mini-batches of random excerpts
        mel_spects = [(spect - mean) * istd for spect in mel_spects]
        batches = augment.grab_random_excerpts(mel_spects, labels, batchsize,
                                               blocklen)
    else:
        # For time stretching and pitch shifting, it pays off to preapply the
        # spline filter to each input spectrogram, so it does not need to be
        # applied to each mini-batch later.
        spline_order = cfg['spline_order']
        if spline_order > 1:
            from scipy.ndimage import spline_filter
            spects = [spline_filter(spect, spline_order).astype(floatX)
                      for spect in spects]

        # We define a function to create the mini-batch generator. This allows
        # us to easily create multiple generators for multithreading if needed.
        def create_datafeed(spects, labels):
            # With augmentation, as we want to apply random time-stretching,
            # we request longer excerpts than we finally need to return.
            max_stretch = cfg['max_stretch']
            batches = augment.grab_random_excerpts(
                spects, labels, batchsize=batchsize,
                frames=int(blocklen / (1 - max_stretch)))

            # We wrap the generator in another one that applies random time
            # stretching and pitch shifting, keeping a given number of frames
            # and bins only.
            max_shift = cfg['max_shift']
            batches = augment.apply_random_stretch_shift(
                batches, max_stretch, max_shift, keep_frames=blocklen,
                keep_bins=bin_mel_max, order=spline_order, prefiltered=True)

            # We transform the excerpts to mel frequency and log magnitude.
            batches = augment.apply_filterbank(batches, filterbank)
            batches = augment.apply_logarithm(batches)

            # We apply random frequency filters
            max_db = cfg['max_db']
            batches = augment.apply_random_filters(batches, filterbank,
                                                   mel_max, max_db=max_db)

            # We apply normalization
            batches = augment.apply_znorm(batches, mean, istd)
            return batches

        # We start the mini-batch generator and augmenter in one or more
        # background threads or processes (unless disabled).
        bg_threads = cfg['bg_threads']
        bg_processes = cfg['bg_processes']
        if not bg_threads and not bg_processes:
            # no background processing: just create a single generator
            batches = create_datafeed(spects, labels)
        elif bg_threads:
            # multithreading: create a separate generator per thread
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels) for _ in range(bg_threads)],
                num_cached=bg_threads * 5)
        elif bg_processes:
            # multiprocessing: single generator is forked along with processes
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels)] * bg_processes,
                num_cached=bg_processes * 25, in_processes=True)

    ###########################################################################
    # --------- Main changes to the code to make it work with PyTorch ------- #
    ###########################################################################
    print("Preparing training function...")
    mdl = model.CNNModel()
    mdl = mdl.to(device)

    # set up learning rate and learning rate schedule parameters
    initial_eta = cfg['initial_eta']
    eta_decay = cfg['eta_decay']
    momentum = cfg['momentum']
    eta_decay_every = cfg.get('eta_decay_every', 1)
    eta = initial_eta

    # set up loss
    criterion = torch.nn.BCELoss()

    # set up optimizer and learning rate scheduler
    optimizer = torch.optim.SGD(mdl.parameters(), lr=eta, momentum=momentum,
                                nesterov=True)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=eta_decay_every,
                                                gamma=eta_decay)

    # set up TensorBoard logging
    writer = SummaryWriter(os.path.join(modelfile, 'runs'))

    epochs = cfg['epochs']
    epochsize = cfg['epochsize']
    batches = iter(batches)

    # conditions to save the model
    best_val_loss = 100000.
    best_val_error = 1.

    for epoch in range(epochs):
        # - initialize certain parameters that are used to monitor training
        err = 0
        total_norm = 0
        loss_accum = 0
        mdl.train(True)

        # - compute the L2 norm of the gradients (from the previous update)
        for p in mdl.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item() ** 2
        total_norm = total_norm ** (1. / 2)

        # - start the training for this epoch
        for batch in progress(
                range(epochsize), min_delay=0.5,
                desc='Epoch %d/%d: Batch ' % (epoch + 1, epochs)):
            data = next(batches)
            input_data = np.transpose(data[0][:, :, :, np.newaxis],
                                      (0, 3, 1, 2))
            labels = data[1][:, np.newaxis].astype(np.float32)
            # map labels to make them softer: 0 -> 0.02, 1 -> 0.98
            labels = (0.02 + 0.96 * labels)
            optimizer.zero_grad()
            outputs = mdl(torch.from_numpy(input_data).to(device))
            loss = criterion(outputs, torch.from_numpy(labels).to(device))
            loss.backward()
            optimizer.step()
            loss_accum += loss.item()

        # - compute validation loss and error if desired
        if options.validate:
            from eval import evaluate
            mdl.train(False)
            val_loss = 0
            preds = []
            labs = []
            max_len = fps
            mel_spects_val = (np.log(np.maximum(
                np.dot(spect[:, :bin_mel_max], filterbank), 1e-7))
                for spect in spects_val)
            mel_spects_val = [(spect - mean) * istd
                              for spect in mel_spects_val]
            num_iter = 0
            for spect, label in zip(mel_spects_val, labels_val):
                num_excerpts = len(spect) - blocklen + 1
                excerpts = np.lib.stride_tricks.as_strided(
                    spect, shape=(num_excerpts, blocklen, spect.shape[1]),
                    strides=(spect.strides[0], spect.strides[0],
                             spect.strides[1]))
                # - pass mini-batches through the network and concatenate
                #   the results
                for pos in range(0, num_excerpts, batchsize):
                    input_data = np.transpose(
                        excerpts[pos:pos + batchsize, :, :, np.newaxis],
                        (0, 3, 1, 2))
                    if pos + batchsize > num_excerpts:
                        label_batch = label[
                            blocklen // 2 + pos:blocklen // 2 + num_excerpts,
                            np.newaxis].astype(np.float32)
                    else:
                        label_batch = label[
                            blocklen // 2 + pos:
                            blocklen // 2 + pos + batchsize,
                            np.newaxis].astype(np.float32)
                    pred = mdl(torch.from_numpy(input_data).to(device))
                    e = criterion(pred,
                                  torch.from_numpy(label_batch).to(device))
                    preds = np.append(preds,
                                      pred[:, 0].cpu().detach().numpy())
                    labs = np.append(labs, label_batch)
                    val_loss += e.item()
                    num_iter += 1
            print("Validation loss: %.3f" % (val_loss / num_iter))
            _, results = evaluate(preds, labs)
            print("Validation error: %.3f" % (1 - results['accuracy']))

            if (val_loss / num_iter < best_val_loss
                    and (1 - results['accuracy']) < best_val_error):
                torch.save(mdl.state_dict(),
                           os.path.join(modelfile, 'model.pth'))
                best_val_loss = val_loss / num_iter
                best_val_error = 1 - results['accuracy']
                print('New saved model', best_val_loss, best_val_error)

        # update the learning rate
        scheduler.step()
        print('Training Loss per epoch', loss_accum / epochsize)

        # - save parameters for examining
        writer.add_scalar('Training Loss', loss_accum / epochsize, epoch)
        writer.add_scalar('Gradient norm', total_norm, epoch)
        if options.validate:
            # only defined when validation ran this epoch
            writer.add_scalar('Validation loss', val_loss / num_iter, epoch)
            writer.add_scalar('Validation error',
                              1 - results['accuracy'], epoch)
        for param_group in optimizer.param_groups:
            print(param_group['lr'])

    if not options.validate:
        torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))
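# Small self-contained check of the sliding-window trick used for validation
# above: as_strided turns a (frames, bins) spectrogram into overlapping
# (num_excerpts, blocklen, bins) views without copying (illustration only;
# toy shapes, not the repo's data).
import numpy as np

_spect = np.arange(20, dtype=np.float32).reshape(10, 2)   # 10 frames, 2 bins
_blocklen = 5
_num_excerpts = len(_spect) - _blocklen + 1
_excerpts = np.lib.stride_tricks.as_strided(
    _spect, shape=(_num_excerpts, _blocklen, _spect.shape[1]),
    strides=(_spect.strides[0], _spect.strides[0], _spect.strides[1]))
assert _excerpts.shape == (6, 5, 2)
assert np.array_equal(_excerpts[3], _spect[3:8])  # window starting at frame 3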