def __init__(self, sample_rate, frame_len, mel_bands, mel_min, mel_max,
             bin_mel_max, device):
    """Create the layer and store a truncated mel filterbank on ``device``.

    The filterbank is obtained from ``audio.create_mel_filterbank`` using
    ``sample_rate``, ``frame_len``, ``mel_bands``, ``mel_min`` and
    ``mel_max``. Only its first ``bin_mel_max`` rows are kept; they are
    converted to float32 and moved to ``device``.
    """
    super(LogMelBankLayer, self).__init__()
    self.device = device
    self.bin_mel_max = bin_mel_max

    full_bank = audio.create_mel_filterbank(sample_rate, frame_len,
                                            mel_bands, mel_min, mel_max)
    # drop the rows at and above `bin_mel_max` before moving to the device
    truncated_bank = full_bank[:bin_mel_max].astype(np.float32)
    self.filterbank = torch.Tensor(truncated_bank).to(device)
def prepare_batches(self, sample_rate, frame_len, fps, mel_bands, mel_min,
                    mel_max, blocklen, batchsize, batch_data=True):
    """Load spectra and labels and build the data feed for ``self.input_type``.

    Spectra are read (or computed and cached) for every entry of
    ``self.filelist``; labels come from the matching ``.lab`` files below
    ``self.datadir``.

    Args:
        sample_rate, frame_len, fps: forwarded to ``audio.extract_spect``.
        mel_bands, mel_min, mel_max: forwarded to
            ``audio.create_mel_filterbank``; ``mel_bands`` is also the width
            of the padding used when ``batch_data`` is false.
        blocklen: excerpt length in frames.
        batchsize: number of excerpts per mini-batch.
        batch_data: when true, return a mini-batch generator; when false
            (and ``self.augment`` is not set), return whole spectrograms.

    Returns:
        * ``input_type == 'stft'``: the result of
          ``augment.grab_random_excerpts`` or ``(spects, labels)``.
        * ``'mel_spects'`` / ``'mel_spects_norm'`` without augmentation: the
          result of ``augment.grab_random_excerpts`` or
          ``(mel_spects, labels)``, where every spectrogram is padded by
          ``blocklen // 2`` frames of ``log(1e-7)`` on both ends.
        * with ``self.augment``: the augmenting batch generator (the
          ``batch_data`` flag is not consulted on this path).

    NOTE(review): the augmentation path reads a name ``cfg`` that is not
    defined in this method; it has to come from an enclosing scope.
    """
    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # - compute or load the magnitude spectra
    spects = []
    for fn in progress(self.filelist, 'File'):
        cache_fn = (self.featuredir and
                    os.path.join(self.featuredir, fn + '.npy'))
        spects.append(
            cached(cache_fn, audio.extract_spect,
                   os.path.join(self.datadir, 'audio', fn),
                   sample_rate, frame_len, fps))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(self.filelist, spects):
        fn = os.path.join(self.datadir, 'labels',
                          fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        # the builtin `bool` replaces the `np.bool` alias, which NumPy removed
        labels.append(create_aligned_targets(segments, timestamps, bool))

    if self.input_type == 'stft':
        print('Create dataset with stft output')
        if batch_data:
            batches = augment.grab_random_excerpts(spects, labels,
                                                   batchsize, blocklen)
            return batches
        else:
            return spects, labels

    if (self.input_type == 'mel_spects' or
            self.input_type == 'mel_spects_norm'):
        # - prepare mel filterbank
        filterbank = audio.create_mel_filterbank(sample_rate, frame_len,
                                                 mel_bands, mel_min, mel_max)
        filterbank = filterbank[:bin_mel_max].astype(floatX)

        # - precompute mel spectra, if needed, otherwise just define a
        #   generator
        mel_spects = (np.log(np.maximum(
            np.dot(spect[:, :bin_mel_max], filterbank), 1e-7))
            for spect in spects)
        if not self.augment:
            mel_spects = list(mel_spects)
            del spects

        # - load mean/std or compute it, if not computed yet
        meanstd_file = os.path.join(os.path.dirname(__file__),
                                    '%s_meanstd.npz' % self.dataset)
        try:
            with np.load(meanstd_file) as f:
                mean = f['mean']
                std = f['std']
        except (IOError, KeyError):
            print("Computing mean and standard deviation...")
            mean, std = znorm.compute_mean_std(mel_spects)
            np.savez(meanstd_file, mean=mean, std=std)
        mean = mean.astype(floatX)
        istd = np.reciprocal(std).astype(floatX)

        # - prepare training data generator
        print("Preparing training data feed...")
        if not self.augment:
            # Without augmentation, the mel spectra are precomputed and
            # either returned whole or sampled as random excerpts.
            if self.input_type == 'mel_spects':
                print('Creating batches of mel spects without znorm')
                if batch_data:
                    batches = augment.grab_random_excerpts(
                        mel_spects, labels, batchsize, blocklen)
                else:
                    # pad width follows `mel_bands` instead of a fixed 80
                    pad = np.tile(np.log(1e-7).astype(floatX),
                                  (blocklen // 2, mel_bands))
                    mel_spects = [np.concatenate((pad, spect, pad), axis=0)
                                  for spect in mel_spects]
                    return mel_spects, labels
            elif self.input_type == 'mel_spects_norm':
                print('Creating batches of mel spects with znorm')
                if batch_data:
                    mel_spects = [(spect - mean) * istd
                                  for spect in mel_spects]
                    batches = augment.grab_random_excerpts(
                        mel_spects, labels, batchsize, blocklen)
                else:
                    # pad first, then normalise, so the padding is
                    # z-scored together with the data
                    pad = np.tile(np.log(1e-7).astype(floatX),
                                  (blocklen // 2, mel_bands))
                    mel_spects = [
                        (np.concatenate((pad, spect, pad), axis=0) - mean)
                        * istd
                        for spect in mel_spects]
                    return mel_spects, labels
            else:
                # NOTE(review): with the nesting used here this branch cannot
                # be reached (and `spects` was deleted above) -- confirm
                # against the original layout before removing it.
                print('Creating batches of stfts')
                batches = augment.grab_random_excerpts(spects, labels,
                                                       batchsize, blocklen)
        else:
            # For time stretching and pitch shifting, it pays off to preapply
            # the spline filter to each input spectrogram, so it does not
            # need to be applied to each mini-batch later.
            spline_order = cfg['spline_order']
            if spline_order > 1:
                from scipy.ndimage import spline_filter
                spects = [
                    spline_filter(spect, spline_order).astype(floatX)
                    for spect in spects
                ]

            # A factory for the mini-batch generator, so that several
            # independent generators can be created for multithreading.
            def create_datafeed(spects, labels):
                # With augmentation, as we want to apply random
                # time-stretching, we request longer excerpts than we
                # finally need to return.
                max_stretch = cfg['max_stretch']
                batches = augment.grab_random_excerpts(
                    spects, labels, batchsize=batchsize,
                    frames=int(blocklen / (1 - max_stretch)))

                # Random time stretching and pitch shifting, keeping a given
                # number of frames and bins only.
                max_shift = cfg['max_shift']
                batches = augment.apply_random_stretch_shift(
                    batches, max_stretch, max_shift,
                    keep_frames=blocklen, keep_bins=bin_mel_max,
                    order=spline_order, prefiltered=True)

                # Transform the excerpts to mel frequency and log magnitude.
                batches = augment.apply_filterbank(batches, filterbank)
                batches = augment.apply_logarithm(batches)

                # Apply random frequency filters.
                max_db = cfg['max_db']
                batches = augment.apply_random_filters(
                    batches, filterbank, mel_max, max_db=max_db)

                # Apply normalization.
                batches = augment.apply_znorm(batches, mean, istd)
                return batches

            # Start the mini-batch generator and augmenter in one or more
            # background threads or processes (unless disabled).
            bg_threads = cfg['bg_threads']
            bg_processes = cfg['bg_processes']
            if not bg_threads and not bg_processes:
                # no background processing: just create a single generator
                batches = create_datafeed(spects, labels)
            elif bg_threads:
                # multithreading: create a separate generator per thread
                batches = augment.generate_in_background(
                    [create_datafeed(spects, labels)
                     for _ in range(bg_threads)],
                    num_cached=bg_threads * 5)
            elif bg_processes:
                # multiprocessing: single generator is forked along with
                # processes
                batches = augment.generate_in_background(
                    [create_datafeed(spects, labels)] * bg_processes,
                    num_cached=bg_processes * 25,
                    in_processes=True)

    return batches
def main():
    """Train the network on the 'train' file list and save its weights.

    Command-line options come from ``opts_parser()``; the fields read here
    are ``modelfile``, ``dataset``, ``cache_spectra`` and ``augment``. All
    signal-processing and optimisation settings are fixed in the function
    body. The final parameter values are written to ``modelfile`` with
    ``np.savez`` under the keys ``param0``, ``param1``, ...
    """
    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile

    sample_rate = 22050
    frame_len = 1024
    fps = 70
    mel_bands = 80
    mel_min = 27.5
    mel_max = 8000
    blocklen = 115
    batchsize = 32

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__),
                           os.path.pardir, 'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'train')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]

    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        cache_fn = (options.cache_spectra and
                    os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(cached(cache_fn,
                             audio.extract_spect,
                             os.path.join(datadir, 'audio', fn),
                             sample_rate, frame_len, fps))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        # the builtin `bool` replaces the `np.bool` alias, which NumPy removed
        labels.append(create_aligned_targets(segments, timestamps, bool))

    # - prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len,
                                             mel_bands, mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    # - precompute mel spectra, if needed, otherwise just define a generator
    mel_spects = (np.log(np.maximum(np.dot(spect[:, :bin_mel_max],
                                           filterbank), 1e-7))
                  for spect in spects)
    if not options.augment:
        mel_spects = list(mel_spects)
        del spects

    # - load mean/std or compute it, if not computed yet
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    try:
        with np.load(meanstd_file) as f:
            mean = f['mean']
            std = f['std']
    except (IOError, KeyError):
        print("Computing mean and standard deviation...")
        mean, std = znorm.compute_mean_std(mel_spects)
        np.savez(meanstd_file, mean=mean, std=std)
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    # - prepare training data generator
    print("Preparing training data feed...")
    if not options.augment:
        # Without augmentation, we just precompute the normalized mel spectra
        # and create a generator that returns mini-batches of random excerpts
        mel_spects = [(spect - mean) * istd for spect in mel_spects]
        batches = augment.grab_random_excerpts(
            mel_spects, labels, batchsize, blocklen)
    else:
        # For time stretching and pitch shifting, it pays off to preapply the
        # spline filter to each input spectrogram, so it does not need to be
        # applied to each mini-batch later.
        spline_order = 2
        if spline_order > 1:
            from scipy.ndimage import spline_filter
            spects = [spline_filter(spect, spline_order).astype(floatX)
                      for spect in spects]

        # We define a function to create the mini-batch generator. This allows
        # us to easily create multiple generators for multithreading if needed.
        def create_datafeed(spects, labels):
            # With augmentation, as we want to apply random time-stretching,
            # we request longer excerpts than we finally need to return.
            max_stretch = .3
            batches = augment.grab_random_excerpts(
                spects, labels, batchsize=batchsize,
                frames=int(blocklen / (1 - max_stretch)))

            # We wrap the generator in another one that applies random time
            # stretching and pitch shifting, keeping a given number of frames
            # and bins only.
            max_shift = .3
            batches = augment.apply_random_stretch_shift(
                batches, max_stretch, max_shift,
                keep_frames=blocklen, keep_bins=bin_mel_max,
                order=spline_order, prefiltered=True)

            # We transform the excerpts to mel frequency and log magnitude.
            batches = augment.apply_filterbank(batches, filterbank)
            batches = augment.apply_logarithm(batches)

            # We apply random frequency filters
            batches = augment.apply_random_filters(batches, filterbank,
                                                   mel_max, max_db=10)

            # We apply normalization
            batches = augment.apply_znorm(batches, mean, istd)

            return batches

        # We start the mini-batch generator and augmenter in one or more
        # background threads or processes (unless disabled).
        bg_threads = 3
        bg_processes = 0
        if not bg_threads and not bg_processes:
            # no background processing: just create a single generator
            batches = create_datafeed(spects, labels)
        elif bg_threads:
            # multithreading: create a separate generator per thread
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels)
                 for _ in range(bg_threads)],
                num_cached=bg_threads * 5)
        elif bg_processes:
            # multiprocessing: single generator is forked along with processes
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels)] * bg_processes,
                num_cached=bg_processes * 25,
                in_processes=True)

    print("Preparing training function...")
    # instantiate neural network
    input_var = T.tensor3('input')
    inputs = input_var.dimshuffle(0, 'x', 1, 2)  # insert "channels" dimension
    network = model.architecture(inputs, (None, 1, blocklen, mel_bands))

    # create cost expression
    target_var = T.vector('targets')
    targets = (0.02 + 0.96 * target_var)  # map 0 -> 0.02, 1 -> 0.98
    targets = targets.dimshuffle(0, 'x')  # turn into column vector
    outputs = lasagne.layers.get_output(network, deterministic=False)
    cost = T.mean(lasagne.objectives.binary_crossentropy(outputs, targets))

    # prepare and compile training function
    params = lasagne.layers.get_all_params(network, trainable=True)
    initial_eta = 0.01
    eta_decay = 0.85
    momentum = 0.95
    eta = theano.shared(lasagne.utils.floatX(initial_eta))
    updates = lasagne.updates.nesterov_momentum(cost, params, eta, momentum)
    print("Compiling training function...")
    train_fn = theano.function([input_var, target_var], cost, updates=updates)

    # run training loop
    print("Training:")
    epochs = 20
    epochsize = 2000
    batches = iter(batches)
    for epoch in range(epochs):
        err = 0
        for _ in progress(
                range(epochsize), min_delay=.5,
                desc='Epoch %d/%d: Batch ' % (epoch + 1, epochs)):
            err += train_fn(*next(batches))
            # stop as soon as the accumulated loss becomes NaN or infinite
            if not np.isfinite(err):
                print("\nEncountered NaN loss in training. Aborting.")
                sys.exit(1)
        print("Train loss: %.3f" % (err / epochsize))
        # decay the learning rate once per epoch
        eta.set_value(eta.get_value() * lasagne.utils.floatX(eta_decay))

    # save final network
    print("Saving final model")
    np.savez(modelfile, **{'param%d' % i: p for i, p in enumerate(
        lasagne.layers.get_all_param_values(network))})
def architecture(input_var, input_shape, cfg):
    """Assemble the network described by ``cfg`` and return its last layer.

    Stages are stacked in this order: input layer, optional filterbank,
    optional magnitude scaling, optional temporal difference, input
    standardisation, and finally either a two-layer dense net
    (``cfg['arch'] == 'dense:16'``) or a convolutional stack followed by
    three dense layers.

    Args:
        input_var: input variable handed to ``InputLayer``.
        input_shape: shape handed to ``InputLayer``; ``input_shape[3]`` is
            the number of filterbank rows kept for ``filterbank == 'mel'``.
        cfg: mapping of options. Keys read here: ``filterbank``,
            ``sample_rate``, ``frame_len``, ``mel_bands``, ``mel_min``,
            ``mel_max``, ``magscale``, ``pcen_fix_alpha`` (optional),
            ``blocklen``, ``arch.timediff``, ``input_norm`` (optional,
            defaults to ``'batch'``), ``arch.batch_norm``, ``arch.convdrop``,
            ``arch``, ``arch.convmore`` and ``arch.firstconv_zeromean``
            (optional).

    Returns:
        The final dense layer: one unit with sigmoid nonlinearity.

    Raises:
        ValueError: for an unknown ``filterbank``, ``magscale``,
            ``input_norm``, ``arch.convdrop`` or ``arch`` value.
    """
    net = InputLayer(input_shape, input_var)

    # ---- filterbank, if any
    fb_mode = cfg['filterbank']
    if fb_mode == 'mel':
        import audio
        mel_matrix = audio.create_mel_filterbank(
            cfg['sample_rate'], cfg['frame_len'], cfg['mel_bands'],
            cfg['mel_min'], cfg['mel_max'])
        mel_matrix = mel_matrix[:input_shape[3]].astype(theano.config.floatX)
        # fixed (constant) projection onto the mel bands, no bias
        net = DenseLayer(net, num_units=cfg['mel_bands'],
                         num_leading_axes=-1, W=T.constant(mel_matrix),
                         b=None, nonlinearity=None)
    elif fb_mode == 'mel_learn':
        net = MelBankLayer(net, cfg['sample_rate'], cfg['frame_len'],
                           cfg['mel_bands'], cfg['mel_min'], cfg['mel_max'])
    elif fb_mode != 'none':
        raise ValueError("Unknown filterbank=%s" % cfg['filterbank'])

    # ---- magnitude transformation, if any
    magscale = cfg['magscale']
    if magscale == 'log':
        net = ExpressionLayer(net, lambda x: T.log(T.maximum(1e-7, x)))
    elif magscale == 'log1p':
        net = ExpressionLayer(net, T.log1p)
    elif magscale.startswith('log1p_learn'):
        # learnable log(1 + exp(a) * x); the initial `a` is the numeric
        # suffix of the option string (0 if there is none)
        init_a = float(magscale[len('log1p_learn'):] or 0)
        scale = T.exp(theano.shared(lasagne.utils.floatX(init_a)))
        net = lasagne.layers.ScaleLayer(net, scales=scale,
                                        shared_axes=(0, 1, 2, 3))
        net = ExpressionLayer(net, T.log1p)
    elif magscale.startswith('pow_learn'):
        # learnable x^sigmoid(a); the initial `a` is the numeric suffix of
        # the option string (0 if there is none)
        init_a = float(magscale[len('pow_learn'):] or 0)
        exponent = T.nnet.sigmoid(theano.shared(lasagne.utils.floatX(init_a)))
        net = PowLayer(net, exponent=exponent)
    elif magscale == 'pcen':
        net = PCENLayer(net)
        if cfg.get('pcen_fix_alpha'):
            # exclude log_alpha from the trainable parameters
            net.params[net.log_alpha].remove("trainable")
    elif magscale == 'loudness_only':
        # cut away half a block length on the left and right
        half_block = cfg['blocklen'] // 2
        net = lasagne.layers.SliceLayer(net, slice(half_block, -half_block),
                                        axis=2)
        # average over axes 1 and 3, keeping them as singleton dimensions
        net = lasagne.layers.ExpressionLayer(
            net,
            lambda X: X.mean(axis=(1, 3), keepdims=True),
            lambda shp: (shp[0], 1, shp[2], 1))
    elif magscale != 'none':
        raise ValueError("Unknown magscale=%s" % cfg['magscale'])

    # ---- temporal difference, if any
    timediff = cfg['arch.timediff']
    if timediff:
        net = TimeDiffLayer(net, delta=timediff)

    # ---- standardization of the input
    input_norm = cfg.get('input_norm', 'batch')
    if input_norm == 'batch':
        net = batch_norm_vanilla(net, axes=(0, 2), beta=None, gamma=None)
    elif input_norm == 'instance':
        net = lasagne.layers.StandardizationLayer(net, axes=2)
    elif input_norm != 'none':
        raise ValueError("Unknown input_norm=%s" % cfg['input_norm'])

    # ---- building blocks shared by the remaining layers
    kwargs = dict(nonlinearity=lasagne.nonlinearities.leaky_rectify,
                  W=lasagne.init.Orthogonal())
    if cfg['arch.batch_norm']:
        maybe_batch_norm = batch_norm
    else:
        maybe_batch_norm = lambda x: x

    # extra keyword arguments for the 10% dropout after conv layers
    convdrop_kwargs = {
        'independent': {},
        'channels': {'shared_axes': (2, 3)},
        'bands': {'shared_axes': (1, 2)},
    }
    convdrop = cfg['arch.convdrop']
    if convdrop in convdrop_kwargs:
        drop_kwargs = convdrop_kwargs[convdrop]
        maybe_dropout = lambda x: dropout(x, 0.1, **drop_kwargs)
    elif convdrop == 'none':
        maybe_dropout = lambda x: x
    else:
        raise ValueError("Unknown arch.convdrop=%s" % cfg['arch.convdrop'])

    def sigmoid_output(incoming):
        # single-unit sigmoid classifier used by every architecture
        return DenseLayer(incoming, 1,
                          nonlinearity=lasagne.nonlinearities.sigmoid,
                          W=lasagne.init.Orthogonal())

    # ---- small dense-only variant
    if cfg['arch'] == 'dense:16':
        net = DenseLayer(net, 16, **kwargs)
        return sigmoid_output(net)

    # ---- convolutional neural network
    convmore = cfg['arch.convmore']

    def conv(incoming, base_filters, filter_size=3):
        # filter counts are scaled by `arch.convmore`
        return Conv2DLayer(incoming, int(base_filters * convmore),
                           filter_size, **kwargs)

    net = conv(net, 64)
    if cfg.get('arch.firstconv_zeromean', False) == 'params':
        # subtract the mean over axes 2 and 3 from the first conv's weights
        net.W = net.W - T.mean(net.W, axis=(2, 3), keepdims=True)
    net = maybe_batch_norm(net)
    net = maybe_dropout(net)

    net = conv(net, 32)
    net = maybe_batch_norm(net)
    net = MaxPool2DLayer(net, 3)
    net = maybe_dropout(net)

    net = conv(net, 128)
    net = maybe_batch_norm(net)
    net = maybe_dropout(net)

    net = conv(net, 64)
    net = maybe_batch_norm(net)

    arch = cfg['arch']
    if arch == 'ismir2015':
        net = MaxPool2DLayer(net, 3)
    elif arch == 'ismir2016':
        net = maybe_dropout(net)
        # the filter's second dimension is derived from the incoming shape
        net = conv(net, 128, (3, net.output_shape[3] - 3))
        net = maybe_batch_norm(net)
        net = MaxPool2DLayer(net, (1, 4))
    else:
        raise ValueError('Unknown arch=%s' % cfg['arch'])

    # ---- dense layers, each preceded by 50% dropout
    for num_units in (256, 64):
        net = DenseLayer(dropout(net, 0.5), num_units, **kwargs)
        net = maybe_batch_norm(net)
    return sigmoid_output(dropout(net, 0.5))
def main():
    """Run a trained PyTorch ``model.CNNModel`` over the valid+test files.

    Settings are read from the config files and variable assignments given
    on the command line (``options.vars`` / ``options.var``). For every file
    the z-scored, silence-padded log-mel spectrogram is cut into overlapping
    excerpts of ``blocklen`` frames, which are passed through the model in
    mini-batches. Predictions are written to ``options.outfile`` with
    ``np.savez``, keyed by file name.
    """
    print(torch.cuda.is_available())
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile

    # later config files / assignments override earlier ones
    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))

    outfile = options.outfile
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    print("Preparing data reading...")
    datadir = os.path.join(os.path.dirname(__file__),
                           os.path.pardir, 'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'valid')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    with io.open(os.path.join(datadir, 'filelists', 'test')) as f:
        filelist += [l.rstrip() for l in f if l.rstrip()]

    # - create generator for spectra
    spects = (cached(
        options.cache_spectra and
        os.path.join(options.cache_spectra, fn + '.npy'),
        audio.extract_spect,
        os.path.join(datadir, 'audio', fn),
        sample_rate, frame_len, fps)
        for fn in filelist)

    # - pitch-shift if needed
    if options.pitchshift:
        import scipy.ndimage
        spline_order = 2
        # NOTE(review): the output has `mel_max` columns, of which only the
        # first `bin_mel_max` are used below -- confirm this is intended.
        spects = (scipy.ndimage.affine_transform(
            spect, (1, 1 / (1 + options.pitchshift / 100.)),
            output_shape=(len(spect), mel_max),
            order=spline_order)
            for spect in spects)

    # - prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len,
                                             mel_bands, mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    # - define generator for mel spectra
    spects = (np.log(np.maximum(
        np.dot(spect[:, :bin_mel_max], filterbank), 1e-7))
        for spect in spects)

    # - load mean/std
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    with np.load(meanstd_file) as f:
        mean = f['mean']
        std = f['std']
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    # - define generator for Z-scoring
    spects = ((spect - mean) * istd for spect in spects)

    # - define generator for silence-padding
    pad = np.tile((np.log(1e-7) - mean) * istd, (blocklen // 2, 1))
    spects = (np.concatenate((pad, spect, pad), axis=0) for spect in spects)

    # - we start the generator in a background thread (not required)
    spects = augment.generate_in_background([spects], num_cached=1)

    mdl = model.CNNModel()
    # map_location lets a checkpoint saved on GPU load on a CPU-only host
    mdl.load_state_dict(torch.load(modelfile, map_location=device))
    mdl.to(device)
    mdl.eval()

    # run prediction loop
    print("Predicting:")
    predictions = []
    # inference only: no autograd graph needs to be recorded
    with torch.no_grad():
        for spect in progress(spects, total=len(filelist), desc='File '):
            # naive way: pass excerpts of the size used during training
            # - view spectrogram memory as a 3-tensor of overlapping excerpts
            num_excerpts = len(spect) - blocklen + 1
            excerpts = np.lib.stride_tricks.as_strided(
                spect,
                shape=(num_excerpts, blocklen, spect.shape[1]),
                strides=(spect.strides[0], spect.strides[0],
                         spect.strides[1]))

            # - pass mini-batches through the network and concatenate
            #   results; np.vstack needs a real sequence, not a generator
            batch_preds = []
            for pos in range(0, num_excerpts, batchsize):
                # add a trailing axis, then move it to position 1
                batch = np.transpose(
                    excerpts[pos:pos + batchsize, :, :, np.newaxis],
                    (0, 3, 1, 2))
                out = mdl(torch.from_numpy(batch).to(device))
                batch_preds.append(out.cpu().detach().numpy())
            preds = np.vstack(batch_preds)
            predictions.append(preds)

    # save predictions
    print("Saving predictions")
    np.savez(outfile, **{fn: pred for fn, pred in zip(filelist, predictions)})
def main():
    """Run a trained Theano/Lasagne network over the valid+test files.

    Command-line options come from ``opts_parser()``; the fields read here
    are ``modelfile``, ``outfile``, ``dataset``, ``cache_spectra``,
    ``pitchshift``, ``mem_use`` and ``plot``. ``mem_use`` selects how a
    spectrogram is fed to the network: ``'high'`` passes it whole, ``'mid'``
    passes overlapping chunks of up to one minute, and any other value
    passes mini-batches of ``blocklen``-frame excerpts. Predictions are
    written to ``outfile`` with ``np.savez``, keyed by file name.
    """
    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile
    outfile = options.outfile

    sample_rate = 22050
    frame_len = 1024
    fps = 70
    mel_bands = 80
    mel_min = 27.5
    mel_max = 8000
    blocklen = 115
    batchsize = 32

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    print("Preparing data reading...")
    datadir = os.path.join(os.path.dirname(__file__),
                           os.path.pardir, 'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'valid')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    with io.open(os.path.join(datadir, 'filelists', 'test')) as f:
        filelist += [l.rstrip() for l in f if l.rstrip()]

    # - create generator for spectra
    spects = (cached(options.cache_spectra and
                     os.path.join(options.cache_spectra, fn + '.npy'),
                     audio.extract_spect,
                     os.path.join(datadir, 'audio', fn),
                     sample_rate, frame_len, fps)
              for fn in filelist)

    # - pitch-shift if needed
    if options.pitchshift:
        import scipy.ndimage
        spline_order = 2
        # NOTE(review): the output has `mel_max` columns, of which only the
        # first `bin_mel_max` are used below -- confirm this is intended.
        spects = (scipy.ndimage.affine_transform(
            spect, (1, 1 / (1 + options.pitchshift / 100.)),
            output_shape=(len(spect), mel_max),
            order=spline_order)
            for spect in spects)

    # - prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len,
                                             mel_bands, mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    # - define generator for mel spectra
    spects = (np.log(np.maximum(np.dot(spect[:, :bin_mel_max], filterbank),
                                1e-7))
              for spect in spects)

    # - load mean/std
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    with np.load(meanstd_file) as f:
        mean = f['mean']
        std = f['std']
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    # - define generator for Z-scoring
    spects = ((spect - mean) * istd for spect in spects)

    # - define generator for silence-padding
    pad = np.tile((np.log(1e-7) - mean) * istd, (blocklen // 2, 1))
    spects = (np.concatenate((pad, spect, pad), axis=0) for spect in spects)

    # - we start the generator in a background thread (not required)
    spects = augment.generate_in_background([spects], num_cached=1)

    print("Preparing prediction function...")
    # instantiate neural network
    input_var = T.tensor3('input')
    inputs = input_var.dimshuffle(0, 'x', 1, 2)  # insert "channels" dimension
    network = model.architecture(inputs, (None, 1, blocklen, mel_bands))

    # load saved weights
    with np.load(modelfile) as f:
        lasagne.layers.set_all_param_values(
            network, [f['param%d' % i] for i in range(len(f.files))])

    # performant way: convert to fully-convolutional network
    if options.mem_use != 'low':
        import model_to_fcn
        network = model_to_fcn.model_to_fcn(network, allow_unlink=True)

    # create output expression
    outputs = lasagne.layers.get_output(network, deterministic=True)

    # prepare and compile prediction function
    print("Compiling prediction function...")
    test_fn = theano.function([input_var], outputs)

    # run prediction loop
    print("Predicting:")
    predictions = []
    for spect in progress(spects, total=len(filelist), desc='File '):
        if options.mem_use == 'high':
            # fastest way: pass full spectrogram through network at once
            preds = test_fn(spect[np.newaxis])  # insert batch dimension
        elif options.mem_use == 'mid':
            # performant way: pass spectrogram in equal chunks of up to one
            # minute, taking care to overlap by `blocklen // 2` frames and to
            # not pass a chunk shorter than `blocklen` frames
            chunks = np.ceil(len(spect) / (fps * 60.))
            hopsize = int(np.ceil(len(spect) / chunks))
            chunksize = hopsize + blocklen - 1
            # np.vstack needs a real sequence, not a generator
            preds = np.vstack([test_fn(spect[np.newaxis, pos:pos + chunksize])
                               for pos in range(0, len(spect), hopsize)])
        else:
            # naive way: pass excerpts of the size used during training
            # - view spectrogram memory as a 3-tensor of overlapping excerpts
            num_excerpts = len(spect) - blocklen + 1
            excerpts = np.lib.stride_tricks.as_strided(
                spect, shape=(num_excerpts, blocklen, spect.shape[1]),
                strides=(spect.strides[0], spect.strides[0],
                         spect.strides[1]))
            # - pass mini-batches through the network and concatenate results
            preds = np.vstack([test_fn(excerpts[pos:pos + batchsize])
                               for pos in range(0, num_excerpts, batchsize)])
        predictions.append(preds)
        if options.plot:
            if spect.ndim == 3:
                spect = spect[0]  # remove channel axis
            # remove the padding: exactly `blocklen // 2` frames per side.
            # `-blocklen//2` would parse as `(-blocklen)//2` and cut one
            # frame too many for odd `blocklen`.
            half_block = blocklen // 2
            spect = spect[half_block:-half_block]
            import matplotlib.pyplot as plt
            fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
            ax1.imshow(spect.T[::-1], vmin=-3, cmap='hot', aspect='auto',
                       interpolation='nearest')
            ax2.plot(preds)
            ax2.set_ylim(0, 1.1)
            plt.show()

    # save predictions
    print("Saving predictions")
    np.savez(outfile, **{fn: pred for fn, pred in zip(filelist, predictions)})
def main():
    """Invert classifier features back to mel spectrogram excerpts and plot.

    For every file of the 'test' file list, one excerpt position is drawn
    at random between ``start_offset`` and ``end_offset`` seconds. A
    mini-batch of excerpts starting there is passed through the pre-trained
    classifier (``args.classifier``) up to ``args.layer``; the pre-trained
    inverter (``args.inverter``) maps those features back to mel
    spectrogram excerpts. The first input excerpt and its reconstruction
    are handed to ``plots.plot_figures``.
    """
    # parse the command line arguments
    args = utils.argument_parser().parse_args()

    separator = "-------------------------------"
    print(separator)
    print("classifier:%s" % args.classifier)
    print("inverter:%s" % args.inverter)
    print("dataset_path:%s" % args.dataset_path)
    print("dataset name:%s" % args.dataset)
    print("results path:%s" % args.results_dir)
    print("inverting from: %s" % args.layer)
    print(separator)

    # default parameters
    sample_rate = 22050
    frame_len = 1024
    fps = 70
    mel_bands = 80
    mel_min = 27.5
    mel_max = 8000
    blocklen = 115
    batchsize = 32
    start_offset = 10  # secs
    end_offset = 20  # secs

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), args.dataset_path,
                           'datasets', args.dataset)

    # load filelist
    with io.open(os.path.join(datadir, 'filelists', 'test')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]

    # compute spectra
    print("Computing%s spectra..." %
          (" or loading" if args.cache_spectra else ""))
    spects = []  # one cached/extracted spectrum per audio file
    for fn in progress(filelist, 'File '):
        cache_fn = (args.cache_spectra and
                    os.path.join(args.cache_spectra, fn + '.npy'))
        audio_fn = os.path.join(datadir, 'audio', fn)
        spects.append(cached(cache_fn, audio.extract_spect, audio_fn,
                             sample_rate, frame_len, fps))

    # prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len,
                                             mel_bands, mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    # load mean/std (must have been computed before)
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % args.dataset)
    with np.load(meanstd_file) as f:
        mean = f['mean']
        std = f['std']
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    print("Preparing training data feed...")
    # normalised log-mel spectra, without data augmentation
    mel_spects = [
        (np.log(np.maximum(np.dot(spect[:, :bin_mel_max], filterbank),
                           1e-7)) - mean) * istd
        for spect in spects]

    # Two theano functions are built: the first uses the pre-trained
    # classifier to produce features and predictions, the second uses the
    # pre-trained inverter to produce mel spectrograms from those features.

    # classifier (discriminator) model
    input_var = T.tensor3('input')
    # insert a "channels" dimension before feeding the CNN
    inputs = input_var.dimshuffle(0, 'x', 1, 2)
    network = model.architecture(inputs, (None, 1, blocklen, mel_bands))

    # load saved weights
    with np.load(args.classifier) as f:
        lasagne.layers.set_all_param_values(
            network['fc9'], [f['param%d' % i] for i in range(len(f.files))])

    # create output expressions: features of the chosen layer + predictions
    outputs_score = lasagne.layers.get_output(network[args.layer],
                                              deterministic=True)
    outputs_pred = lasagne.layers.get_output(network['fc9'],
                                             deterministic=True)

    # prepare and compile prediction functions
    print("Compiling classifier function...")
    pred_fn_score = theano.function([input_var], outputs_score,
                                    allow_input_downcast=True)
    pred_fn = theano.function([input_var], outputs_pred,
                              allow_input_downcast=True)

    # inverter (generator) model: dense layers are inverted from a matrix,
    # all other layers from a 4-tensor
    inverts_dense = args.layer in ('fc8', 'fc7')
    if inverts_dense:
        input_var_deconv = T.matrix('input_var_deconv')
    else:
        input_var_deconv = T.tensor4('input_var_deconv')

    feat_shape = lasagne.layers.get_output_shape(network[args.layer])
    # any layer name not listed here falls back to the conv1 inverter
    known_layers = ('fc8', 'fc7', 'mp6', 'conv5', 'conv4', 'mp3', 'conv2')
    builder_suffix = args.layer if args.layer in known_layers else 'conv1'
    build_inverter = getattr(upconv, 'architecture_upconv_' + builder_suffix)
    if inverts_dense:
        gen_network = build_inverter(input_var_deconv,
                                     (batchsize, feat_shape[1]))
    else:
        gen_network = build_inverter(
            input_var_deconv,
            (batchsize, feat_shape[1], feat_shape[2], feat_shape[3]),
            args.n_conv_layers, args.n_conv_filters)

    # load saved weights
    with np.load(args.inverter) as f:
        lasagne.layers.set_all_param_values(
            gen_network, [f['param%d' % i] for i in range(len(f.files))])

    # create output expression of the inverter
    outputs = lasagne.layers.get_output(gen_network, deterministic=True)
    print("Compiling inverter function...")
    test_fn = theano.function([input_var_deconv], outputs,
                              allow_input_downcast=True)

    # instance-based feature inversion
    # (1) pick a file from the dataset, (2) select a time index to read the
    # instance
    file_idx = np.arange(0, len(filelist))
    hop_size = sample_rate / fps  # samples

    for file_instance in file_idx:
        print("<<<<Analysis for the file: %d>>>>" % (file_instance + 1))
        # random integer start position between start and end offsets
        time_idx = np.random.randint(start_offset, end_offset, 1)[0]

        # view the file's mel spectrogram as overlapping excerpts:
        # num_excerpts x blocklen x number of columns
        mel = mel_spects[file_instance]
        num_excerpts = len(mel) - blocklen + 1
        print("Number of excerpts in the file :%d" % num_excerpts)
        row_stride, col_stride = mel.strides[0], mel.strides[1]
        excerpts = np.lib.stride_tricks.as_strided(
            mel,
            shape=(num_excerpts, blocklen, mel.shape[1]),
            strides=(row_stride, row_stride, col_stride))

        # convert the time_idx to the excerpt index
        excerpt_idx = int(np.round((time_idx * sample_rate) / (hop_size)))
        print("Time_idx: %f secs, Excerpt_idx: %d" % (time_idx, excerpt_idx))
        if (excerpt_idx + batchsize) > num_excerpts:
            print(
                "------------------Number of excerpts are less for file: %d--------------------"
                % (file_instance + 1))
            # NOTE(review): this ends the whole loop, so later files are not
            # analysed either -- confirm `continue` was not intended.
            break

        # A full mini-batch is fed although only its first excerpt is
        # analysed: the inverter is built for `batchsize` inputs.
        minibatch = excerpts[excerpt_idx:excerpt_idx + batchsize]
        scores = pred_fn_score(minibatch)
        predictions = pred_fn(minibatch)
        # drop axis 1 of the inverter output
        mel_predictions = np.squeeze(test_fn(scores), axis=1)

        # save plots of the input excerpt and its inverted representation,
        # both passed through utils.normalise first
        plots.plot_figures(utils.normalise(excerpts[excerpt_idx]),
                           utils.normalise(mel_predictions[0]),
                           predictions[0][0], file_instance, excerpt_idx,
                           args.results_dir, args.layer)
def prepare_audio(mean, istd, options):
    """
    Reads the audio files named in the 'valid' and 'test' file lists and
    turns each of them into overlapping, normalised (log-)spectrogram
    excerpts of `blocklen` (115) frames for the neural network model.

    Parameters:
        mean: offset subtracted from every log-spectrogram.
        istd: factor multiplied onto the mean-free log-spectrogram
            (presumably the inverse standard deviation -- confirm against
            the caller).
        options: parsed command line options. The attributes read here are
            `partial`, `cache_spectra` and `transform`, plus `save_input`,
            `dump_path`, `offset` and `duration` when `partial` is set.

    Returns:
        A tuple `(spectrum, filterbank_pinv)`.
        `spectrum` is a list holding one 3-d array per file
        (number of excerpts x blocklen x bins). The arrays are strided views
        onto the padded spectrogram, so neighbouring excerpts share memory.
        `filterbank_pinv` is the pseudo-inverse of the mel filterbank when
        `options.transform == 'mel'`; otherwise it is an all-ones dummy that
        only exists so both cases return the same kind of tuple.
    """
    # default parameters from ISMIR 2015: Jan et. al.
    sample_rate = 22050
    frame_len = 1024
    fps = 70
    mel_bands = 80
    mel_min = 27.5
    mel_max = 8000
    blocklen = 115
    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # locate the dataset and collect the file names (validation part first)
    print("Preparing data reading...")
    datadir = os.path.join(os.path.dirname(__file__), 'dataset')
    filelist = []
    for part in ('valid', 'test'):
        with io.open(os.path.join(datadir, 'filelists', part)) as f:
            filelist += [l.rstrip() for l in f if l.rstrip()]

    # offset and duration are only honoured by the partial extractor
    partial_mode = options.partial

    use_mel = (options.transform == 'mel')
    if use_mel:
        # mel filterbank, cut at the bin that corresponds to mel_max
        filterbank = audio.create_mel_filterbank(sample_rate, frame_len,
                                                 mel_bands, mel_min, mel_max)
        filterbank = filterbank[:bin_mel_max].astype(floatX)
        # pseudo-inverse (mel_bands x bin_mel_max, i.e. 80 x 372) for later
        # use by the caller
        filterbank_pinv = linalg.pinv(filterbank)
    else:
        # dummy of no use in this case; returned for a uniform interface
        filterbank_pinv = np.ones((mel_bands, bin_mel_max))

    # silence padding of blocklen // 2 frames, already z-scored
    pad = np.tile((np.log(1e-7) - mean) * istd, (blocklen // 2, 1))

    def padded_spects():
        # Lazily produces one padded, z-scored log-spectrogram per file.
        for fn in filelist:
            cache_fn = (options.cache_spectra and
                        os.path.join(options.cache_spectra, fn + '.npy'))
            audio_fn = os.path.join(datadir, 'audio', fn)
            if partial_mode:
                spect = simplecache.cached(
                    cache_fn, audio.extract_spect_partial, audio_fn,
                    options.save_input, options.dump_path, sample_rate,
                    frame_len, fps, options.offset, options.duration)
            else:
                spect = simplecache.cached(
                    cache_fn, audio.extract_spect, audio_fn, sample_rate,
                    frame_len, fps)
            if use_mel:
                spect = np.dot(spect[:, :bin_mel_max], filterbank)
            # log magnitude (floored at 1e-7), then Z-scoring
            spect = (np.log(np.maximum(spect, 1e-7)) - mean) * istd
            yield np.concatenate((pad, spect, pad), axis=0)

    # the generator is run in a background thread (not required)
    spects = augment.generate_in_background([padded_spects()], num_cached=1)

    # one 3-d array per audio file: No. of excerpts x 115 x bins
    spectrum = []
    print("Generating excerpts:")
    for spect in progress.progress(spects, total=len(filelist),
                                   desc='File '):
        # view the spectrogram memory as a 3-tensor of overlapping excerpts:
        # stepping to the next excerpt advances by exactly one frame
        frame_stride = spect.strides[0]
        bin_stride = spect.strides[1]
        num_excerpts = len(spect) - blocklen + 1
        spectrum.append(np.lib.stride_tricks.as_strided(
            spect,
            shape=(num_excerpts, blocklen, spect.shape[1]),
            strides=(frame_stride, frame_stride, bin_stride)))
    return spectrum, filterbank_pinv
def main():
    """
    Computes the spectrograms of a dataset's train/valid/test files and
    stores the ground truth plus several variants of summarized magnitudes.

    Everything is driven by the command line (`opts_parser()`) and the
    configuration files / variable assignments given there. The following
    files are written to `options.outdir`:

    - `<dataset>_gt.pkl`: pickled dict with the frame-wise boolean labels
      ('labels') and the train/valid/test index ranges ('splits')
    - `<dataset>_spect_sum.pkl`: summary of the linear spectra
    - `<dataset>_spect_mel_sum.pkl`: summary of the mel spectra
    - `<dataset>_spect_mel_log_sum.pkl`: summary of the log-mel spectra
    - `<dataset>_spect_mel_log_std_sum.pkl`: summary of the log-mel spectra
      standardized with the mean/std of the training part

    The summaries themselves are produced by `save_spectral_sums`.
    """
    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    outdir = options.outdir
    if options.load_spectra != 'memory' and not options.cache_spectra:
        parser.error('option --load-spectra=%s requires --cache-spectra' %
                     options.load_spectra)

    # read configuration files and immediate settings
    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))

    # read some settings into local variables
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__),
                           os.path.pardir, 'datasets', options.dataset)

    # - load filelist; `ranges` remembers which slice of the concatenated
    #   list belongs to which part
    filelist = []
    ranges = {}
    for part in 'train', 'valid', 'test':
        a = len(filelist)
        with io.open(os.path.join(datadir, 'filelists',
                                  cfg.get('filelist.%s' % part, part))) as f:
            filelist.extend(l.rstrip() for l in f if l.rstrip())
        ranges[part] = slice(a, len(filelist))

    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        cache_fn = (options.cache_spectra and
                    os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(cached(cache_fn, audio.extract_spect,
                             os.path.join(datadir, 'audio', fn),
                             sample_rate, frame_len, fps,
                             loading_mode=options.load_spectra))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        # builtin `bool` instead of the alias `np.bool`, which NumPy
        # deprecated in 1.20 and removed in 1.24
        labels.append(create_aligned_targets(segments, timestamps, bool))

    # compute and save different variants of summarized magnitudes
    print("Saving files...")

    # - ground truth
    outfile = os.path.join(outdir, '%s_gt.pkl' % options.dataset)
    print(outfile)
    with io.open(outfile, 'wb') as f:
        pickle.dump({'labels': labels, 'splits': ranges}, f, protocol=-1)

    # - summarized spectra
    save_spectral_sums(
        os.path.join(outdir, '%s_spect_sum.pkl' % options.dataset), spects)

    # - summarized mel spectra
    bank = audio.create_mel_filterbank(sample_rate, frame_len, mel_bands,
                                       mel_min, mel_max).astype(np.float32)
    spects = [np.dot(spect, bank) for spect in spects]
    save_spectral_sums(
        os.path.join(outdir, '%s_spect_mel_sum.pkl' % options.dataset),
        spects)

    # - summarized log-mel spectra (magnitudes floored at 1e-7)
    spects = [np.log(np.maximum(1e-7, spect)) for spect in spects]
    save_spectral_sums(
        os.path.join(outdir, '%s_spect_mel_log_sum.pkl' % options.dataset),
        spects)

    # - summarized standardized log-mel spectra; the statistics come from
    #   the training part only and are applied to all parts
    m, s = znorm.compute_mean_std(spects[ranges['train']], axis=0)
    spects = [((spect - m) / s).astype(np.float32) for spect in spects]
    save_spectral_sums(
        os.path.join(outdir,
                     '%s_spect_mel_log_std_sum.pkl' % options.dataset),
        spects)
def main():
    """
    Inverts 'fc8' features of a pre-trained classifier back to mel
    spectrograms and uses the inversions as masks on the classifier input.

    Driven by the command line (`utils.argument_parser()`), there are three
    modes:

    - `args.plot_quant_res`: only plot the hard-coded quantitative results
      via `plots.quant_eval` and exit.
    - single instance inversion (`args.quant_analysis` false): for every
      test file, invert the excerpt at a fixed time index, threshold the
      normalised inversion at 0.7 into a binary mask, apply it to the input
      and save a figure via `plots.single_inst_inv`.
    - quantitative analysis (`args.quant_analysis` true): for a range of
      masking thresholds and many excerpts per file, compare the classifier
      prediction before and after masking and write one summary row per
      threshold to `quant_analysis_result.csv` in `args.results_dir`.
    """
    # Local import: only needed for the early exit below.
    import sys

    # parse the command line arguments
    parser = utils.argument_parser()
    args = parser.parse_args()

    print("-------------------------------")
    print("classifier:%s" % args.classifier)
    print("inverter:%s" % args.inverter)
    print("dataset_path:%s" % args.dataset_path)
    print("dataset name:%s" % args.dataset)
    print("results path:%s" % args.results_dir)
    print("quantitative analysis:%s" % args.quant_analysis)
    print("mask inversion flag: %r" % args.mask_inv_flag)
    print("plot quant results case: %r" % args.plot_quant_res)
    print("-------------------------------")

    # just plots the quantitative analysis results and exits
    if args.plot_quant_res:
        # jamendo results
        exp_loss_jamendo_case1 = [
            0, 6.48, 10.87, 13.33, 15.51, 19.15, 25.94, 37.56, 49.11, 56.85,
            57.77
        ]
        exp_loss_jamendo_case2 = [
            57.77, 59.19, 58.11, 51.81, 43.1, 31.87, 22.84, 15.51, 11.03,
            5.86, 0.03
        ]
        rel_area_jamendo_case1 = [100, 96, 87, 77, 65, 53, 39, 26, 14, 4, 0]
        rel_area_jamendo_case2 = [0, 4, 13, 23, 35, 47, 61, 74, 86, 96, 100]
        exp_losses_jamendo = [exp_loss_jamendo_case1, exp_loss_jamendo_case2]
        rel_areas_jamendo = [rel_area_jamendo_case1, rel_area_jamendo_case2]

        # rwc results
        exp_loss_rwc_case1 = [
            0, 6.52, 10.9, 13.39, 15.87, 21.28, 30.92, 43.22, 53.41, 60.85,
            63.66
        ]
        exp_loss_rwc_case2 = [
            63.66, 64.5, 61.01, 52.55, 39.39, 26.27, 16.13, 9.55, 5.05, 2.26,
            0.03
        ]
        rel_area_rwc_case1 = [100, 96, 87, 75, 61, 47, 33, 20, 10, 3, 0]
        rel_area_rwc_case2 = [0, 4, 13, 25, 39, 53, 67, 80, 90, 97, 100]
        exp_losses_rwc = [exp_loss_rwc_case1, exp_loss_rwc_case2]
        rel_areas_rwc = [rel_area_rwc_case1, rel_area_rwc_case2]

        plots.quant_eval(exp_losses_jamendo, rel_areas_jamendo,
                         exp_losses_rwc, rel_areas_rwc, args.results_dir)
        # sys.exit() rather than the site-provided exit(), which is meant
        # for the interactive shell and is not guaranteed to exist
        sys.exit(0)

    # default parameters
    sample_rate = 22050
    frame_len = 1024
    fps = 70
    mel_bands = 80
    mel_min = 27.5
    mel_max = 8000
    blocklen = 115
    batchsize = 32

    # single instance inversion/quantitative analysis parameters
    preds_before = []
    if not args.quant_analysis:
        time_index = 10
        masking_threshold = [0.7]
        duration = 0  # a single excerpt per file
        increment = 0.5
    else:
        preds_after = []
        area_per_instance = []
        result = []
        start_offset = 5
        end_offset = 20
        duration = 200
        increment = 0.5
        masking_threshold = np.arange(0.0, 1.2, 0.1)
        class_threshold = 0.66  # Calculated over Jamendo validation dataset

    # printing and plotting parameters
    df = True

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), args.dataset_path,
                           'datasets', args.dataset)

    # load filelist
    with io.open(os.path.join(datadir, 'filelists', 'test')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]

    # compute spectra
    print("Computing%s spectra..." %
          (" or loading" if args.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        cache_fn = (args.cache_spectra and
                    os.path.join(args.cache_spectra, fn + '.npy'))
        spects.append(cached(cache_fn, audio.extract_spect,
                             os.path.join(datadir, 'audio', fn),
                             sample_rate, frame_len, fps))

    # prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len,
                                             mel_bands, mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    # log-mel spectra, defined lazily and materialised further below
    mel_spects = (np.log(np.maximum(np.dot(spect[:, :bin_mel_max],
                                           filterbank), 1e-7))
                  for spect in spects)

    # load mean/std (the file must exist; it is not computed here)
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % args.dataset)
    with np.load(meanstd_file) as f:
        mean = f['mean']
        std = f['std']
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    print("Preparing training data feed...")
    # normalised mel spects, without data augmentation
    mel_spects = [(spect - mean) * istd for spect in mel_spects]

    # we create two theano functions
    # the first one uses pre-trained classifier to generate features and
    # predictions
    # the second one uses pre-trained inverter to generate mel spectrograms
    # from input features

    # classifier (discriminator) model
    input_var = T.tensor3('input')
    # insert "channels" dimension, changes a 32 x 115 x 80 input to
    # 32 x 1 x 115 x 80 input which is fed to the CNN
    inputs = input_var.dimshuffle(0, 'x', 1, 2)
    network = model.architecture(inputs, (None, 1, blocklen, mel_bands))

    # load saved weights
    with np.load(args.classifier) as f:
        lasagne.layers.set_all_param_values(
            network['fc9'], [f['param%d' % i] for i in range(len(f.files))])

    # create output expression
    outputs_score = lasagne.layers.get_output(network['fc8'],
                                              deterministic=True)
    outputs_pred = lasagne.layers.get_output(network['fc9'],
                                             deterministic=True)

    # prepare and compile prediction function
    print("Compiling classifier function...")
    pred_fn_score = theano.function([input_var], outputs_score,
                                    allow_input_downcast=True)
    pred_fn = theano.function([input_var], outputs_pred,
                              allow_input_downcast=True)

    # inverter (generator) model
    input_var_deconv = T.matrix('input_var_deconv')
    gen_network = upconv.architecture_upconv_fc8(
        input_var_deconv,
        (batchsize, lasagne.layers.get_output_shape(network['fc8'])[1]))

    # load saved weights
    with np.load(args.inverter) as f:
        lasagne.layers.set_all_param_values(
            gen_network, [f['param%d' % i] for i in range(len(f.files))])

    # create cost expression
    outputs = lasagne.layers.get_output(gen_network, deterministic=True)
    print("Compiling inverter function...")
    test_fn = theano.function([input_var_deconv], outputs,
                              allow_input_downcast=True)

    # instance-based feature inversion
    # (1) pick a file from a dataset (e.g., dataset: Jamendo test)
    # (2) select a time index to read the instance
    file_idx = np.arange(0, len(filelist))
    hop_size = sample_rate / fps  # samples

    for mt in masking_threshold:
        # same random start positions for every threshold
        np.random.seed(0)
        print("\n ++++++ Masking threshold: %f +++++\n " % (mt))

        for file_instance in file_idx:
            print("<<<<Analysis for the file: %d>>>>" % (file_instance + 1))
            if args.quant_analysis:
                # random integer start position between start and end offsets
                time_idx = np.random.randint(start_offset, end_offset, 1)[0]
            else:
                time_idx = time_index
            # start time; with duration == 0 (single instance inversion)
            # the loop below runs exactly once
            td = time_idx

            # generate excerpts for the selected file_idx
            # excerpts is a 3-d array of shape:
            # num_excerpts x blocklen x mel_spects_dimensions
            mel_spect = mel_spects[file_instance]
            num_excerpts = len(mel_spect) - blocklen + 1
            print("Number of excerpts in the file :%d" % num_excerpts)
            excerpts = np.lib.stride_tricks.as_strided(
                mel_spect,
                shape=(num_excerpts, blocklen, mel_spect.shape[1]),
                strides=(mel_spect.strides[0], mel_spect.strides[0],
                         mel_spect.strides[1]))

            while (time_idx <= td + duration):
                # convert the time_idx to the excerpt index
                excerpt_idx = int(
                    np.round((time_idx * sample_rate) / (hop_size)))
                print("Time_idx: %.2f secs, Excerpt_idx: %d" %
                      (time_idx, excerpt_idx))
                if ((excerpt_idx + batchsize) > num_excerpts):
                    print("------------------Number of excerpts are less for file: %d--------------------"
                          % (file_instance + 1))
                    break

                # generating feature representations for the select excerpt.
                # CAUTION: Need to feed mini-batch to the pre-trained model,
                # so (mini_batch-1) following excerpts are also fed, but are
                # not analysed. The classifier can have less than minibatch
                # data, but the inverter needs a batch of data to make
                # prediction (comes from how the inverter was trained)
                batch = excerpts[excerpt_idx:excerpt_idx + batchsize]
                scores = pred_fn_score(batch)
                predictions = pred_fn(batch)
                print("Prediction score for the excerpt without masking:%f" %
                      (predictions[0][0]))
                preds_before.append(predictions[0][0])

                # mel_predictions is a 3-d array of shape
                # batch_size x blocklen x n_mels
                mel_predictions = np.squeeze(test_fn(scores), axis=1)

                # normalising the inverted mel to create a map, and use the
                # map to cut the section in the input mel
                norm_inv = utils.normalise(mel_predictions[0])
                norm_inv[norm_inv < mt] = 0  # binary mask
                norm_inv[norm_inv >= mt] = 1

                # `area` only exists in quantitative mode, so everything
                # that needs it stays inside this branch
                if args.quant_analysis:
                    # calculate area
                    area = utils.area_calculation(norm_inv, debug_flag=df)
                    # reversing the mask to keep the portions that seem not
                    # useful for the current instance prediction
                    norm_inv, area = utils.invert_mask(
                        mask=norm_inv, mask_inv_flag=args.mask_inv_flag,
                        area_mask=area, debug_flag=df)

                # masking out the input based on the mask created above;
                # only the first element of the batch carries data
                masked_input = np.zeros((batchsize, blocklen, mel_bands))
                masked_input[0] = norm_inv * excerpts[excerpt_idx]

                if args.quant_analysis:
                    # save the area enabled
                    area_per_instance.append(area)
                    # feed the updated input to regenerate prediction
                    # just changing the first input.
                    predictions = pred_fn(masked_input)
                    print("Predictions score for the excerpt after masking:%f\n"
                          % (predictions[0][0]))
                    preds_after.append(predictions[0][0])
                else:
                    # save results
                    # saves plots for the input Mel spectrogram and its
                    # inverted representation; all plots are normalised in
                    # [0, 1] range.
                    # preds_before is never cleared in this mode, so the
                    # score of the current excerpt is the last entry, not
                    # the first one.
                    plots.single_inst_inv(
                        utils.normalise(excerpts[excerpt_idx]),
                        utils.normalise(mel_predictions[0]),
                        norm_inv,
                        utils.normalise(masked_input[0]),
                        preds_before[-1], file_instance, excerpt_idx,
                        args.results_dir, 'FC8')

                time_idx += increment
                # NOTE: figures 6.4 and 6.6 of the thesis were produced from
                # this spot with plots.input_mels() and
                # plots.special_cases(...) respectively.

        if args.quant_analysis:
            res_tuple = utils.quant_result_analysis(
                preds_before, preds_after, area_per_instance, mt,
                class_threshold, debug_flag=df)
            result.append(res_tuple)  # one result per threshold value
            # clearing the lists for the next iteration
            preds_before = []
            preds_after = []
            area_per_instance = []

    if args.quant_analysis:
        # save the quantitative analysis results
        quant_result_columns = [
            'threshold', 'total instances', 'total fails',
            'explanation loss [%]', 'average area'
        ]
        csv_file = os.path.join(args.results_dir, 'quant_analysis_result.csv')
        with open(csv_file, 'w') as fp:
            results_writer = csv.writer(fp, delimiter=',')
            results_writer.writerow(quant_result_columns)
            for result_th in result:
                results_writer.writerow(result_th)
def main():
    """
    Trains the singing voice detection CNN (`model.CNNModel`) with PyTorch.

    Settings come from the command line (`opts_parser()`) and from the
    configuration files / variable assignments given there. The function

    1. computes (or loads cached) spectrograms and frame-wise labels for the
       'train' file list, plus the 'valid' list if `options.validate`,
    2. loads or computes the log-mel mean/std used for normalisation,
    3. sets up a generator of random training excerpts, optionally with
       data augmentation in background threads or processes,
    4. runs SGD with Nesterov momentum and step-wise learning rate decay
       for `cfg['epochs']` epochs of `cfg['epochsize']` mini-batches,
       logging to TensorBoard under `<modelfile>/runs`.

    With validation, `<modelfile>/model.pth` is written whenever both the
    validation loss and the validation error improve; without validation
    the model is saved once after the last epoch.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile

    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))

    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__),
                           os.path.pardir, 'datasets', options.dataset)

    # - load filelist (validation files are appended to the training files
    #   so that spectra and labels are computed in one go)
    with io.open(os.path.join(datadir, 'filelists', 'train')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    if options.validate:
        with io.open(os.path.join(datadir, 'filelists', 'valid')) as f:
            filelist_val = [l.strip() for l in f if l.strip()]
        if not filelist_val:
            # otherwise the split below would leave no usable validation
            # data and the averaged validation loss would divide by zero
            parser.error('validation requested, but the validation '
                         'filelist is empty')
        filelist.extend(filelist_val)
    else:
        filelist_val = []

    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        cache_fn = (options.cache_spectra and
                    os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(cached(cache_fn, audio.extract_spect,
                             os.path.join(datadir, 'audio', fn),
                             sample_rate, frame_len, fps))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        # builtin `bool` instead of the alias `np.bool`, which NumPy
        # deprecated in 1.20 and removed in 1.24
        labels.append(create_aligned_targets(segments, timestamps, bool))

    # - prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len,
                                             mel_bands, mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    if options.validate:
        # split off the validation files again; an explicit index avoids
        # the `[-0:]` / `[:-0]` pitfall of negative slicing
        num_train = len(spects) - len(filelist_val)
        spects_val = spects[num_train:]
        spects = spects[:num_train]
        labels_val = labels[num_train:]
        labels = labels[:num_train]

    # - precompute mel spectra, if needed, otherwise just define a generator
    mel_spects = (np.log(np.maximum(np.dot(spect[:, :bin_mel_max],
                                           filterbank), 1e-7))
                  for spect in spects)
    if not options.augment:
        mel_spects = list(mel_spects)
        del spects

    # - load mean/std or compute it, if not computed yet
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    try:
        with np.load(meanstd_file) as f:
            mean = f['mean']
            std = f['std']
    except (IOError, KeyError):
        print("Computing mean and standard deviation...")
        mean, std = znorm.compute_mean_std(mel_spects)
        np.savez(meanstd_file, mean=mean, std=std)
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    # - prepare training data generator
    print("Preparing training data feed...")
    if not options.augment:
        # Without augmentation, we just precompute the normalized mel spectra
        # and create a generator that returns mini-batches of random excerpts
        mel_spects = [(spect - mean) * istd for spect in mel_spects]
        batches = augment.grab_random_excerpts(mel_spects, labels,
                                               batchsize, blocklen)
    else:
        # For time stretching and pitch shifting, it pays off to preapply the
        # spline filter to each input spectrogram, so it does not need to be
        # applied to each mini-batch later.
        spline_order = cfg['spline_order']
        if spline_order > 1:
            from scipy.ndimage import spline_filter
            spects = [spline_filter(spect, spline_order).astype(floatX)
                      for spect in spects]

        # We define a function to create the mini-batch generator. This allows
        # us to easily create multiple generators for multithreading if needed.
        def create_datafeed(spects, labels):
            # With augmentation, as we want to apply random time-stretching,
            # we request longer excerpts than we finally need to return.
            max_stretch = cfg['max_stretch']
            batches = augment.grab_random_excerpts(
                spects, labels, batchsize=batchsize,
                frames=int(blocklen / (1 - max_stretch)))

            # We wrap the generator in another one that applies random time
            # stretching and pitch shifting, keeping a given number of frames
            # and bins only.
            max_shift = cfg['max_shift']
            batches = augment.apply_random_stretch_shift(
                batches, max_stretch, max_shift,
                keep_frames=blocklen, keep_bins=bin_mel_max,
                order=spline_order, prefiltered=True)

            # We transform the excerpts to mel frequency and log magnitude.
            batches = augment.apply_filterbank(batches, filterbank)
            batches = augment.apply_logarithm(batches)

            # We apply random frequency filters
            max_db = cfg['max_db']
            batches = augment.apply_random_filters(batches, filterbank,
                                                   mel_max, max_db=max_db)

            # We apply normalization
            batches = augment.apply_znorm(batches, mean, istd)

            return batches

        # We start the mini-batch generator and augmenter in one or more
        # background threads or processes (unless disabled).
        bg_threads = cfg['bg_threads']
        bg_processes = cfg['bg_processes']
        if not bg_threads and not bg_processes:
            # no background processing: just create a single generator
            batches = create_datafeed(spects, labels)
        elif bg_threads:
            # multithreading: create a separate generator per thread
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels)
                 for _ in range(bg_threads)],
                num_cached=bg_threads * 5)
        elif bg_processes:
            # multiprocessing: single generator is forked along with processes
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels)] * bg_processes,
                num_cached=bg_processes * 25,
                in_processes=True)

    ###########################################################################
    #-----------Main changes to code to make it work with pytorch-------------#
    ###########################################################################

    print("preparing training function...")
    mdl = model.CNNModel()
    mdl = mdl.to(device)

    # learning rate schedule and momentum
    initial_eta = cfg['initial_eta']
    eta_decay = cfg['eta_decay']
    momentum = cfg['momentum']
    eta_decay_every = cfg.get('eta_decay_every', 1)

    # loss, optimizer and learning rate scheduler
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.SGD(mdl.parameters(), lr=initial_eta,
                                momentum=momentum, nesterov=True)
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=eta_decay_every, gamma=eta_decay)

    # TensorBoard logging
    writer = SummaryWriter(os.path.join(modelfile, 'runs'))

    epochs = cfg['epochs']
    epochsize = cfg['epochsize']
    batches = iter(batches)

    if options.validate:
        from eval import evaluate
        # The normalised validation mel spectra do not change between
        # epochs, so they are computed once up front.
        mel_spects_val = [
            (np.log(np.maximum(np.dot(spect[:, :bin_mel_max], filterbank),
                               1e-7)) - mean) * istd
            for spect in spects_val
        ]

    # conditions to save model
    best_val_loss = 100000.
    best_val_error = 1.

    for epoch in range(epochs):
        loss_accum = 0
        mdl.train(True)

        # - Compute the L-2 norm of the gradients. This runs before the
        #   first batch of the epoch, so it sees the gradients left over
        #   from the previous epoch's last batch (none in the first epoch).
        total_norm = 0
        for p in mdl.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item()**2
        total_norm = total_norm**(1. / 2)

        # - Start the training for this epoch
        for _ in progress(range(epochsize), min_delay=0.5,
                          desc='Epoch %d/%d: Batch ' % (epoch + 1, epochs)):
            data = next(batches)
            # add a channel axis: (batch, frames, bins) ->
            # (batch, 1, frames, bins)
            input_data = np.transpose(data[0][:, :, :, np.newaxis],
                                      (0, 3, 1, 2))
            # A separate name keeps the outer `labels` list intact.
            targets = data[1][:, np.newaxis].astype(np.float32)
            # map labels to make them softer (0 -> 0.02, 1 -> 0.98)
            targets = (0.02 + 0.96 * targets)

            optimizer.zero_grad()
            outputs = mdl(torch.from_numpy(input_data).to(device))
            loss = criterion(outputs, torch.from_numpy(targets).to(device))
            loss.backward()
            optimizer.step()
            loss_accum += loss.item()

        # - Compute validation loss and error if desired
        if options.validate:
            mdl.train(False)
            val_loss = 0
            num_iter = 0
            preds = []
            labs = []
            # no gradients are needed for evaluation
            with torch.no_grad():
                for spect, label in zip(mel_spects_val, labels_val):
                    # view the spectrogram as overlapping excerpts
                    num_excerpts = len(spect) - blocklen + 1
                    excerpts = np.lib.stride_tricks.as_strided(
                        spect,
                        shape=(num_excerpts, blocklen, spect.shape[1]),
                        strides=(spect.strides[0], spect.strides[0],
                                 spect.strides[1]))

                    # - Pass mini-batches through the network and
                    #   concatenate results
                    for pos in range(0, num_excerpts, batchsize):
                        # the last mini-batch of a file may be shorter
                        stop = min(pos + batchsize, num_excerpts)
                        input_data = np.transpose(
                            excerpts[pos:pos + batchsize, :, :, np.newaxis],
                            (0, 3, 1, 2))
                        # an excerpt is labelled by its centre frame
                        label_batch = label[blocklen // 2 + pos:
                                            blocklen // 2 + stop,
                                            np.newaxis].astype(np.float32)

                        pred = mdl(torch.from_numpy(input_data).to(device))
                        e = criterion(
                            pred, torch.from_numpy(label_batch).to(device))
                        preds = np.append(
                            preds, pred[:, 0].cpu().detach().numpy())
                        labs = np.append(labs, label_batch)
                        val_loss += e.item()
                        num_iter += 1

            mean_val_loss = val_loss / num_iter
            print("Validation loss: %.3f" % mean_val_loss)
            _, results = evaluate(preds, labs)
            val_error = 1 - results['accuracy']
            print("Validation error: %.3f" % val_error)

            # keep the model only if both criteria improved
            if mean_val_loss < best_val_loss and val_error < best_val_error:
                torch.save(mdl.state_dict(),
                           os.path.join(modelfile, 'model.pth'))
                best_val_loss = mean_val_loss
                best_val_error = val_error
                print('New saved model', best_val_loss, best_val_error)

        # Update the learning rate
        scheduler.step()
        print('Training Loss per epoch', loss_accum / epochsize)

        # - Save parameters for examining
        writer.add_scalar('Training Loss', loss_accum / epochsize, epoch)
        writer.add_scalar('Gradient norm', total_norm, epoch)
        if options.validate:
            # validation figures exist only if validation was run
            writer.add_scalar('Validation loss', mean_val_loss, epoch)
            writer.add_scalar('Validation error', val_error, epoch)

        for param_group in optimizer.param_groups:
            print(param_group['lr'])

    # Without validation there is no "best" model: save the final one.
    if not options.validate:
        torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))