def _extract_latent(fn, model, sr=22050, is_mulaw=True):
    """Extract the latent (bottleneck) embedding for a single music file.

    Args:
        fn (str): filename for a single music file (.npy blob or any
            audio format librosa can decode)
        model (BaseModel): model exposing `sig_len`, `parameters` and
            `get_bottleneck`
        sr (int): sampling rate used when decoding non-npy audio
        is_mulaw (bool): if True, .npy inputs are decoded with
            `load_mulaw`; otherwise loaded directly with `np.load`

    Returns:
        np.ndarray: latent embedding of the center-cropped / zero-padded
            signal
    """
    if basename(fn).split('.')[-1] == 'npy':
        if is_mulaw:
            y = load_mulaw(fn)
        else:
            y = np.load(fn)
    else:
        y, sr = librosa.load(fn, sr=sr)
    y = y.astype(np.float32)

    if len(y) > model.sig_len:
        # find the center and crop from there
        mid = int(len(y) / 2)
        half_len = int(model.sig_len / 2)
        start_point = mid - half_len
        y = y[start_point:start_point + model.sig_len]
    elif len(y) < model.sig_len:
        # zero-pad the tail up to the model's expected length
        rem = model.sig_len - len(y)
        y = np.r_[y, np.zeros((rem,), dtype=y.dtype)]

    # `.numpy()` raises on a CUDA tensor; move the input to the model's
    # device and bring the bottleneck back to host memory first
    # (consistent with the GPU handling in `ext_latents`)
    x = torch.from_numpy(y)[None]
    if next(model.parameters()).is_cuda:
        x = x.cuda()
        z = model.get_bottleneck(x).data.cpu().numpy()
    else:
        z = model.get_bottleneck(x).data.numpy()
    return z[0]
def _transform(fn, transformer, out_root, perturbations=cfg.PERTURBATIONS,
               normalization=True):
    """Apply a perturbation sweep to one signal and dump each result.

    Args:
        fn (str): path to the input signal (.npy mu-law blob or audio file)
        transformer (BaseTransformer): transformation class
        out_root (str): directory where transformed signals are written
        perturbations (OrderedDict): perturbations set to sweep over
        normalization (bool): if True, loudness-normalize each output
            against the input with `lufs_norm`
    """
    # load the signal: .npy files are decoded as mu-law blobs
    ext = basename(fn).split('.')[-1]
    if ext == 'npy':
        x = load_mulaw(fn)
    else:
        x, sr = librosa.load(fn, sr=cfg.FS)

    stem = basename(fn).split('.')[0]
    for magnitude in get_transform_range(transformer, perturbations):
        y = transformer(x, magnitude)
        if normalization:
            # match the output loudness to the input's
            y = lufs_norm(y, x, cfg.FS)
        out_fn = '_'.join([stem, get_pert_id(transformer, magnitude)])
        save_mulaw(join(out_root, out_fn), y)
def _extract_mfcc(fn, out_root, include_coef0=False, sig_len=44100,
                  sr=22050, n_mfccs=25):
    """Compute MFCCs for one music file and save them as a .npy array.

    Args:
        fn (str): filename for a single music file
        out_root (str): directory where the MFCC matrix is written
        include_coef0 (bool): keep the 0th coefficient if True
        sig_len (int): length of desired signal
            NOTE(review): currently unused in this function — confirm intent
        sr (int): sampling rate for decoding non-npy audio
        n_mfccs (int): number of coefficients to compute
    """
    ext = basename(fn).split('.')[-1]
    if ext == 'npy':
        y = load_mulaw(fn)
    else:
        y, sr = librosa.load(fn, sr=sr)

    M = librosa.feature.mfcc(y, sr, n_mfcc=n_mfccs).T
    if not include_coef0:
        # drop the first coefficient
        M = M[:, 1:]

    np.save(join(out_root, splitext(basename(fn))[0] + '.npy'), M)
def ext_latents(fns, out_fn, model=None, n_jobs=1, is_mulaw=True,
                batch_sz=100, sr=22050):
    """Extract latent features for a list of music files and save them.

    Args:
        fns (list of str): file names of the music inputs
        out_fn (str): path to dump the stacked latent matrix
        model (BaseModel): model exposing `sig_len`, `parameters` and
            `get_bottleneck` (NOTE(review): despite the old docstring,
            `None` / 'mfcc' are not handled here — `model` is required)
        n_jobs (int): number of parallel jobs (currently unused here)
        is_mulaw (bool): if True, .npy inputs are decoded with `load_mulaw`;
            otherwise loaded directly with `np.load`
        batch_sz (int): number of signals per forward pass
        sr (int): sampling rate used when decoding non-npy audio
            (new keyword; previously `sr` was read without ever being
            assigned, raising NameError on any non-npy input)
    """
    is_gpu = next(model.parameters()).is_cuda
    batch = []
    Z = []
    for i, fn in tqdm(enumerate(fns), total=len(fns), ncols=80):
        if basename(fn).split('.')[-1] == 'npy':
            if is_mulaw:
                y = load_mulaw(fn)
            else:
                y = np.load(fn)
        else:
            # keep `sr` fixed across files: ignore the returned rate
            y, _ = librosa.load(fn, sr=sr)
        y = y.astype(np.float32)

        if len(y) > model.sig_len:
            # find the center and crop from there
            mid = int(len(y) / 2)
            half_len = int(model.sig_len / 2)
            start_point = mid - half_len
            y = y[start_point:start_point + model.sig_len]
        elif len(y) < model.sig_len:
            # zero-pad the tail up to the model's expected length
            rem = model.sig_len - len(y)
            y = np.r_[y, np.zeros((rem,), dtype=y.dtype)]

        batch.append(y)

        # flush when the batch is full or we just consumed the last file
        if (len(batch) >= batch_sz) or ((i + 1) == len(fns)):
            batch = torch.from_numpy(np.array(batch).astype(np.float32))
            if is_gpu:
                batch = batch.cuda()
            z = model.get_bottleneck(batch)
            if is_gpu:
                z = z.data.cpu().numpy()
            else:
                z = z.data.numpy()
            Z.append(z)
            batch = []

    # save the stacked output (already an ndarray after concatenation)
    np.save(out_fn, np.concatenate(Z, axis=0))
def load_audio(fn, mulaw=True, sr=22050):
    """Load a signal from disk as a 1-D float32 array.

    Args:
        fn (str): path to the signal; `.npy` files are treated as mu-law
            blobs (or raw arrays when `mulaw` is False), anything else is
            decoded with librosa
        mulaw (bool): decode .npy inputs with `load_mulaw` if True,
            otherwise load them directly with `np.load`
        sr (int): target sampling rate for librosa decoding
            (new keyword; previously `sr` was referenced without ever
            being assigned, raising NameError on any non-npy input)

    Returns:
        np.ndarray: the loaded signal cast to float32
    """
    if basename(fn).split('.')[-1] == 'npy':
        if mulaw:
            y = load_mulaw(fn)
        else:
            y = np.load(fn)
    else:
        y, sr = librosa.load(fn, sr=sr)

    # make sure the input is right dtype
    return y.astype(np.float32)
def _extract_mfcc(fn, sr=22050, is_mulaw=True):
    """Load one music file and return its MFCC feature vector.

    Args:
        fn (str): filename for a single music file
        sr (int): sampling rate for decoding non-npy audio
        is_mulaw (bool): decode .npy inputs with `load_mulaw` if True,
            otherwise load them directly with `np.load`

    Returns:
        np.ndarray: MFCC feature vector
    """
    is_npy = basename(fn).split('.')[-1] == 'npy'
    if not is_npy:
        y, sr = librosa.load(fn, sr=sr)
    elif is_mulaw:
        y = load_mulaw(fn)
    else:
        y = np.load(fn)
    return _mfcc(y, sr)
def _random_crop(in_fn, out_root, sig_len, n_samples=1, sr=22050):
    """Randomly crop clips from a signal and save them as mu-law .npy files.

    Args:
        in_fn (str): path to load the original signal
        out_root (str): path to save the cropped signals
        sig_len (int): length of each cropped clip
        n_samples (int): the number of clips to be cropped
        sr (int): sampling rate of the signal

    Returns:
        list of (str, (int, int)): for each clip, the path where it was
            saved and its (start, end) sample indices

    Raises:
        ValueError: if the requested clips cannot fit inside the signal
    """
    # helper to dump one clip; the filename encodes its sample range
    def _save(signal, sample_rate, start_ix):
        out_fn = join(
            out_root,
            splitext(basename(in_fn))[0] + '_{:d}_{:d}.npy'.format(
                int(start_ix), int(start_ix + sig_len)
            )
        )
        save_mulaw(out_fn, signal, sample_rate)
        return (out_fn, (start_ix, start_ix + sig_len))

    if basename(in_fn).split('.')[-1] == 'npy':
        y = load_mulaw(in_fn)
    else:
        y, sr = librosa.load(in_fn, sr=sr)

    # explicit validation instead of `assert`, which is stripped under -O
    if sig_len * n_samples >= len(y):
        raise ValueError(
            'requested crops do not fit: signal length {:d}, '
            'sig_len={:d}, n_samples={:d}'.format(len(y), sig_len, n_samples)
        )

    info = [_save(sig, sr, st)
            for sig, st in _random_crop_signal(y, n_samples, sig_len)]
    return info