def separate(mixture, model, params, device):
    labels = ['s%d' % i for i in range(1, params['num_attractors'] + 1)]
    estimates = {}
    mix = mixture
    if len(mix.shape) > 1:
        mix = mixture[:, 0]
    _, mix = utils.mask_mixture(1, mix, params['n_fft'], params['hop_length'])
    log_spec = utils.transform(mix, params['n_fft'], params['hop_length'])
    silence_mask = log_spec > -25
    log_spec = utils.whiten(log_spec)

    with torch.no_grad():
        input_data = torch.from_numpy(log_spec).unsqueeze(0).requires_grad_().to(device)

    if 'DeepAttractor' in str(model):
        with torch.no_grad():
            masks, _, embedding, _ = model(input_data, one_hots=None)
        clusterer = KMeans(n_clusters=params['num_attractors'])
        embedding_ = embedding.squeeze(0).cpu().data.numpy()
        clusterer.fit(embedding_[silence_mask.flatten()])
        assignments = clusterer.predict(embedding_)
        assignments = assignments.reshape((masks.shape[1], masks.shape[2]))

        for i, label in enumerate(labels):
            mask = (assignments == i).T.astype(float)
            source, mix = utils.mask_mixture(mask, mix, params['n_fft'], params['hop_length'])
            estimates[label] = source

    return estimates
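# utils.whiten is not shown in this snippet. For a log spectrogram a common choice,
# and only a plausible guess at what it does here, is global zero-mean / unit-variance
# scaling; whiten_log_spec_sketch below is an illustrative sketch under that assumption,
# not the project's actual implementation.
import numpy as np

def whiten_log_spec_sketch(log_spec, eps=1e-8):
    # Standardise the whole log spectrogram to zero mean and unit variance
    return (log_spec - log_spec.mean()) / (log_spec.std() + eps)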
def main(train_db, test_db):
    # Load train dataset
    _, lst_vec_train = utils.data_load(train_db)

    # Get PCA transform matrix from train data
    X_train = np.array(lst_vec_train)
    _, W = utils.whiten(X_train)

    # Load test dataset
    lst_id, lst_vec_test = utils.data_load(test_db)

    # Whiten the test dataset
    X_test = np.array(lst_vec_test)
    X_test_white = np.dot(X_test, W)

    # Calculate scores for the test dataset
    lst_white_vec = [X_test_white[i, :] for i in range(X_test_white.shape[0])]
    lst_compare_key_result, lst_compare_ivec_result = utils.calc_scores(
        lst_id, lst_white_vec)

    # Plot FR/FA curve
    utils.plot_fr_fa(lst_compare_key_result, lst_compare_ivec_result)

    # Plot scores histogram
    utils.plot_hist_scores(lst_compare_key_result, lst_compare_ivec_result)
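# A minimal numpy sketch of a whiten() that returns (whitened_train, W) so that test
# vectors can later be projected with np.dot(X_test, W), as in main() above. The exact
# centring and eigenvalue handling are assumptions; the real utils.whiten may differ.
import numpy as np

def whiten_sketch(X, eps=1e-8):
    X_centred = X - X.mean(axis=0)
    cov = np.cov(X_centred, rowvar=False)      # d x d covariance
    eigvals, eigvecs = np.linalg.eigh(cov)     # PCA basis
    W = eigvecs / np.sqrt(eigvals + eps)       # whitening matrix (d x d)
    return X_centred.dot(W), W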
def test_whitening(self):
    desired_rms = 0.038021
    test_data, sample_rate = sf.read(PATH + '/data/whitening_test_audio.flac')
    test_data = np.stack([test_data] * 2)

    whitened = whiten(torch.from_numpy(test_data), desired_rms)

    # Mean correct
    self.assertTrue(np.isclose(whitened.mean().item(), 0))

    # RMS correct
    self.assertTrue(
        np.isclose(
            np.sqrt(np.power(whitened[0, :], 2).mean()).item(), desired_rms))
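# Minimal sketch of a whiten() that satisfies the assertions in the test above: remove
# the mean and rescale each example to a fixed RMS. The dim/keepdim handling is an
# assumption about the real implementation, not taken from it.
import torch

def whiten_sketch(batch, desired_rms=0.038021, eps=1e-8):
    batch = batch - batch.mean(dim=-1, keepdim=True)       # zero mean per example
    rms = batch.pow(2).mean(dim=-1, keepdim=True).sqrt()   # current RMS per example
    return batch * desired_rms / (rms + eps)               # rescale to the target RMS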
def read_train_data():
    train_data = []
    train_label = []
    for num, dirlist in enumerate(os.listdir(train_root)):
        label = int(dirlist[5:7]) - 1
        file_path = os.path.join(train_root, dirlist)
        im = cv2.imread(file_path)
        width, heigh = im.shape[0], im.shape[1]
        if width >= heigh:
            length = heigh
        else:
            length = width
        # print(length)
        im = utils.whiten(im)
        im = utils.random_crop(im, image_size=length)
        im = cv2.resize(im, (128, 128))
        train_data.append(im)
        train_label.append(label)
    np.savetxt('train_label.txt', train_label)
    print("loading train_data, train_labels")
    return np.array(train_data), np.array(train_label)
def input_test_data():
    test_data = []
    name = []
    for dirlist in os.listdir(test_root):
        im_name = str(dirlist.split('.')[0])
        name.append(im_name)
        file_path = os.path.join(test_root, dirlist)
        im = cv2.imread(file_path)
        width, heigh = im.shape[0], im.shape[1]
        if width >= heigh:
            length = heigh
        else:
            length = width
        print(length)
        im = utils.whiten(im)
        im = utils.random_crop(im, image_size=length)
        im = cv2.resize(im, (128, 128))
        test_data.append(im)
    print(name)
    print(len(test_data))
    print(len(name))
    np.savetxt('test_image_name.csv', name, fmt='%s')
    return np.array(test_data)
def preprocessing(X, phase_shift=0, time_shift=0):
    fband = [35.0, 350.0]
    T = 800.0
    XWhiten = whiten(X, dt=T)
    return bandpass(XWhiten, fband, T)
import random

import utils
import network
import tensorflow as tf

timeseries, validate_timeseries, validate_labels = utils.load()

lag = 1
assert lag > 0

x, y = utils.lag_data(timeseries, lag=lag)
x, y = utils.whiten(x), utils.whiten(y)

val_x, val_y = utils.lag_data(validate_timeseries, lag=0)
val_x, val_y = utils.whiten(val_x), utils.whiten(val_y)

# length_minib = val_x.shape[0]
# minibatches = []
# for i in range(x.shape[0] - length_minib + 1):
#     minibatches.append((x[i:i + length_minib], y[i:i + length_minib]))

timeseries_x = tf.placeholder(tf.float32, shape=[None, timeseries.shape[-1]])
timeseries_y = tf.placeholder(tf.float32, shape=[None, timeseries.shape[-1]])

loss, encoded, decoded = network.time_lagged_autoencoder(
    timeseries_x, timeseries_y)
train = tf.train.AdamOptimizer().minimize(loss)

epochs = 1500
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(epochs):
outline = load_image('cat1_outline.png')
# outline consists of three values: 0, 1 and 255. We will ignore the value 255.

plt.subplot(1, 2, 1)
plt.imshow(im)
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(outline[..., np.newaxis] * im)
plt.axis('off')
# plt.show()

# Create a grid CRF
N_LABEL = 2
MAX_ITER = 200
feat = whiten(im)
H, W = outline.shape

# Indicate whether the node has the outline label
label = outline[1:H - 1, 1:W - 1]


def rbf_kernel(dist, sigma=3):
    return np.exp(-np.sum(dist**2, 2) / sigma)


# Create pairwise conditional probability with nearest 4 neighbors
curr_feat = feat[1:H - 1, 1:W - 1]
top, bottom, left, right = np.zeros((H, W)), np.zeros((H, W)), np.zeros(
    (H, W)), np.zeros((H, W))
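# One plausible way the four neighbour arrays above could be filled in: RBF similarity
# between each interior pixel's whitened features and its neighbour's. The slicing below
# is only an assumption based on curr_feat covering feat[1:H-1, 1:W-1]; the original
# snippet is cut off before this step.
top[1:H - 1, 1:W - 1] = rbf_kernel(curr_feat - feat[0:H - 2, 1:W - 1])
bottom[1:H - 1, 1:W - 1] = rbf_kernel(curr_feat - feat[2:H, 1:W - 1])
left[1:H - 1, 1:W - 1] = rbf_kernel(curr_feat - feat[1:H - 1, 0:W - 2])
right[1:H - 1, 1:W - 1] = rbf_kernel(curr_feat - feat[1:H - 1, 2:W])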
def predict():
    intervals = Data(False)

    existing_results = []
    for root, folders, files in os.walk(PATH + '/results/'):
        for f in files:
            if f.endswith('.csv'):
                speaker_name = f.replace('results_for_', '').replace('.csv', '')
                existing_results.append(speaker_name)

    num_intervals = 0
    for key in intervals.speaker_to_intervals:
        if key not in existing_results:
            for interval in intervals.speaker_to_intervals[key]:
                num_intervals += 1

    pbar = tqdm(total=num_intervals)
    for key in intervals.all_speakers:
        if key not in existing_results:
            model_path = 'data/weights/max_pooling__n_layers=7__n_filters=64__downsampling=1__n_seconds=3.torch'
            step_seconds = 0.04
            batchsize_for_prediction = 1

            if key in intervals.complex_transcripts:
                audio_path = PATH + '/data/voc/audio/' + key + '.wav'
            if key in intervals.speakers:
                audio_path = PATH + '/data/voc/simple_audio/' + key + '.wav'
            speaker = key

            ##############
            # Load audio #
            ##############
            audio, audio_sampling_rate = sf.read(audio_path)
            audio_duration_seconds = audio.shape[0] * 1. / audio_sampling_rate
            audio_duration_minutes = audio_duration_seconds / 60.

            ##############
            # Load model #
            ##############
            model_type = model_path.split('/')[-1].split('__')[0]
            model_name = model_path.split('/')[-1].split('.')[0]
            model_params = {
                i.split('=')[0]: float(i.split('=')[1])
                for i in model_name.split('__')[1:]
            }

            # Here we assume that the model was trained on the LibriSpeech dataset
            model_sampling_rate = LIBRISPEECH_SAMPLING_RATE / model_params['downsampling']
            model_num_samples = int(model_params['n_seconds'] * model_sampling_rate)

            if model_type == 'max_pooling':
                model = ConvNet(int(model_params['n_filters']),
                                int(model_params['n_layers']))
            elif model_type == 'dilated':
                model = DilatedNet(int(model_params['n_filters']),
                                   int(model_params['n_depth']),
                                   int(model_params['n_stacks']))
            else:
                raise ValueError('Model type not recognised.')

            model.load_state_dict(torch.load(model_path))
            model.double()
            model.cuda()
            model.eval()

            ######################
            # Loop through audio #
            ######################
            step_samples = int(step_seconds * model_sampling_rate)
            step_samples_at_audio_rate = int(step_seconds * audio_sampling_rate)

            default_shape = None
            batch = []
            pred = []
            start_min = []
            for interval in intervals.speaker_to_intervals[key]:
                start = float(interval[0])
                end = float(interval[1])
                start_samples = int(audio_sampling_rate * start)
                end_samples = int(audio_sampling_rate * end)

                for lower in range(start_samples, end_samples,
                                   step_samples_at_audio_rate):
                    x = audio[lower:lower +
                              (int(model_params['n_seconds'] * audio_sampling_rate))]
                    if x.shape[0] != model_params['n_seconds'] * audio_sampling_rate:
                        break

                    x = torch.from_numpy(x).reshape(1, -1)
                    x = whiten(x)
                    # For me the bottleneck is this scipy resample call; increasing batch size doesn't make it any faster
                    x = torch.from_numpy(resample(x, model_num_samples, axis=1)).reshape(
                        (1, 1, model_num_samples))
                    y_hat = model(x).item()
                    pred.append(y_hat)
                    start_min.append(lower / 44100.)
                    pbar.update(1)

            df = pd.DataFrame(data={
                'speaker': speaker,
                'start_second': start_min,
                'p': pred
            })
            df = df.assign(
                # Time in seconds of the end of the prediction fragment
                t_end=df['start_second'] + model_params['n_seconds'],
                # Time in seconds of the center of the prediction fragment
                t_center=df['start_second'] * 60 + model_params['n_seconds'] / 2.)
            df.to_csv(PATH + '/results/results_for_' + speaker + '.csv', index=False)

    pbar.close()
    for i in range(1, H - 1):
        for j in range(1, W - 1):
            pred[i, j] = w.dot(X(wim, i, j).reshape(-1))
    return pred


im = load_image('cat1.jpg')
im2 = load_image('cat2.jpg')
y = load_image('cat1_label.png').astype('int')
y = y * 2 - 1  # {-1, 1}^{H \times W}
H, W = im.shape[:2]
MAX_ITER = 2000

# Whiten the image
whitened_im = whiten(im)
whitened_im2 = whiten(im2)

# Define weight
w = np.random.rand(27) / 27

for curr_iter in range(MAX_ITER + 1):
    # Get random image patch
    i = np.random.randint(1, H - 1)
    j = np.random.randint(1, W - 1)

    # Get loss and gradient
    loss, grad = loss_and_grad(w, y[i, j], X(whitened_im, i, j))
    w -= 0.001 * grad

    if curr_iter % 50 == 0:
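# loss_and_grad() and X() are defined elsewhere in that project. Given the {-1, 1}
# labels and the 27-dimensional weight vector above (3 x 3 patch x 3 channels), a
# plausible form is logistic loss on a flattened patch of the whitened image. The
# sketch below is an assumption used only to illustrate the shapes involved.
import numpy as np

def X_sketch(whitened, i, j):
    # 3x3 neighbourhood around (i, j), including channels -> 27 values when flattened
    return whitened[i - 1:i + 2, j - 1:j + 2]

def loss_and_grad_sketch(w, y, x_patch):
    x = x_patch.reshape(-1)
    margin = y * w.dot(x)
    loss = np.log1p(np.exp(-margin))            # logistic loss
    grad = -y * x / (1.0 + np.exp(margin))      # gradient of the loss w.r.t. w
    return loss, grad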
def predict_gender(audios, intervals, complex):
    step_seconds = 0.04
    model_path = 'model/weights/max_pooling__n_layers=7__n_filters=64__downsampling=1__n_seconds=3.torch'

    model_type = model_path.split('/')[-1].split('__')[0]
    model_name = model_path.split('/')[-1].split('.')[0]
    model_params = {
        i.split('=')[0]: float(i.split('=')[1])
        for i in model_name.split('__')[1:]
    }

    # Here we assume that the model was trained on the LibriSpeech dataset
    model_sampling_rate = LIBRISPEECH_SAMPLING_RATE / model_params['downsampling']
    model_num_samples = int(model_params['n_seconds'] * model_sampling_rate)

    if model_type == 'max_pooling':
        model = ConvNet(int(model_params['n_filters']),
                        int(model_params['n_layers']))
    elif model_type == 'dilated':
        model = DilatedNet(int(model_params['n_filters']),
                           int(model_params['n_depth']),
                           int(model_params['n_stacks']))
    else:
        raise ValueError('Model type not recognised.')

    model.load_state_dict(torch.load(model_path))
    model.double()
    model.cuda()
    model.eval()

    for i in trange(len(audios), desc="speakers"):
        speaker = audios[i].replace('.wav', '')

        ##############
        # Load audio #
        ##############
        audio_path = PATH + '/raw/voc/simple_audio/' + audios[i]
        audio, audio_sampling_rate = sf.read(audio_path)
        audio_duration_seconds = audio.shape[0] * 1. / audio_sampling_rate
        audio_duration_minutes = audio_duration_seconds / 60.

        step_samples = int(step_seconds * model_sampling_rate)
        step_samples_at_audio_rate = int(step_seconds * audio_sampling_rate)

        default_shape = None
        batch = []
        start_min = []
        pred = []
        mean_pitch = []
        max_pitch = []
        min_pitch = []
        num_zeros = []
        std_pitch = []
        pitch_measurements = []

        for j in trange(len(intervals[speaker]), desc="intervals", leave=False):
            start = float(intervals[speaker][j][0])
            end = float(intervals[speaker][j][1])
            start_samples = int(audio_sampling_rate * start)
            end_samples = int(audio_sampling_rate * end)

            step_samples = int(step_seconds * model_sampling_rate)
            step_samples_at_audio_rate = int(step_seconds * audio_sampling_rate)
            default_shape = None

            for lower in tqdm(range(start_samples, end_samples,
                                    step_samples_at_audio_rate),
                              desc="predictions", leave=False):
                x = audio[lower:lower + (3 * audio_sampling_rate)]
                if x.shape[0] != 3 * audio_sampling_rate:
                    break

                sf.write(PATH + '/raw/clips/{}.wav'.format(speaker), x,
                         audio_sampling_rate)
                sound = parselmouth.Sound(PATH + '/raw/clips/{}.wav'.format(speaker))
                pitch = sound.to_pitch()
                pitch_values = pitch.selected_array['frequency']
                if pitch_values[pitch_values != 0].size != 0:
                    mean_pitch.append(np.mean(pitch_values[pitch_values != 0]))
                    std_pitch.append(np.std(pitch_values[pitch_values != 0]))
                    min_pitch.append(np.amin(pitch_values[pitch_values != 0]))
                    max_pitch.append(np.amax(pitch_values[pitch_values != 0]))
                    num_zeros.append(pitch_values[pitch_values == 0].size)
                    pitch_measurements.append(pitch_values[pitch_values != 0].size)
                    start_min.append(lower / 44100.)
                else:
                    mean_pitch.append(0)
                    std_pitch.append(0)
                    min_pitch.append(0)
                    max_pitch.append(0)
                    num_zeros.append(pitch_values[pitch_values == 0].size)
                    pitch_measurements.append(0)
                    start_min.append(lower / 44100.)
                os.remove(PATH + '/raw/clips/{}.wav'.format(speaker))

                x = torch.from_numpy(x).reshape(1, -1)
                x = whiten(x)
                # For me the bottleneck is this scipy resample call; increasing batch size doesn't make it any faster
                x = torch.from_numpy(resample(x, model_num_samples, axis=1)).reshape(
                    (1, 1, model_num_samples))
                y_hat = model(x).item()
                pred.append(y_hat)
                start_min.append(lower / 44100.)
        df = pd.DataFrame(
            data={
                'speaker': speaker,
                'start_second': start_min,
                'p': pred,
                'mean_pitch': mean_pitch,
                'max_pitch': max_pitch,
                'min_pitch': min_pitch,
                'num_zeros': num_zeros,
                'std_pitch': std_pitch,
                'pitch_measurements': pitch_measurements
            })
        df = df.assign(
            # Time in seconds of the end of the prediction fragment
            t_end=df['start_second'] + model_params['n_seconds'] / 60,
            # Time in seconds of the center of the prediction fragment
            t_center=df['start_second'] * 60 + model_params['n_seconds'] / 2.)
        df.to_csv(PATH + 'analyses/results/results_for_' + speaker + '.csv',
                  index=False)
def preprocessor(batch):
    batch = whiten(batch)
    batch = torch.from_numpy(
        resample(batch,
                 int(LIBRISPEECH_SAMPLING_RATE * n_seconds / downsampling),
                 axis=1)
    ).reshape((batchsize, 1,
               int(LIBRISPEECH_SAMPLING_RATE * n_seconds / downsampling)))
    return batch
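# Usage sketch for the preprocessor above. It closes over LIBRISPEECH_SAMPLING_RATE,
# n_seconds, downsampling and batchsize defined elsewhere in the script, and it relies
# on the project's whiten(); the concrete values below are illustrative assumptions
# chosen only to exercise the whiten -> resample -> reshape pipeline.
import numpy as np
import torch

LIBRISPEECH_SAMPLING_RATE = 16000   # LibriSpeech audio is distributed at 16 kHz
n_seconds, downsampling, batchsize = 3, 1, 4

dummy_batch = torch.from_numpy(
    np.random.randn(batchsize, LIBRISPEECH_SAMPLING_RATE * n_seconds))
prepared = preprocessor(dummy_batch)
print(prepared.shape)   # (batchsize, 1, LIBRISPEECH_SAMPLING_RATE * n_seconds // downsampling)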
val_acc_values = []
acc_values = []

t0 = time.time()
print('\n[Epoch, Batches, Seconds]')
for epoch in range(n_epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    running_correct_samples = 0
    for i, data in enumerate(trainloader, 0):
        # Get batch
        inputs, labels = data

        # Normalise the volume to a fixed root mean square value as some speakers are much quieter than others
        inputs = whiten(inputs)

        # Resample audio
        inputs = torch.from_numpy(
            resample(inputs,
                     int(LIBRISPEECH_SAMPLING_RATE * n_seconds / downsampling),
                     axis=1)
        ).reshape((batchsize, 1,
                   int(LIBRISPEECH_SAMPLING_RATE * n_seconds / downsampling)))

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels.reshape((batchsize, 1)).cuda().double())
        loss.backward()
        optimizer.step()
pred = []
for lower in tqdm(
        range(0,
              audio.shape[0] - (int(model_params['n_seconds'] * audio_sampling_rate)),
              step_samples_at_audio_rate)):
    x = audio[lower:lower +
              (int(model_params['n_seconds'] * audio_sampling_rate))]

    # Don't predict on the last bit of audio where the duration isn't large enough
    if x.shape[0] != model_params['n_seconds'] * audio_sampling_rate:
        break

    x = torch.from_numpy(x).reshape(1, -1)
    x = whiten(x)
    # For me the bottleneck is this scipy resample call; increasing batch size doesn't make it any faster
    x = torch.from_numpy(resample(x, model_num_samples, axis=1)).reshape(
        (1, 1, model_num_samples))
    y_hat = model(x).item()
    pred.append(y_hat)

###########################
# Create output dataframe #
###########################
segment_start_times_minutes = np.array(range(len(pred))) * step_seconds / 60
df = pd.DataFrame(data={'minute': segment_start_times_minutes, 'p': pred})
df = df.assign(
from sklearn.cluster import MeanShift

from utils import load_bilateral_image, whiten
import matplotlib.pyplot as plt

# Get vectorized image
feat, im = load_bilateral_image()
H, W = im.shape[:2]
feat = whiten(feat)

ms = MeanShift(bandwidth=1, bin_seeding=True)
ms.fit(feat.reshape(-1, feat.shape[2]))
labels = ms.labels_

plt.subplot(1, 2, 1)
plt.imshow(im)
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(labels.reshape(H, W))
plt.axis('off')
plt.show()
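# The fixed bandwidth=1 above is reasonable because the features have been whitened
# first. A possible variation (not what the snippet does) is to let scikit-learn
# estimate the bandwidth from the data before clustering:
from sklearn.cluster import MeanShift, estimate_bandwidth

flat = feat.reshape(-1, feat.shape[2])
bw = estimate_bandwidth(flat, quantile=0.2, n_samples=500)   # data-driven bandwidth
ms_alt = MeanShift(bandwidth=bw, bin_seeding=True).fit(flat)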