Code Example #1
File: separate_wsj.py  Project: pseeth/experiments
def separate(mixture, model, params, device):
    labels = ['s%d' % i for i in range(1, params['num_attractors'] + 1)]
    estimates = {}

    mix = mixture
    if (len(mix.shape) > 1):
        mix = mixture[:, 0]
    _, mix = utils.mask_mixture(1, mix, params['n_fft'], params['hop_length'])
    log_spec = utils.transform(mix, params['n_fft'], params['hop_length'])
    silence_mask = log_spec > -25
    log_spec = utils.whiten(log_spec)

    with torch.no_grad():
        input_data = torch.from_numpy(log_spec).unsqueeze(0).to(device)
        if 'DeepAttractor' in str(model):
            masks, _, embedding, _ = model(input_data, one_hots=None)

            clusterer = KMeans(n_clusters=params['num_attractors'])
            embedding_ = embedding.squeeze(0).cpu().data.numpy()
            clusterer.fit(embedding_[silence_mask.flatten()])
            assignments = clusterer.predict(embedding_)
            assignments = assignments.reshape((masks.shape[1], masks.shape[2]))

    for i, label in enumerate(labels):
        mask = (assignments == i).T.astype(float)
        source, mix = utils.mask_mixture(mask, mix, params['n_fft'], params['hop_length'])
        estimates[label] = source

    return estimates
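
Note: in this example, utils.whiten is applied to the log spectrogram before the embeddings are clustered with K-means. A minimal sketch of such a step, under the assumption that it is plain zero-mean/unit-variance standardization (the project's actual utils.whiten may differ):

import numpy as np

def whiten_log_spec(log_spec, eps=1e-8):
    # Standardize the log spectrogram so that clustering distances are not
    # dominated by the overall signal level.
    return (log_spec - log_spec.mean()) / (log_spec.std() + eps)
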
Code Example #2
File: main.py  Project: alexdoberman/DSP_test
def main(train_db, test_db):

    # Load train dataset
    _, lst_vec_train = utils.data_load(train_db)

    # Get PCA transform matrix from train data
    X_train = np.array(lst_vec_train)
    _, W = utils.whiten(X_train)

    # Load test dataset
    lst_id, lst_vec_test = utils.data_load(test_db)

    # Whiten the test dataset with the transform learned on the train data
    X_test = np.array(lst_vec_test)
    X_test_white = np.dot(X_test, W)

    # Calc scores for test dataset
    lst_white_vec = [X_test_white[i, :] for i in range(X_test_white.shape[0])]
    lst_compare_key_result, lst_compare_ivec_result = utils.calc_scores(
        lst_id, lst_white_vec)

    # Plot FR/FA curve
    utils.plot_fr_fa(lst_compare_key_result, lst_compare_ivec_result)

    # Plot scores hist
    utils.plot_hist_scores(lst_compare_key_result, lst_compare_ivec_result)
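
Note: the call `_, W = utils.whiten(X_train)` above returns a transform matrix W that is later applied to the test vectors with np.dot(X_test, W). A minimal sketch of a PCA whitening routine with that return convention, assuming the "Get PCA transform matrix" comment describes what utils.whiten does (the real implementation may differ, e.g. in how it centers the data):

import numpy as np

def pca_whiten(X, eps=1e-10):
    # Hypothetical PCA whitening: rotate onto the principal axes and rescale
    # each axis to unit variance; returns the whitened data and the matrix W.
    X_centered = X - X.mean(axis=0)
    cov = np.cov(X_centered, rowvar=False)
    eigvals, eigvecs = np.linalg.eigh(cov)
    W = eigvecs / np.sqrt(eigvals + eps)
    return np.dot(X_centered, W), W
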
Code Example #3
    def test_whitening(self):
        desired_rms = 0.038021

        test_data, sample_rate = sf.read(PATH +
                                         '/data/whitening_test_audio.flac')
        test_data = np.stack([test_data] * 2)

        whitened = whiten(torch.from_numpy(test_data), desired_rms)

        # Mean correct
        self.assertTrue(np.isclose(whitened.mean().item(), 0))

        # RMS correct
        self.assertTrue(
            np.isclose(
                np.sqrt(np.power(whitened[0, :], 2).mean()).item(),
                desired_rms))
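
Note: the test pins down the expected behaviour of whiten(audio, desired_rms): the output should have (approximately) zero mean and a root mean square equal to desired_rms. A minimal sketch that would satisfy these assertions (the function actually under test may differ in details such as the reduction axis):

import torch

def whiten(batch, desired_rms=0.038021):
    # Remove the DC offset, then rescale so the RMS equals desired_rms.
    batch = batch - batch.mean(dim=-1, keepdim=True)
    rms = torch.sqrt(torch.mean(batch ** 2, dim=-1, keepdim=True))
    return batch * (desired_rms / rms)
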
Code Example #4
def read_train_data():
    train_data = []
    train_label = []
    for num, dirlist in enumerate(os.listdir(train_root)):
        label = int(dirlist[5:7]) - 1
        file_path = os.path.join(train_root, dirlist)
        im = cv2.imread(file_path)
        height, width = im.shape[0], im.shape[1]
        # Crop along the shorter side so the square crop fits inside the image
        length = min(height, width)
        # print(length)
        im = utils.whiten(im)
        im = utils.random_crop(im, image_size=length)
        im = cv2.resize(im, (128, 128))
        train_data.append(im)
        train_label.append(label)
    np.savetxt('train_label.txt', train_label)
    print("loading train_data, train_labels")
    return np.array(train_data), np.array(train_label)
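
Note: utils.random_crop(im, image_size=length) is called with length equal to the shorter image side, i.e. a square crop no larger than the image. A hypothetical implementation matching that call (the project's own helper may behave differently):

import numpy as np

def random_crop(im, image_size):
    # Pick a random image_size x image_size window that fits inside the image.
    h, w = im.shape[:2]
    top = np.random.randint(0, h - image_size + 1)
    left = np.random.randint(0, w - image_size + 1)
    return im[top:top + image_size, left:left + image_size]
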
Code Example #5
def input_test_data():
    test_data = []
    name = []
    for dirlist in os.listdir(test_root):
        im_name = str(dirlist.split('.')[0])
        name.append(im_name)
        file_path = os.path.join(test_root, dirlist)
        im = cv2.imread(file_path)
        height, width = im.shape[0], im.shape[1]
        # Crop along the shorter side so the square crop fits inside the image
        length = min(height, width)
        print(length)
        im = utils.whiten(im)
        im = utils.random_crop(im, image_size=length)
        im = cv2.resize(im, (128, 128))
        test_data.append(im)
    print(name)
    print(len(test_data))
    print(len(name))
    np.savetxt('test_image_name.csv', name, fmt='%s')
    return np.array(test_data)
Code Example #6
def preprocessing(X, phase_shift=0, time_shift=0):
    fband = [35.0, 350.0]
    T = 800.0
    XWhiten = whiten(X, dt=T)
    return bandpass(XWhiten, fband, T)
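
Note: the 35-350 Hz band and the whiten-then-bandpass ordering are typical of gravitational-wave strain preprocessing. A rough sketch of what whiten and bandpass could look like here, assuming frequency-domain whitening and a Butterworth band-pass; the meaning of the T / dt argument in this project is not shown, so it is kept only to mirror the call signature:

import numpy as np
from scipy.signal import butter, filtfilt

def whiten(x, dt):
    # Hypothetical spectral whitening: divide the spectrum by its magnitude so
    # every frequency bin ends up with comparable power. A real implementation
    # would normally divide by an estimated amplitude spectral density (using dt).
    spec = np.fft.rfft(x)
    mag = np.abs(spec)
    mag[mag == 0] = 1.0
    return np.fft.irfft(spec / mag, n=len(x))

def bandpass(x, fband, fs):
    # Hypothetical 4th-order Butterworth band-pass between fband[0] and fband[1] Hz.
    b, a = butter(4, [fband[0] / (fs / 2.0), fband[1] / (fs / 2.0)], btype='band')
    return filtfilt(b, a, x)
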
Code Example #7
File: tae.py  Project: bkmi/deep-learn-2018
import random

import utils
import network

import tensorflow as tf

timeseries, validate_timeseries, validate_labels = utils.load()

lag = 1
assert lag > 0
x, y = utils.lag_data(timeseries, lag=lag)
x, y = utils.whiten(x), utils.whiten(y)
val_x, val_y = utils.lag_data(validate_timeseries, lag=0)
val_x, val_y = utils.whiten(val_x), utils.whiten(val_y)

# length_minib = val_x.shape[0]
# minibatches = []
# for i in range(x.shape[0] - length_minib + 1):
#     minibatches.append((x[i:i + length_minib], y[i:i + length_minib]))

timeseries_x = tf.placeholder(tf.float32, shape=[None, timeseries.shape[-1]])
timeseries_y = tf.placeholder(tf.float32, shape=[None, timeseries.shape[-1]])
loss, encoded, decoded = network.time_lagged_autoencoder(
    timeseries_x, timeseries_y)
train = tf.train.AdamOptimizer().minimize(loss)

epochs = 1500
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(epochs):
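
The excerpt stops just inside the epoch loop. Purely as a hypothetical sketch of how such a loop body could continue with the ops defined above (one full-batch step per epoch, with an occasional validation-loss check), not the project's actual code:

    for i in range(epochs):
        _, train_loss = sess.run([train, loss],
                                 feed_dict={timeseries_x: x, timeseries_y: y})
        if i % 100 == 0:
            val_loss = sess.run(loss, feed_dict={timeseries_x: val_x,
                                                 timeseries_y: val_y})
            print(i, train_loss, val_loss)
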
Code Example #8
File: crf.py  Project: zeta1999/segmentation_lecture
outline = load_image('cat1_outline.png')

# outline consists of 3 values: 0, 1, 255. We will ignore the value 255.
plt.subplot(1, 2, 1)
plt.imshow(im)
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(outline[..., np.newaxis] * im)
plt.axis('off')
# plt.show()

# Create a grid CRF
N_LABEL = 2
MAX_ITER = 200

feat = whiten(im)
H, W = outline.shape

# indicate whether the node has outline label
label = outline[1:H - 1, 1:W - 1]


def rbf_kernel(dist, sigma=3):
    return np.exp(-np.sum(dist**2, 2) / sigma)


# Create pairwise conditional probability with nearest 4 neighbors
curr_feat = feat[1:H - 1, 1:W - 1]
top, bottom, left, right = np.zeros((H, W)), np.zeros((H, W)), np.zeros(
    (H, W)), np.zeros((H, W))
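
The excerpt ends after allocating the four neighbour maps. A plausible continuation, assuming the usual 4-connected construction with the rbf_kernel defined above (a sketch, not necessarily the lecture code):

# Similarity of each interior pixel to its top / bottom / left / right neighbour
top[1:H - 1, 1:W - 1] = rbf_kernel(curr_feat - feat[0:H - 2, 1:W - 1])
bottom[1:H - 1, 1:W - 1] = rbf_kernel(curr_feat - feat[2:H, 1:W - 1])
left[1:H - 1, 1:W - 1] = rbf_kernel(curr_feat - feat[1:H - 1, 0:W - 2])
right[1:H - 1, 1:W - 1] = rbf_kernel(curr_feat - feat[1:H - 1, 2:W])
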
Code Example #9
File: process.py  Project: liliest/QP2
def predict():
    intervals = Data(False)
    existing_results = []
    for root, folders, files in os.walk(PATH + '/results/'):
        for f in files:
            if f.endswith('.csv'):
                speaker_name = f.replace('results_for_',
                                         '').replace('.csv', '')
                existing_results.append(speaker_name)
    num_intervals = 0
    for key in intervals.speaker_to_intervals:
        if key not in existing_results:
            for interval in intervals.speaker_to_intervals[key]:
                num_intervals += 1
    pbar = tqdm(total=num_intervals)
    for key in intervals.all_speakers:
        if key not in existing_results:
            model_path = 'data/weights/max_pooling__n_layers=7__n_filters=64__downsampling=1__n_seconds=3.torch'
            step_seconds = 0.04
            batchsize_for_prediction = 1
            if key in intervals.complex_transcripts:
                audio_path = PATH + '/data/voc/audio/' + key + '.wav'
            if key in intervals.speakers:
                audio_path = PATH + '/data/voc/simple_audio/' + key + '.wav'
            speaker = key

            ##############
            # Load audio #
            ##############

            audio, audio_sampling_rate = sf.read(audio_path)
            audio_duration_seconds = audio.shape[0] * 1. / audio_sampling_rate
            audio_duration_minutes = audio_duration_seconds / 60.

            ##############
            # Load model #
            ##############

            model_type = model_path.split('/')[-1].split('__')[0]
            model_name = model_path.split('/')[-1].split('.')[0]
            model_params = {
                i.split('=')[0]: float(i.split('=')[1])
                for i in model_name.split('__')[1:]
            }

            # Here we assume that the model was trained on the LibriSpeech dataset
            model_sampling_rate = LIBRISPEECH_SAMPLING_RATE / model_params[
                'downsampling']
            model_num_samples = int(model_params['n_seconds'] *
                                    model_sampling_rate)

            if model_type == 'max_pooling':
                model = ConvNet(int(model_params['n_filters']),
                                int(model_params['n_layers']))
            elif model_type == 'dilated':
                model = DilatedNet(int(model_params['n_filters']),
                                   int(model_params['n_depth']),
                                   int(model_params['n_stacks']))
            else:
                raise ValueError('Model type not recognised.')

            model.load_state_dict(torch.load(model_path))
            model.double()
            model.cuda()
            model.eval()

            ######################
            # Loop through audio #
            ######################

            step_samples = int(step_seconds * model_sampling_rate)
            step_samples_at_audio_rate = int(step_seconds *
                                             audio_sampling_rate)
            default_shape = None
            batch = []
            pred = []
            start_min = []

            for interval in intervals.speaker_to_intervals[key]:
                start = float(interval[0])
                end = float(interval[1])
                start_samples = int(audio_sampling_rate * start)
                end_samples = int(audio_sampling_rate * end)

                for lower in range(start_samples, end_samples,
                                   step_samples_at_audio_rate):
                    x = audio[lower:lower + (int(model_params['n_seconds'] *
                                                 audio_sampling_rate))]

                    if x.shape[0] != model_params[
                            'n_seconds'] * audio_sampling_rate:
                        break

                    x = torch.from_numpy(x).reshape(1, -1)

                    x = whiten(x)

                    # For me the bottleneck is this scipy resample call; increasing the batch size doesn't make it any faster
                    x = torch.from_numpy(resample(x, model_num_samples,
                                                  axis=1)).reshape(
                                                      (1, 1,
                                                       model_num_samples))

                    y_hat = model(x).item()

                    pred.append(y_hat)
                    start_min.append(lower / 44100.)
                pbar.update(1)

            df = pd.DataFrame(data={
                'speaker': speaker,
                'start_second': start_min,
                'p': pred
            })
            df = df.assign(
                # Time in seconds of the end of the prediction fragment
                t_end=df['start_second'] + model_params['n_seconds'],
                # Time in seconds of the center of the prediction fragment
                t_center=df['start_second'] * 60 +
                model_params['n_seconds'] / 2.)
            df.to_csv(PATH + '/results/results_for_' + speaker + '.csv',
                      index=False)
    pbar.close()
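
Note: the model hyper-parameters are recovered from the weight filename itself. A worked example of that parsing for the path used above:

model_path = 'data/weights/max_pooling__n_layers=7__n_filters=64__downsampling=1__n_seconds=3.torch'
model_name = model_path.split('/')[-1].split('.')[0]
print(model_name.split('__')[0])
# max_pooling
print({i.split('=')[0]: float(i.split('=')[1]) for i in model_name.split('__')[1:]})
# {'n_layers': 7.0, 'n_filters': 64.0, 'downsampling': 1.0, 'n_seconds': 3.0}
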
Code Example #10
File: svm.py  Project: zeta1999/segmentation_lecture
    for i in range(1, H - 1):
        for j in range(1, W - 1):
            pred[i, j] = w.dot(X(wim, i, j).reshape(-1))

    return pred


im = load_image('cat1.jpg')
im2 = load_image('cat2.jpg')
y = load_image('cat1_label.png').astype('int')
y = y * 2 - 1  # {-1, 1}^{H \times W}
H, W = im.shape[:2]
MAX_ITER = 2000

# Whiten the image
whitened_im = whiten(im)
whitened_im2 = whiten(im2)

# Define weight
w = np.random.rand(27) / 27

for curr_iter in range(MAX_ITER + 1):
    # Get random image patch
    i = np.random.randint(1, H - 1)
    j = np.random.randint(1, W - 1)

    # Get loss and gradient
    loss, grad = loss_and_grad(w, y[i, j], X(whitened_im, i, j))
    w -= 0.001 * grad

    if curr_iter % 50 == 0:
Code Example #11
def predict_gender(audios, intervals, complex):

    step_seconds = 0.04

    model_path = 'model/weights/max_pooling__n_layers=7__n_filters=64__downsampling=1__n_seconds=3.torch'

    model_type = model_path.split('/')[-1].split('__')[0]
    model_name = model_path.split('/')[-1].split('.')[0]
    model_params = {
        i.split('=')[0]: float(i.split('=')[1])
        for i in model_name.split('__')[1:]
    }

    # Here we assume that the model was trained on the LibriSpeech dataset
    model_sampling_rate = LIBRISPEECH_SAMPLING_RATE / model_params[
        'downsampling']
    model_num_samples = int(model_params['n_seconds'] * model_sampling_rate)

    if model_type == 'max_pooling':
        model = ConvNet(int(model_params['n_filters']),
                        int(model_params['n_layers']))
    elif model_type == 'dilated':
        model = DilatedNet(int(model_params['n_filters']),
                           int(model_params['n_depth']),
                           int(model_params['n_stacks']))
    else:
        raise ValueError('Model type not recognised.')

    model.load_state_dict(torch.load(model_path))
    model.double()
    model.cuda()
    model.eval()
    for i in trange(len(audios), desc="speakers"):
        speaker = audios[i].replace('.wav', '')

        ##############
        # Load audio #
        ##############
        audio_path = PATH + '/raw/voc/simple_audio/' + audios[i]
        audio, audio_sampling_rate = sf.read(audio_path)
        audio_duration_seconds = audio.shape[0] * 1. / audio_sampling_rate
        audio_duration_minutes = audio_duration_seconds / 60.

        step_samples = int(step_seconds * model_sampling_rate)
        step_samples_at_audio_rate = int(step_seconds * audio_sampling_rate)
        default_shape = None
        batch = []
        start_min = []
        pred = []
        mean_pitch = []
        max_pitch = []
        min_pitch = []
        num_zeros = []
        std_pitch = []
        pitch_measurements = []

        for j in trange(len(intervals[speaker]), desc="intervals",
                        leave=False):
            start = float(intervals[speaker][j][0])
            end = float(intervals[speaker][j][1])
            start_samples = int(audio_sampling_rate * start)
            end_samples = int(audio_sampling_rate * end)
            step_samples = int(step_seconds * model_sampling_rate)
            step_samples_at_audio_rate = int(step_seconds *
                                             audio_sampling_rate)
            default_shape = None

            for lower in tqdm(range(start_samples, end_samples,
                                    step_samples_at_audio_rate),
                              desc="predictions",
                              leave=False):

                x = audio[lower:lower + (3 * audio_sampling_rate)]
                if x.shape[0] != 3 * audio_sampling_rate:
                    break

                sf.write(PATH + '/raw/clips/{}.wav'.format(speaker), x,
                         audio_sampling_rate)
                sound = parselmouth.Sound(PATH +
                                          '/raw/clips/{}.wav'.format(speaker))
                pitch = sound.to_pitch()
                pitch_values = pitch.selected_array['frequency']

                if pitch_values[pitch_values != 0].size != 0:
                    mean_pitch.append(np.mean(pitch_values[pitch_values != 0]))
                    std_pitch.append(np.std(pitch_values[pitch_values != 0]))
                    min_pitch.append(np.amin(pitch_values[pitch_values != 0]))
                    max_pitch.append(np.amax(pitch_values[pitch_values != 0]))
                    num_zeros.append(pitch_values[pitch_values == 0].size)
                    pitch_measurements.append(
                        pitch_values[pitch_values != 0].size)
                    start_min.append(lower / 44100.)

                else:
                    mean_pitch.append(0)
                    std_pitch.append(0)
                    min_pitch.append(0)
                    max_pitch.append(0)
                    num_zeros.append(pitch_values[pitch_values == 0].size)
                    pitch_measurements.append(0)
                    start_min.append(lower / 44100.)

                os.remove(PATH + '/raw/clips/{}.wav'.format(speaker))

                x = torch.from_numpy(x).reshape(1, -1)

                x = whiten(x)

                # For me the bottleneck is this scipy resample call; increasing the batch size doesn't make it any faster
                x = torch.from_numpy(resample(x, model_num_samples,
                                              axis=1)).reshape(
                                                  (1, 1, model_num_samples))

                y_hat = model(x).item()

                pred.append(y_hat)
                # start_min was already recorded for this window in the pitch
                # block above; appending it again here would make the column
                # lists different lengths when building the DataFrame below.

        df = pd.DataFrame(
            data={
                'speaker': speaker,
                'start_second': start_min,
                'p': pred,
                'mean_pitch': mean_pitch,
                'max_pitch': max_pitch,
                'min_pitch': min_pitch,
                'num_zeros': num_zeros,
                'std_pitch': std_pitch,
                'pitch_measurements': pitch_measurements
            })

        df = df.assign(
            # Time in seconds of the end of the prediction fragment
            t_end=df['start_second'] + model_params['n_seconds'] / 60,
            # Time in seconds of the center of the prediction fragment
            t_center=df['start_second'] * 60 + model_params['n_seconds'] / 2.)
        df.to_csv(PATH + 'analyses/results/results_for_' + speaker + '.csv',
                  index=False)
Code Example #12
def preprocessor(batch):
    batch = whiten(batch)
    batch = torch.from_numpy(
        resample(batch, int(LIBRISPEECH_SAMPLING_RATE * n_seconds / downsampling), axis=1)
    ).reshape((batchsize, 1, int(LIBRISPEECH_SAMPLING_RATE * n_seconds / downsampling)))
    return batch
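
Note: preprocessor relies on names defined at module scope (whiten, scipy's resample, LIBRISPEECH_SAMPLING_RATE, n_seconds, downsampling, batchsize). A hypothetical call, assuming LibriSpeech's 16 kHz sampling rate and illustrative 3-second / no-downsampling / batch-of-4 settings:

import torch

LIBRISPEECH_SAMPLING_RATE = 16000             # LibriSpeech audio is sampled at 16 kHz
n_seconds, downsampling, batchsize = 3, 1, 4  # illustrative values

# Four random 3-second "recordings" at the original sampling rate
raw_batch = torch.randn(batchsize, LIBRISPEECH_SAMPLING_RATE * n_seconds).double()
prepared = preprocessor(raw_batch)
print(prepared.shape)  # torch.Size([4, 1, 48000])
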
Code Example #13
val_acc_values = []
acc_values = []

t0 = time.time()

print('\n[Epoch, Batches, Seconds]')
for epoch in range(n_epochs):  # loop over the dataset multiple times

    running_loss = 0.0
    running_correct_samples = 0
    for i, data in enumerate(trainloader, 0):
        # Get batch
        inputs, labels = data

        # Normalise the volume to a fixed root mean square value as some speakers are much quieter than others
        inputs = whiten(inputs)

        # Resample audio
        inputs = torch.from_numpy(
            resample(inputs, int(LIBRISPEECH_SAMPLING_RATE * n_seconds / downsampling), axis=1)
        ).reshape((batchsize, 1, int(LIBRISPEECH_SAMPLING_RATE * n_seconds / downsampling)))

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels.reshape((batchsize, 1)).cuda().double())
        loss.backward()
        optimizer.step()
Code Example #14
pred = []
for lower in tqdm(
        range(
            0, audio.shape[0] -
            (int(model_params['n_seconds'] * audio_sampling_rate)),
            step_samples_at_audio_rate)):
    x = audio[lower:lower +
              (int(model_params['n_seconds'] * audio_sampling_rate))]

    # Don't predict on the last bit of audio where the duration isn't large enough
    if x.shape[0] != model_params['n_seconds'] * audio_sampling_rate:
        break

    x = torch.from_numpy(x).reshape(1, -1)

    x = whiten(x)

    # For me the bottleneck is this scipy resample call; increasing the batch size doesn't make it any faster
    x = torch.from_numpy(resample(x, model_num_samples, axis=1)).reshape(
        (1, 1, model_num_samples))

    y_hat = model(x).item()

    pred.append(y_hat)

###########################
# Create output dataframe #
###########################
segment_start_times_minutes = np.array(range(len(pred))) * step_seconds / 60
df = pd.DataFrame(data={'minute': segment_start_times_minutes, 'p': pred})
df = df.assign(
Code Example #15
from sklearn.cluster import MeanShift

from utils import load_bilateral_image, whiten
import matplotlib.pyplot as plt

# Get vectorized image
feat, im = load_bilateral_image()
H, W = im.shape[:2]
feat = whiten(feat)

ms = MeanShift(bandwidth=1, bin_seeding=True)
ms.fit(feat.reshape(-1, feat.shape[2]))
labels = ms.labels_

plt.subplot(1, 2, 1)
plt.imshow(im)
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(labels.reshape(H, W))
plt.axis('off')
plt.show()
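
Note: whitening the features before MeanShift is what makes the fixed bandwidth=1 reasonable, since every feature dimension is brought to a comparable scale. A minimal sketch in the spirit of scipy.cluster.vq.whiten (divide each feature channel by its standard deviation); the utils.whiten used above may differ:

import numpy as np

def whiten_features(feat, eps=1e-8):
    # Scale every feature channel to (roughly) unit standard deviation so a
    # single MeanShift bandwidth treats all dimensions equally.
    flat = feat.reshape(-1, feat.shape[-1])
    return feat / (flat.std(axis=0) + eps)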