Beispiel #1
0
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= data.min()
        data /= data.max()
        data -= 0.5
        data *= 0.95
        scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'),
                               BITRATE, data)

    total_time = time()
    # Generate N_SEQS' sample files, each 5 seconds long
    N_SECS = 5
    LENGTH = N_SECS * BITRATE if not args.debug else 100

    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
    samples[:, :BIG_FRAME_SIZE] = Q_ZERO

    # First half zero, others fixed random at each checkpoint
    big_h0 = numpy.zeros(
        (N_SEQS - fixed_rand_big_h0.shape[0], N_BIG_RNN, H0_MULT * BIG_DIM),
        dtype='float32')
    big_h0 = numpy.concatenate((big_h0, fixed_rand_big_h0), axis=0)
    h0 = numpy.zeros((N_SEQS - fixed_rand_h0.shape[0], N_RNN, H0_MULT * DIM),
                     dtype='float32')
    h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0)
    big_frame_level_outputs = None
    frame_level_outputs = None

    for t in xrange(BIG_FRAME_SIZE, LENGTH):

        if t % BIG_FRAME_SIZE == 0:
            big_frame_level_outputs, big_h0 = big_frame_level_generate_fn(
                samples[:, t - BIG_FRAME_SIZE:t], big_h0,
                numpy.int32(t == BIG_FRAME_SIZE))

        if t % FRAME_SIZE == 0:
            frame_level_outputs, h0 = frame_level_generate_fn(
                samples[:, t - FRAME_SIZE:t],
                big_frame_level_outputs[:, (t / FRAME_SIZE) %
                                        (BIG_FRAME_SIZE / FRAME_SIZE)], h0,
                numpy.int32(t == BIG_FRAME_SIZE))

        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs[:, t % FRAME_SIZE], samples[:,
                                                            t - FRAME_SIZE:t])

    total_time = time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log,

    for i in xrange(N_SEQS):
        samp = samples[i]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= data.min()
        data /= data.max()
        data -= 0.5
        data *= 0.95
        scipy.io.wavfile.write(
                    os.path.join(SAMPLES_PATH, name+'.wav'),
                    BITRATE,
                    data)

    total_time = time()
    # Generate N_SEQS' sample files, each 5 seconds long
    N_SECS = 5
    LENGTH = N_SECS*BITRATE if not args.debug else 100

    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
    samples[:, :FRAME_SIZE] = Q_ZERO

    # First half zero, others fixed random at each checkpoint
    h0 = numpy.zeros(
            (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM),
            dtype='float32'
    )
    h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0)
    frame_level_outputs = None

    for t in xrange(FRAME_SIZE, LENGTH):

        if t % FRAME_SIZE == 0:
            frame_level_outputs, h0 = frame_level_generate_fn(
                samples[:, t-FRAME_SIZE:t],
                h0,
                #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'),
                numpy.int32(t == FRAME_SIZE)
            )

        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs[:, t % FRAME_SIZE],
            samples[:, t-FRAME_SIZE:t],
        )

    total_time = time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log,

    for i in xrange(N_SEQS):
        samp = samples[i]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}".format(tag, i), samp)
Beispiel #3
0
def generate_and_save_samples(tag, N_SECS=5):
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= data.min()
        data /= data.max()
        data -= 0.5
        data *= 0.95
        scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'),
                               BITRATE, data)

    total_time = time()
    # Generate N_SEQS' sample files, each 5 seconds long
    LENGTH = N_SECS * BITRATE if not args.debug else 100

    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
    samples[:, :FRAME_SIZE] = Q_ZERO

    # First half zero, others fixed random at each checkpoint
    h0 = numpy.zeros((N_SEQS - fixed_rand_h0.shape[0], N_RNN, H0_MULT * DIM),
                     dtype='float32')
    h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0)
    frame_level_outputs = None

    for t in xrange(FRAME_SIZE, LENGTH):

        if t % FRAME_SIZE == 0:
            frame_level_outputs, h0 = frame_level_generate_fn(
                samples[:, t - FRAME_SIZE:t],
                h0,
                #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'),
                numpy.int32(t == FRAME_SIZE))

        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs[:, t % FRAME_SIZE],
            samples[:, t - FRAME_SIZE:t],
        )

    total_time = time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log

    for i in xrange(N_SEQS):
        samp = samples[i]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')

        now = datetime.datetime.now()
        now_time = "{}:{}:{}".format(now.hour, now.minute, now.second)

        file_name = "sample_{}_{}_{}_{}".format(tag, N_SECS, now_time, i)
        print "writing...", file_name
        write_audio_file(file_name, samp)
Beispiel #4
0
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= data.min()
        data /= data.max()
        data -= 0.5
        data *= 0.95
        scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'),
                               BITRATE, data)

    total_time = time.time()
    # Generate N_SEQS' sample files, each 5 seconds long
    N_SECS = 5
    LENGTH = N_SECS * BITRATE

    if args.debug:
        LENGTH = 1024

    num_prev_samples_to_use = (2**args.dilation_layers_per_block -
                               1) * args.wavenet_blocks + 1

    samples = numpy.zeros((N_SEQS, LENGTH + num_prev_samples_to_use),
                          dtype='int32')
    samples[:, :num_prev_samples_to_use] = Q_ZERO

    for t in range(LENGTH):
        samples[:, num_prev_samples_to_use + t:num_prev_samples_to_use + t +
                1] = generate_fn(samples[:, t:t + num_prev_samples_to_use + 1])
        if (t > 2 * BITRATE) and (t < 3 * BITRATE):
            samples[:, num_prev_samples_to_use + t:num_prev_samples_to_use +
                    t + 1] = Q_ZERO

    total_time = time.time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log,

    for i in xrange(N_SEQS):
        samp = samples[i, num_prev_samples_to_use:]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag, samples):
    """Write the first ``N_SEQS`` rows of ``samples`` as WAV files.

    Each row is mu-law decoded when ``Q_TYPE`` is ``'mu-law'``, rescaled and
    saved to ``SAMPLES_PATH`` as ``sample_<tag>_<index>.wav``.

    Args:
        tag: Label inserted into each output file name.
        samples: Indexable collection of sequences; ``samples[i]`` is read
            for ``i`` in ``0 .. N_SEQS - 1``.

    Raises:
        NotImplementedError: If ``Q_TYPE`` is ``'a-law'``.
    """

    def write_audio_file(name, data):
        # Min/max rescale to [-0.475, 0.475] and store as a float32 WAV.
        wave = data.astype('float32')
        wave -= wave.min()
        wave /= wave.max()
        wave -= 0.5
        wave *= 0.95
        wav_path = os.path.join(SAMPLES_PATH, name + '.wav')
        scipy.io.wavfile.write(wav_path, BITRATE, wave)

    for index in xrange(N_SEQS):
        sequence = samples[index]
        if Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            sequence = mu2linear(sequence)
        out_name = "sample_{}_{}".format(tag, index)
        write_audio_file(out_name, sequence)
Beispiel #6
0
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= data.min()
        data /= data.max()
        data -= 0.5
        data *= 0.95
        scipy.io.wavfile.write(
                    os.path.join(SAMPLES_PATH, name+'.wav'),
                    BITRATE,
                    data)

    total_time = time.time()
    # Generate N_SEQS' sample files, each 5 seconds long
    N_SECS = 5
    LENGTH = N_SECS*BITRATE

    if args.debug:
        LENGTH = 1024

    num_prev_samples_to_use = (2**args.dilation_layers_per_block - 1)*args.wavenet_blocks + 1

    samples = numpy.zeros((N_SEQS, LENGTH + num_prev_samples_to_use), dtype='int32')
    samples[:, :num_prev_samples_to_use] = Q_ZERO

    for t in range(LENGTH):
        samples[:,num_prev_samples_to_use+t:num_prev_samples_to_use+t+1] = generate_fn(samples[:, t:t + num_prev_samples_to_use+1])
        if (t > 2*BITRATE) and( t < 3*BITRATE):
            samples[:,num_prev_samples_to_use+t:num_prev_samples_to_use+t+1] = Q_ZERO

    total_time = time.time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log,

    for i in xrange(N_SEQS):
        samp = samples[i, num_prev_samples_to_use: ]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= data.min()
        data /= data.max()
        data -= 0.5
        data *= 0.95
        scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'),
                               BITRATE, data)

    total_time = time()
    # Generate N_SEQS' sample files, each 5 seconds long
    N_SECS = 5
    LENGTH = N_SECS * BITRATE if not args.debug else 100

    #op1: init with zero
    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
    samples[:, :FRAME_SIZE] = Q_ZERO
    if FLAG_USETRAIN_WHENTEST:
        print('')
        print('REMINDER: using training data for test')
        print('')
        testData_feeder = load_data_gen(train_feeder, LENGTH)
    else:
        testData_feeder = load_data_gen(test_feeder, LENGTH)
    mini_batch = testData_feeder.next()
    _, _, _, seqs_lab = mini_batch
    samples_lab = seqs_lab[:N_SEQS]

    #op2: init with true data
    #testData_feeder = load_data_gen(train_feeder,LENGTH+LAB_SIZE)
    #testData_feeder = load_data_gen(test_feeder,LENGTH+LAB_SIZE)
    #mini_batch = testData_feeder.next()
    #seqs, _, _, seqs_lab = mini_batch
    #samples = seqs[:N_SEQS,FRAME_SIZE:FRAME_SIZE+LENGTH]
    #samples_lab = seqs_lab[:N_SEQS,1:]

    # First half zero, others fixed random at each checkpoint
    h0 = numpy.zeros((N_SEQS, N_RNN, H0_MULT * DIM), dtype='float32')

    frame_level_outputs = None

    for t in xrange(FRAME_SIZE, LENGTH):

        if t % FRAME_SIZE == 0:
            tmp = samples_lab[:, (t - FRAME_SIZE) // FRAME_SIZE, :]
            tmp = tmp.reshape(tmp.shape[0], 1, tmp.shape[1])

            frame_level_outputs, h0 = frame_level_generate_fn(
                samples[:, t - FRAME_SIZE:t],
                tmp,
                h0,
                #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'),
                numpy.int32(t == FRAME_SIZE))

        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs[:, t % FRAME_SIZE],
            samples[:, t - FRAME_SIZE:t],
        )

    total_time = time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log,

    for i in xrange(N_SEQS):
        samp = samples[i]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}".format(tag, i), samp)
Beispiel #8
0
def generate_and_save_samples():
    # Sampling at frame level
    frame_level_generate_fn = theano.function(
        [sequences, h0, reset],
        frame_level_rnn(sequences, h0, reset),
        on_unused_input='warn'
    )
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= data.min()
        data /= data.max()
        data -= 0.5
        data *= 0.95
        scipy.io.wavfile.write(
                    os.path.join(SAMPLES_PATH, name+'.wav'),
                    BITRATE,
                    data)

    total_time = time()
    # Generate N_SEQS' sample files, each 5 seconds long
    N_SECS = 5
    LENGTH = N_SECS*BITRATE

    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
    samples[:, :FRAME_SIZE] = Q_ZERO

    # First half zero, others fixed random at each checkpoint
    h0 = numpy.zeros(
            (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM),
            dtype='float32'
    )
    h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0)
    frame_level_outputs = None

    for t in xrange(FRAME_SIZE, LENGTH):

        if t % FRAME_SIZE == 0:
            frame_level_outputs, h0 = frame_level_generate_fn(
                samples[:, t-FRAME_SIZE:t],
                h0,
                #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'),
                numpy.int32(t == FRAME_SIZE)
            )

        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs[:, t % FRAME_SIZE],
            samples[:, t-FRAME_SIZE:t],
        )

    total_time = time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log,

    for i in xrange(N_SEQS):
        samp = samples[i]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}".format(tag, i), samp)
Beispiel #9
0
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= numpy.mean(data)
        data /= numpy.absolute(data).max() # [-1,1]
        data *= 32768
        data = data.astype('int16')
        scipy.io.wavfile.write(
                    os.path.join(SAMPLES_PATH, name+'.wav'),
                    BITRATE,
                    data)

    total_time = time()
    # Generate N_SEQS' sample files, each 5 seconds long
    N_SECS = 5
    LENGTH = N_SECS*BITRATE if not args.debug else 100

    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
    if flag_dict['RMZERO']:
        testData_feeder = load_data(test_feeder)
        mini_batch = testData_feeder.next()
        tmp, _, _ = mini_batch
        samples[:, :BIG_FRAME_SIZE] = tmp[:N_SEQS, :BIG_FRAME_SIZE]
    else:
        samples[:, :BIG_FRAME_SIZE] = Q_ZERO

    # First half zero, others fixed random at each checkpoint
    big_h0 = numpy.zeros(
            (N_SEQS-fixed_rand_big_h0.shape[0], N_BIG_RNN, H0_MULT*BIG_DIM),
            dtype='float32'
    )
    
    big_h0 = numpy.concatenate((big_h0, fixed_rand_big_h0), axis=0)
    h0_1 = numpy.zeros(
            (N_SEQS-fixed_rand_h0_1.shape[0], N_RNN_LIST[1], H0_MULT*DIM),
            dtype='float32'
    )
    h0_1 = numpy.concatenate((h0_1, fixed_rand_h0_1), axis=0)
    h0_2 = numpy.zeros(
            (N_SEQS-fixed_rand_h0_2.shape[0], N_RNN_LIST[2], H0_MULT*DIM),
            dtype='float32'
    )
    h0_2 = numpy.concatenate((h0_2, fixed_rand_h0_2), axis=0)
    big_frame_level_outputs = None
    frame_level_outputs_1 = None
    frame_level_outputs_2 = None

    for t in xrange(BIG_FRAME_SIZE, LENGTH):

        if t % BIG_FRAME_SIZE == 0:
            big_frame_level_outputs, big_h0 = big_frame_level_generate_fn(
                samples[:, t-BIG_FRAME_SIZE:t],
                big_h0,
                numpy.int32(t == BIG_FRAME_SIZE)
            )

        if t % FRAME_SIZE_1 == 0:
            frame_level_outputs_1, h0_1 = frame_level_generate_fn_1(
                samples[:, t-FRAME_SIZE_1:t],
                big_frame_level_outputs[:, (t / FRAME_SIZE_1) % (BIG_FRAME_SIZE / FRAME_SIZE_1)],
                h0_1,
                numpy.int32(t == BIG_FRAME_SIZE)
            )
        if t % FRAME_SIZE_2 == 0:
            frame_level_outputs_2, h0_2 = frame_level_generate_fn_2(
                samples[:, t-FRAME_SIZE_2:t],
                frame_level_outputs_1[:, (t / FRAME_SIZE_2) % (FRAME_SIZE_1 / FRAME_SIZE_2)],
                h0_2,
                numpy.int32(t == BIG_FRAME_SIZE)
            )

        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs_2[:, t % FRAME_SIZE_2],
            samples[:, t-FRAME_SIZE_DNN:t]
        )

    total_time = time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log,

    for i in xrange(N_SEQS):
        samp = samples[i]
        #pdb.set_trace()
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
            #pdb.set_trace()
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= numpy.mean(data)
        data /= numpy.absolute(data).max()  # [-1,1]
        data *= 32768
        data = data.astype('int16')
        scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'),
                               BITRATE, data)

    total_time = time()
    # Generate N_SEQS' sample files, each 5 seconds long
    N_SECS = 5
    LENGTH = N_SECS * BITRATE if not args.debug else 160  #before it was 100, but 160 was better as it should be divisible by 80
    if FLAG_GEN: LENGTH = 785 * 80

    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')

    if FLAG_USETRAIN_WHENTEST:
        print('')
        print('REMINDER: using training data for test')
        print('')
        testData_feeder = load_data_gen(train_feeder, LENGTH)
    else:
        testData_feeder = load_data_gen(test_feeder, LENGTH)
    mini_batch = testData_feeder.next()
    tmp, _, _, seqs_lab = mini_batch
    samples_lab = seqs_lab[:N_SEQS]

    if flag_dict['RMZERO']:
        samples[:, :BIG_FRAME_SIZE] = tmp[:N_SEQS, :BIG_FRAME_SIZE]
    else:
        samples[:, :BIG_FRAME_SIZE] = Q_ZERO

    samples_lab_big = get_lab_big(samples_lab)

    # First half zero, others fixed random at each checkpoint
    big_h0 = numpy.zeros((N_SEQS, N_BIG_RNN, H0_MULT * BIG_DIM),
                         dtype='float32')

    h0 = numpy.zeros((N_SEQS, N_RNN, H0_MULT * DIM), dtype='float32')

    big_frame_level_outputs = None
    frame_level_outputs = None

    for t in xrange(BIG_FRAME_SIZE, LENGTH):

        if t % BIG_FRAME_SIZE == 0:
            tmp = samples_lab_big[:, (t - BIG_FRAME_SIZE) // BIG_FRAME_SIZE, :]
            tmp = tmp.reshape(tmp.shape[0], 1, tmp.shape[1])

            big_frame_level_outputs, big_h0 = big_frame_level_generate_fn(
                samples[:, t - BIG_FRAME_SIZE:t], tmp, big_h0,
                numpy.int32(t == BIG_FRAME_SIZE))

        if t % FRAME_SIZE == 0:
            tmp = samples_lab[:, (t - BIG_FRAME_SIZE) // FRAME_SIZE, :]
            # tmp = samples_lab[:,(t-FRAME_SIZE)//FRAME_SIZE,:] #classic, but might introduce a slight mis-alignment
            tmp = tmp.reshape(tmp.shape[0], 1, tmp.shape[1])

            frame_level_outputs, h0 = frame_level_generate_fn(
                samples[:, t - FRAME_SIZE:t], tmp,
                big_frame_level_outputs[:, (t / FRAME_SIZE) %
                                        (BIG_FRAME_SIZE / FRAME_SIZE)], h0,
                numpy.int32(t == BIG_FRAME_SIZE))

        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs[:, t % FRAME_SIZE],
            samples[:, t - FRAME_SIZE_DNN:t])

    total_time = time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log,

    for i in xrange(N_SEQS):
        samp = samples[i]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}".format(tag, i), samp)
Beispiel #11
0
def generate_and_save_samples(tag, conditioning=None):
    # Conditioning (N_SEQS, LENGTH)
    # N_SEQ = several different condition sequences, but all must have the same size... (yes, it's shitt)
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= data.min()
        data /= data.max()
        data -= 0.5
        data *= 0.95
        scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'),
                               BITRATE, data)

    total_time = time()
    # Generate N_SEQS' sample files, each LENGHT seconds long
    N_SEQS = 20
    if GEN_FLAG:
        if conditioning is not None:
            N_SEQS = conditioning.shape[0]
            LENGTH = conditioning.shape[1] if not args.debug else 100
        else:
            if args.debug:
                LENGTH = 5 * BITRATE
                conditioning = np.ones((N_SEQS, LENGTH), dtype='int32')
            else:
                raise ("No conditionning !!")
    else:
        LENGTH = 5 * BITRATE
        conditioning = np.ones((N_SEQS, LENGTH), dtype='int32')

    if GEN_FLAG:
        print("Generating %d samples" % LENGTH)

    # Uniform [-0.5, 0.5) for half of initial state for generated samples
    # to study the behaviour of the model and also to introduce some diversity
    # to samples in a simple way. [it's disabled]
    fixed_rand_h0 = numpy.random.rand(N_SEQS // 2, N_RNN, H0_MULT * DIM)
    fixed_rand_h0 -= 0.5
    fixed_rand_h0 = fixed_rand_h0.astype('float32')

    fixed_rand_big_h0 = numpy.random.rand(N_SEQS // 2, N_RNN, H0_MULT * DIM)
    fixed_rand_big_h0 -= 0.5
    fixed_rand_big_h0 = fixed_rand_big_h0.astype('float32')

    ############################################################
    ############################################################
    # Initialize the sequence with zeros
    # Lame !?? Why not with a short "test sequence"
    # Would give much power to the mode with a small user cost no ?
    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
    samples[:, :BIG_FRAME_SIZE] = Q_ZERO
    ############################################################
    ############################################################

    # First half zero, others fixed random at each checkpoint
    big_h0 = numpy.zeros(
        (N_SEQS - fixed_rand_big_h0.shape[0], N_BIG_RNN, H0_MULT * BIG_DIM),
        dtype='float32')
    big_h0 = numpy.concatenate((big_h0, fixed_rand_big_h0), axis=0)
    h0 = numpy.zeros((N_SEQS - fixed_rand_h0.shape[0], N_RNN, H0_MULT * DIM),
                     dtype='float32')
    h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0)
    big_frame_level_outputs = None
    frame_level_outputs = None

    # During generation
    # The BIG_FRAME_SIZE first times of samples are zeros used to initialize the sampleRNN
    # Hence, condi[0:BIG_FRAME_SIZE] are used to generate samples[BIG_FRAME_SIZE:2*BIG_FRAME_SIZE]
    for t in xrange(BIG_FRAME_SIZE, LENGTH):

        if GEN_FLAG:
            if t % 1000 == 0:
                print("%.2f secs generated..." % (t * 1. / BITRATE))

        if t % BIG_FRAME_SIZE == 0:
            big_frame_level_outputs, big_h0 = big_frame_level_generate_fn(
                samples[:, t - BIG_FRAME_SIZE:t],
                conditioning[:, t - BIG_FRAME_SIZE:t], big_h0,
                numpy.int32(t == BIG_FRAME_SIZE))

        if t % FRAME_SIZE == 0:
            frame_level_outputs, h0 = frame_level_generate_fn(
                samples[:, t - FRAME_SIZE:t], conditioning[:,
                                                           t - FRAME_SIZE:t],
                big_frame_level_outputs[:, (t / FRAME_SIZE) %
                                        (BIG_FRAME_SIZE / FRAME_SIZE)], h0,
                numpy.int32(t == BIG_FRAME_SIZE))
        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs[:, t % FRAME_SIZE],
            conditioning[:, t - FRAME_SIZE:t], samples[:, t - FRAME_SIZE:t])

    total_time = time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log,

    for i in xrange(N_SEQS):
        samp = samples[i]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}".format(tag, i), samp)
Beispiel #12
0
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= data.min()
        data /= data.max()
        data -= 0.5
        data *= 0.95
        scipy.io.wavfile.write(
                    os.path.join(SAMPLES_PATH, name+'.wav'),
                    SAMPLERATE,
                    data)

    total_time = time()
    # Generate N_SEQS' sample files, each N_SECS seconds long
    N_SECS = args.length_sec
    LENGTH = N_SECS*SAMPLERATE if not args.debug else 100

    print("Generating %d samples"%LENGTH)

    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
    samples[:, :FRAME_SIZE] = Q_ZERO

    # First half zero, others fixed random at each checkpoint
    h0 = numpy.zeros(
            (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM),
            dtype='float32'
    )
    h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0)
    frame_level_outputs = None

    for t in xrange(FRAME_SIZE, LENGTH):

        if t % 1000 == 0:
            print("%.2f secs generated..."%(t * 1./SAMPLERATE))

        if t % FRAME_SIZE == 0:
            frame_level_outputs, h0 = frame_level_generate_fn(
                samples[:, t-FRAME_SIZE:t],
                h0,
                #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'),
                numpy.int32(t == FRAME_SIZE)
            )

        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs[:, t % FRAME_SIZE],
            samples[:, t-FRAME_SIZE:t],
        )

    total_time = time() - total_time
    log = "{} seconds length generated in {} seconds."
    log = log.format(N_SECS, total_time)
    print log,

    now = datetime.now()
    for i in xrange(N_SEQS):
        samp = samples[i]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}_{}".format(tag, i, now.strftime('%Y%m%d_%H%M%S')), samp)
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        #data -= data.min()
        #data /= data.max()
        #data -= 0.5
        #data *= 0.95
        scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name), BITRATE, data)

    total_time = time()
    costs_g = []
    accuracys_g = []
    samples_low_list = []
    samples_list = []
    masks_g_index = []
    samples_number = 0
    count = 0
    data_feeder = load_data(test_feeder)
    for seqs_g_8k, seqs_g_up, reset_g, end_flag_g, mask_g, con_g, batch_g, seqs_g_8k_real in data_feeder:
        if reset_g == 1:
            con_h0_g = numpy.zeros(
                (batch_g, N_CON_RNN, H0_MULT * CON_TIER_DIM), dtype='float32')
            big_h0_g = numpy.zeros((batch_g, N_BIG_RNN, H0_MULT * DIM),
                                   dtype='float32')
            h0_g = numpy.zeros((batch_g, N_RNN, H0_MULT * DIM),
                               dtype='float32')
            cost_batch = np.zeros((batch_g, ), dtype='float32')
            accuracy_batch = np.zeros((batch_g, ), dtype='float32')
            mask_batch = np.zeros((batch_g, ), dtype='float32')
        cost_g, accuracy_g, mask_sum_g, sample, con_h0_g, big_h0_g, h0_g = test_fn(
            seqs_g_8k, seqs_g_up, con_g, con_h0_g, big_h0_g, h0_g, reset_g,
            mask_g, batch_g)
        cost_batch = cost_batch + cost_g
        accuracy_batch = accuracy_batch + accuracy_g
        mask_batch = mask_batch + mask_sum_g
        if end_flag_g == 1:
            costs_g.extend(list(cost_batch / mask_batch))
            accuracys_g.extend(list(accuracy_batch / mask_batch))

        if reset_g == 1:
            samples_low = seqs_g_8k_real[:, 0:-OVERLAP]
            samples = sample
            masks_g = mask_g[:, 0:-OVERLAP]
        else:
            samples_low = np.concatenate(
                [samples_low, seqs_g_8k_real[:, 0:-OVERLAP]], axis=1)
            samples = np.concatenate([samples, sample], axis=1)
            masks_g = np.concatenate([masks_g, mask_g[:, 0:-OVERLAP]], axis=1)

        if end_flag_g == 1:
            samples_low_list.append(samples_low)
            samples_list.append(samples)
            masks_g_index.append(masks_g)
    fid = open('datasets/TIMIT/test_list.scp', 'r')
    test_id_list = fid.readlines()
    for i in xrange(len(samples_list)):
        samples_number += samples_list[i].shape[0] * samples_list[i].shape[1]
        for j in xrange(samples_list[i].shape[0]):
            samples_lowi = samples_low_list[i][j]
            samplei = samples_list[i][j]
            maski = masks_g_index[i][j]
            samples_lowi = samples_lowi[0:len(np.where(maski == 1)[0])]
            samplei = samplei[0:len(np.where(maski == 1)[0])]
            if Q_TYPE == 'mu-law':
                from datasets.dataset import mu2linear
                samplei = mu2linear(samplei)
            write_audio_file(test_id_list[count].split()[0],
                             samplei / 3 + samples_lowi)
            count += 1

    total_time = time() - total_time
    log = "192 samples generated in {} minutes.\nThe time of generating 1 second speech is {} seconds."
    log = log.format(total_time / 60, total_time / samples_number * 16000)
    print log,

    return numpy.mean(
        costs_g), numpy.mean(accuracys_g) * 100, total_time, list(
            np.array(accuracys_g) * 100)
Beispiel #14
0
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        # data -= data.min()
        # data /= data.max()
        # data -= 0.5
        # data *= 0.95
        data -= numpy.mean(data)
        data /= numpy.absolute(data).max()
        data /= 2.0
        scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'),
                               BITRATE, data)

    total_time = time()
    # Generate N_SEQS' sample files, each 5 seconds long
    N_SECS = 5
    LENGTH = N_SECS * BITRATE if not args.debug else 100

    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
    if flag_dict['RMZERO']:
        testData_feeder = load_data(test_feeder)
        mini_batch = testData_feeder.next()
        tmp, _, _ = mini_batch
        samples[:, :BIG_FRAME_SIZE] = tmp[:N_SEQS, :BIG_FRAME_SIZE]
    else:
        samples[:, :BIG_FRAME_SIZE] = Q_ZERO

    # First half zero, others fixed random at each checkpoint
    ###QDOU: soft code
    big_h0 = numpy.zeros(
        (N_SEQS - fixed_rand_big_h0.shape[0], N_BIG_RNN, H0_MULT * BIG_DIM),
        dtype='float32')
    big_h0 = numpy.concatenate((big_h0, fixed_rand_big_h0), axis=0)
    big_frame_level_outputs = None
    h0_list, frame_level_outputs_list = [], []
    h0_list.append(big_h0)
    frame_level_outputs_list.append(big_frame_level_outputs)
    for idx in INTER_TIER_IDX_LIST:
        tmp_h0 = numpy.zeros((N_SEQS - fixed_rand_h0_list[idx].shape[0],
                              RNN_DEPTH_LIST[idx], H0_MULT * DIM),
                             dtype='float32')
        tmp_h0 = numpy.concatenate((tmp_h0, fixed_rand_h0_list[dix]), axis=0)
        h0_list.append(tmp_h0)
        frame_level_outputs_list.append(None)
    ###QDOU: soft code

    pdb.set_trace()
    for t in xrange(BIG_FRAME_SIZE, LENGTH):
        if t % BIG_FRAME_SIZE == 0:
            big_frame_level_outputs, big_h0 = big_frame_level_generate_fn(
                samples[:, t - BIG_FRAME_SIZE:t], big_h0,
                numpy.int32(t == BIG_FRAME_SIZE))
        ###QDOU: soft code
        for idx in INTER_TIER_IDX_LIST:
            if t % FRAME_SIZE_LIST[idx] == 0:
                frame_level_outputs_list[idx], h0_list[idx] = gen_fn_list[idx](
                    samples[:, t - FRAME_SIZE_LIST[idx]:t],
                    frame_level_outputs_list[idx -
                                             1][:, (t / FRAME_SIZE_LIST[idx]) %
                                                (FRAME_SIZE_LIST[idx - 1] /
                                                 FRAME_SIZE_LIST[idx])],
                    h0_list[idx], numpy.int32(t == BIG_FRAME_SIZE))
        ###QDOU: soft code

        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs_2[:, t % FRAME_SIZE_2],
            samples[:, t - FRAME_SIZE_2:t])

    total_time = time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log,

    for i in xrange(N_SEQS):
        samp = samples[i]
        #pdb.set_trace()
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
            #pdb.set_trace()
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}".format(tag, i), samp)
Beispiel #15
0
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= numpy.mean(data)
        data /= numpy.absolute(data).max()  # [-1,1]
        data *= 32768
        data = data.astype('int16')
        scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'),
                               BITRATE, data)

    total_time = time()
    # Generate N_SEQS' sample files, each 5 seconds long
    N_SECS = 5
    LENGTH = N_SECS * BITRATE if not args.debug else 160  #before it was 100, but 160 was better as it should be divisible by 80
    if FLAG_GEN: LENGTH = 785 * 80

    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')

    if FLAG_USETRAIN_WHENTEST:
        print('')
        print('REMINDER: using training data for test')
        print('')
        testData_feeder = load_data_gen(train_feeder, LENGTH)
    else:
        testData_feeder = load_data_gen(test_feeder, LENGTH)
    mini_batch = testData_feeder.next()
    tmp, _, _, seqs_lab, seqs_noise = mini_batch
    samples_lab = seqs_lab[:N_SEQS]
    seqs_noise = seqs_noise[:N_SEQS]
    # Quantisation Steps (do this on the dataset not per minibatch)
    #seqs_noise = (seqs_noise /  np.amax(np.abs(seqs_noise), 1)[:,None]) + 1
    #seqs_noise = np.divide(np.multiply(seqs_noise, Q_LEVELS-1), 2)
    #seqs_noise = np.round(seqs_noise)
    seqs_noise = seqs_noise.astype(np.int32)

    if flag_dict['RMZERO']:
        samples[:, :BIG_FRAME_SIZE] = tmp[:N_SEQS, :BIG_FRAME_SIZE]
    else:
        samples[:, :BIG_FRAME_SIZE] = Q_ZERO
        samples_noise[:, :BIG_FRAME_SIZE] = Q_ZERO

    samples_lab_big = get_lab_big(samples_lab)

    # First half zero, others fixed random at each checkpoint
    big_h0 = numpy.zeros((N_SEQS, N_BIG_RNN, H0_MULT * BIG_DIM),
                         dtype='float32')

    h0 = numpy.zeros((N_SEQS, N_RNN, H0_MULT * DIM), dtype='float32')

    big_frame_level_outputs = None
    frame_level_outputs = None

    # LENGTH is length of utterance to generate.
    # Take one frame of silence, then start at index BIG_FRAME_SIZE.
    # Do this for training and debugging.
    # As the RNN needs initial state.
    # Once model is good enough, actually use 20 frames.
    for t in xrange(BIG_FRAME_SIZE, LENGTH):  # for loop going sample by sample

        if t % BIG_FRAME_SIZE == 0:
            tmp = samples_lab_big[:, (t - BIG_FRAME_SIZE) // BIG_FRAME_SIZE, :]
            tmp = tmp.reshape(tmp.shape[0], 1, tmp.shape[1])

            big_frame_level_outputs, big_h0 = big_frame_level_generate_fn(
                seqs_noise[:, t - BIG_FRAME_SIZE:t], tmp, big_h0,
                numpy.int32(t == BIG_FRAME_SIZE))

        if t % FRAME_SIZE == 0:
            tmp = samples_lab[:, (t - BIG_FRAME_SIZE) // FRAME_SIZE, :]
            # tmp = samples_lab[:,(t-FRAME_SIZE)//FRAME_SIZE,:] #classic, but might introduce a slight mis-alignment
            tmp = tmp.reshape(tmp.shape[0], 1, tmp.shape[1])

            frame_level_outputs, h0 = frame_level_generate_fn(
                seqs_noise[:, t - FRAME_SIZE:t], tmp,
                big_frame_level_outputs[:, (t / FRAME_SIZE) %
                                        (BIG_FRAME_SIZE / FRAME_SIZE)], h0,
                numpy.int32(t == BIG_FRAME_SIZE))

        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs[:, t % FRAME_SIZE],
            seqs_noise[:, t - FRAME_SIZE_DNN:t])

    total_time = time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log,

    for i in xrange(N_SEQS):
        samp = samples[i]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        data -= data.min()
        data /= data.max()
        data -= 0.5
        data *= 0.95
        scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'),
                               BITRATE, data)

    total_time = time()
    # Generate N_SEQS' sample files, each 5 seconds long
    N_SECS = 5
    LENGTH = N_SECS * BITRATE if not args.debug else 100

    #sid=numpy.int16(10) # specify the speaker ID
    #g_spkids=numpy.empty(0)
    g_spkids = []
    for i in range(15):
        for j in range(20):
            g_spkids = numpy.append(g_spkids, i)

    g_spkids = numpy.int16(g_spkids)
    g_spkids = numpy.asarray(g_spkids, dtype='int16')
    print g_spkids

    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
    samples[:, :FRAME_SIZE] = Q_ZERO

    # First half zero, others fixed random at each checkpoint
    h0 = numpy.zeros((N_SEQS - fixed_rand_h0.shape[0], N_RNN, H0_MULT * DIM),
                     dtype='float32')
    h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0)
    frame_level_outputs = None

    for t in xrange(FRAME_SIZE, LENGTH):

        if t % FRAME_SIZE == 0:
            frame_level_outputs, h0 = frame_level_generate_fn(
                samples[:, t - FRAME_SIZE:t],
                g_spkids,
                h0,
                #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'),
                numpy.int32(t == FRAME_SIZE))

        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs[:, t % FRAME_SIZE],
            samples[:, t - FRAME_SIZE:t],
            g_spkids,
        )

    total_time = time() - total_time
    log = "{} samples of {} seconds length generated in {} seconds."
    log = log.format(N_SEQS, N_SECS, total_time)
    print log,

    for i in xrange(N_SEQS):
        samp = samples[i]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samp = mu2linear(samp)
        elif Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag):
    def write_audio_file(name, data):
        data = data.astype('float32')
        #data -= data.min()
        #data /= data.max()
        #data -= 0.5
        #data *= 0.95
        scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'),
                               BITRATE, data)

    total_time = time()
    costs_g = []
    accuracys_g = []
    count = 0
    data_feeder = load_data(test_feeder)
    for seqs_g_8k, seqs_g_up, reset_g, end_flag_g, mask_g, con_g, batch_g, seqs_g_8k_real in data_feeder:
        if reset_g == 1:
            con_h0_g = numpy.zeros(
                (batch_g, N_CON_RNN, H0_MULT * CON_TIER_DIM), dtype='float32')
            big_h0_g = numpy.zeros((batch_g, N_BIG_RNN, H0_MULT * DIM),
                                   dtype='float32')
            h0_g = numpy.zeros((batch_g, N_RNN, H0_MULT * DIM),
                               dtype='float32')
            cost_batch = np.zeros((batch_g, ), dtype='float32')
            accuracy_batch = np.zeros((batch_g, ), dtype='float32')
            mask_batch = np.zeros((batch_g, ), dtype='float32')
            count += 1
        cost_g, accuracy_g, mask_sum_g, sample, con_h0_g, big_h0_g, h0_g = test_fn(
            seqs_g_8k, seqs_g_up, con_g, con_h0_g, big_h0_g, h0_g, reset_g,
            mask_g, batch_g)
        cost_batch = cost_batch + cost_g
        accuracy_batch = accuracy_batch + accuracy_g
        mask_batch = mask_batch + mask_sum_g
        if end_flag_g == 1:
            costs_g.extend(list(cost_batch / mask_batch))
            accuracys_g.extend(list(accuracy_batch / mask_batch))

        if count == 1:
            if reset_g == 1:
                samples_low = seqs_g_8k_real[:, 0:-OVERLAP]
                samples = sample
                masks_g = mask_g[:, 0:-OVERLAP]
            else:
                samples_low = np.concatenate(
                    [samples_low, seqs_g_8k_real[:, 0:-OVERLAP]], axis=1)
                samples = np.concatenate([samples, sample], axis=1)
                masks_g = np.concatenate([masks_g, mask_g[:, 0:-OVERLAP]],
                                         axis=1)

    for i in xrange(N_SEQS):
        samples_lowi = samples_low[i]
        samplei = samples[i]
        maski = masks_g[i]
        samples_lowi = samples_lowi[0:len(np.where(maski == 1)[0])]
        samplei = samplei[0:len(np.where(maski == 1)[0])]
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            samplei = mu2linear(samplei)
        write_audio_file("sample_{}_{}".format(tag, i),
                         samplei / 3 + samples_lowi)

    total_time = time() - total_time
    log = "{} samples generated in {} seconds."
    log = log.format(N_SEQS, total_time)
    print log,

    return numpy.mean(costs_g), numpy.mean(accuracys_g) * 100, total_time