Example 1
def feat_extr(s, d, s_len, d_len, Q, Nw, Ns, NFFT, fs, P, nconst, mu, sigma):
    '''
	Extracts input features and targets from given clean speech and noise.

	Inputs:
		s - clean waveform (dtype=tf.int32).
		d - noise waveform (dtype=tf.int32).
		s_len - clean waveform length without padding (samples).
		d_len - noise waveform length without padding (samples).
		Q - SNR level.
		Nw - window length (samples).
		Ns - window shift (samples).
		NFFT - DFT components.
		fs - sampling frequency (Hz).
		P - padded waveform length (samples).
		nconst - normalization constant.
		mu - mean of a priori SNR in dB.
		sigma - standard deviation of a priori SNR in dB.

	Outputs:
		x_MS - padded noisy single-sided magnitude spectrum.
		xi_mapped - mapped a priori SNR.	
		seq_len - length of each sequence without padding.
	'''
    (s, x, d) = tf.map_fn(
        lambda z: feat.addnoisepad(z[0], z[1], z[2], z[3], z[4], P, nconst),
        (s, d, s_len, d_len, Q),
        dtype=(tf.float32, tf.float32, tf.float32))  # padded waveforms.
    seq_len = feat.nframes(s_len, Ns)  # length of each sequence.
    s_MS = feat.stms(s, Nw, Ns, NFFT)  # clean speech magnitude spectrum.
    d_MS = feat.stms(d, Nw, Ns, NFFT)  # noise magnitude spectrum.
    x_MS = feat.stms(x, Nw, Ns, NFFT)  # noisy speech magnitude spectrum.
    xi = tf.div(tf.square(tf.maximum(s_MS, 1e-12)),
                tf.square(tf.maximum(d_MS, 1e-12)))  # a priori SNR.
    xi_dB = tf.multiply(10.0, log10(xi))  # a priori SNR dB.
    xi_mapped = tf.multiply(
        0.5,
        tf.add(
            1.0,
            tf.erf(
                tf.div(tf.subtract(xi_dB, mu), tf.multiply(
                    sigma, tf.sqrt(2.0))))))  # cdf of a priori SNR in dB.
    xi_mapped = tf.boolean_mask(xi_mapped,
                                tf.sequence_mask(seq_len))  # convert to 2D.
    return (x_MS, xi_mapped, seq_len)
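
For reference, the mapped target built by the graph above is simply the Gaussian CDF of the instantaneous a priori SNR in dB. Below is a minimal NumPy sketch of the same mapping, assuming per-bin statistics mu and sigma as passed to feat_extr; the function name xi_mapped_ref and the eps floor are illustrative, not part of the original code.

import numpy as np
from scipy.special import erf

def xi_mapped_ref(s_MS, d_MS, mu, sigma, eps=1e-12):
    # Instantaneous a priori SNR from clean-speech and noise magnitude spectra.
    xi = np.square(np.maximum(s_MS, eps)) / np.square(np.maximum(d_MS, eps))
    xi_dB = 10.0 * np.log10(xi)  # a priori SNR in dB.
    # Gaussian CDF of xi_dB, giving a training target bounded to [0, 1].
    return 0.5 * (1.0 + erf((xi_dB - mu) / (sigma * np.sqrt(2.0))))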
Example 2
    G_ph = tf.placeholder(tf.float32,
                          shape=[None, args.input_dim],
                          name='G_ph')  # gain function placeholder.

    ## ANALYSIS
    x = tf.div(
        tf.cast(tf.slice(tf.squeeze(x_ph), [0], [tf.squeeze(x_len_ph)]),
                tf.float32), args.nconst)  # remove padding and normalise.
    x_DFT = feat.stft(
        x, args.Nw, args.Ns,
        args.NFFT)  # noisy speech single-sided short-time Fourier transform.
    x_MS_3D = tf.expand_dims(
        tf.abs(x_DFT),
        0)  # noisy speech single-sided magnitude spectrum (in 3D form).
    x_MS = tf.abs(x_DFT)  # noisy speech single-sided magnitude spectrum.
    x_PS = tf.angle(x_DFT)  # noisy speech single-sided phase spectrum.
    x_seq_len = feat.nframes(x_len_ph, args.Ns)  # length of each sequence.

    ## ENHANCEMENT
    if args.gain == 'ibm':
        G = tf.cast(tf.greater(xi_hat_ph, 1), tf.float32)  # IBM gain function.
    if args.gain == 'wf':
        G = tf.div(xi_hat_ph, tf.add(xi_hat_ph, 1.0))  # WF gain function.
    if args.gain == 'srwf':
        G = tf.sqrt(tf.div(xi_hat_ph, tf.add(xi_hat_ph,
                                             1.0)))  # SRWF gain function.
    if args.gain == 'irm':
        G = tf.sqrt(tf.div(xi_hat_ph, tf.add(xi_hat_ph,
                                             1.0)))  # IRM gain function.
    if args.gain == 'cwf':
        G = tf.sqrt(xi_hat_ph)
        G = tf.div(G, tf.add(G, 1.0))  # cWF gain function.
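
The gain rules above can also be written compactly outside the graph. The following NumPy helper is an illustrative sketch (the name gain is an assumption, not part of the original code) that mirrors the branches above; note that 'srwf' and 'irm' reduce to the same expression when written in terms of the a priori SNR.

import numpy as np

def gain(xi_hat, gain_type):
    if gain_type == 'ibm':  # ideal binary mask: 1 where the a priori SNR exceeds 0 dB.
        return (xi_hat > 1.0).astype(np.float32)
    if gain_type == 'wf':  # Wiener filter.
        return xi_hat / (xi_hat + 1.0)
    if gain_type in ('srwf', 'irm'):  # square-root WF / ideal ratio mask.
        return np.sqrt(xi_hat / (xi_hat + 1.0))
    if gain_type == 'cwf':  # constrained WF: sqrt(xi) / (sqrt(xi) + 1).
        g = np.sqrt(xi_hat)
        return g / (g + 1.0)
    raise ValueError('Unknown gain type: %s' % gain_type)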
Example 3
    def __init__(self, args):

        ## PLACEHOLDERS
        self.s_ph = tf.placeholder(tf.int16, shape=[None, None],
                                   name='s_ph')  # clean speech placeholder.
        self.d_ph = tf.placeholder(tf.int16, shape=[None, None],
                                   name='d_ph')  # noise placeholder.
        self.x_ph = tf.placeholder(tf.int16, shape=[None, None],
                                   name='x_ph')  # noisy speech placeholder.
        self.s_len_ph = tf.placeholder(
            tf.int32, shape=[None],
            name='s_len_ph')  # clean speech sequence length placeholder.
        self.d_len_ph = tf.placeholder(
            tf.int32, shape=[None],
            name='d_len_ph')  # noise sequence length placeholder.
        self.x_len_ph = tf.placeholder(
            tf.int32, shape=[None],
            name='x_len_ph')  # noisy speech sequence length placeholder.
        self.snr_ph = tf.placeholder(tf.float32, shape=[None],
                                     name='snr_ph')  # SNR placeholder.
        self.x_MS_ph = tf.placeholder(
            tf.float32, shape=[None, None, args.input_dim],
            name='x_MS_ph')  # noisy speech MS placeholder.
        self.x_MS_len_ph = tf.placeholder(
            tf.int32, shape=[None],
            name='x_MS_len_ph')  # noisy speech MS sequence length placeholder.
        self.target_ph = tf.placeholder(
            tf.float32, shape=[None, args.input_dim],
            name='target_ph')  # training target placeholder.
        self.keep_prob_ph = tf.placeholder(
            tf.float32, name='keep_prob_ph')  # keep probability placeholder.
        self.training_ph = tf.placeholder(
            tf.bool, name='training_ph')  # training placeholder.

        ## A PRIORI SNR IN DB STATISTICS
        self.mu = tf.constant(args.mu_mat['mu'], dtype=tf.float32)
        self.sigma = tf.constant(args.sigma_mat['sigma'], dtype=tf.float32)

        ## FEATURE GRAPH
        print('Preparing graph...')
        self.P = tf.reduce_max(self.s_len_ph)  # padded waveform length.
        self.feature = feat.xi_mapped(self.s_ph, self.d_ph, self.s_len_ph,
                                      self.d_len_ph, self.snr_ph, args.Nw,
                                      args.Ns, args.NFFT, args.fs, self.P,
                                      args.nconst, self.mu,
                                      self.sigma)  # feature graph.

        ## RESNET
        self.output = residual.Residual(self.x_MS_ph, self.x_MS_len_ph,
                                        self.keep_prob_ph, self.training_ph,
                                        args.num_outputs, args)

        ## LOSS & OPTIMIZER
        self.loss = residual.loss(self.target_ph, self.output,
                                  'sigmoid_cross_entropy')
        self.total_loss = tf.reduce_mean(self.loss, axis=0)
        self.trainer, _ = residual.optimizer(self.total_loss,
                                             optimizer='adam',
                                             grad_clip=True)

        ## SAVE VARIABLES
        self.saver = tf.train.Saver(max_to_keep=256)

        ## NUMBER OF PARAMETERS
        if args.verbose:
            print("No. of trainable parameters: %g." % (np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.trainable_variables()
            ])))

        ## INFERENCE GRAPH
        if args.infer:

            ## PLACEHOLDERS
            self.output_ph = tf.placeholder(
                tf.float32, shape=[None, args.input_dim],
                name='output_ph')  # network output placeholder.
            self.x_MS_2D_ph = tf.placeholder(
                tf.float32, shape=[None, args.input_dim],
                name='x_MS_2D_ph')  # noisy speech MS placeholder (in 2D form).
            self.x_PS_ph = tf.placeholder(
                tf.float32, shape=[None, args.input_dim],
                name='x_PS_ph')  # noisy speech PS placeholder.
            self.xi_hat_ph = tf.placeholder(
                tf.float32, shape=[None, args.input_dim],
                name='xi_hat_ph')  # a priori SNR estimate placeholder.
            self.G_ph = tf.placeholder(
                tf.float32, shape=[None, args.input_dim],
                name='G_ph')  # gain function placeholder.

            ## ANALYSIS
            self.x = tf.truediv(
                tf.cast(
                    tf.slice(tf.squeeze(self.x_ph), [0],
                             [tf.squeeze(self.x_len_ph)]), tf.float32),
                args.nconst)  # remove padding and normalise.
            self.x_DFT = feat.stft(
                self.x, args.Nw, args.Ns, args.NFFT
            )  # noisy speech single-sided short-time Fourier transform.
            self.x_MS_3D = tf.expand_dims(
                tf.abs(self.x_DFT), 0
            )  # noisy speech single-sided magnitude spectrum (in 3D form).
            self.x_MS = tf.abs(
                self.x_DFT)  # noisy speech single-sided magnitude spectrum.
            self.x_PS = tf.angle(
                self.x_DFT)  # noisy speech single-sided phase spectrum.
            self.x_seq_len = feat.nframes(self.x_len_ph,
                                          args.Ns)  # length of each sequence.

            ## MODIFICATION (SPEECH ENHANCEMENT)
            if args.gain == 'ibm':
                self.G = tf.cast(tf.greater(self.xi_hat_ph, 1),
                                 tf.float32)  # IBM gain function.
            if args.gain == 'wf':
                self.G = tf.truediv(self.xi_hat_ph,
                                    tf.add(self.xi_hat_ph,
                                           1.0))  # WF gain function.
            if args.gain == 'srwf':
                self.G = tf.sqrt(
                    tf.truediv(self.xi_hat_ph,
                               tf.add(self.xi_hat_ph,
                                      1.0)))  # SRWF gain function.
            if args.gain == 'irm':
                self.G = tf.sqrt(
                    tf.truediv(self.xi_hat_ph,
                               tf.add(self.xi_hat_ph,
                                      1.0)))  # IRM gain function.
            if args.gain == 'cwf':
                self.G = tf.sqrt(self.xi_hat_ph)
                self.G = tf.truediv(self.G, tf.add(self.G,
                                                   1.0))  # cWF gain function.
            self.s_hat_MS = tf.multiply(
                self.x_MS_2D_ph,
                self.G_ph)  # enhanced speech single-sided magnitude spectrum.

            ## SYNTHESIS GRAPH
            self.y_DFT = tf.cast(self.s_hat_MS, tf.complex64) * tf.exp(
                1j * tf.cast(self.x_PS_ph, tf.complex64)
            )  # enhanced speech single-sided short-time Fourier transform.
            self.y = tf.contrib.signal.inverse_stft(
                self.y_DFT, args.Nw, args.Ns, args.NFFT,
                tf.contrib.signal.inverse_stft_window_fn(
                    args.Ns,
                    forward_window_fn=tf.contrib.signal.hamming_window)
            )  # synthesis.
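
For comparison, the synthesis step (enhanced magnitude recombined with the noisy phase, then an inverse STFT) can be sketched outside TensorFlow. The helper below is an assumption-laden NumPy/SciPy approximation; synthesis_ref is an illustrative name, and SciPy's overlap-add and window normalisation differ in detail from tf.contrib.signal.inverse_stft_window_fn.

import numpy as np
from scipy.signal import istft

def synthesis_ref(s_hat_MS, x_PS, Nw, Ns, NFFT, fs):
    # Enhanced single-sided STFT: estimated magnitude with the noisy phase.
    y_DFT = s_hat_MS.astype(np.complex64) * np.exp(1j * x_PS)  # frames x bins.
    # SciPy expects (bins, frames); the overlap is Nw - Ns samples per frame.
    _, y = istft(y_DFT.T, fs=fs, window='hamming', nperseg=Nw,
                 noverlap=Nw - Ns, nfft=NFFT, input_onesided=True)
    return y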