def prediction(self):
    # Focus network
    shape = tf.shape(self.X)
    channels = []
    for i in range(self.S):
        # Look up the focus vector of speaker i and broadcast it over time
        V = tf.gather(self.speaker_focus_vector, self.I[:, i])  # [B, focus_dim]
        V = tf.tile(tf.reshape(V, [-1, 1, self.focus_dim]),
                    [1, tf.shape(self.X)[-2], 1])
        layers = [
            BLSTM(self.layer_size, 'BLSTM_' + str(i) + '_' + str(j))
            for j in range(self.nb_layers)
        ]
        layers_sp = [
            Conv1D([1, self.layer_size, self.embedding_size * self.F]),
            Reshape([self.B, shape[1], self.F, self.embedding_size]),
            Normalize(3)
        ]
        layers += layers_sp
        # Condition the network on the speaker by concatenating the focus
        # vector to every input frame
        input_ = tf.concat([self.X, V], -1)
        channels.append(f_props(layers, input_))
    return channels
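# `f_props` is shared by every variant in this file but defined elsewhere.
# A minimal sketch of what it presumably does, given that layer objects
# expose an `f_prop(x)` method (see the explicit `.f_prop` calls in the
# dilated variant below) - this is an assumption, not the repo's code:
def f_props(layers, x):
    # Chain the layers: feed each layer's output into the next one.
    for layer in layers:
        x = layer.f_prop(x)
    return x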
def prediction(self):
    # L41 network
    shape = tf.shape(self.X)
    # Assuming self.y holds per-speaker labels in {-1, +1}, this maps the
    # "true" masks to {0, 2}
    self.true_masks = 1.0 + self.y
    X_in = tf.identity(self.X)
    layers = [
        BLSTM(self.layer_size, 'BLSTM_' + str(i))
        for i in range(self.nb_layers)
    ]
    layers_sp = [
        Conv1D([1, self.layer_size, self.embedding_size * self.F]),
        Reshape([self.B, shape[1], self.F, self.embedding_size])
    ]
    if self.normalize:
        layers_sp += [Normalize(3)]
    layers += layers_sp
    y = f_props(layers, X_in)
    return y
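# `Normalize(3)` is only appended when `self.normalize` is set. A plausible
# minimal implementation, assuming it l2-normalizes the embeddings along the
# given axis (consistent with the tf.nn.l2_normalize call in the dilated
# variant below) - hypothetical, the real class lives elsewhere:
import tensorflow as tf

class Normalize:
    def __init__(self, axis):
        self.axis = axis

    def f_prop(self, x):
        # Unit-norm the embedding vectors so distances depend on angle only.
        return tf.nn.l2_normalize(x, self.axis)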
def prediction(self):
    # L41 network, dilated-convolution front end
    self.true_masks = 1.0 + self.y
    X_in = tf.identity(self.X)
    if self.abs_input:
        X_in = tf.abs(X_in)
    if self.normalize_input == '01':
        # Min-max scale each utterance to [0, 1]
        self.min_ = tf.reduce_min(X_in, axis=[1, 2], keep_dims=True)
        self.max_ = tf.reduce_max(X_in, axis=[1, 2], keep_dims=True)
        X_in = (X_in - self.min_) / (self.max_ - self.min_)
    elif self.normalize_input == 'meanstd':
        # Standardize each utterance over time and frequency
        mean, var = tf.nn.moments(X_in, axes=[1, 2], keep_dims=True)
        X_in = (X_in - mean) / tf.sqrt(var)

    # Dilated 2D convolution stack over the [T=80, F=256] spectrogram
    f = 128
    X_in = tf.reshape(X_in, [-1, 80, 256, 1])
    y = tf.contrib.layers.conv2d(X_in, f, [1, 7], rate=[1, 1])
    y = tf.contrib.layers.conv2d(y, f, [7, 1], rate=[1, 1])
    y = tf.contrib.layers.conv2d(y, f, [5, 5], rate=[4, 1])
    y = tf.contrib.layers.conv2d(y, f, [5, 5], rate=[8, 1])
    y = tf.contrib.layers.conv2d(y, f, [5, 5], rate=[16, 1])
    y = tf.contrib.layers.conv2d(y, f, [5, 5], rate=[32, 1])
    y = tf.contrib.layers.conv2d(y, f, [5, 5], rate=[1, 1])
    y = tf.contrib.layers.conv2d(y, f, [5, 5], rate=[2, 2])
    y = tf.contrib.layers.conv2d(y, f, [5, 5], rate=[4, 4])
    y = tf.contrib.layers.conv2d(y, f, [5, 5], rate=[8, 8])
    y = tf.contrib.layers.conv2d(y, f, [5, 5], rate=[16, 16])
    y = tf.contrib.layers.conv2d(y, f, [5, 5], rate=[32, 32])
    y = tf.contrib.layers.conv2d(y, 4, [5, 5], rate=[1, 1])

    # Fold the 4 conv channels into the feature axis and run the BLSTM stack
    y = tf.reshape(y, [self.B, 80, 256 * 4])
    y = BLSTM(400, 'BLSTM_1').f_prop(y)
    y = BLSTM(400, 'BLSTM_2').f_prop(y)
    y = BLSTM(400, 'BLSTM_3').f_prop(y)
    y = Conv1D([1, 400, self.embedding_size * self.F]).f_prop(y)
    y = tf.reshape(y, [self.B, 80, self.F, self.embedding_size])
    if self.normalize:
        # Was `y += tf.nn.l2_normalize(y)`: adding the normalized tensor to
        # itself is almost certainly a bug; normalize the embeddings along
        # the embedding axis instead, as the other variants do via Normalize(3)
        y = tf.nn.l2_normalize(y, 3)
    return y
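# Back-of-the-envelope check of the dilated stack's receptive field along
# time (assumption: 'SAME' padding and stride 1, the tf.contrib.layers.conv2d
# defaults, so each k-tap conv with dilation r adds (k - 1) * r frames):
def receptive_field(taps_and_rates):
    rf = 1
    for k, r in taps_and_rates:
        rf += (k - 1) * r
    return rf

time_axis = [(1, 1), (7, 1), (5, 4), (5, 8), (5, 16), (5, 32),
             (5, 1), (5, 2), (5, 4), (5, 8), (5, 16), (5, 32), (5, 1)]
print(receptive_field(time_axis))  # 503 frames - far wider than the 80-frame input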
def enhance(self):
    # separated: [B, S, T, F]
    separated = tf.reshape(self.separate, [self.B, self.S, -1, self.F])

    # X: [B, T, F]. Tile the input S times along a new speaker axis -
    # like [a, b, c] -> [a, a, b, b, c, c], not [a, b, c, a, b, c]
    X_in = tf.expand_dims(self.X_input, 1)
    X_in = tf.tile(X_in, [1, self.S, 1, 1])
    X_in = tf.reshape(X_in, [self.B, self.S, -1, self.F])

    # Concatenate the separated estimates with the tiled mixture input
    sep_and_in = tf.concat([separated, X_in], axis=3)
    sep_and_in = tf.reshape(sep_and_in, [self.B * self.S, -1, 2 * self.F])

    if self.args['normalize_enhance']:
        mean, var = tf.nn.moments(sep_and_in, axes=[1, 2], keep_dims=True)
        sep_and_in = (sep_and_in - mean) / tf.sqrt(var)

    layers = [
        BLSTM(self.args['layer_size_enhance'],
              drop_val=self.args['recurrent_dropout_enhance'],
              name='BLSTM_' + str(i))
        for i in range(self.args['nb_layers_enhance'])
    ]
    layers += [Conv1D([1, self.args['layer_size_enhance'], self.F])]
    y = f_props(layers, sep_and_in)

    y = tf.reshape(y, [self.B, self.S, -1])  # [B, S, TF]
    tf.summary.image('mask/predicted/enhanced',
                     tf.reshape(y, [self.B * self.S, -1, self.F, 1]))
    y = tf.transpose(y, [0, 2, 1])  # [B, TF, S]

    if self.args['nonlinearity'] == 'softmax':
        y = tf.nn.softmax(y)
    elif self.args['nonlinearity'] == 'tanh':
        y = tf.nn.tanh(y)

    self.enhanced_masks = tf.identity(y, name='enhanced_masks')
    tf.summary.image('mask/predicted/enhanced_soft',
                     tf.reshape(tf.transpose(y, [0, 2, 1]),
                                [self.B * self.S, -1, self.F, 1]))

    # Apply the enhanced masks to the mixture: [B, TF, S]
    y = y * tf.reshape(self.X_input, [self.B, -1, 1])
    self.cost_in = y
    y = tf.transpose(y, [0, 2, 1])  # [B, S, TF]
    self.separated = y
    return tf.reshape(y, [self.B * self.S, -1, self.F, 1])
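# Standalone check of the tile-then-reshape pattern used above: each batch
# item is repeated S times contiguously, which is what lets the later
# reshape to [B * S, ...] keep (mixture, speaker) pairs aligned:
import numpy as np

x = np.array([1.0, 2.0, 3.0]).reshape(3, 1, 1)  # stand-in for [B=3, T, F]
tiled = np.tile(x[:, None], [1, 2, 1, 1])       # insert S=2 axis and tile
print(tiled.reshape(-1))                        # [1. 1. 2. 2. 3. 3.], not [1. 2. 3. 1. 2. 3.]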
def prediction(self):
    # DPCL network
    shape = tf.shape(self.X)
    layers = [
        BLSTM(self.layer_size, 'BLSTM_' + str(i))
        for i in range(self.nb_layers)
    ]
    layers_sp = [
        Conv1D([1, self.layer_size, self.embedding_size * self.F]),
        Reshape([self.B, shape[1], self.F, self.embedding_size]),
        Normalize(3)
    ]
    layers += layers_sp
    y = f_props(layers, self.X)
    return y
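# The DPCL network only emits embeddings; at test time each T-F bin is
# typically clustered into S groups (e.g. with k-means) and the assignments
# become binary masks (Hershey et al., 2016). A minimal sketch of that step -
# `masks_from_embeddings` is illustrative, not this repo's API:
import numpy as np
from sklearn.cluster import KMeans

def masks_from_embeddings(emb, S):
    # emb: [T, F, D] embeddings for one utterance
    T, F, D = emb.shape
    labels = KMeans(n_clusters=S).fit_predict(emb.reshape(T * F, D))
    return np.eye(S)[labels].reshape(T, F, S)  # one-hot binary masks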
def enhance(self):
    # separated: [B, S, T, F], min-max scaled to [0, 1] per utterance
    separated = tf.reshape(self.separate, [self.B, self.S, -1, self.F])
    min_ = tf.reduce_min(separated, axis=[1, 2], keep_dims=True)
    max_ = tf.reduce_max(separated, axis=[1, 2], keep_dims=True)
    separated = (separated - min_) / (max_ - min_)

    # X: [B, T, F]. Tile the input S times along a new speaker axis -
    # like [a, b, c] -> [a, a, b, b, c, c], not [a, b, c, a, b, c]
    X_in = tf.expand_dims(self.X, 1)
    X_in = tf.tile(X_in, [1, self.S, 1, 1])
    X_in = tf.reshape(X_in, [self.B, self.S, -1, self.F])

    # Concatenate the binary separated input with the tiled mixture input
    sep_and_in = tf.concat([separated, X_in], axis=3)
    sep_and_in = tf.reshape(sep_and_in, [self.B * self.S, -1, 2 * self.F])

    layers = [
        BLSTM(self.args['layer_size_enhance'], 'BLSTM_' + str(i))
        for i in range(self.args['nb_layers_enhance'])
    ]
    y = f_props(layers, sep_and_in)
    y = tf.layers.dense(y, self.F)

    y = tf.reshape(y, [self.B, self.S, -1])  # [B, S, TF]
    y = tf.transpose(y, [0, 2, 1])  # [B, TF, S]

    # Apply the enhanced masks to the mixture  # [B, TF, S] -> [BS, T, F, 1]
    if self.args['nonlinearity'] == 'softmax':
        y = tf.nn.softmax(y) * tf.reshape(self.X, [self.B, -1, 1])
    elif self.args['nonlinearity'] == 'tanh':
        y = tf.nn.tanh(y) * tf.reshape(self.X, [self.B, -1, 1])

    self.cost_in = y
    y = tf.transpose(y, [0, 2, 1])  # [B, S, TF]
    return tf.reshape(y, [self.B * self.S, -1, self.F, 1])
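# Design note: after the transpose to [B, TF, S], tf.nn.softmax runs over
# the last (speaker) axis, so at every T-F bin the S mask values sum to 1
# and the masked estimates sum back to the mixture. Standalone check:
import numpy as np

logits = np.array([2.0, 0.5])              # one T-F bin, S = 2
masks = np.exp(logits) / np.exp(logits).sum()
print(masks, masks.sum())                  # ~[0.82 0.18], sums to 1.0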
def prediction(self):
    # L41 network with recurrent dropout
    shape = tf.shape(self.X)
    self.true_masks = 1.0 + self.y
    X_in = tf.identity(self.X)
    layers = [
        BLSTM(self.layer_size, name='BLSTM_' + str(i), drop_val=self.rdropout)
        for i in range(self.nb_layers)
    ]
    layers_sp = [
        Conv1D([1, self.layer_size, self.embedding_size * self.F]),
        Reshape([self.B, shape[1], self.F, self.embedding_size]),
    ]
    layers += layers_sp
    y = f_props(layers, X_in)
    return y
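# `drop_val` suggests the BLSTM applies dropout to its recurrent cells. One
# plausible TF 1.x reading, purely illustrative (the real BLSTM class with
# its `drop_val` argument is defined elsewhere in the repo):
import tensorflow as tf

def dropped_lstm_cell(layer_size, drop_val):
    cell = tf.nn.rnn_cell.LSTMCell(layer_size)
    # Keep outputs with probability 1 - drop_val during training.
    return tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=1.0 - drop_val)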
def prediction(self):
    # L41 network with optional input normalization
    shape = tf.shape(self.X)
    self.true_masks = 1.0 + self.y
    X_in = tf.identity(self.X)
    if self.abs_input:
        X_in = tf.abs(X_in)
    if self.normalize_input == '01':
        # Min-max scale each utterance to [0, 1]
        self.min_ = tf.reduce_min(X_in, axis=[1, 2], keep_dims=True)
        self.max_ = tf.reduce_max(X_in, axis=[1, 2], keep_dims=True)
        X_in = (X_in - self.min_) / (self.max_ - self.min_)
    elif self.normalize_input == 'meanstd':
        # Standardize each utterance over time and frequency
        mean, var = tf.nn.moments(X_in, axes=[1, 2], keep_dims=True)
        X_in = (X_in - mean) / tf.sqrt(var)
    layers = [
        BLSTM(self.layer_size, 'BLSTM_' + str(i))
        for i in range(self.nb_layers)
    ]
    layers_sp = [
        Conv1D([1, self.layer_size, self.embedding_size * self.F]),
        Reshape([self.B, shape[1], self.F, self.embedding_size])
    ]
    if self.normalize:
        layers_sp += [Normalize(3)]
    layers += layers_sp
    y = f_props(layers, X_in)
    return y
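# Standalone check of the two input-normalization modes above ('01' min-max
# scaling vs 'meanstd' standardization), computed per utterance over the
# time and frequency axes. Note: `keep_dims` is the pre-TF-1.5 spelling of
# `keepdims`; newer TF versions want the latter.
import numpy as np

x = np.array([[[1.0, 2.0], [3.0, 5.0]]])    # [B=1, T=2, F=2]
print((x - x.min()) / (x.max() - x.min()))  # '01': values land in [0, 1]
print((x - x.mean()) / x.std())             # 'meanstd': zero mean, unit variance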