def etdnn(features, params, is_training=None, reuse_variables=None, aux_features=None):
    """Build an extended TDNN network.

    The network is larger than the classic TDNN. Still, the architecture is similar
    to Kaldi, with some modifications.

    Args:
        features: A tensor with shape [batch, length, dim].
        params: Configuration loaded from a JSON.
        is_training: True if the network is used for training.
        reuse_variables: True if the network has already been built and variable reuse is enabled.
        aux_features: Auxiliary features (e.g. linguistic features or bottleneck features).
    :return:
        features: The output of the last layer.
        endpoints: An OrderedDict containing the output of every component. The outputs are stored
            in the order they are added to the network, so it is convenient to split the network
            by an output name.
    """
    tf.logging.info("Build an extended TDNN network.")

    # ReLU is the usual choice, while other activation functions are possible.
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        elif params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    endpoints = OrderedDict()
    with tf.variable_scope("etdnn", reuse=reuse_variables):
        # # Convert to [b, 1, l, d]
        # features = tf.expand_dims(features, 1)

        # Input features [b, l, d]; use 1-D conv instead of 2-D conv.
        # Layer 1: [-2, -1, 0, 1, 2] --> [b, l-4, 512]
        # conv1d + batchnorm + relu
        features = tf.layers.conv1d(
            features, 512, 5, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn1_conv')
        endpoints["tdnn1_conv"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn1_bn")
        endpoints["tdnn1_bn"] = features
        features = relu(features, name='tdnn1_relu')
        endpoints["tdnn1_relu"] = features

        # Layer 2: FC [b, l, 512] --> [b, l, 512]
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="tdnn2_dense")
        endpoints["tdnn2_dense"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn2_bn")
        endpoints["tdnn2_bn"] = features
        features = relu(features, name='tdnn2_relu')
        endpoints["tdnn2_relu"] = features

        # Layer 3: [-2, -1, 0, 1, 2] --> [b, l-4, 512]
        # conv1d + batchnorm + relu
        # This is slightly different from Kaldi, which uses dilated convolution.
        features = tf.layers.conv1d(
            features, 512, 5, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn3_conv')
        endpoints["tdnn3_conv"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn3_bn")
        endpoints["tdnn3_bn"] = features
        features = relu(features, name='tdnn3_relu')
        endpoints["tdnn3_relu"] = features

        # Layer 4: FC [b, l, 512] --> [b, l, 512]
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="tdnn4_dense")
        endpoints["tdnn4_dense"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn4_bn")
        endpoints["tdnn4_bn"] = features
        features = relu(features, name='tdnn4_relu')
        endpoints["tdnn4_relu"] = features

        # Layer 5: [-3, -2, -1, 0, 1, 2, 3] --> [b, l-6, 512]
        # conv1d + batchnorm + relu
        # Again, a non-dilated convolution is used.
        features = tf.layers.conv1d(
            features, 512, 7, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn5_conv')
        endpoints["tdnn5_conv"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn5_bn")
        endpoints["tdnn5_bn"] = features
        features = relu(features, name='tdnn5_relu')
        endpoints["tdnn5_relu"] = features

        # Layer 6: FC [b, l, 512] --> [b, l, 512]
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="tdnn6_dense")
        endpoints["tdnn6_dense"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn6_bn")
        endpoints["tdnn6_bn"] = features
        features = relu(features, name='tdnn6_relu')
        endpoints["tdnn6_relu"] = features

        # Layer 7: [-4, -3, -2, -1, 0, 1, 2, 3, 4] --> [b, l-8, 512]
        # conv1d + batchnorm + relu
        # Again, a non-dilated convolution is used.
        features = tf.layers.conv1d(
            features, 512, 9, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn7_conv')
        endpoints["tdnn7_conv"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn7_bn")
        endpoints["tdnn7_bn"] = features
        features = relu(features, name='tdnn7_relu')
        endpoints["tdnn7_relu"] = features

        # Layer 8: FC [b, l, 512] --> [b, l, 512]
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="tdnn8_dense")
        endpoints["tdnn8_dense"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn8_bn")
        endpoints["tdnn8_bn"] = features
        features = relu(features, name='tdnn8_relu')
        endpoints["tdnn8_relu"] = features

        # Layer 9: FC [b, l, 512] --> [b, l, 512]
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="tdnn9_dense")
        endpoints["tdnn9_dense"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn9_bn")
        endpoints["tdnn9_bn"] = features
        features = relu(features, name='tdnn9_relu')
        endpoints["tdnn9_relu"] = features

        if "num_nodes_pooling_layer" not in params.dict:
            # The default number of nodes before pooling
            params.dict["num_nodes_pooling_layer"] = 1500

        # Layer 10: FC [b, l, 512] --> [b, l, 1500]
        features = tf.layers.dense(
            features, params.num_nodes_pooling_layer, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="tdnn10_dense")
        endpoints["tdnn10_dense"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn10_bn")
        endpoints["tdnn10_bn"] = features
        features = relu(features, name='tdnn10_relu')
        endpoints["tdnn10_relu"] = features

        # Pooling layer
        features = general_pooling(features, aux_features, endpoints, params, is_training)

        # Utterance-level network
        # Layer 12: [b, 512]
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn12_dense')
        endpoints['tdnn12_dense'] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn12_bn")
        endpoints["tdnn12_bn"] = features
        features = relu(features, name='tdnn12_relu')
        endpoints["tdnn12_relu"] = features

        # Layer 13: [b, x]
        if "num_nodes_last_layer" not in params.dict:
            # The default number of nodes in the last layer
            params.dict["num_nodes_last_layer"] = 512

        features = tf.layers.dense(
            features, params.num_nodes_last_layer, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn13_dense')
        endpoints['tdnn13_dense'] = features

        if "last_layer_no_bn" not in params.dict:
            params.last_layer_no_bn = False

        if not params.last_layer_no_bn:
            features = tf.layers.batch_normalization(
                features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn13_bn")
            endpoints["tdnn13_bn"] = features

        if "last_layer_linear" not in params.dict:
            params.last_layer_linear = False

        if not params.last_layer_linear:
            # If the last layer is linear, no further activation is needed.
            features = relu(features, name='tdnn13_relu')
            endpoints["tdnn13_relu"] = features

    return features, endpoints
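
# The docstrings above note that `endpoints` is ordered so the network can be split at a named
# output. The helper below is a minimal sketch of that usage pattern; it is not part of the
# original code and the name `split_endpoints` is hypothetical.
def split_endpoints(endpoints, split_name):
    """Return the tensor stored under `split_name` together with a truncated OrderedDict that
    keeps only the outputs up to (and including) that name."""
    truncated = OrderedDict()
    for name, tensor in endpoints.items():
        truncated[name] = tensor
        if name == split_name:
            return tensor, truncated
    raise KeyError("No endpoint named %s" % split_name)

# Example (hypothetical): frame-level part of the E-TDNN up to the last layer before pooling.
# pooled_input, frame_level = split_endpoints(endpoints, "tdnn10_relu")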
def resnet_18(features, params, is_training=None, reuse_variables=None, aux_features=None):
    """Build a ResNet.

    Modified ResNet-18; the blocks are:
        [3/64, 3/64], [3/128, 3/128], [3/256, 3/256], [3/512, 3/512]
    The default number of blocks: [2, 2, 2, 2]
    The last 3 blocks can downsample the features.
    N fully-connected layers are appended to the output of the res blocks, so there are actually
    2 more layers than the standard ResNet implementation.
    The downsampling in ResNet-50 with a 1*1 kernel may lose frequency resolution.

    About the network parameters (batchnorm not included):
        TDNN: 2.6M (or 4.2M without dilation)
        ETDNN: 4.4M (or 7.6M without dilation)
        Modified FTDNN: 9.2M
        Modified EFTDNN: 32M
        FTDNN: 9.0M
        EFTDNN: 19.8M (much smaller than the modified EFTDNN)
        ResNet-18: 13.5M
        ResNet-34: 23.6M
        ResNet-50: 16.1M
        ResNet-101: 28.4M

    Args:
        features: A tensor with shape [batch, length, dim].
        params: Configuration loaded from a JSON.
        is_training: True if the network is used for training.
        reuse_variables: True if the network has already been built and variable reuse is enabled.
        aux_features: Auxiliary features (e.g. linguistic features or bottleneck features).
    :return:
        features: The output of the last layer.
        endpoints: An OrderedDict containing the output of every component. The outputs are stored
            in the order they are added to the network, so it is convenient to split the network
            by an output name.
    """
    # The strides only affect the last 3 conv blocks.
    time_stride = 2 if params.resnet_time_stride else 1

    # The dimension of the input features should be 40.
    assert (shape_list(features)[-1] == 40)
    tf.logging.info("Build a ResNet-18 network.")

    # ReLU is the usual choice, while other activation functions are possible.
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        elif params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    # The block parameters
    # default: [2, 2, 2, 2]
    if "resnet_blocks" not in params.dict:
        params.dict["resnet_blocks"] = [2, 2, 2, 2]
    tf.logging.info("The resnet blocks: [%d, %d, %d, %d]",
                    params.resnet_blocks[0], params.resnet_blocks[1],
                    params.resnet_blocks[2], params.resnet_blocks[3])

    endpoints = OrderedDict()
    with tf.variable_scope("resnet_18", reuse=reuse_variables):
        # features: [N, L, F, 1]
        # ndim = shape_list(features)[-1]
        features = tf.expand_dims(features, axis=3)

        # Since we use 40-dim FBanks, the kernel should be smaller.
        # First conv; no strides are applied.
        features = tf.layers.conv2d(
            features, 64, (3, 3), padding='same', activation=None, use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='conv0_1')
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="conv0_bn")
        features = relu(features, name='conv0_relu')

        if params.resnet_maxpooling:
            features = tf.layers.max_pooling2d(features, (3, 3), (1, 1), padding='same', name='conv0_max')

        # Conv Block 1
        features = conv_block(features, [[3, 3], [3, 3]], [64, 64], [1, 1],
                              params, is_training, relu, "conv1a")
        for i in range(params.resnet_blocks[0] - 1):
            features = identity_block(features, [[3, 3], [3, 3]], [64, 64],
                                      params, is_training, relu, "conv1b_%d" % i)

        # Conv Block 2
        features = conv_block(features, [[3, 3], [3, 3]], [128, 128], [time_stride, 2],
                              params, is_training, relu, "conv2a")
        for i in range(params.resnet_blocks[1] - 1):
            features = identity_block(features, [[3, 3], [3, 3]], [128, 128],
                                      params, is_training, relu, "conv2b_%d" % i)

        # Conv Block 3
        features = conv_block(features, [[3, 3], [3, 3]], [256, 256], [time_stride, 2],
                              params, is_training, relu, "conv3a")
        for i in range(params.resnet_blocks[2] - 1):
            features = identity_block(features, [[3, 3], [3, 3]], [256, 256],
                                      params, is_training, relu, "conv3b_%d" % i)

        # Conv Block 4
        features = conv_block(features, [[3, 3], [3, 3]], [512, 512], [time_stride, 2],
                              params, is_training, relu, "conv4a")
        for i in range(params.resnet_blocks[3] - 1):
            features = identity_block(features, [[3, 3], [3, 3]], [512, 512],
                                      params, is_training, relu, "conv4b_%d" % i)

        # features: [N, L/t, 5, 512]
        # The original ResNet uses average pooling to get [N, 512], which I think eliminates the
        # time resolution. Hence, in this implementation, we first obtain [N, L, 512] via a conv
        # layer and then use dense layers to process the features.
        features = tf.layers.conv2d(
            features, 512, (1, shape_list(features)[2]), activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='conv5')
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="conv5_bn")
        features = relu(features, name='conv5_relu')
        features = tf.squeeze(features, axis=2)

        # FC layers * 2
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='dense1')
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="dense1_bn")
        features = relu(features, name='dense1_relu')

        features = tf.layers.dense(
            features, 1500, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='dense2')
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="dense2_bn")
        features = relu(features, name='dense2_relu')

        # Compute the number of parameters (batchnorm not included).
        num_params = 3*3*64 + (2*3*3*64*64*params.resnet_blocks[0] + 64*64) + \
                     (3*3*64*128 + 3*3*128*128 + 64*128 + 2*3*3*128*128*(params.resnet_blocks[1]-1)) + \
                     (3*3*128*256 + 3*3*256*256 + 128*256 + 2*3*3*256*256*(params.resnet_blocks[2]-1)) + \
                     (3*3*256*512 + 3*3*512*512 + 256*512 + 2*3*3*512*512*(params.resnet_blocks[3]-1)) + \
                     (1*5*512*512 + 512*512 + 512*1500)
        tf.logging.info("The number of parameters of the frame-level network: %d" % num_params)
        num_layers = 4 + 2 * (params.resnet_blocks[0] + params.resnet_blocks[1] +
                              params.resnet_blocks[2] + params.resnet_blocks[3])
        tf.logging.info("The number of layers: %d" % num_layers)

        # Pooling
        features = general_pooling(features, aux_features, endpoints, params, is_training)

        # Utterance-level network
        # Layer 6: [b, 512]
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn6_dense')
        endpoints['tdnn6_dense'] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn6_bn")
        endpoints["tdnn6_bn"] = features
        features = relu(features, name='tdnn6_relu')
        endpoints["tdnn6_relu"] = features

        # Layer 7: [b, x]
        if "num_nodes_last_layer" not in params.dict:
            # The default number of nodes in the last layer
            params.dict["num_nodes_last_layer"] = 512

        features = tf.layers.dense(
            features, params.num_nodes_last_layer, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn7_dense')
        endpoints['tdnn7_dense'] = features

        if "last_layer_no_bn" not in params.dict:
            params.last_layer_no_bn = False

        if not params.last_layer_no_bn:
            features = tf.layers.batch_normalization(
                features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn7_bn")
            endpoints["tdnn7_bn"] = features

        if "last_layer_linear" not in params.dict:
            params.last_layer_linear = False

        if not params.last_layer_linear:
            # If the last layer is linear, no further activation is needed.
            features = relu(features, name='tdnn7_relu')
            endpoints["tdnn7_relu"] = features

    return features, endpoints
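
# A quick sanity check (not part of the original code) that the closed-form parameter count used
# inside resnet_18 reproduces the ~13.5M figure quoted in its docstring for the default blocks
# [2, 2, 2, 2]. The function name is hypothetical and simply mirrors the in-graph formula above.
def _resnet18_frame_level_params(blocks=(2, 2, 2, 2)):
    """Weights-only parameter count of the frame-level ResNet (batchnorm not included)."""
    return (3*3*64 +
            (2*3*3*64*64*blocks[0] + 64*64) +
            (3*3*64*128 + 3*3*128*128 + 64*128 + 2*3*3*128*128*(blocks[1]-1)) +
            (3*3*128*256 + 3*3*256*256 + 128*256 + 2*3*3*256*256*(blocks[2]-1)) +
            (3*3*256*512 + 3*3*512*512 + 256*512 + 2*3*3*512*512*(blocks[3]-1)) +
            (1*5*512*512 + 512*512 + 512*1500))

# _resnet18_frame_level_params() == 13503040, i.e. roughly the 13.5M quoted for ResNet-18.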
def tdnns(features, params, is_training=None, reuse_variables=None, aux_features=None):
    """Build a TDNN network.

    The structure is similar to Kaldi, but it uses bn+relu rather than relu+bn. No dilation is
    used, so it has more parameters than the Kaldi x-vector.

    Args:
        features: A tensor with shape [batch, length, dim].
        params: Configuration loaded from a JSON.
        is_training: True if the network is used for training.
        reuse_variables: True if the network has already been built and variable reuse is enabled.
        aux_features: Auxiliary features (e.g. linguistic features or bottleneck features).
    :return:
        features: The output of the last layer.
        endpoints: An OrderedDict containing the output of every component. The outputs are stored
            in the order they are added to the network, so it is convenient to split the network
            by an output name.
    """
    # This network is deprecated.
    tf.logging.error("Do not use this one!")
    quit()

    # ReLU is the usual choice, while other activation functions are possible.
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        elif params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    endpoints = OrderedDict()
    with tf.variable_scope("tdnn", reuse=reuse_variables):
        # Convert to [b, 1, l, d]
        features = tf.expand_dims(features, 1)

        # Layer 1: [-2, -1, 0, 1, 2] --> [b, 1, l-4, 512]
        # conv2d + batchnorm + relu
        features = tf.layers.conv2d(
            features, 512, (1, 5), activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn1_conv')
        endpoints["tdnn1_conv"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn1_bn")
        endpoints["tdnn1_bn"] = features
        features = relu(features, name='tdnn1_relu')
        endpoints["tdnn1_relu"] = features

        # Layer 2: [-2, -1, 0, 1, 2] --> [b, 1, l-4, 512]
        # conv2d + batchnorm + relu
        # This is slightly different from Kaldi, which uses dilated convolution.
        features = tf.layers.conv2d(
            features, 512, (1, 5), activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn2_conv')
        endpoints["tdnn2_conv"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn2_bn")
        endpoints["tdnn2_bn"] = features
        features = relu(features, name='tdnn2_relu')
        endpoints["tdnn2_relu"] = features

        # Layer 3: [-3, -2, -1, 0, 1, 2, 3] --> [b, 1, l-6, 512]
        # conv2d + batchnorm + relu
        # Again, a non-dilated convolution is used.
        features = tf.layers.conv2d(
            features, 512, (1, 7), activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn3_conv')
        endpoints["tdnn3_conv"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn3_bn")
        endpoints["tdnn3_bn"] = features
        features = relu(features, name='tdnn3_relu')
        endpoints["tdnn3_relu"] = features

        # Convert to [b, l, 512]; the output of the 3rd layer can simply be rank 3.
        features = tf.squeeze(features, axis=1)
        endpoints["tdnn3_relu"] = features

        # Layer 4: [b, l, 512] --> [b, l, 512]
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="tdnn4_dense")
        endpoints["tdnn4_dense"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn4_bn")
        endpoints["tdnn4_bn"] = features
        features = relu(features, name='tdnn4_relu')
        endpoints["tdnn4_relu"] = features

        # Layer 5: [b, l, x]
        if "num_nodes_pooling_layer" not in params.dict:
            # The default number of nodes before pooling
            params.dict["num_nodes_pooling_layer"] = 1500

        features = tf.layers.dense(
            features, params.num_nodes_pooling_layer, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="tdnn5_dense")
        endpoints["tdnn5_dense"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn5_bn")
        endpoints["tdnn5_bn"] = features
        features = relu(features, name='tdnn5_relu')
        endpoints["tdnn5_relu"] = features

        # Pooling layer
        features = general_pooling(features, aux_features, endpoints, params, is_training)

        # Layer 7: [b, x]
        if "num_nodes_last_layer" not in params.dict:
            # The default number of nodes in the last layer
            params.dict["num_nodes_last_layer"] = 512

        features = tf.layers.dense(
            features, params.num_nodes_last_layer, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn7_dense')
        endpoints['tdnn7_dense'] = features

        if "last_layer_no_bn" not in params.dict:
            params.last_layer_no_bn = False

        if not params.last_layer_no_bn:
            features = tf.layers.batch_normalization(
                features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn7_bn")
            endpoints["tdnn7_bn"] = features

        if "last_layer_linear" not in params.dict:
            params.last_layer_linear = False

        if not params.last_layer_linear:
            # If the last layer is linear, no further activation is needed.
            features = relu(features, name='tdnn7_relu')
            endpoints["tdnn7_relu"] = features

    return features, endpoints
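
# A minimal sketch (not part of the original code) of how `reuse_variables` is meant to be used
# with the networks above: build a training tower first, then build an evaluation tower that
# shares the same variables. The helper name and the 30-dim placeholders are assumptions for
# illustration only; `params` is the JSON-backed configuration object used throughout this file.
def build_etdnn_train_eval_towers(params, feature_dim=30):
    train_features = tf.placeholder(tf.float32, [None, None, feature_dim], name="train_features")
    eval_features = tf.placeholder(tf.float32, [None, None, feature_dim], name="eval_features")
    # The first call creates the variables under the "etdnn" scope.
    train_out, train_endpoints = etdnn(train_features, params, is_training=True, reuse_variables=None)
    # The second call reuses them, so both towers share the same weights.
    eval_out, eval_endpoints = etdnn(eval_features, params, is_training=False, reuse_variables=True)
    return (train_out, train_endpoints), (eval_out, eval_endpoints)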