class Tdnn(nn.Module):
    # Frame-level TDNN layers followed by statistics pooling and utterance-level
    # layers (assumes "import torch.nn as nn" and a statistics_pooling module).
    def __init__(self, pooling_type, num_speakers):
        super(Tdnn, self).__init__()

        # Only statistics pooling is supported here; fail fast on anything else so
        # that self.pooling_layer is never left undefined.
        if pooling_type == "statistics_pooling":
            self.pooling_layer = statistics_pooling()
        else:
            raise NotImplementedError("Pooling type %s is not supported" % pooling_type)
        self.num_speakers = num_speakers

        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=30, out_channels=512, kernel_size=(5, 1)),
            nn.BatchNorm2d(512), nn.ReLU())
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(5, 1)),
            nn.BatchNorm2d(512), nn.ReLU())

        self.layer3 = nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(7, 1)),
            nn.BatchNorm2d(512), nn.ReLU())

        self.layer4 = nn.Sequential(
            nn.Linear(in_features=512, out_features=512), nn.BatchNorm1d(512),
            nn.ReLU())

        self.layer5 = nn.Sequential(
            nn.Linear(in_features=512, out_features=1500),
            nn.BatchNorm1d(1500), nn.ReLU())

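        # layer6 takes 3000 inputs because statistics pooling concatenates the
        # mean and standard deviation of the 1500-dim layer5 output.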
        self.layer6 = nn.Linear(in_features=3000, out_features=512)
        self.layer6_bn = nn.BatchNorm1d(512)
        self.layer6_act = nn.ReLU()

        self.layer7 = nn.Sequential(
            nn.Linear(in_features=512, out_features=512), nn.BatchNorm1d(512),
            nn.ReLU())
        self.softmax_layer = nn.Linear(in_features=512,
                                       out_features=num_speakers)
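
# A minimal sketch of the statistics-pooling module instantiated above; the class
# name and exact details are illustrative assumptions, not this repository's
# implementation.
import torch
import torch.nn as nn


class StatisticsPooling(nn.Module):
    """Concatenate the mean and standard deviation over the time axis."""

    def forward(self, x):
        # x: [batch, time, 1500] -> [batch, 3000]
        mean = x.mean(dim=1)
        std = x.std(dim=1)
        return torch.cat([mean, std], dim=1)

# Example: StatisticsPooling()(torch.randn(4, 200, 1500)) has shape [4, 3000].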
def tdnn_svd(features,
             params,
             is_training=None,
             reuse_variables=None,
             aux_features=None):
    """Build a TDNN network.
    The structure is similar to Kaldi, while it uses bn+relu rather than relu+bn.
    And there is no dilation used, so it has more parameters than Kaldi x-vector.

    Args:
        features: A tensor with shape [batch, length, dim].
        params: Configuration loaded from a JSON.
        svd_params: Configuration to point out which layers to be svd loaded from a JSON.
                    And it should be updated by function "update_mid_channels" before passing to "svdtdnn".
        is_training: True if the network is used for training.
        reuse_variables: True if the network has been built and enable variable reuse.
        aux_features: Auxiliary features (e.g. linguistic features or bottleneck features).
    :return:
        features: The output of the last layer.
        endpoints: An OrderedDict containing output of every components. The outputs are in the order that they add to
                   the network. Thus it is convenient to split the network by a output name
    """
    name = 'tdnn_svd'
    # The SVD configuration is loaded from a fixed JSON path rather than passed in;
    # it must have been updated by "update_mid_channels" before the graph is built.
    svd_json_path = '/data2/liry/test/tf-kaldi-speaker/model/hello.json'
    svd_params = Params(svd_json_path)

    assert svd_params.updated, "svd_params must be updated by update_mid_channels first"
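    # For reference, svd_params is expected to look roughly like the following
    # (keys mirror the layer names used below; the numbers are hypothetical):
    #   {
    #     "updated": true,
    #     "split":        {"tdnn1_conv": true,  "tdnn2_conv": false, ...},
    #     "mid_channels": {"tdnn1_conv": 16,    "tdnn2_conv": -1,    ...}
    #   }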

    # ReLU is the default choice; other activation functions are possible.
    relu = tf.nn.relu

    for layer in svd_params.split:
        if svd_params.split[layer] and svd_params.mid_channels[layer] == -1:
            raise AttributeError(
                'Please update the mid_channels of %s before constructing the graph'
                % layer)

    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        if params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    endpoints = OrderedDict()
    with tf.variable_scope(name, reuse=reuse_variables):
        # Convert to [b, 1, l, d]
        features = tf.expand_dims(features, 1)

        # Layer 1: [-2,-1,0,1,2] --> [b, 1, l-4, 32]
        # conv2d + batchnorm + relu
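        # When a layer is marked for splitting, the original (1, 5) convolution is
        # replaced by a low-rank pair: a (1, 5) conv down to mid_channels followed by
        # a (1, 1) conv back up to the full output width, which cuts the parameter
        # count when mid_channels is small.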
        if svd_params.split['tdnn1_conv']:
            features = tf.layers.conv2d(
                features,
                svd_params.mid_channels['tdnn1_conv'], (1, 5),
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn1.0_conv',
                bias_initializer=tf.zeros_initializer())
            endpoints["tdnn1.0_conv"] = features
            features = tf.layers.conv2d(
                features,
                32, (1, 1),
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn1.5_conv')
            endpoints["tdnn1.5_conv"] = features
        else:
            features = tf.layers.conv2d(
                features,
                32, (1, 5),
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn1_conv')
            endpoints["tdnn1_conv"] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="tdnn1_bn")
        endpoints["tdnn1_bn"] = features
        features = relu(features, name='tdnn1_relu')
        endpoints["tdnn1_relu"] = features

        # Layer 2: [-2, -1, 0, 1, 2] --> [b, 1, l-4, 32]
        # conv2d + batchnorm + relu
        # This differs slightly from Kaldi, which uses a dilated convolution here.
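        # As a hypothetical example of the saving: a (1, 5) conv with 32 input and
        # 32 output channels has 32*5*32 + 32 = 5152 parameters, while the split pair
        # with mid_channels = 8 has 32*5*8 + 8 + 8*32 + 32 = 1576.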
        if svd_params.split['tdnn2_conv']:
            features = tf.layers.conv2d(
                features,
                svd_params.mid_channels['tdnn2_conv'], (1, 5),
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn2.0_conv',
                bias_initializer=tf.zeros_initializer())
            endpoints["tdnn2.0_conv"] = features
            features = tf.layers.conv2d(
                features,
                32, (1, 1),
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn2.5_conv')
            endpoints["tdnn2.5_conv"] = features
        else:
            features = tf.layers.conv2d(
                features,
                32, (1, 5),
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn2_conv')
            endpoints["tdnn2_conv"] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="tdnn2_bn")
        endpoints["tdnn2_bn"] = features
        features = relu(features, name='tdnn2_relu')
        endpoints["tdnn2_relu"] = features

        # Layer 3: [-3, -2, -1, 0, 1, 2, 3] --> [b, 1, l-6, 32]
        # conv2d + batchnorm + relu
        # Again, no dilation is used.
        if svd_params.split['tdnn3_conv']:
            features = tf.layers.conv2d(
                features,
                svd_params.mid_channels['tdnn3_conv'], (1, 7),
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn3.0_conv',
                bias_initializer=tf.zeros_initializer())
            endpoints["tdnn3.0_conv"] = features
            features = tf.layers.conv2d(
                features,
                32, (1, 1),
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn3.5_conv')
            endpoints["tdnn3.5_conv"] = features
        else:
            features = tf.layers.conv2d(
                features,
                32, (1, 7),
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn3_conv')
            endpoints["tdnn3_conv"] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="tdnn3_bn")
        endpoints["tdnn3_bn"] = features
        features = relu(features, name='tdnn3_relu')
        endpoints["tdnn3_relu"] = features

        # Convert to [b, l, 32]
        features = tf.squeeze(features, axis=1)
        # The output of the 3rd layer can simply be rank 3, so overwrite its endpoint
        # with the squeezed tensor.
        endpoints["tdnn3_relu"] = features

        # Layer 4: [b, l, 32] --> [b, l, 512]
        if svd_params.split['tdnn4_dense']:
            features = tf.layers.dense(
                features,
                svd_params.mid_channels['tdnn4_dense'],
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name="tdnn4.0_dense",
                bias_initializer=tf.zeros_initializer())
            endpoints["tdnn4.0_dense"] = features
            features = tf.layers.dense(
                features,
                512,
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name="tdnn4.5_dense")
            endpoints["tdnn4.5_dense"] = features
        else:
            features = tf.layers.dense(
                features,
                512,
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name="tdnn4_dense")
            endpoints["tdnn4_dense"] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="tdnn4_bn")
        endpoints["tdnn4_bn"] = features
        features = relu(features, name='tdnn4_relu')
        endpoints["tdnn4_relu"] = features

        # Layer 5: [b, l, x]
        if "num_nodes_pooling_layer" not in params.dict:
            # The default number of nodes before pooling
            params.dict["num_nodes_pooling_layer"] = 1500

        if svd_params.split['tdnn5_dense']:
            features = tf.layers.dense(
                features,
                svd_params.mid_channels['tdnn5_dense'],
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name="tdnn5.0_dense",
                bias_initializer=tf.zeros_initializer())
            endpoints["tdnn5.0_dense"] = features
            features = tf.layers.dense(
                features,
                params.num_nodes_pooling_layer,
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name="tdnn5.5_dense")
            endpoints["tdnn5.5_dense"] = features
        else:
            features = tf.layers.dense(
                features,
                params.num_nodes_pooling_layer,
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name="tdnn5_dense")
            endpoints["tdnn5_dense"] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="tdnn5_bn")
        endpoints["tdnn5_bn"] = features
        features = relu(features, name='tdnn5_relu')
        endpoints["tdnn5_relu"] = features

        # Pooling layer: [b, l, 1500] --> [b, x]
        # If you add a new pooling layer, extend the dispatch below.
        if params.pooling_type == "statistics_pooling":
            features = statistics_pooling(features, aux_features, endpoints,
                                          params, is_training)
        elif params.pooling_type == "self_attention":
            features = self_attention(features, aux_features, endpoints,
                                      params, is_training)
        elif params.pooling_type == "ghost_vlad":
            features = ghost_vlad(features, aux_features, endpoints, params,
                                  is_training)
        # elif params.pooling_type == "aux_attention":
        #     features = aux_attention(features, aux_features, endpoints, params, is_training)
        else:
            raise NotImplementedError("Pooling type %s is not implemented" %
                                      params.pooling_type)
        endpoints['pooling'] = features

        # Utterance-level network
        # Layer 6: [b, 512]
        if svd_params.split['tdnn6_dense']:
            features = tf.layers.dense(
                features,
                svd_params.mid_channels['tdnn6_dense'],
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn6.0_dense',
                bias_initializer=tf.zeros_initializer())
            endpoints['tdnn6.0_dense'] = features
            features = tf.layers.dense(
                features,
                512,
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn6.5_dense')
            endpoints['tdnn6.5_dense'] = features
        else:
            features = tf.layers.dense(
                features,
                512,
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn6_dense')
            endpoints['tdnn6_dense'] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="tdnn6_bn")
        endpoints["tdnn6_bn"] = features
        features = relu(features, name='tdnn6_relu')
        endpoints["tdnn6_relu"] = features

        # Layer 7: [b, x]
        if "num_nodes_last_layer" not in params.dict:
            # The default number of nodes in the last layer
            params.dict["num_nodes_last_layer"] = 512

        if svd_params.split['tdnn7_dense']:
            features = tf.layers.dense(
                features,
                svd_params.mid_channels['tdnn7_dense'],
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn7.0_dense',
                bias_initializer=tf.zeros_initializer())
            endpoints['tdnn7.0_dense'] = features
            features = tf.layers.dense(
                features,
                params.num_nodes_last_layer,
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn7.5_dense')
            endpoints['tdnn7.5_dense'] = features
        else:
            features = tf.layers.dense(
                features,
                params.num_nodes_last_layer,
                activation=None,
                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                    params.weight_l2_regularizer),
                name='tdnn7_dense')
            endpoints['tdnn7_dense'] = features

        if "last_layer_no_bn" not in params.dict:
            params.last_layer_no_bn = False

        if not params.last_layer_no_bn:
            features = tf.layers.batch_normalization(
                features,
                momentum=params.batchnorm_momentum,
                training=is_training,
                name="tdnn7_bn")
            endpoints["tdnn7_bn"] = features

        if "last_layer_linear" not in params.dict:
            params.last_layer_linear = False

        if not params.last_layer_linear:
            # If the last layer is linear, no further activation is needed.
            features = relu(features, name='tdnn7_relu')
            endpoints["tdnn7_relu"] = features

    return features, endpoints
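
# A minimal usage sketch for the function above (the placeholder shape, the Params
# object and the choice of "tdnn6_dense" as the embedding endpoint are assumptions
# for illustration, not a prescribed interface):
#
#   features_ph = tf.placeholder(tf.float32, [None, None, feat_dim])
#   outputs, endpoints = tdnn_svd(features_ph, params, is_training=True)
#   embedding = endpoints["tdnn6_dense"]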
def tdnn_distill(features, params, is_training=None, reuse_variables=None, aux_features=None):
    """Build a TDNN network.
    The structure is similar to Kaldi, while it uses bn+relu rather than relu+bn.
    And there is no dilation used, so it has more parameters than Kaldi x-vector.

    Args:
        features: A tensor with shape [batch, length, dim].
        params: Configuration loaded from a JSON.
        is_training: True if the network is used for training.
        reuse_variables: True if the network has been built and enable variable reuse.
        aux_features: Auxiliary features (e.g. linguistic features or bottleneck features).
    :return:
        features: The output of the last layer.
        endpoints: An OrderedDict containing output of every components. The outputs are in the order that they add to
                   the network. Thus it is convenient to split the network by a output name
    """
    # ReLU is the default choice; other activation functions are possible.
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        if params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    endpoints = OrderedDict()
    with tf.variable_scope("tdnn", reuse=reuse_variables):
        # Convert to [b, 1, l, d]
        features = tf.expand_dims(features, 1)

        # Layer 1: [-2,-1,0,1,2] --> [b, 1, l-4, 64]
        # conv2d + batchnorm + relu
        features = tf.layers.conv2d(features,
                                64,
                                (1, 5),
                                activation=None,
                                kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
                                name='tdnn1_conv')
        endpoints["tdnn1_conv"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn1_bn")
        endpoints["tdnn1_bn"] = features
        features = relu(features, name='tdnn1_relu')
        endpoints["tdnn1_relu"] = features

        # Layer 2: [-2, -1, 0, 1, 2] --> [b, 1, l-4, 64]
        # conv2d + batchnorm + relu
        # This differs slightly from Kaldi, which uses a dilated convolution here.
        features = tf.layers.conv2d(features,
                                    64,
                                    (1, 5),
                                    activation=None,
                                    kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
                                    name='tdnn2_conv')
        endpoints["tdnn2_conv"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn2_bn")
        endpoints["tdnn2_bn"] = features
        features = relu(features, name='tdnn2_relu')
        endpoints["tdnn2_relu"] = features

        # Layer 3: [-3, -2, -1, 0, 1, 2, 3] --> [b, 1, l-6, 64]
        # conv2d + batchnorm + relu
        # Again, no dilation is used.
        features = tf.layers.conv2d(features,
                                    64,
                                    (1, 7),
                                    activation=None,
                                    kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
                                    name='tdnn3_conv')
        endpoints["tdnn3_conv"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn3_bn")
        endpoints["tdnn3_bn"] = features
        features = relu(features, name='tdnn3_relu')
        endpoints["tdnn3_relu"] = features

        # Convert to [b, l, 64]
        features = tf.squeeze(features, axis=1)
        # The output of the 3rd layer can simply be rank 3, so overwrite its endpoint
        # with the squeezed tensor.
        endpoints["tdnn3_relu"] = features

        # Layer 4: [b, l, 64] --> [b, l, 512]
        features = tf.layers.dense(features,
                                   512,
                                   activation=None,
                                   kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
                                   name="tdnn4_dense")
        endpoints["tdnn4_dense"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn4_bn")
        endpoints["tdnn4_bn"] = features
        features = relu(features, name='tdnn4_relu')
        endpoints["tdnn4_relu"] = features

        # Layer 5: [b, l, x]
        if "num_nodes_pooling_layer" not in params.dict:
            # The default number of nodes before pooling
            params.dict["num_nodes_pooling_layer"] = 1500

        features = tf.layers.dense(features,
                                   params.num_nodes_pooling_layer,
                                   activation=None,
                                   kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
                                   name="tdnn5_dense")
        endpoints["tdnn5_dense"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn5_bn")
        endpoints["tdnn5_bn"] = features
        features = relu(features, name='tdnn5_relu')
        endpoints["tdnn5_relu"] = features

        # Pooling layer: [b, l, 1500] --> [b, x]
        # If you add a new pooling layer, extend the dispatch below.
        if params.pooling_type == "statistics_pooling":
            features = statistics_pooling(features, aux_features, endpoints, params, is_training)
        elif params.pooling_type == "self_attention":
            features = self_attention(features, aux_features, endpoints, params, is_training)
        elif params.pooling_type == "ghost_vlad":
            features = ghost_vlad(features, aux_features, endpoints, params, is_training)
        # elif params.pooling_type == "aux_attention":
        #     features = aux_attention(features, aux_features, endpoints, params, is_training)
        else:
            raise NotImplementedError("Pooling type %s is not implemented" % params.pooling_type)
        endpoints['pooling'] = features

        # Utterance-level network
        # Layer 6: [b, 512]
        features = tf.layers.dense(features,
                                   512,
                                   activation=None,
                                   kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
                                   name='tdnn6_dense')
        endpoints['tdnn6_dense'] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn6_bn")
        endpoints["tdnn6_bn"] = features
        features = relu(features, name='tdnn6_relu')
        endpoints["tdnn6_relu"] = features

        # Layer 7: [b, x]
        if "num_nodes_last_layer" not in params.dict:
            # The default number of nodes in the last layer
            params.dict["num_nodes_last_layer"] = 64

        features = tf.layers.dense(features,
                                   params.num_nodes_last_layer,
                                   activation=None,
                                   kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
                                   name='tdnn7_dense')
        endpoints['tdnn7_dense'] = features

        if "last_layer_no_bn" not in params.dict:
            params.last_layer_no_bn = False

        if not params.last_layer_no_bn:
            features = tf.layers.batch_normalization(features,
                                                     momentum=params.batchnorm_momentum,
                                                     training=is_training,
                                                     name="tdnn7_bn")
            endpoints["tdnn7_bn"] = features

        if "last_layer_linear" not in params.dict:
            params.last_layer_linear = False

        if not params.last_layer_linear:
            # If the last layer is linear, no further activation is needed.
            features = relu(features, name='tdnn7_relu')
            endpoints["tdnn7_relu"] = features

    return features, endpoints