def _get_outputs(self, inputs, input_seq_length, is_training):
        '''
        Create the variables and do the forward computation.

        The network is a capsule U-net: a conventional convolution and a
        primary-capsule stem, followed by a capsule encoder, centre layers and
        a (transposed-convolution) capsule decoder that mirrors the encoder.

        Args:
            inputs: the inputs to the neural network, this is a list of
                [batch_size x time x ...] tensors
            input_seq_length: The sequence lengths of the input utterances, this
                is a [batch_size] vector
            is_training: whether or not the network is in training mode

        Returns:
            - output, which is a [batch_size x time x ...] tensor

        Raises:
            ValueError: if more than one input is given
            NotImplementedError: if dropout is requested during training
                (its correctness for capsules has not been verified)
        '''

        # parse the hyperparameters from the configuration. list() makes the
        # mapped values indexable on python 3, where map returns an iterator.
        kernel_size = list(map(int, self.conf['filters'].split(' ')))
        routing_iters = int(self.conf['routing_iters'])
        f_pool_rate = int(self.conf['f_pool_rate'])
        t_pool_rate = int(self.conf['t_pool_rate'])
        num_encoder_layers = int(self.conf['num_encoder_layers'])
        num_decoder_layers = num_encoder_layers
        num_centre_layers = int(self.conf['num_centre_layers'])
        # entry 0 configures the primary capsules, the following entries the
        # encoder, centre and decoder layers, in that order
        num_capsules_lst = list(
            map(int, self.conf['num_capsules_lst'].split(' ')))
        capsule_dim_lst = list(
            map(int, self.conf['capsule_dim_lst'].split(' ')))
        dropout_rate = float(self.conf['dropout'])

        def _check_dropout():
            '''raise if dropout is requested: its correctness is unverified'''
            # raising a proper exception instead of a plain string, which is
            # a TypeError in modern python and would hide the message
            if is_training and dropout_rate < 1:
                raise NotImplementedError(
                    'have to check whether dropout is implemented correctly')

        # the encoder layers: every t_pool_rate-th (f_pool_rate-th) layer
        # halves the time (frequency) resolution with stride 2
        encoder_layers = []
        for l in range(num_encoder_layers):
            strides = [1, 1]
            # logical 'and' instead of bitwise '&': these are boolean tests
            # and 'and' short-circuits, avoiding the modulo by zero
            if t_pool_rate != 0 and l % t_pool_rate == 0:
                strides[0] = 2
            if f_pool_rate != 0 and l % f_pool_rate == 0:
                strides[1] = 2

            encoder_layers.append(
                layer.EncDecCapsule(num_capsules=num_capsules_lst[l + 1],
                                    capsule_dim=capsule_dim_lst[l + 1],
                                    kernel_size=kernel_size,
                                    strides=strides,
                                    padding='SAME',
                                    routing_iters=routing_iters))

        # the centre layers keep the resolution fixed
        centre_layers = []
        for l in range(num_centre_layers):
            idx = l + 1 + num_encoder_layers
            centre_layers.append(
                layer.EncDecCapsule(num_capsules=num_capsules_lst[idx],
                                    capsule_dim=capsule_dim_lst[idx],
                                    kernel_size=kernel_size,
                                    strides=(1, 1),
                                    padding='SAME',
                                    routing_iters=routing_iters))

        # the decoder layers mirror the encoder: each transposed capsule layer
        # reuses the strides of its corresponding encoder layer so the
        # resolution is restored step by step
        decoder_layers = []
        for l in range(num_decoder_layers):
            corresponding_encoder_l = num_encoder_layers - 1 - l
            idx = l + 1 + num_encoder_layers + num_centre_layers
            decoder_layers.append(
                layer.EncDecCapsule(
                    num_capsules=num_capsules_lst[idx],
                    capsule_dim=capsule_dim_lst[idx],
                    kernel_size=kernel_size,
                    strides=encoder_layers[corresponding_encoder_l].strides,
                    padding='SAME',
                    transpose=True,
                    routing_iters=routing_iters))

        # code not available for multiple inputs!!
        if len(inputs) > 1:
            raise ValueError(
                'The implementation of DCNN expects 1 input and not %d' %
                len(inputs))
        inputs = inputs[0]

        with tf.variable_scope(self.scope):
            # additive gaussian input noise as regularization during training
            if is_training and float(self.conf['input_noise']) > 0:
                inputs = inputs + tf.random_normal(
                    tf.shape(inputs), stddev=float(self.conf['input_noise']))

            # First layer: a conventional convolution on the input features
            with tf.variable_scope('first_layer'):
                logits = tf.identity(inputs, 'inputs')
                input_seq_length = tf.identity(input_seq_length,
                                               'input_seq_length')

                # static batch and frequency sizes are needed for the reshapes
                batch_size = logits.shape[0].value
                num_freq = logits.shape[2].value
                output_dim = num_capsules_lst[0] * capsule_dim_lst[0]
                # add a channel dimension for conv2d
                logits = tf.expand_dims(logits, -1)

                first_layer = tf.layers.conv2d(logits,
                                               output_dim,
                                               kernel_size,
                                               strides=(1, 1),
                                               padding='SAME',
                                               activation=tf.nn.relu)

                logits = tf.identity(first_layer, 'first_layer')

            # Primary capsules: a linear convolution reshaped into capsules
            # and squashed so the capsule norms lie in [0, 1)
            with tf.variable_scope('primary_capsule'):

                primary_capsules = tf.layers.conv2d(logits,
                                                    output_dim,
                                                    kernel_size,
                                                    strides=(1, 1),
                                                    padding='SAME')

                primary_capsules = tf.reshape(primary_capsules, [
                    batch_size, -1, num_freq, num_capsules_lst[0],
                    capsule_dim_lst[0]
                ])

                primary_capsules = ops.squash(primary_capsules)
                logits = tf.identity(primary_capsules, 'primary_capsules')

            # encoder: keep every output so the decoder can recover the
            # matching output sizes
            with tf.variable_scope('encoder'):
                encoder_outputs = []
                for l in range(num_encoder_layers):
                    with tf.variable_scope('layer_%s' % l):
                        logits = encoder_layers[l](logits)
                        encoder_outputs.append(logits)
                        _check_dropout()

            with tf.variable_scope('centre'):
                for l in range(num_centre_layers):
                    with tf.variable_scope('layer_%s' % l):
                        logits = centre_layers[l](logits)
                        _check_dropout()

            with tf.variable_scope('decoder'):
                for l in range(num_decoder_layers):
                    with tf.variable_scope('layer_%s' % l):
                        corresponding_encoder_l = num_encoder_layers - 1 - l
                        # NOTE(review): the skip connection with the
                        # corresponding encoder output is currently disabled;
                        # the decoder only sees the previous layer's output
                        _check_dropout()

                        # the wanted output size is the size of the INPUT to
                        # the corresponding encoder layer: the time dimension
                        # is dynamic (a tensor), the frequency one is static
                        if corresponding_encoder_l == 0:
                            size_reference = primary_capsules
                        else:
                            size_reference = encoder_outputs[
                                corresponding_encoder_l - 1]

                        wanted_t_size = tf.shape(size_reference)[1]
                        freq_out = size_reference.shape[2]

                        logits = decoder_layers[l](logits, wanted_t_size,
                                                   freq_out)

                output = logits
                # merge the capsule and capsule-dim axes behind the frequency
                # axis. NOTE(review): this assumes the last decoder layer has
                # the same capsule configuration as the primary capsules
                # (num_capsules_lst[0] x capsule_dim_lst[0]) — confirm
                output = tf.reshape(output, [
                    batch_size, -1, num_freq,
                    num_capsules_lst[0] * capsule_dim_lst[0]
                ])

        return output
# ---- Example #2 ----
    def _get_outputs(self, inputs, input_seq_length, is_training):
        '''
        Create the variables and do the forward computation.

        The network is a SegCaps-style capsule U-net with a fixed depth:
        a conventional convolution, primary capsules, three strided
        convolutional capsule stages, three deconvolutional (transposed)
        stages with skip connections, and a final 1x1-style capsule layer.

        Args:
            inputs: the inputs to the neural network, this is a list of
                [batch_size x time x ...] tensors
            input_seq_length: The sequence lengths of the input utterances, this
                is a [batch_size] vector
            is_training: whether or not the network is in training mode

        Returns:
            - output, which is a [batch_size x time x ...] tensor

        Raises:
            ValueError: if more than one input is given
        '''
        use_bias = self.conf['use_bias'] == 'True'
        leaky_softmax = self.conf['leaky_softmax'] == 'True'
        shared = self.conf['shared'] == 'True'
        # list() makes the mapped values indexable on python 3, where map
        # returns an iterator
        kernel_size = list(map(int, self.conf['kernel_size'].split(' ')))
        # conf values are strings: convert so conv2d gets an integer filter
        # count (every other numeric option is converted the same way)
        num_filters = int(self.conf['num_filters'])
        num_capsules_lst = list(
            map(int, self.conf['num_capsules_lst'].split(' ')))
        capsule_dim_lst = list(
            map(int, self.conf['capsule_dim_lst'].split(' ')))
        t_reduction_rate = int(self.conf['t_reduction_rate'])
        f_reduction_rate = int(self.conf['f_reduction_rate'])
        probability_fn = None
        if leaky_softmax:
            probability_fn = ops.leaky_softmax
        # NOTE(review): probability_fn is computed but never passed to any
        # capsule layer below — the leaky_softmax option has no effect; verify

        # code not available for multiple inputs!!
        if len(inputs) > 1:
            # raise a proper exception: raising a plain string is a TypeError
            # in modern python and would hide the message
            raise ValueError(
                'The implementation of DCNN expects 1 input and not %d' %
                len(inputs))
        inputs = inputs[0]

        with tf.variable_scope(self.scope):
            # additive gaussian input noise as regularization during training
            if is_training and float(self.conf['input_noise']) > 0:
                inputs = inputs + tf.random_normal(
                    tf.shape(inputs),
                    stddev=float(self.conf['input_noise']))

            logits = tf.identity(inputs, 'inputs')
            input_seq_length = tf.identity(input_seq_length,
                                           'input_seq_length')

            # static batch and frequency sizes are needed for the reshapes
            batch_size = logits.shape[0].value
            num_freq = logits.shape[2].value
            # add a channel dimension for conv2d
            logits = tf.expand_dims(logits, -1)

            # Layer 1: Just a conventional Conv2D layer
            conv1 = tf.layers.conv2d(logits, filters=num_filters,
                                     kernel_size=kernel_size, strides=1,
                                     padding='same', activation=tf.nn.relu,
                                     name='conv1')

            # Reshape layer to be 1 capsule x [num_filters] atoms. Using
            # num_filters instead of a hard-coded 16 keeps the reshape valid
            # for any configured filter count.
            conv1_reshaped = tf.reshape(
                conv1, [batch_size, -1, num_freq, 1, num_filters])

            # Layer 1: Primary Capsule: Conv cap with routing 1
            primary_caps = layer.EncDecCapsule(
                kernel_size=kernel_size, num_capsules=num_capsules_lst[0],
                capsule_dim=capsule_dim_lst[0], strides=(2, 2),
                padding='SAME', routing_iters=0, use_bias=use_bias,
                shared=shared, name='primarycaps')(conv1_reshaped)

            # Layer 2: Convolutional Capsule
            conv_cap_2_1 = layer.EncDecCapsule(
                kernel_size=kernel_size, num_capsules=num_capsules_lst[1],
                capsule_dim=capsule_dim_lst[1], strides=(1, 1),
                padding='SAME', routing_iters=3, use_bias=use_bias,
                shared=shared, name='conv_cap_2_1')(primary_caps)

            # Layer 2: Convolutional Capsule, optionally reducing the time
            # and/or frequency resolution
            strides = [1, 1]
            if t_reduction_rate == 1:
                strides[0] = 2
            if f_reduction_rate == 1:
                strides[1] = 2
            conv_cap_2_2 = layer.EncDecCapsule(
                kernel_size=kernel_size, num_capsules=num_capsules_lst[2],
                capsule_dim=capsule_dim_lst[2], strides=strides,
                padding='SAME', routing_iters=3, use_bias=use_bias,
                shared=shared, name='conv_cap_2_2')(conv_cap_2_1)

            # Layer 3: Convolutional Capsule
            conv_cap_3_1 = layer.EncDecCapsule(
                kernel_size=kernel_size, num_capsules=num_capsules_lst[3],
                capsule_dim=capsule_dim_lst[3], strides=(1, 1),
                padding='SAME', routing_iters=3, use_bias=use_bias,
                shared=shared, name='conv_cap_3_1')(conv_cap_2_2)

            # Layer 3: Convolutional Capsule
            conv_cap_3_2 = layer.EncDecCapsule(
                kernel_size=kernel_size, num_capsules=num_capsules_lst[4],
                capsule_dim=capsule_dim_lst[4], strides=(2, 2),
                padding='SAME', routing_iters=3, use_bias=use_bias,
                shared=shared, name='conv_cap_3_2')(conv_cap_3_1)

            # Layer 4: Convolutional Capsule
            conv_cap_4_1 = layer.EncDecCapsule(
                kernel_size=kernel_size, num_capsules=num_capsules_lst[5],
                capsule_dim=capsule_dim_lst[5], strides=(1, 1),
                padding='SAME', routing_iters=3, use_bias=use_bias,
                shared=shared, name='conv_cap_4_1')(conv_cap_3_2)

            # Layer 1 Up: Deconvolutional Capsule. The target output size is
            # taken from the matching encoder output: time is dynamic (a
            # tensor), frequency is static.
            t_out = tf.shape(conv_cap_3_1)[1]
            freq_out = conv_cap_3_1.shape[2]
            deconv_cap_1_1 = layer.EncDecCapsule(
                kernel_size=kernel_size, num_capsules=num_capsules_lst[6],
                capsule_dim=capsule_dim_lst[6], transpose=True,
                strides=(2, 2), padding='SAME', routing_iters=3,
                use_bias=use_bias, shared=shared,
                name='deconv_cap_1_1')(conv_cap_4_1, t_out, freq_out)
            # Skip connection: concatenate along the capsule axis
            up_1 = tf.concat([deconv_cap_1_1, conv_cap_3_1], axis=-2,
                             name='up_1')

            # Layer 1 Up: Deconvolutional Capsule
            deconv_cap_1_2 = layer.EncDecCapsule(
                kernel_size=kernel_size, num_capsules=num_capsules_lst[7],
                capsule_dim=capsule_dim_lst[7], strides=(1, 1),
                padding='SAME', routing_iters=3, use_bias=use_bias,
                shared=shared, name='deconv_cap_1_2')(up_1)

            # Layer 2 Up: Deconvolutional Capsule, undoing the optional
            # resolution reduction of conv_cap_2_2
            t_out = tf.shape(conv_cap_2_1)[1]
            freq_out = conv_cap_2_1.shape[2]
            strides = [1, 1]
            if t_reduction_rate == 1:
                strides[0] = 2
            if f_reduction_rate == 1:
                strides[1] = 2
            deconv_cap_2_1 = layer.EncDecCapsule(
                kernel_size=kernel_size, num_capsules=num_capsules_lst[8],
                capsule_dim=capsule_dim_lst[8], transpose=True,
                strides=strides, padding='SAME', routing_iters=3,
                use_bias=use_bias, shared=shared,
                name='deconv_cap_2_1')(deconv_cap_1_2, t_out, freq_out)

            # Skip connection
            up_2 = tf.concat([deconv_cap_2_1, conv_cap_2_1], axis=-2,
                             name='up_2')

            # Layer 2 Up: Deconvolutional Capsule
            deconv_cap_2_2 = layer.EncDecCapsule(
                kernel_size=kernel_size, num_capsules=num_capsules_lst[9],
                capsule_dim=capsule_dim_lst[9], strides=(1, 1),
                padding='SAME', routing_iters=3, use_bias=use_bias,
                shared=shared, name='deconv_cap_2_2')(up_2)

            # Layer 3 Up: Deconvolutional Capsule, back to input resolution
            t_out = tf.shape(conv1_reshaped)[1]
            freq_out = conv1_reshaped.shape[2]
            deconv_cap_3_1 = layer.EncDecCapsule(
                kernel_size=kernel_size, num_capsules=num_capsules_lst[10],
                capsule_dim=capsule_dim_lst[10], transpose=True,
                strides=(2, 2), padding='SAME', routing_iters=3,
                use_bias=use_bias, shared=shared,
                name='deconv_cap_3_1')(deconv_cap_2_2, t_out, freq_out)

            # Skip connection
            up_3 = tf.concat([deconv_cap_3_1, conv1_reshaped], axis=-2,
                             name='up_3')

            # Layer 4: Convolutional Capsule: 1x1
            seg_caps = layer.EncDecCapsule(
                kernel_size=kernel_size, num_capsules=num_capsules_lst[11],
                capsule_dim=capsule_dim_lst[11], strides=(1, 1),
                padding='SAME', routing_iters=3, use_bias=use_bias,
                shared=shared, name='seg_caps')(up_3)

            output = seg_caps
            # merge the capsule and capsule-dim axes behind the frequency axis
            output = tf.reshape(
                output,
                [batch_size,
                 -1,
                 num_freq, num_capsules_lst[-1] * capsule_dim_lst[-1]]
            )

        return output