def __init__(self,
                 encoder_type,
                 input_size,
                 num_units,
                 num_layers_main,
                 num_layers_sub,
                 num_classes_main,
                 num_classes_sub,
                 main_task_weight,
                 lstm_impl='LSTMBlockCell',
                 use_peephole=True,
                 splice=1,
                 parameter_init=0.1,
                 clip_grad_norm=None,
                 clip_activation=None,
                 num_proj=None,
                 weight_decay=0.0,
                 bottleneck_dim=None,
                 time_major=True):
        """Initialise the multi-task CTC model and build its encoder.

        Arguments not described below are forwarded unchanged to the parent
        constructor and/or to the encoder class returned by
        ``load(encoder_type)``. The parent receives ``num_layers_main`` and
        ``num_classes_main`` as its layer and class counts.

        Args:
            encoder_type: Encoder name; must be 'multitask_blstm' or
                'multitask_lstm'.
            num_layers_sub: Forwarded to the encoder as ``num_layers_sub``.
            num_classes_sub: Number of sub-task classes; stored with one
                extra class for the blank label.
            main_task_weight: Weight of the main task, in [0, 1]. The
                sub-task weight is stored as ``1 - main_task_weight``.

        Raises:
            ValueError: If ``main_task_weight`` is outside [0, 1].
            NotImplementedError: If ``encoder_type`` is not one of the
                supported multi-task encoders.
        """

        super(MultitaskCTC, self).__init__(
            encoder_type, input_size, num_units, num_layers_main,
            num_classes_main, lstm_impl, use_peephole, splice,
            parameter_init, clip_grad_norm, clip_activation, num_proj,
            weight_decay, bottleneck_dim, time_major)

        self.num_classes_sub = num_classes_sub + 1  # + blank label
        if float(main_task_weight) < 0 or float(main_task_weight) > 1:
            raise ValueError('Set main_task_weight between 0 to 1.')
        self.main_task_weight = main_task_weight
        self.sub_task_weight = 1 - self.main_task_weight

        # Placeholder for multi-task
        self.labels_sub_pl_list = []

        self.name = encoder_type + '_ctc'

        # A bare list literal is always truthy, so the membership test must
        # be explicit for the NotImplementedError branch to be reachable.
        if encoder_type in ('multitask_blstm', 'multitask_lstm'):
            # self.num_proj is read here rather than the raw argument; it is
            # expected to have been set by the parent constructor.
            self.encoder = load(encoder_type)(
                num_units=num_units,
                num_proj=self.num_proj,
                num_layers_main=num_layers_main,
                num_layers_sub=num_layers_sub,
                lstm_impl=lstm_impl,
                use_peephole=use_peephole,
                parameter_init=parameter_init,
                clip_activation=clip_activation,
                time_major=time_major)
        else:
            raise NotImplementedError
    def __init__(self,
                 encoder_type,
                 input_size,
                 num_units,
                 num_layers_main,
                 num_layers_sub,
                 num_classes_main,
                 num_classes_sub,
                 main_task_weight,
                 lstm_impl='LSTMBlockCell',
                 use_peephole=True,
                 splice=1,
                 parameter_init=0.1,
                 clip_grad_norm=None,
                 clip_activation=None,
                 num_proj=None,
                 weight_decay=0.0,
                 bottleneck_dim=None,
                 time_major=True):
        """Set up the multi-task CTC model and construct its encoder.

        Arguments not listed below are passed through as-is to the parent
        constructor and/or to the encoder class obtained from
        ``load(encoder_type)``. The parent is given ``num_layers_main`` and
        ``num_classes_main`` as its layer and class counts.

        Args:
            encoder_type: Encoder name; only 'multitask_blstm' and
                'multitask_lstm' are accepted.
            num_layers_sub: Passed to the encoder as ``num_layers_sub``.
            num_classes_sub: Number of sub-task classes; one is added for
                the blank label before it is stored.
            main_task_weight: Main-task weight in [0, 1]; the sub-task
                weight is stored as its complement.

        Raises:
            ValueError: If ``main_task_weight`` is not within [0, 1].
            NotImplementedError: If ``encoder_type`` is not a supported
                multi-task encoder name.
        """

        super(MultitaskCTC,
              self).__init__(encoder_type, input_size, num_units,
                             num_layers_main, num_classes_main, lstm_impl,
                             use_peephole, splice, parameter_init,
                             clip_grad_norm, clip_activation, num_proj,
                             weight_decay, bottleneck_dim, time_major)

        self.num_classes_sub = num_classes_sub + 1  # + blank label
        if float(main_task_weight) < 0 or float(main_task_weight) > 1:
            raise ValueError('Set main_task_weight between 0 to 1.')
        self.main_task_weight = main_task_weight
        self.sub_task_weight = 1 - self.main_task_weight

        # Placeholder for multi-task
        self.labels_sub_pl_list = []

        self.name = encoder_type + '_ctc'

        # Test membership explicitly: `if [...]` on a non-empty list literal
        # is always true and would let any encoder name through.
        if encoder_type in ('multitask_blstm', 'multitask_lstm'):
            # self.num_proj (not the raw argument) is used; it is expected
            # to have been set by the parent constructor.
            self.encoder = load(encoder_type)(num_units=num_units,
                                              num_proj=self.num_proj,
                                              num_layers_main=num_layers_main,
                                              num_layers_sub=num_layers_sub,
                                              lstm_impl=lstm_impl,
                                              use_peephole=use_peephole,
                                              parameter_init=parameter_init,
                                              clip_activation=clip_activation,
                                              time_major=time_major)
        else:
            raise NotImplementedError
Esempio n. 3
0
    def __init__(self,
                 encoder_type,
                 input_size,
                 num_units,
                 num_layers_main,
                 num_layers_sub,
                 num_classes_main,
                 num_classes_sub,
                 main_task_weight,
                 lstm_impl='LSTMBlockCell',
                 use_peephole=True,
                 splice=1,
                 parameter_init=0.1,
                 clip_grad=None,
                 clip_activation=None,
                 num_proj=None,
                 weight_decay=0.0,
                 bottleneck_dim=None):
        """Initialise the multi-task CTC model and build its encoder.

        ``input_size``, ``splice``, ``num_classes_main``, ``num_classes_sub``,
        ``main_task_weight``, ``lstm_impl``, ``clip_grad`` and
        ``weight_decay`` are handed to the parent constructor. The remaining
        arguments configure the encoder class returned by
        ``load(encoder_type)``.

        Args:
            encoder_type: Encoder name; must be 'multitask_blstm' or
                'multitask_lstm'.
            num_classes_main: Main-task class count; the encoder receives
                this value plus one.
            num_classes_sub: Sub-task class count; the encoder receives
                this value plus one.

        Raises:
            NotImplementedError: If ``encoder_type`` is not one of the
                supported multi-task encoders.
        """

        super(Multitask_CTC,
              self).__init__(input_size, splice, num_classes_main,
                             num_classes_sub, main_task_weight, lstm_impl,
                             clip_grad, weight_decay)

        self.name = encoder_type + '_ctc'

        # An explicit membership test is required here; a bare non-empty
        # list is always truthy and would make the else branch dead code.
        if encoder_type in ('multitask_blstm', 'multitask_lstm'):
            self.encoder = load(encoder_type)(
                num_units=num_units,
                num_layers_main=num_layers_main,
                num_layers_sub=num_layers_sub,
                num_classes_main=num_classes_main + 1,
                num_classes_sub=num_classes_sub + 1,
                lstm_impl=lstm_impl,
                use_peephole=use_peephole,
                parameter_init=parameter_init,
                clip_activation=clip_activation,
                num_proj=num_proj,
                bottleneck_dim=bottleneck_dim)

        else:
            raise NotImplementedError
    def check(self, encoder_type, lstm_impl=None, time_major=False):
        """Build the requested encoder, run one forward pass and check shapes.

        Args:
            encoder_type: Name handed to `load` to select the encoder class.
            lstm_impl: Forwarded as `lstm_impl` to the LSTM-based encoders.
            time_major: Forwarded to the encoder. When true, the fetched
                outputs are transposed to batch-major before the shape
                assertions.

        Raises:
            NotImplementedError: If `encoder_type` matches none of the
                encoder-construction branches.
        """

        print('==================================================')
        print('  encoder_type: %s' % encoder_type)
        print('  lstm_impl: %s' % lstm_impl)
        print('  time_major: %s' % time_major)
        print('==================================================')

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 4
            # The listed encoder types are fed spliced input (splice=5);
            # every other type uses splice=1.
            splice = 5 if encoder_type in ['vgg_blstm', 'vgg_lstm',
                                           'vgg_wang', 'resnet_wang', 'cldnn_wang',
                                           'cnn_zhang'] else 1
            num_stack = 2
            inputs, _, inputs_seq_len = generate_data(
                label_type='character',
                model='ctc',
                batch_size=batch_size,
                num_stack=num_stack,
                splice=splice)
            # inputs[0] is unpacked as a 2-D array: (frames, feature width).
            frame_num, input_size = inputs[0].shape

            # Define model graph
            if encoder_type in ['blstm', 'lstm']:
                encoder = load(encoder_type)(
                    num_units=256,
                    num_proj=None,
                    num_layers=5,
                    lstm_impl=lstm_impl,
                    use_peephole=True,
                    parameter_init=0.1,
                    clip_activation=5,
                    time_major=time_major)
            elif encoder_type in ['bgru', 'gru']:
                encoder = load(encoder_type)(
                    num_units=256,
                    num_layers=5,
                    parameter_init=0.1,
                    time_major=time_major)
            elif encoder_type in ['vgg_blstm', 'vgg_lstm', 'cldnn_wang']:
                # NOTE(review): the encoder is given the feature width divided
                # by splice and num_stack; presumably generate_data folds both
                # factors into the last axis -- confirm.
                encoder = load(encoder_type)(
                    input_size=input_size // splice // num_stack,
                    splice=splice,
                    num_stack=num_stack,
                    num_units=256,
                    num_proj=None,
                    num_layers=5,
                    lstm_impl=lstm_impl,
                    use_peephole=True,
                    parameter_init=0.1,
                    clip_activation=5,
                    time_major=time_major)
            elif encoder_type in ['multitask_blstm', 'multitask_lstm']:
                encoder = load(encoder_type)(
                    num_units=256,
                    num_proj=None,
                    num_layers_main=5,
                    num_layers_sub=3,
                    lstm_impl=lstm_impl,
                    use_peephole=True,
                    parameter_init=0.1,
                    clip_activation=5,
                    time_major=time_major)
            elif encoder_type in ['vgg_wang', 'resnet_wang', 'cnn_zhang']:
                encoder = load(encoder_type)(
                    input_size=input_size // splice // num_stack,
                    splice=splice,
                    num_stack=num_stack,
                    parameter_init=0.1,
                    time_major=time_major)
                # NOTE: topology is pre-defined
            else:
                raise NotImplementedError

            # Create placeholders
            inputs_pl = tf.placeholder(tf.float32,
                                       shape=[None, None, input_size],
                                       name='inputs')
            inputs_seq_len_pl = tf.placeholder(tf.int32,
                                               shape=[None],
                                               name='inputs_seq_len')
            keep_prob_pl = tf.placeholder(tf.float32, name='keep_prob')

            # operation for forward computation
            # Multi-task encoders return four values (main and sub outputs
            # plus their final states); all other encoders return two.
            if encoder_type in ['multitask_blstm', 'multitask_lstm']:
                hidden_states_op, final_state_op, hidden_states_sub_op, final_state_sub_op = encoder(
                    inputs=inputs_pl,
                    inputs_seq_len=inputs_seq_len_pl,
                    keep_prob=keep_prob_pl,
                    is_training=True)
            else:
                hidden_states_op, final_state_op = encoder(
                    inputs=inputs_pl,
                    inputs_seq_len=inputs_seq_len_pl,
                    keep_prob=keep_prob_pl,
                    is_training=True)

            # Add the variable initializer operation
            init_op = tf.global_variables_initializer()

            # Count total parameters
            parameters_dict, total_parameters = count_total_parameters(
                tf.trainable_variables())
            for parameter_name in sorted(parameters_dict.keys()):
                print("%s %d" %
                      (parameter_name, parameters_dict[parameter_name]))
            print("Total %d variables, %s M parameters" %
                  (len(parameters_dict.keys()),
                   "{:,}".format(total_parameters / 1000000)))

            # Make feed dict
            feed_dict = {
                inputs_pl: inputs,
                inputs_seq_len_pl: inputs_seq_len,
                keep_prob_pl: 0.9
            }

            with tf.Session() as sess:
                # Initialize parameters
                sess.run(init_op)

                # Make prediction
                if encoder_type in ['multitask_blstm', 'multitask_lstm']:
                    encoder_outputs, final_state, hidden_states_sub, final_state_sub = sess.run(
                        [hidden_states_op, final_state_op,
                         hidden_states_sub_op, final_state_sub_op],
                        feed_dict=feed_dict)
                elif encoder_type in ['vgg_wang', 'resnet_wang', 'cnn_zhang']:
                    # Only the hidden states are fetched for these encoders;
                    # final_state_op is never run.
                    encoder_outputs = sess.run(
                        hidden_states_op, feed_dict=feed_dict)
                else:
                    encoder_outputs, final_state = sess.run(
                        [hidden_states_op, final_state_op],
                        feed_dict=feed_dict)

                # Convert always to batch-major
                # NOTE(review): only encoder_outputs is transposed here;
                # hidden_states_sub is compared as fetched.
                if time_major:
                    encoder_outputs = encoder_outputs.transpose(1, 0, 2)

                # Encoder types whose final state has two entries
                # (indexed [0] and [1] below).
                if encoder_type in ['blstm', 'bgru', 'vgg_blstm', 'multitask_blstm', 'cldnn_wang']:
                    # The output-shape check is skipped for 'cldnn_wang'.
                    if encoder_type != 'cldnn_wang':
                        self.assertEqual(
                            (batch_size, frame_num, encoder.num_units * 2), encoder_outputs.shape)

                    # Non-GRU final states expose .c and .h; the 'bgru'
                    # branch compares the state arrays directly.
                    if encoder_type != 'bgru':
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[0].c.shape)
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[0].h.shape)
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[1].c.shape)
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[1].h.shape)

                        if encoder_type == 'multitask_blstm':
                            self.assertEqual(
                                (batch_size, frame_num, encoder.num_units * 2), hidden_states_sub.shape)
                            self.assertEqual(
                                (batch_size, encoder.num_units), final_state_sub[0].c.shape)
                            self.assertEqual(
                                (batch_size, encoder.num_units), final_state_sub[0].h.shape)
                            self.assertEqual(
                                (batch_size, encoder.num_units), final_state_sub[1].c.shape)
                            self.assertEqual(
                                (batch_size, encoder.num_units), final_state_sub[1].h.shape)
                    else:
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[0].shape)
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[1].shape)

                # Encoder types checked against a single-width output and
                # only final_state[0].
                elif encoder_type in ['lstm', 'gru', 'vgg_lstm', 'multitask_lstm']:
                    self.assertEqual(
                        (batch_size, frame_num, encoder.num_units), encoder_outputs.shape)

                    if encoder_type != 'gru':
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[0].c.shape)
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[0].h.shape)

                        if encoder_type == 'multitask_lstm':
                            self.assertEqual(
                                (batch_size, frame_num, encoder.num_units), hidden_states_sub.shape)
                            self.assertEqual(
                                (batch_size, encoder.num_units), final_state_sub[0].c.shape)
                            self.assertEqual(
                                (batch_size, encoder.num_units), final_state_sub[0].h.shape)
                    else:
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[0].shape)

                # For these encoders only the rank and the two leading
                # dimensions of the output are checked.
                elif encoder_type in ['vgg_wang', 'resnet_wang', 'cnn_zhang']:
                    self.assertEqual(3, len(encoder_outputs.shape))
                    self.assertEqual(
                        (batch_size, frame_num), encoder_outputs.shape[:2])
Esempio n. 5
0
    def __init__(self,
                 encoder_type,
                 input_size,
                 num_units,
                 num_layers,
                 num_classes,
                 lstm_impl='LSTMBlockCell',
                 use_peephole=True,
                 splice=1,
                 parameter_init=0.1,
                 clip_grad_norm=None,
                 clip_activation=None,
                 num_proj=None,
                 weight_decay=0.0,
                 bottleneck_dim=None,
                 time_major=True):
        """Store the CTC model configuration and build the encoder.

        Args:
            encoder_type: Encoder name passed to `load`. Names not handled
                by any branch below leave ``self.encoder`` as ``None``.
            input_size: Input feature size; must be divisible by 3.
            num_units: Forwarded to the recurrent encoders as ``num_units``.
            num_layers: Forwarded to the recurrent encoders as ``num_layers``.
            num_classes: Number of output classes; stored with one extra
                class for the blank label.
            lstm_impl: Forwarded to the LSTM-based encoders.
            use_peephole: Forwarded to the LSTM-based encoders.
            splice: Must be odd; forwarded to the 'vgg_*', 'resnet_wang'
                and 'cnn_zhang' encoders.
            parameter_init: Forwarded to the encoder as ``parameter_init``.
            clip_grad_norm: If not ``None``, must be greater than 0.
            clip_activation: Forwarded to the LSTM-based encoders.
            num_proj: ``None`` or 0 is stored as ``None``; any other value
                is stored as ``int(num_proj)``.
            weight_decay: Must not be negative.
            bottleneck_dim: Stored on the instance.
            time_major: Stored on the instance and forwarded to the encoder.

        Raises:
            AssertionError: If ``input_size``, ``splice``,
                ``clip_grad_norm`` or ``weight_decay`` violates the
                constraints above.
        """

        super(CTC, self).__init__()

        assert input_size % 3 == 0, 'input_size must be divisible by 3 (+ delta, double delta features).'
        assert splice % 2 == 1, 'splice must be the odd number'
        if clip_grad_norm is not None:
            assert float(
                clip_grad_norm) > 0, 'clip_grad_norm must be larger than 0.'
        assert float(
            weight_decay) >= 0, 'weight_decay must not be a negative value.'

        # Encoder setting
        self.encoder_type = encoder_type
        self.input_size = input_size
        self.splice = splice
        self.num_units = num_units
        # Check for None before converting: int(None) raises TypeError, which
        # would make the default num_proj=None unusable.
        if num_proj is None or int(num_proj) == 0:
            self.num_proj = None
        else:
            self.num_proj = int(num_proj)
        self.num_layers = num_layers
        self.bottleneck_dim = bottleneck_dim
        self.num_classes = num_classes + 1  # + blank
        self.lstm_impl = lstm_impl
        self.use_peephole = use_peephole

        # Regularization
        self.parameter_init = parameter_init
        self.clip_grad_norm = clip_grad_norm
        self.clip_activation = clip_activation
        self.weight_decay = weight_decay

        # Summaries for TensorBoard
        self.summaries_train = []
        self.summaries_dev = []

        # Placeholders
        self.inputs_pl_list = []
        self.labels_pl_list = []
        self.inputs_seq_len_pl_list = []
        self.keep_prob_pl_list = []

        self.time_major = time_major
        self.name = encoder_type + '_ctc'

        if encoder_type in ['blstm', 'lstm']:
            self.encoder = load(encoder_type)(
                num_units=num_units,
                num_proj=self.num_proj,
                num_layers=num_layers,
                lstm_impl=lstm_impl,
                use_peephole=use_peephole,
                parameter_init=parameter_init,
                clip_activation=clip_activation,
                time_major=time_major)

        elif encoder_type in ['vgg_blstm', 'vgg_lstm']:
            self.encoder = load(encoder_type)(
                input_size=input_size,
                splice=splice,
                num_units=num_units,
                num_proj=self.num_proj,
                num_layers=num_layers,
                lstm_impl=lstm_impl,
                use_peephole=use_peephole,
                parameter_init=parameter_init,
                clip_activation=clip_activation,
                time_major=time_major)

        elif encoder_type in ['bgru', 'gru']:
            self.encoder = load(encoder_type)(
                num_units=num_units,
                num_layers=num_layers,
                parameter_init=parameter_init,
                time_major=time_major)

        elif encoder_type in ['vgg_wang', 'resnet_wang', 'cnn_zhang']:
            self.encoder = load(encoder_type)(
                input_size=input_size,
                splice=splice,
                parameter_init=parameter_init,
                time_major=time_major)

        else:
            # Unknown encoder types are tolerated here; no encoder is built.
            self.encoder = None
    def check_encode(self, encoder_type, lstm_impl=None):
        """Build the requested encoder, run one forward pass and check shapes.

        Args:
            encoder_type: Name handed to `load` to select the encoder class.
            lstm_impl: Forwarded as `lstm_impl` to the LSTM-based encoders.

        Raises:
            NotImplementedError: If `encoder_type` matches none of the
                encoder-construction branches.
        """

        print('==================================================')
        print('  encoder_type: %s' % encoder_type)
        print('  lstm_impl: %s' % lstm_impl)
        print('==================================================')

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 4
            # The listed encoder types are fed spliced input (splice=11);
            # every other type uses splice=1.
            splice = 11 if encoder_type in ['vgg_blstm', 'vgg_lstm', 'vgg_wang',
                                            'resnet_wang', 'cnn_zhang'] else 1
            inputs, _, inputs_seq_len = generate_data(
                label_type='character',
                model='ctc',
                batch_size=batch_size,
                splice=splice)
            # inputs[0] is unpacked as a 2-D array: (frames, feature width).
            frame_num, input_size = inputs[0].shape

            # Define model graph
            if encoder_type in ['blstm', 'lstm']:
                encoder = load(encoder_type)(
                    num_units=256,
                    num_layers=5,
                    num_classes=0,  # return hidden states
                    lstm_impl=lstm_impl,
                    parameter_init=0.1)
            elif encoder_type in ['bgru', 'gru']:
                encoder = load(encoder_type)(
                    num_units=256,
                    num_layers=5,
                    num_classes=0,  # return hidden states
                    parameter_init=0.1)
            elif encoder_type in ['vgg_blstm', 'vgg_lstm']:
                # `splice` is 11 in this branch; the variable is used so the
                # value passed here cannot drift from the data generation.
                encoder = load(encoder_type)(
                    input_size=input_size // splice,
                    splice=splice,
                    num_units=256,
                    num_layers=5,
                    num_classes=0,  # return hidden states
                    lstm_impl=lstm_impl,
                    parameter_init=0.1)
            elif encoder_type in ['multitask_blstm', 'multitask_lstm']:
                encoder = load(encoder_type)(
                    num_units=256,
                    num_layers_main=5,
                    num_layers_sub=3,
                    num_classes_main=0,  # return hidden states
                    num_classes_sub=0,  # return hidden states
                    lstm_impl=lstm_impl,
                    parameter_init=0.1)
            elif encoder_type in ['vgg_wang', 'resnet_wang', 'cnn_zhang']:
                encoder = load(encoder_type)(
                    input_size=input_size // splice,
                    splice=splice,
                    num_classes=27,
                    parameter_init=0.1)
                # NOTE: topology is pre-defined
            else:
                raise NotImplementedError

            # Create placeholders
            inputs_pl = tf.placeholder(tf.float32,
                                       shape=[None, None, input_size],
                                       name='inputs')
            inputs_seq_len_pl = tf.placeholder(tf.int32,
                                               shape=[None],
                                               name='inputs_seq_len')
            keep_prob_input_pl = tf.placeholder(tf.float32,
                                                name='keep_prob_input')
            keep_prob_hidden_pl = tf.placeholder(tf.float32,
                                                 name='keep_prob_hidden')
            keep_prob_output_pl = tf.placeholder(tf.float32,
                                                 name='keep_prob_output')

            # operation for forward computation
            # Multi-task encoders return four values (main and sub outputs
            # plus their final states); all other encoders return two.
            if encoder_type in ['multitask_blstm', 'multitask_lstm']:
                hidden_states_op, final_state_op, hidden_states_sub_op, final_state_sub_op = encoder(
                    inputs=inputs_pl,
                    inputs_seq_len=inputs_seq_len_pl,
                    keep_prob_input=keep_prob_input_pl,
                    keep_prob_hidden=keep_prob_hidden_pl,
                    keep_prob_output=keep_prob_output_pl)
            else:
                hidden_states_op, final_state_op = encoder(
                    inputs=inputs_pl,
                    inputs_seq_len=inputs_seq_len_pl,
                    keep_prob_input=keep_prob_input_pl,
                    keep_prob_hidden=keep_prob_hidden_pl,
                    keep_prob_output=keep_prob_output_pl)

            # Add the variable initializer operation
            init_op = tf.global_variables_initializer()

            # Count total parameters
            parameters_dict, total_parameters = count_total_parameters(
                tf.trainable_variables())
            for parameter_name in sorted(parameters_dict.keys()):
                print("%s %d" %
                      (parameter_name, parameters_dict[parameter_name]))
            print("Total %d variables, %s M parameters" %
                  (len(parameters_dict.keys()),
                   "{:,}".format(total_parameters / 1000000)))

            # Make feed dict
            feed_dict = {
                inputs_pl: inputs,
                inputs_seq_len_pl: inputs_seq_len,
                keep_prob_input_pl: 0.9,
                keep_prob_hidden_pl: 0.9,
                keep_prob_output_pl: 1.0
            }

            with tf.Session() as sess:
                # Initialize parameters
                sess.run(init_op)

                # Make prediction
                if encoder_type in ['multitask_blstm', 'multitask_lstm']:
                    hidden_states, final_state, hidden_states_sub, final_state_sub = sess.run(
                        [hidden_states_op, final_state_op, hidden_states_sub_op, final_state_sub_op], feed_dict=feed_dict)
                elif encoder_type in ['vgg_wang', 'resnet_wang', 'cnn_zhang']:
                    # Only the hidden states are fetched for these encoders.
                    hidden_states = sess.run(
                        hidden_states_op, feed_dict=feed_dict)
                else:
                    hidden_states, final_state = sess.run(
                        [hidden_states_op, final_state_op], feed_dict=feed_dict)

                if encoder_type in ['blstm', 'bgru', 'vgg_blstm', 'multitask_blstm']:
                    self.assertEqual(
                        (batch_size, frame_num, encoder.num_units * 2), hidden_states.shape)

                    # LSTM final states expose .c and .h; the 'bgru' branch
                    # compares the state arrays directly.
                    if encoder_type in ['blstm', 'vgg_blstm', 'multitask_blstm']:
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[0].c.shape)
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[0].h.shape)
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[1].c.shape)
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[1].h.shape)

                        if encoder_type == 'multitask_blstm':
                            self.assertEqual(
                                (batch_size, frame_num, encoder.num_units * 2), hidden_states_sub.shape)
                            self.assertEqual(
                                (batch_size, encoder.num_units), final_state_sub[0].c.shape)
                            self.assertEqual(
                                (batch_size, encoder.num_units), final_state_sub[0].h.shape)
                            self.assertEqual(
                                (batch_size, encoder.num_units), final_state_sub[1].c.shape)
                            self.assertEqual(
                                (batch_size, encoder.num_units), final_state_sub[1].h.shape)
                    else:
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[0].shape)
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[1].shape)

                # 'multitask_lstm' must be listed here; otherwise it falls
                # through every branch and its nested checks never run.
                elif encoder_type in ['lstm', 'gru', 'vgg_lstm', 'multitask_lstm']:
                    self.assertEqual(
                        (batch_size, frame_num, encoder.num_units), hidden_states.shape)

                    if encoder_type in ['lstm', 'vgg_lstm', 'multitask_lstm']:
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[0].c.shape)
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[0].h.shape)

                        if encoder_type == 'multitask_lstm':
                            self.assertEqual(
                                (batch_size, frame_num, encoder.num_units), hidden_states_sub.shape)
                            self.assertEqual(
                                (batch_size, encoder.num_units), final_state_sub[0].c.shape)
                            self.assertEqual(
                                (batch_size, encoder.num_units), final_state_sub[0].h.shape)
                    else:
                        self.assertEqual(
                            (batch_size, encoder.num_units), final_state[0].shape)

                # These encoders are checked against a
                # (frame_num, batch_size, num_classes) output shape.
                elif encoder_type in ['vgg_wang', 'resnet_wang', 'cnn_zhang']:
                    self.assertEqual(
                        (frame_num, batch_size, encoder.num_classes), hidden_states.shape)
    def __init__(self,
                 encoder_type,
                 input_size,
                 num_units,
                 num_layers,
                 num_classes,
                 lstm_impl='LSTMBlockCell',
                 use_peephole=True,
                 splice=1,
                 num_stack=1,
                 parameter_init=0.1,
                 clip_grad_norm=None,
                 clip_activation=None,
                 num_proj=None,
                 weight_decay=0.0,
                 bottleneck_dim=None,
                 time_major=True):
        """Store the CTC model configuration and build the encoder.

        Args:
            encoder_type: Encoder name passed to `load`; must match one of
                the branches below.
            input_size: Input feature size; must be divisible by 3.
            num_units: Forwarded to the recurrent encoders as ``num_units``.
            num_layers: Forwarded to the recurrent encoders as ``num_layers``.
            num_classes: Number of output classes; stored with one extra
                class for the blank label.
            lstm_impl: Forwarded to the LSTM-based encoders.
            use_peephole: Forwarded to the LSTM-based encoders.
            splice: Must be odd; forwarded to the encoders that take
                ``input_size``.
            num_stack: Stored on the instance and forwarded to the encoders
                that take ``input_size``.
            parameter_init: Forwarded to the encoder as ``parameter_init``.
            clip_grad_norm: If not ``None``, must be greater than 0.
            clip_activation: Forwarded to the LSTM-based encoders.
            num_proj: ``None`` or 0 is stored as ``None``; any other value
                is stored as ``int(num_proj)``.
            weight_decay: Must not be negative.
            bottleneck_dim: Stored on the instance.
            time_major: Stored on the instance and forwarded to the encoder.

        Raises:
            AssertionError: If ``input_size``, ``splice``,
                ``clip_grad_norm`` or ``weight_decay`` violates the
                constraints above.
            NotImplementedError: If ``encoder_type`` is not supported.
        """

        super(CTC, self).__init__()

        assert input_size % 3 == 0, 'input_size must be divisible by 3 (+ delta, acceleration coefficients).'
        assert splice % 2 == 1, 'splice must be the odd number'
        if clip_grad_norm is not None:
            assert float(
                clip_grad_norm) > 0, 'clip_grad_norm must be larger than 0.'
        assert float(
            weight_decay) >= 0, 'weight_decay must not be a negative value.'

        # Encoder setting
        self.encoder_type = encoder_type
        self.input_size = input_size
        self.splice = splice
        self.num_stack = num_stack
        self.num_units = num_units
        # Check for None before converting: int(None) raises TypeError, which
        # would make the default num_proj=None unusable.
        if num_proj is None or int(num_proj) == 0:
            self.num_proj = None
        else:
            self.num_proj = int(num_proj)
        self.num_layers = num_layers
        self.bottleneck_dim = bottleneck_dim
        self.num_classes = num_classes + 1  # + blank
        self.lstm_impl = lstm_impl
        self.use_peephole = use_peephole

        # Regularization
        self.parameter_init = parameter_init
        self.clip_grad_norm = clip_grad_norm
        self.clip_activation = clip_activation
        self.weight_decay = weight_decay

        # Summaries for TensorBoard
        self.summaries_train = []
        self.summaries_dev = []

        # Placeholders
        self.inputs_pl_list = []
        self.labels_pl_list = []
        self.inputs_seq_len_pl_list = []
        self.keep_prob_pl_list = []

        self.time_major = time_major
        self.name = encoder_type + '_ctc'

        if encoder_type in ['blstm', 'lstm']:
            self.encoder = load(encoder_type)(
                num_units=num_units,
                num_proj=self.num_proj,
                num_layers=num_layers,
                lstm_impl=lstm_impl,
                use_peephole=use_peephole,
                parameter_init=parameter_init,
                clip_activation=clip_activation,
                time_major=time_major)

        elif encoder_type in ['vgg_blstm', 'vgg_lstm', 'cldnn_wang']:
            self.encoder = load(encoder_type)(
                input_size=input_size,
                splice=splice,
                num_stack=num_stack,
                num_units=num_units,
                num_proj=self.num_proj,
                num_layers=num_layers,
                lstm_impl=lstm_impl,
                use_peephole=use_peephole,
                parameter_init=parameter_init,
                clip_activation=clip_activation,
                time_major=time_major)

        elif encoder_type in ['bgru', 'gru']:
            self.encoder = load(encoder_type)(
                num_units=num_units,
                num_layers=num_layers,
                parameter_init=parameter_init,
                time_major=time_major)

        elif encoder_type in ['vgg_wang', 'resnet_wang', 'cnn_zhang']:
            self.encoder = load(encoder_type)(
                input_size=input_size,
                splice=splice,
                num_stack=num_stack,
                parameter_init=parameter_init,
                time_major=time_major)

        elif encoder_type in ['student_cnn_ctc', 'student_cnn_compact_ctc']:
            self.encoder = load(encoder_type)(
                input_size=input_size,
                splice=splice,
                num_stack=num_stack,
                parameter_init=parameter_init,
                time_major=time_major)

        else:
            raise NotImplementedError
    def __init__(self,
                 encoder_type,
                 input_size,
                 num_units,
                 num_layers,
                 num_classes,
                 lstm_impl='LSTMBlockCell',
                 use_peephole=True,
                 splice=1,
                 parameter_init=0.1,
                 clip_grad=None,
                 clip_activation=None,
                 num_proj=None,
                 weight_decay=0.0,
                 bottleneck_dim=None):
        """Initialize the CTC model and instantiate the requested encoder.

        Args:
            encoder_type: encoder name handed to `load`; one of blstm, lstm,
                vgg_blstm, vgg_lstm, bgru, gru, vgg_wang, resnet_wang,
                cnn_zhang
            input_size: forwarded to the base class and to the VGG/CNN
                encoders
            num_units: forwarded to the recurrent encoders
            num_layers: forwarded to the recurrent encoders
            num_classes: forwarded as-is to the base class; the encoder
                receives `num_classes + 1`
            lstm_impl: forwarded to the base class and to the LSTM-based
                encoders
            use_peephole: forwarded to the LSTM-based encoders
            splice: forwarded to the base class and to the VGG/CNN encoders
            parameter_init: forwarded to every encoder
            clip_grad: forwarded to the base class
            clip_activation: forwarded to the LSTM-based encoders
            num_proj: forwarded to the LSTM-based encoders
            weight_decay: forwarded to the base class
            bottleneck_dim: forwarded to the recurrent encoders
        Raises:
            NotImplementedError: if `encoder_type` is not supported
        """
        super(CTC, self).__init__(
            input_size, splice, num_classes, lstm_impl,
            clip_grad, weight_decay)

        self.name = encoder_type + '_ctc'

        # Classify the encoder by the set of constructor arguments it takes
        is_plain_lstm = encoder_type in ('blstm', 'lstm')
        is_vgg_lstm = encoder_type in ('vgg_blstm', 'vgg_lstm')
        is_gru = encoder_type in ('bgru', 'gru')
        is_cnn = encoder_type in ('vgg_wang', 'resnet_wang', 'cnn_zhang')

        if not (is_plain_lstm or is_vgg_lstm or is_gru or is_cnn):
            raise NotImplementedError

        encoder_class = load(encoder_type)

        # Arguments shared by every encoder (+ 1 is for the blank label)
        encoder_kwargs = {
            'num_classes': num_classes + 1,
            'parameter_init': parameter_init,
        }

        # Encoders with a convolutional front-end need the input layout
        if is_vgg_lstm or is_cnn:
            encoder_kwargs['input_size'] = input_size
            encoder_kwargs['splice'] = splice

        # Every encoder with a recurrent part takes its size and bottleneck
        if not is_cnn:
            encoder_kwargs['num_units'] = num_units
            encoder_kwargs['num_layers'] = num_layers
            encoder_kwargs['bottleneck_dim'] = bottleneck_dim

        # LSTM-specific options
        if is_plain_lstm or is_vgg_lstm:
            encoder_kwargs['lstm_impl'] = lstm_impl
            encoder_kwargs['use_peephole'] = use_peephole
            encoder_kwargs['clip_activation'] = clip_activation
            encoder_kwargs['num_proj'] = num_proj

        self.encoder = encoder_class(**encoder_kwargs)
Esempio n. 9
0
    def check(self, encoder_type, lstm_impl=None, time_major=False):
        """Build one encoder, run a forward pass and check the output shapes.

        Args:
            encoder_type: encoder name handed to `load`
            lstm_impl: forwarded as `lstm_impl` to the LSTM-based encoders
            time_major: forwarded to the encoder; when truthy, the main
                encoder outputs are transposed to batch-major before the
                shape assertions
        Raises:
            NotImplementedError: if `encoder_type` is not one of the encoder
                names handled below
        """
        # Encoder families, grouped by the constructor arguments they take
        lstm_types = ['blstm', 'lstm']
        gru_types = ['bgru', 'gru']
        conv_lstm_types = ['vgg_blstm', 'vgg_lstm', 'cldnn_wang']
        multitask_types = ['multitask_blstm', 'multitask_lstm']
        cnn_types = ['vgg_wang', 'resnet_wang', 'cnn_zhang']
        spliced_types = conv_lstm_types + cnn_types
        supported_types = (lstm_types + gru_types + conv_lstm_types +
                           multitask_types + cnn_types)

        # Families used when checking the output shapes
        bidirectional_types = [
            'blstm', 'bgru', 'vgg_blstm', 'multitask_blstm', 'cldnn_wang'
        ]
        unidirectional_types = ['lstm', 'gru', 'vgg_lstm', 'multitask_lstm']

        is_multitask = encoder_type in multitask_types

        banner = '=================================================='
        print(banner)
        print('  encoder_type: %s' % encoder_type)
        print('  lstm_impl: %s' % lstm_impl)
        print('  time_major: %s' % time_major)
        print(banner)

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 4
            if encoder_type in spliced_types:
                splice = 5
            else:
                splice = 1
            num_stack = 2
            inputs, _, inputs_seq_len = generate_data(
                label_type='character',
                model='ctc',
                batch_size=batch_size,
                num_stack=num_stack,
                splice=splice)
            frame_num, input_size = inputs[0].shape

            # Define model graph
            if encoder_type not in supported_types:
                raise NotImplementedError
            encoder_class = load(encoder_type)

            # Arguments taken by every encoder
            encoder_kwargs = {
                'parameter_init': 0.1,
                'time_major': time_major,
            }
            if encoder_type in spliced_types:
                # Convolutional front-ends get the per-frame feature size
                encoder_kwargs.update(
                    input_size=input_size // splice // num_stack,
                    splice=splice,
                    num_stack=num_stack)
            if encoder_type in lstm_types + conv_lstm_types + multitask_types:
                encoder_kwargs.update(
                    num_units=256,
                    num_proj=None,
                    lstm_impl=lstm_impl,
                    use_peephole=True,
                    clip_activation=5)
            if encoder_type in gru_types:
                encoder_kwargs.update(num_units=256)
            if is_multitask:
                encoder_kwargs.update(num_layers_main=5, num_layers_sub=3)
            elif encoder_type not in cnn_types:
                encoder_kwargs.update(num_layers=5)
            # NOTE: CNN encoders take no unit/layer counts
            # (topology is pre-defined)
            encoder = encoder_class(**encoder_kwargs)

            # Create placeholders
            inputs_pl = tf.placeholder(
                tf.float32, shape=[None, None, input_size], name='inputs')
            inputs_seq_len_pl = tf.placeholder(
                tf.int32, shape=[None], name='inputs_seq_len')
            keep_prob_pl = tf.placeholder(tf.float32, name='keep_prob')

            # operation for forward computation
            encoder_ops = encoder(
                inputs=inputs_pl,
                inputs_seq_len=inputs_seq_len_pl,
                keep_prob=keep_prob_pl,
                is_training=True)
            if is_multitask:
                (hidden_states_op, final_state_op,
                 hidden_states_sub_op, final_state_sub_op) = encoder_ops
            else:
                hidden_states_op, final_state_op = encoder_ops

            # Add the variable initializer operation
            init_op = tf.global_variables_initializer()

            # Count total parameters
            parameters_dict, total_parameters = count_total_parameters(
                tf.trainable_variables())
            for parameter_name in sorted(parameters_dict.keys()):
                parameter_count = parameters_dict[parameter_name]
                print("%s %d" % (parameter_name, parameter_count))
            total_in_millions = "{:,}".format(total_parameters / 1000000)
            print("Total %d variables, %s M parameters" %
                  (len(parameters_dict.keys()), total_in_millions))

            # Make feed dict
            feed_dict = {}
            feed_dict[inputs_pl] = inputs
            feed_dict[inputs_seq_len_pl] = inputs_seq_len
            feed_dict[keep_prob_pl] = 0.9

            def assert_lstm_state(state, num_units):
                # An LSTM state exposes a cell (c) and a hidden (h) array
                for part in ('c', 'h'):
                    self.assertEqual((batch_size, num_units),
                                     getattr(state, part).shape)

            with tf.Session() as sess:
                # Initialize parameters
                sess.run(init_op)

                # Make prediction
                if is_multitask:
                    fetches = [
                        hidden_states_op, final_state_op,
                        hidden_states_sub_op, final_state_sub_op
                    ]
                    (encoder_outputs, final_state,
                     hidden_states_sub, final_state_sub) = sess.run(
                         fetches, feed_dict=feed_dict)
                elif encoder_type in cnn_types:
                    # Only the hidden states are fetched for CNN encoders
                    encoder_outputs = sess.run(hidden_states_op,
                                               feed_dict=feed_dict)
                else:
                    encoder_outputs, final_state = sess.run(
                        [hidden_states_op, final_state_op],
                        feed_dict=feed_dict)

                # Convert always to batch-major
                # NOTE(review): only encoder_outputs is transposed;
                # hidden_states_sub is compared to a batch-major shape below
                # without conversion - confirm against the multitask
                # encoders when time_major is set.
                if time_major:
                    encoder_outputs = encoder_outputs.transpose(1, 0, 2)

                if encoder_type in bidirectional_types:
                    units = encoder.num_units
                    directions = (0, 1)

                    # The output width of cldnn_wang is not checked
                    if encoder_type != 'cldnn_wang':
                        self.assertEqual(
                            (batch_size, frame_num, units * 2),
                            encoder_outputs.shape)

                    if encoder_type == 'bgru':
                        # GRU final states are indexed directly (no c/h)
                        for direction in directions:
                            self.assertEqual((batch_size, units),
                                             final_state[direction].shape)
                    else:
                        for direction in directions:
                            assert_lstm_state(final_state[direction], units)

                        if encoder_type == 'multitask_blstm':
                            self.assertEqual(
                                (batch_size, frame_num, units * 2),
                                hidden_states_sub.shape)
                            for direction in directions:
                                assert_lstm_state(
                                    final_state_sub[direction], units)

                elif encoder_type in unidirectional_types:
                    units = encoder.num_units
                    self.assertEqual((batch_size, frame_num, units),
                                     encoder_outputs.shape)

                    if encoder_type == 'gru':
                        self.assertEqual((batch_size, units),
                                         final_state[0].shape)
                    else:
                        assert_lstm_state(final_state[0], units)

                        if encoder_type == 'multitask_lstm':
                            self.assertEqual((batch_size, frame_num, units),
                                             hidden_states_sub.shape)
                            assert_lstm_state(final_state_sub[0], units)

                elif encoder_type in cnn_types:
                    # Only rank and the leading (batch, frame) dims are
                    # checked
                    output_shape = encoder_outputs.shape
                    self.assertEqual(3, len(output_shape))
                    self.assertEqual((batch_size, frame_num),
                                     output_shape[:2])