Example 1
    def build_graph(self, inputs, training=None):
        image = tf.expand_dims(inputs[Keys.Image], axis=-1)  # add channel axis
        data_length = inputs[Keys.ImageLength]
        batch_size = tf.shape(image)[0]

        flowing_data = 1 - tf.cast(image, tf.float32) / 255.0  # Rescale and invert, so that black is now 1, white 0
        for conv, pool in zip(self.conv_layers, self.pool_layers):
            flowing_data = pool(conv(flowing_data))
            data_length = (data_length + 1) // 2  # 2x2 pooling

        subsampled_height, features = flowing_data.shape[2:4]
        flowing_data = tf.reshape(flowing_data, [batch_size, -1, subsampled_height * features])
        flowing_data = tf.transpose(flowing_data, [1, 0, 2])
        flowing_data = self.bilstm_layer(flowing_data)
        flowing_data = tf.transpose(flowing_data, [1, 0, 2])
        flowing_data = self.dropout_layer(flowing_data)

        blank_last_logits = self.logits_layer(flowing_data)
        blank_last_softmax = tf.nn.softmax(blank_last_logits)
        logits = tf.roll(blank_last_logits, shift=1, axis=-1)
        softmax = tf.roll(blank_last_softmax, shift=1, axis=-1)

        greedy_decoded = ctc_ops.ctc_greedy_decoder(
            inputs=tf.transpose(blank_last_logits, perm=[1, 0, 2]),
            sequence_length=tf.cast(keras.backend.flatten(data_length), "int32"),
        )[0][0]

        return {
            "blank_last_logits": blank_last_logits,
            "blank_last_softmax": blank_last_softmax,
            "logits": logits,
            "softmax": softmax,
            "decoded": tf.sparse.to_dense(greedy_decoded, default_value=-1),
            "out_len": data_length,
        }
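
TensorFlow's CTC ops place the blank label at the last class index, while the consumers of this graph evidently expect blank-first tensors; the tf.roll calls above convert between the two layouts. A minimal standalone sketch of that conversion, with toy numbers:

import tensorflow as tf

# Blank-last class scores [a, b, blank] ...
blank_last = tf.constant([[[0.1, 0.2, 0.7]]])
# ... rolled by +1 along the class axis become blank-first [blank, a, b].
blank_first = tf.roll(blank_last, shift=1, axis=-1)  # [[[0.7, 0.1, 0.2]]]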
Example 2
    def call(self, inputs, training=None):

        y_pred = tf.log(tf.transpose(inputs, perm=[1, 0, 2]) + 1e-8)
        #input_length = tf.to_int32(self.sample_out_size)
        input_length = K.ones_like(inputs[:, 0, 0],
                                   dtype='int32') * self.sample_out_size

        if self.greedy:
            (decoded,
             log_prob) = ctc.ctc_greedy_decoder(inputs=y_pred,
                                                sequence_length=input_length,
                                                merge_repeated=False)
        else:
            (decoded, log_prob) = ctc.ctc_beam_search_decoder(
                inputs=y_pred,
                sequence_length=input_length,
                beam_width=self.beam_width,
                top_paths=self.top_paths,
                merge_repeated=False)

        decoded_dense = [
            tf.sparse_to_dense(st.indices,
                               st.dense_shape,
                               st.values,
                               default_value=-1) for st in decoded
        ]
        dummy_vec = K.ones_like(inputs[:, :, 0],
                                dtype='int64') * self.dummy_word
        concat_dense = [
            K.concatenate((d, dummy_vec), axis=1)[:, :self.sample_out_size]
            for d in decoded_dense
        ]

        return concat_dense
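
A minimal, self-contained sketch of the fixed-width trick above (concatenate a dummy block, then slice back to sample_out_size), with hypothetical values:

import tensorflow as tf

# Pad a short decode [[3, 1, 4]] to a fixed width of 5 with a dummy word (99).
decoded = tf.constant([[3, 1, 4]], dtype=tf.int64)
dummy_vec = tf.fill([1, 5], tf.constant(99, dtype=tf.int64))
fixed = tf.concat((decoded, dummy_vec), axis=1)[:, :5]  # [[3, 1, 4, 99, 99]]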
Example 3
  def ctc_loss(self, outputs, targets, seq_len, num_classes, initial_learning_rate, keep_prob=0.8, scopeN="l1-ctc_loss"):
    """Implements the CTC loss.

    @param outputs: [batch, h, w, channels]
    @param targets: sparse tensor
    @param seq_len: the lengths of the input sequences [batch]
    @param num_classes: the number of classes
    @param initial_learning_rate: learning rate
    @param keep_prob: dropout keep probability; if None, no dropout is applied
    @param scopeN: the scope name

    @returns: list with [optimizer, cost, label error rate (inaccuracy), decoded output of the batch]
    """
    with tf.name_scope('Train'):
        with tf.variable_scope("ctc_loss-"+scopeN) as scope:
            W = tf.Variable(tf.truncated_normal([self.hidden*2,
                                                 num_classes],
                                                stddev=0.1))
            # Zero initialization
            b = tf.Variable(tf.constant(0., shape=[num_classes]))

        tf.summary.histogram('histogram-b-ctc', b)
        tf.summary.histogram('histogram-w-ctc', W)

        # Doing the affine projection
        logits = tf.matmul(outputs, W) + b

        if keep_prob is not None:
            logits = tf.nn.dropout(logits, keep_prob)

        # Reshaping back to the original shape
        logits = tf.reshape(logits, [self.width, self.batch_size, num_classes])    
        #logits =  tf.transpose(logits, [1,0,2])

        with tf.name_scope('CTC-loss'):
            loss = ctc_ops.ctc_loss(logits, targets, seq_len)
            cost = tf.reduce_mean(loss)
            
        with tf.name_scope('Optimizer'):
            if self.optimizer == "ADAM":
                optimizer = tf.train.AdamOptimizer(learning_rate=initial_learning_rate,name="AdamOptimizer").minimize(cost)
            elif self.optimizer == "RMSP":
                optimizer = tf.train.RMSPropOptimizer(learning_rate=initial_learning_rate, decay=self.decay, momentum=self.momentum).minimize(cost)
            else:
                raise Exception("model type not supported: {}".format(self.optimizer))
        
        with tf.name_scope('Prediction'):
            if self.ctc_decoder == 'greedy':
                decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_len)
            elif self.ctc_decoder == 'beam_search':
                decoded, log_prob = ctc_ops.ctc_beam_search_decoder(logits, seq_len)
            else:
                raise Exception("model type not supported: {}".format(self.ctc_decoder))

            # Inaccuracy: label error rate
            ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                                  targets))
    return optimizer, cost, ler, decoded
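
For reference, a standalone sketch of the TF1-style ctc_loss contract relied on above (written against tf.compat.v1 so it also runs under TF2): labels are an int32 SparseTensor and logits default to time-major [max_time, batch, num_classes].

import tensorflow as tf

# Labels for one batch element: the sequence [1, 2], as a sparse tensor.
labels = tf.sparse.SparseTensor(indices=[[0, 0], [0, 1]],
                                values=tf.constant([1, 2], tf.int32),
                                dense_shape=[1, 2])
logits = tf.random.normal([10, 1, 5])   # 10 frames, batch 1, 5 classes (blank = 4)
seq_len = tf.constant([10], tf.int32)
loss = tf.compat.v1.nn.ctc_loss(labels, logits, seq_len)  # shape [1]
cost = tf.reduce_mean(loss)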
Example 4
    def decode(self, predictions, seq_len, k):
        if self.ctc_decoder == 'greedy':
            decoded, log_prob = ctc_ops.ctc_greedy_decoder(predictions, seq_len)
        elif self.ctc_decoder == 'beam_search':
            decoded, log_prob = ctc_ops.ctc_beam_search_decoder(predictions, seq_len, top_paths=k)
        else:
            raise Exception("model type not supported: {}".format(self.ctc_decoder))

        return decoded
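
A self-contained sketch of the beam-search branch above, assuming the time-major [max_time, batch, num_classes] inputs that both decoders require:

import tensorflow as tf

predictions = tf.random.normal([50, 2, 29])  # 50 frames, batch 2, 29 classes
seq_len = tf.constant([50, 50], tf.int32)
decoded, log_prob = tf.nn.ctc_beam_search_decoder(predictions, seq_len, top_paths=3)
best_path = tf.sparse.to_dense(decoded[0], default_value=-1)  # densify top path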
Example 5
    def setup_decoder(self):
        with tf.name_scope("decode"):
            if self.beam_search_decoder == 'default':
                self.decoded, self.log_prob = ctc_ops.ctc_beam_search_decoder(
                    self.logits, self.seq_length, merge_repeated=False)
            elif self.beam_search_decoder == 'greedy':
                self.decoded, self.log_prob = ctc_ops.ctc_greedy_decoder(
                    self.logits, self.seq_length, merge_repeated=False)
            else:
                # note: self.decoded and self.log_prob remain unset in this case
                logging.warning("Invalid beam search decoder option selected!")
Example 6
    def ctc_decode(self,
                   y_pred,
                   input_length,
                   greedy=True,
                   beam_width=100,
                   top_paths=1,
                   merge_repeated=False):
        """Decodes the output of a softmax.
        Can use either greedy search (also known as best path)
        or a constrained dictionary search.
        # Arguments
            y_pred: tensor `(samples, time_steps, num_categories)`
                containing the prediction, or output of the softmax.
            input_length: tensor `(samples, )` containing the sequence length for
                each batch item in `y_pred`.
            greedy: perform much faster best-path search if `True`.
                This does not use a dictionary.
            beam_width: if `greedy` is `False`: a beam search decoder will be used
                with a beam of this width.
            top_paths: if `greedy` is `False`,
                how many of the most probable paths will be returned.
            merge_repeated: if `greedy` is `False`,
                merge repeated classes in the output beams.
        # Returns
            Tuple:
                List: if `greedy` is `True`, returns a list of one element that
                    contains the decoded sequence.
                    If `False`, returns the `top_paths` most probable
                    decoded sequences.
                    Important: blank labels are returned as `-1`.
                Tensor `(top_paths, )` that contains
                    the log probability of each decoded sequence.
        """
        _EPSILON = 1e-7
        y_pred = tf_math_ops.log(
            tf.transpose(y_pred, perm=[1, 0, 2]) + _EPSILON)
        input_length = tf.cast(input_length, tf.int32)

        if greedy:
            (decoded, log_prob) = ctc_ops.ctc_greedy_decoder(
                inputs=y_pred, sequence_length=input_length)
        else:
            (decoded, log_prob) = ctc_ops.ctc_beam_search_decoder(
                inputs=y_pred,
                sequence_length=input_length,
                beam_width=beam_width,
                top_paths=top_paths,
                merge_repeated=merge_repeated)

        decoded_dense = []
        for st in decoded:
            dense_tensor = tf.sparse.to_dense(st, default_value=-1)
            decoded_dense.append(dense_tensor)
        return decoded_dense, log_prob
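
A hypothetical end-to-end use of the same transform, with random softmax outputs standing in for a real model:

import tensorflow as tf

y_pred = tf.nn.softmax(tf.random.normal([2, 30, 10]))  # (samples, time, classes)
input_length = tf.constant([30, 25], tf.int32)
# As above: log-probabilities, batch-major -> time-major.
log_probs = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-7)
decoded, log_prob = tf.nn.ctc_greedy_decoder(log_probs, input_length)
dense = tf.sparse.to_dense(decoded[0], default_value=-1)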
Example 7
    def build_graph(self, inputs, training=None):
        params: ModelParams = self._params
        input_data = tf.cast(inputs["img"], tf.float32) / 255.0
        input_sequence_length = K.flatten(inputs["img_len"])
        shape = input_sequence_length, -1

        # if concat or conv_T layers are present, we need to pad the input to ensure that possible
        # up-sampling layers work properly
        require_padding = any([
            isinstance(l, (ConcatLayerParams, TransposedConv2DLayerParams))
            for l in params.layers
        ])
        if require_padding:
            s = self._params.compute_max_downscale_factor()
            padding = calculate_padding(input_data, s.to_tuple())
            padded = KL.Lambda(partial(pad, x_only=True),
                               name="padded_input")([input_data, padding])
            last_layer_output = padded
        else:
            last_layer_output = input_data

        layers_outputs_by_index = []
        for layer in self.layer_instances:
            layers_outputs_by_index.append(last_layer_output)
            if isinstance(layer.params, ConcatLayerParams):
                last_layer_output = layer(layers_outputs_by_index)
            else:
                last_layer_output = layer(last_layer_output)

        lstm_seq_len, lstm_num_features = self._params.compute_downscaled(
            shape)
        lstm_seq_len = K.cast(lstm_seq_len, "int32")

        last_layer_output = self.reshape(last_layer_output)
        blank_last_logits = self.logits(last_layer_output)
        blank_last_softmax = self.softmax(blank_last_logits)

        logits = tf.roll(blank_last_logits, shift=1, axis=-1)
        softmax = tf.nn.softmax(logits)

        greedy_decoded = ctc.ctc_greedy_decoder(
            inputs=array_ops.transpose(blank_last_logits, perm=[1, 0, 2]),
            sequence_length=tf.cast(K.flatten(lstm_seq_len), "int32"),
        )[0][0]

        return {
            "blank_last_logits": blank_last_logits,
            "blank_last_softmax": blank_last_softmax,
            "out_len": lstm_seq_len,
            "logits": logits,
            "softmax": softmax,
            "decoded":
            tf.sparse.to_dense(greedy_decoded, default_value=-1) + 1,
        }
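
The trailing "+ 1" above shifts the decoder output (blank-last label space) into the blank-first space of the rolled softmax; padding cells (-1) conveniently become 0. A toy check:

import tensorflow as tf

# Decoded labels 0 and 2 in blank-last space, dense shape [1, 3].
st = tf.sparse.SparseTensor(indices=[[0, 0], [0, 1]],
                            values=tf.constant([0, 2], tf.int64),
                            dense_shape=[1, 3])
shifted = tf.sparse.to_dense(st, default_value=-1) + 1  # [[1, 3, 0]]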
Example 8
def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
    """Decodes the output of a softmax.

    Can use either greedy search (also known as best path)
    or a constrained dictionary search.

    # Arguments
        y_pred: tensor `(samples, time_steps, num_categories)`
            containing the prediction, or output of the softmax.
        input_length: tensor `(samples, )` containing the sequence length for
            each batch item in `y_pred`.
        greedy: perform much faster best-path search if `True`.
            This does not use a dictionary.
        beam_width: if `greedy` is `False`: a beam search decoder will be used
            with a beam of this width.
        top_paths: if `greedy` is `False`,
            how many of the most probable paths will be returned.

    # Returns
        Tuple:
            List: if `greedy` is `True`, returns a list of one element that
                contains the decoded sequence.
                If `False`, returns the `top_paths` most probable
                decoded sequences.
                Important: blank labels are returned as `-1`.
            Tensor `(top_paths, )` that contains
                the log probability of each decoded sequence.
    """
    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
    input_length = tf.to_int32(input_length)

    if greedy:
        (decoded,
         log_prob) = ctc.ctc_greedy_decoder(inputs=y_pred,
                                            sequence_length=input_length,
                                            merge_repeated=False)
    else:
        (decoded,
         log_prob) = ctc.ctc_beam_search_decoder(inputs=y_pred,
                                                 sequence_length=input_length,
                                                 beam_width=beam_width,
                                                 top_paths=top_paths,
                                                 merge_repeated=False)

    decoded_dense = [
        tf.sparse_to_dense(st.indices,
                           st.dense_shape,
                           st.values,
                           default_value=-1) for st in decoded
    ]
    return (decoded_dense, log_prob)
Example 9
    def ctc_complete_analysis_lambda_func(args, **arguments):
        """
        Complete CTC analysis using Keras and TensorFlow
        WARNING: tf is required
        :param args:
            y_pred, labels, input_length, label_len
        :param arguments:
            greedy, beam_width, top_paths
        :return:
            ler = label error rate
        """

        y_pred, labels, input_length, label_len = args
        my_params = arguments

        assert (K.backend() == 'tensorflow')

        batch = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
        input_length = tf.to_int32(tf.squeeze(input_length))

        greedy = my_params['greedy']
        beam_width = my_params['beam_width']
        top_paths = my_params['top_paths']

        if greedy:
            (decoded,
             log_prob) = ctc.ctc_greedy_decoder(inputs=batch,
                                                sequence_length=input_length)
        else:
            (decoded, log_prob) = ctc.ctc_beam_search_decoder(
                inputs=batch,
                sequence_length=input_length,
                beam_width=beam_width,
                top_paths=top_paths)

        cast_decoded = tf.cast(decoded[0], tf.float32)

        sparse_y = K.ctc_label_dense_to_sparse(
            labels, tf.cast(tf.squeeze(label_len), tf.int32))
        ed_tensor = tf_edit_distance(cast_decoded, sparse_y, norm=True)
        ler_per_seq = Kreshape_To1D(ed_tensor)

        return K.cast(ler_per_seq, dtype='float32')
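
A standalone sketch of the label-error-rate computation (assuming tf_edit_distance wraps tf.edit_distance with normalization, as its name suggests):

import tensorflow as tf

truth = tf.sparse.SparseTensor([[0, 0], [0, 1], [0, 2]],
                               tf.constant([1, 2, 3], tf.int64), [1, 3])
hyp = tf.sparse.SparseTensor([[0, 0], [0, 1], [0, 2]],
                             tf.constant([1, 2, 4], tf.int64), [1, 3])
ler = tf.edit_distance(hyp, truth, normalize=True)  # one substitution -> [0.333]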
Example 10
def ctc_decode(y_pred, input_length, max_output_length):
    """
    Cut down from https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L4170

    Decodes the output of a softmax.
    Uses greedy (best path) search.

    # Arguments
        y_pred: tensor `(samples, time_steps, num_categories)`
            containing the prediction, or output of the softmax.
        input_length: tensor `(samples, )` containing the sequence length for
            each batch item in `y_pred`.
        max_output_length: int giving the max output sequence length

    # Returns
        List: list of one element that contains the decoded sequence.
    """
    y_pred = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + K.epsilon())
    input_length = tf.cast(tf.squeeze(input_length, axis=-1), tf.int32)

    (decoded, _) = ctc_ops.ctc_greedy_decoder(inputs=y_pred,
                                              sequence_length=input_length)

    sparse = decoded[0]
    decoded_dense = tf.sparse_to_dense(sparse.indices,
                                       sparse.dense_shape,
                                       sparse.values,
                                       default_value=-1)

    # Unfortunately, decoded_dense will have a different number of columns depending on the decodings.
    # We need to get it all into one standard shape, so pad if necessary.
    max_length = max_output_length + 2  # giving 2 extra characters for CTC leeway
    cols = tf.shape(decoded_dense)[-1]

    def pad():
        return tf.pad(decoded_dense, [[0, 0], [0, max_length - cols]],
                      constant_values=-1)

    def noop():
        return decoded_dense

    return tf.cond(tf.less(cols, max_length), pad, noop)
Example 11
    def wrap(inputs):
        logits, output_len = inputs
        outputs = {
            'blank_last_logits': logits,
            'out_len': output_len,
            'logits': tf.roll(logits, shift=1, axis=-1),
        }
        outputs['blank_last_softmax'] = tf.nn.softmax(
            outputs['blank_last_logits'], axis=-1)
        outputs['softmax'] = tf.nn.softmax(outputs['logits'])

        greedy_decoded = \
            ctc_ops.ctc_greedy_decoder(inputs=tf.transpose(outputs['blank_last_logits'], perm=[1, 0, 2]),
                                       sequence_length=tf.cast(K.flatten(outputs['out_len']),
                                                               'int32'))[0][0]
        greedy_decoded = tf.cast(greedy_decoded, 'int32', 'greedy_int32')
        outputs['decoded'] = tf.sparse.to_dense(
            greedy_decoded,
            default_value=tf.constant(-1, dtype=greedy_decoded.dtype)) + 1
        return outputs
Example 12
    def make_outputs(self, blank_last_softmax, lstm_seq_len, complete_outputs):
        softmax = tf.roll(blank_last_softmax, shift=1, axis=-1)

        greedy_decoded = ctc.ctc_greedy_decoder(
            inputs=tf.transpose(blank_last_softmax, perm=[1, 0, 2]),
            sequence_length=tf.cast(K.flatten(lstm_seq_len), "int32"),
        )[0][0]

        outputs = {
            "blank_last_logits": tf.math.log(blank_last_softmax),
            "blank_last_softmax": blank_last_softmax,
            "logits": tf.math.log(softmax),
            "softmax": softmax,
            "out_len": lstm_seq_len,
            "decoded": tf.sparse.to_dense(greedy_decoded, default_value=-1) + 1,
        }

        for i, voter_output in enumerate(complete_outputs):
            for k, v in voter_output.items():
                outputs[f"{k}_{i}"] = v

        return outputs
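
Note that tf.math.log(blank_last_softmax) recovers the logits only up to an additive per-frame constant (the log partition function), to which argmax and CTC decoding are invariant. A quick check:

import tensorflow as tf

x = tf.constant([[2.0, 1.0, 0.1]])
log_sm = tf.math.log(tf.nn.softmax(x))
delta = x - log_sm  # constant across the class axis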
Example 13
    # Reshaping back to the original shape
    logits = tf.reshape(logits, [batch_s, -1, num_classes])

    # Time major
    logits = tf.transpose(logits, (1, 0, 2))

    loss = ctc_ops.ctc_loss(targets, logits, seq_len)
    cost = tf.reduce_mean(loss)

    optimizer = tf.train.MomentumOptimizer(initial_learning_rate,
                                           0.9).minimize(cost)

    # Greedy decoding here; alternatively tf.nn.ctc_beam_search_decoder
    # is slower but usually gives better results
    decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_len)

    # Inaccuracy: label error rate
    ler = tf.reduce_mean(
        tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

    saver = tf.train.Saver()

with tf.Session(graph=graph) as session:
    # Initialize the weights and biases
    init_op = tf.global_variables_initializer()

    init_op.run()
    # saver.restore(session, './orange.ckpt')
    # print("Model restored.")
Example 14
def CheckpointTest():
    # input_tensor is the input audio data; as the earlier analysis showed, its shape is
    # [batch_size, amax_stepsize, n_input + (2 * n_input * n_context)], where batch_size is the batch length,
    # amax_stepsize is the number of time steps, and n_input + (2 * n_input * n_context) is the number of MFCC features.
    # batch_size is variable, so it is set to None; since the time length differs per batch, amax_stepsize is also None.
    input_tensor = tf.placeholder(tf.float32, [None, None, n_input + (2 * n_input * n_context)], name='input')
    # Use sparse_placeholder; will generate a SparseTensor, required by ctc_loss op.
    # targets holds the sparse tensor of the transcript corresponding to the audio data, hence sparse_placeholder
    targets = tf.sparse_placeholder(tf.int32, name='targets')
    # seq_length holds the time-step lengths of the current batch
    seq_length = tf.placeholder(tf.int32, [None], name='seq_length')
    # keep_dropout is the dropout keep probability
    keep_dropout = tf.placeholder(tf.float32)

    # logits is the non-normalized output/activations from the last layer.
    # logits will be input for the loss function.
    # nn_model is from the import statement in the load_model function
    logits = BiRNN_model(input_tensor, tf.to_int64(seq_length), n_input, n_context, words_size + 1, keep_dropout)

    loss = ctc_ops.ctc_loss(targets, logits, seq_length)
    # compute the loss with CTC
    avg_loss = tf.reduce_mean(loss)

    # optimizer
    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(avg_loss)

    # decode with the CTC decoder
    with tf.name_scope("decode"):
        decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_length, merge_repeated=True)

    # compute the edit distance
    with tf.name_scope("accuracy"):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
        # compute the label error rate (accuracy)
        ler = tf.reduce_mean(distance, name='label_error_rate')

    # number of training epochs
    epochs = 150
    # model save directory
    savedir = "saver/"
    # create the directory if it does not exist
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    # create the saver
    saver = tf.train.Saver(max_to_keep=1)
    # create the session
    with tf.Session() as sess:
        # initialize the variables
        sess.run(tf.global_variables_initializer())
        # restore from the latest checkpoint if one exists
        kpt = tf.train.latest_checkpoint(savedir)
        print("kpt:", kpt)
        startepo = 0
        if kpt is not None:
            saver.restore(sess, kpt)
            ind = kpt.find("-")
            startepo = int(kpt[ind + 1:])

        # the speech file to recognize
        wav_file = 'input.wav'

        source, source_lengths, sparse_labels = get_speech_file(wav_file, labels)
        feed2 = {input_tensor: source, targets: sparse_labels, seq_length: source_lengths, keep_dropout: 1.0}
        d, train_ler = sess.run([decoded[0], ler], feed_dict=feed2)
        dense_decoded = tf.sparse_tensor_to_dense(d, default_value=-1).eval(session=sess)
        if len(dense_decoded) > 0:
            decoded_str = ndarray_to_text_ch(dense_decoded[0], words)
            print('Decoded:  {}'.format(decoded_str))
Example 15
    def sparse_decoded(logits, output_seq_len):
        return ctc.ctc_greedy_decoder(
            inputs=array_ops.transpose(logits, perm=[1, 0, 2]),
            sequence_length=tf.cast(K.flatten(output_seq_len),
                                    'int32'))[0][0]
Example 16
    def create_network(self, inputs, input_seq_len, dropout_rate,
                       reuse_variables):
        network_proto = self.network_proto
        seq_len = input_seq_len
        batch_size = tf.shape(inputs)[0]
        gpu_enabled = self.gpu_available

        with tf.variable_scope("", reuse=reuse_variables) as scope:
            no_layers = len(network_proto.layers) == 0
            if not no_layers:
                has_conv_or_pool = network_proto.layers[
                    0].type != LayerParams.LSTM
            else:
                has_conv_or_pool = False

            if has_conv_or_pool:
                cnn_inputs = tf.reshape(
                    inputs, [batch_size, -1, network_proto.features, 1])
                shape = seq_len, network_proto.features

                layers = [cnn_inputs]
                last_num_filters = 1

                cnn_layer_index = 0
                for layer in [
                        l for l in network_proto.layers
                        if l.type != LayerParams.LSTM
                ]:
                    if layer.type == LayerParams.CONVOLUTIONAL:
                        layers.append(
                            tf.layers.conv2d(
                                name="conv2d" if cnn_layer_index == 0 else
                                "conv2d_{}".format(cnn_layer_index),
                                inputs=layers[-1],
                                filters=layer.filters,
                                kernel_size=(layer.kernel_size.x,
                                             layer.kernel_size.y),
                                padding="same",
                                activation=tf.nn.relu,
                                reuse=reuse_variables,
                            ))
                        cnn_layer_index += 1
                        last_num_filters = layer.filters
                    elif layer.type == LayerParams.MAX_POOLING:
                        layers.append(
                            tf.layers.max_pooling2d(
                                inputs=layers[-1],
                                pool_size=(layer.kernel_size.x,
                                           layer.kernel_size.y),
                                strides=(layer.stride.x, layer.stride.y),
                                padding="same",
                            ))

                        shape = (tf.to_int32(shape[0] // layer.stride.x),
                                 shape[1] // layer.stride.y)
                    else:
                        raise Exception("Unknown layer of type %s" %
                                        layer.type)

                lstm_seq_len, lstm_num_features = shape
                rnn_inputs = tf.reshape(layers[-1], [
                    batch_size,
                    tf.shape(layers[-1])[1],
                    last_num_filters * lstm_num_features
                ])

                lstm_num_features = last_num_filters * lstm_num_features
            else:
                rnn_inputs = inputs
                lstm_seq_len = seq_len
                lstm_num_features = network_proto.features

            lstm_layers = [
                l for l in network_proto.layers if l.type == LayerParams.LSTM
            ]

            # Time major inputs required for lstm
            time_major_inputs = tf.transpose(rnn_inputs, [1, 0, 2])

            if len(lstm_layers) > 0:
                for i, lstm in enumerate(lstm_layers):
                    if lstm.hidden_nodes != lstm_layers[0].hidden_nodes:
                        raise Exception(
                            "Currently all lstm layers must have an equal number of hidden nodes. "
                            "Got {} != {}".format(lstm.hidden_nodes,
                                                  lstm_layers[0].hidden_nodes))

                def cpu_cudnn_compatible_lstm_backend(time_major_inputs,
                                                      hidden_nodes):
                    def get_lstm_cell(num_hidden):
                        return cudnn_rnn.CudnnCompatibleLSTMCell(
                            num_hidden, reuse=reuse_variables)

                    fw, bw = zip(*[(get_lstm_cell(hidden_nodes),
                                    get_lstm_cell(hidden_nodes))
                                   for lstm in lstm_layers])

                    time_major_outputs, output_fw, output_bw \
                        = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(list(fw), list(bw), time_major_inputs,
                                                                         sequence_length=lstm_seq_len,
                                                                         dtype=tf.float32,
                                                                         scope="{}cudnn_lstm/stack_bidirectional_rnn".format(scope.name),
                                                                         time_major=True,
                                                                         )

                    return time_major_outputs

                def gpu_cudnn_lstm_backend(time_major_inputs, hidden_nodes):
                    # Create the Cudnn LSTM factory
                    rnn_lstm = cudnn_rnn.CudnnLSTM(
                        len(lstm_layers),
                        hidden_nodes,
                        direction='bidirectional',
                        kernel_initializer=tf.initializers.random_uniform(
                            -0.1, 0.1))

                    # TODO: Check if the models are loadable from the meta graph; maybe the next line fixes this
                    rnn_lstm._saveable_cls = cudnn_rnn.CudnnLSTMSaveable

                    # Apply the lstm to the inputs
                    time_major_outputs, (
                        output_h, output_c) = rnn_lstm(time_major_inputs)
                    return time_major_outputs

                if network_proto.backend.cudnn:
                    if gpu_enabled:
                        print("Using CUDNN LSTM backend on GPU")
                        time_major_outputs = gpu_cudnn_lstm_backend(
                            time_major_inputs, lstm_layers[0].hidden_nodes)
                    else:
                        print("Using CUDNN compatible LSTM backend on CPU")
                        time_major_outputs = cpu_cudnn_compatible_lstm_backend(
                            time_major_inputs, lstm_layers[0].hidden_nodes)
                else:
                    raise Exception("Only the cudnn-based backend is supported so far.")

                # Set the output size
                output_size = lstm_layers[-1].hidden_nodes * 2
            else:
                output_size = lstm_num_features
                time_major_outputs = time_major_inputs

            # flatten to (T * N, F) for matrix multiplication. This will be reversed later
            time_major_outputs = tf.reshape(
                time_major_outputs,
                [-1, time_major_outputs.shape.as_list()[2]])

            if network_proto.dropout > 0:
                time_major_outputs = tf.nn.dropout(time_major_outputs,
                                                   1 - dropout_rate,
                                                   name="dropout")

            # we need to turn off validate_shape so we can resize the variable on a codec resize
            w = tf.get_variable('W',
                                validate_shape=False,
                                initializer=tf.random_uniform(
                                    [output_size, network_proto.classes], -0.1,
                                    0.1))
            b = tf.get_variable('B',
                                validate_shape=False,
                                initializer=tf.constant(
                                    0., shape=[network_proto.classes]))

            # the output layer
            time_major_logits = tf.matmul(time_major_outputs, w) + b

            # reshape back
            time_major_logits = tf.reshape(
                time_major_logits,
                [-1, batch_size, tf.shape(w)[-1]],
                name="time_major_logits")

            time_major_softmax = tf.nn.softmax(time_major_logits, -1,
                                               "time_major_softmax")

            logits = tf.transpose(time_major_logits, [1, 0, 2], name="logits")
            softmax = tf.transpose(time_major_softmax, [1, 0, 2],
                                   name="softmax")

            lstm_seq_len = tf.identity(lstm_seq_len, "seq_len_out")

            # DECODER
            # ================================================================
            if network_proto.ctc == NetworkParams.CTC_DEFAULT:
                decoded, log_prob = ctc_ops.ctc_greedy_decoder(
                    time_major_logits,
                    lstm_seq_len,
                    merge_repeated=network_proto.ctc_merge_repeated)
            elif network_proto.ctc == NetworkParams.CTC_FUZZY:
                decoded, log_prob = self.fuzzy_module['decoder_op'](
                    softmax, lstm_seq_len)
            else:
                raise Exception(
                    "Unknown ctc model: '%s'. Supported are Default and Fuzzy"
                    % network_proto.ctc)

            decoded = decoded[0]
            sparse_decoded = (
                tf.identity(decoded.indices, name="decoded_indices"),
                tf.identity(decoded.values, name="decoded_values"),
                tf.identity(decoded.dense_shape, name="decoded_shape"),
            )

            return lstm_seq_len, time_major_logits, time_major_softmax, logits, softmax, decoded, sparse_decoded
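
The output projection above uses the common flatten-matmul-reshape pattern, applying one weight matrix across all time steps at once; a standalone sketch:

import tensorflow as tf

T, N, F, C = 10, 2, 8, 5                 # time, batch, features, classes
x = tf.random.normal([T, N, F])          # time-major LSTM outputs
w = tf.random.normal([F, C])
b = tf.zeros([C])
flat = tf.reshape(x, [-1, F])            # (T * N, F)
logits = tf.reshape(tf.matmul(flat, w) + b, [T, N, C])  # back to time-major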
Example 17
    def call(self, inputs, training=None):
        if training is None:
            training = K.learning_phase()

        batch_size = tf.shape(inputs['img_len'])[0]
        max_lstm_seq_len = self._params.compute_downscaled(
            tf.shape(inputs['img'])[1])
        # only pass folds to selected folds
        if 'fold_id' in inputs:
            # Training/Validation graph
            def training_step():
                tf.debugging.assert_greater_equal(inputs['fold_id'], 0)
                complete_outputs = [
                    self.fold_graphs[i](inputs)
                    for i in range(len(self.fold_graphs))
                ]

                lstm_seq_len = complete_outputs[0][
                    'out_len']  # is the same for all children
                softmax_outputs = tf.stack(
                    [out['blank_last_softmax'] for out in complete_outputs],
                    axis=0)

                # Training: Mask out network that does not contribute to a sample to generate strong voters
                if not self._params.no_masking_out_during_training:
                    mask = [
                        tf.not_equal(i, inputs['fold_id'])
                        for i in range(len(self.fold_graphs))
                    ]
                    softmax_outputs *= tf.cast(tf.expand_dims(mask, axis=-1),
                                               dtype='float32')
                    blank_last_softmax = tf.reduce_sum(
                        softmax_outputs, axis=0) / (
                            len(self.fold_graphs) - 1
                        )  # only n - 1 since one voter is 0
                else:
                    # In this case, training behaves similar to prediction
                    blank_last_softmax = tf.reduce_mean(softmax_outputs,
                                                        axis=0)
                return blank_last_softmax, lstm_seq_len, complete_outputs

            def validation_step():
                # every dummy output has max length; to get the actual output length, use reduce_min
                def gen_empty_output(bs):
                    empty = tf.zeros(
                        shape=[bs, max_lstm_seq_len, self._params.classes],
                        dtype='float32')
                    return {
                        'blank_last_logits':
                        empty,
                        'blank_last_softmax':
                        empty,
                        'out_len':
                        tf.repeat(max_lstm_seq_len, repeats=bs),
                        'logits':
                        empty,
                        'softmax':
                        empty,
                        'decoded':
                        tf.zeros(shape=[bs, max_lstm_seq_len], dtype='int64'),
                    }

                empty_output = gen_empty_output(1)

                # Validation: compute the output of each graph, but only for its own partition.
                # Per sample this yields one CER, which is then used e.g. for early stopping.
                def apply_single_model(batch):
                    batch = batch[
                        'out_len']  # take any key; every entry holds the batch index
                    single_batch_data = {
                        k: [tf.gather(v, batch)]
                        for k, v in inputs.items()
                    }
                    complete_outputs = [
                        tf.cond(tf.equal(i, inputs['fold_id'][batch]),
                                lambda: self.fold_graphs[i](single_batch_data),
                                lambda: empty_output)
                        for i in range(len(self.fold_graphs))
                    ]
                    outputs = {
                        k: tf.gather(
                            tf.stack([out[k] for out in complete_outputs]),
                            inputs['fold_id'][batch][0])[0]
                        for k in empty_output.keys() if k != 'decoded'
                    }
                    paddings = [
                        ([0, 0],
                         [0, max_lstm_seq_len - tf.shape(out['decoded'])[1]])
                        for out in complete_outputs
                    ]
                    outputs['decoded'] = tf.gather(
                        tf.stack([
                            tf.pad(out['decoded'],
                                   padding,
                                   'CONSTANT',
                                   constant_values=0)
                            for out, padding in zip(complete_outputs, paddings)
                        ]), inputs['fold_id'][batch][0])[0]
                    return outputs

                complete_outputs = tf.map_fn(apply_single_model, {
                    k: tf.range(batch_size, dtype=v.dtype)
                    for k, v in empty_output.items()
                },
                                             parallel_iterations=len(
                                                 self.fold_graphs),
                                             back_prop=False)
                return complete_outputs[
                    'blank_last_softmax'], complete_outputs['out_len'], [
                        complete_outputs
                    ] * len(self.fold_graphs)

            if isinstance(training, (bool, int)):
                blank_last_softmax, lstm_seq_len, complete_outputs = training_step(
                ) if training else validation_step()
            else:
                blank_last_softmax, lstm_seq_len, complete_outputs = tf.cond(
                    training, training_step, validation_step)
        else:
            # Prediction Graph: standard voting
            complete_outputs = [
                self.fold_graphs[i](inputs)
                for i in range(len(self.fold_graphs))
            ]

            lstm_seq_len = complete_outputs[0][
                'out_len']  # is the same for all children
            softmax_outputs = tf.stack(
                [out['blank_last_softmax'] for out in complete_outputs],
                axis=0)

            blank_last_softmax = tf.reduce_mean(softmax_outputs, axis=0)

        softmax = tf.roll(blank_last_softmax, shift=1, axis=-1)

        greedy_decoded = ctc.ctc_greedy_decoder(
            inputs=tf.transpose(blank_last_softmax, perm=[1, 0, 2]),
            sequence_length=tf.cast(K.flatten(lstm_seq_len), 'int32'))[0][0]

        outputs = {
            'blank_last_logits': tf.math.log(blank_last_softmax),
            'blank_last_softmax': blank_last_softmax,
            'logits': tf.math.log(softmax),
            'softmax': softmax,
            "out_len": lstm_seq_len,
            'decoded':
            tf.sparse.to_dense(greedy_decoded, default_value=-1) + 1,
        }

        for i, voter_output in enumerate(complete_outputs):
            for k, v in voter_output.items():
                outputs[f"{k}_{i}"] = v

        return outputs
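
A self-contained sketch of the prediction-time voting used above: average the per-fold blank-last softmax outputs, then roll the blank to index 0.

import tensorflow as tf

# 5 folds, batch 2, 30 frames, 10 classes (blank last).
fold_softmaxes = tf.nn.softmax(tf.random.normal([5, 2, 30, 10]), axis=-1)
blank_last_softmax = tf.reduce_mean(fold_softmaxes, axis=0)  # ensemble average
softmax = tf.roll(blank_last_softmax, shift=1, axis=-1)      # blank-first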
Example 18
    outputs, last_state = tf.nn.dynamic_rnn(cell, inputList,
                                            seqLengths, initial,
                                            dtype=tf.float32, scope='rnn')
    
    outputs = tf.reshape(outputs, (-1, nHidden))
    logits = tf.matmul(outputs, W) + b
    
    logits = tf.reshape(logits, (batchSize, -1, nClasses))
    logits = tf.transpose(logits, [1,0,2])
    
    ####Optimizing
    loss = tf.reduce_mean(ctc.ctc_loss(logits, targetY, seqLengths))
    optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

    ####Evaluating
    predictions = tf.to_int32(ctc.ctc_greedy_decoder(logits, seqLengths)[0][0])

    err = tf.edit_distance(predictions, targetY, normalize=True)
    err.set_shape([None])
    err = tf.reduce_mean(err, name='error')

####Run session
with tf.Session(graph=graph) as session:
    print('Initializing')
    tf.initialize_all_variables().run()
    for epoch in range(nEpochs):
        print('Epoch', epoch+1, '...')
        if epoch % 10 == 0:
            print('Saving Graph')
            tf.train.Saver().save(session, "/home/zhihaol/807/model.ckpt")
            tf.train.write_graph(session.graph_def, "/home/zhihaol/807/", "model_graph.pbtxt", True)
Example 19
    def from_proto(network_proto):
        reuse_variables = False
        intra_threads = network_proto.backend.num_intra_threads
        inter_threads = network_proto.backend.num_inter_threads

        # load fuzzy ctc module if available
        if len(network_proto.backend.fuzzy_ctc_library_path
               ) > 0 and network_proto.ctc == NetworkParams.CTC_FUZZY:
            from calamari_ocr.ocr.backends.tensorflow_backend.tensorflow_fuzzy_ctc_loader import load as load_fuzzy
            fuzzy_module = load_fuzzy(
                network_proto.backend.fuzzy_ctc_library_path)
        else:
            fuzzy_module = None

        graph = tf.Graph()
        with graph.as_default():
            tf.set_random_seed(network_proto.backend.random_seed)
            session = tf.Session(
                graph=graph,
                config=tf.ConfigProto(
                    intra_op_parallelism_threads=intra_threads,
                    inter_op_parallelism_threads=inter_threads,
                ))
            gpu_enabled = False
            for d in session.list_devices():
                if d.device_type == "GPU":
                    gpu_enabled = True
                    break

            inputs = tf.placeholder(tf.float32,
                                    shape=(None, None, network_proto.features),
                                    name="inputs")
            batch_size = tf.shape(inputs)[0]
            seq_len = tf.placeholder(tf.int32, shape=(None, ), name="seq_len")
            targets = tf.sparse_placeholder(tf.int32,
                                            shape=(None, None),
                                            name="targets")
            dropout_rate = tf.placeholder(tf.float32,
                                          shape=(),
                                          name="dropout_rate")

            with tf.variable_scope("", reuse=reuse_variables) as scope:
                no_layers = len(network_proto.layers) == 0
                if not no_layers:
                    has_conv_or_pool = network_proto.layers[
                        0].type != LayerParams.LSTM
                else:
                    has_conv_or_pool = False

                if has_conv_or_pool:
                    cnn_inputs = tf.reshape(
                        inputs, [batch_size, -1, network_proto.features, 1])
                    shape = seq_len, network_proto.features

                    layers = [cnn_inputs]
                    last_num_filters = 1

                    for layer in [
                            l for l in network_proto.layers
                            if l.type != LayerParams.LSTM
                    ]:
                        if layer.type == LayerParams.CONVOLUTIONAL:
                            layers.append(
                                tf.layers.conv2d(
                                    inputs=layers[-1],
                                    filters=layer.filters,
                                    kernel_size=(layer.kernel_size.x,
                                                 layer.kernel_size.y),
                                    padding="same",
                                    activation=tf.nn.relu,
                                ))
                            last_num_filters = layer.filters
                        elif layer.type == LayerParams.MAX_POOLING:
                            layers.append(
                                tf.layers.max_pooling2d(
                                    inputs=layers[-1],
                                    pool_size=(layer.kernel_size.x,
                                               layer.kernel_size.y),
                                    strides=(layer.stride.x, layer.stride.y),
                                    padding="same",
                                ))

                            shape = (tf.to_int32(shape[0] // layer.stride.x),
                                     shape[1] // layer.stride.y)
                        else:
                            raise Exception("Unknown layer of type %s" %
                                            layer.type)

                    lstm_seq_len, lstm_num_features = shape
                    rnn_inputs = tf.reshape(layers[-1], [
                        batch_size,
                        tf.shape(layers[-1])[1],
                        last_num_filters * lstm_num_features
                    ])

                    lstm_num_features = last_num_filters * lstm_num_features
                else:
                    rnn_inputs = inputs
                    lstm_seq_len = seq_len
                    lstm_num_features = network_proto.features

                lstm_layers = [
                    l for l in network_proto.layers
                    if l.type == LayerParams.LSTM
                ]

                # Time major inputs required for lstm
                time_major_inputs = tf.transpose(rnn_inputs, [1, 0, 2])

                if len(lstm_layers) > 0:
                    for i, lstm in enumerate(lstm_layers):
                        if lstm.hidden_nodes != lstm_layers[0].hidden_nodes:
                            raise Exception(
                                "Currently all lstm layers must have an equal number of hidden nodes. "
                                "Got {} != {}".format(
                                    lstm.hidden_nodes,
                                    lstm_layers[0].hidden_nodes))

                    def cpu_cudnn_compatible_lstm_backend(
                            time_major_inputs, hidden_nodes):
                        def get_lstm_cell(num_hidden):
                            return cudnn_rnn.CudnnCompatibleLSTMCell(
                                num_hidden, reuse=reuse_variables)

                        fw, bw = zip(*[(get_lstm_cell(hidden_nodes),
                                        get_lstm_cell(hidden_nodes))
                                       for lstm in lstm_layers])

                        time_major_outputs, output_fw, output_bw \
                            = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(list(fw), list(bw), time_major_inputs,
                                                                             sequence_length=lstm_seq_len,
                                                                             dtype=tf.float32,
                                                                             scope="{}cudnn_lstm/stack_bidirectional_rnn".format(scope.name),
                                                                             time_major=True,
                                                                             )

                        return time_major_outputs

                    def gpu_cudnn_lstm_backend(time_major_inputs,
                                               hidden_nodes):
                        # Create the Cudnn LSTM factory
                        rnn_lstm = cudnn_rnn.CudnnLSTM(
                            len(lstm_layers),
                            hidden_nodes,
                            direction='bidirectional',
                            kernel_initializer=tf.initializers.random_uniform(
                                -0.1, 0.1))

                        # TODO: Check if the models are loadable from the meta graph; maybe the next line fixes this
                        rnn_lstm._saveable_cls = cudnn_rnn.CudnnLSTMSaveable

                        # Apply the lstm to the inputs
                        time_major_outputs, (
                            output_h, output_c) = rnn_lstm(time_major_inputs)
                        return time_major_outputs

                    if network_proto.backend.cudnn:
                        if gpu_enabled:
                            print("Using CUDNN LSTM backend on GPU")
                            time_major_outputs = gpu_cudnn_lstm_backend(
                                time_major_inputs, lstm_layers[0].hidden_nodes)
                        else:
                            print("Using CUDNN compatible LSTM backend on CPU")
                            time_major_outputs = cpu_cudnn_compatible_lstm_backend(
                                time_major_inputs, lstm_layers[0].hidden_nodes)
                    else:
                        raise Exception(
                            "Only the cudnn-based backend is supported so far.")

                    # Set the output size
                    output_size = lstm_layers[-1].hidden_nodes * 2
                else:
                    output_size = lstm_num_features
                    time_major_outputs = time_major_inputs

                # flatten to (T * N, F) for matrix multiplication. This will be reversed later
                time_major_outputs = tf.reshape(
                    time_major_outputs,
                    [-1, time_major_outputs.shape.as_list()[2]])

                if network_proto.dropout > 0:
                    time_major_outputs = tf.nn.dropout(time_major_outputs,
                                                       1 - dropout_rate,
                                                       name="dropout")

                # we need to turn off validate_shape so we can resize the variable on a codec resize
                W = tf.get_variable('W',
                                    validate_shape=False,
                                    initializer=tf.random_uniform(
                                        [output_size, network_proto.classes],
                                        -0.1, 0.1))
                b = tf.get_variable('B',
                                    validate_shape=False,
                                    initializer=tf.constant(
                                        0., shape=[network_proto.classes]))

                # the output layer
                time_major_logits = tf.matmul(time_major_outputs, W) + b

                # reshape back
                time_major_logits = tf.reshape(
                    time_major_logits,
                    [-1, batch_size, tf.shape(W)[-1]],
                    name="time_major_logits")

                time_major_softmax = tf.nn.softmax(time_major_logits, -1,
                                                   "time_major_softmax")

                logits = tf.transpose(time_major_logits, [1, 0, 2],
                                      name="logits")
                softmax = tf.transpose(time_major_softmax, [1, 0, 2],
                                       name="softmax")

                # ctc predictions
                # Note for codec changes: the codec size is derived upon creation, therefore the ctc ops must be
                # created using the true codec size (the W/B matrices may, however, change their shape during
                # loading/codec changes to match the true codec size).
                if network_proto.ctc == NetworkParams.CTC_DEFAULT:
                    loss = ctc_ops.ctc_loss(
                        targets,
                        time_major_logits,
                        lstm_seq_len,
                        time_major=True,
                        ctc_merge_repeated=network_proto.ctc_merge_repeated,
                        ignore_longer_outputs_than_inputs=True)
                    decoded, log_prob = ctc_ops.ctc_greedy_decoder(
                        time_major_logits,
                        lstm_seq_len,
                        merge_repeated=network_proto.ctc_merge_repeated)
                    # decoded, log_prob = ctc_ops.ctc_beam_search_decoder(time_major_logits, lstm_seq_len, merge_repeated=model_settings["merge_repeated"])
                elif network_proto.ctc == NetworkParams.CTC_FUZZY:
                    loss, deltas = fuzzy_module['module'].fuzzy_ctc_loss(
                        logits,
                        targets.indices,
                        targets.values,
                        lstm_seq_len,
                        ignore_longer_outputs_than_inputs=True)
                    decoded, log_prob = fuzzy_module['decoder_op'](
                        softmax, lstm_seq_len)
                else:
                    raise Exception(
                        "Unknown ctc model: '%s'. Supported are Default and Fuzzy"
                        % network_proto.ctc)

                decoded = decoded[0]
                sparse_decoded = (
                    tf.identity(decoded.indices, name="decoded_indices"),
                    tf.identity(decoded.values, name="decoded_values"),
                    tf.identity(decoded.dense_shape, name="decoded_shape"),
                )

                cost = tf.reduce_mean(loss, name='cost')
                if network_proto.solver == NetworkParams.MOMENTUM_SOLVER:
                    optimizer = tf.train.MomentumOptimizer(
                        network_proto.learning_rate, network_proto.momentum)
                elif network_proto.solver == NetworkParams.ADAM_SOLVER:
                    optimizer = tf.train.AdamOptimizer(
                        network_proto.learning_rate)
                else:
                    raise Exception("Unknown solver of type '%s'" %
                                    network_proto.solver)

                gvs = optimizer.compute_gradients(cost)

                training_ops = []
                if network_proto.clipping_mode == NetworkParams.CLIP_NONE:
                    pass
                elif network_proto.clipping_mode == NetworkParams.CLIP_AUTO:
                    # exponentially follow the global average of gradients to set clipping
                    ema = tf.train.ExponentialMovingAverage(decay=0.999)

                    max_l2 = 1000
                    max_grads = 1000

                    grads = [grad for grad, _ in gvs]
                    l2 = tf.minimum(tf.global_norm(grads), max_l2)
                    l2_ema_op, l2_ema = ema.apply([l2]), ema.average(l2)
                    grads, _ = tf.clip_by_global_norm(
                        grads,
                        clip_norm=tf.minimum(l2_ema / max_l2 * max_grads,
                                             max_grads))
                    gvs = zip(grads, [var for _, var in gvs])
                    training_ops.append(l2_ema_op)
                elif network_proto.clipping_mode == NetworkParams.CLIP_CONSTANT:
                    clip = network_proto.clipping_constant
                    if clip <= 0:
                        raise Exception(
                            "Invalid clipping constant. Must be greater than 0, but got {}"
                            .format(clip))

                    grads = [grad for grad, _ in gvs]
                    grads, _ = tf.clip_by_global_norm(grads, clip_norm=clip)
                    gvs = zip(grads, [var for _, var in gvs])
                else:
                    raise Exception("Unsupported clipping mode {}".format(
                        network_proto.clipping_mode))

                training_ops.append(
                    optimizer.apply_gradients(gvs, name='grad_update_op'))
                train_op = tf.group(training_ops, name="train_op")

                ler = tf.reduce_mean(tf.edit_distance(
                    tf.cast(decoded, tf.int32), targets),
                                     name='ler')

                lstm_seq_len = tf.identity(lstm_seq_len, "seq_len_out")

                return TensorflowModel(network_proto, graph, session, inputs,
                                       seq_len, lstm_seq_len, targets,
                                       train_op, cost, ler, sparse_decoded,
                                       softmax, dropout_rate)
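
The CLIP_AUTO branch above tracks an exponential moving average of the global gradient norm and clips relative to it; a minimal eager re-implementation of the same idea (constants taken from the snippet):

import tensorflow as tf

max_l2, max_grads, decay = 1000.0, 1000.0, 0.999
l2_ema = 0.0  # running average of the capped global gradient norm
grads = [tf.random.normal([4, 4]), tf.random.normal([4])]
l2 = float(tf.minimum(tf.linalg.global_norm(grads), max_l2))
l2_ema = decay * l2_ema + (1.0 - decay) * l2
clip = min(l2_ema / max_l2 * max_grads, max_grads)
clipped, _ = tf.clip_by_global_norm(grads, clip_norm=clip)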
Example 20
beam_search_decoder = parser.get(config_header, 'beam_search_decoder')
# set up GPU if available
tf_device = str(parser.get(config_header, 'tf_device'))
# set up the max amount of simultaneous users
# this restricts GPU usage to the inverse of self.simultaneous_users_count
simultaneous_users_count = parser.getint(config_header,
                                         'simultaneous_users_count')

input_tensor = tf.placeholder(
    tf.float32, [None, None, n_input + (2 * n_input * n_context)],
    name='input')
seq_length = tf.placeholder(tf.int32, [None], name='seq_length')
logits, summary_op = BiRNN_model(conf_path, input_tensor,
                                 tf.to_int64(seq_length), n_input, n_context)
decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits,
                                               seq_length,
                                               merge_repeated=True)

saver = tf.train.Saver()
# create the session
sess = tf.Session()
saver.restore(sess, model_path)
print('Model restored')


def evaluate(filename='data/test/1_input.npy'):

    points = np.load(filename)

    # print("Points before pre",points.shape)
    NORM_ARGS = [
Example 21
    def call(self, inputs, **kwargs):
        params: ModelParams = self._params
        input_data = tf.cast(inputs['img'], tf.float32) / 255.0
        input_sequence_length = K.flatten(inputs['img_len'])
        shape = input_sequence_length, -1

        # if concat or conv_T layers are present, we need to pad the input to ensure that possible upsampling layers work properly
        has_concat = any([
            l.type == LayerType.Concat or l.type == LayerType.TransposedConv
            for l in params.layers
        ])
        if has_concat:
            sx, sy = 1, 1
            for layer_index, layer in enumerate(
                [l for l in params.layers if l.type == LayerType.MaxPooling]):
                sx *= layer.stride.x
                sy *= layer.stride.y
            padding = calculate_padding(input_data, (sx, sy))
            padded = KL.Lambda(pad, name='padded_input')([input_data, padding])
            last_layer_output = padded
        else:
            last_layer_output = input_data

        layers_by_index = []
        for (lp, layer) in self.conv_layers:
            layers_by_index.append(last_layer_output)
            if lp.type == LayerType.Convolutional:
                last_layer_output = layer(last_layer_output)
            elif lp.type == LayerType.Concat:
                last_layer_output = layer(
                    [layers_by_index[i] for i in lp.concat_indices])
            elif lp.type == LayerType.DilatedBlock:
                ds = K.shape(last_layer_output)
                ss = last_layer_output.shape
                dilated_layers, concat_layer = layer
                dilated_layers = [
                    dl(last_layer_output) for dl in dilated_layers
                ]
                last_layer_output = concat_layer(dilated_layers)
                last_layer_output = K.reshape(last_layer_output,
                                              [ds[0], ds[1], ss[2], ss[3]])
            elif lp.type == LayerType.TransposedConv:
                last_layer_output = layer(last_layer_output)
            elif lp.type == LayerType.MaxPooling:
                last_layer_output = layer(last_layer_output)
                shape = (shape[0] // lp.stride.x, shape[1] // lp.stride.y)
            else:
                raise Exception("Unknown layer of type %s" % lp.type)

        lstm_seq_len, lstm_num_features = shape
        lstm_seq_len = K.cast(lstm_seq_len, 'int32')
        ds = K.shape(last_layer_output)
        ss = last_layer_output.shape
        last_layer_output = K.reshape(last_layer_output,
                                      (ds[0], ds[1], ss[2] * ss[3]))

        if len(self.lstm_layers) > 0:
            for lstm_params, lstm_layer in self.lstm_layers:
                last_layer_output = lstm_layer(last_layer_output)

        if params.dropout > 0:
            last_layer_output = self.dropout(last_layer_output)

        blank_last_logits = self.logits(last_layer_output)
        blank_last_softmax = self.softmax(blank_last_logits)

        logits = tf.roll(blank_last_logits, shift=1, axis=-1)
        softmax = tf.nn.softmax(logits)

        greedy_decoded = ctc.ctc_greedy_decoder(
            inputs=array_ops.transpose(blank_last_logits, perm=[1, 0, 2]),
            sequence_length=tf.cast(K.flatten(lstm_seq_len), 'int32'))[0][0]

        return {
            'blank_last_logits': blank_last_logits,
            'blank_last_softmax': blank_last_softmax,
            'out_len': lstm_seq_len,
            'logits': logits,
            'softmax': softmax,
            'decoded': tf.sparse.to_dense(greedy_decoded, default_value=-1) + 1
        }