Ejemplo n.º 1
0
    def ctc_find_eos(y_true, y_pred):
        """Compute a CTC loss whose sequence lengths are located via the EOS token.

        Both lengths are the 1-based position of the first ``eos_index`` entry
        in each row. The last predicted step is overwritten with EOS so that a
        prediction length can always be found.

        Assumes the time axis of ``y_true`` and of ``y_pred`` both hold
        ``max_length`` entries -- TODO confirm against the caller.

        :param y_true: label indices, one row per sample.
        :param y_pred: per-step class scores; argmax is taken over the last axis.
        :return: ``K.ctc_batch_cost(...)`` plus ``self.beta(pred_lengths)``.
        """
        # From SO : TODO : var init, predlength objective
        # Convert y_pred from one-hot to label indices.
        y_pred_ind = K.argmax(y_pred, axis=-1)

        # Make sure y_pred has one end_of_sentence (to avoid errors).
        y_pred_end = K.concatenate(
            [y_pred_ind[:, :-1], eos_index * K.ones_like(y_pred_ind[:, -1:])],
            axis=1)

        # Descending weights (max_length, ..., 1) so the FIRST occurrence of
        # EOS wins the argmax below. The explicit negative step is required:
        # with the default step of +1 a range from max_length down to 0 cannot
        # yield these values.
        occurrence_weights = K.arange(start=max_length,
                                      stop=0,
                                      step=-1,
                                      dtype=K.floatx())

        # K.cast works on tensors; K.cast_to_floatx is meant for NumPy arrays.
        is_eos_true = K.cast(K.equal(y_true, eos_index), K.floatx())
        is_eos_pred = K.cast(K.equal(y_pred_end, eos_index), K.floatx())

        # Lengths (position of first EOS, counted from 1), reshaped to columns.
        true_lengths = 1 + K.argmax(occurrence_weights * is_eos_true, axis=1)
        pred_lengths = 1 + K.argmax(occurrence_weights * is_eos_pred, axis=1)
        true_lengths = K.reshape(true_lengths, (-1, 1))
        pred_lengths = K.reshape(pred_lengths, (-1, 1))

        return K.ctc_batch_cost(y_true, y_pred,
                                pred_lengths, true_lengths) + self.beta(
                                    pred_lengths)  # Maybe a temp fix
Ejemplo n.º 2
0
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14, conv_dim=128, lstm_dim=256):
    """Build a conv + stacked-LSTM line recognizer with CTC loss and decode outputs.

    A single strided convolution spanning the full image height plays the role
    of the sliding window: each horizontal step yields one ``conv_dim`` feature
    vector. The resulting sequence is passed through four LSTM layers (the last
    two receive the sum of the two preceding LSTM outputs) and a softmax layer.

    :param input_shape: ``(image_height, image_width)``.
    :param output_shape: ``(output_length, num_classes)``.
    :param window_width: width of the convolution kernel (window) in pixels.
    :param window_stride: horizontal stride between windows.
    :param conv_dim: number of convolution filters.
    :param lstm_dim: units per LSTM layer.
    :return: a model taking ``[image, y_true, input_length, label_length]`` and
        producing ``[ctc_loss, ctc_decoded]``.
    :raises ValueError: if fewer windows than ``output_length`` are generated.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})')

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length,), name='y_true')
    input_length = Input(shape=(1,), name='input_length')
    label_length = Input(shape=(1,), name='label_length')

    # More than one local device is taken to mean a GPU is available.
    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that lstms expect a input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    # The kernel covers the whole height, so the height axis of the output is 1.
    conv = Conv2D(conv_dim, (image_height, window_width), (1, window_stride), activation='relu')(image_reshaped)

    # Drop the singleton height axis to obtain a (timesteps, features) sequence.
    conv_squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv)

    # Feed the squeezed convolution features into the first LSTM (the original
    # referenced an undefined name here).
    lstm_output1 = lstm_fn(lstm_dim, return_sequences=True)(conv_squeezed)
    # (num_windows, lstm_dim)
    lstm_output2 = lstm_fn(lstm_dim, return_sequences=True)(lstm_output1)
    lstm_output3 = lstm_fn(lstm_dim, return_sequences=True)(lstm_output2 + lstm_output1)
    lstm_output4 = lstm_fn(lstm_dim, return_sequences=True)(lstm_output3 + lstm_output2)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output4)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    # Scale the input_length input by the number of windows.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows}
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss'
    )([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded'
    )([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output]
    )
    return model
Ejemplo n.º 3
0
def ctc_loss(y_true, y_pred):
    """Compute the batch-averaged CTC loss.

    :param y_true: label indices padded with 0; tensors with more than two
        dimensions are squeezed first.
    :param y_pred: softmax output, documented by the original author as
        (batch_size, string_length, alphabet_size_1_hot_encoded).
    :return: scalar mean of the per-sample CTC costs.

    About K.ctc_batch_cost:
        https://docs.w3cub.com/tensorflow~python/tf/keras/backend/ctc_batch_cost
        https://stackoverflow.com/questions/60782077/how-do-you-use-tensorflow-ctc-batch-cost-function-with-keras
    """
    if len(y_true.shape) > 2:
        # NOTE(review): squeeze without an axis also removes a batch axis of size 1.
        y_true = tf.squeeze(y_true)

    # Every sample uses all time steps of y_pred. Read that count from the
    # tensor shape instead of summing the softmax probabilities: the float sum
    # can land just below the true value and is truncated when the length is
    # cast to an integer inside ctc_batch_cost.
    pred_shape = tf.shape(y_pred)
    input_length = tf.fill([pred_shape[0], 1], pred_shape[1])

    # y_true strings are padded with 0, so the number of non-zero entries is
    # the number of characters in each string.
    label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")

    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)

    # Average loss across all entries in the batch.
    return tf.reduce_mean(loss)
    def ctc_loss_lambda_func(y_true, y_pred):
        """Compute the batch-averaged CTC loss.

        :param y_true: label indices padded with 0; tensors with more than two
            dimensions are squeezed first.
        :param y_pred: softmax output, documented by the original author as
            (batch_size, string_length, alphabet_size_1_hot_encoded).
        :return: scalar mean of the per-sample CTC costs.
        """
        if len(y_true.shape) > 2:
            # NOTE(review): squeeze without an axis also removes a batch axis of size 1.
            y_true = tf.squeeze(y_true)

        # Every sample uses all time steps of y_pred. Read that count from the
        # tensor shape rather than summing softmax probabilities: the float
        # sum can fall just below the true value and is truncated when the
        # length is cast to an integer inside ctc_batch_cost.
        pred_shape = tf.shape(y_pred)
        input_length = tf.fill([pred_shape[0], 1], pred_shape[1])

        # y_true strings are padded with 0, so counting non-zero entries gives
        # the number of characters in each string.
        label_length = tf.math.count_nonzero(y_true,
                                             axis=-1,
                                             keepdims=True,
                                             dtype="int64")

        loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)

        # Average loss across all entries in the batch.
        return tf.reduce_mean(loss)
Ejemplo n.º 5
0
def ctc_lambda_loss(logits, labels, input_length, label_length, smoothing=0.0):
    '''
  CTC loss function.
  param: logits, (B, T, D)
  param: input_length, (B, 1), input length of encoder
  param: labels, (B, T)
  param: label_length, (B, 1), label length used to convert dense labels to sparse
  param: smoothing, accepted but unused
  returns: loss, scalar
  '''
    del smoothing

    def _as_column(lengths):
        # Promote a rank-1 length vector to rank 2; any other rank passes through.
        return tf.cond(
            pred=tf.equal(tf.rank(lengths), 1),
            true_fn=lambda: tf.expand_dims(lengths, axis=-1),
            false_fn=lambda: lengths,
        )

    ilen = _as_column(input_length)
    olen = _as_column(label_length)

    # Rank assertions that must run before the loss is evaluated.
    rank_checks = [
        tf.assert_rank(labels, 2),
        tf.assert_rank(logits, 3),
        tf.assert_rank(ilen, 2),  # input_length
        tf.assert_rank(olen, 2),  # output_length
    ]
    with tf.control_dependencies(rank_checks):
        # Per-sample costs are averaged into a single value.
        per_sample = K.ctc_batch_cost(labels, logits, ilen, olen)
        mean_loss = tf.reduce_mean(per_sample)

    return mean_loss
def validate(model,
             x,
             y_true,
             input_len,
             label_len,
             y_strings,
             test=False,
             save_file=None):
    """Run the model on one batch and report the mean CTC loss and mean ``wer``.

    :param model: callable producing predictions from ``x``.
    :param x: model input batch.
    :param y_true: label tensor passed to ``ctc_batch_cost``.
    :param input_len: per-sample input lengths; a column axis is added here.
    :param label_len: per-sample label lengths; a column axis is added here.
    :param y_strings: reference sentences, one per sample.
    :param test: when true, write each reference/prediction pair to ``save_file``.
    :param save_file: writable file object; must be provided when ``test`` is true.
    :return: ``(mean loss, mean wer over y_strings)``.
    """
    input_len = np.expand_dims(input_len, axis=1)
    label_len = np.expand_dims(label_len, axis=1)

    y_pred = model(x)
    loss = ctc_batch_cost(y_true, y_pred, input_len, label_len)

    # Flatten back to one length per sample. An explicit reshape is used
    # because a bare np.squeeze would collapse a batch of one into a 0-d array.
    input_len = np.reshape(input_len, (-1,))
    y_decode = ctc_decode(y_pred, input_len)[0][0]

    accuracy = 0.0

    for i, reference in enumerate(y_strings):
        predicted_sentence = indices_to_string(y_decode[i].numpy())
        accuracy += wer(predicted_sentence, reference)

        if test:
            save_file.write("Correct Sentence:" + str(reference) + "\n")
            save_file.write("Predicted Sentence:" + predicted_sentence + "\n")

    return tf.reduce_mean(loss), accuracy / len(y_strings)
Ejemplo n.º 7
0
def ctc_loss(y_true, y_pred):
    """
    Runs CTC Loss Algorithm on each batch element

    :param y_true: tensor (samples, max_string_length) containing the truth labels.
    :param y_pred: tensor (samples, time_steps, num_categories) containing the prediction, or output of the softmax.

    * caution

    input_length : tensor (samples, 1) containing the sequence length for each batch item in y_pred
    label_length : tensor (samples, 1) containing the sequence length for each batch item in y_true

    y_true looks like [3,7,12,1,2,-1,-1,-1,-1], where -1 marks blank padding.
    The number of entries before the first -1 is the sequence length of y_true.

    input_length equals the full width (time_steps) of y_pred.

    """

    # Length of the prediction: every sample uses all time steps of y_pred.
    pred_shape = tf.shape(y_pred)
    input_length = tf.fill([pred_shape[0], 1], pred_shape[1])

    # Length of the label: count the non-padding (non-negative) entries.
    # Taking argmin instead only works when at least one -1 is present; for a
    # label that fills the whole row it would return the index of the smallest
    # class id.
    label_length = tf.reduce_sum(
        tf.cast(y_true >= 0, tf.int64), axis=-1, keepdims=True)

    return K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
Ejemplo n.º 8
0
    def ctc_loss_lambda_func(y_true, y_pred):
        """Compute the batch-averaged CTC loss.

        :param y_true: label indices padded with 0; tensors with more than two
            dimensions are squeezed first.
        :param y_pred: softmax output, documented by the original author as
            (batch_size, string_length, alphabet_size_1_hot_encoded).
        :return: scalar mean of the per-sample CTC costs.
        """
        if len(y_true.shape) > 2:
            # NOTE(review): squeeze without an axis also removes a batch axis of size 1.
            y_true = tf.squeeze(y_true)

        # Every sample uses all time steps of y_pred. Read that count from the
        # tensor shape rather than summing softmax probabilities: the float
        # sum can fall just below the true value and is truncated when the
        # length is cast to an integer inside ctc_batch_cost.
        pred_shape = tf.shape(y_pred)
        input_length = tf.fill([pred_shape[0], 1], pred_shape[1])

        # y_true strings are padded with 0, so counting non-zero entries gives
        # the number of characters in each string.
        label_length = tf.math.count_nonzero(y_true,
                                             axis=-1,
                                             keepdims=True,
                                             dtype="int64")

        # if you have an error when training go to the definition of the ctc_batch_cost function and add
        # 'ignore_longer_outputs_than_inputs=True' in the parameters of the ctc.ctc_loss() function line 5764
        loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)

        # Average loss across all entries in the batch.
        return tf.reduce_mean(loss)
Ejemplo n.º 9
0
def ctc_lambda_func(args):
    """Return the per-sample CTC cost for a Lambda layer.

    ``args`` unpacks to ``(y_pred, labels, input_length, label_length)``.
    The first two time steps of ``y_pred`` are discarded before the cost is
    computed.
    """
    predictions, labels, input_length, label_length = args
    # Skipping the first two steps is critical: the first couple of RNN
    # outputs tend to be garbage.
    trimmed_predictions = predictions[:, 2:, :]
    return backend.ctc_batch_cost(labels, trimmed_predictions, input_length,
                                  label_length)
Ejemplo n.º 10
0
 def ctc_loss(self, y_true, y_pred):
     """Compute the batch-averaged CTC loss.

     :param y_true: label indices padded with 0; tensors with more than two
         dimensions are squeezed first.
     :param y_pred: per-step class probabilities; all time steps are used.
     :return: scalar mean of the per-sample CTC costs.
     """
     if len(y_true.shape) > 2:
         # NOTE(review): squeeze without an axis also removes a batch axis of size 1.
         y_true = tf.squeeze(y_true)

     # Take the number of time steps from the tensor shape rather than from a
     # float sum of the probabilities: that sum can fall just below the true
     # value and is truncated when cast to an integer inside ctc_batch_cost.
     pred_shape = tf.shape(y_pred)
     input_length = tf.fill([pred_shape[0], 1], pred_shape[1])

     # Number of non-zero (non-padding) labels per sample.
     label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")

     loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
     return tf.reduce_mean(loss)
Ejemplo n.º 11
0
 def ctc_lambda_func(args):
     """Return the per-sample CTC cost for use inside a Lambda layer.

     ``args`` unpacks to ``(y_pred, labels, input_length, label_length)``:

     labels: tensor (number of samples, max_string_length) containing the truth labels.
     y_pred: tensor (number of samples, time_steps, num_character_labels) containing the prediction, or output of the softmax.
     input_length: tensor (number of samples, 1) containing the sequence length for each batch item in y_pred.
     label_length: tensor (number of samples, 1) containing the sequence length for each batch item in y_true.
     """
     y_pred, labels, input_length, label_length = args
     return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):  # pylint: disable=too-many-locals
    """Build a sliding-window LeNet + LSTM line recognizer with CTC outputs.

    The image is cut into windows by ``slide_window``; a LeNet with its final
    layer removed is applied to every window; the per-window features go
    through one LSTM and a softmax layer. The model exposes the CTC loss and
    the CTC-decoded prediction as its two outputs.

    :param input_shape: ``(image_height, image_width)``.
    :param output_shape: ``(output_length, num_classes)``.
    :param window_width: width of each window.
    :param window_stride: stride between consecutive windows.
    :return: model with inputs ``[image, y_true, input_length, label_length]``
        and outputs ``[ctc_loss, ctc_decoded]``.
    :raises ValueError: if fewer windows than ``output_length`` are produced.
    """
    height, width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(f"Window width/stride need to generate >= {output_length} windows (currently {num_windows})")

    # Model inputs: the image plus the three tensors the CTC loss needs.
    image_in = Input(shape=input_shape, name="image")
    labels_in = Input(shape=(output_length,), name="y_true")
    input_length_in = Input(shape=(1,), name="input_length")
    label_length_in = Input(shape=(1,), name="label_length")

    # Add a channel axis: (image_height, image_width, 1)
    image_with_channel = Reshape((height, width, 1))(image_in)

    # Cut into windows: (num_windows, image_height, window_width, 1)
    window_layer = Lambda(slide_window, arguments={"window_width": window_width, "window_stride": window_stride})
    windows = window_layer(image_with_channel)

    # LeNet truncated at its second-to-last layer, applied to every window.
    full_lenet = lenet((height, window_width, 1), (num_classes,))
    feature_net = KerasModel(inputs=full_lenet.inputs, outputs=full_lenet.layers[-2].output)
    window_features = TimeDistributed(feature_net)(windows)

    # Sequence model over the window features, then per-step class scores.
    sequence_features = LSTM(128, return_sequences=True)(window_features)
    softmax_output = Dense(num_classes, activation="softmax", name="softmax_output")(sequence_features)
    # (num_windows, num_classes)

    # Scale the input_length input by the number of windows.
    scaled_input_length = Lambda(
        lambda x, num_windows=None: x * num_windows, arguments={"num_windows": num_windows}
    )(input_length_in)

    loss_out = Lambda(lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name="ctc_loss")(
        [labels_in, softmax_output, scaled_input_length, label_length_in]
    )

    decoded_out = Lambda(lambda x: ctc_decode(x[0], x[1], output_length), name="ctc_decoded")(
        [softmax_output, scaled_input_length]
    )

    return KerasModel(
        inputs=[image_in, labels_in, input_length_in, label_length_in],
        outputs=[loss_out, decoded_out],
    )
Ejemplo n.º 13
0
 def ctc_loss_lambda_func(args):
     """
     Compute the CTC loss; suitable as the function of a Lambda layer.

     :param args: sequence of exactly four items, in this order:
         y_pred, labels, input_length, label_length
     :return: result of K.ctc_batch_cost for the batch
     """
     predictions, targets, prediction_lengths, target_lengths = args
     batch_cost = K.ctc_batch_cost(targets, predictions, prediction_lengths,
                                   target_lengths)
     return batch_cost
def ctc_lambda_func(args):
    """Lambda-layer CTC loss built on the backend's ctc_batch_cost.

    Adapted from the Keras example at
    https://github.com/keras-team/keras/blob/master/examples/image_ocr.py

    ``args`` unpacks to ``(y_pred, labels, input_length, label_length)``.
    """
    predictions, targets, prediction_lengths, target_lengths = args
    # The referenced example drops the first two RNN outputs
    # (y_pred[:, 2:, :]) because they tend to be garbage; that trimming is
    # intentionally disabled here, so all time steps are used.
    batch_cost = K.ctc_batch_cost(targets, predictions, prediction_lengths,
                                  target_lengths)
    return batch_cost
Ejemplo n.º 15
0
def ctcLoss(yTrue, yPred):
    """Compute the per-sample CTC cost, deriving both lengths from the tensors.

    The label length is the sum of ``clip(yTrue + 1, 0, 1)`` along axis 1 and
    the input length is the sum of all of ``yPred`` per sample.
    Presumably padding labels are -1 and ``yPred`` is a softmax output, so the
    sums count real labels and time steps -- TODO confirm.
    """
    # Reshape the ground truth to the two leading dimensions, as required by
    # ctc_batch_cost().
    trueShape = K.shape(yTrue)
    labels = K.reshape(yTrue, shape=(trueShape[0], trueShape[1]))

    # Label length per sample: entries of the clipped mask summed per row.
    labelMask = K.clip(labels + 1, 0, 1)
    labelLength = K.sum(labelMask, axis=1, keepdims=True)

    # Input length per sample: sum over classes, then over time steps.
    # NOTE(review): this is a float sum; consider taking the length from the
    # tensor shape if rounding becomes an issue.
    perStepTotal = K.sum(yPred, axis=2)
    inputLength = K.sum(perStepTotal, axis=1, keepdims=True)

    return K.ctc_batch_cost(labels, yPred, inputLength, labelLength)
Ejemplo n.º 16
0
def ctc_lambda_func(args):
    '''
    Compute the CTC batch cost, printing the inputs for debugging.

    args unpacks to, in order:
    y_true = numeric translation of text
    y_pred = output of softmax layer
    input_length = output sequence length
    label_length = length of the true sequence
    '''
    targets, predictions, prediction_lengths, target_lengths = args

    # Debug output: shapes of the two main tensors, then the length tensors.
    print(targets.shape)
    print(predictions.shape)
    print(prediction_lengths)
    print(target_lengths)

    batch_cost = K.ctc_batch_cost(targets, predictions, prediction_lengths,
                                  target_lengths)
    return batch_cost
Ejemplo n.º 17
0
def ctc_lambda_func(y_true, y_pred, model_config,
                    **kwargs):  # under 2.0 this fails to compile without **kwargs
    """Compute the CTC batch cost with fixed lengths taken from ``model_config``.

    :param y_true: label tensor.
    :param y_pred: prediction tensor; its second dimension is used as the
        input sequence length of every sample.
    :param model_config: mapping providing ``'batchsize'`` and ``'label_len'``.
    :param kwargs: ignored.
    :return: result of ``K.ctc_batch_cost``.
    """
    outputstep = y_pred.get_shape()[1]  # sequence length of the input data

    # Specify the sequence length separately for every item in the batch.
    # The builtin ``int`` replaces the ``np.int`` alias, which has been
    # removed from NumPy; the resulting dtype is the same.
    input_length = np.asarray([[outputstep]] * model_config['batchsize'],
                              dtype=int)
    label_length = np.asarray([[model_config['label_len']]] *
                              model_config['batchsize'])
    # input_length must be greater than label_length, otherwise an
    # "invalid CTC" error is reported.

    return K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
Ejemplo n.º 18
0
def ctc_loss(y_true, y_pred):
    """Compute the per-sample CTC cost from ``y_true`` and ``y_pred`` alone.

    The labels are taken from ``y_true`` and both length tensors are derived
    from the tensor shapes, instead of creating new ``Input`` placeholders on
    every call (those were not connected to any model and ``y_true`` was
    ignored).

    Assumes ``y_true`` is (batch, label_width) with every row fully used and
    ``y_pred`` is (batch, time_steps, classes) -- TODO confirm; if labels are
    padded, the real label lengths must be supplied instead.

    :return: result of ``K.ctc_batch_cost``.
    """
    # One column per sample holding the number of time steps of y_pred.
    input_length = K.ones_like(y_pred[:, :1, 0], dtype='int64') * K.cast(
        K.shape(y_pred)[1], 'int64')
    # One column per sample holding the width of the label tensor.
    label_length = K.ones_like(y_true[:, :1], dtype='int64') * K.cast(
        K.shape(y_true)[1], 'int64')

    return K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
Ejemplo n.º 19
0
    def ctc_loss_lambda_func(y_true, y_pred):
        """Compute the batch-averaged CTC loss.

        :param y_true: label indices padded with 0; tensors with more than two
            dimensions are squeezed first.
        :param y_pred: per-step class probabilities; all time steps are used.
        :return: scalar mean of the per-sample CTC costs.
        """
        if len(y_true.shape) > 2:
            # NOTE(review): squeeze without an axis also removes a batch axis of size 1.
            y_true = tf.squeeze(y_true)

        # Take the number of time steps from the tensor shape rather than from
        # a float sum of the probabilities: that sum can fall just below the
        # true value and is truncated when cast to an integer inside
        # ctc_batch_cost.
        pred_shape = tf.shape(y_pred)
        input_length = tf.fill([pred_shape[0], 1], pred_shape[1])

        # Number of non-zero (non-padding) labels per sample.
        label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")

        loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
        return tf.reduce_mean(loss)
def ctc_loss_lambda_func(y_true, y_pred):
    """Compute the batch-averaged CTC loss with a constant input length.

    :param y_true: label indices padded with 0.
    :param y_pred: prediction tensor whose first axis is the batch.
    :return: scalar mean of the per-sample CTC costs.
    """
    # Every sample gets MAX_LABEL_LENGTH as its input length. The column is
    # sized from the actual batch rather than a fixed batch-size constant so
    # that a smaller final batch still lines up with y_pred.
    # NOTE(review): the constant is named for labels but used as the input
    # (prediction) length -- confirm this matches the time steps of y_pred.
    batch_size = tf.shape(y_pred)[0]
    input_length = tf.ones([batch_size, 1]) * MAX_LABEL_LENGTH

    # Number of non-zero (non-padding) labels per sample.
    label_length = tf.math.count_nonzero(y_true,
                                         axis=-1,
                                         keepdims=True,
                                         dtype="int64")

    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return tf.reduce_mean(loss)
def ctc_loss(args):
    '''
    Build the CTC (Connectionist Temporal Classification) loss for a
    speech_to_text model.

    More info on CTC: https://towardsdatascience.com/intuitively-understanding-connectionist-temporal-classification-3797e43a86c

    :params:
        args - sequence of exactly four items, in order: predictions, labels,
               input_len and labels_len

    :returns:
        the CTC batch cost computed from args.
    '''
    model_output, targets, output_lengths, target_lengths = args
    batch_cost = K.ctc_batch_cost(targets, model_output, output_lengths,
                                  target_lengths)
    return batch_cost
Ejemplo n.º 22
0
 def ctc_loss(self, labels, logits):
     """Compute the CTC batch cost from labels that carry their own length.

     The last column of ``labels`` is split off as the label length; the
     remaining columns are the label indices. Every sample uses
     ``self.frames - self.cutoff`` as its logit length.

     :param labels: tensor whose second dimension is label columns plus one
         trailing length column; replaced by a placeholder of width
         ``self.max_len + 1`` when that dimension is unknown.
     :param logits: prediction tensor passed to ``k.ctc_batch_cost``.
     :return: result of ``k.ctc_batch_cost``.
     """
     print(labels.shape, 'loss')
     # Identity comparison is the correct test for an unknown dimension.
     if labels.shape[1] is None:
         labels = k.placeholder(shape=(self.batch_size, self.max_len + 1),
                                dtype=tf.int32)
     # tf.dtypes.cast(labels, tf.int32)
     y_true, length = tf.split(labels, [(labels.shape[1] - 1), 1], 1)
     # One constant logit length per sample, as a column.
     logit_length = tf.expand_dims(tf.convert_to_tensor(
         [self.frames - self.cutoff] * self.batch_size, dtype=tf.int32),
                                   axis=1)
     # logits=logits[:,self.cutoff:,:]
     # length = tf.squeeze(length,axis=1)
     print(y_true.shape, logits.shape, length.shape, logit_length.shape,
           'ctcloss')
     return k.ctc_batch_cost(y_true, logits, logit_length, length)
Ejemplo n.º 23
0
    def call(self, y_true, y_pred):
        """Add the CTC loss for this batch to the layer and pass ``y_pred`` through.

        Every sample is given the full second dimension of ``y_pred`` as its
        input length and the full second dimension of ``y_true`` as its label
        length.
        """
        sample_count = tf.cast(tf.shape(y_true)[0], dtype="int64")
        time_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_width = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # Broadcast the scalar lengths into one (batch, 1) column each.
        input_length = time_steps * tf.ones(shape=(sample_count, 1),
                                            dtype="int64")
        label_length = label_width * tf.ones(shape=(sample_count, 1),
                                             dtype="int64")

        self.add_loss(
            K.ctc_batch_cost(y_true, y_pred, input_length, label_length))

        # At run time, just return the computed predictions.
        return y_pred
Ejemplo n.º 24
0
def line_lstm_ctc(input_shape,
                  output_shape,
                  window_width=28,
                  window_stride=14):  # pylint: disable=too-many-locals
    """Template for a sliding-window LSTM line recognizer with CTC outputs.

    The section between the "Lab 3" markers is intentionally empty; it must
    define ``softmax_output``, which the code below it consumes.

    :param input_shape: ``(image_height, image_width)``.
    :param output_shape: ``(output_length, num_classes)``.
    :param window_width: width of each window.
    :param window_stride: stride between consecutive windows.
    :return: model with inputs ``[image, y_true, input_length, label_length]``
        and outputs ``[ctc_loss, ctc_decoded]``.
    :raises ValueError: if fewer windows than ``output_length`` are produced.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    # Number of windows that fit across the image width.
    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate >= {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # NOTE(review): the device-count threshold here is 2 -- confirm it is the
    # intended test for "a GPU is present".
    gpu_present = len(device_lib.list_local_devices()) > 2
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that lstms expect a input of shape (num_batch_size, num_timesteps, feature_length).

    # Your code below (Lab 3)

    # Your code above (Lab 3)

    # Scale the input_length input by the number of windows.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    # softmax_output is expected to come from the Lab 3 section above.
    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
Ejemplo n.º 25
0
    def ctc_criterion_backend(self, labels, output_mid, label_length,
                              input_length):
        """Compute the CTC batch cost through the Keras backend.

        The backend implementation assumes the blank index is the last class,
        so any other ``self.blank_index`` is rejected. When
        ``self.from_logits`` is set, ``output_mid`` is passed through a
        softmax first. Both length arguments are reshaped to columns.

        :raises AssertionError: if ``self.blank_index`` is not the last class.
        """
        # This assumes blank_index = n_class - 1.
        last_class = output_mid.shape[-1] - 1
        if not self.blank_index == last_class:
            raise AssertionError(
                "keras.backend.ctc requires blank_index = nclass-1")

        probabilities = tf.nn.softmax(output_mid) if self.from_logits else output_mid

        def _to_column(lengths):
            # Tensors and array-likes need different reshape calls.
            if tf.is_tensor(lengths):
                return tf.reshape(lengths, (-1, 1))
            return lengths.reshape(-1, 1)

        input_column = _to_column(input_length)
        label_column = _to_column(label_length)

        return K.ctc_batch_cost(labels, probabilities, input_column,
                                label_column)
Ejemplo n.º 26
0
    def loss(y_true, y_pred):
        """CTC loss that unpacks its extra inputs from ``y_true``.

        Keras requires ``y_true`` and ``y_pred`` to have matching dimensions,
        while the CTC loss needs labels plus two length values of different
        sizes. The caller therefore packs everything into ``y_true``:

        * ``y_true[:, :, 0]`` -- the labels,
        * ``y_true[:, 0, 1]`` -- the label length,
        * ``y_true[:, 0, 2]`` -- the input length.
        """
        packed = y_true
        batch_labels = packed[:, :, 0]

        # The lengths sit in the first time step; ctc_batch_cost wants them as
        # columns, hence the extra trailing dimension.
        label_length = tf.expand_dims(packed[:, 0, 1], -1)
        input_length = tf.expand_dims(packed[:, 0, 2], -1)

        return ctc_batch_cost(batch_labels, y_pred, input_length, label_length)
def train_one_step(model, optimizer, x, y_true, input_len, label_len,
                   y_strings):
    """Run one optimization step and report the mean CTC loss and mean ``wer``.

    :param model: callable model exposing ``trainable_variables``.
    :param optimizer: optimizer exposing ``apply_gradients``.
    :param x: model input batch.
    :param y_true: label tensor passed to ``ctc_batch_cost``.
    :param input_len: per-sample input lengths; a column axis is added here.
    :param label_len: per-sample label lengths; a column axis is added here.
    :param y_strings: reference sentences, one per sample.
    :return: ``(mean loss, mean wer over y_strings)``.
    """
    input_len = np.expand_dims(input_len, axis=1)
    label_len = np.expand_dims(label_len, axis=1)

    # Record the forward pass so gradients of the loss can be taken.
    with tf.GradientTape() as tape:
        y_pred = model(x)
        loss = ctc_batch_cost(y_true, y_pred, input_len, label_len)

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Flatten back to one length per sample. An explicit reshape is used
    # because a bare np.squeeze would collapse a batch of one into a 0-d array.
    input_len = np.reshape(input_len, (-1,))
    y_decode = ctc_decode(y_pred, input_len)[0][0]

    accuracy = 0.0

    for i, reference in enumerate(y_strings):
        predicted_sentence = indices_to_string(y_decode[i].numpy())
        accuracy += wer(predicted_sentence, reference)

    return tf.reduce_mean(loss), accuracy / len(y_strings)
Ejemplo n.º 28
0
def _ctc_lambda_func(args):
    '''
    CTC loss as a function of a three-element argument list.

    args unpacks to, in order:
    y_pred: The logits output from the model. Shape [batch_sz, times_steps, number_characters]
    labels: The tokenized transcription. Shape [batch_sz, label_length]
    label_length: The length of the transcription. Shape [batch_sz, 1]

    The input length is not passed in; it is computed from y_pred by counting
    its time steps. Length computation follows "Automatic-Speech-Recognition"
    (https://github.com/rolczynski/Automatic-Speech-Recognition/blob/master/automatic_speech_recognition/pipeline/ctc_pipeline.py)
    '''
    y_pred, labels, label_length = args

    # Collapse the class axis so one value remains per (sample, time step).
    per_step = tf.math.reduce_max(y_pred, 2)

    # Count the time steps of each sample by summing ones along axis 1, then
    # shape the counts as an int32 column.
    step_counts = tf.math.reduce_sum(tf.ones_like(per_step), 1)
    input_length = tf.cast(tf.expand_dims(step_counts, -1), tf.int32)

    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
Ejemplo n.º 29
0
        # lstm_output = Bidirectional(lstm_fn(256, return_sequences=True))(lstm_output)
        # lstm_output = Dropout(0.5)(lstm_output)
        lstm_output = BatchNormalization()(lstm_output)
        lstm_output = Conv1D(256, 3, activation='relu', padding='SAME')(lstm_output)
        lstm_output = Dropout(0.5)(lstm_output)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows}
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss'
    )([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded'
    )([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output]
    )
    return model
Ejemplo n.º 30
0
 def __ctc_lambda_func(self, args):
     """Return the CTC batch cost for ``args``.

     ``args`` unpacks to ``(y_pred, labels, input_length, label_length)``.
     """
     predictions, targets, prediction_lengths, target_lengths = args
     batch_cost = K.ctc_batch_cost(targets, predictions, prediction_lengths,
                                   target_lengths)
     return batch_cost