Example #1
    def predict_on_image(self, image: np.ndarray) -> Tuple[str, float]:
        output_length = self.data.output_shape[0]
        
        if image.dtype == np.uint8:
            image = (image / 255).astype(np.float32)

        input_image = np.expand_dims(image, 0)
        with torch.no_grad():
            was_training = self.network.training
            self.network.eval()

            input_image = torch.from_numpy(input_image).to(device)
            y_pred, input_lengths = self.network(input_image) # y_pred (T,N,C)

        pred_idx = ctc_decode(y_pred.permute((1,0,2)), input_lengths, output_length) # arg[0] requires (N,T,C)
        pred_raw = pred_idx[0] # the batch only contains 1 element

        pred = ''.join(self.data.mapping[label] for label in pred_raw).strip(' |_')

        max_logit, _ = torch.max(y_pred.squeeze(dim=1), dim=1)
        # TODO: implement dynamic programming to get the proper confidence of the best path
        conf = torch.exp(max_logit.sum()).item()  # .item() so the return matches Tuple[str, float]
        
        if was_training:
            self.network.train()

        return pred, conf
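The ctc_decode used above is a helper from the lab codebase; for reference, here is a minimal sketch of a greedy (best-path) decoder with the same calling convention, assuming log-probabilities of shape (N, T, C) and the blank as the last class index:

def greedy_ctc_decode(y_pred, input_lengths, output_length):
    """Hypothetical best-path decoder: argmax per step, collapse repeats, drop blanks."""
    blank = y_pred.shape[2] - 1  # assumption: blank is the last class
    argmax = y_pred.argmax(dim=2)  # (N, T): most likely class per timestep
    decoded = []
    for seq, length in zip(argmax, input_lengths):
        labels, prev = [], blank
        for t in range(int(length)):
            label = int(seq[t])
            if label != prev and label != blank:  # CTC collapse rule
                labels.append(label)
            prev = label
        labels = labels[:output_length]
        labels += [blank] * (output_length - len(labels))  # pad to fixed length
        decoded.append(labels)
    return decoded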
Example #2
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14, conv_dim=128, lstm_dim=256):
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
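    # e.g. (values assumed for illustration): image_width=952, window_width=28,
    # window_stride=14 gives num_windows = (952 - 28) // 14 + 1 = 67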
    if num_windows < output_length:
        raise ValueError(f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})')

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length,), name='y_true')
    input_length = Input(shape=(1,), name='input_length')
    label_length = Input(shape=(1,), name='label_length')

    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that LSTMs expect an input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    conv = Conv2D(conv_dim, (image_height, window_width), (1, window_stride), activation='relu')(image_reshaped)

    conv_squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv)
    # (num_windows, conv_dim)

    lstm_output1 = lstm_fn(lstm_dim, return_sequences=True)(conv_squeezed)
    # (num_windows, lstm_dim)
    lstm_output2 = lstm_fn(lstm_dim, return_sequences=True)(lstm_output1)
    lstm_output3 = lstm_fn(lstm_dim, return_sequences=True)(lstm_output2 + lstm_output1)
    lstm_output4 = lstm_fn(lstm_dim, return_sequences=True)(lstm_output3 + lstm_output2)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output4)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows}
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss'
    )([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded'
    )([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output]
    )
    return model
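slide_window, referenced by the scaffolding comments in these examples, comes from the lab codebase; a plausible sketch of what it does, using TF 1.x's tf.extract_image_patches (the exact implementation is an assumption):

import tensorflow as tf

def slide_window(image, window_width, window_stride):
    """Cut a (batch, height, width, 1) image into overlapping full-height
    windows of shape (batch, num_windows, height, window_width, 1)."""
    kernel = [1, 1, window_width, 1]    # 1-row, window_width-column patches
    strides = [1, 1, window_stride, 1]  # slide along the width axis only
    rates = [1, 1, 1, 1]
    patches = tf.extract_image_patches(image, kernel, strides, rates, 'VALID')
    # patches: (batch, height, num_windows, window_width)
    patches = tf.transpose(patches, (0, 2, 1, 3))
    return tf.expand_dims(patches, -1)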
Example #3
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):  # pylint: disable=too-many-locals
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(f"Window width/stride need to generate >= {output_length} windows (currently {num_windows})")

    image_input = Input(shape=input_shape, name="image")
    y_true = Input(shape=(output_length,), name="y_true")
    input_length = Input(shape=(1,), name="input_length")
    label_length = Input(shape=(1,), name="label_length")

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that LSTMs expect an input of shape (num_batch_size, num_timesteps, feature_length).

    # Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window, arguments={"window_width": window_width, "window_stride": window_stride})(
        image_reshaped
    )
    # (num_windows, image_height, window_width, 1)

    # Make a LeNet and get rid of the last two layers (softmax and dropout)
    convnet = lenet((image_height, window_width, 1), (num_classes,))
    convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 128)

    lstm_output = LSTM(128, return_sequences=True)(convnet_outputs)
    # (num_windows, 128)

    softmax_output = Dense(num_classes, activation="softmax", name="softmax_output")(lstm_output)
    # (num_windows, num_classes)
    # Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows, arguments={"num_windows": num_windows}
    )(input_length)

    ctc_loss_output = Lambda(lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name="ctc_loss")(
        [y_true, softmax_output, input_length_processed, label_length]
    )

    ctc_decoded_output = Lambda(lambda x: ctc_decode(x[0], x[1], output_length), name="ctc_decoded")(
        [softmax_output, input_length_processed]
    )

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length], outputs=[ctc_loss_output, ctc_decoded_output],
    )
    return model
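Similarly, the ctc_decode wrapped in the 'ctc_decoded' Lambda is a lab helper; a plausible sketch of its behaviour built on the Keras backend's greedy decoder (the padding value and exact signature are assumptions):

import tensorflow as tf
from tensorflow.keras import backend as K

def ctc_decode(y_pred, input_length, max_output_length):
    """Greedy-decode softmax sequences and pad every row to a fixed length."""
    input_length = K.flatten(K.cast(input_length, 'int32'))
    (decoded,), _ = K.ctc_decode(y_pred, input_length, greedy=True)
    decoded = decoded[:, :max_output_length]
    # K.ctc_decode pads with -1; extend that padding out to max_output_length
    pad = max_output_length - tf.shape(decoded)[1]
    return tf.pad(decoded, [[0, 0], [0, pad]], constant_values=-1)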
Example #4
def line_lstm_ctc(input_shape,
                  output_shape,
                  window_width=28,
                  window_stride=14):  # pylint: disable=too-many-locals
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate >= {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that LSTMs expect an input of shape (num_batch_size, num_timesteps, feature_length).

    # Your code below (Lab 3)

    # Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
Example #5
        # lstm_output = Bidirectional(lstm_fn(256, return_sequences=True))(lstm_output)
        # lstm_output = Dropout(0.5)(lstm_output)
        lstm_output = BatchNormalization()(lstm_output)
        lstm_output = Conv1D(256, 3, activation='relu', padding='SAME')(lstm_output)
        lstm_output = Dropout(0.5)(lstm_output)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows}
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss'
    )([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded'
    )([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output]
    )
    return model
Example #6
def line_lstm_ctc(input_shape,
                  output_shape,
                  window_width=18,
                  window_stride=6,
                  conv_dim=256,
                  lstm_dim=256):
    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # Make a ConvNet with windowed output
    convnet = all_conv_net((image_height, image_width), conv_dim, window_width,
                           window_stride)
    conv_out = convnet(image_input)
    # (num_windows, conv_dim)

    # 3 LSTM layers, with residual connections
    lstm_output0 = Bidirectional(lstm_fn(lstm_dim * 2,
                                         return_sequences=True))(conv_out)

    lstm_output1 = Bidirectional(lstm_fn(lstm_dim,
                                         return_sequences=True))(lstm_output0)
    #lstm_output1 = Add()([lstm_output0, lstm_output1])

    lstm_output = Bidirectional(lstm_fn(lstm_dim // 2,
                                        return_sequences=True))(lstm_output1)
    #lstm_output = Add()([lstm_output1, lstm_output])

    softmax_output = Dense(num_classes,
                           activation='softmax',
                           name='softmax_output')(lstm_output)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    full_model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return full_model
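The Add() residual connections in this example are commented out because they only type-check when the summed tensors have the same width; a minimal sketch of a residual bidirectional block (layer sizes assumed):

from tensorflow.keras.layers import Add, Bidirectional, LSTM

def residual_bilstm(x, dim=256):
    """Residual BiLSTM block: requires x to already have 2 * dim features."""
    y = Bidirectional(LSTM(dim, return_sequences=True))(x)
    return Add()([x, y])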
Example #7
def line_lstm_ctc(input_shape,
                  output_shape,
                  window_width=28,
                  window_stride=14,
                  num_conv=128,
                  num_lstm=256):
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that LSTMs expect an input of shape (num_batch_size, num_timesteps, feature_length).

    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    ##### Your code below (Lab 3)
    #    ## ORIGINAL CODE (slightly modified)
    #    image_patches = Lambda(
    #        slide_window,
    #        arguments={'window_width': window_width, 'window_stride': window_stride}
    #    )(image_reshaped)
    #    # (num_windows, image_height, window_width, 1)
    #
    #    # Make a LeNet and get rid of the last two layers (softmax and dropout)
    #    convnet = lenet((image_height, window_width, 1), (num_classes,))
    #    convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output)
    #    convnet_outputs = TimeDistributed(convnet)(image_patches)
    #    # (num_windows, 128)
    #    drop_1 = Dropout(0.25)(convnet_outputs)
    #    lstm_output = Bidirectional(lstm_fn(256, return_sequences=True))(drop_1)
    #    # (num_windows, 128*2)
    #
    #    drop_2 = Dropout(0.25)(lstm_output)
    #    lstm_output2 = Bidirectional(lstm_fn(256, return_sequences=True))(drop_2)
    #
    #    drop_3 = Dropout(0.25)(lstm_output2)
    #    softmax_output = Dense(num_classes, activation='softmax',
    #                           name='softmax_output')(drop_3)
    #    # (num_windows, num_classes)

    #    ## UPDATED CODE
    #    conv = Conv2D(num_conv, (image_height, window_width), (1, window_stride), activation='relu')(image_reshaped)
    #    # (1, num_windows, num_conv)
    #    # num_windows = (image_width - window_width) / window_stride + 1
    #
    #    conv_squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv)
    #    # (num_windows, num_conv)
    #
    #    drop_1 = Dropout(0.5)(conv_squeezed)
    #    lstm_output = Bidirectional(lstm_fn(num_lstm, return_sequences=True))(drop_1)
    #    # (num_windows, num_lstm * 2)
    #
    #    drop_2 = Dropout(0.5)(lstm_output)
    #    lstm_output2 = Bidirectional(lstm_fn(int(num_lstm/2), return_sequences=True))(drop_2)
    #    # (num_windows, num_lstm)
    #
    #    drop_3 = Dropout(0.5)(lstm_output2)
    #    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(drop_3)
    # (num_windows, num_classes)
    ## FINISHED UPDATE
    ##### Your code above (Lab 3)

    ##### 2nd winner
    #    image_patches = Lambda(
    #        slide_window,
    #        arguments={'window_width': window_width, 'window_stride': window_stride}
    #    )(image_reshaped)
    #    # (num_windows, image_height, window_width, 1)
    #
    #    # Make a LeNet and get rid of the last two layers (softmax and dropout)
    #    convnet = lenet((image_height, window_width, 1), (num_classes,))
    #    convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output)
    #    convnet_outputs = TimeDistributed(convnet)(image_patches)
    #    # (num_windows, 128)
    #
    #    lstm_output = Bidirectional(lstm_fn(128, return_sequences=True))(convnet_outputs)
    #    # (num_windows, 128)
    #
    #    lstm_output_1_drop_out = Dropout(0.2)(lstm_output)
    #
    #    lstm_output2 = Bidirectional(lstm_fn(128, return_sequences=True))(lstm_output_1_drop_out)
    #
    #    lstm_output_2_drop_out = Dropout(0.2)(lstm_output2)
    #
    #    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output_2_drop_out)
    ##### 2nd winner end

    ##### 1st winner
    image_patches = Lambda(slide_window,
                           arguments={
                               'window_width': window_width,
                               'window_stride': window_stride
                           })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # Make a LeNet and get rid of the last two layers (softmax and dropout)
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs,
                         outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 128)

    convnet_outputs = Dropout(0.5)(convnet_outputs)
    lstm_output = Bidirectional(lstm_fn(
        256, return_sequences=True))(convnet_outputs)
    lstm_output = Dropout(0.5)(lstm_output)
    lstm_output = Bidirectional(lstm_fn(
        256, return_sequences=True))(lstm_output)
    lstm_output = Dropout(0.5)(lstm_output)

    softmax_output = Dense(num_classes,
                           activation='softmax',
                           name='softmax_output')(lstm_output)
    ##### 1st winner end

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
Example #8
def line_lstm_ctc(input_shape,
                  output_shape,
                  window_width=28,
                  window_stride=14):
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that LSTMs expect an input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window,
                           arguments={
                               'window_width': window_width,
                               'window_stride': window_stride
                           })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # Make a LeNet and get rid of the last two layers (softmax and dropout)
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs,
                         outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 128)

    lstm_output = Bidirectional(lstm_fn(
        256, return_sequences=True))(convnet_outputs)
    # (num_windows, 512)

    #lstm2_output = Bidirectional(LSTM(128, return_sequences=True))(lstm_output)
    #lstm3_output = Bidirectional(LSTM(128, return_sequences=True))(lstm2_output)

    softmax_output = Dense(num_classes,
                           activation='softmax',
                           name='softmax_output')(lstm_output)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
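Because the 'ctc_loss' output of these models already is the per-example CTC loss, training uses an identity loss on that output; a minimal usage sketch (shapes, optimizer, and dummy data are assumptions for illustration):

import numpy as np

model = line_lstm_ctc(input_shape=(28, 952), output_shape=(34, 64))
model.compile(
    optimizer='rmsprop',
    loss={'ctc_loss': lambda y_true, y_pred: y_pred},  # the layer already computed the loss
)

batch_size = 4
x = {
    'image': np.zeros((batch_size, 28, 952), dtype=np.float32),
    'y_true': np.zeros((batch_size, 34), dtype=np.float32),
    'input_length': np.ones((batch_size, 1), dtype=np.float32),
    'label_length': np.ones((batch_size, 1), dtype=np.float32),
}
y = {'ctc_loss': np.zeros((batch_size, 1), dtype=np.float32)}  # dummy target
model.fit(x, y, batch_size=batch_size, epochs=1)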
Example #9
def line_lstm_ctc(input_shape,
                  output_shape,
                  window_width=28,
                  window_stride=14):
    image_height, image_width = input_shape
    output_length, num_classes = output_shape
    print(f'window_width: {window_width}, window_stride: {window_stride}')
    print(f'num_classes: {num_classes}')
    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )
    print(f'num_windows: {num_windows}')

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that LSTMs expect an input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    # TODOs:
    # improve lenet - res, inception nets
    #   - final layer dense? or global_max_pool?
    # bidirectional multilayer lstms
    # Dropouts
    # window_width, window_stride
    # Optimizer, learning rate

    image_reshaped = Lambda(lambda x: K.expand_dims(x, axis=-1))(image_input)
    # image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window,
                           arguments={
                               'window_width': window_width,
                               'window_stride': window_stride
                           })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 256)
    convnet_outputs_dr = Dropout(0.4,
                                 noise_shape=(K.shape(convnet_outputs)[0], 1,
                                              256),
                                 name='dropout1')(convnet_outputs)

    lstm_output = Bidirectional(lstm_fn(128, return_sequences=True),
                                merge_mode='concat')(
                                    convnet_outputs_dr)  # 'sum'
    # (num_windows, 256)
    # lstm_output = Bidirectional(lstm_fn(64, return_sequences=True), merge_mode='concat')(lstm_output) # 'sum'

    lstm_output_dr = Dropout(0.4,
                             noise_shape=(K.shape(convnet_outputs)[0], 1, 256),
                             name='dropout2')(lstm_output)
    softmax_output = Dense(num_classes,
                           activation='softmax',
                           name='softmax_output')(lstm_output_dr)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
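A note on the Dropout calls above: noise_shape=(batch, 1, features) ties the mask across timesteps, so each feature channel is kept or dropped for the whole window sequence instead of independently per window. A minimal equivalent (feature size assumed):

from tensorflow.keras.layers import Dropout

# None lets the batch dimension vary; the 1 broadcasts the mask over time
sequence_dropout = Dropout(0.4, noise_shape=(None, 1, 256))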
Example #10
    def evaluate(self, x, y, batch_size: int = 16, verbose: bool = True) -> float:
        blank_idx = self.data.num_classes - 1
        output_length = self.data.output_shape[0]
        test_sequence = DatasetSequence(x, y, batch_size, format_fn=self.batch_format_fn)
        with torch.no_grad():
            was_training = self.network.training
            self.network.eval()
            preds_raw = []
            input_lengths = []
            labels_raw = []
            
            running_loss = 0
            for i, batch in enumerate(test_sequence):
                batch_x, batch_y = map(lambda out: out.to(device), batch)
                # log_soft_max (T, B, num_classes)
                log_soft_max, batch_input_lengths = map(lambda out: out.to("cpu"), self.network(batch_x))
                preds_raw.append(log_soft_max.permute(1,0,2))
                input_lengths.append(batch_input_lengths)
                labels_raw.append(batch_y.to("cpu"))
                output_lengths = (torch.sum(batch_y != blank_idx, dim=1)).to(torch.long).cpu()
                
                # self.loss() is expected to return the CTC loss class (e.g. nn.CTCLoss)
                loss = self.loss()(blank=blank_idx, reduction='mean')(log_soft_max, batch_y.cpu(), batch_input_lengths, output_lengths)
                running_loss += loss.item()
            # preds_raw: (B, T, C)
            preds_raw, input_lengths = torch.cat(preds_raw), torch.cat(input_lengths)
            labels_raw = torch.cat(labels_raw).numpy() # (B, output_length)
        print(f"Validation loss: {running_loss/(i+1):.4f}")
        
        preds = ctc_decode(preds_raw, input_lengths, output_length)

        trues = labels_raw
        pred_strings = [''.join(self.data.mapping.get(label, '') for label in pred).strip(' |_') for pred in preds]
        true_strings = [''.join(self.data.mapping.get(label, '') for label in true).strip(' |_') for true in trues]

        char_accuracies = [
            1 - editdistance.eval(true_string, pred_string) / len(true_string)
            for pred_string, true_string in zip(pred_strings, true_strings)
        ]
        if verbose:
            sorted_ind = np.argsort(char_accuracies)
            print("\nLeast accurate predictions:")
            for ind in sorted_ind[:5]:
                print(f'True: {true_strings[ind]}')
                print(f'Pred: {pred_strings[ind]}')
            print("\nMost accurate predictions:")
            for ind in sorted_ind[-5:]:
                print(f'True: {true_strings[ind]}')
                print(f'Pred: {pred_strings[ind]}')
            print("\nRandom predictions:")
            random_ind = np.random.randint(0, len(char_accuracies), 5)
            for ind in random_ind:
                print(f'True: {true_strings[ind]}')
                print(f'Pred: {pred_strings[ind]}')
        mean_accuracy = np.mean(char_accuracies)
        
        if was_training:
            self.network.train()

        return mean_accuracy
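The character accuracy above is one minus the edit distance normalized by the true string's length; a quick illustration with the editdistance package (strings invented for the example):

import editdistance

true_string, pred_string = 'hello world', 'helo wxrld'
# one deletion ('l') plus one substitution ('o' -> 'x') gives distance 2
accuracy = 1 - editdistance.eval(true_string, pred_string) / len(true_string)
print(f'{accuracy:.3f}')  # 0.818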
Example #11
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})')

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length,), name='y_true')
    input_length = Input(shape=(1,), name='input_length')
    label_length = Input(shape=(1,), name='label_length')

    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that LSTMs expect an input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    
    # lenet option:
    image_patches = Lambda(
        slide_window,
        arguments = {'window_width': window_width, 'window_stride': window_stride}
    )(image_reshaped)
    
    convnet = lenet((image_height, window_width, 1), (num_classes,))
    convnet = KerasModel(inputs = convnet.inputs, outputs = convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    
    # straight conv to lstm w relu option:
    '''
    # conv = BatchNormalization()(image_reshaped)
    conv = Conv2D(128, (image_height, window_width), (1, window_stride), kernel_initializer = 'lecun_normal', activation = 'selu')(image_reshaped)
    conv = BatchNormalization()(conv)
    conv = AlphaDropout(0.07)(conv)
    
    # conv = MaxPooling2D(pool_size = (2, 2))(conv)
    
    # conv = Conv2D(128, (image_height, window_width), (1, window_stride), activation = 'relu')(image_reshaped)
    
    # conv = Conv2D(256, (1, window_stride), activation = 'relu')(conv)
    
    convnet_outputs = Lambda(lambda x: K.squeeze(x, 1))(conv)
    '''

    # convnet_do = AlphaDropout(0.05)(convnet_outputs)
    
    # lstm_output = Bidirectional(lstm_fn(128, return_sequences = True))(convnet_do)
    
    lstm1_output = Bidirectional(lstm_fn(128, return_sequences = True))(convnet_outputs)
    
    lstm1_do = AlphaDropout(0.04)(lstm1_output)
    
    lstm2_output = Bidirectional(lstm_fn(128, return_sequences = True))(lstm1_do)
    
    lstm2_do = AlphaDropout(0.04)(lstm2_output)
    
    lstm3_output = Bidirectional(lstm_fn(128, return_sequences = True))(lstm2_do)
    # softmax_output = Dense(num_classes, activation = 'softmax', name = 'softmax_output')(lstm3_output)
    
    lstm3_do = AlphaDropout(0.05)(lstm3_output)
    
    softmax_output = Dense(num_classes, activation = 'softmax', name = 'softmax_output')(lstm3_do)
    
    
    # highest run: Test evaluation: 0.9641768591746657

    ##### Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows}
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss'
    )([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded'
    )([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output]
    )
    return model
Example #12
def line_lstm_ctc(input_shape,
                  output_shape,
                  window_width=28,
                  window_stride=14,
                  conv_dim=128,
                  lstm_dim=128):
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that LSTMs expect an input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window,
                           arguments={
                               'window_width': window_width,
                               'window_stride': window_stride
                           })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    if 0:
        # Make a LeNet and get rid of the last two layers (softmax and dropout)
        convnet = lenet((image_height, window_width, 1), (num_classes, ))
        convnet = KerasModel(inputs=convnet.inputs,
                             outputs=convnet.layers[-2].output)
        convnet_outputs = TimeDistributed(convnet)(image_patches)
        # (num_windows, 128)

        # lstm_output = Bidirectional(lstm_fn(128, return_sequences=True)(convnet))
        lstm_output0 = Bidirectional(lstm_fn(
            128, return_sequences=True))(convnet_outputs)
        lstm_output1 = Bidirectional(lstm_fn(
            128, return_sequences=True))(lstm_output0)
        lstm_output2 = Bidirectional(lstm_fn(
            128, return_sequences=True))(lstm_output1)
        lstm_output = Bidirectional(lstm_fn(
            128, return_sequences=True))(lstm_output2)
        # (num_windows, 128)

        #bidir = Bidirectional(lstm_output)
        #bidir = Bidirectional(lstm_output)

        softmax_output = Dense(num_classes,
                               activation='softmax',
                               name='softmax_output')(lstm_output)
        # softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(bidir)

        # softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output)
        # (num_windows, num_classes)

    elif 0:
        # Make a LeNet and get rid of the last two layers (softmax and dropout)
        convnet = lenet((image_height, window_width, 1), (num_classes, ))
        convnet = KerasModel(inputs=convnet.inputs,
                             outputs=convnet.layers[-2].output)
        convnet_outputs = TimeDistributed(convnet)(image_patches)
        # (num_windows, 128)

        dropout_amount = .2

        # lstm_output = Bidirectional(lstm_fn(128, return_sequences=True)(convnet))
        lstm_output0 = Bidirectional(lstm_fn(
            128, return_sequences=True))(convnet_outputs)
        do0 = Dropout(dropout_amount)(lstm_output0)
        lstm_output1 = Bidirectional(lstm_fn(128, return_sequences=True))(do0)
        # do1 = Dropout(dropout_amount)(lstm_output1)
        lstm_output = Dropout(dropout_amount)(lstm_output1)
        # lstm_output = Bidirectional(lstm_fn(128, return_sequences=True))(do1)

        # lstm_output2 = Bidirectional(lstm_fn(128, return_sequences=True))(lstm_output1)
        # lstm_output = Bidirectional(lstm_fn(128, return_sequences=True))(lstm_output2)
        # (num_windows, 128)

        #bidir = Bidirectional(lstm_output)
        #bidir = Bidirectional(lstm_output)

        softmax_output = Dense(num_classes,
                               activation='softmax',
                               name='softmax_output')(lstm_output)
        # softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(bidir)

        # softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output)
        # (num_windows, num_classes)

    elif 1:
        # restarting

        # Make a LeNet and get rid of the last two layers (softmax and dropout)
        convnet = lenet((image_height, window_width, 1), (num_classes, ))
        convnet = KerasModel(inputs=convnet.inputs,
                             outputs=convnet.layers[-2].output)
        convnet_outputs = TimeDistributed(convnet)(image_patches)
        # (num_windows, 128)

        dropout_amount = .2

        # lstm_output = Bidirectional(lstm_fn(128, return_sequences=True)(convnet))
        lstm_output0 = Bidirectional(lstm_fn(
            128, return_sequences=True))(convnet_outputs)
        do0 = Dropout(dropout_amount)(lstm_output0)
        lstm_output1 = Bidirectional(lstm_fn(128, return_sequences=True))(do0)
        do1 = Dropout(dropout_amount)(lstm_output1)
        lstm_output = Bidirectional(lstm_fn(128, return_sequences=True))(do1)

        # lstm_output2 = Bidirectional(lstm_fn(128, return_sequences=True))(lstm_output1)
        # lstm_output = Bidirectional(lstm_fn(128, return_sequences=True))(lstm_output2)
        # (num_windows, 128)

        #bidir = Bidirectional(lstm_output)
        #bidir = Bidirectional(lstm_output)

        softmax_output = Dense(num_classes,
                               activation='softmax',
                               name='softmax_output')(lstm_output)
        # softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(bidir)

        # softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output)
        # (num_windows, num_classes)

    elif 0:
        # SERGEY:
        # Slide a conv filter stack over image in horizontal direction.
        conv = Conv2D(conv_dim, (image_height, window_width),
                      (1, window_stride),
                      activation='relu')(image_reshaped)
        # (1, num_windows, 128)
        # height of conv filter and height of image are same, so first dim is 1 of output
        # num_windows = (image_width - window_width) / window_stride + 1

        conv_squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv)
        # (num_windows, 128)

        # lstm_output = Bidirectional(lstm_fn(128, return_sequences=True)(convnet))
        lstm_output0 = lstm_fn(lstm_dim, return_sequences=True)(conv_squeezed)
        lstm_output = lstm_fn(lstm_dim, return_sequences=True)(lstm_output0)
        softmax_output = Dense(num_classes,
                               activation='softmax',
                               name='softmax_output')(lstm_output)

    ##### Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
Example #13
def cnn_line_lstm_ctc(input_shape, output_shape, **kwargs):
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    ##### Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    convnet_outputs = image_reshaped
    # convnet_outputs = Dropout(0.5)(convnet_outputs)
    convnet_outputs = Conv2D(16, 3, padding='SAME')(convnet_outputs)
    convnet_outputs = BatchNormalization()(convnet_outputs)
    convnet_outputs = LeakyReLU()(convnet_outputs)
    convnet_outputs = MaxPooling2D(2, 2)(convnet_outputs)

    # convnet_outputs = Dropout(0.5)(convnet_outputs)
    convnet_outputs = Conv2D(32, 3, padding='SAME')(convnet_outputs)
    convnet_outputs = BatchNormalization()(convnet_outputs)
    convnet_outputs = LeakyReLU()(convnet_outputs)
    convnet_outputs = MaxPooling2D(2, 2)(convnet_outputs)

    convnet_outputs = Dropout(0.2)(convnet_outputs)
    convnet_outputs = Conv2D(48, 3, padding='SAME')(convnet_outputs)
    convnet_outputs = BatchNormalization()(convnet_outputs)
    convnet_outputs = LeakyReLU()(convnet_outputs)
    convnet_outputs = MaxPooling2D(2, 2)(convnet_outputs)

    convnet_outputs = Dropout(0.2)(convnet_outputs)
    convnet_outputs = Conv2D(64, 3, padding='SAME')(convnet_outputs)
    convnet_outputs = BatchNormalization()(convnet_outputs)
    convnet_outputs = LeakyReLU()(convnet_outputs)

    convnet_outputs = Dropout(0.2)(convnet_outputs)
    convnet_outputs = Conv2D(80, 3, padding='SAME')(convnet_outputs)
    convnet_outputs = BatchNormalization()(convnet_outputs)
    convnet_outputs = LeakyReLU()(convnet_outputs)

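    # Shape bookkeeping (28x952 line images assumed): three 2x2 poolings leave
    # (28 // 8, 952 // 8, 80) = (3, 119, 80) feature maps, so the width axis
    # provides 119 time steps of 3 * 80 = 240 features after Permute + Reshape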
    num_windows = 119

    convnet_outputs = Permute([2, 1, 3])(convnet_outputs)
    convnet_outputs = Reshape([num_windows, 240])(convnet_outputs)

    # (num_windows, 128)

    lstm_output = convnet_outputs
    for i in range(2):
        lstm_output = Dropout(0.5)(lstm_output)
        lstm_output = Bidirectional(lstm_fn(
            256, return_sequences=True))(lstm_output)
    lstm_output = Dropout(0.5)(lstm_output)

    softmax_output = Dense(num_classes,
                           activation='softmax',
                           name='softmax_output')(lstm_output)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
Example #14
def line_lstm_ctc(input_shape, output_shape, **kwargs):
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that LSTMs expect an input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    convnet_outputs = image_reshaped
    convnet_outputs = BatchNormalization()(convnet_outputs)
    # convnet_outputs = Dropout(0.2)(convnet_outputs)
    convnet_outputs = Conv2D(32, kernel_size=(3, 3),
                             activation='relu')(convnet_outputs)
    # convnet_outputs = Dropout(0.2)(convnet_outputs)
    convnet_outputs = BatchNormalization()(convnet_outputs)
    convnet_outputs = Conv2D(64, (3, 3), activation='relu')(convnet_outputs)
    # convnet_outputs = Dropout(0.2)(convnet_outputs)
    convnet_outputs = MaxPooling2D(pool_size=(2, 2))(convnet_outputs)
    convnet_outputs = Dropout(0.5)(convnet_outputs)
    # convnet_outputs = MaxPooling2D(pool_size=(12, 1))(convnet_outputs)
    convnet_outputs = Lambda(slide_window_flatten,
                             arguments={
                                 'window_width': 12,
                                 'window_stride': 1
                             })(convnet_outputs)
    convnet_outputs = Dense(128, activation='relu')(convnet_outputs)
    print(convnet_outputs)
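    # Shape bookkeeping (28x952 input assumed): two valid 3x3 convs reduce the
    # width to 948, the 2x2 pool halves it to 474, and a width-12, stride-1
    # sliding window yields 474 - 12 + 1 = 463 steps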
    num_windows = 463

    # (num_windows, 128)

    lstm_output = Dropout(0.5)(convnet_outputs)
    for i in range(kwargs.get('lstm_layers', 1)):
        # lstm_output = Bidirectional(lstm_fn(256, return_sequences=True))(lstm_output)
        # lstm_output = Dropout(0.5)(lstm_output)
        lstm_output = BatchNormalization()(lstm_output)
        lstm_output = Conv1D(256, 3, activation='relu',
                             padding='SAME')(lstm_output)
        lstm_output = Dropout(0.5)(lstm_output)

    softmax_output = Dense(num_classes,
                           activation='softmax',
                           name='softmax_output')(lstm_output)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model