Python lenetの例、text_recognizer.networks.lenet.lenet Pythonの例

コード例 #1

0

ファイルを表示

def line_lstm(input_shape,
              output_shape,
              window_width=20,
              window_stride=14,
              decoder_dim=None,
              encoder_dim=None):
    """Build an encoder/decoder LSTM line recognizer on sliding-window LeNet features.

    The image is cut into horizontal windows, each window is passed through a
    LeNet with its last layer removed, the window features are encoded by one
    LSTM into a single vector, that vector is repeated ``output_length`` times
    and decoded by a second LSTM, and a per-step softmax produces the result.

    Args:
        input_shape: ``(image_height, image_width)`` of the input image.
        output_shape: ``(output_length, num_classes)`` of the prediction.
        window_width: width of each sliding window passed to ``slide_window``.
        window_stride: stride between windows passed to ``slide_window``.
        decoder_dim: units of the decoder LSTM; 128 when ``None``.
        encoder_dim: units of the encoder LSTM; 128 when ``None``.

    Returns:
        A ``KerasModel`` from the image input to a softmax output with one
        ``num_classes`` distribution per output step.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape
    encoder_dim = 128 if encoder_dim is None else encoder_dim
    decoder_dim = 128 if decoder_dim is None else decoder_dim

    # The window parameters are bound through default arguments so the
    # function can be handed to a Keras Lambda layer without `arguments=`.
    def slide_window_bound(image,
                           window_width=window_width,
                           window_stride=window_stride):
        return slide_window(image, window_width, window_stride)

    inputs = Input(shape=input_shape)
    # Add a trailing channel axis: (image_height, image_width, 1)
    with_channel = Reshape((image_height, image_width, 1))(inputs)
    # Expected per the original annotations:
    # (num_windows, image_height, window_width, 1)
    patches = Lambda(slide_window_bound)(with_channel)

    # Re-wrap LeNet so that it ends at its second-to-last layer, i.e. it emits
    # features instead of its final output (annotated upstream as (128,)).
    full_lenet = lenet((image_height, window_width, 1), (num_classes, ))
    feature_net = KerasModel(inputs=full_lenet.inputs,
                             outputs=full_lenet.layers[-2].output)
    # One feature vector per window.
    window_features = TimeDistributed(feature_net)(patches)

    # More than one local device is taken to mean that a GPU is available.
    if len(device_lib.list_local_devices()) > 1:
        recurrent = CuDNNLSTM
    else:
        recurrent = LSTM

    # Encoder: read the windows in reverse and keep only the final state.
    encoded = recurrent(encoder_dim,
                        return_sequences=False,
                        go_backwards=True)(window_features)
    # Present the same encoding at each of the output_length decoder steps.
    tiled = RepeatVector(output_length)(encoded)
    # Decoder: (output_length, decoder_dim)
    decoded = recurrent(decoder_dim, return_sequences=True)(tiled)

    # Per-step classifier: (output_length, num_classes)
    per_step_softmax = TimeDistributed(
        Dense(num_classes, activation='softmax'))
    probabilities = per_step_softmax(decoded)

    return KerasModel(inputs=inputs, outputs=probabilities)

コード例 #2

0

ファイルを表示

def line_cnn_sliding_window(input_shape: Tuple[int, ...],
                            output_shape: Tuple[int, ...],
                            window_width: int = 16,
                            window_stride: int = 10) -> KerasModel:
    """Build an all-convolutional line recognizer using sliding windows.

    Input is an image with shape (image_height, image_width).
    Output is of shape (output_length, num_classes).

    The image is cut into horizontal windows, every window is passed through
    a LeNet truncated at its second-to-last layer, and a second, strided
    convolution over the window axis pools groups of consecutive windows into
    one softmax prediction each. Only the first ``output_length`` predictions
    are kept.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: width of each sliding window (used as a shape dimension,
            so it must be an integer).
        window_stride: stride between consecutive windows.

    Returns:
        The assembled ``KerasModel``. Its summary is printed as a side effect.

    Raises:
        ValueError: if the window settings produce fewer windows than
            ``output_length``, which would make the pooling width zero.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    # Pure integer arithmetic, validated before any layer is created so that
    # a bad configuration fails with a clear message instead of an obscure
    # zero-sized-kernel error from Conv2D.
    num_windows = int((image_width - window_width) / window_stride) + 1
    width = int(num_windows / output_length)
    if width < 1:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )
    # NOTE: with width == 1 the model is valid but, because only the first
    # output_length steps are kept, predictions cover only the leading part
    # of the line; width >= 2 is preferable.

    image_input = Input(shape=input_shape)
    # (image_height, image_width)

    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window,
                           arguments={
                               'window_width': window_width,
                               'window_stride': window_stride
                           })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # Make a LeNet and cut it off at its second-to-last layer so that it
    # produces features rather than class probabilities.
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs,
                         outputs=convnet.layers[-2].output)

    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 128)

    # Now we have to get to (output_length, num_classes). We run another
    # sliding window of `width` windows over the feature sequence; this can
    # produce more than output_length steps, so the result is truncated.

    # Add a channel axis so the feature sequence can be fed to Conv2D.
    convnet_outputs_extra_dim = Lambda(lambda x: tf.expand_dims(x, -1))(
        convnet_outputs)
    # (num_windows, 128, 1)

    # The kernel spans `width` windows and the full 128-wide feature axis and
    # moves `width` windows at a time, giving one prediction per group.
    conved_convnet_outputs = Conv2D(
        num_classes, (width, 128), (width, 1),
        activation='softmax')(convnet_outputs_extra_dim)
    # (num_windows // width, 1, num_classes)

    # Drop the collapsed feature axis.
    squeezed_conved_convnet_outputs = Lambda(lambda x: tf.squeeze(x, 2))(
        conved_convnet_outputs)
    # (num_windows // width, num_classes)

    softmax_output = Lambda(lambda x: x[:, :output_length, :])(
        squeezed_conved_convnet_outputs)
    # (output_length, num_classes)

    model = KerasModel(inputs=image_input, outputs=softmax_output)
    model.summary()
    return model

コード例 #3

0

ファイルを表示

ファイル: line_lstm_ctc.py プロジェクト: hiejulia/full-stack-deep-learning-bootcamp-project

def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):  # pylint: disable=too-many-locals
    """Assemble a sliding-window LeNet + LSTM line recognizer with CTC outputs.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: width of each sliding window.
        window_stride: stride between consecutive windows.

    Returns:
        A ``KerasModel`` with inputs ``image``, ``y_true``, ``input_length``
        and ``label_length`` and outputs ``ctc_loss`` and ``ctc_decoded``.

    Raises:
        ValueError: if the window settings yield fewer windows than
            ``output_length``.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    # The number of windows must be at least the number of labels to emit.
    num_windows = int((image_width - window_width) / window_stride) + 1
    if output_length > num_windows:
        raise ValueError(f"Window width/stride need to generate >= {output_length} windows (currently {num_windows})")

    image_in = Input(shape=input_shape, name="image")
    labels_in = Input(shape=(output_length,), name="y_true")
    input_len_in = Input(shape=(1,), name="input_length")
    label_len_in = Input(shape=(1,), name="label_length")

    # Add a channel axis: (image_height, image_width, 1)
    with_channel = Reshape((image_height, image_width, 1))(image_in)

    # Cut the line into windows; annotated upstream as
    # (num_windows, image_height, window_width, 1).
    window_args = {"window_width": window_width, "window_stride": window_stride}
    patches = Lambda(slide_window, arguments=window_args)(with_channel)

    # LeNet re-wrapped to end at its second-to-last layer (feature output).
    full_lenet = lenet((image_height, window_width, 1), (num_classes,))
    feature_net = KerasModel(inputs=full_lenet.inputs, outputs=full_lenet.layers[-2].output)
    window_features = TimeDistributed(feature_net)(patches)

    # One 128-unit LSTM state per window, then a per-window class distribution.
    sequence = LSTM(128, return_sequences=True)(window_features)
    softmax_output = Dense(num_classes, activation="softmax", name="softmax_output")(sequence)

    # Scale the input_length input by the window count before it is given to CTC.
    scale_by_windows = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={"num_windows": num_windows},
    )
    input_length_processed = scale_by_windows(input_len_in)

    ctc_loss_layer = Lambda(lambda args: K.ctc_batch_cost(args[0], args[1], args[2], args[3]), name="ctc_loss")
    ctc_loss_output = ctc_loss_layer([labels_in, softmax_output, input_length_processed, label_len_in])

    ctc_decode_layer = Lambda(lambda args: ctc_decode(args[0], args[1], output_length), name="ctc_decoded")
    ctc_decoded_output = ctc_decode_layer([softmax_output, input_length_processed])

    return KerasModel(
        inputs=[image_in, labels_in, input_len_in, label_len_in],
        outputs=[ctc_loss_output, ctc_decoded_output],
    )

コード例 #4

0

ファイルを表示

ファイル: line_lstm_ctc.py プロジェクト: pagpires/text-recognizer-pytorch

def line_lstm_ctc(input_shape,
                  output_shape,
                  window_width=28,
                  window_stride=14):
    """Build a PyTorch sliding-window conv + LSTM model producing CTC inputs.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: width of each sliding window.
        window_stride: stride between consecutive windows.

    Returns:
        An ``nn.Module`` whose ``forward`` returns ``(log_probs,
        input_lengths)``: log-softmax scores of shape ``(T, B, num_classes)``
        and an int64 tensor of length ``B`` filled with ``T``.

    Raises:
        ValueError: if the window settings yield fewer windows than
            ``output_length``.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate >= {output_length} windows (currently {num_windows})'
        )

    # The project's LeNet supplies the convolutional stack and the MLP head.
    convnet = lenet((image_height, window_width), (num_classes, ))

    class ModelCTC(nn.Module):
        """
        extract image for each window -> conv -> lstm -> dense -> softmax
        """
        def __init__(self):
            super().__init__()
            self.conv1 = convnet.conv_layer
            # Drop the last three modules of the LeNet head so that it emits
            # features (reshaped to 128 per window in forward) rather than
            # class scores.
            self.conv2 = convnet.mlp_layer[:-3]
            self.lstm = nn.LSTM(128, 128)
            self.linear = nn.Linear(128, num_classes)

        def forward(self, x):
            # Insert a channel axis after the batch axis.
            x = torch.unsqueeze(x, dim=1)
            patches = slide_window(x, window_width, window_stride)
            B, C, H, Window_W, T = patches.shape

            # Move the window (time) axis first: (T, B, C, H, Window_W)
            patches = patches.permute((4, 0, 1, 2, 3))
            # PyTorch's way of TimeDistributed: merge dims T and B
            conv_o1 = self.conv1(
                patches.contiguous().view(T * B, C, H, Window_W))
            conv_out = self.conv2(conv_o1.view(T * B, -1)).view(T, B, 128)

            # Only the per-step outputs are needed; the final states are unused.
            lstm_out, _ = self.lstm(conv_out)  # lstm_out: (T, B, 128)
            out_linear = self.linear(lstm_out)  # nn.Linear() allows 3D tensor
            # (T, B, classes), to be consistent with ctc_decode
            logsoftmax = nn.functional.log_softmax(out_linear, dim=2)
            # Every sample uses all T windows; build the lengths directly as
            # int64 instead of going through a float tensor.
            input_lengths = torch.full((B, ), T, dtype=torch.long)

            return logsoftmax, input_lengths

    model = ModelCTC()

    return model

コード例 #5

0

ファイルを表示

ファイル: line_lstm_ctc.py プロジェクト: timehaven/fsdl-text-recognizer-project

def line_lstm_ctc(input_shape,
                  output_shape,
                  window_width=28,
                  window_stride=14):
    """Create a sliding-window LeNet + LSTM line model with CTC loss/decoding.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: width of each sliding window.
        window_stride: stride between consecutive windows.

    Returns:
        A ``KerasModel`` taking ``image``, ``y_true``, ``input_length`` and
        ``label_length`` and producing ``ctc_loss`` and ``ctc_decoded``.

    Raises:
        ValueError: if fewer than ``output_length`` windows are generated.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if output_length > num_windows:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_in = Input(shape=input_shape, name='image')
    labels_in = Input(shape=(output_length, ), name='y_true')
    input_len_in = Input(shape=(1, ), name='input_length')
    label_len_in = Input(shape=(1, ), name='label_length')

    # More than one local device is taken to mean that a GPU is available,
    # in which case the cuDNN-backed LSTM is used.
    if len(device_lib.list_local_devices()) > 1:
        lstm_fn = CuDNNLSTM
    else:
        lstm_fn = LSTM

    # Add a channel axis: (image_height, image_width, 1)
    with_channel = Reshape((image_height, image_width, 1))(image_in)

    # Annotated upstream as (num_windows, image_height, window_width, 1)
    window_args = {
        'window_width': window_width,
        'window_stride': window_stride,
    }
    patches = Lambda(slide_window, arguments=window_args)(with_channel)

    # LeNet re-wrapped to end at its second-to-last layer (feature output).
    full_lenet = lenet((image_height, window_width, 1), (num_classes, ))
    feature_net = KerasModel(inputs=full_lenet.inputs,
                             outputs=full_lenet.layers[-2].output)
    window_features = TimeDistributed(feature_net)(patches)

    # One 128-unit recurrent state per window, then per-window class scores.
    sequence = lstm_fn(128, return_sequences=True)(window_features)
    softmax_output = Dense(num_classes,
                           activation='softmax',
                           name='softmax_output')(sequence)

    # Scale the input_length input by the window count before CTC uses it.
    scale_by_windows = Lambda(lambda x, num_windows=None: x * num_windows,
                              arguments={'num_windows': num_windows})
    input_length_processed = scale_by_windows(input_len_in)

    ctc_loss_layer = Lambda(
        lambda args: K.ctc_batch_cost(args[0], args[1], args[2], args[3]),
        name='ctc_loss')
    ctc_loss_output = ctc_loss_layer(
        [labels_in, softmax_output, input_length_processed, label_len_in])

    ctc_decode_layer = Lambda(
        lambda args: ctc_decode(args[0], args[1], output_length),
        name='ctc_decoded')
    ctc_decoded_output = ctc_decode_layer(
        [softmax_output, input_length_processed])

    return KerasModel(
        inputs=[image_in, labels_in, input_len_in, label_len_in],
        outputs=[ctc_loss_output, ctc_decoded_output])

コード例 #6

0

ファイルを表示

ファイル: line_lstm_ctc.py プロジェクト: tranlm/fsdl-text-recognizer-project

def line_lstm_ctc(input_shape,
                  output_shape,
                  window_width=28,
                  window_stride=14,
                  num_conv=128,
                  num_lstm=256):
    """Build a sliding-window LeNet + two-layer bidirectional LSTM CTC model.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: width of each sliding window.
        window_stride: stride between consecutive windows.
        num_conv: accepted for backward compatibility; the LeNet feature
            extractor used here does not take it, so it is currently unused.
        num_lstm: units per direction of each bidirectional LSTM layer.

    Returns:
        A ``KerasModel`` taking ``image``, ``y_true``, ``input_length`` and
        ``label_length`` and producing ``ctc_loss`` and ``ctc_decoded``.

    Raises:
        ValueError: if fewer than ``output_length`` windows are generated.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # More than one local device is taken to mean that a GPU is available.
    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    # Earlier experiments that were kept here as commented-out code (a plain
    # Conv2D front end, and BiLSTM stacks with lighter dropout) were removed;
    # only the configuration that was live is kept.
    image_patches = Lambda(slide_window,
                           arguments={
                               'window_width': window_width,
                               'window_stride': window_stride
                           })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # Make a LeNet and cut it off at its second-to-last layer so that it
    # produces features rather than class probabilities.
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs,
                         outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 128)

    # Two stacked bidirectional LSTMs with dropout before each and after the
    # last. Previously the second dropout and second LSTM were applied to the
    # conv features again and the first LSTM's output was discarded, so only
    # one recurrent layer was effectively in the model.
    # NOTE: this changes the layer graph, so weights saved from the old
    # single-layer graph will not load into this model.
    lstm_input = Dropout(0.5)(convnet_outputs)
    lstm_output = Bidirectional(lstm_fn(
        num_lstm, return_sequences=True))(lstm_input)
    lstm_output = Dropout(0.5)(lstm_output)
    lstm_output = Bidirectional(lstm_fn(
        num_lstm, return_sequences=True))(lstm_output)
    lstm_output = Dropout(0.5)(lstm_output)
    # (num_windows, num_lstm * 2)

    softmax_output = Dense(num_classes,
                           activation='softmax',
                           name='softmax_output')(lstm_output)
    # (num_windows, num_classes)

    # Scale the input_length input by the window count before CTC uses it.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model

コード例 #7

0

ファイルを表示

def line_lstm_ctc(input_shape,
                  output_shape,
                  window_width=28,
                  window_stride=14):
    """Build a sliding-window conv + bidirectional LSTM model with CTC outputs.

    Prints the window configuration, the number of classes and the computed
    number of windows while building.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: width of each sliding window.
        window_stride: stride between consecutive windows.

    Returns:
        A ``KerasModel`` taking ``image``, ``y_true``, ``input_length`` and
        ``label_length`` and producing ``ctc_loss`` and ``ctc_decoded``.

    Raises:
        ValueError: if fewer than ``output_length`` windows are generated.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape
    print(f'window_width: {window_width}, window_stride: {window_stride}')
    print(f'num_classes: {num_classes}')
    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )
    print(f'num_windows: {num_windows}')

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # More than one local device is taken to mean that a GPU is available,
    # in which case the cuDNN-backed LSTM is used.
    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that lstms expect an input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    # TODOs:
    # improve lenet - res, inception nets
    #   - final layer dense? or global_max_pool?
    # bidirectional multilayer lstms
    # Dropouts
    # window_width, window_stride
    # Optimizer, learning rate

    # Add a trailing channel axis with the backend op instead of Reshape.
    image_reshaped = Lambda(lambda x: K.expand_dims(x, axis=-1))(image_input)
    # image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window,
                           arguments={
                               'window_width': window_width,
                               'window_stride': window_stride
                           })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # NOTE(review): lenet is applied as returned here (it is not re-wrapped to
    # drop its last layer), and the shape comment and noise_shape below assume
    # a 256-wide output per window - confirm against this project's lenet.
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 256)
    # noise_shape has size 1 on the window axis, so one dropout mask is shared
    # by all windows of a sample.
    convnet_outputs_dr = Dropout(0.4,
                                 noise_shape=(K.shape(convnet_outputs)[0], 1,
                                              256),
                                 name='dropout1')(convnet_outputs)

    lstm_output = Bidirectional(lstm_fn(128, return_sequences=True),
                                merge_mode='concat')(
                                    convnet_outputs_dr)  # ('sum' presumably an alternative merge_mode that was tried)
    # (num_windows, 256)
    # lstm_output = Bidirectional(lstm_fn(64, return_sequences=True), merge_mode='concat')(lstm_output) # 'sum'

    # The batch size is read from convnet_outputs again; the mask is shared
    # across windows here as well.
    lstm_output_dr = Dropout(0.4,
                             noise_shape=(K.shape(convnet_outputs)[0], 1, 256),
                             name='dropout2')(lstm_output)
    softmax_output = Dense(num_classes,
                           activation='softmax',
                           name='softmax_output')(lstm_output_dr)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    # Scale the input_length input by the window count before CTC uses it.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model

コード例 #8

0

ファイルを表示

ファイル: line_lstm_ctc.py プロジェクト: bryan-x-rai/fsdl-text-recognizer-project

def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):
    """Build a sliding-window LeNet + three-layer bidirectional LSTM CTC model.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: width of each sliding window.
        window_stride: stride between consecutive windows.

    Returns:
        A ``KerasModel`` taking ``image``, ``y_true``, ``input_length`` and
        ``label_length`` and producing ``ctc_loss`` and ``ctc_decoded``.

    Raises:
        ValueError: if fewer than ``output_length`` windows are generated.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if output_length > num_windows:
        raise ValueError(f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})')

    image_in = Input(shape=input_shape, name='image')
    labels_in = Input(shape=(output_length,), name='y_true')
    input_len_in = Input(shape=(1,), name='input_length')
    label_len_in = Input(shape=(1,), name='label_length')

    # More than one local device is taken to mean that a GPU is available.
    if len(device_lib.list_local_devices()) > 1:
        lstm_fn = CuDNNLSTM
    else:
        lstm_fn = LSTM

    # Add a channel axis, then cut the line into windows.
    with_channel = Reshape((image_height, image_width, 1))(image_in)
    window_args = {'window_width': window_width, 'window_stride': window_stride}
    patches = Lambda(slide_window, arguments=window_args)(with_channel)

    # LeNet re-wrapped to end at its second-to-last layer (feature output),
    # applied to every window.
    full_lenet = lenet((image_height, window_width, 1), (num_classes,))
    feature_net = KerasModel(inputs=full_lenet.inputs, outputs=full_lenet.layers[-2].output)
    sequence = TimeDistributed(feature_net)(patches)

    # Three bidirectional LSTM stages, each followed by AlphaDropout with the
    # listed rate.
    for dropout_rate in (0.04, 0.04, 0.05):
        sequence = Bidirectional(lstm_fn(128, return_sequences=True))(sequence)
        sequence = AlphaDropout(dropout_rate)(sequence)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(sequence)

    # highest run: Test evaluation: 0.9641768591746657

    # Scale the input_length input by the window count before CTC uses it.
    scale_by_windows = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows}
    )
    input_length_processed = scale_by_windows(input_len_in)

    ctc_loss_layer = Lambda(
        lambda args: K.ctc_batch_cost(args[0], args[1], args[2], args[3]),
        name='ctc_loss'
    )
    ctc_loss_output = ctc_loss_layer([labels_in, softmax_output, input_length_processed, label_len_in])

    ctc_decode_layer = Lambda(
        lambda args: ctc_decode(args[0], args[1], output_length),
        name='ctc_decoded'
    )
    ctc_decoded_output = ctc_decode_layer([softmax_output, input_length_processed])

    return KerasModel(
        inputs=[image_in, labels_in, input_len_in, label_len_in],
        outputs=[ctc_loss_output, ctc_decoded_output]
    )

コード例 #9

0

ファイルを表示

def line_lstm(input_shape,
              output_shape,
              window_width=20,
              window_stride=14,
              decoder_dim=None,
              encoder_dim=None):
    """Build an encoder/decoder LSTM line recognizer on sliding-window LeNet features.

    The image is cut into horizontal windows, each window is passed through a
    LeNet truncated at its second-to-last layer, the window features are
    encoded by one LSTM into a single vector, that vector is repeated
    ``output_length`` times and decoded by a second LSTM, and a per-step
    softmax produces the result.

    Args:
        input_shape: ``(image_height, image_width)`` of the input image.
        output_shape: ``(output_length, num_classes)`` of the prediction.
        window_width: width of each sliding window passed to ``slide_window``.
        window_stride: stride between windows passed to ``slide_window``.
        decoder_dim: units of the decoder LSTM; 128 when ``None``.
        encoder_dim: units of the encoder LSTM; 128 when ``None``.

    Returns:
        A ``KerasModel`` from the image input to a softmax output of shape
        ``(output_length, num_classes)``.
    """
    # NOTE(review): imported locally because this function did not previously
    # use RepeatVector; this assumes the module's other layers come from
    # tensorflow.keras - confirm, and move to the module imports.
    from tensorflow.keras.layers import RepeatVector

    # Here is another way to pass arguments to the Keras Lambda function
    def slide_window_bound(image,
                           window_width=window_width,
                           window_stride=window_stride):
        return slide_window(image, window_width, window_stride)

    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    if encoder_dim is None:
        encoder_dim = 128
    if decoder_dim is None:
        decoder_dim = 128

    image_input = Input(shape=input_shape)
    # (image_height, image_width)

    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window_bound)(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # Make a LeNet and cut it off at its second-to-last layer so that it
    # produces features rather than class probabilities.
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs,
                         outputs=convnet.layers[-2].output)
    # (image_height, window_width, 1) -> (128,)

    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 128)

    # More than one local device is taken to mean that a GPU is available.
    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm = CuDNNLSTM if gpu_present else LSTM

    # The previous body rebuilt the patches/convnet a second time and then
    # referenced the undefined names `lstm_fn` and `decoder_output`, so the
    # function always raised NameError. The encoder/decoder below defines
    # what the final softmax layer consumes and uses encoder_dim/decoder_dim.

    # Encoder: read the windows in reverse and keep only the final state.
    encoder_output = lstm(encoder_dim,
                          return_sequences=False,
                          go_backwards=True)(convnet_outputs)
    # (encoder_dim)
    repeated_encoding = RepeatVector(output_length)(encoder_output)
    # (output_length, encoder_dim)
    decoder_output = lstm(decoder_dim,
                          return_sequences=True)(repeated_encoding)
    # (output_length, decoder_dim)

    softmax_output = TimeDistributed(Dense(
        num_classes, activation='softmax'))(decoder_output)
    # (output_length, num_classes)

    model = KerasModel(inputs=image_input, outputs=softmax_output)
    return model

コード例 #10

0

ファイルを表示

ファイル: line_lstm_ctc.py プロジェクト: timehaven/fsdl-text-recognizer-project

def line_lstm_ctc(input_shape,
                  output_shape,
                  window_width=28,
                  window_stride=14,
                  conv_dim=128,
                  lstm_dim=128):
    """Build a sliding-window recurrent line recognizer wired for CTC.

    The image is cut into horizontal windows, each window is turned into a
    feature vector, the feature sequence is run through recurrent layers,
    and a per-window softmax feeds both a CTC loss and a CTC decoder.

    Several experimental architectures are kept side by side in an
    ``if 0 / elif 0 / elif 1 / elif 0`` chain; exactly one branch (the one
    guarded by ``1``) is active.

    Args:
        input_shape: ``(image_height, image_width)`` pair.
        output_shape: ``(output_length, num_classes)`` pair.
        window_width: Width of each sliding window, forwarded to
            ``slide_window`` and used to compute the number of windows.
        window_stride: Horizontal step between windows, forwarded to
            ``slide_window`` and used to compute the number of windows.
        conv_dim: Number of filters of the ``Conv2D`` layer. Only read by
            the (currently disabled) convolutional branch.
        lstm_dim: Units of the LSTM layers. Only read by the (currently
            disabled) convolutional branch; the LeNet branches use 128.

    Returns:
        A ``KerasModel`` with inputs
        ``[image, y_true, input_length, label_length]`` and outputs
        ``[ctc_loss, ctc_decoded]``.

    Raises:
        ValueError: If the window settings yield fewer windows than
            ``output_length``.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    # Fail before building any layers if the window settings cannot produce
    # at least output_length windows.
    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # More than one local device is taken as "a GPU is available".
    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that lstms expect a input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window,
                           arguments={
                               'window_width': window_width,
                               'window_stride': window_stride
                           })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    if 0:
        # Variant A (disabled): LeNet features + four stacked BiLSTMs.
        # Make a LeNet and get rid of the last two layers (softmax and dropout)
        convnet = lenet((image_height, window_width, 1), (num_classes, ))
        convnet = KerasModel(inputs=convnet.inputs,
                             outputs=convnet.layers[-2].output)
        convnet_outputs = TimeDistributed(convnet)(image_patches)
        # (num_windows, 128)

        lstm_output0 = Bidirectional(lstm_fn(
            128, return_sequences=True))(convnet_outputs)
        lstm_output1 = Bidirectional(lstm_fn(
            128, return_sequences=True))(lstm_output0)
        lstm_output2 = Bidirectional(lstm_fn(
            128, return_sequences=True))(lstm_output1)
        lstm_output = Bidirectional(lstm_fn(
            128, return_sequences=True))(lstm_output2)

        softmax_output = Dense(num_classes,
                               activation='softmax',
                               name='softmax_output')(lstm_output)
        # (num_windows, num_classes)

    elif 0:
        # Variant B (disabled): LeNet features + two BiLSTMs, each followed
        # by dropout.
        # Make a LeNet and get rid of the last two layers (softmax and dropout)
        convnet = lenet((image_height, window_width, 1), (num_classes, ))
        convnet = KerasModel(inputs=convnet.inputs,
                             outputs=convnet.layers[-2].output)
        convnet_outputs = TimeDistributed(convnet)(image_patches)
        # (num_windows, 128)

        dropout_amount = .2

        lstm_output0 = Bidirectional(lstm_fn(
            128, return_sequences=True))(convnet_outputs)
        do0 = Dropout(dropout_amount)(lstm_output0)
        lstm_output1 = Bidirectional(lstm_fn(128, return_sequences=True))(do0)
        lstm_output = Dropout(dropout_amount)(lstm_output1)

        softmax_output = Dense(num_classes,
                               activation='softmax',
                               name='softmax_output')(lstm_output)
        # (num_windows, num_classes)

    elif 1:
        # Variant C (ACTIVE): LeNet features + three BiLSTMs with dropout
        # between them.
        # Make a LeNet and get rid of the last two layers (softmax and dropout)
        convnet = lenet((image_height, window_width, 1), (num_classes, ))
        convnet = KerasModel(inputs=convnet.inputs,
                             outputs=convnet.layers[-2].output)
        convnet_outputs = TimeDistributed(convnet)(image_patches)
        # (num_windows, 128)

        dropout_amount = .2

        lstm_output0 = Bidirectional(lstm_fn(
            128, return_sequences=True))(convnet_outputs)
        do0 = Dropout(dropout_amount)(lstm_output0)
        lstm_output1 = Bidirectional(lstm_fn(128, return_sequences=True))(do0)
        do1 = Dropout(dropout_amount)(lstm_output1)
        lstm_output = Bidirectional(lstm_fn(128, return_sequences=True))(do1)

        softmax_output = Dense(num_classes,
                               activation='softmax',
                               name='softmax_output')(lstm_output)
        # (num_windows, num_classes)

    elif 0:
        # Variant D (disabled), credited to SERGEY in the original notes.
        # NOTE: this `elif 0:` had been swallowed by the preceding comment
        # line, which made this stack run inside the active branch and
        # silently replace its softmax_output. It is its own branch again.
        # Slide a conv filter stack over image in horizontal direction.
        conv = Conv2D(conv_dim, (image_height, window_width),
                      (1, window_stride),
                      activation='relu')(image_reshaped)
        # (1, num_windows, 128)
        # height of conv filter and height of image are same, so first dim is 1 of output
        # num_windows = (image_width - window_width) / window_stride + 1

        conv_squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv)
        # (num_windows, 128)

        lstm_output0 = lstm_fn(lstm_dim, return_sequences=True)(conv_squeezed)
        lstm_output = lstm_fn(lstm_dim, return_sequences=True)(lstm_output0)
        softmax_output = Dense(num_classes,
                               activation='softmax',
                               name='softmax_output')(lstm_output)

    ##### Your code above (Lab 3)

    # Scale the input_length input by the number of windows before handing
    # it to the CTC ops.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model