def line_lstm(input_shape, output_shape, window_width=20, window_stride=14, decoder_dim=None, encoder_dim=None):
    """Build a sliding-window CNN + encoder/decoder LSTM model for line images.

    Args:
        input_shape: (image_height, image_width) of the input line image.
        output_shape: (output_length, num_classes) of the predicted sequence.
        window_width: width of each window handed to slide_window.
        window_stride: stride between consecutive windows.
        decoder_dim: number of units of the decoder LSTM; 128 when None.
        encoder_dim: number of units of the encoder LSTM; 128 when None.

    Returns:
        A KerasModel mapping the image input to a softmax output per output
        position.
    """

    # Here is another way to pass arguments to the Keras Lambda function
    def slide_window_bound(image, window_width=window_width, window_stride=window_stride):
        return slide_window(image, window_width, window_stride)

    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    if encoder_dim is None:
        encoder_dim = 128
    if decoder_dim is None:
        decoder_dim = 128

    image_input = Input(shape=input_shape)
    # (image_height, image_width)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)
    image_patches = Lambda(slide_window_bound)(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # Reuse LeNet as a per-window feature extractor by cutting it off at its
    # second-to-last layer.
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output)
    # (image_height, window_width, 1) -> (128,)
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 128)

    # Only pick the CuDNN kernel when a real GPU is listed: counting devices is
    # not enough because CPU-only hosts may list extra (e.g. XLA_CPU) devices.
    gpu_present = any(device.device_type == 'GPU' for device in device_lib.list_local_devices())
    lstm = CuDNNLSTM if gpu_present else LSTM

    ##### Your code below (Lab 3)
    # Encode the whole window sequence (read backwards) into a single vector.
    encoder_output = lstm(encoder_dim, return_sequences=False, go_backwards=True)(convnet_outputs)
    # (encoder_dim)
    # Feed the same encoding to the decoder at every output position.
    repeated_encoding = RepeatVector(output_length)(encoder_output)
    # (max_length, encoder_dim)
    decoder_output = lstm(decoder_dim, return_sequences=True)(repeated_encoding)
    # (output_length, decoder_dim)
    ##### Your code above (Lab 3)

    softmax_output = TimeDistributed(Dense(num_classes, activation='softmax'))(decoder_output)
    # (max_length, num_classes)

    model = KerasModel(inputs=image_input, outputs=softmax_output)
    return model
def line_cnn_sliding_window(input_shape: Tuple[int, ...],
                            output_shape: Tuple[int, ...],
                            window_width: int = 16,
                            window_stride: int = 10) -> KerasModel:
    """Build a fully convolutional line recognizer from sliding-window CNN features.

    Input is an image with shape (image_height, image_width)
    Output is of shape (output_length, num_classes)

    Args:
        input_shape: (image_height, image_width) of the input line image.
        output_shape: (output_length, num_classes) of the predicted sequence.
        window_width: width of each window handed to slide_window.
        window_stride: stride between consecutive windows.

    Returns:
        A KerasModel mapping the image input to softmax outputs. The model
        summary is printed as a side effect.

    Raises:
        ValueError: if the window settings yield fewer windows than
            output_length, so no window can be pooled per output position.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    # Validate before building any layer: a pooling width of 0 would otherwise
    # surface as an obscure Conv2D kernel-size error.
    num_windows = int((image_width - window_width) / window_stride) + 1
    width = int(num_windows / output_length)
    if width < 1:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape)
    # (image_height, image_width)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)
    image_patches = Lambda(slide_window, arguments={
        'window_width': window_width,
        'window_stride': window_stride
    })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # Make a LeNet and cut it off at its second-to-last layer
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 128)

    # Now we have to get to (output_length, num_classes) shape. One way to do it
    # is to do another sliding window with width = floor(num_windows / output_length).
    # Note that this will likely produce too many items in the output sequence,
    # so take only output_length. With width == 1 only the first output_length
    # windows contribute to the prediction.

    ##### Your code below (Lab 2)
    convnet_outputs_extra_dim = Lambda(lambda x: tf.expand_dims(x, -1))(convnet_outputs)
    # (num_windows, 128, 1)

    # The kernel spans `width` windows and all 128 features of each window.
    conved_convnet_outputs = Conv2D(
        num_classes, (width, 128), (width, 1), activation='softmax')(convnet_outputs_extra_dim)
    squeezed_conved_convnet_outputs = Lambda(lambda x: tf.squeeze(x, 2))(conved_convnet_outputs)

    softmax_output = Lambda(lambda x: x[:, :output_length, :])(squeezed_conved_convnet_outputs)
    ##### Your code above (Lab 2)

    model = KerasModel(inputs=image_input, outputs=softmax_output)
    model.summary()
    return model
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):  # pylint: disable=too-many-locals
    """Assemble the sliding-window CNN + LSTM model with CTC loss and decoding outputs.

    Args:
        input_shape: (image_height, image_width) of the input line image.
        output_shape: (output_length, num_classes) of the label sequence.
        window_width: width of each window handed to slide_window.
        window_stride: stride between consecutive windows.

    Returns:
        A KerasModel taking [image, y_true, input_length, label_length] and
        producing [ctc_loss, ctc_decoded].

    Raises:
        ValueError: if fewer windows than output_length would be generated.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = 1 + int((image_width - window_width) / window_stride)
    if not num_windows >= output_length:
        raise ValueError(f"Window width/stride need to generate >= {output_length} windows (currently {num_windows})")

    # The four model inputs: the image plus the tensors consumed by the CTC loss.
    image_in = Input(shape=input_shape, name="image")
    labels_in = Input(shape=(output_length,), name="y_true")
    input_length_in = Input(shape=(1,), name="input_length")
    label_length_in = Input(shape=(1,), name="label_length")

    # Add a trailing channel axis, then cut the line into windows.
    with_channel = Reshape((image_height, image_width, 1))(image_in)
    # (image_height, image_width, 1)
    window_kwargs = {"window_width": window_width, "window_stride": window_stride}
    windows = Lambda(slide_window, arguments=window_kwargs)(with_channel)
    # (num_windows, image_height, window_width, 1)

    # LeNet cut off at its second-to-last layer serves as the per-window encoder.
    full_lenet = lenet((image_height, window_width, 1), (num_classes,))
    window_encoder = KerasModel(inputs=full_lenet.inputs, outputs=full_lenet.layers[-2].output)
    window_features = TimeDistributed(window_encoder)(windows)
    # (num_windows, 128)

    sequence_features = LSTM(128, return_sequences=True)(window_features)
    # (num_windows, 128)

    softmax_output = Dense(num_classes, activation="softmax", name="softmax_output")(sequence_features)
    # (num_windows, num_classes)

    # Scale the input_length input by the number of windows.
    scaled_input_length = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={"num_windows": num_windows},
    )(input_length_in)

    loss_layer = Lambda(lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name="ctc_loss")
    ctc_loss_out = loss_layer([labels_in, softmax_output, scaled_input_length, label_length_in])

    decode_layer = Lambda(lambda x: ctc_decode(x[0], x[1], output_length), name="ctc_decoded")
    ctc_decoded_out = decode_layer([softmax_output, scaled_input_length])

    return KerasModel(
        inputs=[image_in, labels_in, input_length_in, label_length_in],
        outputs=[ctc_loss_out, ctc_decoded_out],
    )
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):
    """Create the PyTorch sliding-window conv -> LSTM -> log-softmax module for CTC.

    Args:
        input_shape: (image_height, image_width) of the input line image.
        output_shape: (output_length, num_classes) of the label sequence.
        window_width: width of each window handed to slide_window.
        window_stride: stride between consecutive windows.

    Returns:
        An nn.Module whose forward returns (log-softmax outputs, input lengths).

    Raises:
        ValueError: if fewer windows than output_length would be generated.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = 1 + int((image_width - window_width) / window_stride)
    if not num_windows >= output_length:
        raise ValueError(
            f'Window width/stride need to generate >= {output_length} windows (currently {num_windows})'
        )

    convnet = lenet((image_height, window_width), (num_classes, ))

    class ModelCTC(nn.Module):
        """
        extract image for each window -> conv -> lstm -> dense -> softmax
        """

        def __init__(self):
            super().__init__()
            # Reuse the LeNet layers; the last three entries of its MLP are dropped.
            self.conv1 = convnet.conv_layer
            self.conv2 = convnet.mlp_layer[:-3]
            self.lstm = nn.LSTM(128, 128)
            self.linear = nn.Linear(128, num_classes)

        def forward(self, x):
            # Insert a channel axis at position 1 before windowing.
            windows = slide_window(x.unsqueeze(1), window_width, window_stride)
            batch, channels, height, win_w, steps = windows.shape

            # PyTorch's way of TimeDistributed: move time first, then merge
            # the time and batch dims so the conv stack sees plain images.
            time_major = windows.permute(4, 0, 1, 2, 3).contiguous()
            flat_windows = time_major.view(steps * batch, channels, height, win_w)

            conv_maps = self.conv1(flat_windows)
            # (T*B, C, H/2-2, W/2-2)
            flat_maps = conv_maps.view(steps * batch, -1)
            features = self.conv2(flat_maps).view(steps, batch, 128)

            lstm_out, _ = self.lstm(features)
            # lstm_out: (T, B, 128)

            # nn.Linear() allows 3D tensor
            scores = self.linear(lstm_out)

            # logsoftmax should be in shape (T, B, classes) to be consistent with ctc_decode
            logsoftmax = nn.functional.log_softmax(scores, dim=2)

            # Every sample in the batch has the same number of time steps.
            input_lengths = torch.tensor([steps] * batch, dtype=torch.long)
            return logsoftmax, input_lengths

    return ModelCTC()
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):
    """Build the sliding-window CNN + LSTM model with CTC loss and decoding outputs.

    Args:
        input_shape: (image_height, image_width) of the input line image.
        output_shape: (output_length, num_classes) of the label sequence.
        window_width: width of each window handed to slide_window.
        window_stride: stride between consecutive windows.

    Returns:
        A KerasModel taking [image, y_true, input_length, label_length] and
        producing [ctc_loss, ctc_decoded].

    Raises:
        ValueError: if fewer windows than output_length would be generated.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # Only pick the CuDNN kernel when a real GPU is listed: counting devices is
    # not enough because CPU-only hosts may list extra (e.g. XLA_CPU) devices.
    gpu_present = any(device.device_type == 'GPU' for device in device_lib.list_local_devices())
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that lstms expect a input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window, arguments={
        'window_width': window_width,
        'window_stride': window_stride
    })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # Make a LeNet and cut it off at its second-to-last layer
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 128)

    lstm_output = lstm_fn(128, return_sequences=True)(convnet_outputs)
    # (num_windows, 128)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    # Scale the input_length input by the number of windows.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss')([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14, num_conv=128, num_lstm=256):
    """Build the sliding-window CNN + stacked bidirectional LSTM model with CTC outputs.

    Args:
        input_shape: (image_height, image_width) of the input line image.
        output_shape: (output_length, num_classes) of the label sequence.
        window_width: width of each window handed to slide_window.
        window_stride: stride between consecutive windows.
        num_conv: kept for interface compatibility; not used by this
            LeNet-based architecture.
        num_lstm: number of units per direction of each bidirectional LSTM.

    Returns:
        A KerasModel taking [image, y_true, input_length, label_length] and
        producing [ctc_loss, ctc_decoded].

    Raises:
        ValueError: if fewer windows than output_length would be generated.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # Only pick the CuDNN kernel when a real GPU is listed: counting devices is
    # not enough because CPU-only hosts may list extra (e.g. XLA_CPU) devices.
    gpu_present = any(device.device_type == 'GPU' for device in device_lib.list_local_devices())
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that lstms expect a input of shape (num_batch_size, num_timesteps, feature_length).

    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    ##### Your code below (Lab 3)
    image_patches = Lambda(slide_window, arguments={
        'window_width': window_width,
        'window_stride': window_stride
    })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # Make a LeNet and cut it off at its second-to-last layer
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 128)

    # Two stacked bidirectional LSTMs with dropout in front of each one. The
    # second LSTM must consume the first LSTM's output; feeding it the CNN
    # features again would leave the first LSTM disconnected from the model.
    convnet_outputs = Dropout(0.5)(convnet_outputs)
    lstm_output = Bidirectional(lstm_fn(num_lstm, return_sequences=True))(convnet_outputs)
    lstm_output = Dropout(0.5)(lstm_output)
    lstm_output = Bidirectional(lstm_fn(num_lstm, return_sequences=True))(lstm_output)
    lstm_output = Dropout(0.5)(lstm_output)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    # Scale the input_length input by the number of windows.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss')([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):
    """Build the sliding-window CNN + bidirectional LSTM model with CTC outputs.

    Prints the window settings, the number of classes and the number of
    windows while building.

    Args:
        input_shape: (image_height, image_width) of the input line image.
        output_shape: (output_length, num_classes) of the label sequence.
        window_width: width of each window handed to slide_window.
        window_stride: stride between consecutive windows.

    Returns:
        A KerasModel taking [image, y_true, input_length, label_length] and
        producing [ctc_loss, ctc_decoded].

    Raises:
        ValueError: if fewer windows than output_length would be generated.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape
    print(f'window_width: {window_width}, window_stride: {window_stride}')
    print(f'num_classes: {num_classes}')

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )
    print(f'num_windows: {num_windows}')

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # Only pick the CuDNN kernel when a real GPU is listed: counting devices is
    # not enough because CPU-only hosts may list extra (e.g. XLA_CPU) devices.
    gpu_present = any(device.device_type == 'GPU' for device in device_lib.list_local_devices())
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that lstms expect a input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    # TODOs:
    #   improve lenet - res, inception nets
    #     - final layer dense? or global_max_pool?
    #   bidirectional multilayer lstms
    #   Dropouts
    #   window_width, window_stride
    #   Optimizer, learning rate

    # Append the channel axis without hard-coding the image size.
    image_reshaped = Lambda(lambda x: K.expand_dims(x, axis=-1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window, arguments={
        'window_width': window_width,
        'window_stride': window_stride
    })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # NOTE(review): the dropout noise shapes below hard-code 256 features, which
    # assumes lenet (used here without truncation) emits 256 values per window
    # - confirm against the lenet definition.
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 256)

    # noise_shape has 1 on the time axis, so one dropout mask is shared by all
    # windows of a sample.
    convnet_outputs_dr = Dropout(
        0.4, noise_shape=(K.shape(convnet_outputs)[0], 1, 256), name='dropout1')(convnet_outputs)

    lstm_output = Bidirectional(lstm_fn(128, return_sequences=True), merge_mode='concat')(convnet_outputs_dr)
    # (num_windows, 256)

    lstm_output_dr = Dropout(
        0.4, noise_shape=(K.shape(convnet_outputs)[0], 1, 256), name='dropout2')(lstm_output)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output_dr)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    # Scale the input_length input by the number of windows.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss')([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):
    """Build the sliding-window CNN + three-layer bidirectional LSTM model with CTC outputs.

    Args:
        input_shape: (image_height, image_width) of the input line image.
        output_shape: (output_length, num_classes) of the label sequence.
        window_width: width of each window handed to slide_window.
        window_stride: stride between consecutive windows.

    Returns:
        A KerasModel taking [image, y_true, input_length, label_length] and
        producing [ctc_loss, ctc_decoded].

    Raises:
        ValueError: if fewer windows than output_length would be generated.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})')

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length,), name='y_true')
    input_length = Input(shape=(1,), name='input_length')
    label_length = Input(shape=(1,), name='label_length')

    # Only pick the CuDNN kernel when a real GPU is listed: counting devices is
    # not enough because CPU-only hosts may list extra (e.g. XLA_CPU) devices.
    gpu_present = any(device.device_type == 'GPU' for device in device_lib.list_local_devices())
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that lstms expect a input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)

    # LeNet option (the one in use): a per-window LeNet cut off at its
    # second-to-last layer. A single strided Conv2D over the whole line
    # (selu + BatchNormalization + AlphaDropout) was the tried alternative.
    image_patches = Lambda(
        slide_window,
        arguments={'window_width': window_width, 'window_stride': window_stride}
    )(image_reshaped)
    convnet = lenet((image_height, window_width, 1), (num_classes,))
    convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)

    # Three stacked bidirectional LSTMs, each followed by AlphaDropout.
    lstm1_output = Bidirectional(lstm_fn(128, return_sequences=True))(convnet_outputs)
    lstm1_do = AlphaDropout(0.04)(lstm1_output)
    lstm2_output = Bidirectional(lstm_fn(128, return_sequences=True))(lstm1_do)
    lstm2_do = AlphaDropout(0.04)(lstm2_output)
    lstm3_output = Bidirectional(lstm_fn(128, return_sequences=True))(lstm2_do)
    lstm3_do = AlphaDropout(0.05)(lstm3_output)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm3_do)
    # highest run: Test evaluation: 0.9641768591746657
    ##### Your code above (Lab 3)

    # Scale the input_length input by the number of windows.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows}
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss'
    )([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded'
    )([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output]
    )
    return model
def line_lstm(input_shape, output_shape, window_width=20, window_stride=14, decoder_dim=None, encoder_dim=None):
    """Build a sliding-window CNN + encoder/decoder LSTM model for line images.

    Args:
        input_shape: (image_height, image_width) of the input line image.
        output_shape: (output_length, num_classes) of the predicted sequence.
        window_width: width of each window handed to slide_window.
        window_stride: stride between consecutive windows.
        decoder_dim: number of units of the decoder LSTM; 128 when None.
        encoder_dim: number of units of the encoder LSTM; 128 when None.

    Returns:
        A KerasModel mapping the image input to a softmax output per output
        position.
    """

    # Here is another way to pass arguments to the Keras Lambda function
    def slide_window_bound(image, window_width=window_width, window_stride=window_stride):
        return slide_window(image, window_width, window_stride)

    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    if encoder_dim is None:
        encoder_dim = 128
    if decoder_dim is None:
        decoder_dim = 128

    image_input = Input(shape=input_shape)
    # (image_height, image_width)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)
    image_patches = Lambda(slide_window_bound)(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # Reuse LeNet as a per-window feature extractor by cutting it off at its
    # second-to-last layer.
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output)
    # (image_height, window_width, 1) -> (128,)
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 128)

    # Only pick the CuDNN kernel when a real GPU is listed: counting devices is
    # not enough because CPU-only hosts may list extra (e.g. XLA_CPU) devices.
    gpu_present = any(device.device_type == 'GPU' for device in device_lib.list_local_devices())
    lstm = CuDNNLSTM if gpu_present else LSTM

    ##### Your code below (Lab 3)
    # The features above are already computed, so only the sequence model goes
    # here. It must define `decoder_output` with output_length time steps for
    # the softmax layer below.
    encoder_output = lstm(encoder_dim, return_sequences=False, go_backwards=True)(convnet_outputs)
    # (encoder_dim)
    # Feed the same encoding to the decoder at every output position.
    repeated_encoding = RepeatVector(output_length)(encoder_output)
    # (max_length, encoder_dim)
    decoder_output = lstm(decoder_dim, return_sequences=True)(repeated_encoding)
    # (output_length, decoder_dim)
    ##### Your code above (Lab 3)

    softmax_output = TimeDistributed(Dense(num_classes, activation='softmax'))(decoder_output)
    # (max_length, num_classes)

    model = KerasModel(inputs=image_input, outputs=softmax_output)
    return model
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14, conv_dim=128, lstm_dim=128):
    """Build the sliding-window CNN + three-layer bidirectional LSTM model with CTC outputs.

    Args:
        input_shape: (image_height, image_width) of the input line image.
        output_shape: (output_length, num_classes) of the label sequence.
        window_width: width of each window handed to slide_window.
        window_stride: stride between consecutive windows.
        conv_dim: kept for interface compatibility; not used by this
            LeNet-based architecture.
        lstm_dim: number of units per direction of each bidirectional LSTM.

    Returns:
        A KerasModel taking [image, y_true, input_length, label_length] and
        producing [ctc_loss, ctc_decoded].

    Raises:
        ValueError: if fewer windows than output_length would be generated.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # Only pick the CuDNN kernel when a real GPU is listed: counting devices is
    # not enough because CPU-only hosts may list extra (e.g. XLA_CPU) devices.
    gpu_present = any(device.device_type == 'GPU' for device in device_lib.list_local_devices())
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that lstms expect a input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window, arguments={
        'window_width': window_width,
        'window_stride': window_stride
    })(image_reshaped)
    # (num_windows, image_height, window_width, 1)

    # Make a LeNet and cut it off at its second-to-last layer
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)
    # (num_windows, 128)

    # Three stacked bidirectional LSTMs with dropout between consecutive layers.
    dropout_amount = .2
    lstm_output0 = Bidirectional(lstm_fn(lstm_dim, return_sequences=True))(convnet_outputs)
    do0 = Dropout(dropout_amount)(lstm_output0)
    lstm_output1 = Bidirectional(lstm_fn(lstm_dim, return_sequences=True))(do0)
    do1 = Dropout(dropout_amount)(lstm_output1)
    lstm_output = Bidirectional(lstm_fn(lstm_dim, return_sequences=True))(do1)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output)
    # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    # Scale the input_length input by the number of windows.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss')([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model