def predict_on_image(self, image: np.ndarray) -> Tuple[str, float]:
    """Decode the text contained in a single image.

    Args:
        image: Image as a NumPy array. ``uint8`` input is rescaled to
            ``float32`` by dividing by 255; other dtypes are used as-is.

    Returns:
        A ``(pred, conf)`` tuple: the decoded string (with leading/trailing
        ``' '``, ``'|'`` and ``'_'`` stripped) and a confidence score as a
        plain Python float.
    """
    output_length = self.data.output_shape[0]

    if image.dtype == np.uint8:
        image = (image / 255).astype(np.float32)
    # Add a leading batch dimension: the network is run on a batch of one.
    input_image = torch.from_numpy(np.expand_dims(image, 0)).to(device)

    was_training = self.network.training
    self.network.eval()
    try:
        with torch.no_grad():
            y_pred, input_lengths = self.network(input_image)  # y_pred (T,N,C)

            # ctc_decode expects (N,T,C) as its first argument.
            pred_idx = ctc_decode(y_pred.permute((1, 0, 2)), input_lengths, output_length)
            pred_raw = pred_idx[0]  # the batch only contains 1 element
            pred = ''.join(self.data.mapping[label] for label in pred_raw).strip(' |_')

            # Per-timestep maximum over the class axis (batch axis squeezed away).
            # NOTE(review): exp(sum(...)) is only a probability if the network
            # emits log-probabilities -- confirm against the network definition.
            max_logit, _ = torch.max(y_pred.squeeze(dim=1), dim=1)
            # TODO: implement DP to get the right conf for best path
            # .item() converts the 0-dim tensor to the float promised by the
            # return annotation (and detaches it from the device).
            conf = torch.exp(max_logit.sum()).item()
    finally:
        # Restore the previous mode even if inference raised.
        if was_training:
            self.network.train()

    return pred, conf
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14, conv_dim=128, lstm_dim=256):
    """Build a strided-conv + stacked-LSTM line recogniser trained with CTC loss.

    Args:
        input_shape: ``(image_height, image_width)`` of the input images.
        output_shape: ``(output_length, num_classes)`` of the label sequences.
        window_width: Width of the convolution kernel (one "window").
        window_stride: Horizontal stride between consecutive windows.
        conv_dim: Number of convolution filters (features per window).
        lstm_dim: Number of units in each LSTM layer.

    Returns:
        A Keras model with inputs ``[image, y_true, input_length, label_length]``
        and outputs ``[ctc_loss, ctc_decoded]``.

    Raises:
        ValueError: If the window settings produce fewer windows than
            ``output_length``.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})')

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length,), name='y_true')
    input_length = Input(shape=(1,), name='input_length')
    label_length = Input(shape=(1,), name='label_length')

    # Heuristic from the original code: more than one local device means a GPU.
    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    # The kernel spans the full image height, so the output height is 1 and
    # each horizontal position corresponds to one window.
    conv = Conv2D(conv_dim, (image_height, window_width), (1, window_stride), activation='relu')(image_reshaped)
    conv_squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv)

    # Fix: the first LSTM previously consumed the undefined name
    # `convnet_outputs`; it must read the squeezed convolution features.
    lstm_output1 = lstm_fn(lstm_dim, return_sequences=True)(conv_squeezed)
    lstm_output2 = lstm_fn(lstm_dim, return_sequences=True)(lstm_output1)
    # Additive skip connections between consecutive LSTM outputs.
    lstm_output3 = lstm_fn(lstm_dim, return_sequences=True)(lstm_output2 + lstm_output1)
    lstm_output4 = lstm_fn(lstm_dim, return_sequences=True)(lstm_output3 + lstm_output2)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output4)
    # (num_windows, num_classes)

    # Scale the per-sample input_length input by the number of windows.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows}
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss'
    )([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded'
    )([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output]
    )
    return model
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):
    # pylint: disable=too-many-locals
    """Assemble the sliding-window LeNet + LSTM model with CTC loss and decoding.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: Width of each image patch.
        window_stride: Horizontal step between patches.

    Returns:
        Keras model mapping ``[image, y_true, input_length, label_length]`` to
        ``[ctc_loss, ctc_decoded]``.

    Raises:
        ValueError: If fewer than ``output_length`` windows would be produced.
    """
    height, width = input_shape
    output_length, n_classes = output_shape

    num_windows = int((width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(f"Window width/stride need to generate >= {output_length} windows (currently {num_windows})")

    # Model inputs.
    image_input = Input(shape=input_shape, name="image")
    y_true = Input(shape=(output_length,), name="y_true")
    input_length = Input(shape=(1,), name="input_length")
    label_length = Input(shape=(1,), name="label_length")

    # Add a channel axis, then cut the line into patches.
    with_channel = Reshape((height, width, 1))(image_input)
    window_args = {"window_width": window_width, "window_stride": window_stride}
    patches = Lambda(slide_window, arguments=window_args)(with_channel)
    # author's shape note: (num_windows, image_height, window_width, 1)

    # LeNet truncated at its second-to-last layer serves as the per-patch
    # feature extractor.
    full_lenet = lenet((height, window_width, 1), (n_classes,))
    feature_net = KerasModel(inputs=full_lenet.inputs, outputs=full_lenet.layers[-2].output)
    patch_features = TimeDistributed(feature_net)(patches)

    sequence = LSTM(128, return_sequences=True)(patch_features)
    softmax_output = Dense(n_classes, activation="softmax", name="softmax_output")(sequence)
    # (num_windows, num_classes)

    # Scale the input_length input by the window count before handing it to CTC.
    scaled_length = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={"num_windows": num_windows},
    )(input_length)

    loss_inputs = [y_true, softmax_output, scaled_length, label_length]
    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name="ctc_loss",
    )(loss_inputs)

    decode_inputs = [softmax_output, scaled_length]
    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name="ctc_decoded",
    )(decode_inputs)

    return KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output],
    )
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):
    # pylint: disable=too-many-locals
    """Build a sliding-window LeNet + LSTM line recogniser with CTC loss.

    The "Lab 3" section was previously empty, so ``softmax_output`` was
    undefined and the function always failed with ``NameError``. It is now
    filled in with the pipeline the lab instructions describe: slide a window
    over the image, run a convnet on every patch, feed the per-window features
    through an LSTM and project to per-window class probabilities.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: Width of each image patch.
        window_stride: Horizontal step between patches.

    Returns:
        Keras model with inputs ``[image, y_true, input_length, label_length]``
        and outputs ``[ctc_loss, ctc_decoded]``.

    Raises:
        ValueError: If fewer than ``output_length`` windows would be produced.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate >= {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # NOTE(review): sibling builders use `> 1` here; threshold kept as found.
    gpu_present = len(device_lib.list_local_devices()) > 2
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(
        slide_window,
        arguments={'window_width': window_width, 'window_stride': window_stride}
    )(image_reshaped)

    # LeNet truncated at its second-to-last layer acts as the per-window
    # feature extractor; TimeDistributed applies it to every patch.
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)

    # LSTMs expect (num_batch_size, num_timesteps, feature_length).
    lstm_output = lstm_fn(128, return_sequences=True)(convnet_outputs)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output)
    # (num_windows, num_classes)
    # Your code above (Lab 3)

    # Scale the input_length input by the number of windows.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
# lstm_output = Bidirectional(lstm_fn(256, return_sequences=True))(lstm_output) # lstm_output = Dropout(0.5)(lstm_output) lstm_output = BatchNormalization()(lstm_output) lstm_output = Conv1D(256, 3, activation='relu', padding='SAME')(lstm_output) lstm_output = Dropout(0.5)(lstm_output) softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output) # (num_windows, num_classes) ##### Your code above (Lab 3) input_length_processed = Lambda( lambda x, num_windows=None: x * num_windows, arguments={'num_windows': num_windows} )(input_length) ctc_loss_output = Lambda( lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss' )([y_true, softmax_output, input_length_processed, label_length]) ctc_decoded_output = Lambda( lambda x: ctc_decode(x[0], x[1], output_length), name='ctc_decoded' )([softmax_output, input_length_processed]) model = KerasModel( inputs=[image_input, y_true, input_length, label_length], outputs=[ctc_loss_output, ctc_decoded_output] ) return model
def line_lstm_ctc(input_shape, output_shape, window_width=18, window_stride=6, conv_dim=256, lstm_dim=256):
    """Build an all-convolutional front end + 3 stacked BiLSTMs with CTC heads.

    The bidirectional LSTM stack uses ``lstm_dim * 2``, ``lstm_dim`` and
    ``lstm_dim // 2`` units per direction, in that order. No residual
    connections are applied between the layers.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: Window width forwarded to ``all_conv_net``.
        window_stride: Window stride forwarded to ``all_conv_net``.
        conv_dim: Feature size forwarded to ``all_conv_net``.
        lstm_dim: Base number of LSTM units.

    Returns:
        Keras model with inputs ``[image, y_true, input_length, label_length]``
        and outputs ``[ctc_loss, ctc_decoded]``.

    Raises:
        ValueError: If fewer than ``output_length`` windows would be produced.
    """
    # Device probing happens first, exactly as before.
    has_gpu = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if has_gpu else LSTM

    height, width = input_shape
    output_length, n_classes = output_shape

    num_windows = int((width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # ConvNet with windowed output; author's shape note: (num_windows, conv_dim)
    windowed_convnet = all_conv_net((height, width), conv_dim, window_width, window_stride)
    sequence = windowed_convnet(image_input)

    # Three stacked bidirectional LSTMs with shrinking width.
    for units in (lstm_dim * 2, lstm_dim, lstm_dim // 2):
        sequence = Bidirectional(lstm_fn(units, return_sequences=True))(sequence)

    softmax_output = Dense(n_classes, activation='softmax', name='softmax_output')(sequence)
    # (num_windows, num_classes)

    # Scale the input_length input by the number of windows.
    scaled_length = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows},
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss',
    )([y_true, softmax_output, scaled_length, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded',
    )([softmax_output, scaled_length])

    return KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output],
    )
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14, num_conv=128, num_lstm=256):
    """Build the sliding-window LeNet + BiLSTM line recogniser with CTC loss.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: Width of each image patch.
        window_stride: Horizontal step between patches.
        num_conv: Accepted for compatibility; not used by this architecture.
        num_lstm: Accepted for compatibility; not used by this architecture
            (the BiLSTM width is fixed at 256).

    Returns:
        Keras model with inputs ``[image, y_true, input_length, label_length]``
        and outputs ``[ctc_loss, ctc_decoded]``.

    Raises:
        ValueError: If fewer than ``output_length`` windows would be produced.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # Heuristic from the original code: more than one local device means a GPU.
    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window,
                           arguments={
                               'window_width': window_width,
                               'window_stride': window_stride
                           })(image_reshaped)
    # author's shape note: (num_windows, image_height, window_width, 1)

    # LeNet truncated at its second-to-last layer is the per-window feature
    # extractor.
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs,
                         outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)

    # The original code built an extra BiLSTM between these two dropouts, but
    # its output was overwritten before use and never reached the model
    # outputs. That dead layer is removed; both dropouts are kept so the
    # effective graph (and weight compatibility) is unchanged.
    # NOTE(review): the duplicated lines suggest two *stacked* BiLSTMs were
    # intended -- confirm before changing the architecture.
    convnet_outputs = Dropout(0.5)(convnet_outputs)
    convnet_outputs = Dropout(0.5)(convnet_outputs)
    lstm_output = Bidirectional(lstm_fn(256, return_sequences=True))(convnet_outputs)
    lstm_output = Dropout(0.5)(lstm_output)

    softmax_output = Dense(num_classes,
                           activation='softmax',
                           name='softmax_output')(lstm_output)

    # Scale the input_length input by the number of windows.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):
    """Build a sliding-window LeNet + single BiLSTM recogniser with CTC heads.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: Width of each image patch.
        window_stride: Horizontal step between patches.

    Returns:
        Keras model with inputs ``[image, y_true, input_length, label_length]``
        and outputs ``[ctc_loss, ctc_decoded]``.

    Raises:
        ValueError: If fewer than ``output_length`` windows would be produced.
    """
    height, width = input_shape
    output_length, n_classes = output_shape

    num_windows = int((width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # Pick the CuDNN implementation when more than one local device is listed.
    device_count = len(device_lib.list_local_devices())
    lstm_fn = CuDNNLSTM if device_count > 1 else LSTM

    # Add a channel axis and cut the line into patches.
    with_channel = Reshape((height, width, 1))(image_input)
    window_args = {'window_width': window_width, 'window_stride': window_stride}
    patches = Lambda(slide_window, arguments=window_args)(with_channel)
    # author's shape note: (num_windows, image_height, window_width, 1)

    # LeNet truncated at its second-to-last layer, applied to every patch.
    full_lenet = lenet((height, window_width, 1), (n_classes, ))
    feature_net = KerasModel(inputs=full_lenet.inputs, outputs=full_lenet.layers[-2].output)
    patch_features = TimeDistributed(feature_net)(patches)

    sequence = Bidirectional(lstm_fn(256, return_sequences=True))(patch_features)

    softmax_output = Dense(n_classes, activation='softmax', name='softmax_output')(sequence)
    # (num_windows, num_classes)

    # Scale the input_length input by the number of windows.
    scaled_length = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows},
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss',
    )([y_true, softmax_output, scaled_length, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded',
    )([softmax_output, scaled_length])

    return KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output],
    )
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):
    """Build a sliding-window LeNet + BiLSTM recogniser with feature dropout.

    Prints the window settings, class count and window count while building.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: Width of each image patch.
        window_stride: Horizontal step between patches.

    Returns:
        Keras model with inputs ``[image, y_true, input_length, label_length]``
        and outputs ``[ctc_loss, ctc_decoded]``.

    Raises:
        ValueError: If fewer than ``output_length`` windows would be produced.
    """
    height, _width = input_shape
    output_length, num_classes = output_shape
    print(f'window_width: {window_width}, window_stride: {window_stride}')
    print(f'num_classes: {num_classes}')

    num_windows = int((_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )
    print(f'num_windows: {num_windows}')

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    device_count = len(device_lib.list_local_devices())
    lstm_fn = CuDNNLSTM if device_count > 1 else LSTM

    # Remaining ideas from the original author:
    #   improve lenet (res / inception nets; final dense vs. global max pool),
    #   bidirectional multilayer LSTMs, dropouts, window width/stride,
    #   optimizer and learning rate.

    # Append a channel axis to the image.
    with_channel = Lambda(lambda x: K.expand_dims(x, axis=-1))(image_input)

    window_args = {'window_width': window_width, 'window_stride': window_stride}
    patches = Lambda(slide_window, arguments=window_args)(with_channel)
    # author's shape note: (num_windows, image_height, window_width, 1)

    # The full lenet model is applied per patch.
    # NOTE(review): the dropout noise shapes below assume 256 features per
    # window -- confirm against lenet's output size.
    patch_net = lenet((height, window_width, 1), (num_classes, ))
    convnet_outputs = TimeDistributed(patch_net)(patches)

    def _noise_shape():
        # One mask per (sample, feature), broadcast across the time axis.
        return (K.shape(convnet_outputs)[0], 1, 256)

    features_dropped = Dropout(0.4, noise_shape=_noise_shape(), name='dropout1')(convnet_outputs)

    bilstm = Bidirectional(lstm_fn(128, return_sequences=True), merge_mode='concat')
    sequence = bilstm(features_dropped)

    sequence_dropped = Dropout(0.4, noise_shape=_noise_shape(), name='dropout2')(sequence)

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(sequence_dropped)
    # (num_windows, num_classes)

    # Scale the input_length input by the number of windows.
    scaled_length = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows},
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss',
    )([y_true, softmax_output, scaled_length, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded',
    )([softmax_output, scaled_length])

    return KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output],
    )
def evaluate(self, x, y, batch_size: int = 16, verbose: bool = True) -> float:
    """Compute the mean character accuracy of the network on ``(x, y)``.

    Runs the network over the data in batches with gradients disabled, prints
    the mean CTC validation loss, decodes the predictions and compares them to
    the labels via edit distance.

    Args:
        x: Inputs, passed to ``DatasetSequence``.
        y: Labels, passed to ``DatasetSequence``.
        batch_size: Number of samples per batch.
        verbose: If true, print the least accurate, most accurate and a few
            random predictions.

    Returns:
        Mean over samples of ``1 - edit_distance / len(true_string)``.

    Raises:
        ValueError: If the dataset yields no batches.
    """
    blank_idx = self.data.num_classes - 1
    output_length = self.data.output_shape[0]
    test_sequence = DatasetSequence(x, y, batch_size, format_fn=self.batch_format_fn)

    # Loop-invariant: build the loss criterion once instead of once per batch.
    criterion = self.loss()(blank=blank_idx, reduction='mean')

    preds_raw = []
    input_lengths = []
    labels_raw = []
    running_loss = 0
    num_batches = 0

    was_training = self.network.training
    self.network.eval()
    try:
        with torch.no_grad():
            for batch in test_sequence:
                batch_x, batch_y = (out.to(device) for out in batch)

                # log_soft_max (T, B, num_classes)
                log_soft_max, batch_input_lengths = (
                    out.to("cpu") for out in self.network(batch_x)
                )
                labels_cpu = batch_y.cpu()

                # Label length = number of non-blank entries per row.
                output_lengths = torch.sum(labels_cpu != blank_idx, dim=1).to(torch.long)
                loss = criterion(log_soft_max, labels_cpu, batch_input_lengths, output_lengths)
                running_loss += loss.item()
                num_batches += 1

                preds_raw.append(log_soft_max.permute(1, 0, 2))  # (B, T, C)
                input_lengths.append(batch_input_lengths)
                labels_raw.append(labels_cpu)
    finally:
        # Restore the previous mode even if a batch raised.
        if was_training:
            self.network.train()

    if num_batches == 0:
        raise ValueError("Cannot evaluate on an empty dataset")

    # preds_raw: (B, T, C)
    preds_raw, input_lengths = torch.cat(preds_raw), torch.cat(input_lengths)
    labels_raw = torch.cat(labels_raw).numpy()  # (B, output_length)
    print(f"Validation loss: {running_loss/num_batches:.4f}")

    preds = ctc_decode(preds_raw, input_lengths, output_length)
    trues = labels_raw

    pred_strings = [''.join(self.data.mapping.get(label, '') for label in pred).strip(' |_') for pred in preds]
    true_strings = [''.join(self.data.mapping.get(label, '') for label in true).strip(' |_') for true in trues]

    # max(..., 1) avoids ZeroDivisionError when a ground-truth string is empty;
    # non-empty strings are scored exactly as before.
    char_accuracies = [
        1 - editdistance.eval(true_string, pred_string) / max(len(true_string), 1)
        for pred_string, true_string in zip(pred_strings, true_strings)
    ]

    if verbose:
        sorted_ind = np.argsort(char_accuracies)
        print("\nLeast accurate predictions:")
        for ind in sorted_ind[:5]:
            print(f'True: {true_strings[ind]}')
            print(f'Pred: {pred_strings[ind]}')
        print("\nMost accurate predictions:")
        for ind in sorted_ind[-5:]:
            print(f'True: {true_strings[ind]}')
            print(f'Pred: {pred_strings[ind]}')
        print("\nRandom predictions:")
        random_ind = np.random.randint(0, len(char_accuracies), 5)
        for ind in random_ind:
            print(f'True: {true_strings[ind]}')
            print(f'Pred: {pred_strings[ind]}')

    mean_accuracy = np.mean(char_accuracies)
    return mean_accuracy
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):
    """Build a sliding-window LeNet + 3x BiLSTM recogniser with AlphaDropout.

    Each bidirectional LSTM (128 units per direction) is followed by an
    AlphaDropout layer with rates 0.04, 0.04 and 0.05 respectively.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: Width of each image patch.
        window_stride: Horizontal step between patches.

    Returns:
        Keras model with inputs ``[image, y_true, input_length, label_length]``
        and outputs ``[ctc_loss, ctc_decoded]``.

    Raises:
        ValueError: If fewer than ``output_length`` windows would be produced.
    """
    height, width = input_shape
    output_length, n_classes = output_shape

    num_windows = int((width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})')

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length,), name='y_true')
    input_length = Input(shape=(1,), name='input_length')
    label_length = Input(shape=(1,), name='label_length')

    device_count = len(device_lib.list_local_devices())
    lstm_fn = CuDNNLSTM if device_count > 1 else LSTM

    with_channel = Reshape((height, width, 1))(image_input)

    # LeNet feature extractor (truncated at its second-to-last layer) applied
    # to every sliding-window patch.
    window_args = {'window_width': window_width, 'window_stride': window_stride}
    patches = Lambda(slide_window, arguments=window_args)(with_channel)
    full_lenet = lenet((height, window_width, 1), (n_classes,))
    feature_net = KerasModel(inputs=full_lenet.inputs, outputs=full_lenet.layers[-2].output)
    sequence = TimeDistributed(feature_net)(patches)

    # BiLSTM -> AlphaDropout, three times over.
    for drop_rate in (0.04, 0.04, 0.05):
        sequence = Bidirectional(lstm_fn(128, return_sequences=True))(sequence)
        sequence = AlphaDropout(drop_rate)(sequence)

    softmax_output = Dense(n_classes, activation='softmax', name='softmax_output')(sequence)
    # author's note -- highest run: Test evaluation: 0.9641768591746657

    # Scale the input_length input by the number of windows.
    scaled_length = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows},
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss',
    )([y_true, softmax_output, scaled_length, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded',
    )([softmax_output, scaled_length])

    return KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output],
    )
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14, conv_dim=128, lstm_dim=128):
    """Build a sliding-window LeNet + 3x BiLSTM line recogniser with CTC loss.

    Only the architecture that was actually live is kept; the unreachable
    ``if 0`` / ``elif 0`` experiment branches were removed.

    Args:
        input_shape: ``(image_height, image_width)``.
        output_shape: ``(output_length, num_classes)``.
        window_width: Width of each image patch.
        window_stride: Horizontal step between patches.
        conv_dim: Accepted for compatibility; only the removed (unreachable)
            strided-convolution variant used it.
        lstm_dim: Accepted for compatibility; the live architecture uses a
            fixed 128 units per LSTM direction.

    Returns:
        Keras model with inputs ``[image, y_true, input_length, label_length]``
        and outputs ``[ctc_loss, ctc_decoded]``.

    Raises:
        ValueError: If fewer than ``output_length`` windows would be produced.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length, ), name='y_true')
    input_length = Input(shape=(1, ), name='input_length')
    label_length = Input(shape=(1, ), name='label_length')

    # Heuristic from the original code: more than one local device means a GPU.
    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    image_reshaped = Reshape((image_height, image_width, 1))(image_input)
    # (image_height, image_width, 1)

    image_patches = Lambda(slide_window,
                           arguments={
                               'window_width': window_width,
                               'window_stride': window_stride
                           })(image_reshaped)
    # author's shape note: (num_windows, image_height, window_width, 1)

    # LeNet truncated at its second-to-last layer is the per-window feature
    # extractor.
    convnet = lenet((image_height, window_width, 1), (num_classes, ))
    convnet = KerasModel(inputs=convnet.inputs,
                         outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)

    # Three stacked BiLSTMs with dropout between (not after) them.
    dropout_amount = .2
    lstm_output0 = Bidirectional(lstm_fn(128, return_sequences=True))(convnet_outputs)
    do0 = Dropout(dropout_amount)(lstm_output0)
    lstm_output1 = Bidirectional(lstm_fn(128, return_sequences=True))(do0)
    do1 = Dropout(dropout_amount)(lstm_output1)
    lstm_output = Bidirectional(lstm_fn(128, return_sequences=True))(do1)

    softmax_output = Dense(num_classes,
                           activation='softmax',
                           name='softmax_output')(lstm_output)
    # (num_windows, num_classes)

    # Scale the input_length input by the number of windows.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
def cnn_line_lstm_ctc(input_shape, output_shape, **kwargs):
    """Build a CNN + bidirectional-LSTM line model with CTC loss/decoding outputs.

    The image passes through five Conv2D -> BatchNormalization -> LeakyReLU
    stages (the first three each followed by 2x2 max pooling).  The resulting
    feature map is read column by column as a sequence, fed through two
    bidirectional LSTM layers, and projected to per-timestep class
    probabilities by a softmax Dense layer.

    The number of timesteps and the per-timestep feature size are derived
    from ``input_shape`` instead of being hard-coded, so the model builds for
    image sizes other than the one it was originally tuned for.  For a
    28x952 input the values are 119 timesteps of 240 features.

    Args:
        input_shape: ``(image_height, image_width)`` of the input image.
        output_shape: ``(output_length, num_classes)`` of the label sequence.
        **kwargs: Accepted for interface compatibility; not used.

    Returns:
        A ``KerasModel`` with inputs ``[image, y_true, input_length,
        label_length]`` and outputs ``[ctc_loss, ctc_decoded]``.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length,), name='y_true')
    input_length = Input(shape=(1,), name='input_length')
    label_length = Input(shape=(1,), name='label_length')

    # More than one local device is taken to mean that a GPU is available.
    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # One entry per convolutional stage:
    # (filters, dropout rate applied before the conv or None, max-pool after?)
    conv_stages = (
        (16, None, True),
        (32, None, True),
        (48, 0.2, True),
        (64, 0.2, False),
        (80, 0.2, False),
    )

    # (image_height, image_width, 1)
    convnet_outputs = Reshape((image_height, image_width, 1))(image_input)

    # Track the spatial size of the feature map while the stack is built.
    # 'SAME' convolutions keep it unchanged; every 2x2 pooling (default
    # 'valid' padding) floors it to half.
    feature_height, num_windows = image_height, image_width
    for filters, dropout_rate, pool in conv_stages:
        if dropout_rate is not None:
            convnet_outputs = Dropout(dropout_rate)(convnet_outputs)
        convnet_outputs = Conv2D(filters, 3, padding='SAME')(convnet_outputs)
        convnet_outputs = BatchNormalization()(convnet_outputs)
        convnet_outputs = LeakyReLU()(convnet_outputs)
        if pool:
            convnet_outputs = MaxPooling2D(2, 2)(convnet_outputs)
            feature_height //= 2
            num_windows //= 2

    # Every column of the final feature map becomes one timestep whose
    # features are that column's (height x channels) values.
    feature_dim = feature_height * conv_stages[-1][0]
    convnet_outputs = Permute([2, 1, 3])(convnet_outputs)
    convnet_outputs = Reshape([num_windows, feature_dim])(convnet_outputs)
    # (num_windows, feature_dim)

    lstm_output = convnet_outputs
    for _ in range(2):
        lstm_output = Dropout(0.5)(lstm_output)
        lstm_output = Bidirectional(
            lstm_fn(256, return_sequences=True)
        )(lstm_output)
    lstm_output = Dropout(0.5)(lstm_output)

    softmax_output = Dense(
        num_classes, activation='softmax', name='softmax_output'
    )(lstm_output)
    # (num_windows, num_classes)

    # Scale the per-sample input_length input by the number of timesteps.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows},
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss',
    )([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded',
    )([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output],
    )
    return model
def line_lstm_ctc(input_shape, output_shape, **kwargs):
    """Build a sliding-window convolutional line model with CTC outputs.

    The image is normalised and passed through two 3x3 convolutions, a 2x2
    max pooling and dropout.  ``slide_window_flatten`` then cuts the feature
    map into horizontal windows, and a Dense layer maps each window to a
    128-dimensional feature vector.  Despite the function name, the sequence
    model currently in use is a stack of BatchNormalization -> Conv1D ->
    Dropout blocks (no LSTM layer is instantiated), followed by a softmax
    Dense layer.

    The number of timesteps is derived from ``input_shape`` and the window
    parameters instead of being hard-coded; for a 952-pixel-wide image it is
    463.

    Args:
        input_shape: ``(image_height, image_width)`` of the input image.
        output_shape: ``(output_length, num_classes)`` of the label sequence.
        **kwargs: Optional settings.  Only ``lstm_layers`` (default 1) is
            read: the number of BatchNormalization/Conv1D/Dropout blocks.

    Returns:
        A ``KerasModel`` with inputs ``[image, y_true, input_length,
        label_length]`` and outputs ``[ctc_loss, ctc_decoded]``.
    """
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length,), name='y_true')
    input_length = Input(shape=(1,), name='input_length')
    label_length = Input(shape=(1,), name='label_length')

    # More than one local device is taken to mean that a GPU is available.
    # NOTE(review): lstm_fn is not used by the current architecture; the
    # device probe is kept so the function's side effects stay unchanged.
    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Sliding-window geometry, measured in pixels of the pooled feature map.
    window_width = 12
    window_stride = 1

    # (image_height, image_width, 1)
    convnet_outputs = Reshape((image_height, image_width, 1))(image_input)
    convnet_outputs = BatchNormalization()(convnet_outputs)
    convnet_outputs = Conv2D(
        32, kernel_size=(3, 3), activation='relu'
    )(convnet_outputs)
    convnet_outputs = BatchNormalization()(convnet_outputs)
    convnet_outputs = Conv2D(64, (3, 3), activation='relu')(convnet_outputs)
    convnet_outputs = MaxPooling2D(pool_size=(2, 2))(convnet_outputs)
    convnet_outputs = Dropout(0.5)(convnet_outputs)
    convnet_outputs = Lambda(
        slide_window_flatten,
        arguments={
            'window_width': window_width,
            'window_stride': window_stride,
        },
    )(convnet_outputs)
    convnet_outputs = Dense(128, activation='relu')(convnet_outputs)
    # (num_windows, 128)

    # Each unpadded 3x3 convolution trims 2 pixels of width and the 2x2
    # pooling floors the result to half.
    # NOTE(review): assumes slide_window_flatten emits one timestep per
    # fully-contained ('valid') window position - confirm against its
    # definition.  The formula reproduces the previously hard-coded 463 for
    # image_width == 952.
    pooled_width = (image_width - 4) // 2
    num_windows = (pooled_width - window_width) // window_stride + 1

    lstm_output = Dropout(0.5)(convnet_outputs)
    for _ in range(kwargs.get('lstm_layers', 1)):
        lstm_output = BatchNormalization()(lstm_output)
        lstm_output = Conv1D(
            256, 3, activation='relu', padding='SAME'
        )(lstm_output)
        lstm_output = Dropout(0.5)(lstm_output)

    softmax_output = Dense(
        num_classes, activation='softmax', name='softmax_output'
    )(lstm_output)
    # (num_windows, num_classes)

    # Scale the per-sample input_length input by the number of timesteps.
    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows},
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss',
    )([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded',
    )([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output],
    )
    return model