def build_graph(self, inputs, training=None):
    """Assemble the recognition graph and return its output tensors.

    The image is rescaled and inverted, pushed through every conv/pool pair,
    flattened per step, run through the BiLSTM, dropout and logits layers and
    finally decoded with the greedy CTC decoder.

    Args:
        inputs: mapping providing the entries ``Keys.Image`` and
            ``Keys.ImageLength``.
        training: accepted for interface compatibility; not read in this
            method.

    Returns:
        dict with the keys ``blank_last_logits``, ``blank_last_softmax``,
        ``logits``, ``softmax``, ``decoded`` and ``out_len``. ``logits`` and
        ``softmax`` are the blank-last tensors rolled by one along the last
        axis; ``decoded`` is dense with ``-1`` as fill value.
    """
    pixels = tf.expand_dims(inputs[Keys.Image], axis=-1)  # append the channel axis
    seq_len = inputs[Keys.ImageLength]
    num_samples = tf.shape(pixels)[0]

    # Rescale and invert: a pixel value of 0 maps to 1 and 255 maps to 0.
    features = 1 - tf.cast(pixels, tf.float32) / 255.0

    for conv_layer, pool_layer in zip(self.conv_layers, self.pool_layers):
        features = pool_layer(conv_layer(features))
        # Track the length after one 2x2 pooling step (rounded up).
        seq_len = (seq_len + 1) // 2

    height, channels = features.shape[2:4]
    features = tf.reshape(features, [num_samples, -1, height * channels])

    # The first two axes are swapped around the BiLSTM call and swapped back.
    features = tf.transpose(features, [1, 0, 2])
    features = self.bilstm_layer(features)
    features = tf.transpose(features, [1, 0, 2])
    features = self.dropout_layer(features)

    blank_last_logits = self.logits_layer(features)
    blank_last_softmax = tf.nn.softmax(blank_last_logits)
    logits = tf.roll(blank_last_logits, shift=1, axis=-1)
    softmax = tf.roll(blank_last_softmax, shift=1, axis=-1)

    decoder_input = tf.transpose(blank_last_logits, perm=[1, 0, 2])
    decoder_lengths = tf.cast(keras.backend.flatten(seq_len), "int32")
    decoder_result = ctc_ops.ctc_greedy_decoder(
        inputs=decoder_input,
        sequence_length=decoder_lengths,
    )
    greedy_decoded = decoder_result[0][0]

    return {
        "blank_last_logits": blank_last_logits,
        "blank_last_softmax": blank_last_softmax,
        "logits": logits,
        "softmax": softmax,
        "decoded": tf.sparse.to_dense(greedy_decoded, default_value=-1),
        "out_len": seq_len,
    }
def call(self, inputs, training=None):
    """Decode `inputs` with CTC and return fixed-width dense label tensors.

    The input is transposed on its first two axes, converted to log space and
    decoded either greedily or with beam search (``self.greedy``). Every
    decoded path is made dense (fill value ``-1``), extended on axis 1 with
    ``self.dummy_word`` entries and cut to ``self.sample_out_size`` columns.

    Args:
        inputs: rank-3 tensor (it is indexed as ``inputs[:, 0, 0]``);
            presumably softmax output shaped (samples, time, classes) --
            TODO confirm with the caller.
        training: accepted for interface compatibility; not read here.

    Returns:
        list with one tensor per decoded path.
    """
    log_probs = tf.log(tf.transpose(inputs, perm=[1, 0, 2]) + 1e-8)

    # Every sample is decoded with the same length, self.sample_out_size.
    seq_lengths = K.ones_like(inputs[:, 0, 0], dtype='int32') * self.sample_out_size

    if not self.greedy:
        decoded, _ = ctc.ctc_beam_search_decoder(
            inputs=log_probs,
            sequence_length=seq_lengths,
            beam_width=self.beam_width,
            top_paths=self.top_paths,
            merge_repeated=False)
    else:
        decoded, _ = ctc.ctc_greedy_decoder(
            inputs=log_probs,
            sequence_length=seq_lengths,
            merge_repeated=False)

    dense_paths = []
    for sparse_path in decoded:
        dense_paths.append(
            tf.sparse_to_dense(sparse_path.indices,
                               sparse_path.dense_shape,
                               sparse_path.values,
                               default_value=-1))

    # Filler block appended to each path before truncating to the target width.
    filler = K.ones_like(inputs[:, :, 0], dtype='int64') * self.dummy_word

    padded_paths = []
    for dense_path in dense_paths:
        extended = K.concatenate((dense_path, filler), axis=1)
        padded_paths.append(extended[:, :self.sample_out_size])
    return padded_paths
def ctc_loss(self,outputs, targets, seq_len, num_classes,initial_learning_rate, keep_prob=0.8, scopeN="l1-ctc_loss"):
    """Build the CTC projection, loss, optimizer and decoder ops.

    NOTE(review): the scope nesting below was reconstructed from collapsed
    text -- confirm it against the original file before relying on op or
    variable names.

    @param outputs: activations multiplied with a
        [self.hidden*2, num_classes] weight matrix (the original doc states
        [batch,h,w,channels]; the matmul suggests a flattened 2-D tensor --
        TODO confirm)
    @param targets: labels handed to the CTC loss and to tf.edit_distance
        (the original doc says: sparse tensor)
    @param seq_len: the length of the input sequences [batch]
    @param num_classes: the number of classes
    @param initial_learning_rate: learning rate handed to the optimizer
    @param keep_prob: keep probability for tf.nn.dropout on the logits;
        None disables the dropout
    @param scopeN: suffix of the variable scope name
    @returns: tuple (optimizer, cost, ler, decoded) -- the minimize op, the
        mean CTC loss, the mean edit distance (label error rate) and the
        decoder output of the batch
    @raises Exception: if self.optimizer or self.ctc_decoder holds an
        unsupported value
    """
    with tf.name_scope('Train'):
        with tf.variable_scope("ctc_loss-"+scopeN) as scope:
            W = tf.Variable(tf.truncated_normal([self.hidden*2, num_classes], stddev=0.1))
            # Zero initialization
            b = tf.Variable(tf.constant(0., shape=[num_classes]))
            tf.summary.histogram('histogram-b-ctc', b)
            tf.summary.histogram('histogram-w-ctc', W)
            # Doing the affine projection
            logits = tf.matmul(outputs, W) + b
            # Dropout is applied to the projected logits, not to `outputs`.
            if keep_prob is not None:
                logits = tf.nn.dropout(logits, keep_prob)
            # Reshaping back to the original shape
            logits = tf.reshape(logits, [self.width, self.batch_size, num_classes])
            #logits = tf.transpose(logits, [1,0,2])
            with tf.name_scope('CTC-loss'):
                # NOTE(review): argument order (inputs, labels, seq_len) matches
                # older TensorFlow releases -- confirm against the version in use.
                loss = ctc_ops.ctc_loss(logits, targets, seq_len)
                cost = tf.reduce_mean(loss)
            with tf.name_scope('Optimizer'):
                if self.optimizer == "ADAM":
                    optimizer = tf.train.AdamOptimizer(learning_rate=initial_learning_rate,name="AdamOptimizer").minimize(cost)
                elif self.optimizer == "RMSP":
                    optimizer = tf.train.RMSPropOptimizer(learning_rate=initial_learning_rate, decay=self.decay, momentum=self.momentum).minimize(cost)
                else:
                    raise Exception("model type not supported: {}".format(self.optimizer))
    with tf.name_scope('Prediction'):
        if self.ctc_decoder == 'greedy':
            decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_len)
        elif self.ctc_decoder == 'beam_search':
            decoded, log_prob = ctc_ops.ctc_beam_search_decoder(logits, seq_len)
        else:
            raise Exception("model type not supported: {}".format(self.ctc_decoder))
        # Inaccuracy: label error rate
        ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))
    return optimizer, cost, ler, decoded
def decode(self, predictions, seq_len, k):
    """Decode `predictions` with the decoder selected by ``self.ctc_decoder``.

    Args:
        predictions: tensor handed to the CTC decoder as its inputs.
        seq_len: sequence lengths handed to the CTC decoder.
        k: number of paths requested from the beam search decoder; not used
            by the greedy decoder.

    Returns:
        The decoded part of the decoder result (the log probabilities are
        discarded).

    Raises:
        Exception: if ``self.ctc_decoder`` is neither ``'greedy'`` nor
            ``'beam_search'``.
    """
    if self.ctc_decoder == 'greedy':
        decoded, _ = ctc_ops.ctc_greedy_decoder(predictions, seq_len)
        return decoded
    if self.ctc_decoder == 'beam_search':
        decoded, _ = ctc_ops.ctc_beam_search_decoder(predictions,
                                                     seq_len,
                                                     top_paths=k)
        return decoded
    raise Exception("model type not supported: {}".format(self.ctc_decoder))
def setup_decoder(self):
    """Create the CTC decoder ops inside the ``decode`` name scope.

    ``self.beam_search_decoder`` selects the decoder: ``'default'`` uses the
    beam search decoder, ``'greedy'`` the greedy one, both with
    ``merge_repeated=False``. The result is stored in ``self.decoded`` and
    ``self.log_prob``. Any other value only logs a warning and leaves both
    attributes untouched.
    """
    with tf.name_scope("decode"):
        selection = self.beam_search_decoder
        if selection == 'default':
            result = ctc_ops.ctc_beam_search_decoder(
                self.logits, self.seq_length, merge_repeated=False)
        elif selection == 'greedy':
            result = ctc_ops.ctc_greedy_decoder(
                self.logits, self.seq_length, merge_repeated=False)
        else:
            logging.warning("Invalid beam search decoder option selected!")
            return
        self.decoded, self.log_prob = result
def ctc_decode(self, y_pred, input_length, greedy=True, beam_width=100,
               top_paths=1, merge_repeated=False):
    """Decode softmax output with greedy (best path) or beam search CTC.

    # Arguments
        y_pred: tensor `(samples, time_steps, num_categories)` containing
            the prediction, or output of the softmax.
        input_length: tensor `(samples, )` containing the sequence length
            for each batch item in `y_pred`.
        greedy: use the faster best-path search if `True`, otherwise the
            beam search decoder.
        beam_width: beam width of the beam search decoder; only read if
            `greedy` is `False`.
        top_paths: number of most probable paths returned by the beam
            search decoder; only read if `greedy` is `False`.
        merge_repeated: forwarded to the beam search decoder; only read if
            `greedy` is `False`.

    # Returns
        Tuple:
            List: dense tensors of the decoded sequences, one per decoded
                path; positions without a label hold `-1`.
            Tensor: the log probabilities reported by the decoder.
    """
    epsilon = 1e-7
    # The decoder input is the log of the prediction with the first two axes
    # swapped; epsilon keeps the log away from zero.
    log_probs = tf_math_ops.log(
        tf.transpose(y_pred, perm=[1, 0, 2]) + epsilon)
    seq_lengths = tf.cast(input_length, tf.int32)

    if not greedy:
        decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
            inputs=log_probs,
            sequence_length=seq_lengths,
            beam_width=beam_width,
            top_paths=top_paths,
            merge_repeated=merge_repeated)
    else:
        decoded, log_prob = ctc_ops.ctc_greedy_decoder(
            inputs=log_probs,
            sequence_length=seq_lengths)

    dense_paths = [
        tf.sparse.to_dense(sparse_path, default_value=-1)
        for sparse_path in decoded
    ]
    return dense_paths, log_prob
def build_graph(self, inputs, training=None):
    """Run the configured layers on the image and build the CTC outputs.

    Args:
        inputs: mapping providing the entries ``"img"`` and ``"img_len"``.
        training: accepted for interface compatibility; not read here.

    Returns:
        dict with the keys ``blank_last_logits``, ``blank_last_softmax``,
        ``out_len``, ``logits``, ``softmax`` and ``decoded``. ``logits`` is
        ``blank_last_logits`` rolled by one along the last axis and
        ``decoded`` is the dense greedy decoding (fill value ``-1``)
        shifted by ``+1``.
    """
    model_params = self._params
    image = tf.cast(inputs["img"], tf.float32) / 255.0
    image_lengths = K.flatten(inputs["img_len"])
    shape = image_lengths, -1

    # if concat or conv_T layers are present, we need to pad the input to
    # ensure that possible up-sampling layers work properly
    needs_padding = any(
        isinstance(layer_params,
                   (ConcatLayerParams, TransposedConv2DLayerParams))
        for layer_params in model_params.layers)

    if needs_padding:
        factor = self._params.compute_max_downscale_factor()
        padding = calculate_padding(image, factor.to_tuple())
        current = KL.Lambda(partial(pad, x_only=True),
                            name="padded_input")([image, padding])
    else:
        current = image

    # Inputs of all layers so far; concat layers receive the whole list.
    inputs_by_index = []
    for layer in self.layer_instances:
        inputs_by_index.append(current)
        layer_input = (inputs_by_index
                       if isinstance(layer.params, ConcatLayerParams)
                       else current)
        current = layer(layer_input)

    lstm_seq_len, _ = self._params.compute_downscaled(shape)
    lstm_seq_len = K.cast(lstm_seq_len, "int32")

    current = self.reshape(current)
    blank_last_logits = self.logits(current)
    blank_last_softmax = self.softmax(blank_last_logits)

    logits = tf.roll(blank_last_logits, shift=1, axis=-1)
    softmax = tf.nn.softmax(logits)

    decoder_input = array_ops.transpose(blank_last_logits, perm=[1, 0, 2])
    decoder_lengths = tf.cast(K.flatten(lstm_seq_len), "int32")
    decoder_result = ctc.ctc_greedy_decoder(
        inputs=decoder_input,
        sequence_length=decoder_lengths,
    )
    greedy_decoded = decoder_result[0][0]

    return {
        "blank_last_logits": blank_last_logits,
        "blank_last_softmax": blank_last_softmax,
        "out_len": lstm_seq_len,
        "logits": logits,
        "softmax": softmax,
        "decoded": tf.sparse.to_dense(greedy_decoded, default_value=-1) + 1,
    }
def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
    """Decode softmax output with greedy (best path) or beam search CTC.

    Both decoders are called with `merge_repeated=False`.

    # Arguments
        y_pred: tensor `(samples, time_steps, num_categories)` containing
            the prediction, or output of the softmax.
        input_length: tensor `(samples, )` containing the sequence length
            for each batch item in `y_pred`.
        greedy: use the faster best-path search if `True`, otherwise the
            beam search decoder.
        beam_width: beam width of the beam search decoder; only read if
            `greedy` is false.
        top_paths: number of most probable paths returned by the beam
            search decoder; only read if `greedy` is false.

    # Returns
        Tuple:
            List: dense tensors of the decoded sequences, one per decoded
                path; positions without a label hold `-1`.
            Tensor: the log probabilities reported by the decoder.
    """
    # Swap the first two axes and move to log space; the small constant keeps
    # the log away from zero.
    log_probs = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
    seq_lengths = tf.to_int32(input_length)

    if not greedy:
        decoded, log_prob = ctc.ctc_beam_search_decoder(
            inputs=log_probs,
            sequence_length=seq_lengths,
            beam_width=beam_width,
            top_paths=top_paths,
            merge_repeated=False)
    else:
        decoded, log_prob = ctc.ctc_greedy_decoder(
            inputs=log_probs,
            sequence_length=seq_lengths,
            merge_repeated=False)

    dense_paths = []
    for sparse_path in decoded:
        dense_paths.append(
            tf.sparse_to_dense(sparse_path.indices,
                               sparse_path.dense_shape,
                               sparse_path.values,
                               default_value=-1))
    return (dense_paths, log_prob)
def ctc_complete_analysis_lambda_func(args, **arguments):
    """
    Complete CTC analysis using Keras and tensorflow
    WARNING : tf is required

    The predictions are decoded (greedy or beam search), the dense labels are
    converted to a sparse tensor and the edit distance between the first
    decoded path and the labels is returned per sequence.

    :param args: y_pred, labels, input_length, label_len
        (the length tensors may be shaped (batch, 1) or (batch,); they are
        flattened to rank 1 before use)
    :param arguments: greedy, beam_width, top_paths (all three keys are
        required, also in greedy mode)
    :return: ler = label error rate per sequence, cast to float32
    :raises KeyError: if one of the expected keys is missing in `arguments`
    """
    y_pred, labels, input_length, label_len = args
    my_params = arguments

    assert (K.backend() == 'tensorflow')

    batch = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)

    # Flatten instead of tf.squeeze(): squeezing without an axis also removes
    # the batch axis when the batch holds a single sample, which leaves a
    # scalar where the CTC ops expect a rank-1 vector of lengths.
    input_length = tf.to_int32(tf.reshape(input_length, [-1]))

    greedy = my_params['greedy']
    beam_width = my_params['beam_width']
    top_paths = my_params['top_paths']

    if greedy:
        (decoded, log_prob) = ctc.ctc_greedy_decoder(
            inputs=batch, sequence_length=input_length)
    else:
        (decoded, log_prob) = ctc.ctc_beam_search_decoder(
            inputs=batch,
            sequence_length=input_length,
            beam_width=beam_width,
            top_paths=top_paths)

    # Only the first decoded path is compared against the labels.
    cast_decoded = tf.cast(decoded[0], tf.float32)

    # Same rank-1 requirement as above for the label lengths.
    label_lengths = tf.cast(tf.reshape(label_len, [-1]), tf.int32)
    sparse_y = K.ctc_label_dense_to_sparse(labels, label_lengths)

    # tf_edit_distance / Kreshape_To1D are project helpers; presumably a
    # normalized edit distance reshaped to one value per sequence -- verify.
    ed_tensor = tf_edit_distance(cast_decoded, sparse_y, norm=True)
    ler_per_seq = Kreshape_To1D(ed_tensor)

    return K.cast(ler_per_seq, dtype='float32')
def ctc_decode(y_pred, input_length, max_output_length):
    """
    Cut down from https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L4170

    Greedy (best path) CTC decoding of a softmax output, padded to a
    minimum width.

    # Arguments
        y_pred: tensor `(samples, time_steps, num_categories)` containing
            the prediction, or output of the softmax.
        input_length: tensor containing the sequence length for each batch
            item in `y_pred`; its last axis is squeezed away.
        max_output_length: int giving the max output sequence length

    # Returns
        Dense tensor of the best decoded path. If it has fewer than
        `max_output_length + 2` columns it is padded with `-1` up to that
        width; otherwise it is returned unchanged.
    """
    log_probs = tf.math.log(
        tf.transpose(y_pred, perm=[1, 0, 2]) + K.epsilon())
    seq_lengths = tf.cast(tf.squeeze(input_length, axis=-1), tf.int32)

    decoded_paths, _ = ctc_ops.ctc_greedy_decoder(
        inputs=log_probs, sequence_length=seq_lengths)
    best_path = decoded_paths[0]
    dense = tf.sparse_to_dense(best_path.indices,
                               best_path.dense_shape,
                               best_path.values,
                               default_value=-1)

    # The dense result has a data-dependent number of columns, so it is
    # brought to one standard width; 2 extra columns give CTC some leeway.
    target_width = max_output_length + 2
    width = tf.shape(dense)[-1]

    def _pad_to_target():
        return tf.pad(dense,
                      [[0, 0], [0, target_width - width]],
                      constant_values=-1)

    def _keep_as_is():
        return dense

    return tf.cond(tf.less(width, target_width), _pad_to_target, _keep_as_is)
def wrap(inputs):
    """Derive the standard output dictionary from logits and output lengths.

    Args:
        inputs: pair ``(logits, output_len)``.

    Returns:
        dict with the keys ``blank_last_logits``, ``out_len``, ``logits``
        (the input logits rolled by one along the last axis),
        ``blank_last_softmax``, ``softmax`` and ``decoded`` (dense int32
        greedy decoding, fill value ``-1``, shifted by ``+1``).
    """
    blank_last_logits, out_len = inputs

    rolled_logits = tf.roll(blank_last_logits, shift=1, axis=-1)
    blank_last_softmax = tf.nn.softmax(blank_last_logits, axis=-1)
    softmax = tf.nn.softmax(rolled_logits)

    decoder_result = ctc_ops.ctc_greedy_decoder(
        inputs=tf.transpose(blank_last_logits, perm=[1, 0, 2]),
        sequence_length=tf.cast(K.flatten(out_len), 'int32'))
    sparse_path = tf.cast(decoder_result[0][0], 'int32', 'greedy_int32')

    # The fill value must share the dtype of the sparse values.
    fill_value = tf.constant(-1, dtype=sparse_path.dtype)
    decoded = tf.sparse.to_dense(sparse_path, default_value=fill_value) + 1

    return {
        'blank_last_logits': blank_last_logits,
        'out_len': out_len,
        'logits': rolled_logits,
        'blank_last_softmax': blank_last_softmax,
        'softmax': softmax,
        'decoded': decoded,
    }
def make_outputs(self, blank_last_softmax, lstm_seq_len, complete_outputs):
    """Build the output dictionary from a blank-last softmax.

    Args:
        blank_last_softmax: softmax tensor; it is decoded greedily and its
            log is reported as ``blank_last_logits``.
        lstm_seq_len: sequence lengths, flattened and cast to int32 for the
            decoder and returned as ``out_len``.
        complete_outputs: iterable of dicts; every entry ``k`` of the i-th
            dict is added to the result under the key ``f"{k}_{i}"``.

    Returns:
        dict with ``blank_last_logits``, ``blank_last_softmax``, ``logits``,
        ``softmax``, ``out_len``, ``decoded`` plus the suffixed entries of
        `complete_outputs`.
    """
    rolled_softmax = tf.roll(blank_last_softmax, shift=1, axis=-1)

    decoder_result = ctc.ctc_greedy_decoder(
        inputs=tf.transpose(blank_last_softmax, perm=[1, 0, 2]),
        sequence_length=tf.cast(K.flatten(lstm_seq_len), "int32"),
    )
    best_path = decoder_result[0][0]

    blank_last_logits = tf.math.log(blank_last_softmax)
    rolled_logits = tf.math.log(rolled_softmax)
    decoded = tf.sparse.to_dense(best_path, default_value=-1) + 1

    result = {
        "blank_last_logits": blank_last_logits,
        "blank_last_softmax": blank_last_softmax,
        "logits": rolled_logits,
        "softmax": rolled_softmax,
        "out_len": lstm_seq_len,
        "decoded": decoded,
    }

    for index, single_output in enumerate(complete_outputs):
        result.update(
            {f"{key}_{index}": value for key, value in single_output.items()})
    return result
# Reshaping back to the original shape logits = tf.reshape(logits, [batch_s, -1, num_classes]) # Time major logits = tf.transpose(logits, (1, 0, 2)) loss = ctc_ops.ctc_loss(targets, logits, seq_len) cost = tf.reduce_mean(loss) optimizer = tf.train.MomentumOptimizer(initial_learning_rate, 0.9).minimize(cost) # Option 2: tf.contrib.ctc.ctc_beam_search_decoder # (it's slower but you'll get better results) decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_len) # Inaccuracy: label error rate ler = tf.reduce_mean( tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)) saver = tf.train.Saver() with tf.Session(graph=graph) as session: # Initializate the weights and biases init_op = tf.global_variables_initializer() init_op.run() # saver.restore(session, './orange.ckpt') # print("Model restored.")
def CheckpointTest():
    """Rebuild the speech model, restore the latest checkpoint and decode one file.

    The graph (placeholders, BiRNN model, CTC loss, optimizer, greedy decoder
    and label error rate) is created first. Then the newest checkpoint in
    ``saver/`` is restored if one exists, ``input.wav`` is fed through the
    network and the decoded text is printed.

    Relies on module-level names defined elsewhere in the file (``n_input``,
    ``n_context``, ``words_size``, ``words``, ``labels``, ``BiRNN_model``,
    ``get_speech_file``, ``ndarray_to_text_ch``).
    """
    # input_tensor is the input audio data with the structure
    # [batch_size, amax_stepsize, n_input + (2 * n_input * n_context)]:
    # batch_size is the batch length, amax_stepsize the number of time steps
    # and n_input + (2 * n_input * n_context) the number of MFCC features.
    # batch_size is variable, so it is None; the number of time steps differs
    # from batch to batch, so amax_stepsize is None as well.
    input_tensor = tf.placeholder(tf.float32, [None, None, n_input + (2 * n_input * n_context)], name='input')

    # Use sparse_placeholder; will generate a SparseTensor, required by ctc_loss op.
    # targets holds the sparse tensor of the text belonging to the audio data.
    targets = tf.sparse_placeholder(tf.int32, name='targets')
    # seq_length holds the sequence lengths of the current batch
    seq_length = tf.placeholder(tf.int32, [None], name='seq_length')
    # keep_dropout is the dropout parameter
    keep_dropout = tf.placeholder(tf.float32)

    # logits is the non-normalized output/activations from the last layer.
    # logits will be input for the loss function.
    logits = BiRNN_model(input_tensor, tf.to_int64(seq_length), n_input, n_context, words_size + 1, keep_dropout)

    # Compute the loss with ctc loss
    aa = ctc_ops.ctc_loss(targets, logits, seq_length)
    avg_loss = tf.reduce_mean(aa)

    # Optimizer. It is not run in this function, but creating it adds the
    # optimizer's variables to the graph before the Saver is built
    # (presumably so the checkpoint written during training can be restored
    # completely -- verify against the training code).
    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(avg_loss)

    # Use the CTC decoder
    with tf.name_scope("decode"):
        decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_length, merge_repeated=True)

    # Compute the edit distance
    with tf.name_scope("accuracy"):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
        # Compute the label error rate (accuracy)
        ler = tf.reduce_mean(distance, name='label_error_rate')

    # Directory the model is saved in; create it if it does not exist
    savedir = "saver/"
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    # Create the saver
    saver = tf.train.Saver(max_to_keep=1)
    # Create the session
    with tf.Session() as sess:
        # Initialize; without a checkpoint the model stays freshly initialized
        sess.run(tf.global_variables_initializer())
        kpt = tf.train.latest_checkpoint(savedir)
        print("kpt:", kpt)
        startepo = 0
        if kpt is not None:
            saver.restore(sess, kpt)
            # The step number follows the LAST hyphen of the checkpoint path;
            # searching from the right keeps this working when the directory
            # or file prefix itself contains a hyphen.
            ind = kpt.rfind("-")
            startepo = int(kpt[ind + 1:])

        # The speech file to recognize
        wav_file = 'input.wav'
        source, source_lengths, sparse_labels = get_speech_file(wav_file, labels)
        # Dropout is disabled for inference (keep probability 1.0)
        feed2 = {input_tensor: source, targets: sparse_labels, seq_length: source_lengths, keep_dropout: 1.0}
        d, train_ler = sess.run([decoded[0], ler], feed_dict=feed2)
        dense_decoded = tf.sparse_tensor_to_dense(d, default_value=-1).eval(session=sess)
        if len(dense_decoded) > 0:
            decoded_str = ndarray_to_text_ch(dense_decoded[0], words)
            print('Decoded: {}'.format(decoded_str))
def sparse_decoded(logits, output_seq_len):
    """Return the first greedy CTC decoding of `logits` as a sparse tensor.

    Args:
        logits: rank-3 tensor; its first two axes are swapped before it is
            handed to the decoder.
        output_seq_len: sequence lengths; flattened and cast to int32.
    """
    decoder_input = array_ops.transpose(logits, perm=[1, 0, 2])
    decoder_lengths = tf.cast(K.flatten(output_seq_len), 'int32')
    decoder_result = ctc.ctc_greedy_decoder(
        inputs=decoder_input,
        sequence_length=decoder_lengths)
    # First element: the decoded paths; of those, the first path.
    return decoder_result[0][0]
def create_network(self, inputs, input_seq_len, dropout_rate, reuse_variables):
    """Build the CNN/LSTM/CTC network described by ``self.network_proto``.

    Layers of ``network_proto.layers`` that are not LSTMs are applied first
    (convolution or max pooling), then all LSTM layers as one bidirectional
    stack, followed by dropout, a linear output layer, softmax and the CTC
    decoder.

    Args:
        inputs: input tensor; reshaped to
            [batch, -1, network_proto.features, 1] when the first layer is
            not an LSTM.
        input_seq_len: sequence lengths of the inputs; divided by the
            pooling stride for every max pooling layer.
        dropout_rate: dropout rate; ``1 - dropout_rate`` is handed to
            tf.nn.dropout as keep probability, and only when
            ``network_proto.dropout > 0``.
        reuse_variables: handed to the variable scope, the conv layers and
            the LSTM cells.

    Returns:
        Tuple ``(lstm_seq_len, time_major_logits, time_major_softmax,
        logits, softmax, decoded, sparse_decoded)`` where ``sparse_decoded``
        is the tuple of named identity ops for the indices, values and dense
        shape of ``decoded``.

    Raises:
        Exception: on an unknown layer type, on LSTM layers with differing
            numbers of hidden nodes, if the cudnn backend is not selected
            while LSTM layers exist, or on an unknown ctc model.
    """
    network_proto = self.network_proto
    seq_len = input_seq_len
    batch_size = tf.shape(inputs)[0]
    gpu_enabled = self.gpu_available

    with tf.variable_scope("", reuse=reuse_variables) as scope:
        no_layers = len(network_proto.layers) == 0
        # Only the type of the first layer decides whether the CNN part is built.
        if not no_layers:
            has_conv_or_pool = network_proto.layers[
                0].type != LayerParams.LSTM
        else:
            has_conv_or_pool = False

        if has_conv_or_pool:
            cnn_inputs = tf.reshape(
                inputs, [batch_size, -1, network_proto.features, 1])
            # (sequence length, number of features), updated by every pooling layer
            shape = seq_len, network_proto.features

            layers = [cnn_inputs]
            last_num_filters = 1

            # Counts conv layers to give them explicit, stable names
            # ("conv2d", "conv2d_1", ...).
            cnn_layer_index = 0
            for layer in [
                    l for l in network_proto.layers
                    if l.type != LayerParams.LSTM
            ]:
                if layer.type == LayerParams.CONVOLUTIONAL:
                    layers.append(
                        tf.layers.conv2d(
                            name="conv2d" if cnn_layer_index == 0 else
                            "conv2d_{}".format(cnn_layer_index),
                            inputs=layers[-1],
                            filters=layer.filters,
                            kernel_size=(layer.kernel_size.x,
                                         layer.kernel_size.y),
                            padding="same",
                            activation=tf.nn.relu,
                            reuse=reuse_variables,
                        ))
                    cnn_layer_index += 1
                    last_num_filters = layer.filters
                elif layer.type == LayerParams.MAX_POOLING:
                    layers.append(
                        tf.layers.max_pooling2d(
                            inputs=layers[-1],
                            pool_size=(layer.kernel_size.x,
                                       layer.kernel_size.y),
                            strides=(layer.stride.x, layer.stride.y),
                            padding="same",
                        ))
                    # Pooling shrinks both the sequence length and the feature count.
                    shape = (tf.to_int32(shape[0] // layer.stride.x),
                             shape[1] // layer.stride.y)
                else:
                    raise Exception("Unknown layer of type %s" % layer.type)

            lstm_seq_len, lstm_num_features = shape
            # Merge the filter axis into the feature axis.
            rnn_inputs = tf.reshape(layers[-1], [
                batch_size,
                tf.shape(layers[-1])[1],
                last_num_filters * lstm_num_features
            ])

            lstm_num_features = last_num_filters * lstm_num_features
        else:
            rnn_inputs = inputs
            lstm_seq_len = seq_len
            lstm_num_features = network_proto.features

        lstm_layers = [
            l for l in network_proto.layers if l.type == LayerParams.LSTM
        ]

        # Time major inputs required for lstm
        time_major_inputs = tf.transpose(rnn_inputs, [1, 0, 2])

        if len(lstm_layers) > 0:
            for i, lstm in enumerate(lstm_layers):
                if lstm.hidden_nodes != lstm_layers[0].hidden_nodes:
                    raise Exception(
                        "Currently all lstm layers must have an equal number of hidden nodes. "
                        "Got {} != {}".format(lstm.hidden_nodes,
                                              lstm_layers[0].hidden_nodes))

            def cpu_cudnn_compatible_lstm_backend(time_major_inputs,
                                                  hidden_nodes):
                # Stacked bidirectional RNN made of cudnn-compatible cells,
                # one forward/backward pair per LSTM layer.
                def get_lstm_cell(num_hidden):
                    return cudnn_rnn.CudnnCompatibleLSTMCell(
                        num_hidden, reuse=reuse_variables)

                fw, bw = zip(*[(get_lstm_cell(hidden_nodes),
                                get_lstm_cell(hidden_nodes))
                               for lstm in lstm_layers])

                time_major_outputs, output_fw, output_bw \
                    = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(list(fw), list(bw), time_major_inputs,
                                                                     sequence_length=lstm_seq_len,
                                                                     dtype=tf.float32,
                                                                     scope="{}cudnn_lstm/stack_bidirectional_rnn".format(scope.name),
                                                                     time_major=True,
                                                                     )

                return time_major_outputs

            def gpu_cudnn_lstm_backend(time_major_inputs, hidden_nodes):
                # Create the Cudnn LSTM factory
                rnn_lstm = cudnn_rnn.CudnnLSTM(
                    len(lstm_layers),
                    hidden_nodes,
                    direction='bidirectional',
                    kernel_initializer=tf.initializers.random_uniform(
                        -0.1, 0.1))

                # TODO: Check if the models are loadable from meta Graph, maybe the next line fixed this
                rnn_lstm._saveable_cls = cudnn_rnn.CudnnLSTMSaveable

                # Apply the lstm to the inputs
                time_major_outputs, (
                    output_h, output_c) = rnn_lstm(time_major_inputs)
                return time_major_outputs

            if network_proto.backend.cudnn:
                if gpu_enabled:
                    print("Using CUDNN LSTM backend on GPU")
                    time_major_outputs = gpu_cudnn_lstm_backend(
                        time_major_inputs, lstm_layers[0].hidden_nodes)
                else:
                    print("Using CUDNN compatible LSTM backend on CPU")
                    time_major_outputs = cpu_cudnn_compatible_lstm_backend(
                        time_major_inputs, lstm_layers[0].hidden_nodes)
            else:
                raise Exception("Only cudnn based backend supported yet.")

            # Set the output size
            # (forward and backward outputs, hence twice the hidden nodes)
            output_size = lstm_layers[-1].hidden_nodes * 2
        else:
            output_size = lstm_num_features
            time_major_outputs = time_major_inputs

        # flatten to (T * N, F) for matrix multiplication. This will be reversed later
        time_major_outputs = tf.reshape(
            time_major_outputs,
            [-1, time_major_outputs.shape.as_list()[2]])

        if network_proto.dropout > 0:
            # tf.nn.dropout expects the keep probability.
            time_major_outputs = tf.nn.dropout(time_major_outputs,
                                               1 - dropout_rate,
                                               name="dropout")

        # we need to turn off validate_shape so we can resize the variable on a codec resize
        w = tf.get_variable('W',
                            validate_shape=False,
                            initializer=tf.random_uniform(
                                [output_size, network_proto.classes], -0.1,
                                0.1))
        b = tf.get_variable('B',
                            validate_shape=False,
                            initializer=tf.constant(
                                0., shape=[network_proto.classes]))

        # the output layer
        time_major_logits = tf.matmul(time_major_outputs, w) + b

        # reshape back
        # (the class count is read from w at run time because w may be resized)
        time_major_logits = tf.reshape(
            time_major_logits, [-1, batch_size, tf.shape(w)[-1]],
            name="time_major_logits")

        time_major_softmax = tf.nn.softmax(time_major_logits, -1,
                                           "time_major_softmax")

        # Batch-major views of the logits and the softmax
        logits = tf.transpose(time_major_logits, [1, 0, 2], name="logits")
        softmax = tf.transpose(time_major_softmax, [1, 0, 2], name="softmax")

        lstm_seq_len = tf.identity(lstm_seq_len, "seq_len_out")

        # DECODER
        # ================================================================
        if network_proto.ctc == NetworkParams.CTC_DEFAULT:
            decoded, log_prob = ctc_ops.ctc_greedy_decoder(
                time_major_logits,
                lstm_seq_len,
                merge_repeated=network_proto.ctc_merge_repeated)
        elif network_proto.ctc == NetworkParams.CTC_FUZZY:
            # The fuzzy decoder op is fed the batch-major softmax.
            decoded, log_prob = self.fuzzy_module['decoder_op'](
                softmax, lstm_seq_len)
        else:
            raise Exception(
                "Unknown ctc model: '%s'. Supported are Default and Fuzzy" %
                network_proto.ctc)

        decoded = decoded[0]
        # Named identity ops for the three components of the sparse result.
        sparse_decoded = (
            tf.identity(decoded.indices, name="decoded_indices"),
            tf.identity(decoded.values, name="decoded_values"),
            tf.identity(decoded.dense_shape, name="decoded_shape"),
        )

        return lstm_seq_len, time_major_logits, time_major_softmax, logits, softmax, decoded, sparse_decoded
def call(self, inputs, training=None):
    """Combine the outputs of all fold graphs into one voted prediction.

    Without a ``'fold_id'`` entry in `inputs` the softmax outputs of all
    fold graphs are averaged (prediction). With ``'fold_id'`` the training
    or the validation branch is built, selected by `training`.

    Args:
        inputs: mapping with at least ``'img'`` and ``'img_len'``;
            optionally ``'fold_id'``.
        training: bool/int to pick the branch in Python, or a tensor to pick
            it with ``tf.cond``. ``None`` falls back to
            ``K.learning_phase()``.

    Returns:
        dict with ``blank_last_logits``, ``blank_last_softmax``, ``logits``,
        ``softmax``, ``out_len`` and ``decoded`` of the combined prediction,
        plus every entry ``k`` of the i-th voter output under ``f"{k}_{i}"``.
    """
    if training is None:
        training = K.learning_phase()

    batch_size = tf.shape(inputs['img_len'])[0]
    max_lstm_seq_len = self._params.compute_downscaled(
        tf.shape(inputs['img'])[1])

    # only pass folds to selected folds
    if 'fold_id' in inputs:
        # Training/Validation graph
        def training_step():
            tf.debugging.assert_greater_equal(inputs['fold_id'], 0)
            complete_outputs = [
                self.fold_graphs[i](inputs)
                for i in range(len(self.fold_graphs))
            ]

            lstm_seq_len = complete_outputs[0][
                'out_len']  # is the same for all children
            softmax_outputs = tf.stack(
                [out['blank_last_softmax'] for out in complete_outputs],
                axis=0)

            # Training: Mask out network that does not contribute to a sample to generate strong voters
            if not self._params.no_masking_out_during_training:
                # The voter whose index equals the sample's fold id is zeroed.
                mask = [
                    tf.not_equal(i, inputs['fold_id'])
                    for i in range(len(self.fold_graphs))
                ]
                softmax_outputs *= tf.cast(tf.expand_dims(mask, axis=-1),
                                           dtype='float32')
                blank_last_softmax = tf.reduce_sum(
                    softmax_outputs, axis=0) / (
                        len(self.fold_graphs) - 1
                    )  # only n - 1 since one voter is 0
            else:
                # In this case, training behaves similar to prediction
                blank_last_softmax = tf.reduce_mean(softmax_outputs, axis=0)
            return blank_last_softmax, lstm_seq_len, complete_outputs

        def validation_step():
            # any dummy output is max length, to get the actual output length use reduce_min
            def gen_empty_output(bs):
                empty = tf.zeros(
                    shape=[bs, max_lstm_seq_len, self._params.classes],
                    dtype='float32')
                return {
                    'blank_last_logits': empty,
                    'blank_last_softmax': empty,
                    'out_len': tf.repeat(max_lstm_seq_len, repeats=bs),
                    'logits': empty,
                    'softmax': empty,
                    'decoded': tf.zeros(shape=[bs, max_lstm_seq_len],
                                        dtype='int64'),
                }

            # Placeholder output (batch size 1) for the graphs that are not
            # responsible for a sample.
            empty_output = gen_empty_output(1)

            # Validation: Compute output for each graph but only for its own partition
            # Per sample this is one CER which is then used e. g. for early stopping
            def apply_single_model(batch):
                batch = batch[
                    'out_len']  # Take any, all are batch id as input
                # Wrap the selected sample in a list so it keeps a batch axis of 1.
                single_batch_data = {
                    k: [tf.gather(v, batch)]
                    for k, v in inputs.items()
                }
                # NOTE(review): the lambdas capture `i` by reference; this
                # presumably works because tf.cond invokes the branch
                # callables while the comprehension is still at that `i` --
                # confirm for the TensorFlow version in use.
                complete_outputs = [
                    tf.cond(tf.equal(i, inputs['fold_id'][batch]),
                            lambda: self.fold_graphs[i](single_batch_data),
                            lambda: empty_output)
                    for i in range(len(self.fold_graphs))
                ]
                # Pick, per key, the output of the graph named by the fold id.
                outputs = {
                    k: tf.gather(
                        tf.stack([out[k] for out in complete_outputs]),
                        inputs['fold_id'][batch][0])[0]
                    for k in empty_output.keys() if k != 'decoded'
                }
                # 'decoded' may be shorter than max_lstm_seq_len, so every
                # candidate is padded to that width before stacking.
                paddings = [
                    ([0, 0],
                     [0, max_lstm_seq_len - tf.shape(out['decoded'])[1]])
                    for out in complete_outputs
                ]
                outputs['decoded'] = tf.gather(
                    tf.stack([
                        tf.pad(out['decoded'],
                               padding,
                               'CONSTANT',
                               constant_values=0)
                        for out, padding in zip(complete_outputs, paddings)
                    ]), inputs['fold_id'][batch][0])[0]
                return outputs

            # The mapped elements are sample indices, repeated for every
            # output key with that key's dtype.
            complete_outputs = tf.map_fn(apply_single_model, {
                k: tf.range(batch_size, dtype=v.dtype)
                for k, v in empty_output.items()
            },
                                         parallel_iterations=len(
                                             self.fold_graphs),
                                         back_prop=False)
            # The same mapped output is reported once per fold graph.
            return complete_outputs[
                'blank_last_softmax'], complete_outputs['out_len'], [
                    complete_outputs
                ] * len(self.fold_graphs)

        if isinstance(training, bool) or isinstance(training, int):
            blank_last_softmax, lstm_seq_len, complete_outputs = training_step(
            ) if training else validation_step()
        else:
            blank_last_softmax, lstm_seq_len, complete_outputs = tf.cond(
                training, training_step, validation_step)
    else:
        # Prediction Graph: standard voting
        complete_outputs = [
            self.fold_graphs[i](inputs)
            for i in range(len(self.fold_graphs))
        ]

        lstm_seq_len = complete_outputs[0][
            'out_len']  # is the same for all children
        softmax_outputs = tf.stack(
            [out['blank_last_softmax'] for out in complete_outputs], axis=0)

        blank_last_softmax = tf.reduce_mean(softmax_outputs, axis=0)

    softmax = tf.roll(blank_last_softmax, shift=1, axis=-1)

    # The decoder is fed the voted softmax (not logits).
    greedy_decoded = ctc.ctc_greedy_decoder(
        inputs=tf.transpose(blank_last_softmax, perm=[1, 0, 2]),
        sequence_length=tf.cast(K.flatten(lstm_seq_len), 'int32'))[0][0]

    outputs = {
        'blank_last_logits': tf.math.log(blank_last_softmax),
        'blank_last_softmax': blank_last_softmax,
        'logits': tf.math.log(softmax),
        'softmax': softmax,
        "out_len": lstm_seq_len,
        'decoded': tf.sparse.to_dense(greedy_decoded, default_value=-1) + 1,
    }

    # Expose every voter output under "<key>_<voter index>".
    for i, voter_output in enumerate(complete_outputs):
        for k, v in voter_output.items():
            outputs[f"{k}_{i}"] = v

    return outputs
# Fragment of a training script: RNN forward pass, CTC loss, optimizer, error
# metric and the start of the session loop.
# NOTE(review): `cell`, `inputList`, `seqLengths`, `initial`, `nHidden`, `W`,
# `b`, `batchSize`, `nClasses`, `targetY`, `learningRate`, `momentum`,
# `nEpochs` and `graph` are defined outside this fragment; the nesting of the
# save calls under the `if` was reconstructed from collapsed text -- confirm.
outputs, last_state = tf.nn.dynamic_rnn(cell, inputList, seqLengths, initial, dtype=tf.float32, scope='rnn')
# Flatten for the matrix multiplication, then restore three axes and swap the
# first two.
outputs = tf.reshape(outputs, (-1, nHidden))
logits = tf.matmul(outputs, W) + b
logits = tf.reshape(logits, (batchSize, -1, nClasses))
logits = tf.transpose(logits, [1,0,2])

####Optimizing
loss = tf.reduce_mean(ctc.ctc_loss(logits, targetY, seqLengths))
optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

####Evaluating
# First path of the greedy decoder, compared to the targets by normalized
# edit distance.
predictions = tf.to_int32(ctc.ctc_greedy_decoder(logits, seqLengths)[0][0])
err = tf.edit_distance(predictions, targetY, normalize=True)
err.set_shape([None])
err = tf.reduce_mean(err, name='error')

####Run session
with tf.Session(graph=graph) as session:
    print('Initializing')
    tf.initialize_all_variables().run()
    for epoch in range(nEpochs):
        print('Epoch', epoch+1, '...')
        # Checkpoint and graph definition are written every 10th epoch.
        if epoch % 10 == 0:
            print('Saving Graph')
            tf.train.Saver().save(session, "/home/zhihaol/807/model.ckpt")
            tf.train.write_graph(session.graph_def, "/home/zhihaol/807/", "model_graph.pbtxt", True)
def from_proto(network_proto):
    """Build the TensorFlow graph, session and training ops described by `network_proto`.

    The graph consists of (in creation order): the four placeholders
    ``inputs``, ``seq_len``, ``targets`` and ``dropout_rate``; an optional
    stack of convolution / max-pooling layers; an optional stacked
    bidirectional LSTM (cuDNN on GPU, cuDNN-compatible cells on CPU); a dense
    output layer (variables ``W`` and ``B``); CTC loss and decoder; the
    optimizer with optional gradient clipping; and the label error rate.

    Args:
        network_proto: network description. The fields read here are
            ``backend`` (thread counts, random seed, ``cudnn``,
            ``fuzzy_ctc_library_path``), ``features``, ``classes``,
            ``layers``, ``dropout``, ``ctc``, ``ctc_merge_repeated``,
            ``solver``, ``learning_rate``, ``momentum``, ``clipping_mode``
            and ``clipping_constant``.
            (presumably a NetworkParams protobuf message — TODO confirm)

    Returns:
        A ``TensorflowModel`` wrapping the graph, the session and the
        tensors/ops created here.

    Raises:
        Exception: on an unknown non-LSTM layer type, LSTM layers with
            differing ``hidden_nodes``, a non-cuDNN backend when LSTM layers
            are present, an unknown CTC mode, an unknown solver, an
            unsupported clipping mode, or a clipping constant <= 0.
    """
    reuse_variables = False
    intra_threads = network_proto.backend.num_intra_threads
    inter_threads = network_proto.backend.num_inter_threads

    # load fuzzy ctc module if available
    # (imported lazily: only needed when a library path is set AND fuzzy CTC is selected)
    if len(network_proto.backend.fuzzy_ctc_library_path) > 0 and network_proto.ctc == NetworkParams.CTC_FUZZY:
        from calamari_ocr.ocr.backends.tensorflow_backend.tensorflow_fuzzy_ctc_loader import load as load_fuzzy
        fuzzy_module = load_fuzzy(
            network_proto.backend.fuzzy_ctc_library_path)
    else:
        fuzzy_module = None

    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(network_proto.backend.random_seed)
        # NOTE(review): the session is created before the graph is built and is
        # not closed if one of the exceptions below is raised.
        session = tf.Session(
            graph=graph,
            config=tf.ConfigProto(
                intra_op_parallelism_threads=intra_threads,
                inter_op_parallelism_threads=inter_threads,
            ))

        # The LSTM backend is chosen below depending on whether the session sees a GPU.
        gpu_enabled = False
        for d in session.list_devices():
            if d.device_type == "GPU":
                gpu_enabled = True
                break

        # Placeholders; the explicit names are part of the graph's interface.
        inputs = tf.placeholder(tf.float32,
                                shape=(None, None, network_proto.features),
                                name="inputs")
        batch_size = tf.shape(inputs)[0]
        seq_len = tf.placeholder(tf.int32, shape=(None, ), name="seq_len")
        targets = tf.sparse_placeholder(tf.int32,
                                        shape=(None, None),
                                        name="targets")
        dropout_rate = tf.placeholder(tf.float32,
                                      shape=(),
                                      name="dropout_rate")

        with tf.variable_scope("", reuse=reuse_variables) as scope:
            # Only the type of the FIRST layer decides whether the conv/pool
            # stack is built at all.
            no_layers = len(network_proto.layers) == 0
            if not no_layers:
                has_conv_or_pool = network_proto.layers[
                    0].type != LayerParams.LSTM
            else:
                has_conv_or_pool = False

            if has_conv_or_pool:
                # Append a single channel axis for the 2-D convolutions.
                cnn_inputs = tf.reshape(
                    inputs, [batch_size, -1, network_proto.features, 1])
                # (sequence lengths, feature count), updated at every pooling layer.
                shape = seq_len, network_proto.features

                layers = [cnn_inputs]
                last_num_filters = 1
                for layer in [
                        l for l in network_proto.layers
                        if l.type != LayerParams.LSTM
                ]:
                    if layer.type == LayerParams.CONVOLUTIONAL:
                        layers.append(
                            tf.layers.conv2d(
                                inputs=layers[-1],
                                filters=layer.filters,
                                kernel_size=(layer.kernel_size.x,
                                             layer.kernel_size.y),
                                padding="same",
                                activation=tf.nn.relu,
                            ))
                        last_num_filters = layer.filters
                    elif layer.type == LayerParams.MAX_POOLING:
                        layers.append(
                            tf.layers.max_pooling2d(
                                inputs=layers[-1],
                                pool_size=(layer.kernel_size.x,
                                           layer.kernel_size.y),
                                strides=(layer.stride.x, layer.stride.y),
                                padding="same",
                            ))
                        # NOTE(review): lengths are tracked with floor division,
                        # while "same"-padded pooling presumably yields
                        # ceil(n / stride) steps — confirm inputs are padded to
                        # a multiple of the stride.
                        shape = (tf.to_int32(shape[0] // layer.stride.x),
                                 shape[1] // layer.stride.y)
                    else:
                        raise Exception("Unknown layer of type %s" %
                                        layer.type)

                lstm_seq_len, lstm_num_features = shape
                # Merge the feature and filter axes into one feature vector per step.
                rnn_inputs = tf.reshape(layers[-1], [
                    batch_size,
                    tf.shape(layers[-1])[1],
                    last_num_filters * lstm_num_features
                ])

                lstm_num_features = last_num_filters * lstm_num_features
            else:
                rnn_inputs = inputs
                lstm_seq_len = seq_len
                lstm_num_features = network_proto.features

            lstm_layers = [
                l for l in network_proto.layers if l.type == LayerParams.LSTM
            ]

            # Time major inputs required for lstm
            time_major_inputs = tf.transpose(rnn_inputs, [1, 0, 2])

            if len(lstm_layers) > 0:
                # Both backends below are built with a single hidden size
                # (taken from the first LSTM layer), so all layers must agree.
                for i, lstm in enumerate(lstm_layers):
                    if lstm.hidden_nodes != lstm_layers[0].hidden_nodes:
                        raise Exception(
                            "Currently all lstm layers must have an equal number of hidden nodes. "
                            "Got {} != {}".format(
                                lstm.hidden_nodes,
                                lstm_layers[0].hidden_nodes))

                def cpu_cudnn_compatible_lstm_backend(
                        time_major_inputs, hidden_nodes):
                    # One forward and one backward cell per LSTM layer.
                    def get_lstm_cell(num_hidden):
                        return cudnn_rnn.CudnnCompatibleLSTMCell(
                            num_hidden, reuse=reuse_variables)

                    fw, bw = zip(*[(get_lstm_cell(hidden_nodes),
                                    get_lstm_cell(hidden_nodes))
                                   for lstm in lstm_layers])

                    time_major_outputs, output_fw, output_bw \
                        = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(list(fw), list(bw), time_major_inputs,
                                                                         sequence_length=lstm_seq_len,
                                                                         dtype=tf.float32,
                                                                         scope="{}cudnn_lstm/stack_bidirectional_rnn".format(scope.name),
                                                                         time_major=True,
                                                                         )

                    return time_major_outputs

                def gpu_cudnn_lstm_backend(time_major_inputs, hidden_nodes):
                    # Create the Cudnn LSTM factory
                    rnn_lstm = cudnn_rnn.CudnnLSTM(
                        len(lstm_layers),
                        hidden_nodes,
                        direction='bidirectional',
                        kernel_initializer=tf.initializers.random_uniform(
                            -0.1, 0.1))

                    # TODO: Check if the models are loadable from meta Graph, maybe the next line fixed this
                    rnn_lstm._saveable_cls = cudnn_rnn.CudnnLSTMSaveable

                    # Apply the lstm to the inputs
                    # (no sequence lengths are passed to this backend)
                    time_major_outputs, (
                        output_h, output_c) = rnn_lstm(time_major_inputs)
                    return time_major_outputs

                if network_proto.backend.cudnn:
                    if gpu_enabled:
                        print("Using CUDNN LSTM backend on GPU")
                        time_major_outputs = gpu_cudnn_lstm_backend(
                            time_major_inputs, lstm_layers[0].hidden_nodes)
                    else:
                        print("Using CUDNN compatible LSTM backend on CPU")
                        time_major_outputs = cpu_cudnn_compatible_lstm_backend(
                            time_major_inputs, lstm_layers[0].hidden_nodes)
                else:
                    raise Exception(
                        "Only cudnn based backend supported yet.")

                # Set the output size
                # (both backends are bidirectional, hence the factor 2)
                output_size = lstm_layers[-1].hidden_nodes * 2
            else:
                output_size = lstm_num_features
                time_major_outputs = time_major_inputs

            # flatten to (T * N, F) for matrix multiplication. This will be reversed later
            time_major_outputs = tf.reshape(
                time_major_outputs,
                [-1, time_major_outputs.shape.as_list()[2]])

            if network_proto.dropout > 0:
                # The placeholder holds the drop rate, so 1 - rate is passed on
                # (presumably as keep_prob — TODO confirm for the TF version in use).
                time_major_outputs = tf.nn.dropout(time_major_outputs,
                                                   1 - dropout_rate,
                                                   name="dropout")

            # we need to turn off validate_shape so we can resize the variable on a codec resize
            W = tf.get_variable('W',
                                validate_shape=False,
                                initializer=tf.random_uniform(
                                    [output_size, network_proto.classes],
                                    -0.1, 0.1))
            b = tf.get_variable('B',
                                validate_shape=False,
                                initializer=tf.constant(
                                    0., shape=[network_proto.classes]))

            # the output layer
            time_major_logits = tf.matmul(time_major_outputs, W) + b

            # reshape back
            # (the class count is read from W at run time rather than from the
            # proto, presumably so that it follows a resized W)
            time_major_logits = tf.reshape(
                time_major_logits, [-1, batch_size, tf.shape(W)[-1]],
                name="time_major_logits")

            time_major_softmax = tf.nn.softmax(time_major_logits, -1,
                                               "time_major_softmax")

            # Batch-major views of the same tensors.
            logits = tf.transpose(time_major_logits, [1, 0, 2], name="logits")
            softmax = tf.transpose(time_major_softmax, [1, 0, 2],
                                   name="softmax")

            # ctc predictions
            # Note for codec change: the codec size is derived upon creation, therefore the ctc ops must be created
            # using the true codec size (the W/B-Matrix may change its shape however during loading/codec change
            # to match the true codec size
            if network_proto.ctc == NetworkParams.CTC_DEFAULT:
                loss = ctc_ops.ctc_loss(
                    targets,
                    time_major_logits,
                    lstm_seq_len,
                    time_major=True,
                    ctc_merge_repeated=network_proto.ctc_merge_repeated,
                    ignore_longer_outputs_than_inputs=True)
                decoded, log_prob = ctc_ops.ctc_greedy_decoder(
                    time_major_logits,
                    lstm_seq_len,
                    merge_repeated=network_proto.ctc_merge_repeated)
                # Disabled alternative decoder:
                # decoded, log_prob = ctc_ops.ctc_beam_search_decoder(time_major_logits, lstm_seq_len, merge_repeated=model_settings["merge_repeated"])
            elif network_proto.ctc == NetworkParams.CTC_FUZZY:
                # The fuzzy ops are fed the batch-major logits/softmax.
                loss, deltas = fuzzy_module['module'].fuzzy_ctc_loss(
                    logits,
                    targets.indices,
                    targets.values,
                    lstm_seq_len,
                    ignore_longer_outputs_than_inputs=True)
                decoded, log_prob = fuzzy_module['decoder_op'](
                    softmax, lstm_seq_len)
            else:
                raise Exception(
                    "Unknown ctc model: '%s'. Supported are Default and Fuzzy"
                    % network_proto.ctc)

            # Keep only the first decoded path and expose its components as named tensors.
            decoded = decoded[0]
            sparse_decoded = (
                tf.identity(decoded.indices, name="decoded_indices"),
                tf.identity(decoded.values, name="decoded_values"),
                tf.identity(decoded.dense_shape, name="decoded_shape"),
            )

            cost = tf.reduce_mean(loss, name='cost')
            if network_proto.solver == NetworkParams.MOMENTUM_SOLVER:
                optimizer = tf.train.MomentumOptimizer(
                    network_proto.learning_rate, network_proto.momentum)
            elif network_proto.solver == NetworkParams.ADAM_SOLVER:
                optimizer = tf.train.AdamOptimizer(
                    network_proto.learning_rate)
            else:
                raise Exception("Unknown solver of type '%s'" %
                                network_proto.solver)

            # Gradients are computed separately so they can be clipped before being applied.
            gvs = optimizer.compute_gradients(cost)

            training_ops = []
            if network_proto.clipping_mode == NetworkParams.CLIP_NONE:
                pass
            elif network_proto.clipping_mode == NetworkParams.CLIP_AUTO:
                # exponentially follow the global average of gradients to set clipping
                ema = tf.train.ExponentialMovingAverage(decay=0.999)

                max_l2 = 1000
                max_grads = 1000

                grads = [grad for grad, _ in gvs]
                # Global norm capped at max_l2 before it enters the moving average.
                l2 = tf.minimum(tf.global_norm([grad for grad in grads]),
                                max_l2)
                l2_ema_op, l2_ema = ema.apply([l2]), ema.average(l2)
                grads, _ = tf.clip_by_global_norm(
                    grads,
                    clip_norm=tf.minimum(l2_ema / max_l2 * max_grads,
                                         max_grads))
                gvs = zip(grads, [var for _, var in gvs])
                # The moving-average update has to run with every training step.
                training_ops.append(l2_ema_op)
            elif network_proto.clipping_mode == NetworkParams.CLIP_CONSTANT:
                clip = network_proto.clipping_constant
                if clip <= 0:
                    raise Exception(
                        "Invalid clipping constant. Must be greater than 0, but got {}"
                        .format(clip))

                grads = [grad for grad, _ in gvs]
                grads, _ = tf.clip_by_global_norm(grads, clip_norm=clip)
                gvs = zip(grads, [var for _, var in gvs])
            else:
                raise Exception("Unsupported clipping mode {}".format(
                    network_proto.clipping_mode))

            training_ops.append(
                optimizer.apply_gradients(gvs, name='grad_update_op'))
            train_op = tf.group(training_ops, name="train_op")

            # Label error rate: mean edit distance between the decoded path and the targets.
            ler = tf.reduce_mean(tf.edit_distance(
                tf.cast(decoded, tf.int32), targets),
                                 name='ler')

            # Named alias so the post-pooling sequence lengths can be fetched by name.
            lstm_seq_len = tf.identity(lstm_seq_len, "seq_len_out")

            return TensorflowModel(network_proto, graph, session, inputs,
                                   seq_len, lstm_seq_len, targets, train_op,
                                   cost, ler, sparse_decoded, softmax,
                                   dropout_rate)
beam_search_decoder = parser.get(config_header, 'beam_search_decoder') # set up GPU if available tf_device = str(parser.get(config_header, 'tf_device')) # set up the max amount of simultaneous users # this restricts GPU usage to the inverse of self.simultaneous_users_count simultaneous_users_count = parser.getint(config_header, 'simultaneous_users_count') input_tensor = tf.placeholder( tf.float32, [None, None, n_input + (2 * n_input * n_context)], name='input') seq_length = tf.placeholder(tf.int32, [None], name='seq_length') logits, summary_op = BiRNN_model(conf_path, input_tensor, tf.to_int64(seq_length), n_input, n_context) decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_length, merge_repeated=True) saver = tf.train.Saver() # create the session sess = tf.Session() saver.restore(sess, model_path) print('Model restored') def evaluate(filename='data/test/1_input.npy'): points = np.load(filename) # print("Points before pre",points.shape) NORM_ARGS = [
def call(self, inputs, **kwargs):
    """Run the convolutional / recurrent stack and greedy-decode its output.

    Args:
        inputs: mapping with the entries ``'img'`` (image batch, rescaled
            here by 1/255) and ``'img_len'`` (per-sample input lengths).
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        dict with the keys ``'blank_last_logits'``, ``'blank_last_softmax'``,
        ``'out_len'`` (input lengths divided by every pooling stride along
        x), ``'logits'`` / ``'softmax'`` (class axis rolled by one) and
        ``'decoded'`` (dense greedy decoding, shifted by +1 so that the
        -1 fill value becomes 0).

    Raises:
        Exception: if ``self.conv_layers`` contains an unsupported layer type.
    """
    model_params: ModelParams = self._params
    pixels = tf.cast(inputs['img'], tf.float32) / 255.0
    # Sequence length and (unknown, hence -1) feature count, tracked through pooling.
    out_len = K.flatten(inputs['img_len'])
    out_features = -1

    # Concat / transposed-conv layers need the input padded to the accumulated
    # pooling stride so that up-sampled tensors line up again.
    upsampling_types = (LayerType.Concat, LayerType.TransposedConv)
    if any(lp.type in upsampling_types for lp in model_params.layers):
        stride_x = stride_y = 1
        for pool in model_params.layers:
            if pool.type == LayerType.MaxPooling:
                stride_x *= pool.stride.x
                stride_y *= pool.stride.y
        pad_amount = calculate_padding(pixels, (stride_x, stride_y))
        current = KL.Lambda(pad, name='padded_input')([pixels, pad_amount])
    else:
        current = pixels

    # earlier_outputs[i] is the tensor that was fed INTO layer i; Concat
    # layers refer back to these by index.
    earlier_outputs = []
    for lp, layer in self.conv_layers:
        earlier_outputs.append(current)
        kind = lp.type
        if kind in (LayerType.Convolutional, LayerType.TransposedConv):
            current = layer(current)
        elif kind == LayerType.Concat:
            current = layer([earlier_outputs[i] for i in lp.concat_indices])
        elif kind == LayerType.DilatedBlock:
            dyn_shape = K.shape(current)
            static_shape = current.shape
            branches, merge = layer
            current = merge([branch(current) for branch in branches])
            # Reshape to dynamic batch/width and static height/channels of the block input.
            current = K.reshape(
                current,
                [dyn_shape[0], dyn_shape[1], static_shape[2], static_shape[3]])
        elif kind == LayerType.MaxPooling:
            current = layer(current)
            out_len, out_features = (out_len // lp.stride.x,
                                     out_features // lp.stride.y)
        else:
            raise Exception("Unknown layer of type %s" % lp.type)

    lstm_seq_len = K.cast(out_len, 'int32')

    # Fold the last two axes into a single feature axis for the recurrent layers.
    dyn_shape = K.shape(current)
    static_shape = current.shape
    current = K.reshape(
        current,
        (dyn_shape[0], dyn_shape[1], static_shape[2] * static_shape[3]))

    for _, lstm_layer in self.lstm_layers:
        current = lstm_layer(current)

    if model_params.dropout > 0:
        current = self.dropout(current)

    blank_last_logits = self.logits(current)
    blank_last_softmax = self.softmax(blank_last_logits)
    # Rolling by one moves the last class to index 0.
    logits = tf.roll(blank_last_logits, shift=1, axis=-1)
    softmax = tf.nn.softmax(logits)

    # The decoder is given the logits with the first two axes swapped.
    time_major_logits = array_ops.transpose(blank_last_logits, perm=[1, 0, 2])
    decoder_lengths = tf.cast(K.flatten(lstm_seq_len), 'int32')
    greedy_decoded = ctc.ctc_greedy_decoder(
        inputs=time_major_logits,
        sequence_length=decoder_lengths)[0][0]
    decoded_dense = tf.sparse.to_dense(greedy_decoded, default_value=-1) + 1

    return {
        'blank_last_logits': blank_last_logits,
        'blank_last_softmax': blank_last_softmax,
        'out_len': lstm_seq_len,
        'logits': logits,
        'softmax': softmax,
        'decoded': decoded_dense,
    }