def create(self, imageHeight, imageWidth, num_classes, evalFLAG):
    """Assemble the CNN + BLSTM + CTC recognition graph and return its handles.

    The graph consists of five conv / pool / dropout stages, a reshape that
    regroups the feature map per image column, five bidirectional LSTM
    layers, a linear projection to ``num_classes``, a CTC loss minimised with
    Adam, a beam-search decoder and two edit-distance metrics.

    NOTE(review): this block was reviewed from a copy whose indentation had
    been collapsed, so the ``with``-scope nesting below is a reconstruction.
    The calls that build layers (``CNN``, ``bidirectionalLSTM``, ``FNN``) sit
    in unambiguous scopes; confirm the scopes of the placeholders and
    summaries against version control.

    Args:
        imageHeight: Height used for the ``inputs`` placeholder shape.
        imageWidth: (Padded) width used for the ``inputs`` placeholder shape.
        num_classes: Size of the last dimension of the logits.
        evalFLAG: When truthy, dropout is disabled (``training`` is
            ``not evalFLAG``) and the image / edit-distance summaries are
            registered. It is also forwarded to every layer helper.

    Returns:
        A tuple ``(graph, [saver, transfer_saver], inputs, seq_len, targets,
        targets_len, learning_rate, n_batches, setTotalChars, previousEDabs,
        previousEDnorm, previousCost, optimizer, batch_cost, cost, errors,
        ED, predictions, merged)``.
    """
    lstm_units = 256
    conv_keep_prob = 0.8
    lstm_keep_prob = 0.5
    is_training = not evalFLAG

    # The pooling helper hands back updated dimensions after every stage, so
    # track them in locals instead of rebinding the parameters.
    height, width = imageHeight, imageWidth

    # One row per conv stage: (filters, pooling window, dropout rate).
    conv_plan = (
        (16, [2, 2], 0.0),
        (32, [2, 2], (1 - conv_keep_prob)),
        (48, [2, 2], (1 - conv_keep_prob)),
        (64, [1, 1], (1 - conv_keep_prob)),
        (80, [1, 1], (1 - lstm_keep_prob)),
    )
    blstm_ids = ('1', '2', '3', '4', '5')

    graph = tf.Graph()
    with graph.as_default():
        with tf.name_scope('Inputs'):
            inputs = tf.placeholder(
                tf.float32, [None, height, width, 1], name='inputs')
            if evalFLAG:
                tf.summary.image('inputs', inputs, max_outputs=1)

        # seq_len should be fed with a list containing the real width of the
        # images before they were padded to imageWidth.
        seq_len = tf.placeholder(tf.int32, [None], name='seq_len')
        targets = tf.sparse_placeholder(tf.int32, name='targets')
        targets_len = tf.placeholder(tf.int32, name='targets_len')

        # Convolutional stack: conv -> max-pool -> dropout, five times.
        feature_map = inputs
        lengths = seq_len
        for stage, (n_filters, pool_window, drop_rate) in enumerate(
                conv_plan, start=1):
            with tf.name_scope('Layer_Conv_%d' % stage):
                conv_out = CNN(
                    x=feature_map,
                    filters=n_filters,
                    kernel_size=[3, 3],
                    strides=[1, 1],
                    name='conv%d' % stage,
                    activation=tf.nn.leaky_relu,
                    evalFLAG=evalFLAG,
                    initializer=tf.contrib.layers.xavier_initializer(
                        uniform=False))
                feature_map, lengths, height, width = max_pool(
                    conv_out, pool_window, lengths, height, width, evalFLAG)
                feature_map = tf.layers.dropout(
                    feature_map, rate=drop_rate, training=is_training)

        with tf.name_scope('Reshaping_step'):
            # Presumably (batch, height, width, channels) in, matching the
            # layout of the inputs placeholder; the width axis is moved to the
            # front, height and channels are merged, then batch goes first
            # again.
            sequence = tf.transpose(feature_map, (2, 0, 1, 3))
            sequence = tf.reshape(
                sequence, (int(width), -1, int(height * 80)))
            sequence = tf.transpose(sequence, (1, 0, 2))

        # Recurrent stack: BLSTM -> concat of its outputs -> dropout.
        for layer_id in blstm_ids:
            with tf.name_scope('Layer_BLSTM_' + layer_id):
                sequence = bidirectionalLSTM(
                    sequence, lstm_units, lengths, layer_id, evalFLAG)
                sequence = tf.concat(sequence, 2)
                sequence = tf.layers.dropout(
                    sequence, rate=(1 - lstm_keep_prob), training=is_training)

        with tf.name_scope('Layer_Linear') as ns:
            flat = tf.transpose(sequence, (1, 0, 2))
            flat = tf.reshape(flat, (-1, 2 * lstm_units))
            logits = FNN(flat, num_classes, ns, None, evalFLAG)

        with tf.name_scope('Logits'):
            logits = tf.reshape(logits, (int(width), -1, num_classes))

        # The CTC loss and the decoder use a sequence length that is never
        # smaller than the target length.
        ctc_lengths = tf.maximum(lengths, targets_len)

        n_batches = tf.placeholder(tf.float32, name='n_batches')
        previousCost = tf.placeholder(tf.float32, name='previous_cost')

        with tf.name_scope('CTC_Loss'):
            loss = tf.nn.ctc_loss(
                targets,
                logits,
                ctc_lengths,
                preprocess_collapse_repeated=False,
                ctc_merge_repeated=True)
            with tf.name_scope('total'):
                batch_cost = tf.reduce_mean(loss)
                # Running mean: the caller feeds back the cost accumulated so
                # far through previousCost.
                cost = batch_cost / n_batches + previousCost

        tf.summary.scalar('CTC_loss', cost)

        with tf.name_scope('train'):
            trainable = tf.GraphKeys.TRAINABLE_VARIABLES
            train_vars = (
                tf.get_collection(trainable, scope='Layer_Linear')
                + tf.get_collection(trainable, scope='BLSTM[12345]')
                + tf.get_collection(trainable, scope='conv[12345]'))
            print(train_vars)

            learning_rate = tf.placeholder(tf.float32, name='learning_rate')
            adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
            optimizer = adam.minimize(batch_cost)

        with tf.name_scope('predictions'):
            predictions, _ = tf.nn.ctc_beam_search_decoder(
                logits, ctc_lengths, merge_repeated=False)

        with tf.name_scope('CER'):
            with tf.name_scope('Mean_CER_per_word'):
                previousEDnorm = tf.placeholder(
                    tf.float32, name='previousEDnorm')
                normalized = tf.edit_distance(
                    tf.cast(predictions[0], tf.int32),
                    targets,
                    normalize=True)
                EDnorm = (
                    tf.reduce_mean(normalized) / n_batches + previousEDnorm)
                if evalFLAG:
                    tf.summary.scalar('EDnorm', EDnorm)

            with tf.name_scope('Absolute_CER_total_set'):
                setTotalChars = tf.placeholder(
                    tf.float32, name='setTotalChars')
                previousEDabs = tf.placeholder(
                    tf.float32, name='previousEDabs')
                errors = tf.edit_distance(
                    tf.cast(predictions[0], tf.int32),
                    targets,
                    normalize=False)
                EDabs = tf.reduce_sum(errors) / setTotalChars + previousEDabs
                if evalFLAG:
                    tf.summary.scalar('EDabs', EDabs)

            ED = [EDnorm, EDabs]

        saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=5,
            keep_checkpoint_every_n_hours=24)

        # Second saver restricted to the conv and BLSTM variables.
        global_vars = tf.GraphKeys.GLOBAL_VARIABLES
        transferred_vars = (
            tf.get_collection(global_vars, scope="BLSTM[12345]")
            + tf.get_collection(global_vars, scope="conv"))
        transfer_saver = tf.train.Saver(
            {var.op.name: var for var in transferred_vars})

        merged = tf.summary.merge_all()

    return (
        graph,
        [saver, transfer_saver],
        inputs,
        seq_len,
        targets,
        targets_len,
        learning_rate,
        n_batches,
        setTotalChars,
        previousEDabs,
        previousEDnorm,
        previousCost,
        optimizer,
        batch_cost,
        cost,
        errors,
        ED,
        predictions,
        merged,
    )
def main():
    """Train the text CNN on word2vec-initialised embeddings and return it.

    Steps: load the pretrained word2vec vectors, build the vocabulary and the
    train / validation sets from the text files, dump the vocabulary mappings
    under ``RESULTS_DIR``, initialise the model's embedding layer from the
    pretrained vectors, then run ``N_EPOCHS`` epochs of training and
    validation over full batches of ``BATCH_SIZE`` examples. The final
    weights are written to ``training.pt``.

    The loss and accuracy printed for each epoch are means over all batches
    of that epoch (NaN when a split has fewer than ``BATCH_SIZE`` examples).

    Returns:
        The trained model.
    """
    print("bonjour")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    N_EPOCHS = 8
    WIN_SIZES = [3, 4, 5]
    BATCH_SIZE = 64
    EMB_DIM = 300
    OUT_DIM = 1
    NUM_FIL = 100
    DROPOUT_PROB = 0.5

    # File paths
    W2V_PATH = 'GoogleNews-vectors-negative300.bin'
    TRAIN_X_PATH = 'train_x.txt'
    TRAIN_Y_PATH = 'train_y.txt'
    VALID_X_PATH = 'valid_x.txt'
    VALID_Y_PATH = 'valid_y.txt'

    # Load pretrained embeddings.
    # NOTE(review): `.wv` / `.vocab` on KeyedVectors is the gensim 3.x API;
    # gensim 4 replaced `vocab` with `key_to_index` -- confirm the pinned
    # gensim version before upgrading.
    pretrained_model = gensim.models.KeyedVectors.load_word2vec_format(
        W2V_PATH, binary=True)
    vocab = pretrained_model.wv.vocab.keys()
    w2v = pretrained_model.wv

    # Build dataset
    w2c = build_w2c(TRAIN_X_PATH, vocab=vocab)
    w2i, i2w = build_w2i(TRAIN_X_PATH, w2c, unk='unk')
    train_x, train_y = build_dataset(TRAIN_X_PATH, TRAIN_Y_PATH, w2i, unk='unk')
    valid_x, valid_y = build_dataset(VALID_X_PATH, VALID_Y_PATH, w2i, unk='unk')
    train_x, train_y = sort_data_by_length(train_x, train_y)
    valid_x, valid_y = sort_data_by_length(valid_x, valid_y)

    VOCAB_SIZE = len(w2i)
    print('VOCAB_SIZE:', VOCAB_SIZE)

    V_init = init_V(w2v, w2i)

    # Persist both vocabulary mappings next to the other results.
    with open(os.path.join(RESULTS_DIR, './w2i.dump'), 'wb') as f_w2i, \
            open(os.path.join(RESULTS_DIR, './i2w.dump'), 'wb') as f_i2w:
        pickle.dump(w2i, f_w2i)
        pickle.dump(i2w, f_i2w)

    # Build model
    model = CNN(VOCAB_SIZE, EMB_DIM, NUM_FIL, WIN_SIZES, OUT_DIM,
                DROPOUT_PROB, len(w2i))

    # Initialise the embedding layer from the pretrained vectors and zero
    # the row of the last vocabulary index.
    pretrained_embeddings = torch.tensor(V_init)
    model.embedding.weight.data.copy_(pretrained_embeddings)
    model.embedding.weight.data[len(w2i) - 1] = torch.zeros(EMB_DIM)

    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    # Train model
    for epoch in range(N_EPOCHS):
        start_time = time.time()

        train_loss, train_acc = _mean_batch_metrics(
            train, model, train_x, train_y, BATCH_SIZE, criterion, optimizer)
        valid_loss, valid_acc = _mean_batch_metrics(
            evaluate, model, valid_x, valid_y, BATCH_SIZE, criterion,
            optimizer)

        # The reported time covers both the training and validation passes.
        epoch_mins, epoch_secs = epoch_time(start_time, time.time())

        print(f'Epoch: {epoch} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

    torch.save(model.state_dict(), 'training.pt')
    return model


def _mean_batch_metrics(step_fn, model, xs, ys, batch_size, criterion,
                        optimizer):
    """Run ``step_fn`` on every full batch and return the mean (loss, acc).

    ``step_fn`` is called as ``step_fn(model, x_batch, y_batch, criterion,
    optimizer)`` and must return a ``(loss, acc)`` pair of scalars. Trailing
    examples that do not fill a whole batch are skipped. When there is no
    full batch at all, ``(nan, nan)`` is returned instead of raising.
    """
    n_batches = len(xs) // batch_size
    if n_batches == 0:
        return float('nan'), float('nan')

    loss_sum = 0.0
    acc_sum = 0.0
    for start in range(0, n_batches * batch_size, batch_size):
        end = start + batch_size
        loss, acc = step_fn(model, xs[start:end], ys[start:end], criterion,
                            optimizer)
        # float() accepts plain numbers as well as 0-dim tensors and keeps
        # the running totals from holding on to any autograd graph.
        loss_sum += float(loss)
        acc_sum += float(acc)
    return loss_sum / n_batches, acc_sum / n_batches