        max_word_length, 0.015, 0.5)

print('Start training...')
print('Train size = %d' % len(train_x))
print('Val size = %d' % len(val_x))
print('Test size = %d' % len(test_x))
print('Num classes = %d' % num_classes)

start_epoch = 1
max_epoch = 100

saver = tf.train.Saver()
best_saver = BestCheckpointSaver(save_dir='checkpoints/best',
                                 num_to_keep=1,
                                 maximize=True)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(message)s',
    datefmt='%m-%d %H:%M',
    handlers=[logging.FileHandler('logs/train.log'),
              logging.StreamHandler()])

latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir='checkpoints')
if latest_checkpoint:
        max_word_length, 0.015, 0.5)

print('Start training...')
print('Train size = %d' % len(train_x))
print('Val size = %d' % len(val_x))
print('Test size = %d' % len(test_x))
print('Num classes = %d' % num_classes)

start_epoch = 1
max_epoch = 1000

saver = tf.train.Saver()
best_saver = BestCheckpointSaver(save_dir='checkpoints/best',
                                 num_to_keep=1,
                                 maximize=True)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

best_checkpoint = best_checkpoint('checkpoints/best/', True)
sess.run(tf.tables_initializer())
saver.restore(sess, best_checkpoint)

train_feeder = LSTMCNNCRFeeder(train_x, train_chars, train_la,
                               max_seq_length, max_word_length, 16)
val_feeder = LSTMCNNCRFeeder(val_x, val_chars, val_la,
                             max_seq_length, max_word_length, 16)
test_feeder = LSTMCNNCRFeeder(test_x, test_chars, test_la, max_seq_length,
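# The fragment above restores whatever weights BestCheckpointSaver kept under
# 'checkpoints/best/' before building the evaluation feeders. Below is a minimal
# sketch (not part of the original snippets) of just that restore step, assuming
# the best_checkpoint(dir, maximize) helper called above; its import is not shown
# in these snippets, so it is passed in as a parameter rather than imported.
import tensorflow as tf

def restore_best(sess, saver, best_checkpoint_fn, best_dir='checkpoints/best/'):
    """Load the weights of the best checkpoint kept by BestCheckpointSaver."""
    ckpt_path = best_checkpoint_fn(best_dir, True)   # path of the best checkpoint
    sess.run(tf.tables_initializer())                # init lookup tables first, as above
    saver.restore(sess, ckpt_path)
    return ckpt_path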
def main(_):
    # Specify GPU
    if FLAGS.gpu_index:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_index

    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)

    tf.reset_default_graph()

    X = tf.placeholder(tf.float32, shape=[None, FLAGS.img_size, FLAGS.img_size, 3], name="X")
    GT = tf.placeholder(tf.float32, shape=[None, FLAGS.label_size, FLAGS.label_size, 1], name="GT")
    mode = tf.placeholder(tf.bool, name="mode")  # training or not

    if FLAGS.use_64_channel:
        pred = Unet_64_1024(X, mode, FLAGS)
    else:
        pred = Unet_32_512(X, mode, FLAGS)

    tf.add_to_collection("inputs", X)
    tf.add_to_collection("inputs", mode)
    tf.add_to_collection("outputs", pred)

    tf.summary.histogram("Predicted Mask", pred)
    tf.summary.image("Predicted Mask", pred)

    # IOU is
    #     (the area of intersection)
    #     --------------------------
    #     (the area of the two boxes)
    iou_op = IOU(pred, GT)
    loss = -iou_op
    tf.summary.scalar("loss", loss)

    # Update moving mean and moving variance for BatchNorm (train/inference)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        # another optimizer could be used here
        train_op = tf.train.MomentumOptimizer(0.001, 0.99).minimize(loss)

    global_step = tf.train.get_or_create_global_step()
    increment_global_step = tf.assign(global_step, global_step + 1)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    summary_op = tf.summary.merge_all()
    train_summary_writer = tf.summary.FileWriter(FLAGS.logdir + '/train', sess.graph)
    val_summary_writer = tf.summary.FileWriter(FLAGS.logdir + '/validation')

    # Checkpoint savers
    saver = tf.train.Saver()
    if FLAGS.best_train_dir:
        best_ckpt_saver = BestCheckpointSaver(title='unet.ckpt',
                                              save_dir=FLAGS.best_train_dir,
                                              num_to_keep=3,
                                              maximize=True)

    start_epoch = 1
    epoch_from_ckpt = 0
    if FLAGS.ckpt_path:
        saver.restore(sess, FLAGS.ckpt_path)
        tmp = FLAGS.ckpt_path
        tmp = tmp.split('-')
        tmp.reverse()
        epoch_from_ckpt = int(tmp[0])
        start_epoch = epoch_from_ckpt + 1

    if epoch_from_ckpt != FLAGS.epochs + 1:
        tf.logging.info('Training from epoch: %d ', start_epoch)

    # Save the graph definition as a Protocol Buffer text file (pbtxt)
    tf.train.write_graph(sess.graph_def, FLAGS.train_dir, 'unet.pbtxt', as_text=True)

    ############################
    # Get data
    ############################
    raw = Data(FLAGS.data_dir, FLAGS.validation_percentage)
    tr_data = DataLoader(raw.data_dir, raw.get_data('training'),
                         FLAGS.img_size, FLAGS.label_size, FLAGS.batch_size)
    val_data = DataLoader(raw.data_dir, raw.get_data('validation'),
                          FLAGS.img_size, FLAGS.label_size, FLAGS.batch_size)

    iterator = tf.data.Iterator.from_structure(tr_data.dataset.output_types,
                                               tr_data.dataset.output_shapes)
    next_batch = iterator.get_next()

    # Ops for initializing the two different iterators
    tr_init_op = iterator.make_initializer(tr_data.dataset)
    val_init_op = iterator.make_initializer(val_data.dataset)

    tr_batches_per_epoch = int(tr_data.data_size / FLAGS.batch_size)
    if tr_data.data_size % FLAGS.batch_size > 0:
        tr_batches_per_epoch += 1
    val_batches_per_epoch = int(val_data.data_size / FLAGS.batch_size)
    if val_data.data_size % FLAGS.batch_size > 0:
        val_batches_per_epoch += 1

    ############################
    # Training
    ############################
    print("{} Training start ...".format(datetime.datetime.now()))
    for epoch in range(start_epoch, FLAGS.epochs + 1):
        print('{} Training epoch-{} start >> '.format(datetime.datetime.now(), epoch))

        sess.run(tr_init_op)
        for step in range(tr_batches_per_epoch):
            X_train, y_train = sess.run(next_batch)
            train_summary, accuracy, _, _ = \
                sess.run([summary_op, iou_op, train_op, increment_global_step],
                         feed_dict={X: X_train, GT: y_train, mode: True})
            train_summary_writer.add_summary(
                train_summary, (epoch - start_epoch) * tr_batches_per_epoch + step)
            tf.logging.info('epoch #%d, step #%d/%d, accuracy(iou) %.5f%%' %
                            (epoch, step, tr_batches_per_epoch, accuracy * 100))

        print("{} Validation start ... ".format(datetime.datetime.now()))
        total_val_accuracy = 0
        val_count = 0
        sess.run(val_init_op)
        for n in range(val_batches_per_epoch):
            X_val, y_val = sess.run(next_batch)
            val_summary, val_accuracy = \
                sess.run([summary_op, iou_op],
                         feed_dict={X: X_val, GT: y_val, mode: False})
            # total_val_accuracy += val_step_iou * X_val.shape[0]
            total_val_accuracy += val_accuracy
            val_count += 1
            val_summary_writer.add_summary(
                val_summary, (epoch - start_epoch) * val_batches_per_epoch + n)
            tf.logging.info('step #%d/%d, accuracy(iou) %.5f%%' %
                            (n, val_batches_per_epoch, val_accuracy * 100))

        total_val_accuracy /= val_count
        tf.logging.info(
            'epoch %d: Validation accuracy = %.2f%% (N=%d)' %
            (epoch, total_val_accuracy * 100, raw.get_size('validation')))

        # Save a checkpoint every epoch
        checkpoint_path = os.path.join(FLAGS.train_dir, 'unet.ckpt')
        tf.logging.info('Saving to "%s-%d"', checkpoint_path, epoch)
        saver.save(sess, checkpoint_path, global_step=epoch)

        # Keep the best checkpoint (by mean validation IOU)
        if FLAGS.best_train_dir:
            best_ckpt_saver.handle(total_val_accuracy, sess, global_step, epoch)
def train(self, data, *args, **kwargs):
    if not os.path.isfile(
            kwargs.get("parsedDumpPath", '../dev/parsedDataDump.pkl')):
        self.data_converter(data, *args, **kwargs)

    with open(kwargs.get("parsedDumpPath", '../dev/parsedDataDump.pkl'),
              'rb') as fp:
        train_set, val_set, test_set, dicts = pickle.load(fp)

    w2idx, la2idx = dicts['words2idx'], dicts['labels2idx']
    idx2w = {w2idx[k]: k for k in w2idx}
    idx2la = {la2idx[k]: k for k in la2idx}

    train_x, train_chars, train_la = train_set
    val_x, val_chars, val_la = val_set
    test_x, test_chars, test_la = test_set

    self.log.debug('Loading elmo!')
    elmo_batcher = Batcher(kwargs.get("vocabPath", '../dev/vocab.txt'), 50)
    elmo_bilm = BidirectionalLanguageModel(
        kwargs.get(
            "elmoOptionsFile",
            '../resources/elmo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
        ),
        kwargs.get(
            "elmoWeightFile",
            '../resources/elmo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
        ))

    self.log.debug('Loading model!')
    num_classes = len(la2idx.keys())
    max_seq_length = max(
        max(map(len, train_x)),
        max(map(len, test_x)),
    )
    max_word_length = max(
        max([len(ssc) for sc in train_chars for ssc in sc]),
        max([len(ssc) for sc in test_chars for ssc in sc]))

    model = ElmoModel(
        True,
        kwargs.get("wordEmbeddingSize", 50),  # Word embedding size
        kwargs.get("charEmbeddingSize", 16),  # Character embedding size
        kwargs.get("LSTMStateSize", 200),     # LSTM state size
        kwargs.get("filterNum", 128),         # Filter num
        kwargs.get("filterSize", 3),          # Filter size
        num_classes,
        max_seq_length,
        max_word_length,
        kwargs.get("learningRate", 0.015),
        kwargs.get("dropoutRate", 0.5),
        elmo_bilm,
        1,  # elmo_mode
        elmo_batcher,
        **kwargs)

    self.log.debug('Start training...')
    self.log.debug('Train size = %d' % len(train_x))
    self.log.debug('Val size = %d' % len(val_x))
    self.log.debug('Test size = %d' % len(test_x))
    self.log.debug('Num classes = %d' % num_classes)

    start_epoch = 1
    max_epoch = kwargs.get("maxEpoch", 100)
    self.log.debug('Epoch = %d' % max_epoch)

    saver = tf.train.Saver()
    best_saver = BestCheckpointSaver(
        save_dir=kwargs.get("bestCheckpointPath", "../results/checkpoints/best"),
        num_to_keep=1,
        maximize=True)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    latest_checkpoint = tf.train.latest_checkpoint(
        checkpoint_dir=kwargs.get("checkpointPath", "../results/checkpoints"))
    if latest_checkpoint:
        saver.restore(sess, latest_checkpoint)
    else:
        sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())

    train_feeder = LSTMCNNCRFeeder(train_x, train_chars, train_la,
                                   max_seq_length, max_word_length,
                                   kwargs.get("epochWidth", 16))
    val_feeder = LSTMCNNCRFeeder(val_x, val_chars, val_la,
                                 max_seq_length, max_word_length,
                                 kwargs.get("epochWidth", 16))

    for epoch in range(start_epoch, max_epoch + 1):
        loss = 0
        for step in range(train_feeder.step_per_epoch):
            tokens, chars, labels = train_feeder.feed()
            step_loss = model.train_step(sess, tokens, chars, labels)
            loss += step_loss
            self.log.debug(
                'epoch: %d, size: %d/%d, step_loss: %f, epoch_loss: %f',
                epoch, train_feeder.offset, train_feeder.size, step_loss, loss)

        preds = []
        for step in range(val_feeder.step_per_epoch):
            tokens, chars, labels = val_feeder.feed()
            pred = model.test(sess, tokens, chars)
            preds.extend(pred)

        true_seqs = [idx2la[la] for sl in val_la for la in sl]
        pred_seqs = [idx2la[la] for sl in preds for la in sl]
        ll = min(len(true_seqs), len(pred_seqs))
        self.log.debug(true_seqs[:ll])
        self.log.debug(pred_seqs[:ll])

        prec, rec, f1 = evaluate(true_seqs[:ll], pred_seqs[:ll], False)
        self.log.debug("Epoch: %d, val_p: %f, val_r: %f, val_f1: %f",
                       epoch, prec, rec, f1)
        val_feeder.next_epoch(False)

        saver.save(sess,
                   kwargs.get("checkpointPath", "../results/checkpoints") + '/model.ckpt',
                   global_step=epoch)
        best_saver.handle(f1, sess, epoch)

        logging.info('')
        train_feeder.next_epoch()

    self.log.debug("Training done! ... Saving trained model")
    return model, sess, saver
def train(self, Data, n_epochs, l_bs, u_bs, lr, eval_samps=None,
          binarize=False, verbose=1):
    """ Method for training the models """
    self.data_init(Data, eval_samps, l_bs, u_bs)
    self.lr = self.set_learning_rate(lr)

    # define optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
    gvs = optimizer.compute_gradients(self.loss)
    # clip gradients
    capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        self.optimizer = optimizer.apply_gradients(
            capped_gvs, global_step=self.global_step)

    self.y_pred = self.predict(self.x)
    self.curve_array = np.zeros((n_epochs + 1, 14))

    if self.learning_paradigm == 'unsupervised':
        self.elbo_l_curve = tf.reduce_mean(self.unlabelled_loss(self.x))
        self.qy_ll_curve = tf.reduce_mean(self.qy_loss(self.x))
        self.elbo_u_curve = tf.reduce_mean(self.unlabelled_loss(self.x))
    else:
        self.elbo_l_curve = tf.reduce_mean(self.labelled_loss(self.x, self.y))
        self.qy_ll_curve = tf.reduce_mean(self.qy_loss(self.x, self.y))
        self.elbo_u_curve = tf.reduce_mean(self.unlabelled_loss(self.x))

    self.compute_accuracies()

    # initialize session and train
    epoch = 0
    with self.session as sess:
        sess.run(tf.global_variables_initializer())
        self.curve_array[epoch] = self.calc_curve_vals(sess, Data)
        saver = BestCheckpointSaver(save_dir=self.ckpt_dir,
                                    num_to_keep=5,
                                    maximize=True)

        while epoch < n_epochs:
            x_labelled, labels, x_unlabelled, _ = Data.next_batch(l_bs, u_bs)
            if binarize is True:
                x_labelled = self.binarize(x_labelled)
                x_unlabelled = self.binarize(x_unlabelled)
            fd = self.training_fd(x_labelled, labels, x_unlabelled)
            _, loss_batch = sess.run([self.optimizer, self.loss], fd)

            if Data._epochs_unlabelled > epoch:
                self.curve_array[epoch + 1] = self.calc_curve_vals(sess, Data)
                saver.handle(self.curve_array[epoch, 6], sess, self.global_step)
                epoch += 1

                if verbose == 1:
                    fd = self._printing_feed_dict(Data, x_labelled, x_unlabelled,
                                                  labels, eval_samps, binarize)
                    self.print_verbose1(epoch, fd, sess)
                elif verbose == 2:
                    fd = self._printing_feed_dict(Data, x_labelled, x_unlabelled,
                                                  labels, eval_samps, binarize)
                    self.print_verbose2(epoch, fd, sess)
                elif verbose == 3:
                    self.print_verbose3(epoch)

                y_pred_test = sess.run([self.y_pred],
                                       {self.x: Data.data['x_test'],
                                        K.learning_phase(): 0})[0]
                conf_mat = confusion_matrix(Data.data['y_test'].argmax(1),
                                            y_pred_test.argmax(1))
                np.save(os.path.join(self.output_dir,
                                     'conf_mat_' + self.name + '_' + str(epoch) + '.npy'),
                        conf_mat)
                np.save(os.path.join(self.output_dir,
                                     'y_pred' + self.name + '_' + str(epoch) + '.npy'),
                        y_pred_test)
                np.save(os.path.join(self.output_dir,
                                     'y_true' + self.name + '_' + str(epoch) + '.npy'),
                        Data.data['y_test'])

    return self.curve_array
def train(self, Data, n_epochs, l_bs, u_bs, lr, eval_samps=None,
          binarize=False, verbose=1, decay_ratio=0.75, decay_period=200,
          h_opt=False, keep_ckpt=True, restore=False):
    """ Method for training the models """
    self.data_init(Data, eval_samps, l_bs, u_bs)
    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    # self.global_epoch = tf.Variable(0, trainable=False, name='global_epoch')
    self.epoch = 0
    # self.lr = self.set_learning_rate([lr[0], 1600, lr[0] / 10.0])
    self.lr = self.set_learning_rate(
        [lr[0], lr[0] / 10.0, decay_period, decay_ratio], 'exp')

    # define optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
    gvs = optimizer.compute_gradients(self.loss)
    # clip gradients
    capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        self.optimizer = optimizer.apply_gradients(
            capped_gvs, global_step=self.global_step)

    self.y_pred = self.predict(self.x)
    self.curve_array = np.zeros((n_epochs + 1, 14))

    if self.learning_paradigm == 'unsupervised':
        self.elbo_l_curve = tf.reduce_mean(self.unlabelled_loss(self.x))
        self.qy_ll_curve = tf.reduce_mean(self.qy_loss(self.x))
        self.elbo_u_curve = tf.reduce_mean(self.unlabelled_loss(self.x))
    else:
        if self.model_name == 'adgm' or self.model_name == 'adg_dgm':
            self.elbo_l_curve = tf.reduce_mean(
                self.labelled_loss(self.x, self.y)[0])
            self.qy_ll_curve = tf.reduce_mean(
                self.labelled_loss(self.x, self.y)[1])
        else:
            self.elbo_l_curve = tf.reduce_mean(
                self.labelled_loss(self.x, self.y))
            self.qy_ll_curve = tf.reduce_mean(self.qy_loss(self.x, self.y))
        self.elbo_u_curve = tf.reduce_mean(self.unlabelled_loss(self.x))

    self.compute_accuracies()

    # initialize session and train
    with self.session as sess:
        sess.run(tf.global_variables_initializer())

        if restore == True:
            saver_for_restore = tf.train.Saver()
            ckpt = tf.train.get_checkpoint_state(self.ckpt_dir)
            best_ckpt = get_best_checkpoint(self.ckpt_dir)
            best_epoch = int(re.match('.*?([0-9]+)$', best_ckpt).group(1))
            best_ckpt_usable = re.sub('-([0-9]+)$', "", best_ckpt)
            saver_for_restore.restore(sess, best_ckpt_usable)
            self.epoch = best_epoch
            self.curve_array[self.epoch] = self.calc_curve_vals(sess, Data)
            if verbose == 3:
                self.print_verbose3(self.epoch)

        if keep_ckpt == True:
            saver = BestCheckpointSaver(save_dir=self.ckpt_dir,
                                        num_to_keep=2,
                                        maximize=True)

        while self.epoch < n_epochs:
            x_labelled, labels, x_unlabelled, _ = Data.next_batch(l_bs, u_bs)
            if binarize is True:
                x_labelled = self.binarize(x_labelled)
                x_unlabelled = self.binarize(x_unlabelled)
            fd = self.training_fd(x_labelled, labels, x_unlabelled)
            _, loss_batch = sess.run([self.optimizer, self.loss], fd)

            if Data._epochs_unlabelled > self.epoch:
                self.epoch += 1
                # sess.run(self.global_epoch.assign(self.epoch))
                self.curve_array[self.epoch] = self.calc_curve_vals(sess, Data)

                if h_opt == True and self.epoch > 20:
                    if self.curve_array[self.epoch, 12] < 0.07:
                        raise Exception('results too bad')
                if h_opt == True and self.epoch > 40:
                    if self.curve_array[self.epoch, 12] < 0.1:
                        raise Exception('results too bad')

                if keep_ckpt == True:
                    saver.handle(self.curve_array[self.epoch, 6], sess,
                                 self.global_step, self.epoch)

                if verbose == 1:
                    fd = self._printing_feed_dict(Data, x_labelled, x_unlabelled,
                                                  labels, eval_samps, binarize)
                    self.print_verbose1(self.epoch, fd, sess)
                elif verbose == 2:
                    fd = self._printing_feed_dict(Data, x_labelled, x_unlabelled,
                                                  labels, eval_samps, binarize)
                    self.print_verbose2(self.epoch, fd, sess)
                elif verbose == 3:
                    self.print_verbose3(self.epoch)

                if self.epoch % 10 == 0:
                    y_pred_test = sess.run([self.y_pred], {
                        self.x: Data.data['x_test'],
                        K.learning_phase(): 0
                    })[0]
                    conf_mat = confusion_matrix(Data.data['y_test'].argmax(1),
                                                y_pred_test.argmax(1))
                    np.save(
                        os.path.join(
                            self.output_dir,
                            'conf_mat_' + self.name + '_' + str(self.epoch) + '.npy'),
                        conf_mat)
                    np.save(
                        os.path.join(
                            self.output_dir,
                            'y_pred_' + self.name + '_' + str(self.epoch) + '.npy'),
                        y_pred_test)
                    np.save(
                        os.path.join(
                            self.output_dir,
                            'y_true_' + self.name + '_' + str(self.epoch) + '.npy'),
                        Data.data['y_test'])

                if np.sum(np.isnan(self.curve_array)) > 0:
                    print('loss is nan, going back to previous best checkpoint')
                    best_ckpt = get_best_checkpoint(self.ckpt_dir)
                    best_epoch = int(re.match('.*?([0-9]+)$', best_ckpt).group(1))
                    best_ckpt_usable = re.sub('-([0-9]+)$', "", best_ckpt)
                    self.epoch = best_epoch
                    saver._saver.restore(sess, best_ckpt_usable)

    return self.curve_array
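# The snippets above recover the epoch number from a checkpoint path in two ways:
# splitting FLAGS.ckpt_path on '-' (U-Net example) and matching the trailing digits
# of get_best_checkpoint's result with a regex (last example). Below is a small
# standalone helper (not from the original code) covering the same idea; the only
# assumption is the usual '<name>.ckpt-<number>' path format written by tf.train.Saver.
import re

def epoch_from_ckpt_path(ckpt_path):
    """Return the integer suffix of a checkpoint path such as 'unet.ckpt-17'."""
    match = re.match(r'.*?([0-9]+)$', ckpt_path)
    if match is None:
        raise ValueError('no trailing epoch/step number in %r' % ckpt_path)
    return int(match.group(1))

# e.g. epoch_from_ckpt_path('checkpoints/unet.ckpt-17') -> 17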