def load(self, sess, checkpoint_dir):
    """Interactively restore a checkpoint from checkpoint_dir into sess.

    Lists every checkpoint found, then asks the user to either type
    'start' (train from scratch) or the name of one of the listed
    checkpoints (resume training from it).

    Returns:
        True when a checkpoint was restored, False when training should
        start from scratch (no checkpoints on disk, or user chose 'start').

    Raises:
        ValueError: defensive guard; unreachable in practice because the
            prompt loop only exits on a valid answer.
    """
    ckpt_names = get_ckpt_list(checkpoint_dir)
    if not ckpt_names:
        # Nothing to resume from: fall back to training from scratch.
        print_("No checkpoints found in {}\n".format(checkpoint_dir), 'm')
        return False
    print_("Found checkpoints:\n", 'm')
    for name in ckpt_names:
        print(" {}".format(name))
    # Keep prompting until the answer is 'start' or one of the listed names.
    while True:
        choice = str(raw_input(
            'Start training from scratch (start) or resume training from a previous checkpoint (choose one of the above): '
        ))
        if choice == 'start' or choice in ckpt_names:
            break
        print("Answer should be 'start' or one of the following checkpoints: {}"
              .format(ckpt_names))
    if choice == 'start':
        return False
    if choice in ckpt_names:
        # Restore the selected intermediate checkpoint into the session.
        print_("Loading trained model...\n", 'm')
        self.saver.restore(sess, os.path.join(checkpoint_dir, choice))
        print_("...Checkpoint {} loaded\n".format(choice), 'm')
        return True
    raise ValueError("User input is neither 'start' nor a valid checkpoint")
def load(self, sess, checkpoint_dir):
    """Restore the checkpoint named self.checkpoint_name into sess.

    Raises:
        ValueError: when self.checkpoint_name is empty; the message
            suggests an available checkpoint from self.checkpoints_dir
            when one exists.
    """
    if self.checkpoint_name == '':
        # No checkpoint name supplied: report what is available, if anything.
        ckpt_names = get_ckpt_list(self.checkpoints_dir)
        if not ckpt_names:
            raise ValueError("No checkpoints found in {}".format(
                self.checkpoints_dir))
        raise ValueError(
            "Empty checkpoint name, try an available checkpoint in {} (ex: {})"
            .format(self.checkpoints_dir, ckpt_names[-1]))
    print_("Loading trained model checkpoint...\n", 'm')
    # Restore model weights from the requested checkpoint file.
    ckpt_path = os.path.join(checkpoint_dir, self.checkpoint_name)
    self.saver.restore(sess, ckpt_path)
    print_("...Checkpoint {} loaded\n".format(self.checkpoint_name), 'm')
def load_model(self):
    """Load and return the saved Keras model named self.checkpoint_name.

    Returns:
        The tf.keras model loaded from self.checkpoints_dir.

    Raises:
        ValueError: when self.checkpoint_name is empty; the message
            suggests an available saved model when one exists.
    """
    if self.checkpoint_name == '':
        # No model name supplied: report what is available, if anything.
        ckpt_names = get_saved_model_list(self.checkpoints_dir)
        if not ckpt_names:
            raise ValueError("No checkpoints found in {}".format(
                self.checkpoints_dir))
        raise ValueError(
            "Empty checkpoint name, try an available checkpoint in {} (ex: {})"
            .format(self.checkpoints_dir, ckpt_names[-1]))
    print_("Loading trained model checkpoint...\n", 'm')
    # Load the full model (architecture + weights) from disk.
    model_path = os.path.join(self.checkpoints_dir, self.checkpoint_name)
    model = tf.keras.models.load_model(model_path)
    print_("...Checkpoint {} loaded\n".format(self.checkpoint_name), 'm')
    return model
def train(self):
    """Train a MobileNet transfer-learning classifier on directory-organized data.

    Builds the model, wires up image generators for training/validation,
    attaches TensorBoard and checkpoint callbacks, then runs
    fit_generator for self.epoch epochs.
    """
    # Build model: MobileNet backbone with a new head sized to the number
    # of class labels (one output per subdirectory of the training data).
    self.model = util.model_builder.mobilenet_transfer(
        len(self.train_labels))
    # Configure the model for training
    self.model.compile(optimizer=tf.keras.optimizers.Adam(),
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    # Print current model layers
    # self.model.summary()
    # Set preprocessing function.
    # validation_split is 0.2 when no separate validation folder was found,
    # 0.0 otherwise (set in __init__) — presumably so the 'validation'
    # subset below is only meaningful in the first case; TODO confirm.
    datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        # scale pixels between -1 and 1, sample-wise
        preprocessing_function=tf.keras.applications.mobilenet.
        preprocess_input,
        validation_split=self.validation_split)
    # Get classification data
    train_generator = datagen.flow_from_directory(
        self.train_data_path,
        target_size=(224, 224),
        color_mode='rgb',
        batch_size=self.batch_size,
        class_mode='categorical',
        shuffle=True,
        subset='training')
    if self.has_val_data:
        validation_generator = datagen.flow_from_directory(
            self.val_data_path,
            target_size=(224, 224),
            color_mode='rgb',
            batch_size=self.batch_size,
            class_mode='categorical',
            shuffle=True)
    else:
        # Generate a split of the training data as validation data
        validation_generator = datagen.flow_from_directory(
            self.train_data_path,  # subset from training data path
            target_size=(224, 224),
            color_mode='rgb',
            batch_size=self.batch_size,
            class_mode='categorical',
            shuffle=True,
            subset='validation')
    # Callback for creating Tensorboard summary
    summary_name = "classif_data{}_bch{}_ep{}".format(
        len(self.train_gt_data_list), self.batch_size, self.epoch)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=os.path.join(self.summaries_dir, summary_name))
    # Callback for saving models periodically
    class_labels_save = '_'.join(self.train_labels) + '.'
    # 'acc' is the training accuracy and 'val_acc' is the validation set accuracy
    # NOTE(review): this reassigns self.ckpt_save_name in place, so calling
    # train() twice would prepend the labels/suffix twice — confirm train()
    # is only ever called once per instance.
    # NOTE(review): the 'val_acc'/'acc' metric keys and the 'period' argument
    # are TF1-era Keras names (newer Keras uses 'val_accuracy'/'accuracy'
    # and 'save_freq') — verify against the pinned TF version.
    self.ckpt_save_name = class_labels_save + self.ckpt_save_name + "-val_acc{val_acc:.2f}-acc{acc:.2f}-ep{epoch:04d}.h5"
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(self.checkpoints_dir, self.ckpt_save_name),
        save_weights_only=False,
        period=self.save_model_period,
        save_best_only=True,
        monitor='val_acc',
        mode='max')
    # Check if there are intermediate trained model to load
    # Uncomment following lines if you want to resume from a previous saved model
    # if not self.load_model():
    #     print_("Starting training from scratch\n", 'm')
    # Train the model
    fit_history = self.model.fit_generator(
        generator=train_generator,
        steps_per_epoch=train_generator.n // self.batch_size,
        validation_data=validation_generator,
        validation_steps=validation_generator.n // self.batch_size,
        epochs=self.epoch,
        callbacks=[checkpoint_callback, tensorboard_callback])
    print_("--------End of training--------\n", 'm')
def __init__(self, args):
    """Set up classifier-training hyperparameters, paths and dataset checks.

    Args:
        args: parsed command-line arguments providing learning_rate,
            batch_size and epoch.

    Raises:
        ValueError: when no training data is found, when the batch size
            exceeds the training or validation dataset size, or when the
            training and validation class labels (subdirectories) differ.
    """
    # Training hyperparameters
    self.learning_rate = args.learning_rate
    self.batch_size = args.batch_size
    self.epoch = args.epoch
    self.save_model_period = 1  # save model weights every N epochs
    # Training and validation dataset paths
    self.train_data_path = './data/train'
    self.val_data_path = './data/validation'
    # Where to save and load model weights (=checkpoints)
    self.checkpoints_dir = './checkpoints'
    if not os.path.exists(self.checkpoints_dir):
        os.makedirs(self.checkpoints_dir)
    self.ckpt_save_name = 'classTemplate'
    # Where to save tensorboard summaries
    self.summaries_dir = './summaries/'
    # Get training dataset as lists of image paths
    self.train_gt_data_list = get_filepaths_from_dir(self.train_data_path)
    # Fix: use == instead of 'is' — identity comparison against an int
    # literal is implementation-defined and a SyntaxWarning on Python 3.8+.
    if len(self.train_gt_data_list) == 0:
        raise ValueError("No training data found in folder {}".format(
            self.train_data_path))
    elif len(self.train_gt_data_list) < self.batch_size:
        raise ValueError(
            "Batch size must be smaller than the dataset (batch size = {}, number of training data = {})"
            .format(self.batch_size, len(self.train_gt_data_list)))
    # Get validation dataset if provided
    self.has_val_data = True
    self.val_gt_data_list = get_filepaths_from_dir(self.val_data_path)
    if len(self.val_gt_data_list) == 0:
        print(
            "No validation data found in {}, 20% of training data will be used as validation data"
            .format(self.val_data_path))
        self.has_val_data = False
        self.validation_split = 0.2
    elif len(self.val_gt_data_list) < self.batch_size:
        raise ValueError(
            "Batch size must be smaller than the dataset (batch size = {}, number of validation data = {})"
            .format(self.batch_size, len(self.val_gt_data_list)))
    else:
        print_(
            "Number of validation data: {}\n".format(
                len(self.val_gt_data_list)), 'm')
        self.validation_split = 0.0
    self.train_labels = get_labels_from_dir(self.train_data_path)
    # Check class labels are the same
    if self.has_val_data:
        self.val_labels = get_labels_from_dir(self.val_data_path)
        if self.train_labels != self.val_labels:
            # Report a count mismatch first, otherwise a name mismatch.
            if len(self.train_labels) != len(self.val_labels):
                raise ValueError(
                    "{} and {} should have the same number of subdirectories ({}!={})"
                    .format(self.train_data_path, self.val_data_path,
                            len(self.train_labels), len(self.val_labels)))
            raise ValueError(
                "{} and {} should have the same subdirectory label names ({}!={})"
                .format(self.train_data_path, self.val_data_path,
                        self.train_labels, self.val_labels))
    # Compute and print training hyperparameters
    self.batch_per_epoch = int(
        np.ceil(len(self.train_gt_data_list) / float(self.batch_size)))
    self.max_steps = int(self.epoch * (self.batch_per_epoch))
    print_(
        "Number of training data: {}\nNumber of batches per epoch: {} (batch size = {})\nNumber of training steps for {} epochs: {}\n"
        .format(len(self.train_gt_data_list), self.batch_per_epoch,
                self.batch_size, self.epoch, self.max_steps), 'm')
    print("Class labels: {}".format(self.train_labels))
def __init__(self, args):
    """Set up encoder-decoder training hyperparameters, paths and checks.

    Args:
        args: parsed command-line arguments providing learning_rate,
            batch_size and epoch.

    Raises:
        ValueError: when no training data is found, when input/groundtruth
            counts differ, or when the batch size exceeds a dataset size.
        TypeError: when train and validation data file formats differ
            (EXR vs non-EXR).
    """
    # Training hyperparameters
    self.learning_rate = args.learning_rate
    self.batch_size = args.batch_size
    self.epoch = args.epoch
    self.crop_size = 256
    self.n_levels = 3
    self.scale = 0.5
    self.channels = 3  # input / output channels
    # Training and validation dataset paths
    train_in_data_path = './data/train/input'
    train_gt_data_path = './data/train/groundtruth'
    val_in_data_path = './data/val/input'
    val_gt_data_path = './data/val/groundtruth'
    # Where to save and load model weights (=checkpoints)
    self.checkpoints_dir = './checkpoints'
    if not os.path.exists(self.checkpoints_dir):
        os.makedirs(self.checkpoints_dir)
    self.ckpt_save_name = 'trainingTemplateTF.model'
    # Where to save tensorboard summaries
    self.summaries_dir = './summaries/'
    # Get training dataset as lists of image paths
    self.train_in_data_list = get_filepaths_from_dir(train_in_data_path)
    self.train_gt_data_list = get_filepaths_from_dir(train_gt_data_path)
    # Fix: use == instead of 'is' — identity comparison against an int
    # literal is implementation-defined and a SyntaxWarning on Python 3.8+.
    if len(self.train_in_data_list) == 0 or len(
            self.train_gt_data_list) == 0:
        raise ValueError(
            "No training data found in folders {} or {}".format(
                train_in_data_path, train_gt_data_path))
    elif len(self.train_in_data_list) != len(self.train_gt_data_list):
        raise ValueError(
            "{} ({} data) and {} ({} data) should have the same number of input data"
            .format(train_in_data_path, len(self.train_in_data_list),
                    train_gt_data_path, len(self.train_gt_data_list)))
    elif len(self.train_in_data_list) < self.batch_size:
        raise ValueError(
            "Batch size must be smaller than the dataset (batch size = {}, number of training data = {})"
            .format(self.batch_size, len(self.train_in_data_list)))
    self.is_exr = is_exr(self.train_in_data_list[0])
    # Get validation dataset if provided
    self.has_val_data = True
    self.val_in_data_list = get_filepaths_from_dir(val_in_data_path)
    self.val_gt_data_list = get_filepaths_from_dir(val_gt_data_path)
    if len(self.val_in_data_list) == 0 or len(self.val_gt_data_list) == 0:
        print("No validation data found in {} or {}".format(
            val_in_data_path, val_gt_data_path))
        self.has_val_data = False
    elif len(self.val_in_data_list) != len(self.val_gt_data_list):
        raise ValueError(
            "{} ({} data) and {} ({} data) should have the same number of input data"
            .format(val_in_data_path, len(self.val_in_data_list),
                    val_gt_data_path, len(self.val_gt_data_list)))
    elif len(self.val_in_data_list) < self.batch_size:
        raise ValueError(
            "Batch size must be smaller than the dataset (batch size = {}, number of validation data = {})"
            .format(self.batch_size, len(self.val_in_data_list)))
    else:
        val_is_exr = is_exr(self.val_in_data_list[0])
        # Fix: the original '(a and not b) or (not a and b)' is a plain XOR.
        if val_is_exr != self.is_exr:
            raise TypeError(
                "Train and validation data should have the same file format"
            )
        print("Number of validation data: {}".format(
            len(self.val_in_data_list)))
    # Compute and print training hyperparameters
    batch_per_epoch = (len(self.train_in_data_list)) // self.batch_size
    self.max_steps = int(self.epoch * (batch_per_epoch))
    print_(
        "Number of training data: {}\nNumber of batches per epoch: {} (batch size = {})\nNumber of training steps for {} epochs: {}\n"
        .format(len(self.train_in_data_list), batch_per_epoch,
                self.batch_size, self.epoch, self.max_steps), 'm')
def train(self):
    """Build the TF1 graph and run the session training loop.

    Constructs the multi-scale encoder-decoder, a polynomial-decay
    learning-rate schedule and an Adam training op, then trains for
    self.max_steps steps, logging to TensorBoard and checkpointing
    periodically.
    """
    # Build model
    model = EncoderDecoder(self.n_levels, self.scale, self.channels)
    # Learning rate decay: polynomial from self.learning_rate down to 0
    # over self.max_steps steps.
    global_step = tf.Variable(initial_value=0,
                              dtype=tf.int32,
                              trainable=False)
    self.lr = tf.train.polynomial_decay(self.learning_rate,
                                        global_step,
                                        self.max_steps,
                                        end_learning_rate=0.0,
                                        power=0.3)
    tf.summary.scalar('learning_rate', self.lr)
    # Training operator
    adam = tf.train.AdamOptimizer(self.lr)
    # Get next data from preprocessed training dataset
    img_in, img_gt = self.get_data(self.train_in_data_list,
                                   self.train_gt_data_list,
                                   self.batch_size, self.epoch)
    tf.summary.image('img_in', im2uint8(img_in))
    tf.summary.image('img_gt', im2uint8(img_gt))
    print('img_in, img_gt', img_in.shape, img_gt.shape)
    # Compute image loss
    n_outputs = model(img_in, reuse=False)
    loss_op = self.loss(n_outputs, img_gt)
    # By default, adam uses the current graph trainable_variables to optimise training,
    # thus train_op should be the last operation of the graph for training.
    train_op = adam.minimize(loss_op, global_step)
    # Create session (allow_growth avoids grabbing all GPU memory upfront)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
        allow_growth=True)))
    # Initialise all the variables in current session
    init = tf.global_variables_initializer()
    sess.run(init)
    self.saver = tf.train.Saver(max_to_keep=100,
                                keep_checkpoint_every_n_hours=1)
    # Check if there are intermediate trained model to load
    if not self.load(sess, self.checkpoints_dir):
        print_("Starting training from scratch\n", 'm')
    # Tensorboard summary
    summary_op = tf.summary.merge_all()
    summary_name = "data{}_bch{}_ep{}".format(len(self.train_in_data_list),
                                              self.batch_size, self.epoch)
    summary_writer = tf.summary.FileWriter(self.summaries_dir +
                                           summary_name,
                                           graph=sess.graph,
                                           flush_secs=30)
    # Compute loss on validation dataset to check overfitting
    if self.has_val_data:
        val_loss_op = self.validate(model)
        # Save validation loss to tensorboard
        val_summary_op = tf.summary.scalar('val_loss', val_loss_op)
        # Compute initial loss
        val_loss, val_summary = sess.run([val_loss_op, val_summary_op])
        summary_writer.add_summary(val_summary, global_step=0)
        print(
            "Initial Loss on validation dataset: {:.4f}".format(val_loss))
    # NOTE(review): resuming restores global_step from the checkpoint, so
    # the loop presumably continues from the saved step — confirm the saver
    # includes global_step in its variable list.
    for step in xrange(sess.run(global_step), self.max_steps):
        start_time = time.time()
        val_str = ''
        if step % 50 == 0 or step == self.max_steps - 1:
            # Train model and record summaries (every 50 steps only, to
            # keep the summary overhead off the common path)
            _, loss_total, summary = sess.run(
                [train_op, loss_op, summary_op])
            summary_writer.add_summary(summary, global_step=step)
            duration = time.time() - start_time
            if self.has_val_data and step != 0:
                # Compute validation loss
                val_loss, val_summary = sess.run(
                    [val_loss_op, val_summary_op])
                summary_writer.add_summary(val_summary, global_step=step)
                val_str = ', val loss: {:.4f}'.format(val_loss)
        else:
            # Train only
            _, loss_total = sess.run([train_op, loss_op])
            duration = time.time() - start_time
        assert not np.isnan(loss_total), 'Model diverged with loss = NaN'
        if step % 10 == 0 or step == self.max_steps - 1:
            # Periodic console progress report with throughput figures
            examples_per_sec = self.batch_size / duration
            sec_per_batch = float(duration)
            format_str = (
                '{}: step {}, loss: {:.4f} ({:.1f} data/s; {:.3f} s/bch)'.
                format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), step,
                       loss_total, examples_per_sec, sec_per_batch))
            print(format_str + val_str)
        if step % 1000 == 0 or step == self.max_steps - 1:
            # Save current model in a checkpoint
            self.save(sess, self.checkpoints_dir, step)
    print_("--------End of training--------\n", 'm')
    # Free all resources associated with the session
    sess.close()