Example 1
 def load(self, sess, checkpoint_dir):
     ckpt_names = get_ckpt_list(checkpoint_dir)
     if not ckpt_names:  # list is empty
         print_("No checkpoints found in {}\n".format(checkpoint_dir), 'm')
         return False
     else:
         print_("Found checkpoints:\n", 'm')
         for name in ckpt_names:
             print("    {}".format(name))
         # Ask the user whether to start training from scratch or resume from a specific checkpoint
         while True:
             mode = str(
                 raw_input(
                     'Start training from scratch (start) or resume training from a previous checkpoint (choose one of the above): '
                 ))
             if mode == 'start' or mode in ckpt_names:
                 break
             else:
                 print(
                     "Answer should be 'start' or one of the following checkpoints: {}"
                     .format(ckpt_names))
                 continue
         if mode == 'start':
             return False
         elif mode in ckpt_names:
             # Try to load given intermediate checkpoint
             print_("Loading trained model...\n", 'm')
             self.saver.restore(sess, os.path.join(checkpoint_dir, mode))
             print_("...Checkpoint {} loaded\n".format(mode), 'm')
             return True
         else:
             raise ValueError(
                 "User input is neither 'start' nor a valid checkpoint")
Example 2
 def load(self, sess, checkpoint_dir):
     # Check if empty or invalid checkpoint name
     if self.checkpoint_name == '':
         ckpt_names = get_ckpt_list(self.checkpoints_dir)
         if not ckpt_names:
             raise ValueError("No checkpoints found in {}".format(
                 self.checkpoints_dir))
         else:
             raise ValueError(
                 "Empty checkpoint name, try an available checkpoint in {} (ex: {})"
                 .format(self.checkpoints_dir, ckpt_names[-1]))
     print_("Loading trained model checkpoint...\n", 'm')
     # Load from given checkpoint file name
     self.saver.restore(sess,
                        os.path.join(checkpoint_dir, self.checkpoint_name))
     print_("...Checkpoint {} loaded\n".format(self.checkpoint_name), 'm')
Example 3
 def load_model(self):
     # Check if empty or invalid checkpoint name
     if self.checkpoint_name == '':
         ckpt_names = get_saved_model_list(self.checkpoints_dir)
         if not ckpt_names:
             raise ValueError("No checkpoints found in {}".format(
                 self.checkpoints_dir))
         else:
             raise ValueError(
                 "Empty checkpoint name, try an available checkpoint in {} (ex: {})"
                 .format(self.checkpoints_dir, ckpt_names[-1]))
     print_("Loading trained model checkpoint...\n", 'm')
     # Load from given checkpoint file name
     model = tf.keras.models.load_model(
         os.path.join(self.checkpoints_dir, self.checkpoint_name))
     print_("...Checkpoint {} loaded\n".format(self.checkpoint_name), 'm')
     return model
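Example 3 returns a full tf.keras model. A short usage sketch for single-image inference, assuming the same 224x224 MobileNet preprocessing used at training time ('image.jpg' and the variable names are placeholders, not from the source):

    # Hypothetical inference sketch using the model returned by load_model().
    import numpy as np
    import tensorflow as tf

    img = tf.keras.preprocessing.image.load_img('image.jpg', target_size=(224, 224))
    x = tf.keras.applications.mobilenet.preprocess_input(
        tf.keras.preprocessing.image.img_to_array(img))
    probs = model.predict(np.expand_dims(x, axis=0))[0]
    print("Predicted class index: {}".format(np.argmax(probs)))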
Example 4
    def train(self):
        # Build model
        self.model = util.model_builder.mobilenet_transfer(
            len(self.train_labels))
        # Configure the model for training
        self.model.compile(optimizer=tf.keras.optimizers.Adam(),
                           loss='categorical_crossentropy',
                           metrics=['accuracy'])
        # Print current model layers
        # self.model.summary()

        # Set preprocessing function
        datagen = tf.keras.preprocessing.image.ImageDataGenerator(
            # scale pixels between -1 and 1, sample-wise
            preprocessing_function=tf.keras.applications.mobilenet.
            preprocess_input,
            validation_split=self.validation_split)
        # Get classification data
        train_generator = datagen.flow_from_directory(
            self.train_data_path,
            target_size=(224, 224),
            color_mode='rgb',
            batch_size=self.batch_size,
            class_mode='categorical',
            shuffle=True,
            subset='training')
        if self.has_val_data:
            validation_generator = datagen.flow_from_directory(
                self.val_data_path,
                target_size=(224, 224),
                color_mode='rgb',
                batch_size=self.batch_size,
                class_mode='categorical',
                shuffle=True)
        else:  # Generate a split of the training data as validation data
            validation_generator = datagen.flow_from_directory(
                self.train_data_path,  # subset from training data path
                target_size=(224, 224),
                color_mode='rgb',
                batch_size=self.batch_size,
                class_mode='categorical',
                shuffle=True,
                subset='validation')

        # Callback for creating Tensorboard summary
        summary_name = "classif_data{}_bch{}_ep{}".format(
            len(self.train_gt_data_list), self.batch_size, self.epoch)
        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            log_dir=os.path.join(self.summaries_dir, summary_name))
        # Callback for saving models periodically
        class_labels_save = '_'.join(self.train_labels) + '.'
        # 'acc' is the training accuracy and 'val_acc' is the validation set accuracy
        self.ckpt_save_name = class_labels_save + self.ckpt_save_name + "-val_acc{val_acc:.2f}-acc{acc:.2f}-ep{epoch:04d}.h5"
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(self.checkpoints_dir, self.ckpt_save_name),
            save_weights_only=False,
            period=self.save_model_period,
            save_best_only=True,
            monitor='val_acc',
            mode='max')

        # Check if there is an intermediate trained model to load
        # Uncomment the following lines to resume from a previously saved model
        # if not self.load_model():
        #     print_("Starting training from scratch\n", 'm')

        # Train the model
        fit_history = self.model.fit_generator(
            generator=train_generator,
            steps_per_epoch=train_generator.n // self.batch_size,
            validation_data=validation_generator,
            validation_steps=validation_generator.n // self.batch_size,
            epochs=self.epoch,
            callbacks=[checkpoint_callback, tensorboard_callback])

        print_("--------End of training--------\n", 'm')
Example 5
    def __init__(self, args):
        # Training hyperparameters
        self.learning_rate = args.learning_rate
        self.batch_size = args.batch_size
        self.epoch = args.epoch
        self.save_model_period = 1  # save model weights every N epochs
        # Training and validation dataset paths
        self.train_data_path = './data/train'
        self.val_data_path = './data/validation'
        # Where to save and load model weights (=checkpoints)
        self.checkpoints_dir = './checkpoints'
        if not os.path.exists(self.checkpoints_dir):
            os.makedirs(self.checkpoints_dir)
        self.ckpt_save_name = 'classTemplate'
        # Where to save tensorboard summaries
        self.summaries_dir = './summaries/'

        # Get training dataset as lists of image paths
        self.train_gt_data_list = get_filepaths_from_dir(self.train_data_path)
        if len(self.train_gt_data_list) == 0:
            raise ValueError("No training data found in folder {}".format(
                self.train_data_path))
        elif (len(self.train_gt_data_list) < self.batch_size):
            raise ValueError(
                "Batch size must be smaller than the dataset (batch size = {}, number of training data = {})"
                .format(self.batch_size, len(self.train_gt_data_list)))

        # Get validation dataset if provided
        self.has_val_data = True
        self.val_gt_data_list = get_filepaths_from_dir(self.val_data_path)
        if len(self.val_gt_data_list) == 0:
            print(
                "No validation data found in {}, 20% of training data will be used as validation data"
                .format(self.val_data_path))
            self.has_val_data = False
            self.validation_split = 0.2
        elif (len(self.val_gt_data_list) < self.batch_size):
            raise ValueError(
                "Batch size must be smaller than the dataset (batch size = {}, number of validation data = {})"
                .format(self.batch_size, len(self.val_gt_data_list)))
        else:
            print_(
                "Number of validation data: {}\n".format(
                    len(self.val_gt_data_list)), 'm')
            self.validation_split = 0.0

        self.train_labels = get_labels_from_dir(self.train_data_path)
        # Check that training and validation class labels match
        if self.has_val_data:
            self.val_labels = get_labels_from_dir(self.val_data_path)
            if self.train_labels != self.val_labels:
                if len(self.train_labels) != len(self.val_labels):
                    raise ValueError(
                        "{} and {} should have the same number of subdirectories ({}!={})"
                        .format(self.train_data_path, self.val_data_path,
                                len(self.train_labels), len(self.val_labels)))
                raise ValueError(
                    "{} and {} should have the same subdirectory label names ({}!={})"
                    .format(self.train_data_path, self.val_data_path,
                            self.train_labels, self.val_labels))

        # Compute and print training hyperparameters
        self.batch_per_epoch = int(
            np.ceil(len(self.train_gt_data_list) / float(self.batch_size)))
        self.max_steps = int(self.epoch * (self.batch_per_epoch))
        print_(
            "Number of training data: {}\nNumber of batches per epoch: {} (batch size = {})\nNumber of training steps for {} epochs: {}\n"
            .format(len(self.train_gt_data_list), self.batch_per_epoch,
                    self.batch_size, self.epoch, self.max_steps), 'm')
        print("Class labels: {}".format(self.train_labels))
Example 6
    def __init__(self, args):
        # Training hyperparameters
        self.learning_rate = args.learning_rate
        self.batch_size = args.batch_size
        self.epoch = args.epoch
        self.crop_size = 256
        self.n_levels = 3
        self.scale = 0.5
        self.channels = 3  # input / output channels
        # Training and validation dataset paths
        train_in_data_path = './data/train/input'
        train_gt_data_path = './data/train/groundtruth'
        val_in_data_path = './data/val/input'
        val_gt_data_path = './data/val/groundtruth'
        # Where to save and load model weights (=checkpoints)
        self.checkpoints_dir = './checkpoints'
        if not os.path.exists(self.checkpoints_dir):
            os.makedirs(self.checkpoints_dir)
        self.ckpt_save_name = 'trainingTemplateTF.model'
        # Where to save tensorboard summaries
        self.summaries_dir = './summaries/'

        # Get training dataset as lists of image paths
        self.train_in_data_list = get_filepaths_from_dir(train_in_data_path)
        self.train_gt_data_list = get_filepaths_from_dir(train_gt_data_path)
        if len(self.train_in_data_list) == 0 or len(
                self.train_gt_data_list) == 0:
            raise ValueError(
                "No training data found in folders {} or {}".format(
                    train_in_data_path, train_gt_data_path))
        elif len(self.train_in_data_list) != len(self.train_gt_data_list):
            raise ValueError(
                "{} ({} data) and {} ({} data) should have the same number of input data"
                .format(train_in_data_path, len(self.train_in_data_list),
                        train_gt_data_path, len(self.train_gt_data_list)))
        elif (len(self.train_in_data_list) < self.batch_size):
            raise ValueError(
                "Batch size must be smaller than the dataset (batch size = {}, number of training data = {})"
                .format(self.batch_size, len(self.train_in_data_list)))
        self.is_exr = is_exr(self.train_in_data_list[0])

        # Get validation dataset if provided
        self.has_val_data = True
        self.val_in_data_list = get_filepaths_from_dir(val_in_data_path)
        self.val_gt_data_list = get_filepaths_from_dir(val_gt_data_path)
        if len(self.val_in_data_list) == 0 or len(self.val_gt_data_list) == 0:
            print("No validation data found in {} or {}".format(
                val_in_data_path, val_gt_data_path))
            self.has_val_data = False
        elif len(self.val_in_data_list) != len(self.val_gt_data_list):
            raise ValueError(
                "{} ({} data) and {} ({} data) should have the same number of input data"
                .format(val_in_data_path, len(self.val_in_data_list),
                        val_gt_data_path, len(self.val_gt_data_list)))
        elif (len(self.val_in_data_list) < self.batch_size):
            raise ValueError(
                "Batch size must be smaller than the dataset (batch size = {}, number of validation data = {})"
                .format(self.batch_size, len(self.val_in_data_list)))
        else:
            val_is_exr = is_exr(self.val_in_data_list[0])
            if val_is_exr != self.is_exr:
                raise TypeError(
                    "Train and validation data should have the same file format"
                )
            print("Number of validation data: {}".format(
                len(self.val_in_data_list)))

        # Compute and print training hyperparameters
        batch_per_epoch = (len(self.train_in_data_list)) // self.batch_size
        self.max_steps = int(self.epoch * (batch_per_epoch))
        print_(
            "Number of training data: {}\nNumber of batches per epoch: {} (batch size = {})\nNumber of training steps for {} epochs: {}\n"
            .format(len(self.train_in_data_list), batch_per_epoch,
                    self.batch_size, self.epoch, self.max_steps), 'm')
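is_exr is another assumed helper; presumably it just inspects the file extension:

    # Hypothetical sketch of the assumed is_exr helper.
    import os

    def is_exr(filepath):
        # True if the file looks like an OpenEXR image, judging by its extension.
        return os.path.splitext(filepath)[1].lower() == '.exr'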
Example 7
    def train(self):
        # Build model
        model = EncoderDecoder(self.n_levels, self.scale, self.channels)

        # Learning rate decay
        global_step = tf.Variable(initial_value=0,
                                  dtype=tf.int32,
                                  trainable=False)
        self.lr = tf.train.polynomial_decay(self.learning_rate,
                                            global_step,
                                            self.max_steps,
                                            end_learning_rate=0.0,
                                            power=0.3)
        tf.summary.scalar('learning_rate', self.lr)
        # Training operator
        adam = tf.train.AdamOptimizer(self.lr)

        # Get next data from preprocessed training dataset
        img_in, img_gt = self.get_data(self.train_in_data_list,
                                       self.train_gt_data_list,
                                       self.batch_size, self.epoch)
        tf.summary.image('img_in', im2uint8(img_in))
        tf.summary.image('img_gt', im2uint8(img_gt))
        print('img_in, img_gt', img_in.shape, img_gt.shape)
        # Compute image loss
        n_outputs = model(img_in, reuse=False)
        loss_op = self.loss(n_outputs, img_gt)
        # By default, Adam minimises over all trainable_variables in the current graph,
        # so train_op should be created only after the full model graph has been built.
        train_op = adam.minimize(loss_op, global_step)

        # Create session
        sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True)))
        # Initialise all the variables in current session
        init = tf.global_variables_initializer()
        sess.run(init)
        self.saver = tf.train.Saver(max_to_keep=100,
                                    keep_checkpoint_every_n_hours=1)

        # Check if there is an intermediate trained model to load
        if not self.load(sess, self.checkpoints_dir):
            print_("Starting training from scratch\n", 'm')

        # Tensorboard summary
        summary_op = tf.summary.merge_all()
        summary_name = "data{}_bch{}_ep{}".format(len(self.train_in_data_list),
                                                  self.batch_size, self.epoch)
        summary_writer = tf.summary.FileWriter(self.summaries_dir +
                                               summary_name,
                                               graph=sess.graph,
                                               flush_secs=30)

        # Compute loss on validation dataset to check overfitting
        if self.has_val_data:
            val_loss_op = self.validate(model)
            # Save validation loss to tensorboard
            val_summary_op = tf.summary.scalar('val_loss', val_loss_op)
            # Compute initial loss
            val_loss, val_summary = sess.run([val_loss_op, val_summary_op])
            summary_writer.add_summary(val_summary, global_step=0)
            print(
                "Initial Loss on validation dataset: {:.4f}".format(val_loss))

        for step in xrange(sess.run(global_step), self.max_steps):
            start_time = time.time()
            val_str = ''
            if step % 50 == 0 or step == self.max_steps - 1:
                # Train model and record summaries
                _, loss_total, summary = sess.run(
                    [train_op, loss_op, summary_op])
                summary_writer.add_summary(summary, global_step=step)
                duration = time.time() - start_time
                if self.has_val_data and step != 0:
                    # Compute validation loss
                    val_loss, val_summary = sess.run(
                        [val_loss_op, val_summary_op])
                    summary_writer.add_summary(val_summary, global_step=step)
                    val_str = ', val loss: {:.4f}'.format(val_loss)
            else:  # Train only
                _, loss_total = sess.run([train_op, loss_op])
                duration = time.time() - start_time
            assert not np.isnan(loss_total), 'Model diverged with loss = NaN'

            if step % 10 == 0 or step == self.max_steps - 1:
                examples_per_sec = self.batch_size / duration
                sec_per_batch = float(duration)
                format_str = (
                    '{}: step {}, loss: {:.4f} ({:.1f} data/s; {:.3f} s/bch)'.
                    format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), step,
                           loss_total, examples_per_sec, sec_per_batch))
                print(format_str + val_str)

            if step % 1000 == 0 or step == self.max_steps - 1:
                # Save current model in a checkpoint
                self.save(sess, self.checkpoints_dir, step)

        print_("--------End of training--------\n", 'm')
        # Free all resources associated with the session
        sess.close()
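Example 7 also relies on self.save(...) and im2uint8(...), which are not shown. Minimal sketches, assuming save delegates to the tf.train.Saver created in train() and that images are floats in the [0, 1] range (both assumptions):

    # Hypothetical sketch of the assumed save method (TF 1.x Saver API).
    def save(self, sess, checkpoint_dir, step):
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        # Writes '<ckpt_save_name>-<step>' checkpoint files into checkpoint_dir.
        self.saver.save(sess,
                        os.path.join(checkpoint_dir, self.ckpt_save_name),
                        global_step=step)

    # Hypothetical sketch of im2uint8, assuming float images in [0, 1].
    def im2uint8(x):
        return tf.cast(tf.clip_by_value(x, 0.0, 1.0) * 255.0, tf.uint8)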