Exemple #1
0
def quick_test():
    """Smoke-test the HRNet forward pass and JointsMSELoss on a dummy batch.

    Builds a constant (4, 512, 512, 3) input, runs one evaluation forward
    pass, computes the MSE loss against an all-ones target, and evaluates
    the loss on CPU. Intended for manual sanity checking only.
    """
    # Renamed from `input` to avoid shadowing the Python builtin.
    inputs = tf.ones((4, 512, 512, 3))  # dimensions should be equal
    model = HRNet(FLAGS.net_cfg)
    output = model.forward_eval(inputs)
    print(output)
    # Target heatmaps are at 1/4 of the input resolution with the same
    # channel count as the network output.
    target = tf.ones((4, 512 // 4, 512 // 4, output.get_shape()[3]))
    loss, _ = JointsMSELoss()(output, target)
    print(loss)
    # NOTE(review): tf.device here only scopes ops created inside the block;
    # the graph is already built, so placement is effectively unchanged.
    with tf.Session() as sess:
        with tf.device("/cpu:0"):
            sess.run(loss)
Exemple #2
0
def full_test():
    """Run HRNet + JointsMSELoss over a small COCO subset and print the loss."""
    net_cfg = config.load_net_cfg_from_file(FLAGS.net_cfg)
    dataset = coco_keypoints_dataset(net_cfg, "./data/coco/", FLAGS.test_path,
                                     False)
    # Take a 10-sample subset and pull one batch from the iterator.
    batch_images, batch_labels = dataset.build(subset=10).get_next()
    hrnet = HRNet(FLAGS.net_cfg)
    predictions = hrnet.forward_eval(batch_images)
    print(predictions)
    print(batch_labels)
    mse_loss, _ = JointsMSELoss()(predictions, batch_labels)
    # Evaluate the loss tensor on CPU and report its concrete value.
    with tf.Session() as session, tf.device("/cpu:0"):
        real_loss = session.run(mse_loss)
        print(real_loss)
Exemple #3
0
    def __init__(self, data_path, netcfg):
        """Set up the ILSVRC-12 datasets, the HRNet model, and train settings.

        Args:
        * data_path: directory holding the ILSVRC-12 data
        * netcfg: network configuration used to construct HRNet
        """
        self.data_scope = 'DATA'
        self.model_scope = 'HRNET'

        # build the network and the train/eval input pipelines
        self.hrnet = HRNet(netcfg)
        self.dataset_train = Ilsvrc12Dataset(is_train=True, data_dir=data_path)
        self.dataset_eval = Ilsvrc12Dataset(is_train=False, data_dir=data_path)

        # schedule / checkpoint / logging parameters from the net config
        common_cfg = self.hrnet.cfg['COMMON']
        self.lr_init = common_cfg['lr_rate_init']
        self.summ_step = common_cfg['summary_step']
        self.save_step = common_cfg['save_step']
        self.model_path = './models'
        self.log_path = './logs'
        self.nb_iters_start = 0
Exemple #4
0
    def __init__(self, netcfg):
        """Set up HRNet, the COCO keypoint datasets, and training settings.

        Args:
        * netcfg: network configuration used to construct HRNet
        """
        self.data_scope = 'DATA'
        self.model_scope = 'HRNET'

        # The network must be built first: its parsed config drives the
        # dataset construction below.
        self.hrnet = HRNet(netcfg)
        cfg = self.hrnet.cfg
        self.dataset_train = coco_keypoints_dataset(cfg, FLAGS.data_path,
                                                    FLAGS.train_path, True)
        self.dataset_eval = coco_keypoints_dataset(cfg, FLAGS.data_path,
                                                   FLAGS.test_path, False)

        # schedule / checkpoint / logging parameters
        self.lr_init = cfg['COMMON']['lr_rate_init']
        self.summ_step = cfg['COMMON']['summary_step']
        self.save_step = cfg['COMMON']['save_step']
        self.model_path = './models'
        self.log_path = './logs'
        self.nb_iters_start = 0
Exemple #5
0
class Trainer():
    """Trainer for HRNet on the ILSVRC-12 (ImageNet) classification dataset.

    Builds separate train/eval graphs and sessions, trains with momentum SGD
    (optionally distributed via `mgw`), and periodically saves checkpoints
    and evaluates them.
    """

    def __init__(self, data_path, netcfg):
        """Initialize datasets, the HRNet model, and training settings.

        Args:
        * data_path: directory holding the ILSVRC-12 data
        * netcfg: network configuration used to construct HRNet
        """
        self.data_scope = 'DATA'
        self.model_scope = 'HRNET'

        # initialize training & evaluation subsets
        self.dataset_train = Ilsvrc12Dataset(is_train=True, data_dir=data_path)
        self.dataset_eval = Ilsvrc12Dataset(is_train=False, data_dir=data_path)

        # initialize network
        self.hrnet = HRNet(netcfg)

        # learning rate
        self.lr_init = self.hrnet.cfg['COMMON']['lr_rate_init']
        self.model_path = './models'
        self.log_path = './logs'
        self.summ_step = self.hrnet.cfg['COMMON']['summary_step']
        self.save_step = self.hrnet.cfg['COMMON']['save_step']
        self.nb_iters_start = 0

    def build_graph(self, is_train):
        """Build the training or evaluation graph in a fresh tf.Graph.

        Creates the session, input pipeline, forward pass, loss/metrics, and
        (for training) the optimizer, summary, init, and saver ops. Results
        are stored on `self` (`sess_train`/`train_op`/... or
        `sess_eval`/`eval_op`/...).

        Args:
        * is_train: True to build the training graph; False for evaluation
        """
        with tf.Graph().as_default():
            # TensorFlow session
            config = tf.ConfigProto()
            config.gpu_options.visible_device_list = str(
                mgw.local_rank() if FLAGS.enbl_multi_gpu else 0)  # pylint: disable=no-member
            sess = tf.Session(config=config)

            # data input pipeline
            with tf.variable_scope(self.data_scope):
                iterator = self.dataset_train.build(
                ) if is_train else self.dataset_eval.build()
                images, labels = iterator.get_next()
                if not isinstance(images, dict):
                    tf.add_to_collection('images_final', images)
                else:
                    tf.add_to_collection('images_final', images['image'])

            # model definition - primary model
            with tf.variable_scope(self.model_scope):
                # forward pass
                logits = self.hrnet.forward_train(
                    images) if is_train else self.hrnet.forward_eval(images)
                if not isinstance(logits, dict):
                    tf.add_to_collection('logits_final', logits)
                else:
                    for value in logits.values():
                        tf.add_to_collection('logits_final', value)

                # loss & extra evaluation metrics
                loss, metrics = self.hrnet.calc_loss(labels, logits,
                                                     self.trainable_vars)

                tf.summary.scalar('loss', loss)
                for key, value in metrics.items():
                    tf.summary.scalar(key, value)

                # optimizer & gradients
                if is_train:
                    self.global_step = tf.train.get_or_create_global_step()
                    lrn_rate, self.nb_iters_train = self.setup_lrn_rate(
                        self.global_step)

                    optimizer = tf.train.MomentumOptimizer(
                        lrn_rate, self.hrnet.cfg['COMMON']['momentum'])
                    if FLAGS.enbl_multi_gpu:
                        optimizer = mgw.DistributedOptimizer(optimizer)
                    grads = optimizer.compute_gradients(
                        loss, self.trainable_vars)

            # TF operations & model saver
            if is_train:
                self.sess_train = sess

                # run batch-norm (and other) update ops before each step
                with tf.control_dependencies(self.update_ops):
                    self.train_op = optimizer.apply_gradients(
                        grads, global_step=self.global_step)

                self.summary_op = tf.summary.merge_all()
                self.sm_writer = tf.summary.FileWriter(logdir=self.log_path)
                self.log_op = [lrn_rate, loss] + list(metrics.values())
                self.log_op_names = ['lr', 'loss'] + list(metrics.keys())
                self.init_op = tf.variables_initializer(self.vars)
                if FLAGS.enbl_multi_gpu:
                    self.bcast_op = mgw.broadcast_global_variables(0)
                self.saver_train = tf.train.Saver(self.vars)
            else:
                self.sess_eval = sess
                self.eval_op = [loss] + list(metrics.values())
                self.eval_op_names = ['loss'] + list(metrics.keys())
                self.saver_eval = tf.train.Saver(self.vars)

    def train(self):
        """Train a model and periodically produce checkpoint files."""

        # initialization
        self.sess_train.run(self.init_op)

        tf.summary.FileWriter(self.model_path, self.sess_train.graph)

        # optionally resume from the latest checkpoint, restoring the
        # iteration counter from the checkpoint filename
        if FLAGS.resume_training:
            save_path = tf.train.latest_checkpoint(
                os.path.dirname(self.model_path + '/model.ckpt'))
            self.saver_train.restore(self.sess_train, save_path)
            self.nb_iters_start = get_global_step_from_ckpt(save_path)

        # broadcast initial weights so all workers start identically
        if FLAGS.enbl_multi_gpu:
            self.sess_train.run(self.bcast_op)

        # train the model through iterations and periodically save & evaluate the model
        time_prev = timer()
        for idx_iter in range(self.nb_iters_start, self.nb_iters_train):
            # train the model
            if (idx_iter + 1) % self.summ_step != 0:
                self.sess_train.run(self.train_op)
            else:
                __, summary, log_rslt = self.sess_train.run(
                    [self.train_op, self.summary_op, self.log_op])
                if self.is_primary_worker('global'):
                    time_step = timer() - time_prev
                    self.__monitor_progress(summary, self.summ_step, log_rslt,
                                            idx_iter, time_step)
                    time_prev = timer()

            # save and eval the model at certain steps
            if self.is_primary_worker('global') and (idx_iter +
                                                     1) % self.save_step == 0:
                # save model
                self.saver_train.save(self.sess_train,
                                      os.path.join(self.model_path,
                                                   'model.ckpt'),
                                      global_step=self.global_step)
                self.eval()

        # save the final model
        if self.is_primary_worker('global'):
            # save model
            self.saver_train.save(self.sess_train,
                                  os.path.join(self.model_path, 'model.ckpt'),
                                  global_step=self.global_step)
            self.eval()

    def eval(self):
        """Evaluate the latest checkpoint and log the mean of each metric."""
        # restore model first
        ckpt_path = self.__restore_model(self.saver_eval, self.sess_eval)
        tf.logging.info('restore from %s' % (ckpt_path))

        # eval
        nb_iters = int(
            np.ceil(float(FLAGS.nb_smpls_eval) / FLAGS.batch_size_eval))
        eval_rslts = np.zeros((nb_iters, len(self.eval_op)))

        for idx_iter in range(nb_iters):
            eval_rslts[idx_iter] = self.sess_eval.run(self.eval_op)

        for idx, name in enumerate(self.eval_op_names):
            tf.logging.info('%s = %.4e' % (name, np.mean(eval_rslts[:, idx])))

    def __restore_model(self, saver, session):
        """Restore from the latest checkpoint under `model_path`; return its path."""
        ckpt_path = tf.train.latest_checkpoint(self.model_path)
        saver.restore(session, ckpt_path)
        return ckpt_path

    def setup_lrn_rate(self, global_step):
        """Setup the learning rate (and number of training iterations)."""

        # piecewise-constant schedule: decay at epochs 30/60/90 over 100 epochs
        nb_epochs = 100
        idxs_epoch = [30, 60, 90]
        decay_rates = [1.0, 0.1, 0.01, 0.001]
        nb_epochs_rat = 1.0
        # effective global batch size scales with the number of workers
        batch_size = FLAGS.batch_size * (1 if not FLAGS.enbl_multi_gpu else
                                         mgw.size())
        lrn_rate = setup_lrn_rate_piecewise_constant(global_step, self.lr_init,
                                                     batch_size, idxs_epoch,
                                                     decay_rates)
        nb_iters = int(FLAGS.nb_smpls_train * nb_epochs * nb_epochs_rat /
                       batch_size)

        return lrn_rate, nb_iters

    def __monitor_progress(self, summary, summ_step, log_rslt, idx_iter,
                           time_step):
        """Monitor the training progress.

        Args:
        * summary: summary protocol buffer
        * summ_step: step to write summary
        * log_rslt: logging operations' results
        * idx_iter: index of the training iteration
        * time_step: time step between two summary operations
        """

        # write summaries for TensorBoard visualization
        self.sm_writer.add_summary(summary, idx_iter)

        # compute the training speed
        speed = FLAGS.batch_size * summ_step / time_step
        if FLAGS.enbl_multi_gpu:
            speed *= mgw.size()

        # display monitored statistics
        log_str = ' | '.join([
            '%s = %.4e' % (name, value)
            for name, value in zip(self.log_op_names, log_rslt)
        ])
        tf.logging.info('iter #%d: %s | speed = %.2f pics / sec' %
                        (idx_iter + 1, log_str, speed))

    def auto_barrier(self):
        """Automatically insert a barrier for multi-GPU training, or pass for single-GPU training."""

        # NOTE(review): self.mpi_comm is never assigned in this class —
        # presumably set elsewhere (e.g. a base class or caller); verify.
        auto_barrier_impl(self.mpi_comm)

    @classmethod
    def is_primary_worker(cls, scope='global'):
        """Check whether is the primary worker of all nodes (global) or the current node (local).

        Args:
        * scope: check scope ('global' OR 'local')

        Returns:
        * flag: whether is the primary worker
        """

        return is_primary_worker_impl(scope)

    @property
    def vars(self):
        """List of all global variables."""
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                 scope=self.model_scope)

    @property
    def trainable_vars(self):
        """List of all trainable variables."""
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=self.model_scope)

    @property
    def update_ops(self):
        """List of all update operations."""
        return tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                 scope=self.model_scope)
Exemple #6
0
class Trainer():
    """Trainer for HRNet keypoint estimation on the COCO dataset.

    Builds separate train/eval graphs and sessions, trains with momentum SGD
    (optionally distributed via `mgw`), supports resuming with or without the
    head weights, and evaluates with JointsMSELoss plus the COCO `validate`
    metric (AP).
    """

    def __init__(self, netcfg):
        """Initialize HRNet, the COCO keypoint datasets, and train settings.

        Args:
        * netcfg: network configuration used to construct HRNet
        """
        self.data_scope = 'DATA'
        self.model_scope = 'HRNET'
        # initialize network
        self.hrnet = HRNet(netcfg)
        # initialize training & evaluation subsets
        self.dataset_train = coco_keypoints_dataset(self.hrnet.cfg,
                                                    FLAGS.data_path,
                                                    FLAGS.train_path, True)
        self.dataset_eval = coco_keypoints_dataset(self.hrnet.cfg,
                                                   FLAGS.data_path,
                                                   FLAGS.test_path, False)

        # learning rate
        self.lr_init = self.hrnet.cfg['COMMON']['lr_rate_init']
        self.model_path = './models'
        self.log_path = './logs'
        self.summ_step = self.hrnet.cfg['COMMON']['summary_step']
        self.save_step = self.hrnet.cfg['COMMON']['save_step']
        self.nb_iters_start = 0

    def build_graph(self, is_train):
        """Build the training or evaluation graph in a fresh tf.Graph.

        Creates the session, input pipeline (images, labels, ids), forward
        pass, JointsMSELoss, and — for training — the optimizer, summary,
        init, and saver ops (with and without the head). Results are stored
        on `self` (`sess_train`/`train_op`/... or `sess_eval`/`eval_op`/...).

        Args:
        * is_train: True to build the training graph; False for evaluation
        """
        with tf.Graph().as_default():
            # TensorFlow session
            config = tf.ConfigProto()
            config.gpu_options.visible_device_list = str(
                mgw.local_rank() if FLAGS.enbl_multi_gpu else 0)  # pylint: disable=no-member
            sess = tf.Session(config=config)

            # data input pipeline
            with tf.variable_scope(self.data_scope):
                iterator = self.dataset_train.build(
                ) if is_train else self.dataset_eval.build()
                # ids identify the samples for COCO evaluation
                images, labels, ids = iterator.get_next()
                if not isinstance(images, dict):
                    tf.add_to_collection('images_final', images)
                else:
                    tf.add_to_collection('images_final', images['image'])

            # model definition - primary model
            with tf.variable_scope(self.model_scope):
                # forward pass
                logits = self.hrnet.forward_train(
                    images) if is_train else self.hrnet.forward_eval(images)
                if not isinstance(logits, dict):
                    tf.add_to_collection('logits_final', logits)
                else:
                    for value in logits.values():
                        tf.add_to_collection('logits_final', value)

                # loss & extra evaluation metrics
                loss, metrics = JointsMSELoss()(logits, labels)
                tf.summary.scalar('loss', loss)
                for key, value in metrics.items():
                    tf.summary.scalar(key, value)

                # optimizer & gradients
                if is_train:
                    self.global_step = tf.train.get_or_create_global_step()
                    lrn_rate, self.nb_iters_train = self.setup_lrn_rate(
                        self.global_step)
                    optimizer = tf.train.MomentumOptimizer(
                        lrn_rate, self.hrnet.cfg['COMMON']['momentum'])
                    if FLAGS.enbl_multi_gpu:
                        optimizer = mgw.DistributedOptimizer(optimizer)
                    grads = optimizer.compute_gradients(
                        loss, self.trainable_vars)

            # TF operations & model saver
            if is_train:
                self.sess_train = sess
                # run batch-norm (and other) update ops before each step
                with tf.control_dependencies(self.update_ops):
                    steps = optimizer.apply_gradients(
                        grads, global_step=self.global_step)
                    self.train_op = steps
                self.summary_op = tf.summary.merge_all()
                self.sm_writer = tf.summary.FileWriter(logdir=self.log_path)
                self.log_op = [lrn_rate, loss] + list(metrics.values())
                self.log_op_names = ['lr', 'loss'] + list(metrics.keys())
                self.init_op = tf.variables_initializer(self.vars)
                if FLAGS.enbl_multi_gpu:
                    self.bcast_op = mgw.broadcast_global_variables(0)
                # two savers: one excluding the head, for transfer learning
                self.saver_train_wo_head = self._create_saver(
                    discard_head=True)
                self.saver_train = self._create_saver()
            else:
                self.sess_eval = sess
                self.eval_op = [logits, labels, ids, loss]
                self.eval_op_names = ['logits', 'labels', 'ids', 'loss']
                self.saver_eval = self._create_saver()
            print("Graph build finished.")

    def train(self):
        """Train a model and periodically produce checkpoint files."""

        # initialization
        self.sess_train.run(self.init_op)
        if FLAGS.resume_training:
            print("Model path: " + self.model_path + '/model.ckpt')
            save_path = tf.train.latest_checkpoint(
                os.path.dirname(self.model_path + '/model.ckpt'))
            if FLAGS.load_head_weights:
                # check if saver will restore only some layers
                self.saver_train.restore(self.sess_train, save_path)
            else:
                self.saver_train_wo_head.restore(
                    self.sess_train, save_path
                )  # restore only weights from other layers except head.
            self.nb_iters_start = get_global_step_from_ckpt(save_path)

        # broadcast initial weights so all workers start identically
        if FLAGS.enbl_multi_gpu:
            self.sess_train.run(self.bcast_op)

        # train the model through iterations and periodically save & evaluate the model
        # one iteration corresponds with a single batch run (# epochs * # batches)
        time_prev = timer()
        for idx_iter in range(self.nb_iters_start, self.nb_iters_train):
            # train the model
            if (idx_iter + 1) % self.summ_step != 0:
                self.sess_train.run(self.train_op)
            else:
                __, summary, log_rslt = self.sess_train.run(
                    [self.train_op, self.summary_op, self.log_op])
                if self.is_primary_worker('global'):
                    time_step = timer() - time_prev
                    self.__monitor_progress(summary, self.summ_step, log_rslt,
                                            idx_iter, time_step)
                    time_prev = timer()

            # save and eval the model at certain steps (at the last iteration too)
            if self.is_primary_worker('global') and \
                    (((idx_iter + 1) % self.save_step == 0) or (idx_iter + 1 == self.nb_iters_train)):
                # NOTE(review): last_performance is assigned but never read —
                # possibly intended for best-checkpoint tracking (see TODO in
                # _save_and_eval).
                last_performance = self._save_and_eval(saver=self.saver_train,
                                                       sess=self.sess_train)

    def eval(self):  # TODO: Pass writer_dict for tensorboard when training
        """Evaluate the latest checkpoint; log mean loss and AP, return AP."""
        ckpt_path = self.__restore_model(self.saver_eval, self.sess_eval)
        tf.logging.info('restore from %s' % (ckpt_path))
        print("Starting evaluation process")
        # eval
        nb_iters = int(
            np.ceil(float(self.dataset_eval.num_images) / FLAGS.batch_size))
        eval_rslts = np.zeros((nb_iters, 1))
        all_logits = []
        all_targets = []
        all_ids = []
        # collect per-batch outputs/targets/ids for the COCO evaluator
        for idx_iter in range(nb_iters):
            logits, labels, ids, loss = self.sess_eval.run(self.eval_op)
            eval_rslts[idx_iter] = loss
            all_logits.append([x for x in logits])
            all_targets.append([x for x in labels])
            all_ids.append([x for x in ids])

        name_values, perf_indicator = validate(self.hrnet.cfg,
                                               self.dataset_eval,
                                               outputs=all_logits,
                                               targets=all_targets,
                                               ids=all_ids,
                                               output_dir=self.log_path,
                                               writer_dict=None)

        # TODO: Extend in case of more metrics added beyond loss
        tf.logging.info('%s = %.4e' % ('loss', np.mean(eval_rslts)))
        tf.logging.info('%s = %.4e' % ('AP', perf_indicator))

        return perf_indicator

    def __restore_model(self, saver, session):
        """Restore from the latest checkpoint under `model_path`; return its path."""
        ckpt_path = tf.train.latest_checkpoint(self.model_path)
        saver.restore(session, ckpt_path)
        return ckpt_path

    def setup_lrn_rate(self, global_step):
        """Setup the learning rate (and number of training iterations)."""

        # piecewise-constant schedule: decay at epochs 170/200 over 210 epochs
        nb_epochs = 210  # TODO: move all this to the configuration file
        idxs_epoch = [170, 200]
        decay_rates = [1.0, 0.1, 0.01]  # decay from original
        nb_epochs_rat = 1.0
        # effective global batch size scales with the number of workers
        batch_size = FLAGS.batch_size * (1 if not FLAGS.enbl_multi_gpu else
                                         mgw.size())
        lrn_rate = setup_lrn_rate_piecewise_constant(
            global_step, self.lr_init, batch_size, idxs_epoch, decay_rates,
            self.dataset_train.num_images)
        nb_iters = int(self.dataset_train.num_images * nb_epochs *
                       nb_epochs_rat / batch_size)

        return lrn_rate, nb_iters

    def __monitor_progress(self, summary, summ_step, log_rslt, idx_iter,
                           time_step):
        """Monitor the training progress.

        Args:
        * summary: summary protocol buffer
        * summ_step: step to write summary
        * log_rslt: logging operations' results
        * idx_iter: index of the training iteration
        * time_step: time step between two summary operations
        """

        # write summaries for TensorBoard visualization
        self.sm_writer.add_summary(summary, idx_iter)

        # compute the training speed
        speed = FLAGS.batch_size * summ_step / time_step
        if FLAGS.enbl_multi_gpu:
            speed *= mgw.size()

        # display monitored statistics
        log_str = ' | '.join([
            '%s = %.4e' % (name, value)
            for name, value in zip(self.log_op_names, log_rslt)
        ])
        tf.logging.info('iter #%d: %s | speed = %.2f pics / sec' %
                        (idx_iter + 1, log_str, speed))

    def auto_barrier(self):
        """Automatically insert a barrier for multi-GPU training, or pass for single-GPU training."""

        # NOTE(review): self.mpi_comm is never assigned in this class —
        # presumably set elsewhere (e.g. a base class or caller); verify.
        auto_barrier_impl(self.mpi_comm)

    @classmethod
    def is_primary_worker(cls, scope='global'):
        """Check whether is the primary worker of all nodes (global) or the current node (local).

        Args:
        * scope: check scope ('global' OR 'local')

        Returns:
        * flag: whether is the primary worker
        """

        return is_primary_worker_impl(scope)

    @property
    def vars(self):
        """List of all global variables."""
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                 scope=self.model_scope)

    @property
    def trainable_vars(self):
        """List of all trainable variables."""
        trainable = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=self.model_scope)
        # optionally freeze everything except the head (transfer learning)
        if FLAGS.freeze_first:
            trainable = [x for x in trainable if 'HEAD' in x.name]
        return trainable

    @property
    def update_ops(self):
        """List of all update operations."""
        return tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                 scope=self.model_scope)

    def _create_saver(self, discard_head=False):
        """Create a Saver over model vars, optionally excluding 'HEAD' vars."""
        vars_to_restore = [x for x in self.vars if 'HEAD' not in x.name
                           ] if discard_head else self.vars
        return tf.train.Saver(vars_to_restore)

    def _save_and_eval(
            self, saver,
            sess):  # TODO: save only if performance is better than before
        """Save a checkpoint, then evaluate it; return the AP indicator."""
        saver.save(
            sess,
            os.path.join(
                self.model_path, 'model.ckpt'
            ),  # First save and then evaluate because it reads model checkpoint before evaluating
            global_step=self.global_step)
        perf_indicator = self.eval()
        return perf_indicator