Example #1
    def _tower_loss(self, hr, lr, tower_index, reuse_variables, non_local):
        """Calculate the total loss on a single tower running the model (with the batch splitting)
		Args:
		  datas:  4D tensor of size [batch_size, 1, image_size, image_size]
		  labels: 1-D integer Tensor of [batch_size]
		  scope: unique prefix string identifying the tower, e.g. 'tower_0'
		
		Returns:
		  tensor of shape [] containing the total loss for a batch of data
		"""
        # build the inference graph
        with tf.variable_scope(tf.get_variable_scope()):
            net = Net(hr,
                      lr,
                      non_local,
                      wl=self.weight_decay,
                      tower=tower_index,
                      reuse=reuse_variables)
            net.build_net()

        # return the total loss for the current tower
        return net.total_loss
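
Example #2 below averages the per-tower gradients with a `_average_gradients` helper that is not shown in this listing. The method below is a minimal sketch of that helper following the standard TensorFlow 1.x multi-GPU pattern, assuming each entry of `tower_grads` is the `(gradient, variable)` list returned by `opt.compute_gradients` on one tower; it is an illustration, not the repository's actual implementation.

    def _average_gradients(self, tower_grads):
        """Sketch (assumed implementation): average gradients across towers."""
        average_grads = []
        # zip pairs up the same variable across all towers, giving tuples like
        # ((grad0_v, v), (grad1_v, v), ...) for each shared variable v
        for grad_and_vars in zip(*tower_grads):
            grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
            # stack the per-tower gradients and take the mean over the tower axis
            grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
            # the variable is shared across towers, so take it from the first tower
            average_grads.append((grad, grad_and_vars[0][1]))
        return average_grads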
Example #2
    def __init__(self):
        # training path
        self.train_data = conf.train_data
        self.models_dir = conf.models_dir
        self.logFilename = conf.log_name
        self.num_examples_per_epoch_for_train = conf.num_train_exps

        # for validation
        self.valid_data = conf.valid_data
        self.num_examples_per_epoch_for_valid = conf.num_valid_exps

        # make dirs
        if not path.exists(self.models_dir):
            makedirs(self.models_dir)

        # soft constraint for total epochs
        self.num_epoch = conf.num_epoch

        # device setting
        self.device_id = conf.device_id
        self.num_gpus = conf.num_gpus

        # hyperparameters
        self.batch_size = conf.batch_size
        self.valid_bs = conf.valid_bs
        self.weight_decay = conf.weight_decay

        # learning rate
        self.lr = tf.placeholder(tf.float32)
        self.base_lr = conf.base_lr
        self.power = conf.power
        self.end_lr = conf.end_lr

        # several multipliers
        self.loss_weight = conf.loss_weight
        self.lr_mp = conf.lr_mp
        self.decay_fraction = conf.decay_fraction

        # warm-up
        self.warmup_epoch = conf.warmup_epoch
        self.warmup_from0 = conf.warmup_from0

        # resuming and fine-tuning
        self.resume = conf.resume
        self.finetune = conf.finetune
        self.meta_data = conf.meta_data

        # whether to enable the non-local block
        self.non_local = conf.non_local

        self.iters = conf.iters
        if self.iters is None:
            if self.resume or self.finetune:
                raise ValueError(
                    'iters must be specified when resuming or finetuning')
        self.finetune_models_dir = conf.finetune_models_dir

        # create an Adam optimizer to perform the gradient updates
        opt = tf.train.AdamOptimizer(self.lr)

        # get the training dataset
        with tf.device('/cpu:0'):
            t_hr_splits, t_lr_splits = self._get_data(mode='train')
            v_hr_splits, v_lr_splits = self._get_data(mode='valid')

        # calculate the gradients for each model tower
        reuse_variables = False
        tower_grads = []
        self.losses = []

        # for multi-gpu training
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(self.device_id, self.device_id + self.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('tower_%d' % i) as scope:
                        # constructs the entire model but shares the variables across all towers
                        loss = self._tower_loss(t_hr_splits[i], t_lr_splits[i],
                                                i, reuse_variables,
                                                self.non_local)

                        # collect the total losses from each tower
                        self.losses += [loss]

                        # reuse variables for the next tower
                        reuse_variables = True
                        tf.get_variable_scope().reuse_variables()

                        # calculate the gradients for the batch of data on this tower
                        grads = opt.compute_gradients(loss)

                        # keep track of the gradients across all towers
                        tower_grads.append(grads)

        # calculate the mean of each gradient
        # note: this is the synchronization point across all towers
        if self.num_gpus > 1:
            grads = self._average_gradients(tower_grads)
        else:
            grads = tower_grads[0]

        # apply the gradients to adjust the shared variables
        self.train_op = opt.apply_gradients(grads)

        # for multi-gpu validation
        v_loss = 0.0
        for i in range(self.device_id, self.device_id + self.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('vtower_%d' % i) as scope:
                    net = Net(v_hr_splits[i],
                              v_lr_splits[i],
                              self.non_local,
                              wl=self.weight_decay,
                              tower=i,
                              reuse=True)
                    net.build_net()

                    v_loss += net.total_loss

        self.v_loss = v_loss / self.num_gpus
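
The constructor only defines `self.lr` as a placeholder, so the actual learning-rate schedule is presumably computed in Python and fed in at each training step. The helper below is a hypothetical sketch of such a schedule: linear warm-up for the first `warmup_epoch` epochs, then polynomial decay from `base_lr` to `end_lr` with exponent `power`. The name `_poly_lr` and the exact handling of `warmup_from0` are assumptions for illustration; only the configuration fields themselves appear in the code above.

    def _poly_lr(self, epoch):
        """Hypothetical schedule feeding the self.lr placeholder (not from the repo)."""
        if epoch < self.warmup_epoch:
            # assumption: warm up linearly, starting from 0 or from end_lr
            # depending on the warmup_from0 flag
            start = 0.0 if self.warmup_from0 else self.end_lr
            return start + (self.base_lr - start) * (epoch + 1) / self.warmup_epoch
        # polynomial decay over the epochs remaining after warm-up
        progress = (epoch - self.warmup_epoch) / max(1, self.num_epoch - self.warmup_epoch)
        progress = min(progress, 1.0)
        return (self.base_lr - self.end_lr) * (1.0 - progress) ** self.power + self.end_lr

A training loop built on this sketch would feed the value each step, e.g. `sess.run(self.train_op, feed_dict={self.lr: self._poly_lr(epoch)})`.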