def testParallelApplyGrad(self):
        with self.test_session() as sess:
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1]))
            elems = [
                10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0
            ]
            accum_ops = [q.apply_grad((x, ), local_step=0) for x in elems]
            takeg_t = q.take_grad(1)

            def apply_grad(accum_op):
                sess.run(accum_op)

            threads = [
                self.checkedThread(target=apply_grad, args=(o, ))
                for o in accum_ops
            ]

            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()

            val = takeg_t.eval()

            self.assertEqual(val, sum(elems) / len(elems))
    def testAccumulatorRepeatedTakeGrad(self):
        with self.test_session():
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1]))

            elems = [10.0, 20.0]
            elems_ave = sum(elems) / len(elems)
            accum_ops = [q.apply_grad((x, ), local_step=0) for x in elems]
            takeg_t = q.take_grad(1)

            for accum_op in accum_ops:
                accum_op.run()

            val = takeg_t.eval()
            self.assertEqual(elems_ave, val)

            elems = [20.0, 30.0]
            elems_ave = sum(elems) / len(elems)
            accum_ops = [q.apply_grad((x, ), local_step=1) for x in elems]
            takeg_t = q.take_grad(1)

            for accum_op in accum_ops:
                accum_op.run()

            val = takeg_t.eval()
            self.assertEqual(elems_ave + 0.0, val)
    def testParallelTakeGrad(self):
        with self.test_session() as sess:
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1]))
            elems = [e for e in range(10)]
            accum_ops = [
                q.apply_grad((np.float32(e), ), local_step=e) for e in elems
            ]
            takeg_t = q.take_grad(1)

            def apply_grad():
                for accum_op in accum_ops:
                    time.sleep(1.0)
                    sess.run(accum_op)

            apply_grad_thread = self.checkedThread(target=apply_grad)

            results = []

            def take_grad():
                results.append(sess.run(takeg_t))

            threads = [self.checkedThread(target=take_grad) for _ in range(10)]

            for thread in threads:
                thread.start()
            apply_grad_thread.start()

            for thread in threads:
                thread.join()
            apply_grad_thread.join()

            self.assertItemsEqual(elems, results)
Example #4
    def __init__(self, towers, server, aggregation_frequency):
        """
        Args:
            towers (list[int]): list of GPU ids.
            server (tf.train.Server): the server with ps and workers.
                job_name must be 'worker'.
            aggregation_frequency (int): how often (in steps) gradients are
                aggregated and parameters synchronized. Must be positive.
        """
        DataParallelBuilder.__init__(self, towers)
        DistributedBuilderBase.__init__(self, server)

        self.is_chief = (self.task_index == 0)

        worker_prefix = '/job:worker/task:%s' % self.task_index
        self.param_server_device = tf.train.replica_device_setter(
            worker_device=worker_prefix + '/cpu:0', cluster=self.cluster)

        self.nr_gpu = len(self.towers)
        self.cpu_device = '%s/cpu:0' % worker_prefix
        self.raw_devices = ['%s/gpu:%i' % (worker_prefix, i) for i in towers]

        # Device for queues for managing synchronization between servers
        self.sync_queue_devices = [
            '/job:ps/task:%s/cpu:0' % i for i in range(self.num_ps)
        ]

        # How often are parameters synchronized
        self.aggregation_frequency = aggregation_frequency
        assert self.aggregation_frequency > 0

        # This is going to be K x N x 2 data structure holding the queues and vars for aggregated tensors
        self.gpu_shadow_vars = []

        # Used by comm op to know when it can begin reading aggregated values
        self.counter = tf.ConditionalAccumulator(tf.float32)
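A minimal, standalone sketch (not part of the builder above) of how a float32 ConditionalAccumulator can serve as such a counter: each worker applies a dummy scalar gradient to signal completion, and take_grad(n) blocks until n signals have arrived, then resets the count.

import tensorflow as tf

counter = tf.ConditionalAccumulator(tf.float32, shape=tf.TensorShape([]))
signal_op = counter.apply_grad(1.0)       # each worker runs this once per round
wait_for_all = counter.take_grad(4)       # blocks until 4 signals have been applied

with tf.Session() as sess:
    for _ in range(4):                    # in practice these run in 4 worker processes
        sess.run(signal_op)
    sess.run(wait_for_all)                # returns once all 4 workers have signalled
    # Note: for repeated rounds, pass a local_step to apply_grad that keeps pace with
    # the accumulator's internal global step, since stale applications are dropped.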
    def testAccumulatorApplyAndBlockingTake(self):
        with self.test_session() as sess:
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1]))

            elems = [10.0, 20.0, 30.0]
            elems_ave = sum(elems) / len(elems)
            accum_ops = [q.apply_grad((x, ), local_step=0) for x in elems]
            takeg_t = q.take_grad(3)

            def apply_grad():
                time.sleep(1.0)
                for accum_op in accum_ops:
                    sess.run(accum_op)

            return_array = []

            def take_grad():
                return_array.append(sess.run(takeg_t))

            accum_thread = self.checkedThread(target=apply_grad)
            takeg_thread = self.checkedThread(target=take_grad)
            accum_thread.start()
            takeg_thread.start()
            accum_thread.join()
            takeg_thread.join()

            self.assertEqual([elems_ave], return_array)
    def testAccumulatorApplyGradFloat32(self):
        with self.test_session():
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1]))
            accum_op = q.apply_grad((10.0, ))
            accum_op.run()

    def testAccumulatorSetGlobalStep(self):
        with self.test_session():
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1]))
            set_global_step_op = q.set_global_step(1)
            set_global_step_op.run()
Example #8
    def _optimize(self, loss, acc_count, global_step):
        '''
        :param loss: the network loss
        :param acc_count: number of gradients to accumulate before taking their average
        :param global_step: a Variable that tracks the training step
        :return: a train op and a list of gradient-accumulation ops
        '''

        optimizer = tf.train.AdamOptimizer(self._init_lr)
        grads_vars = optimizer.compute_gradients(loss)

        # create grad accumulator for each variable-grad pair
        grad_accumulator = {}
        for idx in range(len(grads_vars)):
            if grads_vars[idx][0] is not None:
                grad_accumulator[idx] = tf.ConditionalAccumulator(
                    grads_vars[idx][0].dtype)
        # apply gradient to each grad accumulator
        layer_lr = nn.param_lr()
        grad_accumulator_op = []
        for var_idx, grad_acc in grad_accumulator.items():
            var_name = str(grads_vars[var_idx][1].name).split(':')[0]
            var_grad = grads_vars[var_idx][0]
            grad_accumulator_op.append(
                grad_acc.apply_grad(var_grad * layer_lr[var_name],
                                    local_step=global_step))
        # take the average gradient for each variable once the accumulated count reaches acc_count
        mean_grads_vars = []
        for var_idx, grad_acc in grad_accumulator.items():
            mean_grads_vars.append(
                (grad_acc.take_grad(acc_count), grads_vars[var_idx][1]))

        # apply average gradients to variables
        update_op = optimizer.apply_gradients(mean_grads_vars,
                                              global_step=global_step)

        return update_op, grad_accumulator_op
Example #9
def add_optimizer(total_loss, iter_mean_grad, learning_rate, momentum,
                  global_step):
    with tf.name_scope('optimization'):
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        grads_and_vars = optimizer.compute_gradients(total_loss)
        with tf.name_scope('grad_accumulator'):
            grad_accumulator = {}
            for ind in range(0, len(grads_and_vars)):
                if grads_and_vars[ind][0] is not None:
                    grad_accumulator[ind] = tf.ConditionalAccumulator(
                        grads_and_vars[ind][0].dtype)
        with tf.name_scope('apply_gradient'):
            layer_lr = parameter_lr()
            grad_accumulator_ops = []
            for var_ind, grad_acc in grad_accumulator.items():
                var_name = str(grads_and_vars[var_ind][1].name).split(':')[0]
                var_grad = grads_and_vars[var_ind][0]
                grad_accumulator_ops.append(
                    grad_acc.apply_grad(var_grad * layer_lr[var_name],
                                        local_step=global_step))
        with tf.name_scope('take_gradients'):
            mean_grads_and_vars = []
            for var_ind, grad_acc in grad_accumulator.items():
                mean_grads_and_vars.append((grad_acc.take_grad(iter_mean_grad),
                                            grads_and_vars[var_ind][1]))
            apply_gradient_op = optimizer.apply_gradients(
                mean_grads_and_vars, global_step=global_step)
            return grad_accumulator_ops, apply_gradient_op
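A hypothetical driver loop for the ops returned by add_optimizer above; total_loss, global_step, max_steps and feed() are placeholders for objects the caller would already have. The accumulator ops are run iter_mean_grad times to collect gradients, after which apply_gradient_op averages them and updates the variables once.

grad_accumulator_ops, apply_gradient_op = add_optimizer(
    total_loss, iter_mean_grad=4, learning_rate=1e-3, momentum=0.9,
    global_step=global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(max_steps):
        for _ in range(4):                       # must match iter_mean_grad
            sess.run(grad_accumulator_ops, feed_dict=feed())
        sess.run(apply_gradient_op)              # takes the mean grads and applies them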
    def testAccumulatorSizeAfterApplyGradAndTakeGrad(self):
        with self.test_session():
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1]))
            accum_op = q.apply_grad((10.0, ))
            extract_t = q.take_grad(2)

            # Applying gradient multiple times to increase size from 0 to 2.
            self.assertEqual(q.num_accumulated().eval(), 0)
            accum_op.run()
            self.assertEqual(q.num_accumulated().eval(), 1)
            accum_op.run()
            self.assertEqual(q.num_accumulated().eval(), 2)

            # Extract will reduce size to 0
            extract_t.op.run()
            self.assertEqual(q.num_accumulated().eval(), 0)

            # Take gradients always sets the size back to 0 if successful.
            accum_op = q.apply_grad((10.0, ), local_step=1)
            accum_op.run()
            accum_op.run()
            accum_op.run()
            accum_op.run()
            self.assertEqual(q.num_accumulated().eval(), 4)
            extract_t.op.run()
            self.assertEqual(q.num_accumulated().eval(), 0)
    def testAccumulatorApplyGradWithWrongShape(self):
        q = tf.ConditionalAccumulator(tf.float32, name="Q", shape=(3, 2))

        with self.assertRaises(ValueError):
            q.apply_grad([[1.0, 2.0], [3.0, 4.0]])

        with self.assertRaises(ValueError):
            q.apply_grad([[1.0], [2.0], [3.0]])
    def testAccumulatorSizeAfterApplyGrad(self):
        with self.test_session():
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1]))
            accum_op = q.apply_grad((10.0, ))
            self.assertEqual(q.num_accumulated().eval(), 0)
            accum_op.run()
            self.assertEqual(q.num_accumulated().eval(), 1)
            accum_op.run()
            self.assertEqual(q.num_accumulated().eval(), 2)

    def testConstructor(self):
        with tf.Graph().as_default():
            q = tf.ConditionalAccumulator(tf.float32, name="Q")
        self.assertTrue(isinstance(q.accumulator_ref, tf.Tensor))
        self.assertProtoEquals(
            """
      name:'Q' op:'ConditionalAccumulator'
      attr { key: 'dtype' value { type: DT_FLOAT } }
      attr { key: 'shape' value { shape { unknown_rank: true} } }
      attr { key: 'container' value { s: '' } }
      attr { key: 'shared_name' value { s: '' } }
      """, q.accumulator_ref.op.node_def)
    def testAccumulatorMultipleAccumulators(self):
        with self.test_session():
            q_f32_0 = tf.ConditionalAccumulator(tf.float32,
                                                name="Q",
                                                shape=tf.TensorShape([1]))
            q_f32_1 = tf.ConditionalAccumulator(tf.float32,
                                                name="Q",
                                                shape=tf.TensorShape([1]))
            q_f16_0 = tf.ConditionalAccumulator(tf.float16,
                                                name="Q",
                                                shape=tf.TensorShape([1]))
            q_f16_1 = tf.ConditionalAccumulator(tf.float16,
                                                name="Q",
                                                shape=tf.TensorShape([1]))

            accums = [q_f16_0, q_f16_1, q_f32_0, q_f32_1]
            for i in range(len(accums)):
                accums[i].apply_grad((i + 10.0, )).run()

            for i in range(len(accums)):
                result = accums[i].take_grad(1).eval()
                self.assertEqual(result, i + 10.0)
    def testAccumulatorInvalidTakeGrad(self):
        with self.test_session():
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1]))
            elems = [10.0, 20.0]
            accum_ops = [q.apply_grad((x, )) for x in elems]

            takeg_t = q.take_grad(-1)

            for accum_op in accum_ops:
                accum_op.run()

            with self.assertRaises(tf.errors.InvalidArgumentError):
                takeg_t.eval()
    def testDtypes(self):
        with self.test_session() as sess:
            dtypes = [tf.float16, tf.float32, tf.float64]

            for i in range(len(dtypes)):
                dtype = dtypes[i]
                q = tf.ConditionalAccumulator(dtype, shape=tf.TensorShape([1]))

                elems = np.arange(10).astype(dtype.as_numpy_dtype)
                for e in elems:
                    q.apply_grad((e, )).run()

                result = sess.run(q.take_grad(1))

                self.assertEqual(sum(elems) / len(elems), result)
    def testAccumulatorIncrementGlobalStep(self):
        with self.test_session():
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1]))

            global_step = tf.Variable(0, name="global_step")
            new_global_step = tf.add(global_step, 1)
            inc_global_step = tf.assign(global_step, new_global_step)

            set_global_step_op = q.set_global_step(new_global_step)

            tf.initialize_all_variables().run()
            for _ in range(3):
                set_global_step_op.run()
                inc_global_step.eval()
    def testAccumulatorCancel(self):
        with self.test_session() as sess:
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1]))
            takeg_t = q.take_grad(1)

            takeg_thread = self.checkedThread(self._blocking_takeg,
                                              args=(sess, takeg_t))

            takeg_thread.start()

            time.sleep(1.0)

            sess.close()  # Will cancel blocked operation

            takeg_thread.join()
    def testAccumulatorWrongDynamicShape(self):
        with self.test_session() as sess:
            q = tf.ConditionalAccumulator(tf.float32, name="Q", shape=None)

            x = tf.placeholder(tf.float32)

            accum_op = q.apply_grad(x)

            # First successful apply_grad determines shape
            sess.run(accum_op,
                     feed_dict={x: [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]})

            with self.assertRaises(tf.errors.InvalidArgumentError):
                sess.run(accum_op, feed_dict={x: [[1.0, 2.0], [3.0, 4.0]]})

            with self.assertRaises(tf.errors.InvalidArgumentError):
                sess.run(accum_op, feed_dict={x: [[1.0], [2.0], [3.0]]})
    def testConstructorWithShape(self):
        with tf.Graph().as_default():
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1, 5, 2, 8]))
        self.assertTrue(isinstance(q.accumulator_ref, tf.Tensor))
        self.assertProtoEquals(
            """
      name:'Q' op:'ConditionalAccumulator'
      attr { key: 'dtype' value { type: DT_FLOAT } }
      attr { key: 'shape' value { shape { dim {size: 1 }
                                          dim {size: 5 }
                                          dim {size: 2 }
                                          dim {size: 8 }
      } } }
      attr { key: 'container' value { s: '' } }
      attr { key: 'shared_name' value { s: '' } }
      """, q.accumulator_ref.op.node_def)
    def testAccumulatorApplyAndTakeGradWithShape(self):
        with self.test_session():
            q = tf.ConditionalAccumulator(tf.float32, name="Q", shape=(3, 2))
            elems = [[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
                     [[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]]]
            elems_ave = [[(a + b) / len(elems) for a, b in zip(x, y)]
                         for x, y in zip(elems[0], elems[1])]
            accum_ops = [q.apply_grad(x) for x in elems]
            takeg_t = q.take_grad(1)

            for accum_op in accum_ops:
                accum_op.run()

            is_all_equal = True
            val = takeg_t.eval()
            for i in range(len(val)):
                for j in range(len(val[i])):
                    is_all_equal &= (val[i][j] == elems_ave[i][j])
            self.assertTrue(is_all_equal)
    def testAccumulatorDynamicShape(self):
        with self.test_session() as sess:
            q = tf.ConditionalAccumulator(tf.float32, name="Q", shape=None)

            x = tf.placeholder(tf.float32)

            accum_op = q.apply_grad(x)

            elems = [[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
                     [[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]]]
            elems_ave = [[(a + b) / len(elems) for a, b in zip(c, d)]
                         for c, d in zip(elems[0], elems[1])]
            takeg_t = q.take_grad(1)

            for elem in elems:
                sess.run(accum_op, feed_dict={x: elem})

            is_all_equal = True
            val = takeg_t.eval()
            for i in range(len(val)):
                for j in range(len(val[i])):
                    is_all_equal &= (val[i][j] == elems_ave[i][j])
            self.assertTrue(is_all_equal)
    def testAccumulatorSetGlobalStepPreventsAccumulation(self):
        with self.test_session():
            q = tf.ConditionalAccumulator(tf.float32,
                                          name="Q",
                                          shape=tf.TensorShape([1]))

            local_steps = range(1000, 1005)
            accum_ops = [
                q.apply_grad((0.0 + x, ), local_step=x) for x in local_steps
            ]

            for ls in local_steps:
                set_global_step_op = q.set_global_step(ls)
                set_global_step_op.run()

                for accum_op in accum_ops:
                    accum_op.run()
                takeg_t = q.take_grad(1)

                val = takeg_t.eval()
                self.assertEqual(
                    0.0 + sum(x for x in local_steps if x >= ls) /
                    sum(1 for x in local_steps if x >= ls), val)
Example #24
    def __init__(self, average=True, compression=None, aggregation_frequency=1):
        """
        Args:
            average (bool): whether to average or sum the gradients across processes.
            compression: `hvd.Compression.fp16` or `hvd.Compression.none`
            aggregation_frequency (int): how often (in steps) gradients are
                aggregated and parameters synchronized. Must be positive.
        """
        if 'pyarrow' in sys.modules:
            logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs. "
                        "Uninstall pyarrow and use msgpack instead.")
        # lazy import
        import horovod.tensorflow as hvd
        import horovod
        hvd_version = tuple(map(int, horovod.__version__.split('.')))
        self.hvd = hvd

        hvd.init()
        self.is_chief = hvd.rank() == 0
        self._local_rank = hvd.local_rank()
        self._rank = hvd.rank()
        self._average = average
        self._compression = compression
        self._has_compression = hvd_version >= (0, 15, 0)

        # How often are parameters synchronized
        self._aggregation_frequency = aggregation_frequency
        assert self._aggregation_frequency > 0

        # This is going to be N x 2 data structure holding the per-GPU aggregated updates and vars
        # for parameter updates. N is the number of parameters, and there are 2 entries per
        # parameter because each entry contains the gradient update and the original parameter.
        self.gpu_shadow_vars = []

        # Used by comm_op to know when it can begin reading aggregated values.
        self.counter = tf.ConditionalAccumulator(tf.float32)

        logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
        super(HorovodTrainer, self).__init__()
Example #25
def _train(dataset,
           valid_dataset,
           num_classes,
           initial_ckpt,
           supervison,
           learning_rate,
           logs_path,
           max_training_iters,
           save_step,
           display_step,
           global_step,
           iter_mean_grad=1,
           batch_size=1,
           momentum=0.9,
           resume_training=False,
           config=None,
           finetune=1,
           test_image_path=None,
           ckpt_name="osvos"):
    """Train OSVOS
    Args:
    dataset: Reference to a Dataset object instance used for training
    valid_dataset: Reference to a Dataset object instance used for validation
    num_classes: Number of classification classes
    initial_ckpt: Path to the checkpoint used to initialize the network (may be the parent network or a pre-trained ImageNet model)
    supervison: Level of supervision of the side outputs: 1-Strong, 2-Weak, 3-No supervision
    learning_rate: Value for the learning rate. It can be a number or an instance of a learning rate object.
    logs_path: Path to store the checkpoints
    max_training_iters: Number of training iterations
    save_step: A checkpoint will be created every save_step iterations
    display_step: Training information will be displayed every display_step iterations
    global_step: Reference to a Variable that keeps track of the training steps
    iter_mean_grad: Number of gradient computations that are averaged before updating the weights
    batch_size: Size of the training batch
    momentum: Value of the momentum parameter for the Momentum optimizer
    resume_training: Boolean to try to restore from a previous checkpoint (True) or not (False)
    config: Reference to a Configuration object used in the creation of a Session
    finetune: Used to select the type of training: 0 for the parent network and 1 for finetuning
    test_image_path: If an image path is provided, the network output on this image is stored every save_step iterations
    ckpt_name: Name used for the checkpoint file
    Returns:
    """
    model_name = os.path.join(logs_path, ckpt_name + ".ckpt")
    if config is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # config.log_device_placement = True
        config.allow_soft_placement = True

    tf.logging.set_verbosity(tf.logging.INFO)

    # Prepare the input data
    input_image = tf.placeholder(tf.float32, [batch_size, None, None, 3])

    # LIAO: image label for classification part
    image_label = tf.placeholder(tf.float32, [batch_size, num_classes])

    # Create the network
    with slim.arg_scope(osvos_arg_scope()):
        net, fc, fc7, end_points = osvos(input_image, num_classes)

    # Define loss
    with tf.name_scope('losses'):

        fc = tf.nn.softmax(fc)
        classification_loss = tf.reduce_sum(tf.pow(fc - image_label,
                                                   2)) / (2 * batch_size)
        correct_pred = tf.equal(tf.argmax(fc, 1), tf.argmax(image_label, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        tf.summary.scalar('classification_loss', classification_loss)
        tf.summary.scalar('accuracy', accuracy)

        # LIAO: classification loss and l1 loss
        l2_loss = tf.add_n(tf.losses.get_regularization_losses())
        alpha = 0.025
        l1_loss = tf.reduce_sum(
            tf.abs(
                tf.subtract(tf.abs(fc7),
                            tf.ones([fc7.shape[0], 1, 1, fc7.shape[3]
                                     ])))) / batch_size
        total_loss = classification_loss + l2_loss + alpha * l1_loss
        tf.summary.scalar('l1_loss', l1_loss)
        tf.summary.scalar('l2_loss', l2_loss)
        tf.summary.scalar('total_loss', total_loss)

    # Define optimization method
    with tf.name_scope('optimization'):
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        grads_and_vars = optimizer.compute_gradients(total_loss)
        with tf.name_scope('grad_accumulator'):
            grad_accumulator = {}
            for ind in range(0, len(grads_and_vars)):
                if grads_and_vars[ind][0] is not None:
                    grad_accumulator[ind] = tf.ConditionalAccumulator(
                        grads_and_vars[ind][0].dtype)
        with tf.name_scope('apply_gradient'):
            grad_accumulator_ops = []
            for var_ind, grad_acc in grad_accumulator.items():
                var_name = str(grads_and_vars[var_ind][1].name).split(':')[0]
                var_grad = grads_and_vars[var_ind][0]
                grad_accumulator_ops.append(
                    grad_acc.apply_grad(var_grad, local_step=global_step))
        with tf.name_scope('take_gradients'):
            mean_grads_and_vars = []
            for var_ind, grad_acc in grad_accumulator.items():
                mean_grads_and_vars.append((grad_acc.take_grad(iter_mean_grad),
                                            grads_and_vars[var_ind][1]))
            apply_gradient_op = optimizer.apply_gradients(
                mean_grads_and_vars, global_step=global_step)
    # Log training info
    merged_summary_op = tf.summary.merge_all()

    # Initialize variables
    init = tf.global_variables_initializer()

    # Create objects to record timing and memory of the graph execution
    # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) # Option in the session options=run_options
    # run_metadata = tf.RunMetadata() # Option in the session run_metadata=run_metadata
    # summary_writer.add_run_metadata(run_metadata, 'step%d' % i)
    with tf.Session(config=config) as sess:
        print('Init variable')
        sess.run(init)

        test_step = 100

        # op to write logs to Tensorboard
        summary_writer = tf.summary.FileWriter(logs_path,
                                               graph=tf.get_default_graph())
        valid_writer = tf.summary.FileWriter(os.path.join(logs_path, 'valid'),
                                             graph=tf.get_default_graph())

        # Create saver to manage checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        last_ckpt_path = tf.train.latest_checkpoint(logs_path)
        if last_ckpt_path is not None and resume_training:
            # Load last checkpoint
            print('Initializing from previous checkpoint...')
            saver.restore(sess, last_ckpt_path)
            step = global_step.eval() + 1
        else:
            print('Initializing from specified pre-trained model...')
            # init_weights(sess)
            var_list = []
            for var in tf.global_variables():
                # LIAO: ignore lack of fc
                if var.name.find('fc') != -1: continue
                var_type = var.name.split('/')[-1]
                if 'weights' in var_type or 'bias' in var_type:
                    var_list.append(var)
            saver_res = tf.train.Saver(var_list=var_list)
            saver_res.restore(sess, initial_ckpt)
            step = 1
        #sess.run(interp_surgery(tf.global_variables()))
        print('Weights initialized')

        print('Start training')
        while step < max_training_iters + 1:
            # Average the gradient
            for _ in range(0, iter_mean_grad):
                # LIAO: classification label one-hot encoding
                batch_image, _, batch_cls_label = dataset.next_batch(
                    batch_size, 'train')
                for i in range(batch_size):
                    image = batch_image[i]
                    if type(image) is not np.ndarray:
                        image = np.array(Image.open(image), dtype=np.uint8)
                    image = image[:, :, ::-1]
                    image = np.subtract(
                        image,
                        np.array((104.00699, 116.66877, 122.67892),
                                 dtype=np.float32))
                    batch_image[i] = image
                image = batch_image
                cls_label = slim.one_hot_encoding(
                    batch_cls_label, num_classes).eval(session=sess)

                # LIAO: classification label
                run_res = sess.run([
                    total_loss, merged_summary_op, classification_loss,
                    accuracy, l1_loss, l2_loss
                ] + grad_accumulator_ops,
                                   feed_dict={
                                       input_image: image,
                                       image_label: cls_label
                                   })
                batch_loss = run_res[0]
                summary = run_res[1]
                cls_loss = run_res[2]
                acc = run_res[3]
                lloss = run_res[4]
                l2loss = run_res[5]

            # Apply the gradients
            sess.run(apply_gradient_op)  # Momentum updates here its statistics

            # Save summary reports
            summary_writer.add_summary(summary, step)

            # Display training status
            if step % display_step == 0:
                print("{} Iter {}: Training Loss = {:.4f} l1 loss = {:.4f}, l2 loss = {:.4f}".format(
                    datetime.now(), step, batch_loss, lloss, l2loss), file=sys.stderr)
                print("\t\tClassification Loss = {:.6f}, accuracy = {:.6f}".format(
                    cls_loss, acc), file=sys.stderr)

            # LIAO: validation
            if step % test_step == 0:
                valid_image, _, valid_cls_label = valid_dataset.next_batch(
                    batch_size, 'train')
                for i in range(batch_size):
                    image = valid_image[i]
                    if type(image) is not np.ndarray:
                        image = np.array(Image.open(image), dtype=np.uint8)
                    image = image[:, :, ::-1]
                    image = np.subtract(
                        image,
                        np.array((104.00699, 116.66877, 122.67892),
                                 dtype=np.float32))
                    valid_image[i] = image
                valid_cls_label = slim.one_hot_encoding(
                    valid_cls_label, num_classes).eval(session=sess)
                valid_res = sess.run([
                    total_loss, merged_summary_op, classification_loss,
                    accuracy, l1_loss, l2_loss
                ],
                                     feed_dict={
                                         input_image: valid_image,
                                         image_label: valid_cls_label
                                     })
                valid_total_loss = valid_res[0]
                valid_summary = valid_res[1]
                valid_cls_loss = valid_res[2]
                valid_acc = valid_res[3]
                valid_l1loss = valid_res[4]
                valid_l2loss = valid_res[5]
                valid_writer.add_summary(valid_summary, step)
                print("\n{} ***Test*** {}: Training Loss = {:.4f} l1 loss = {:.4f}, l2 loss = {:.4f} ".format(
                    datetime.now(), step, valid_total_loss, valid_l1loss,
                    valid_l2loss), file=sys.stderr)
                print("\t\tClassification Loss = {:.6f}, accuracy = {:.6f}".format(
                    valid_cls_loss, valid_acc), file=sys.stderr)
                print("\t\t===== learning rate: {:.10f} =====\n".format(
                    sess.run(learning_rate)), file=sys.stderr)

            # Save a checkpoint
            if step % save_step == 0:
                if test_image_path is not None:
                    # NOTE: img_summary is not defined in this function; define an
                    # image summary for the network output before using test_image_path.
                    curr_output = sess.run(img_summary,
                                           feed_dict={
                                               input_image:
                                               preprocess_img(test_image_path)
                                           })
                    summary_writer.add_summary(curr_output, step)
                save_path = saver.save(sess,
                                       model_name,
                                       global_step=global_step)
                print "Model saved in file: %s" % save_path

            step += 1

        if (step - 1) % save_step != 0:
            save_path = saver.save(sess, model_name, global_step=global_step)
            print "Model saved in file: %s" % save_path
        print('Finished training.')
def _train(dataset,
           initial_ckpt,
           supervison,
           learning_rate,
           logs_path,
           max_training_iters,
           save_step,
           display_step,
           global_step,
           iter_mean_grad=1,
           batch_size=1,
           momentum=0.9,
           resume_training=False,
           config=None,
           finetune=1,
           test_image_path=None,
           ckpt_name="osvos"):
    """Train OSVOS
    Args:
    dataset: Reference to a Dataset object instance
    initial_ckpt: Path to the checkpoint used to initialize the network (may be the parent network or a pre-trained ImageNet model)
    supervison: Level of supervision of the side outputs: 1-Strong, 2-Weak, 3-No supervision
    learning_rate: Value for the learning rate. It can be a number or an instance of a learning rate object.
    logs_path: Path to store the checkpoints
    max_training_iters: Number of training iterations
    save_step: A checkpoint will be created every save_step iterations
    display_step: Training information will be displayed every display_step iterations
    global_step: Reference to a Variable that keeps track of the training steps
    iter_mean_grad: Number of gradient computations that are averaged before updating the weights
    batch_size: Size of the training batch
    momentum: Value of the momentum parameter for the Momentum optimizer
    resume_training: Boolean to try to restore from a previous checkpoint (True) or not (False)
    config: Reference to a Configuration object used in the creation of a Session
    finetune: Used to select the type of training: 0 for the parent network and 1 for finetuning
    test_image_path: If an image path is provided, the network output on this image is stored every save_step iterations
    ckpt_name: Name used for the checkpoint file
    Returns:
    """

    model_name = os.path.join(logs_path, ckpt_name + ".ckpt")

    tf.logging.set_verbosity(tf.logging.INFO)

    # Prepare the input data
    input_image = tf.placeholder(tf.float32, [batch_size, None, None, 3])
    input_label = tf.placeholder(tf.float32, [batch_size, None, None, 1])

    # Create the network
    with slim.arg_scope(osvos_arg_scope()):
        net, end_points = osvos(input_image)
        pass

    # Initialize weights from pre-trained model
    init_weights = load_vgg_imagenet(initial_ckpt) if finetune == 0 else None

    # Define loss
    with tf.name_scope('losses'):
        if supervison == 1 or supervison == 2:
            dsn_2_loss = class_balanced_cross_entropy_loss(
                end_points['osvos/score-dsn_2-cr'], input_label)
            dsn_3_loss = class_balanced_cross_entropy_loss(
                end_points['osvos/score-dsn_3-cr'], input_label)
            dsn_4_loss = class_balanced_cross_entropy_loss(
                end_points['osvos/score-dsn_4-cr'], input_label)
            dsn_5_loss = class_balanced_cross_entropy_loss(
                end_points['osvos/score-dsn_5-cr'], input_label)
            tf.summary.scalar('dsn_2_loss', dsn_2_loss)
            tf.summary.scalar('dsn_3_loss', dsn_3_loss)
            tf.summary.scalar('dsn_4_loss', dsn_4_loss)
            tf.summary.scalar('dsn_5_loss', dsn_5_loss)

        main_loss = class_balanced_cross_entropy_loss(net, input_label)
        tf.summary.scalar('main_loss', main_loss)

        if supervison == 1:
            output_loss = dsn_2_loss + dsn_3_loss + dsn_4_loss + dsn_5_loss + main_loss
        elif supervison == 2:
            output_loss = 0.5 * dsn_2_loss + 0.5 * dsn_3_loss + 0.5 * dsn_4_loss + 0.5 * dsn_5_loss + main_loss
        elif supervison == 3:
            output_loss = main_loss
        else:
            sys.exit(
                'Incorrect supervision id, select 1 for supervision of the side outputs, 2 for weak supervision '
                'of the side outputs and 3 for no supervision of the side outputs'
            )
        total_loss = output_loss + tf.add_n(
            tf.losses.get_regularization_losses())
        tf.summary.scalar('total_loss', total_loss)

    # Define optimization method
    with tf.name_scope('optimization'):
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        grads_and_vars = optimizer.compute_gradients(total_loss)

        with tf.name_scope('grad_accumulator'):
            grad_accumulator = {}
            for ind in range(0, len(grads_and_vars)):
                if grads_and_vars[ind][0] is not None:
                    grad_accumulator[ind] = tf.ConditionalAccumulator(
                        grads_and_vars[ind][0].dtype)
            pass

        with tf.name_scope('apply_gradient'):
            layer_lr = parameter_lr()
            grad_accumulator_ops = []
            for var_ind, grad_acc in grad_accumulator.items():
                var_name = str(grads_and_vars[var_ind][1].name).split(':')[0]
                var_grad = grads_and_vars[var_ind][0]
                grad_accumulator_ops.append(
                    grad_acc.apply_grad(var_grad * layer_lr[var_name],
                                        local_step=global_step))
            pass

        with tf.name_scope('take_gradients'):
            mean_grads_and_vars = []
            for var_ind, grad_acc in grad_accumulator.items():
                mean_grads_and_vars.append((grad_acc.take_grad(iter_mean_grad),
                                            grads_and_vars[var_ind][1]))
            apply_gradient_op = optimizer.apply_gradients(
                mean_grads_and_vars, global_step=global_step)
            pass

        pass

    # Log training info
    merged_summary_op = tf.summary.merge_all()

    # Log evolution of test image
    img_summary = None
    if test_image_path is not None:
        probabilities = tf.nn.sigmoid(net)
        img_summary = tf.summary.image("Output probabilities",
                                       probabilities,
                                       max_outputs=1)

    # Create objects to record timing and memory of the graph execution
    # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) # Option in the session options=run_options
    # run_metadata = tf.RunMetadata() # Option in the session run_metadata=run_metadata
    # summary_writer.add_run_metadata(run_metadata, 'step%d' % i)

    if config is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # config.log_device_placement = True
        config.allow_soft_placement = True
        pass

    with tf.Session(config=config) as sess:
        print('Init variable')
        sess.run(tf.global_variables_initializer())

        # op to write logs to Tensorboard
        summary_writer = tf.summary.FileWriter(logs_path,
                                               graph=tf.get_default_graph())

        # Create saver to manage checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        last_ckpt_path = tf.train.latest_checkpoint(logs_path)
        if last_ckpt_path is not None and resume_training:  # Load last checkpoint
            print('Initializing from previous checkpoint...')
            saver.restore(sess, last_ckpt_path)
            step = global_step.eval() + 1
        else:
            # Load pre-trained model
            if finetune == 0:
                print('Initializing from pre-trained imagenet model...')
                init_weights(sess)
            else:
                print('Initializing from specified pre-trained model...')
                var_list = []
                for var in tf.global_variables():
                    var_type = var.name.split('/')[-1]
                    if 'weights' in var_type or 'bias' in var_type:
                        var_list.append(var)
                saver_res = tf.train.Saver(var_list=var_list)
                saver_res.restore(sess, initial_ckpt)
                pass
            step = 1

        sess.run(interp_surgery(tf.global_variables()))
        print('Weights initialized')

        print('Start training')
        while step < max_training_iters + 1:
            # Average the gradient
            batch_loss, summary = None, None
            for _ in range(0, iter_mean_grad):
                batch_image, batch_label = dataset.next_batch(
                    batch_size, 'train')
                image = preprocess_img(batch_image[0])
                label = preprocess_labels(batch_label[0])
                run_res = sess.run([total_loss, merged_summary_op] +
                                   grad_accumulator_ops,
                                   feed_dict={
                                       input_image: image,
                                       input_label: label
                                   })
                batch_loss = run_res[0]
                summary = run_res[1]
                pass

            # Apply the gradients
            sess.run(apply_gradient_op)  # Momentum updates here its statistics

            # Save summary reports
            summary_writer.add_summary(summary, step)

            # Display training status
            if step % display_step == 0:
                print("{} Iter {}: Training Loss = {:.4f}".format(
                    datetime.now(), step, batch_loss))

            # Save a checkpoint
            if step % save_step == 0:
                if test_image_path is not None:
                    curr_output = sess.run(img_summary,
                                           feed_dict={
                                               input_image:
                                               preprocess_img(test_image_path)
                                           })
                    summary_writer.add_summary(curr_output, step)
                    pass
                save_path = saver.save(sess,
                                       model_name,
                                       global_step=global_step)
                print("Model saved in file: %s" % save_path)
                pass

            step += 1
            pass

        if (step - 1) % save_step != 0:
            save_path = saver.save(sess, model_name, global_step=global_step)
            print("Model saved in file: %s" % save_path)
            pass

        print('Finished training.')

        pass

    pass
def _train(dataset, initial_ckpt, supervison, learning_rate, logs_path, max_training_iters, save_step, display_step,
           global_step, number_slices=1, volume=False, iter_mean_grad=1, batch_size=1, task_id=2, loss=1, momentum=0.9, resume_training=False, config=None, finetune=1):
    """Train network
    Args:
    dataset: Reference to a Dataset object instance
    initial_ckpt: Path to the checkpoint used to initialize the network (may be the parent network or a pre-trained ImageNet model)
    supervison: Level of supervision of the side outputs: 1-Strong, 2-Weak, 3-No supervision
    learning_rate: Value for the learning rate. It can be a number or an instance of a learning rate object.
    logs_path: Path to store the checkpoints
    max_training_iters: Number of training iterations
    save_step: A checkpoint will be created every save_step iterations
    display_step: Training information will be displayed every display_step iterations
    global_step: Reference to a Variable that keeps track of the training steps
    number_slices: Number of slices fed to the network (also the number of label channels)
    volume: Whether to build the network for volumetric input
    iter_mean_grad: Number of gradient computations that are averaged before updating the weights
    batch_size: Size of the training batch
    task_id: Task selector; 2 trains on the liver labels
    momentum: Value of the momentum parameter for the Momentum optimizer
    resume_training: Boolean to try to restore from a previous checkpoint (True) or not (False)
    config: Reference to a Configuration object used in the creation of a Session
    finetune: Used to select the type of training: 0 for the parent network and 1 for finetuning
    Returns:
    """
    model_name = os.path.join(logs_path, "seg_liver.ckpt")
    if config is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # config.log_device_placement = True
        config.allow_soft_placement = True

    tf.logging.set_verbosity(tf.logging.INFO)

    input_depth = 3
    if number_slices > 3:
        input_depth = number_slices

    # Prepare the input data
    input_image = tf.placeholder(tf.float32, [batch_size, None, None, input_depth])
    input_label = tf.placeholder(tf.float32, [batch_size, None, None, number_slices])

    # Create the network
    with slim.arg_scope(seg_liver_arg_scope()):
        net, end_points = seg_liver(input_image, number_slices, volume)

    # Initialize weights from pre-trained model
    if finetune == 0:
        init_weights = load_vgg_imagenet(initial_ckpt, number_slices)

    # Define loss
    with tf.name_scope('losses'):
        dsn_2_loss = class_balanced_cross_entropy_loss(end_points['seg_liver/score-dsn_2-cr'], input_label)
        tf.summary.scalar('losses/dsn_2_loss', dsn_2_loss)
        dsn_3_loss = class_balanced_cross_entropy_loss(end_points['seg_liver/score-dsn_3-cr'], input_label)
        tf.summary.scalar('losses/dsn_3_loss', dsn_3_loss)
        dsn_4_loss = class_balanced_cross_entropy_loss(end_points['seg_liver/score-dsn_4-cr'], input_label)
        tf.summary.scalar('losses/dsn_4_loss', dsn_4_loss)
        dsn_5_loss = class_balanced_cross_entropy_loss(end_points['seg_liver/score-dsn_5-cr'], input_label)
        tf.summary.scalar('losses/dsn_5_loss', dsn_5_loss)

        main_loss = class_balanced_cross_entropy_loss(net, input_label)
        tf.summary.scalar('losses/main_loss', main_loss)

        if supervison == 1:
            output_loss = dsn_2_loss + dsn_3_loss + dsn_4_loss + dsn_5_loss + main_loss
        elif supervison == 2:
            output_loss = 0.5 * dsn_2_loss + 0.5 * dsn_3_loss + 0.5 * dsn_4_loss + 0.5 * dsn_5_loss + main_loss
        elif supervison == 3:
            output_loss = main_loss
        else:
            sys.exit('Incorrect supervision id, select 1 for supervision of the side outputs, 2 for weak supervision '
                     'of the side outputs and 3 for no supervision of the side outputs')
        # total_loss = output_loss + tf.add_n(slim.losses.get_regularization_losses())
        total_loss = output_loss + tf.add_n(tf.losses.get_regularization_losses())
        tf.summary.scalar('losses/total_loss', total_loss)

        # This overrides the total_loss defined above with a down-weighted regularization term.
        # total_loss = output_loss + 0.001 * tf.add_n(slim.losses.get_regularization_losses())
        total_loss = output_loss + 0.001 * tf.add_n(tf.losses.get_regularization_losses())

        tf.summary.scalar('losses/total_loss', total_loss)

    # Define optimization method
    with tf.name_scope('optimization'):
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        grads_and_vars = optimizer.compute_gradients(total_loss)
        with tf.name_scope('grad_accumulator'):
            grad_accumulator = []
            for ind in range(0, len(grads_and_vars)):
                if grads_and_vars[ind][0] is not None:
                    grad_accumulator.append(tf.ConditionalAccumulator(grads_and_vars[ind][0].dtype))
        with tf.name_scope('apply_gradient'):
            layer_lr = parameter_lr()
            grad_accumulator_ops = []
            for ind in range(0, len(grad_accumulator)):
                if grads_and_vars[ind][0] is not None:
                    var_name = str(grads_and_vars[ind][1].name).split(':')[0]
                    var_grad = grads_and_vars[ind][0]
                    grad_accumulator_ops.append(grad_accumulator[ind].apply_grad(var_grad * layer_lr[var_name],
                                                                                 local_step=global_step))
        with tf.name_scope('take_gradients'):
            mean_grads_and_vars = []
            for ind in range(0, len(grad_accumulator)):
                if grads_and_vars[ind][0] is not None:
                    mean_grads_and_vars.append(
                        (grad_accumulator[ind].take_grad(iter_mean_grad), grads_and_vars[ind][1]))
            apply_gradient_op = optimizer.apply_gradients(mean_grads_and_vars, global_step=global_step)
            # Log training info

    with tf.name_scope('metrics'):
        dice_coef_op = dice_coef_theoretical(net, input_label)
        tf.summary.scalar('metrics/dice_coeff', dice_coef_op)

    merged_summary_op = tf.summary.merge_all()

    # Initialize variables
    init = tf.global_variables_initializer()

    # Create objects to record timing and memory of the graph execution
    # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) # Option in the session options=run_options
    # run_metadata = tf.RunMetadata() # Option in the session run_metadata=run_metadata
    # summary_writer.add_run_metadata(run_metadata, 'step%d' % i)
    with tf.Session(config=config) as sess:
        print('Init variable')
        sess.run(init)

        # op to write logs to Tensorboard
        summary_writer = tf.summary.FileWriter(logs_path + '/train', graph=tf.get_default_graph())
        test_writer = tf.summary.FileWriter(logs_path + '/test')

        # Create saver to manage checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        last_ckpt_path = tf.train.latest_checkpoint(logs_path)
        if last_ckpt_path is not None and resume_training:
            # Load last checkpoint
            print('Initializing from previous checkpoint...')
            saver.restore(sess, last_ckpt_path)
            step = global_step.eval() + 1
        else:
            # Load pre-trained model
            if finetune == 0:
                print('Initializing from pre-trained imagenet model...')
                init_weights(sess)
            else:
                print('Initializing from pre-trained model...')
                # init_weights(sess)
                var_list = []
                for var in tf.global_variables():
                    var_type = var.name.split('/')[-1]
                    if 'weights' in var_type or 'bias' in var_type:
                        var_list.append(var)
                saver_res = tf.train.Saver(var_list=var_list)
                saver_res.restore(sess, initial_ckpt)
            step = 1
        sess.run(interp_surgery(tf.global_variables()))
        print('Weights initialized')

        print('Start training')
        while step < max_training_iters + 1:
            # Average the gradient
            for iter_steps in range(0, iter_mean_grad):
                batch_image, batch_label, batch_label_liver = dataset.next_batch(batch_size, 'train')
                batch_image_val, batch_label_val, batch_label_liver_val = dataset.next_batch(batch_size, 'val')
                image = preprocess_img(batch_image, number_slices)
                val_image = preprocess_img(batch_image_val, number_slices)
                if task_id == 2:
                    batch_label = batch_label_liver
                    batch_label_val = batch_label_liver_val
                label = preprocess_labels(batch_label, number_slices)
                label_val = preprocess_labels(batch_label_val, number_slices)
                run_res = sess.run([total_loss, merged_summary_op, dice_coef_op] + grad_accumulator_ops,
                                   feed_dict={input_image: image, input_label: label})
                batch_loss = run_res[0]
                summary = run_res[1]
                train_dice_coef = run_res[2]
                if step % display_step == 0:
                    val_run_res = sess.run([total_loss, merged_summary_op, dice_coef_op],
                                           feed_dict={input_image: val_image, input_label: label_val})
                    val_batch_loss = val_run_res[0]
                    val_summary = val_run_res[1]
                    val_dice_coef = val_run_res[2]

            # Apply the gradients
            sess.run(apply_gradient_op)

            # Save summary reports
            summary_writer.add_summary(summary, step)
            if step % display_step == 0:
                test_writer.add_summary(val_summary, step)

            # Display training status
            if step % display_step == 0:
                print("{} Iter {}: Training Loss = {:.4f}".format(datetime.now(), step, batch_loss), file=sys.stderr)
                print("{} Iter {}: Validation Loss = {:.4f}".format(datetime.now(), step, val_batch_loss), file=sys.stderr)
                print("{} Iter {}: Training Dice = {:.4f}".format(datetime.now(), step, train_dice_coef), file=sys.stderr)
                print("{} Iter {}: Validation Dice = {:.4f}".format(datetime.now(), step, val_dice_coef), file=sys.stderr)

            # Save a checkpoint
            if step % save_step == 0:
                save_path = saver.save(sess, model_name, global_step=global_step)
                print "Model saved in file: %s" % save_path

            step += 1

        if (step - 1) % save_step != 0:
            save_path = saver.save(sess, model_name, global_step=global_step)
            print "Model saved in file: %s" % save_path

        print('Finished training.')
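A minimal, hypothetical invocation of the _train routine above; dataset, the checkpoint path and all hyper-parameter values are placeholders, not settings from the original project.

with tf.Graph().as_default():
    global_step = tf.Variable(0, name='global_step', trainable=False)
    _train(dataset, initial_ckpt='models/vgg_16.ckpt', supervison=1,
           learning_rate=1e-4, logs_path='logs/seg_liver',
           max_training_iters=5000, save_step=1000, display_step=50,
           global_step=global_step, number_slices=3, iter_mean_grad=4,
           batch_size=1, task_id=2)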
Example #28
def train(dataset, initial_ckpt, learning_rate, logs_path, max_training_iters, save_step, display_step,
           global_step, iter_mean_grad=1, batch_size=1, momentum=0.9, resume_training=False, config=None, finetune=1):

    """Train network
    Args:
    dataset: Reference to a Dataset object instance
    initial_ckpt: Path to the checkpoint used to initialize the network (may be the parent network or a pre-trained ImageNet model)
    learning_rate: Value for the learning rate. It can be a number or an instance of a learning rate object.
    logs_path: Path to store the checkpoints
    max_training_iters: Number of training iterations
    save_step: A checkpoint will be created every save_step iterations
    display_step: Training information will be displayed every display_step iterations
    global_step: Reference to a Variable that keeps track of the training steps
    iter_mean_grad: Number of gradient computations that are averaged before updating the weights
    batch_size: Size of the training batch
    momentum: Value of the momentum parameter for the Momentum optimizer
    resume_training: Boolean to try to restore from a previous checkpoint (True) or not (False)
    config: Reference to a Configuration object used in the creation of a Session
    finetune: Used to select the type of training: 0 for the parent network and 1 for finetuning
    Returns:
    """
    model_name = os.path.join(logs_path, "det_lesion.ckpt")
    if config is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

    tf.logging.set_verbosity(tf.logging.INFO)

    # Prepare the input data
    input_image = tf.placeholder(tf.float32, [batch_size, 80, 80, 3])
    input_label = tf.placeholder(tf.float32, [batch_size])
    is_training = tf.placeholder(tf.bool, shape=())
    
    tf.summary.histogram('input_label', input_label)

    # Create the network
    with slim.arg_scope(det_lesion_arg_scope()):
        net, end_points = det_lesion_resnet(input_image, is_training_option=is_training)

    # Initialize weights from pre-trained model
    if finetune == 0:
        init_weights = load_resnet_imagenet(initial_ckpt)

    # Define loss
    with tf.name_scope('losses'):
        loss, output, target = binary_cross_entropy(net, input_label)
        total_loss = loss + tf.add_n(tf.losses.get_regularization_losses())
        tf.summary.scalar('losses/total_loss', total_loss)
        tf.summary.histogram('losses/output', output)
        tf.summary.histogram('losses/target', target)

    # Define optimization method
    with tf.name_scope('optimization'):
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        #optimizer = tf.train.AdamOptimizer(learning_rate)
        grads_and_vars = optimizer.compute_gradients(total_loss)
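        # One ConditionalAccumulator per variable with a gradient: the accumulator ops
        # below each add one gradient, and 'take_gradients' later returns the mean of
        # iter_mean_grad accumulated gradients for a single weight update.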
        with tf.name_scope('grad_accumulator'):
            grad_accumulator = []
            for ind in range(0, len(grads_and_vars)):
                if grads_and_vars[ind][0] is not None:
                    grad_accumulator.append(tf.ConditionalAccumulator(grads_and_vars[ind][0].dtype))
        with tf.name_scope('apply_gradient'):
            grad_accumulator_ops = []
            for ind in range(0, len(grad_accumulator)):
                if grads_and_vars[ind][0] is not None:
                    var_name = str(grads_and_vars[ind][1].name).split(':')[0]
                    var_grad = grads_and_vars[ind][0]

                    if "weights" in var_name:
                        aux_layer_lr = 1.0
                    elif "biases" in var_name:
                        aux_layer_lr = 2.0
                    
                    grad_accumulator_ops.append(grad_accumulator[ind].apply_grad(var_grad*aux_layer_lr,
                                                                                 local_step=global_step))
        with tf.name_scope('take_gradients'):
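            # take_grad blocks until iter_mean_grad gradients have been applied to each
            # accumulator, returns their mean, and advances the accumulator's time step.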
            mean_grads_and_vars = []
            for ind in range(0, len(grad_accumulator)):
                if grads_and_vars[ind][0] is not None:
                    mean_grads_and_vars.append((grad_accumulator[ind].take_grad(iter_mean_grad), grads_and_vars[ind][1]))
            apply_gradient_op = optimizer.apply_gradients(mean_grads_and_vars, global_step=global_step)

    with tf.name_scope('metrics'):
        acc_op = my_accuracy(net, input_label)
        tf.summary.scalar('metrics/accuracy', acc_op)
        
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
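    # Run any collected update ops (e.g. batch-norm moving-average updates) whenever
    # total_loss is evaluated, by making the reported loss depend on them.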
    if update_ops:
        tf.logging.info('Gathering update_ops')
        with tf.control_dependencies(tf.tuple(update_ops)):
            total_loss = tf.identity(total_loss)
       
    merged_summary_op = tf.summary.merge_all()

    # Initialize variables
    init = tf.global_variables_initializer()

    with tf.Session(config=config) as sess:
        print('Init variable')
        sess.run(init)

        # op to write logs to Tensorboard
        logs_path_train = os.path.join(logs_path,'train')
        logs_path_test = os.path.join(logs_path,'test')
        #summary_writer = tf.summary.FileWriter(logs_path + '/train', graph=tf.get_default_graph())
        #test_writer = tf.summary.FileWriter(logs_path + '/test')
        summary_writer = tf.summary.FileWriter(logs_path_train, graph=tf.get_default_graph())
        test_writer = tf.summary.FileWriter(logs_path_test)

        # Create saver to manage checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        last_ckpt_path = tf.train.latest_checkpoint(logs_path)
        if last_ckpt_path is not None and resume_training:
            # Load last checkpoint
            print('Initializing from previous checkpoint...')
            saver.restore(sess, last_ckpt_path)
            step = global_step.eval() + 1
        else:
            # Load pre-trained model
            if finetune == 0:
                print('Initializing from pre-trained imagenet model...')
                init_weights(sess)
            else:
                print('Initializing from pre-trained model...')
                # init_weights(sess)
                var_list = []
                for var in tf.global_variables():
                    var_type = var.name.split('/')[-1]
                    if 'weights' in var_type or 'bias' in var_type:
                        var_list.append(var)
                saver_res = tf.train.Saver(var_list=var_list)
                saver_res.restore(sess, initial_ckpt)
            step = 1
        sess.run(interp_surgery(tf.global_variables()))
        print('Weights initialized')

        print('Start training')
        while step < max_training_iters + 1:
            # Average the gradient
            for iter_steps in range(0, iter_mean_grad):
                batch_image, batch_label, x_bb_train, y_bb_train, ids_train = dataset.next_batch(batch_size, 'train', 0.5)
                batch_image_val, batch_label_val, x_bb_val, y_bb_val, ids_val = dataset.next_batch(batch_size, 'val', 0.5)
                image = preprocess_img(batch_image, x_bb_train, y_bb_train, ids_train)
                label = batch_label
                val_image = preprocess_img(batch_image_val, x_bb_val, y_bb_val)
                label_val = batch_label_val
                run_res = sess.run([total_loss, merged_summary_op, acc_op] + grad_accumulator_ops,
                                   feed_dict={input_image: image, input_label: label, is_training: True})
                batch_loss = run_res[0]
                summary = run_res[1]
                acc = run_res[2]
                if step % display_step == 0:
                    val_run_res = sess.run([total_loss, merged_summary_op, acc_op],
                                           feed_dict={input_image: val_image, input_label: label_val, is_training: False})
                    val_batch_loss = val_run_res[0]
                    val_summary = val_run_res[1]
                    val_acc = val_run_res[2]

            # Apply the gradients
            sess.run(apply_gradient_op)

            # Save summary reports
            summary_writer.add_summary(summary, step)
            if step % display_step == 0:
                test_writer.add_summary(val_summary, step)

            # Display training status
            if step % display_step == 0:
                print("{} Iter {}: Training Loss = {:.4f}".format(datetime.now(), step, batch_loss, file=sys.stderr))
                print("{} Iter {}: Validation Loss = {:.4f}".format(datetime.now(), step, val_batch_loss, file=sys.stderr))
                print("{} Iter {}: Training Accuracy = {:.4f}".format(datetime.now(), step, acc, file=sys.stderr))
                print("{} Iter {}: Validation Accuracy = {:.4f}".format(datetime.now(), step, val_acc, file=sys.stderr)) 

            # Save a checkpoint
            if step % save_step == 0:
                save_path = saver.save(sess, model_name, global_step=global_step)
                print("Model saved in file: %s" % (save_path))

            step += 1

        if (step-1) % save_step != 0:
            save_path = saver.save(sess, model_name, global_step=global_step)
            print("Model saved in file: %s" % (save_path))

        print('Finished training.')
    def testAccumulatorSizeEmpty(self):
        with self.test_session():
            q = tf.ConditionalAccumulator(tf.float32, name="Q")
            self.assertEqual(q.num_accumulated().eval(), 0)
Beispiel #30
0
def train_finetune(dataset, model_params, learning_rate, logs_path, max_training_iters, save_step, display_step,
           global_step, iter_mean_grad=1, batch_size=1, resume_training=False, config=None, 
           use_image_summary=True, ckpt_name="osmn"):
    """Train OSMN
    Args:
    dataset: Reference to a Dataset object instance
    model_params: Model parameters; its checkpoint paths (whole_model_path, vis_mod_model_path,
            seg_model_path) determine whether the whole network, only the visual modulator, or the
            segmentation network is initialized from a pre-trained model
    learning_rate: Value for the learning rate. It can be a number or an instance to a learning rate object.
    logs_path: Path to store the checkpoints
    max_training_iters: Number of training iterations
    save_step: A checkpoint will be created every save_steps
    display_step: Information of the training will be displayed every display_steps
    global_step: Reference to a Variable that keeps track of the training steps
    iter_mean_grad: Number of gradient computations that are averaged before updating the weights
    batch_size: Size of the training batch
    resume_training: Boolean to try to restore from a previous checkpoint (True) or not (False)
    config: Reference to a Configuration object used in the creation of a Session
    use_image_summary: Boolean to use image summary during training in tensorboard
    ckpt_name: checkpoint name for saving
    Returns:
    """
    model_name = os.path.join(logs_path, ckpt_name+".ckpt")
    if config is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # config.log_device_placement = True
        config.allow_soft_placement = True

    tf.logging.set_verbosity(tf.logging.INFO)

    # Prepare the input data
    guide_image = tf.placeholder(tf.float32, [batch_size, 224, 224, 3])
    input_image = tf.placeholder(tf.float32, [batch_size, None, None, 3])
    gb_image = tf.placeholder(tf.float32, [batch_size, None, None, 1])
    input_label = tf.placeholder(tf.float32, [batch_size, None, None, 1])

    model_func = get_model_func(model_params.base_model)
    net, end_points = model_func([guide_image, gb_image, input_image], model_params, is_training=True)


    # Define loss
    with tf.name_scope('losses'):

        main_loss = class_balanced_cross_entropy_loss(net, input_label)
        tf.summary.scalar('main_loss', main_loss)

        total_loss = main_loss + tf.add_n(tf.losses.get_regularization_losses())
        tf.summary.scalar('total_loss', total_loss)

    # Define optimization method
    with tf.name_scope('optimization'):
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        grads_and_vars = optimizer.compute_gradients(total_loss)
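        # Accumulators are kept in a dict keyed by the index into grads_and_vars,
        # so variables without a gradient (None entries) are simply skipped.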
        with tf.name_scope('grad_accumulator'):
            grad_accumulator = {}
            for ind in range(0, len(grads_and_vars)):
                if grads_and_vars[ind][0] is not None:
                    grad_accumulator[ind] = tf.ConditionalAccumulator(grads_and_vars[ind][0].dtype)
        with tf.name_scope('apply_gradient'):
            grad_accumulator_ops = []
            for var_ind, grad_acc in grad_accumulator.items():
                var_name = str(grads_and_vars[var_ind][1].name).split(':')[0]
                var_grad = grads_and_vars[var_ind][0]
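                # apply_grad stamps each gradient with local_step=global_step, so the
                # accumulator can drop gradients computed against an outdated step.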
                grad_accumulator_ops.append(grad_acc.apply_grad(var_grad,
                                                                local_step=global_step))
        with tf.name_scope('take_gradients'):
            mean_grads_and_vars = []
            for var_ind, grad_acc in grad_accumulator.items():
                mean_grads_and_vars.append(
                    (grad_acc.take_grad(iter_mean_grad), grads_and_vars[var_ind][1]))
            apply_gradient_op = optimizer.apply_gradients(mean_grads_and_vars, global_step=global_step)
    # Log training info
    merged_summary_op = tf.summary.merge_all()

    # Log results on training images
    if use_image_summary:
        probabilities = tf.nn.sigmoid(net)
        input_image_orig = input_image / model_params.scale_value + model_params.mean_value
        guide_image_orig = guide_image / model_params.scale_value + model_params.mean_value
        img_summary = binary_seg_summary(input_image_orig, probabilities, gb_image, input_label)
        vg_summary = visual_guide_summary(guide_image_orig)
    # Initialize variables
    init = tf.global_variables_initializer()

    with tf.Session(config=config) as sess:
        print('Init variable')
        sess.run(init)
        tvars = tf.trainable_variables()
        # op to write logs to Tensorboard
        summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())

        # Create saver to manage checkpoints
        saver = tf.train.Saver(max_to_keep=40)

        last_ckpt_path = tf.train.latest_checkpoint(logs_path)
        if last_ckpt_path is not None and resume_training:
            # Load last checkpoint
            print('Initializing from previous checkpoint...')
            saver.restore(sess, last_ckpt_path)
            step = global_step.eval() + 1
        elif model_params.whole_model_path == '':
            print('Initializing from pre-trained imagenet model...')
            if model_params.use_visual_modulator:
                load_model(model_params.vis_mod_model_path, 'osmn/modulator')(sess)
            if model_params.seg_model_path != '':
                load_model(model_params.seg_model_path, 'osmn/seg')(sess)
            step = 1
        else:
            print('Initializing from pre-trained model...')
            load_model(model_params.whole_model_path, 'osmn')(sess)
            step = 1
        #if model_params.base_model != 'lite':
        sess.run(interp_surgery(tf.global_variables()))
        print('Weights initialized')

        print('Start training')
        while step < max_training_iters + 1:
            # Average the gradient
            for _ in range(0, iter_mean_grad):
                batch_g_image, batch_gb_image, batch_image, batch_label = dataset.next_batch(batch_size, 'train')
                run_res = sess.run([total_loss, merged_summary_op] + grad_accumulator_ops,
                        feed_dict={guide_image: batch_g_image, gb_image: batch_gb_image,
                        input_image: batch_image, input_label: batch_label})
                batch_loss = run_res[0]
                summary = run_res[1]

            # Apply the gradients
            sess.run(apply_gradient_op)  # the Adam optimizer updates its slot statistics here

            # Save summary reports
            summary_writer.add_summary(summary, step)

            # Display training status
            if step % display_step == 0:
                if use_image_summary:
                    #test_g_image, test_gb_image, test_image, _ = dataset.next_batch(batch_size, 'test')
                    curr_img_summary = sess.run([img_summary, vg_summary], feed_dict={guide_image:batch_g_image, gb_image:batch_gb_image,
                        input_image: batch_image, input_label: batch_label})
                    for s in curr_img_summary:
                        summary_writer.add_summary(s, step)
                print("{} Iter {}: Training Loss = {:.4f}".format(datetime.now(), step, batch_loss),file=sys.stderr)

            # Save a checkpoint
            if step % save_step == 0:
                save_path = saver.save(sess, model_name, global_step=global_step)
                print("Model saved in file: %s" % save_path)

            step += 1

        if (step - 1) % save_step != 0:
            save_path = saver.save(sess, model_name, global_step=global_step)
            print("Model saved in file: %s" % save_path)

        print('Finished training.')
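# A minimal, self-contained sketch (not taken from any example above) of the
# gradient-averaging pattern both training functions use: gradients from
# iter_mean_grad mini-batches are stored in tf.ConditionalAccumulator objects
# and their mean is applied in a single optimizer step. All names below are
# illustrative only.
import tensorflow as tf

def build_averaged_train_op(total_loss, optimizer, global_step, iter_mean_grad=4):
    grads_and_vars = optimizer.compute_gradients(total_loss)
    accum_ops = []
    mean_grads_and_vars = []
    for grad, var in grads_and_vars:
        if grad is None:
            continue
        acc = tf.ConditionalAccumulator(grad.dtype)
        # Gradients stamped with an outdated local_step are silently dropped.
        accum_ops.append(acc.apply_grad(grad, local_step=global_step))
        # take_grad blocks until iter_mean_grad gradients were applied, then
        # returns their mean.
        mean_grads_and_vars.append((acc.take_grad(iter_mean_grad), var))
    apply_gradient_op = optimizer.apply_gradients(mean_grads_and_vars, global_step=global_step)
    return accum_ops, apply_gradient_op

# Usage: run accum_ops once per mini-batch, iter_mean_grad times, then run
# apply_gradient_op once to update the weights with the averaged gradient.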