Example #1
    def test_horovod_allreduce_error(self):
        """Test that the allreduce raises an error if different ranks try to
        send tensors of different rank or dimension."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            # Same rank, different dimension
            tf.set_random_seed(1234)
            dims = [17 + rank] * 3
            tensor = tf.random_uniform(dims, -1.0, 1.0)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor))

            # Same number of elements, different rank
            tf.set_random_seed(1234)
            if rank == 0:
                dims = [17, 23 * 57]
            else:
                dims = [17, 23, 57]
            tensor = tf.random_uniform(dims, -1.0, 1.0)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor))
Example #2
    def test_horovod_allreduce_cpu(self):
        """Test on CPU that the allreduce correctly sums 1D, 2D, 3D tensors."""
        hvd.init()
        size = hvd.size()
        with self.test_session() as session:
            dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                with tf.device("/cpu:0"):
                    tf.set_random_seed(1234)
                    tensor = tf.random_uniform(
                        [17] * dim, -100, 100, dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)
                multiplied = tensor * size
                max_difference = tf.reduce_max(tf.abs(summed - multiplied))

                # Threshold for floating point equality depends on number of
                # ranks, since we're comparing against precise multiplication.
                if size <= 3:
                    threshold = 0
                elif size < 10:
                    threshold = 1e-4
                elif size < 15:
                    threshold = 5e-4
                else:
                    break

                diff = session.run(max_difference)
                self.assertTrue(diff <= threshold,
                                "hvd.allreduce produces incorrect results")
Example #3
    def test_horovod_allreduce_grad(self):
        """Test the correctness of the allreduce gradient."""
        hvd.init()
        size = hvd.size()

        with self.test_session(config=self.config) as session:
            # As of TensorFlow v1.9, gradients are not supported on
            # integer tensors
            dtypes = [tf.float32, tf.float64]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                with tf.device("/cpu:0"):
                    tf.set_random_seed(1234)
                    tensor = tf.random_uniform(
                        [5] * dim, -100, 100, dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)

                grad_ys = tf.ones([5] * dim)
                grad = tf.gradients(summed, tensor, grad_ys)[0]
                grad_out = session.run(grad)

                expected = np.ones([5] * dim) * size
                err = np.linalg.norm(expected - grad_out)
                self.assertLess(err, 0.00000001,
                                "gradient %s differs from expected %s, "
                                "error: %s" % (grad_out, expected, str(err)))
Example #4
    def test_horovod_allreduce_cpu_gpu_error(self):
        """Test that the allreduce raises an error if different ranks try to
        perform reduction on CPU and GPU."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        device = "/gpu:0" if local_rank % 2 == 0 else "/cpu:0"
        one_gpu = tf.GPUOptions(visible_device_list=str(local_rank))
        gpu_config = tf.ConfigProto(gpu_options=one_gpu)
        with self.test_session(config=gpu_config) as session:
            with tf.device(device):
                # Same rank and dimension, but some ranks reduce on CPU and others on GPU
                dims = [17] * 3
                tensor = tf.ones(dims, dtype=tf.int32)
                with self.assertRaises(tf.errors.FailedPreconditionError):
                    session.run(hvd.allreduce(tensor))
Example #5
def allreduce(value, name=None, average=True):
    """
    Perform an allreduce on a tensor-compatible value.

    Arguments:
        value: A tensor-compatible value to reduce.
               The shape of the input must be identical across all ranks.
        name: Optional name for the constants created by this operation.
        average: If True, computes the average over all ranks.
                 Otherwise, computes the sum over all ranks.

    Returns:
        The reduced value, evaluated in the current Keras session.
    """
    allreduce_op = hvd.allreduce(tf.constant(value, name=name), average=average)
    return K.get_session().run(allreduce_op)
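A minimal usage sketch for the helper above; the values 0.83 and 1024 are hypothetical per-rank metrics, not part of the original example:

import horovod.tensorflow as hvd

hvd.init()
# Every rank passes its own local value; the collective returns the
# average (or, with average=False, the sum) across all workers.
global_val_acc = allreduce(0.83, name='val_acc', average=True)
total_samples = allreduce(1024, name='num_samples', average=False)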
Example #6
    def _setup_graph(self):
        num_gpu = cfg.TRAIN.NUM_GPUS
        if cfg.TRAINER == 'replicated':
            # Use two predictor threads per GPU to get better throughput
            self.num_predictor = num_gpu * 2
            self.predictors = [self._build_coco_predictor(k % num_gpu) for k in range(self.num_predictor)]
            self.dataflows = [get_eval_dataflow(shard=k, num_shards=self.num_predictor)
                              for k in range(self.num_predictor)]
        else:
            # Only eval on the first machine.
            # Alternatively, can eval on all ranks and use allgather, but allgather sometimes hangs
            self._horovod_run_eval = hvd.rank() == hvd.local_rank()
            if self._horovod_run_eval:
                self.predictor = self._build_coco_predictor(0)
                self.dataflow = get_eval_dataflow(shard=hvd.local_rank(), num_shards=hvd.local_size())

            self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
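The last line builds a barrier out of an allreduce: the collective cannot complete until every rank has joined, so running the op synchronizes all workers. A standalone sketch of the same idiom (TF1 graph mode; the per-rank work is elided):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
# Any tiny allreduce serves as a barrier: the op only completes once
# every rank has submitted its tensor.
barrier_op = hvd.allreduce(tf.constant(0.0))

with tf.Session() as sess:
    # ... per-rank work ...
    sess.run(barrier_op)  # all ranks block here until everyone arrives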
Example #7
    def test_horovod_allreduce_multi_gpu(self):
        """Test that the allreduce works on multiple GPUs.

        This test will crash badly if used with an MPI implementation that does
        not support GPU memory transfers directly, as it will call MPI_Send on
        a GPU data pointer."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        iter = 0
        two_gpus = tf.GPUOptions(visible_device_list=(
            '%d,%d' % (local_rank * 2, local_rank * 2 + 1)))
        gpu_config = tf.ConfigProto(gpu_options=two_gpus)
        with self.test_session(config=gpu_config) as session:
            dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                iter += 1
                with tf.device("/gpu:%d" % ((iter + local_rank) % 2)):
                    tf.set_random_seed(1234)
                    tensor = tf.random_uniform(
                        [17] * dim, -100, 100, dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)
                multiplied = tensor * size
                max_difference = tf.reduce_max(tf.abs(summed - multiplied))

                # Threshold for floating point equality depends on number of
                # ranks, since we're comparing against precise multiplication.
                if size <= 3:
                    threshold = 0
                elif size < 10:
                    threshold = 1e-4
                elif size < 15:
                    threshold = 5e-4
                else:
                    return

                diff = session.run(max_difference)
                self.assertTrue(diff <= threshold,
                                "hvd.allreduce on GPU produces incorrect results")
Example #8
    def test_horovod_allreduce_type_error(self):
        """Test that the allreduce raises an error if different ranks try to
        send tensors of different type."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            # Same rank and dimension, but different dtype across ranks
            dims = [17] * 3
            tensor = tf.ones(dims,
                             dtype=tf.int32 if rank % 2 == 0 else tf.float32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor))
Example #9
    def test_horovod_allreduce_gpu_fused(self):
        """Test that the allreduce works on GPUs with Tensor Fusion.

        This test will crash badly if used with an MPI implementation that does
        not support GPU memory transfers directly, as it will call MPI_Send on
        a GPU data pointer."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        with self.test_session(config=self.config) as session:
            dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
            dims = [1, 2, 3]
            tests = []
            for dtype, dim in itertools.product(dtypes, dims):
                with tf.device("/gpu:%d" % local_rank):
                    tf.set_random_seed(1234)
                    tensor = tf.random_uniform(
                        [17] * dim, -100, 100, dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)
                multiplied = tensor * size
                max_difference = tf.reduce_max(tf.abs(summed - multiplied))

                # Threshold for floating point equality depends on number of
                # ranks, since we're comparing against precise multiplication.
                if size <= 3 or dtype in [tf.int32, tf.int64]:
                    threshold = 0
                elif size < 10:
                    threshold = 1e-4
                elif size < 15:
                    threshold = 5e-4
                else:
                    return

                test = max_difference <= threshold
                tests.append(test)
            self.assertTrue(session.run(tf.reduce_all(tests)),
                            "hvd.allreduce produces incorrect results")
Example #10
  def get_apply_grads_op(self, loss, var_list):
    """
    :param tf.Tensor loss:
    :param list[tf.Variable] var_list:
    :return: op with all variable updates combined, using the optimizer
    :rtype: tf.Operation
    """
    # The following code is basically extended self.optimizer.minimize(), to optionally modify gradients.
    from Util import make_hashable
    if not var_list:
      return tf.no_op(name="no_grad_vars_no_op")

    grads_and_vars = self._compute_gradients(loss, var_list=var_list)
    if self.config.is_true("use_horovod") and self.config.value("horovod_reduce_type", "") == "grad":
      # noinspection PyPackageRequirements,PyUnresolvedReferences
      import horovod.tensorflow as hvd
      grads_and_vars = [
        (hvd.allreduce(grad, average=self.config.is_true("horovod_avg_grad")) if grad is not None else None, var)
        for (grad, var) in grads_and_vars]

    var_grads = {var: grad for (grad, var) in grads_and_vars if grad is not None}
    if not var_grads:
      raise Exception("no single variable to train")
    global_info = self._GetGlobalInfo(optimizer=self, all_vars=var_list, var_grads=var_grads)
    if self.config.bool_or_other("debug_grad_summaries", False):
      tf.summary.scalar("global_grad_norm", global_info.get_global_grad_norm())
    grads_per_apply_grad_opts = {}  # dict apply_grad_opts -> list of (grad, var)
    for grad, var in grads_and_vars:
      assert var in var_list
      if grad is None:
        continue
      new_grad, apply_grad_opts = self._post_process_grad(grad=grad, var=var, global_info=global_info)
      grads_per_apply_grad_opts.setdefault(make_hashable(apply_grad_opts), []).append((new_grad, var))

    all_apply_grads = []
    assert grads_per_apply_grad_opts
    for apply_grad_opts, grads_and_vars_per_opts in grads_per_apply_grad_opts.items():
      all_apply_grads.append(self._apply_gradients(grads_and_vars_per_opts, **apply_grad_opts))
    if len(all_apply_grads) == 1:
      return all_apply_grads[0]
    return tf.group(*all_apply_grads)
Example #11
  def compute_gradients(self, *args, **kwargs):
    """Compute gradients of all trainable variables.
    See Optimizer.compute_gradients() for more info.
    In DistributedOptimizer, compute_gradients() is overridden to also
    allreduce the gradients before returning them.
    """
    gradients = self._optimizer.compute_gradients(*args, **kwargs)
    from horovod.common import size
    from horovod.tensorflow import allreduce

    if size() > 1:
      averaged_gradients = []
      with tf.name_scope(self._name + "_Allreduce"):
        for grad, var in gradients:
          if grad is not None:
            avg_grad = allreduce(grad, device_dense=self._device_dense,
                                 device_sparse=self._device_sparse)
            averaged_gradients.append((avg_grad, var))
          else:
            averaged_gradients.append((None, var))
      return averaged_gradients
    else:
      return gradients
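This override is the heart of Horovod's DistributedOptimizer; in user code the wrapper is applied once around a regular optimizer. A sketch with a toy loss (the placeholder model below is a stand-in, not part of the example above):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
x = tf.placeholder(tf.float32, [None, 10])
w = tf.Variable(tf.zeros([10, 1]))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - 1.0))

# Scale the LR by the number of workers, then wrap: compute_gradients()
# now allreduces every gradient, as in the override shown above.
opt = tf.train.AdamOptimizer(learning_rate=0.001 * hvd.size())
opt = hvd.DistributedOptimizer(opt)
train_op = opt.minimize(loss)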
Example #12
    def get_gradients(self, loss, params):
        """
        Compute gradients of all trainable variables.

        See Optimizer.get_gradients() for more info.

        In DistributedOptimizer, get_gradients() is overridden to also
        allreduce the gradients before returning them.
        """
        gradients = super(self.__class__, self).get_gradients(loss, params)
        if hvd.size() > 1:
            averaged_gradients = []
            with tf.name_scope(self._name + "_Allreduce"):
                for grad in gradients:
                    if grad is not None:
                        avg_grad = hvd.allreduce(grad, device_dense=self._device_dense,
                                                 device_sparse=self._device_sparse)
                        averaged_gradients.append(avg_grad)
                    else:
                        averaged_gradients.append(None)
                return averaged_gradients
        else:
            return gradients
Example #13
def allreduce(model, opt, gradient_accumulator, loss, mlm_loss, mlm_acc,
              sop_loss, sop_acc):
    grads = gradient_accumulator.gradients
    # This, which is equivalent to sparse_as_dense=True, gives a mild 2% speedup from 0.62 it/s to 0.63 it/s
    # on BERT-large multinode.
    grads = [
        tf.convert_to_tensor(grad)
        if grad is not None and isinstance(grad, tf.IndexedSlices) else grad
        for grad in grads
    ]

    # TODO: Does placing this clip before or after allreduce affect accuracy?
    # Placing before has a regularization effect, no single example can contribute as much.
    # Placing before also gives a 20% speedup when training BERT-large, probably because the
    # gradient operations can be fused by XLA.
    (grads, grad_norm) = tf.clip_by_global_norm(grads,
                                                clip_norm=args.max_grad_norm)

    grads = [
        hvd.allreduce(grad, compression=hvd.Compression.fp16)
        if grad is not None else None for grad in grads
    ]

    opt.apply_gradients([
        (grad, var) for (grad, var) in zip(grads, model.trainable_variables)
        if grad is not None
    ])

    # Clear the gradient accumulator
    gradient_accumulator.reset()

    loss = hvd.allreduce(loss)
    mlm_loss = hvd.allreduce(mlm_loss)
    mlm_acc = hvd.allreduce(mlm_acc)
    sop_loss = hvd.allreduce(sop_loss)
    sop_acc = hvd.allreduce(sop_acc)

    return loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm
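The gradients above are allreduced with fp16 compression. The same flag applies to any tensor; a minimal sketch (the random tensor stands in for a computed gradient):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
grad = tf.random.uniform([1024])  # stand-in for one computed gradient
# Compression.fp16 casts to float16 for the reduction and back afterwards,
# halving allreduce traffic at some cost in precision.
reduced = hvd.allreduce(grad, compression=hvd.Compression.fp16)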
Example #14
def main():
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')
    tf.config.threading.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
    tf.config.threading.inter_op_parallelism_threads = max(
        2, 40 // hvd.size() - 2)
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

    cmdline = add_cli_args()
    FLAGS, unknown_args = cmdline.parse_known_args()

    if FLAGS.fine_tune:
        raise NotImplementedError('fine tuning functionality not available')

    if not FLAGS.xla_off:
        tf.config.optimizer.set_jit(True)
    if not FLAGS.fp32:
        tf.config.optimizer.set_experimental_options(
            {"auto_mixed_precision": True})

    preprocessing_type = 'resnet'
    if FLAGS.model == 'resnet50v1_b':
        model = resnet.ResNet50V1_b(weights=None,
                                    weight_decay=FLAGS.l2_weight_decay,
                                    classes=FLAGS.num_classes)
    elif FLAGS.model == 'resnet50v1_c':
        model = resnet.ResNet50V1_c(weights=None,
                                    weight_decay=FLAGS.l2_weight_decay,
                                    classes=FLAGS.num_classes)
    elif FLAGS.model == 'resnet50v1_d':
        model = resnet.ResNet50V1_d(weights=None,
                                    weight_decay=FLAGS.l2_weight_decay,
                                    classes=FLAGS.num_classes)
    elif FLAGS.model == 'resnet101v1_b':
        model = resnet.ResNet101V1_b(weights=None,
                                     weight_decay=FLAGS.l2_weight_decay,
                                     classes=FLAGS.num_classes)
    elif FLAGS.model == 'resnet101v1_c':
        model = resnet.ResNet101V1_c(weights=None,
                                     weight_decay=FLAGS.l2_weight_decay,
                                     classes=FLAGS.num_classes)
    elif FLAGS.model == 'resnet101v1_d':
        model = resnet.ResNet101V1_d(weights=None,
                                     weight_decay=FLAGS.l2_weight_decay,
                                     classes=FLAGS.num_classes)
    elif FLAGS.model == 'darknet53':
        model = darknet.Darknet(weight_decay=FLAGS.l2_weight_decay)
    elif FLAGS.model in ['hrnet_w18c', 'hrnet_w32c']:
        preprocessing_type = 'imagenet'
        model = hrnet.build_hrnet(FLAGS.model)
        model._set_inputs(tf.keras.Input(shape=(None, None, 3)))
    else:
        raise NotImplementedError('Model {} not implemented'.format(
            FLAGS.model))

    model.summary()

    # scale learning rate linearly, base learning rate for batch size of 256 is specified through args
    BASE_LR = FLAGS.learning_rate
    learning_rate = (BASE_LR * hvd.size() * FLAGS.batch_size) / 256
    steps_per_epoch = int(
        (FLAGS.train_dataset_size / (FLAGS.batch_size * hvd.size())))

    # 5 epochs are for warmup
    if FLAGS.schedule == 'piecewise_short':
        scheduler = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            boundaries=[
                steps_per_epoch * 25, steps_per_epoch * 55,
                steps_per_epoch * 75, steps_per_epoch * 100
            ],
            values=[
                learning_rate, learning_rate * 0.1, learning_rate * 0.01,
                learning_rate * 0.001, learning_rate * 0.0001
            ])
    elif FLAGS.schedule == 'piecewise_long':
        scheduler = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            boundaries=[
                steps_per_epoch * 55, steps_per_epoch * 115,
                steps_per_epoch * 175
            ],
            values=[
                learning_rate, learning_rate * 0.1, learning_rate * 0.01,
                learning_rate * 0.001
            ])
    elif FLAGS.schedule == 'cosine':
        scheduler = tf.keras.experimental.CosineDecayRestarts(
            initial_learning_rate=learning_rate,
            first_decay_steps=FLAGS.num_epochs * steps_per_epoch,
            t_mul=1,
            m_mul=1)
    else:
        print('No schedule specified')

    scheduler = WarmupScheduler(optimizer=scheduler,
                                initial_learning_rate=learning_rate /
                                hvd.size(),
                                warmup_steps=steps_per_epoch * 5)

    # TODO: support optimizer choice via config
    # opt = tf.keras.optimizers.SGD(learning_rate=scheduler, momentum=FLAGS.momentum, nesterov=True) # needs momentum correction term
    opt = MomentumOptimizer(learning_rate=scheduler,
                            momentum=FLAGS.momentum,
                            nesterov=True)

    if not FLAGS.fp32:
        opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            opt, loss_scale=128.)

    loss_func = tf.keras.losses.CategoricalCrossentropy(
        from_logits=True,
        label_smoothing=FLAGS.label_smoothing,
        reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE)

    if hvd.rank() == 0:
        if FLAGS.resume_from:
            model = tf.keras.models.load_model(FLAGS.resume_from)
            print('loaded model from', FLAGS.resume_from)
        model_dir = os.path.join(
            FLAGS.model +
            datetime.datetime.now().strftime("_%Y-%m-%d_%H-%M-%S"))
        path_logs = os.path.join(os.getcwd(), model_dir, 'log.csv')
        os.mkdir(model_dir)
        logging.basicConfig(
            filename=path_logs,
            filemode='a',
            format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
            datefmt='%H:%M:%S',
            level=logging.DEBUG)
        logging.info("Training Logs")
        logger = logging.getLogger('logger')
        logger.info('Training options: %s', FLAGS)

    # barrier: a trivial allreduce forces all ranks to synchronize here
    hvd.allreduce(tf.constant(0))

    start_time = time()
    curr_step = tf.Variable(initial_value=0, dtype=tf.int32)
    best_validation_accuracy = 0.7  # only save 0.7 or higher checkpoints

    data = create_dataset(FLAGS.train_data_dir,
                          FLAGS.batch_size,
                          preprocessing=preprocessing_type,
                          validation=False)
    validation_data = create_dataset(FLAGS.validation_data_dir,
                                     FLAGS.batch_size,
                                     preprocessing=preprocessing_type,
                                     validation=True)

    for epoch in range(FLAGS.num_epochs):
        if hvd.rank() == 0:
            print('Starting training Epoch %d/%d' % (epoch, FLAGS.num_epochs))
        training_score = 0
        for batch, (images, labels) in enumerate(tqdm(data)):
            # momentum correction (V2 SGD absorbs LR into the update term)
            # prev_lr = opt._optimizer.learning_rate(curr_step-1)
            # curr_lr = opt._optimizer.learning_rate(curr_step)
            # momentum_correction_factor = curr_lr / prev_lr
            # opt._optimizer.momentum = opt._optimizer.momentum * momentum_correction_factor
            loss, score = train_step(model,
                                     opt,
                                     loss_func,
                                     images,
                                     labels,
                                     batch == 0 and epoch == 0,
                                     batch_size=FLAGS.batch_size,
                                     mixup_alpha=FLAGS.mixup_alpha,
                                     fp32=FLAGS.fp32)
            # # restore momentum
            # opt._optimizer.momentum = FLAGS.momentum
            training_score += score.numpy()
            curr_step.assign_add(1)
        training_accuracy = training_score / (FLAGS.batch_size * (batch + 1))
        average_training_accuracy = hvd.allreduce(
            tf.constant(training_accuracy))
        average_training_loss = hvd.allreduce(tf.constant(loss))
        if hvd.rank() == 0:
            print('Starting validation Epoch %d/%d' %
                  (epoch, FLAGS.num_epochs))
        validation_score = 0
        counter = 0
        for images, labels in tqdm(validation_data):
            loss, score = validation_step(images, labels, model, loss_func)
            validation_score += score.numpy()
            counter += 1
        validation_accuracy = validation_score / (FLAGS.batch_size * counter)
        average_validation_accuracy = hvd.allreduce(
            tf.constant(validation_accuracy))
        average_validation_loss = hvd.allreduce(tf.constant(loss))

        if hvd.rank() == 0:
            info_str = 'Epoch: %d, Train Accuracy: %f, Train Loss: %f, Validation Accuracy: %f, Validation Loss: %f, LR: %f' % (
                epoch, average_training_accuracy, average_training_loss,
                average_validation_accuracy, average_validation_loss,
                scheduler(curr_step))
            print(info_str)
            logger.info(info_str)
            if average_validation_accuracy > best_validation_accuracy:
                logger.info("Found new best accuracy, saving checkpoint ...")
                best_validation_accuracy = average_validation_accuracy
                model.save('{}/{}'.format(FLAGS.model_dir, FLAGS.model))
    if hvd.rank() == 0:
        logger.info('Total Training Time: %f' % (time() - start_time))
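For reference, the linear learning-rate scaling used above, worked through with hypothetical values (8 workers, per-worker batch of 32):

BASE_LR = 0.1       # reference LR, defined for a global batch of 256
size = 8            # hvd.size()
batch_size = 32     # per-worker batch
learning_rate = BASE_LR * size * batch_size / 256  # 0.1 * 8 * 32 / 256 = 0.1
# The global batch is 8 * 32 = 256, so the scaled LR equals BASE_LR exactly.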
Example #15
    def _make_variable(self, metric, value):
        with tf.name_scope('MetricAverageCallback'):
            var = tf.Variable(value, name=metric)
            K.get_session().run(var.initializer)
            allreduce_op = hvd.allreduce(var, device_dense=self.device)
            return var, allreduce_op
Example #16
    def __init__(self,
                 league_mgr_addr,
                 model_pool_addrs,
                 learner_ports,
                 rm_size,
                 batch_size,
                 ob_space,
                 ac_space,
                 policy,
                 gpu_id,
                 policy_config={},
                 ent_coef=1e-2,
                 distill_coef=1e-2,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 rwd_shape=False,
                 pub_interval=500,
                 log_interval=100,
                 save_interval=0,
                 total_timesteps=5e7,
                 burn_in_timesteps=0,
                 learner_id='',
                 batch_worker_num=4,
                 pull_worker_num=2,
                 unroll_length=32,
                 rollout_length=1,
                 use_mixed_precision=False,
                 use_sparse_as_dense=True,
                 adam_beta1=0.9,
                 adam_beta2=0.999,
                 adam_eps=1e-5,
                 data_type=PGData,
                 data_server_version='v1',
                 decode=False,
                 log_infos_interval=20,
                 **kwargs):
        super(PGLearner, self).__init__(league_mgr_addr, model_pool_addrs,
                                        learner_ports, learner_id)

        self.LR = tf.placeholder(tf.float32, [])
        """Learning Rate"""

        self.CLIPRANGE = tf.placeholder(tf.float32, [])
        """Clip Range"""

        self.ep_loss_coef = {}
        """Coefficients for those losses from the endpoints. Override it in derived
     class."""

        # TODO(pengsun): fix the policy_config default value
        self._init_const(total_timesteps, burn_in_timesteps, batch_size,
                         unroll_length, rwd_shape, ent_coef, vf_coef,
                         pub_interval, log_interval, save_interval, policy,
                         distill_coef, policy_config, rollout_length)

        # allow_soft_placement=True can fix issue when some op cannot be defined on
        # GPUs for tf-1.8.0; tf-1.13.1 does not have this issue
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(gpu_id)
        self.sess = tf.Session(config=config)
        self.rank = hvd.rank() if has_hvd else 0

        # Prepare dataset
        ds = data_type(ob_space,
                       ac_space,
                       self.n_v,
                       use_lstm=self.rnn,
                       hs_len=self.hs_len,
                       distillation=self.distillation,
                       version='v2')
        self._data_server = DataServer(self._pull_data,
                                       rm_size,
                                       unroll_length,
                                       batch_size,
                                       ds,
                                       gpu_id_list=(0, ),
                                       batch_worker_num=batch_worker_num,
                                       pull_worker_num=pull_worker_num,
                                       rollout_length=rollout_length,
                                       prefetch_buffer_size=2,
                                       version=data_server_version,
                                       decode=decode,
                                       log_infos_interval=log_infos_interval)

        # prepare net config
        net_config = policy.net_config_cls(ob_space, ac_space, **policy_config)
        net_config.clip_range = self.CLIPRANGE
        if rwd_shape:
            # make net_config.reward-shaping-weights a tf.placeholder so as to change
            # it during training.
            # NOTE: Assume there is reward_weights_shape in net_config
            # TODO(pengsun): use NetInputsData instead of this quick-and-dirty hacking?
            reward_weights_shape = net_config.reward_weights_shape
            self.rwd_weights = tf.placeholder(tf.float32, reward_weights_shape)
            net_config.reward_weights = self.rwd_weights
        if hasattr(net_config, 'lam'):
            # make net_config.lambda-for-td-lambda a tf.placeholder so as to change it
            #  during training.
            # TODO(pengsun): use NetInputsData instead of this quick-and-dirty hacking?
            self.LAM = tf.placeholder(tf.float32, [])
            net_config.lam = self.LAM
        else:
            self.LAM = None

        # build the policy net
        with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as model_scope:
            pass

        def create_policy(inputs, nc):
            return policy.net_build_fun(inputs=inputs,
                                        nc=nc,
                                        scope=model_scope)

        device = '/gpu:{}'.format(0)
        with tf.device(device):
            input_data = self._data_server.input_datas[0]
            if 'use_xla' in policy_config and policy_config['use_xla']:
                try:
                    # Use TensorFlow's accelerated linear algebra (XLA) compiler
                    with tf.xla.experimental.jit_scope(True):
                        model = create_policy(input_data, net_config)
                except:
                    logger.log(
                        "WARNING: using tf.xla requires tf version>=1.15.")
                    model = create_policy(input_data, net_config)
            else:
                model = create_policy(input_data, net_config)
            loss, vf_loss, losses = self.build_loss(model, input_data)
        if has_hvd:
            self.losses = [hvd.allreduce(loss) for loss in losses]
        else:
            self.losses = list(losses)
        self.params = tf.trainable_variables(scope='model')
        self.params_vf = tf.trainable_variables(scope='model/vf')
        self.param_norm = tf.global_norm(self.params)

        self.trainer = tf.train.AdamOptimizer(learning_rate=self.LR,
                                              beta1=adam_beta1,
                                              beta2=adam_beta2,
                                              epsilon=adam_eps)
        self.burn_in_trainer = tf.train.AdamOptimizer(
            learning_rate=self.LR, epsilon=1e-5)  # same as default and IL
        if use_mixed_precision:
            try:
                self.trainer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                    self.trainer)
                self.burn_in_trainer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                    self.burn_in_trainer)
            except:
                logger.warn(
                    "using tf mixed_precision requires tf version>=1.15.")
        if has_hvd:
            self.trainer = hvd.DistributedOptimizer(
                self.trainer, sparse_as_dense=use_sparse_as_dense)
            self.burn_in_trainer = hvd.DistributedOptimizer(
                self.burn_in_trainer, sparse_as_dense=use_sparse_as_dense)
        grads_and_vars = self.trainer.compute_gradients(loss, self.params)
        grads_and_vars_vf = self.burn_in_trainer.compute_gradients(
            vf_loss, self.params_vf)
        clip_vars = model.vars.lstm_vars
        grads_and_vars, self.clip_grad_norm, self.nonclip_grad_norm = self.clip_grads_vars(
            grads_and_vars, clip_vars, max_grad_norm)
        grads_and_vars_vf, self.clip_grad_norm_vf, self.nonclip_grad_norm_vf = self.clip_grads_vars(
            grads_and_vars_vf, clip_vars, max_grad_norm)

        self._train_batch = self.trainer.apply_gradients(grads_and_vars)
        self._burn_in = self.burn_in_trainer.apply_gradients(grads_and_vars_vf)
        self.loss_endpoints_names = model.loss.loss_endpoints.keys()
        self._build_ops()
        if has_hvd:
            barrier_op = hvd.allreduce(tf.Variable(0.))
            broadcast_op = hvd.broadcast_global_variables(0)
        tf.global_variables_initializer().run(session=self.sess)
        self.sess.graph.finalize()

        self.barrier = lambda: self.sess.run(barrier_op) if has_hvd else None
        self.broadcast = lambda: self.sess.run(broadcast_op
                                               ) if has_hvd else None
        self.broadcast()
        # logging stuff
        format_strs = (['stdout', 'log', 'tensorboard', 'csv'] if self.rank
                       == 0 else ['stdout', 'log', 'tensorboard', 'csv'])
        logger.configure(dir='training_log/{}rank{}'.format(
            self._learner_id, self.rank),
                         format_strs=format_strs)
Example #17
def run_barrier(config):
    barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32))
    print(tf.Session(config=config).run(barrier))
    print("============")
Example #18
    def common_minimize_trainable(self, base_opt, test_opt, name):
        from tensorflow.python.framework.errors_impl import NotFoundError

        # TODO(rhdong): Re-enable this test once the horovod import error is fixed on macOS+TF2.7+.
        try:
            import horovod.tensorflow as hvd
        except NotFoundError:
            self.skipTest(
                "Skip the test for horovod import error with Tensorflow-2.7.0 on MacOS-12."
            )

        tf.config.set_soft_device_placement(True)
        hvd.init()
        base_opt = de.DynamicEmbeddingOptimizer(base_opt, synchronous=True)
        for dtype, run_step, dim in itertools.product([dtypes.float32], [1],
                                                      [10]):
            x = tf.random.uniform(shape=[32, dim])
            y = tf.zeros([32, 1])

            global_step = training_util.create_global_step()

            base_weight = tf.compat.v1.get_variable(name="base_weights",
                                                    initializer=tf.ones(
                                                        [10, 1]))

            base_logits = tf.nn.relu(math_ops.matmul(x, base_weight))
            base_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=y, logits=base_logits)

            base_opt_op = base_opt.minimize(base_loss,
                                            global_step,
                                            var_list=[base_weight])

            test_weight = tf.compat.v1.get_variable(name="test_weights",
                                                    initializer=tf.ones(
                                                        [10, 1]))

            test_logits = tf.nn.relu(math_ops.matmul(x, test_weight))
            test_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=y, logits=test_logits)

            grads_and_vars = test_opt.compute_gradients(test_loss,
                                                        var_list=[test_weight])
            var_list = []
            aggregated_grad = []
            for grad, var in grads_and_vars:
                var_list.append(var)
                aggregated_grad.append(hvd.allreduce(grad, op=hvd.Sum))
            aggregated_grads_and_vars = zip(aggregated_grad, var_list)
            test_opt_op = test_opt.apply_gradients(aggregated_grads_and_vars,
                                                   global_step)

            with monitored_session.MonitoredTrainingSession(
                    is_chief=True, config=default_config) as sess:

                for _ in range(run_step):
                    sess.run(base_opt_op)
                    sess.run(test_opt_op)

                self.assertAllCloseAccordingToType(
                    sess.run(base_weight),
                    sess.run(test_weight),
                    msg="Cond:{},{},{}".format(dtype, run_step, dim),
                )
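The test aggregates with op=hvd.Sum instead of the default average; in Horovod releases that expose the op argument the two differ only by a factor of hvd.size(). A sketch:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
g = tf.random.uniform([4])
summed = hvd.allreduce(g, op=hvd.Sum)        # elementwise sum over ranks
averaged = hvd.allreduce(g, op=hvd.Average)  # the same sum divided by hvd.size()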
Example #19
    def _make_variable(self, metric, value):
        with tf.name_scope('MetricAverageCallback'):
            var = tf.Variable(value, name=metric)
            allreduce_op = hvd.allreduce(var, device_dense=self.device)
            return var, allreduce_op
Example #20
    def __init__(self,
                 ports,
                 gpu_id,
                 replay_filelist,
                 batch_size,
                 min_train_sample_num,
                 min_val_sample_num,
                 rm_size,
                 learning_rate,
                 print_interval,
                 checkpoint_interval,
                 num_val_batches,
                 replay_converter_type,
                 policy,
                 policy_config,
                 converter_config=None,
                 policy_config_type=None,
                 model_pool_addrs=None,
                 rollout_length=1,
                 checkpoints_dir=None,
                 restore_checkpoint_path=None,
                 train_generator_worker_num=4,
                 val_generator_worker_num=2,
                 pull_worker_num=2,
                 num_sgd_updates=int(1e30),
                 repeat_training_task=False,
                 unroll_length=32,
                 pub_interval=50,
                 max_clip_grad_norm=1,
                 after_loading_init_scope=None,
                 use_mixed_precision=False,
                 use_sparse_as_dense=False,
                 enable_validation=True,
                 post_process_data=None):
        assert len(ports) == 2
        self.use_hvd = has_hvd and hvd.size() > 1
        self.rank = 0 if not self.use_hvd else hvd.rank()
        self.model_key = 'IL-model'
        self.pub_interval = pub_interval
        self.rnn = (False if 'use_lstm' not in policy_config else
                    policy_config['use_lstm'])
        self.hs_len = None
        # overwrite it using the batch_size for training
        policy_config['batch_size'] = batch_size
        if self.rnn:
            assert model_pool_addrs is not None
            self._model_pool_apis = ModelPoolAPIs(model_pool_addrs)
            self._model_pool_apis.check_server_set_up()
            policy_config['rollout_len'] = rollout_length
            # infer hidden state length (size)
            if 'hs_len' in policy_config:
                self.hs_len = policy_config['hs_len']
            elif 'nlstm' in policy_config:
                self.hs_len = 2 * policy_config['nlstm']
            else:
                self.hs_len = 128

        self.should_push_model = (self.rnn and self.rank == 0)
        use_gpu = (gpu_id >= 0)
        converter_config = {} if converter_config is None else converter_config
        train_replay_filelist, val_replay_filelist = _get_local_replays(
            replay_filelist)
        replay_converter = replay_converter_type(**converter_config)
        ob_space, ac_space = replay_converter.space.spaces
        if post_process_data is not None:
            ob_space, ac_space = post_process_data(ob_space, ac_space)
        self.data_pool = ImDataServer(
            ports=ports,
            train_replay_filelist=train_replay_filelist,
            val_replay_filelist=val_replay_filelist,
            batch_size=batch_size,
            min_train_sample_num=min_train_sample_num,
            min_val_sample_num=min_val_sample_num,
            ob_space=ob_space,
            ac_space=ac_space,
            train_generator_worker_num=train_generator_worker_num,
            val_generator_worker_num=val_generator_worker_num,
            pull_worker_num=pull_worker_num,
            rm_size=rm_size,
            repeat_training_task=repeat_training_task,
            unroll_length=unroll_length,
            rollout_length=rollout_length,
            lstm=self.rnn,
            hs_len=self.hs_len,
            use_gpu=use_gpu)
        self._enable_validation = enable_validation

        config = tf.ConfigProto(allow_soft_placement=True)
        if use_gpu:
            config.gpu_options.visible_device_list = str(gpu_id)
            config.gpu_options.allow_growth = True
        self._sess = tf.Session(config=config)

        net_config = policy_config_type(ob_space, ac_space, **policy_config)
        net_config_val = deepcopy(net_config)
        with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as model_scope:
            pass

        def create_policy(inputs, nc):
            return policy(inputs=inputs, nc=nc, scope=model_scope)

        if hasattr(net_config, 'endpoints_verbosity'):
            # intentionally disables endpoints during training
            net_config.endpoints_verbosity = 0
        device = '/gpu:0' if use_gpu else '/cpu:0'
        with tf.device(device):
            if 'use_xla' in policy_config and policy_config['use_xla']:
                try:
                    # Use TensorFlow's accelerated linear algebra (XLA) compiler
                    with tf.xla.experimental.jit_scope(True):
                        model = create_policy(self.data_pool.train_batch_input,
                                              net_config)
                except:
                    logger.log(
                        "WARNING: using tf.xla requires tf version>=1.15.")
                    model = create_policy(self.data_pool.train_batch_input,
                                          net_config)
            else:
                model = create_policy(self.data_pool.train_batch_input,
                                      net_config)

        model_val = create_policy(self.data_pool.val_batch_input,
                                  net_config_val)
        params = tf.trainable_variables(scope='model')
        param_norm = tf.global_norm(params)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           epsilon=1e-5)
        if use_mixed_precision:
            try:
                optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                    optimizer)
            except:
                logger.warn(
                    "using tf mixed_precision requires tf version>=1.15.")
        if self.use_hvd:
            optimizer = hvd.DistributedOptimizer(
                optimizer, sparse_as_dense=use_sparse_as_dense)
            barrier_op = hvd.allreduce(tf.Variable(0.))
            self.barrier = lambda: self._sess.run(barrier_op)
        train_loss = tf.reduce_mean(model.loss.total_il_loss *
                                    self.data_pool.train_batch_weight)
        val_loss = tf.reduce_mean(model_val.loss.total_il_loss *
                                  self.data_pool.val_batch_weight)
        if hasattr(net_config, 'weight_decay') and not net_config.weight_decay:
            # None or 0.0
            total_loss = train_loss
        else:
            total_loss = train_loss + model.loss.total_reg_loss
        grads_and_vars = optimizer.compute_gradients(total_loss, params)
        clip_vars = model.vars.lstm_vars
        clip_grads = [grad for grad, var in grads_and_vars if var in clip_vars]
        nonclip_grads_and_vars = [(grad, var) for grad, var in grads_and_vars
                                  if var not in clip_vars]
        if max_clip_grad_norm > 0:
            clip_grads, clip_grad_norm = tf.clip_by_global_norm(
                clip_grads, max_clip_grad_norm)
        else:
            clip_grad_norm = tf.global_norm(clip_grads)
        clip_grads_and_var = list(zip(clip_grads, clip_vars))
        grads_and_vars = clip_grads_and_var + nonclip_grads_and_vars
        grad_norm = tf.global_norm(list(zip(*grads_and_vars))[0])

        train_op = optimizer.apply_gradients(grads_and_vars)
        tf.global_variables_initializer().run(session=self._sess)

        self.new_params = [
            tf.placeholder(p.dtype, shape=p.get_shape()) for p in params
        ]
        self.param_assign_ops = [
            p.assign(new_p) for p, new_p in zip(params, self.new_params)
        ]
        opt_params = optimizer.variables()
        self.new_opt_params = [
            tf.placeholder(p.dtype, shape=p.get_shape()) for p in opt_params
        ]
        self.opt_param_assign_ops = [
            p.assign(new_p)
            for p, new_p in zip(opt_params, self.new_opt_params)
        ]

        def read_params():
            return self._sess.run(params)

        def read_opt_params():
            return self._sess.run(opt_params)

        def load_model(np_new_params):
            self._sess.run(
                self.param_assign_ops,
                feed_dict={
                    p: np_p
                    for p, np_p in zip(self.new_params, np_new_params)
                })

        def restore_optimizer(np_new_opt_params):
            self._sess.run(
                self.opt_param_assign_ops,
                feed_dict={
                    p: np_p
                    for p, np_p in zip(self.new_opt_params, np_new_opt_params)
                })

        def _train_step():
            return self._sess.run([
                train_loss_aggregated, *train_other_losses_aggregated,
                grad_norm, clip_grad_norm, param_norm, train_op
            ], {})[:-1]

        def _val_step():
            # maximal_feat = [tf.reduce_max(tf.cast(x, tf.float32))
            # for x in self.data_pool.val_batch_input.X]
            # print(self._sess.run(maximal_feat, {}))
            return self._sess.run([
                val_loss_aggregated, *val_other_losses_aggregated,
                *endpoints_aggregated
            ], {})

        self._saver = ChkptsFromSelf(read_params, load_model, self.model_key)

        if restore_checkpoint_path is not None:
            self._saver._restore_model_checkpoint(restore_checkpoint_path)

        if after_loading_init_scope is not None:
            var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope=after_loading_init_scope)
            logger.log('perform after loading init for vars')
            for v in var_list:
                logger.log(v)
            tf.variables_initializer(var_list).run(session=self._sess)

        if self.use_hvd:
            hvd.broadcast_global_variables(0).run(session=self._sess)

        _allreduce = lambda x: x if not self.use_hvd else hvd.allreduce(x)
        train_loss_aggregated = _allreduce(train_loss)
        train_other_loss_names = model.loss.loss_endpoints.keys()
        train_other_losses_aggregated = [
            _allreduce(tf.reduce_mean(l * self.data_pool.train_batch_weight))
            for l in model.loss.loss_endpoints.values()
        ]
        val_loss_aggregated = _allreduce(val_loss)
        val_other_loss_names = model_val.loss.loss_endpoints.keys()
        val_other_losses_aggregated = [
            _allreduce(tf.reduce_mean(l * self.data_pool.val_batch_weight))
            for l in model_val.loss.loss_endpoints.values()
        ]
        endpoints_names = model_val.endpoints.keys()
        endpoints_aggregated = [
            _allreduce(tf.reduce_mean(l))
            for l in model_val.endpoints.values()
        ]
        self._sess.graph.finalize()
        self._total_samples = lambda: [
            self.data_pool._num_train_samples, self.data_pool._num_val_samples
        ]
        self._train_log_names = (['loss'] + list(train_other_loss_names) +
                                 ['grad_norm', 'clip_grad_norm', 'param_norm'])
        self._val_log_names = (['loss'] + list(val_other_loss_names) +
                               list(endpoints_names))
        self._batch_size = batch_size
        self._train_step = _train_step
        self._val_step = _val_step
        self._print_interval = print_interval
        self._checkpoint_interval = checkpoint_interval
        self._num_val_batches = num_val_batches
        self._checkpoints_dir = checkpoints_dir if self.rank == 0 else None
        self._num_sgd_updates = num_sgd_updates
        self.load_model = load_model
        self.restore_optimizer = restore_optimizer
        self.read_params = read_params
        self.read_opt_params = read_opt_params

        format_strs = ['log', 'tensorboard', 'csv']
        logger.configure(dir='training_log/rank{}'.format(self.rank),
                         format_strs=['stdout'] + format_strs)
        with logger.scoped_configure(dir='validation_log/rank{}'.format(
                self.rank),
                                     format_strs=['stderr'] + format_strs):
            self.val_logger = logger.Logger.CURRENT
Example #21
    def _setup_graph(self):
        self._placeholder = tf.placeholder(tf.float32, shape=[2], name='to_be_reduced')
        self._reduced = hvd.allreduce(self._placeholder, average=False)
Example #22
def get_larc_optimizer(optimizer, loss, global_step, steps_per_epoch,
                       use_horovod):
    #get learning rate
    learning_rate = get_learning_rate(optimizer, global_step, steps_per_epoch)

    #get LARC stuff
    LARC_mode = get_dict_default(optimizer, "LARC_mode", "clip")
    LARC_eta = get_dict_default(optimizer, "LARC_eta", 0.002)
    LARC_epsilon = get_dict_default(optimizer, "LARC_epsilon", 1. / 16000.)

    #lag
    gradient_lag = get_dict_default(optimizer, "gradient_lag", 0)

    #set up optimizers
    opt_type = get_dict_default(optimizer, "opt_type", "LARC-Adam")

    #set up optimizers
    if opt_type == "LARC-Adam":
        beta1 = get_dict_default(optimizer, "beta1", 0.9)
        beta2 = get_dict_default(optimizer, "beta2", 0.999)
        optim = tf.train.AdamOptimizer(learning_rate=learning_rate)
#        optim = tf.train.experimental.enable_mixed_precision_graph_rewrite(optim)
    elif opt_type == "LARC-RMSProp":
        optim = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
    elif opt_type == "LARC-SGD":
        momentum = get_dict_default(optimizer, "momentum", 0.)
        optim = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=momentum)
    else:
        raise ValueError("Error, optimizer {} unsupported.".format(opt_type))

    # instead of using the horovod wrapper, we do the allreduce ourselves below

    #compute gradients
    grads_and_vars = optim.compute_gradients(loss)
    lag_ops = []
    for idx, (g, v) in enumerate(grads_and_vars):
        if g is not None:
            if gradient_lag > 0:
                g_lag = tf.Variable(initial_value=tf.zeros(g.shape, g.dtype),
                                    trainable=False,
                                    name=v.name.replace(":", "_") + '_lag')
                g_next = g
                g = g_lag

            if use_horovod and (hvd.size() > 1):
                # if we ask for an average, it does a scalar divide, but
                #  we can bake that into the scaling below
                g = hvd.allreduce(g, average=False)
                g_scale = 1. / hvd.size()
            else:
                g_scale = 1

            v_norm = linalg_ops.norm(tensor=v, ord=2)
            g_norm = linalg_ops.norm(tensor=g, ord=2)

            larc_local_lr = control_flow_ops.cond(
                pred=math_ops.logical_and(
                    math_ops.not_equal(v_norm, tf.constant(0.0)),
                    math_ops.not_equal(g_norm, tf.constant(0.0))),
                true_fn=lambda: (LARC_eta / g_scale) * v_norm / g_norm,
                false_fn=lambda: LARC_epsilon)

            if LARC_mode == "scale":
                effective_lr = larc_local_lr
            else:
                # take the smaller of the LARC local LR and the global LR, then
                # divide the global LR back out, since it is multiplied in again
                # when the gradients are applied
                effective_lr = math_ops.minimum(larc_local_lr,
                                                learning_rate) / learning_rate

            # rescale gradients
            effective_lr *= g_scale

            #multiply gradients
            g_scaled = math_ops.scalar_mul(effective_lr, g)
            grads_and_vars[idx] = (g_scaled, v)

            if gradient_lag > 0:
                # once we've computed g_scaled, it's safe to overwrite g_lag
                with tf.control_dependencies([g_scaled]):
                    lag_ops.append(g_lag.assign(g_next))

    #apply gradients, making sure to complete the forward pass first
    with tf.control_dependencies([loss]):
        grad_updates = optim.apply_gradients(grads_and_vars,
                                             global_step=global_step)
    if gradient_lag > 0:
        grad_updates = tf.group([grad_updates] + lag_ops)

    return grad_updates, learning_rate
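For reference, a sketch of the rule the cond above implements (the standard LARC local learning rate, with \eta = LARC_eta, s = g_scale, \gamma = the global learning rate, and a fallback to LARC_epsilon when either norm is zero):

\lambda_{\mathrm{local}} = \frac{\eta}{s}\,\frac{\lVert v \rVert_2}{\lVert g \rVert_2},
\qquad
\lambda_{\mathrm{eff}} =
\begin{cases}
\lambda_{\mathrm{local}} & \text{LARC\_mode} = \text{scale} \\
\min(\lambda_{\mathrm{local}},\, \gamma)\,/\,\gamma & \text{otherwise (clip)}
\end{cases}
\qquad
g \leftarrow s\,\lambda_{\mathrm{eff}}\,g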
Example #23
def allreduce(backend, value, name, average, prescale_factor, postscale_factor, op, compression):
    return _eval(backend, hvd.allreduce(tf.constant(value, name=name), average=average,
                                        prescale_factor=prescale_factor,
                                        postscale_factor=postscale_factor,
                                        op=op, compression=compression))
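A sketch of how the scaling factors compose, under the assumption (per Horovod's documentation) that prescale_factor multiplies each rank's tensor before the reduction and postscale_factor multiplies the reduced result:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
x = tf.constant([1.0, 2.0])
# result = postscale_factor * reduce_over_ranks(prescale_factor * x);
# prescaling by 1/size with op=hvd.Sum reproduces an average.
y = hvd.allreduce(x, op=hvd.Sum,
                  prescale_factor=1.0 / hvd.size(),
                  postscale_factor=1.0)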
Example #24
def ppo2_loss(neglogp,
              oldneglogp,
              vpred,
              R,
              mask=None,
              adv_normalize=True,
              clip_range=0.1,
              sync_statistics=None):
    """"PPO2 loss.

  The PPO implementation where the values are computed at the learner's end,
  i.e., there is no `oldvpred` term. This treatment is suitable for PPO with
  large Replay Memory where the off-policy effect bulks.

  It accepts the shape layout (b1, b2, ...) which will be taken as the
  batch_size dimension. For example, the neglopg
  * can be (batch_size,)
  * can be (T, B) where T=rollout_len, B=n_rollout

  Args:
    neglogp: neglogp of pi, in shape (b1, b2,...)
    oldneglogp: neglogp of pi_old, in shape (b1, b2,...)
    vpred: predicted v, in shape (b1, b2, ...)
    R: (lambda) return, in shape (b1, b2, ...)
    mask: action logits mask in 0/1 value, in shape (b1, b2,...), the
      same shape with `neglogp` and `oldneglogp`
    adv_normalize: whether to normalize advantage
    clip_range: clip range, scalar
    sync_statistics: whether to synchronize statistics across multiple GPUs
      (if any)

  Returns:
    pg_loss: the PPO policy loss, a scalar in shape ()
  """
    # Note the negative sign; we want ratio = pi / pi_{old}
    ratio = oldneglogp - neglogp
    if mask is not None:
        ratio = mask * ratio
    ratio = tf.exp(ratio)

    # make the advantage
    # Note: DO the stop_gradient stuff AT THE CALLER'S END when necessary
    # R = tf.stop_gradient(R)
    # vpred = tf.stop_gradient(vpred)
    adv = R - vpred
    # normalize the advantage
    batch_mean = tf.reduce_mean(adv)
    batch_mean_square = tf.reduce_mean(tf.square(adv))
    if sync_statistics == 'horovod':
        # https://github.com/tensorpack/tensorpack/blob/07783edb998cec3ec91c4312b39bd754cf9ececa/tensorpack/models/batch_norm.py#L226-L231
        import horovod.tensorflow as hvd
        batch_mean = hvd.allreduce(batch_mean, average=True)
        batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
    adv = adv - batch_mean
    if adv_normalize:
        adv = adv / tf.sqrt(batch_mean_square + 1e-8)

    # the ppo policy gradient loss
    pg_losses1 = -adv * ratio
    pg_losses2 = -adv * tf.clip_by_value(ratio, 1.0 - clip_range,
                                         1.0 + clip_range)
    # pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))
    pg_loss = tf.reduce_mean(
        tf.where(tf.greater(adv, 0), tf.maximum(pg_losses1, pg_losses2),
                 pg_losses2))
    return pg_loss
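
A minimal usage sketch for `ppo2_loss` with dummy `(T, B)`-shaped inputs, assuming the TF1 graph mode used throughout these examples and no Horovod synchronization:

import numpy as np
import tensorflow as tf

T, B = 8, 4  # rollout_len, n_rollout
neglogp, oldneglogp, vpred, R = (
    tf.placeholder(tf.float32, [T, B]) for _ in range(4))
pg_loss = ppo2_loss(neglogp, oldneglogp, vpred, R,
                    clip_range=0.1, sync_statistics=None)

with tf.Session() as sess:
    feed = {p: np.random.rand(T, B).astype(np.float32)
            for p in (neglogp, oldneglogp, vpred, R)}
    print(sess.run(pg_loss, feed_dict=feed))  # scalar PPO policy loss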
Example #25
0
    def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
        """
    :param tf.Tensor loss:
    :param list[tf.Variable] trainable_vars_for_gradients:
    :return: op with all variable updates combined, using the optimizer
    :rtype: tf.Operation
    """
        if not trainable_vars_for_gradients:
            return tf.no_op(name="no_grad_vars_no_op")
        # AccumulateN might not be deterministic but should be faster and should require less memory.
        # We might want to make this configurable.
        if self.config.is_true("deterministic_train"):
            aggregation_method = tf.AggregationMethod.ADD_N
        else:
            aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
        accum_grad_multiple_num_steps = self.config.int(
            "accum_grad_multiple_step", 0)
        grad_noise = self.config.float("gradient_noise", 0.0)
        grad_clip = self.config.float("gradient_clip", 0.0)
        grad_clip_norm = self.config.float("gradient_clip_norm", 0.0)
        grad_clip_avg_norm = self.config.float("gradient_clip_avg_norm", 0.0)
        grad_clip_global_norm = self.config.float("gradient_clip_global_norm",
                                                  0.0)
        # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py: grad_norm_clipping=10 -> tf.clip_by_norm

        # Extended self.optimizer.minimize() to optionally modify gradients.
        grads_and_vars = self.optimizer.compute_gradients(
            loss,
            var_list=trainable_vars_for_gradients,
            aggregation_method=aggregation_method)
        if self.config.is_true("use_horovod") and self.config.value(
                "horovod_reduce_type", "") == "grad":
            import horovod.tensorflow as hvd
            grads_and_vars = [(hvd.allreduce(
                grad, average=self.config.is_true("horovod_avg_grad"))
                               if grad is not None else None, var)
                              for (grad, var) in grads_and_vars]
        var_grads = {
            var: grad
            for (grad, var) in grads_and_vars if grad is not None
        }
        if not var_grads:
            raise Exception("no single variable to train")
        if self.config.float("maximize_grad_norm", 0):
            f = self.config.float("maximize_grad_norm", 0)
            grad_norm = tf.add_n(
                [tf.nn.l2_loss(g) for g in var_grads.values()],
                name="grad_norm_half") * 2.0
            loss_ext = grad_norm * (-f)
            grads_and_vars_ext = self.optimizer.compute_gradients(
                loss_ext,
                var_list=list(var_grads.keys()),
                aggregation_method=aggregation_method)
            var_grads_ext = {
                var: grad
                for (grad, var) in grads_and_vars_ext if grad is not None
            }
            grads_and_vars = [(grad + var_grads_ext.get(var, 0.0), var)
                              for (grad, var) in grads_and_vars]
        if accum_grad_multiple_num_steps >= 1:
            grads_and_vars = [(accum_grad_multiple_step(
                grad,
                var,
                train_step=self.network.global_train_step,
                num_accum_steps=accum_grad_multiple_num_steps), var)
                              for (grad, var) in grads_and_vars]
        if self.config.bool("debug_grad_summaries", False):
            from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
            for grad, var in grads_and_vars:
                with reuse_name_scope_of_tensor(grad, prefix="grads/"):
                    variable_summaries(grad,
                                       name="grad_of_%s" % get_base_name(var))
                with reuse_name_scope_of_tensor(var, prefix="vars/"):
                    variable_summaries(var, name=get_base_name(var))
        # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
        if self.config.bool("gradient_nan_inf_filter", False):
            from TFUtil import nan_to_num
            grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var)
                              for (grad, var) in grads_and_vars]
        if grad_noise:
            assert grad_noise > 0
            from TFUtil import add_scaled_noise_to_gradients
            with tf.name_scope("grad_noise"):
                grads_and_vars = add_scaled_noise_to_gradients(
                    grads_and_vars, grad_noise)
        if grad_clip:
            assert grad_clip > 0
            with tf.name_scope("grad_clip"):
                grads_and_vars = [(tf.clip_by_value(grad, -grad_clip,
                                                    grad_clip), var)
                                  for grad, var in grads_and_vars]
        if grad_clip_norm:
            assert grad_clip_norm > 0
            with tf.name_scope("grad_clip_norm"):
                grads_and_vars = [(tf.clip_by_norm(grad, grad_clip_norm), var)
                                  for grad, var in grads_and_vars]
        if grad_clip_avg_norm:
            assert grad_clip_avg_norm > 0
            with tf.name_scope("grad_clip_avg_norm"):
                grads_and_vars = [
                    (tf.clip_by_average_norm(grad, grad_clip_avg_norm), var)
                    for grad, var in grads_and_vars
                ]
        if grad_clip_global_norm:
            assert grad_clip_global_norm > 0
            with tf.name_scope("grad_clip_global_norm"):
                grads_clipped, _ = tf.clip_by_global_norm(
                    [grad for (grad, _) in grads_and_vars],
                    grad_clip_global_norm)
                grads_and_vars = zip(grads_clipped,
                                     [var for (_, var) in grads_and_vars])
        if accum_grad_multiple_num_steps >= 1:
            apply_grads = tf.cond(
                tf.equal(
                    tf.mod(self.network.global_train_step,
                           accum_grad_multiple_num_steps),
                    accum_grad_multiple_num_steps - 1),
                true_fn=lambda: self.optimizer.apply_gradients(grads_and_vars),
                false_fn=lambda: tf.no_op(),
                name="apply_grads/accum_grad_multiple_step")
        else:
            apply_grads = self.optimizer.apply_gradients(grads_and_vars)
        return apply_grads
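
The `use_horovod` branch above allreduces each gradient by hand. The more common pattern wraps the optimizer instead, which performs the same reduction inside `compute_gradients`; a minimal sketch with the standard Horovod TF1 API:

import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()
# scaling the LR by the number of ranks is the usual companion to averaging
base_opt = tf.train.AdamOptimizer(1e-3 * hvd.size())
opt = hvd.DistributedOptimizer(base_opt)
# opt.compute_gradients() now returns gradients already averaged across
# ranks, so apply_gradients() needs no extra handling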
Example #26
0
    data = tf.get_variable("data",
                           shape=[batch_size, 224, 224, 3],
                           initializer=tf.random_normal_initializer(),
                           trainable=False)
    target = tf.get_variable("target",
                             shape=[batch_size, 1],
                             initializer=tf.random_normal_initializer(),
                             trainable=False)
    target = tf.cast(target, tf.int64)

    network = Network(batch_size)

    loss = network.cal_train_loss([data, target])
    params = tf.trainable_variables()
    grads = tf.gradients(loss, params)
    grads1 = [hvd.allreduce(grad, average=False) for grad in grads]
    gradvars = list(zip(grads1, params))

    opt = tf.train.GradientDescentOptimizer(0.001)
    train_opt = opt.apply_gradients(gradvars)

init = tf.compat.v1.global_variables_initializer()
bcast_op = hvd.broadcast_global_variables(0)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
with tf.Session(config=config) as session:
    session.run(init)
    session.run(bcast_op)
    for x in range(num_iters):
        session.run(train_opt)
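
Note that `average=False` above sums the gradients over all ranks, which scales the effective step size by `hvd.size()`; averaging instead preserves the single-worker semantics. Both variants, sketched with the same `grads` list as above:

# sum: the effective learning rate grows with the number of workers
grads_summed = [hvd.allreduce(g, average=False) for g in grads]

# average: equivalent to dividing the summed gradients by hvd.size()
grads_averaged = [hvd.allreduce(g, average=True) for g in grads]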
Example #27
0
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
              center=True, scale=True,
              beta_initializer=tf.zeros_initializer(),
              gamma_initializer=tf.ones_initializer(),
              virtual_batch_size=None,
              data_format='channels_last',
              ema_update='default',
              sync_statistics=None,
              internal_update=None):
    """
    A more powerful version of `tf.layers.batch_normalization`. It differs from
    the official one in the following aspects:

    1. Accepts an alternative ``data_format`` option when ``axis`` is None. For 2D input, this argument will be ignored.
    2. Default value for ``momentum`` and ``epsilon`` is different.
    3. Default value for ``training`` is automatically obtained from tensorpack's ``TowerContext``.
       User-provided value can overwrite this behavior.
    4. Support the ``ema_update`` option, which covers broader use cases than the standard EMA update.
    5. Support the ``sync_statistics`` option, which implements "SyncBN" and is very useful in small-batch models.

    Args:
        training (bool): if True, use per-batch statistics to normalize. Otherwise, use stored EMA
            to normalize. By default, it is equal to `get_current_tower_context().is_training`.
            This is not a good argument name, but it is what the TensorFlow layer uses.
        ema_update (str): Only effective when ``training=True``. It has the following options:

          * "default": same as "collection". Because this is the default behavior in tensorflow.
          * "skip": do not update EMA. This can be useful when you reuse a batch norm layer in several places
            but do not want them to all update your EMA.
          * "collection": Add EMA update ops to collection `tf.GraphKeys.UPDATE_OPS`.
            The ops in the collection will be run automatically by the callback :class:`RunUpdateOps`, along with
            your training iterations. This can waste compute if your training iterations do not always depend
            on the BatchNorm layer.
          * "internal": EMA is updated inside this layer itself by control dependencies.
            In common cases, it has similar speed to "collection". But it covers more cases, e.g.:

            1. BatchNorm is used inside dynamic control flow.
               The collection-based update does not support dynamic control flows.
            2. BatchNorm layer is sometimes unused (e.g., in GANs you have two networks to train alternatively).
               Putting all update ops into a single collection will waste a lot of compute.
            3. Other part of the model relies on the "updated" EMA. The collection-based method does not update
               EMA immediately.

            Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
        sync_statistics (str or None): one of None, "nccl", or "horovod". It determines how to compute the
          "per-batch statistics" when ``training==True``.

          * None: it uses statistics of the input tensor to normalize during training.
            This is the standard way BatchNorm was implemented in most frameworks.

          * "nccl": this layer must be used under tensorpack's multi-GPU trainers.
            It uses the aggregated statistics of the whole batch (across all GPUs) to normalize.

          * "horovod": this layer must be used under tensorpack's :class:`HorovodTrainer`.
            It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize.
             Note that on a single machine this is significantly slower than the "nccl" implementation.

          When not None, each GPU computes its own E[x] and E[x^2],
          which are then averaged among all GPUs to compute global mean & variance.
          Therefore each GPU needs to have the same batch size.

          The synchronization is based on the current variable scope + the name of the layer
          (`BatchNorm('name', input)`). Therefore, you need to make sure that:

          1. The BatchNorm layer on different GPUs needs to have the same name, so that
             statistics can be synchronized. If names do not match, this layer will hang.
          2. A BatchNorm layer cannot be reused within one tower.
          3. A BatchNorm layer needs to be executed the same number of times by all GPUs.
             If different GPUs execute one BatchNorm layer a different number of times
             (e.g., if some GPUs do not execute it), this layer may hang.

          This option is also known as "SyncBN" or "Cross-GPU BatchNorm" as mentioned in:
          `MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222.

          When `sync_statistics` is enabled, `ema_update` is set to "internal" automatically.
          This is to avoid running `UPDATE_OPS`, which requires synchronization.

        internal_update: deprecated option. Don't use.

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        This layer is more flexible than the standard "BatchNorm" layer and provides more features:

        1. No matter whether you're doing training or not, you can set the ``training`` argument
           to use batch statistics or EMA statistics.
           i.e., you can use batch statistics during inference, or use EMA statistics during training.
           Using EMA statistics in training is useful when you load a pre-trained BN and
           don't want to update it.
        2. As long as `training=True`, the `sync_statistics` and `ema_update` options will take effect.
    """
    # parse training/ctx
    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)

    # parse shapes
    data_format = get_data_format(data_format, keras_mode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4], ndims
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    assert ema_update in ["default", "collection", "internal", "skip"]
    if internal_update is not None:
        log_deprecated("BatchNorm(internal_update=)", "Use ema_update='internal' instead!", "2020-01-01")
        assert ema_update == 'default', \
            "Do not use internal_update and ema_update together! internal_update is deprecated"
        ema_update = "internal" if internal_update else "collection"
    if ema_update == "default":
        ema_update = "collection"
    # Logic:
    # 1. EMA update is possible only when we compute batch statistics (training=True)
    # 2. We know that in training, the non-main training towers do not need the EMA update.
    #    We don't know what to do in a prediction context, so be conservative and do the update.
    # 3. The user can explicitly disable the update with "skip".
    do_ema_update = training and \
        (ctx.is_main_training_tower or not ctx.is_training) \
        and (ema_update != "skip")

    if axis is None:
        if ndims == 2:
            axis = 1
        else:
            axis = 1 if data_format == 'NCHW' else 3
    assert axis in [1, 3], axis
    num_chan = shape[axis]

    TF_version = get_tf_version_tuple()

    freeze_bn_backward = not training and ctx.is_training
    if freeze_bn_backward:
        assert TF_version >= (1, 4), \
            "Fine tuning a BatchNorm model with fixed statistics needs TF>=1.4!"
        if ctx.is_main_training_tower:  # only warn in first tower
            log_once("Some BatchNorm layer uses moving_mean/moving_variance in training.", func='warn')
        # Using moving_mean/moving_variance in training, which means we
        # loaded a pre-trained BN and only fine-tuning the affine part.

    do_sync_bn = (sync_statistics is not None) and training

    if not do_sync_bn:
        # Use the builtin layer for anything except for sync-bn
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable(
                {'moving_mean': 'mean/EMA',
                 'moving_variance': 'variance/EMA'}):
            tf_args = dict(
                axis=axis,
                momentum=momentum, epsilon=epsilon,
                center=center, scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                # https://github.com/tensorflow/tensorflow/issues/10857#issuecomment-410185429
                fused=(ndims == 4 and axis in [1, 3] and not freeze_bn_backward),
                _reuse=tf.get_variable_scope().reuse)
            if TF_version >= (1, 5):
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            use_fp16 = inputs.dtype == tf.float16
            if use_fp16:
                # non-fused does not support fp16; fused does not support all layouts.
                # we made our best guess here
                tf_args['fused'] = True
            layer = tf.layers.BatchNormalization(**tf_args)
            xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope())

        # Add EMA variables to the correct collection
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                if isinstance(v, tf.Variable):
                    tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, v)

        if not do_ema_update:
            restore_collection(coll_bk)
        if do_ema_update and ema_update == "internal":
            # Implement "internal" update.
            restore_collection(coll_bk)
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # for backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # for backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta
    else:
        red_axis = [0] if ndims == 2 else ([0, 2, 3] if axis == 1 else [0, 1, 2])

        new_shape = None  # don't need to reshape unless ...
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            num_dev = ctx.total
            if num_dev == 1:
                logger.warn("BatchNorm(sync_statistics='nccl') is used with only one tower!")
            else:
                assert six.PY2 or TF_version >= (1, 10), \
                    "Cross-GPU BatchNorm is only supported in TF>=1.10. " \
                    "Upgrade TF or apply this patch manually: https://github.com/tensorflow/tensorflow/pull/20360"

                if TF_version <= (1, 12):
                    try:
                        from tensorflow.contrib.nccl.python.ops.nccl_ops import _validate_and_load_nccl_so
                    except Exception:
                        pass
                    else:
                        _validate_and_load_nccl_so()
                    from tensorflow.contrib.nccl.ops import gen_nccl_ops
                else:
                    from tensorflow.python.ops import gen_nccl_ops
                shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name)
                batch_mean = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
                batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean_square,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev)
        elif sync_statistics == 'horovod':
            # Require https://github.com/uber/horovod/pull/331
            import horovod.tensorflow as hvd
            if hvd.size() == 1:
                logger.warn("BatchNorm(sync_statistics='horovod') is used with only one process!")
            else:
                import horovod
                hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
                assert hvd_version >= (0, 13, 6), "sync_statistics=horovod needs horovod>=0.13.6 !"

                batch_mean = hvd.allreduce(batch_mean, average=True)
                batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            # Using fused_batch_norm(is_training=False) is actually slightly faster,
            # but hopefully this call will be JITed in the future.
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                tf.reshape(beta, new_shape),
                tf.reshape(gamma, new_shape), epsilon)
        else:
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                beta, gamma, epsilon)

        if do_ema_update:
            ret = internal_update_bn_ema(
                xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var, momentum)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,  # for backward-compatibility
            moving_variance=moving_var,
            variance=moving_var)  # for backward-compatibility
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret
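
The sync branch above exploits the identity Var[x] = E[x^2] - E[x]^2, so each worker only has to reduce two per-channel vectors. The core arithmetic, as a standalone sketch outside tensorpack (valid as long as every rank uses the same batch size, as the docstring requires):

import horovod.tensorflow as hvd
import tensorflow as tf

def sync_bn_statistics(inputs, red_axis):
    # per-rank moments
    batch_mean = tf.reduce_mean(inputs, axis=red_axis)
    batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)
    # averaging E[x] and E[x^2] over ranks is exact for equal batch sizes
    batch_mean = hvd.allreduce(batch_mean, average=True)
    batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
    batch_var = batch_mean_square - tf.square(batch_mean)
    return batch_mean, batch_var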
Example #28
0
def main(input_path_train, input_path_validation, downsampling_fact,
         downsampling_mode, channels, data_format, label_id, blocks, weights,
         image_dir, checkpoint_dir, trn_sz, val_sz, loss_type, fs_type,
         optimizer, batch, batchnorm, num_epochs, dtype, chkpt, filter_sz,
         growth, disable_checkpoints, disable_imsave, tracing, trace_dir,
         output_sampling, scale_factor):

    #init horovod
    nvtx.RangePush("init horovod", 1)
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        # not all horovod versions have local_size() implemented
        try:
            comm_local_size = hvd.local_size()
        except Exception:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks".
                  format(comm_size, comm_rank))
    nvtx.RangePop()  # init horovod

    #downsampling? recompute image dims
    image_height = image_height_orig // downsampling_fact
    image_width = image_width_orig // downsampling_fact

    #parameters
    per_rank_output = False
    loss_print_interval = 10

    #session config
    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=6,  #1
        intra_op_parallelism_threads=1,  #6
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)
    sess_config.gpu_options.force_gpu_compatible = True

    #get data
    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    trn_data = load_data(input_path_train, True, trn_sz, horovod)
    val_data = load_data(input_path_validation, False, val_sz, horovod)
    if comm_rank == 0:
        print("Shape of trn_data is {}".format(trn_data.shape[0]))
        print("done.")

    #print some stats
    if comm_rank == 0:
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Batch normalization: {}".format(batchnorm))
        print("Blocks: {}".format(blocks))
        print("Growth rate: {}".format(growth))
        print("Filter size: {}".format(filter_sz))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Loss scale factor: {}".format(scale_factor))
        print("Output sampling target: {}".format(output_sampling))
        #print optimizer parameters
        for k, v in optimizer.items():
            print("Solver Parameters: {k}: {v}".format(k=k, v=v))
        #print("Optimizer type: {}".format(optimizer['opt_type']))
        print("Num training samples: {}".format(trn_data.shape[0]))
        print("Num validation samples: {}".format(val_data.shape[0]))
        print("Disable checkpoints: {}".format(disable_checkpoints))
        print("Disable image save: {}".format(disable_imsave))
        print("Downsampling factor: {}".format(downsampling_fact))
        print("Downsampling mode: {}".format(downsampling_mode))

    #compute epochs and stuff:
    if fs_type == "local":
        num_samples = trn_data.shape[0] // comm_local_size
    else:
        num_samples = trn_data.shape[0] // comm_size
    num_steps_per_epoch = num_samples // batch
    num_steps = num_epochs * num_steps_per_epoch
    if per_rank_output:
        print("Rank {} does {} steps per epoch".format(comm_rank,
                                                       num_steps_per_epoch))

    with training_graph.as_default():
        nvtx.RangePush("TF Init", 3)
        #create readers
        trn_reader = h5_input_reader(input_path_train,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     data_format=data_format,
                                     label_id=label_id,
                                     sample_target=output_sampling)
        val_reader = h5_input_reader(input_path_validation,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     data_format=data_format,
                                     label_id=label_id)
        #create datasets
        if fs_type == "local":
            trn_dataset = create_dataset(trn_reader,
                                         trn_data,
                                         batch,
                                         num_epochs,
                                         comm_local_size,
                                         comm_local_rank,
                                         dtype,
                                         shuffle=True)
            val_dataset = create_dataset(val_reader,
                                         val_data,
                                         batch,
                                         1,
                                         comm_local_size,
                                         comm_local_rank,
                                         dtype,
                                         shuffle=False)
        else:
            trn_dataset = create_dataset(trn_reader,
                                         trn_data,
                                         batch,
                                         num_epochs,
                                         comm_size,
                                         comm_rank,
                                         dtype,
                                         shuffle=True)
            val_dataset = create_dataset(val_reader,
                                         val_data,
                                         batch,
                                         1,
                                         comm_size,
                                         comm_rank,
                                         dtype,
                                         shuffle=False)

        #create iterators
        handle = tf.placeholder(tf.string,
                                shape=[],
                                name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (dtype, tf.int32, dtype, tf.string),
            ((batch, len(channels), image_height_orig,
              image_width_orig) if data_format == "channels_first" else
             (batch, image_height_orig, image_width_orig, len(channels)),
             (batch, image_height_orig, image_width_orig),
             (batch, image_height_orig, image_width_orig), (batch)))
        next_elem = iterator.get_next()

        #if downsampling, do some preprocessing
        if downsampling_fact != 1:
            if downsampling_mode == "scale":
                #do downsampling
                rand_select = tf.cast(tf.one_hot(tf.random_uniform(
                    (batch, image_height, image_width),
                    minval=0,
                    maxval=downsampling_fact * downsampling_fact,
                    dtype=tf.int32),
                                                 depth=downsampling_fact *
                                                 downsampling_fact,
                                                 axis=-1),
                                      dtype=tf.int32)
                next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format), \
                             tf.reduce_max(tf.multiply(tf.image.extract_image_patches(tf.expand_dims(next_elem[1], axis=-1), \
                                                                                 [1, downsampling_fact, downsampling_fact, 1], \
                                                                                 [1, downsampling_fact, downsampling_fact, 1], \
                                                                                 [1,1,1,1], 'VALID'), rand_select), axis=-1), \
                             tf.squeeze(tf.layers.average_pooling2d(tf.expand_dims(next_elem[2], axis=-1), downsampling_fact, downsampling_fact, 'valid', "channels_last"), axis=-1), \
                             next_elem[3])
            elif downsampling_mode == "center-crop":
                #some parameters
                length = 1. / float(downsampling_fact)
                offset = length / 2.
                boxes = [[offset, offset, offset + length, offset + length]
                         ] * batch
                box_ind = list(range(0, batch))
                crop_size = [image_height, image_width]

                #be careful with data order
                if data_format == "channels_first":
                    next_elem = (tf.transpose(next_elem[0], perm=[0, 2, 3, 1]),
                                 next_elem[1], next_elem[2], next_elem[3])

                #crop
                next_elem = (tf.image.crop_and_resize(next_elem[0], boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="data_cropping"), \
                             ensure_type(tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[1],axis=-1), boxes, box_ind, crop_size, method='nearest', extrapolation_value=0, name="label_cropping"), axis=-1), tf.int32), \
                             tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[2],axis=-1), boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="weight_cropping"), axis=-1), \
                             next_elem[3])

                #be careful with data order
                if data_format == "channels_first":
                    next_elem = (tf.transpose(next_elem[0], perm=[0, 3, 1, 2]),
                                 next_elem[1], next_elem[2], next_elem[3])

            elif downsampling_mode == "random-crop":
                #some parameters
                crop_size = [
                    batch, image_height, image_width,
                    len(channels) + 2
                ]

                #concatenate input, crop, split apart
                crop_input = tf.concat([next_elem[0] if data_format=="channels_last" else tf.transpose(next_elem[0], perm=[0,2,3,1]), \
                                        ensure_type(tf.expand_dims(next_elem[1], axis=-1), tf.float32), \
                                        tf.expand_dims(next_elem[2], axis=-1)], \
                                       axis = -1)
                crop_output = tf.image.random_crop(crop_input, crop_size)

                #restore iterator output
                crop_image = crop_output[:, :, :, :len(channels)]
                crop_label = ensure_type(crop_output[:, :, :,
                                                     len(channels)], tf.int32)
                crop_weight = crop_output[:, :, :, len(channels) + 1]
                next_elem = (crop_image if data_format=="channels_last" else tf.transpose(crop_image, perm=[0,3,1,2]), \
                             crop_label, crop_weight, next_elem[3])

            else:
                raise ValueError(
                    "Error, downsampling mode {} not supported. Supported are [center-crop, random-crop, scale]"
                    .format(downsampling_mode))

        #create init handles
        #trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        #val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        #compute the input filter number based on number of channels used
        num_channels = len(channels)
        nb_filter = 64

        #set up model
        logit, prediction = create_tiramisu(3,
                                            next_elem[0],
                                            image_height,
                                            image_width,
                                            num_channels,
                                            loss_weights=weights,
                                            nb_layers_per_block=blocks,
                                            p=0.2,
                                            wd=1e-4,
                                            dtype=dtype,
                                            batchnorm=batchnorm,
                                            growth_rate=growth,
                                            nb_filter=nb_filter,
                                            filter_sz=filter_sz,
                                            median_filter=False,
                                            data_format=data_format)
        #prediction_argmax = median_pool(prediction_argmax, 3, strides=[1,1,1,1])

        #set up loss
        loss = None
        if loss_type == "weighted":
            #cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1],
                logits=logit,
                weights=w_cast,
                reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
            if scale_factor != 1.0:
                loss *= scale_factor
        elif loss_type == "focal":
            labels_one_hot = tf.contrib.layers.one_hot_encoding(
                next_elem[1], 3)
            labels_one_hot = ensure_type(labels_one_hot, dtype)
            loss = focal_loss(onehot_labels=labels_one_hot,
                              logits=logit,
                              alpha=1.,
                              gamma=2.)
        else:
            raise ValueError("Error, loss type {} not supported.",
                             format(loss_type))

        #determine flops
        flops = graph_flops.graph_flops(
            format="NHWC" if data_format == "channels_last" else "NCHW",
            batch=batch,
            sess_config=sess_config)
        flops *= comm_size
        if comm_rank == 0:
            print('training flops: {:.3f} TF/step'.format(flops * 1e-12))

        #number of trainable parameters
        if comm_rank == 0:
            num_params = get_number_of_trainable_parameters()
            print('number of trainable parameters: {} ({} MB)'.format(
                num_params,
                num_params * (4 if dtype == tf.float32 else 2) * (2**-20)))

        if horovod:
            loss_avg = hvd.allreduce(ensure_type(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)

        #set up global step - keep on CPU
        with tf.device('/device:CPU:0'):
            global_step = tf.train.get_or_create_global_step()

        #set up optimizer
        if optimizer['opt_type'].startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op, lr = get_larc_optimizer(optimizer, loss, global_step,
                                              num_steps_per_epoch, horovod)
        else:
            train_op, lr = get_optimizer(optimizer, loss, global_step,
                                         num_steps_per_epoch, horovod)

        #set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(labels=next_elem[1],
                                                    predictions=tf.argmax(
                                                        prediction, axis=3),
                                                    num_classes=3,
                                                    weights=None,
                                                    metrics_collections=None,
                                                    updates_collections=None,
                                                    name="iou_score")
        iou_reset_op = tf.variables_initializer([
            i for i in tf.local_variables() if i.name.startswith('iou_score/')
        ])

        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)

        #hooks
        # these hooks are essential; pad the stop-at-step hook with one additional step at the end
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]
        #bcast init for bcasting the model after start
        if horovod:
            init_bcast = hvd.broadcast_global_variables(0)

        #initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        #checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = 5 * num_steps_per_epoch
            checkpoint_saver = tf.train.Saver(max_to_keep=1000)
            if (not disable_checkpoints):
                hooks.append(
                    tf.train.CheckpointSaverHook(
                        checkpoint_dir=checkpoint_dir,
                        save_steps=checkpoint_save_freq,
                        saver=checkpoint_saver))
            #create image dir if not exists
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)

        if tracing is not None:
            import tracehook
            tracing_hook = tracehook.TraceHook(steps_to_trace=tracing,
                                               cache_traces=True,
                                               trace_dir=trace_dir)
            hooks.append(tracing_hook)

        # instead of averaging losses over an entire epoch, use a moving
        #  window average
        recent_losses = []
        loss_window_size = 10

        #start session
        with tf.train.MonitoredTrainingSession(config=sess_config,
                                               hooks=hooks) as sess:
            #initialize
            sess.run([init_op, init_local_op])
            #restore from checkpoint:
            if comm_rank == 0 and not disable_checkpoints:
                load_model(sess, checkpoint_saver, checkpoint_dir)
            #broadcast loaded model variables
            if horovod:
                sess.run(init_bcast)
            #create iterator handles
            trn_handle, val_handle = sess.run(
                [trn_handle_string, val_handle_string])
            #init iterators
            sess.run(trn_init_op, feed_dict={handle: trn_handle})
            sess.run(val_init_op, feed_dict={handle: val_handle})

            nvtx.RangePop()  # TF Init

            # figure out what step we're on (it won't be 0 if we are
            #  restoring from a checkpoint) so we can count from there
            train_steps = sess.run([global_step])[0]

            #do the training
            epoch = 1
            step = 1

            t_sustained_start = time.time()

            nvtx.RangePush("Training Loop", 4)
            nvtx.RangePush("Epoch", epoch)
            start_time = time.time()
            while not sess.should_stop():

                #training loop
                try:
                    nvtx.RangePush("Step", step)
                    #construct feed dict
                    t_inst_start = time.time()
                    _, tmp_loss = sess.run(
                        [train_op, (loss if per_rank_output else loss_avg)],
                        feed_dict={handle: trn_handle})
                    t_inst_end = time.time()
                    train_steps += 1
                    train_steps_in_epoch = train_steps % num_steps_per_epoch
                    recent_losses = [tmp_loss
                                     ] + recent_losses[0:loss_window_size - 1]
                    train_loss = sum(recent_losses) / len(recent_losses)
                    nvtx.RangePop()  # Step
                    step += 1

                    #print step report
                    eff_steps = train_steps_in_epoch if (
                        train_steps_in_epoch > 0) else num_steps_per_epoch
                    if (train_steps % loss_print_interval) == 0:
                        if per_rank_output:
                            print(
                                "REPORT: rank {}, training loss for step {} (of {}) is {}, time {:.3f}"
                                .format(comm_rank, train_steps, num_steps,
                                        train_loss,
                                        time.time() - start_time))
                        else:
                            if comm_rank == 0:
                                print(
                                    "REPORT: training loss for step {} (of {}) is {}, time {:.3f}, r_inst {:.3f}"
                                    .format(
                                        train_steps, num_steps, train_loss,
                                        time.time() - start_time, 1e-12 *
                                        flops / (t_inst_end - t_inst_start)))

                    #do the validation phase
                    if train_steps_in_epoch == 0:
                        end_time = time.time()
                        #print epoch report
                        if per_rank_output:
                            print(
                                "COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, time {:.3f}, r_sust {:.3f}"
                                .format(
                                    comm_rank, epoch, num_epochs, train_loss,
                                    time.time() - start_time,
                                    1e-12 * flops * num_steps_per_epoch /
                                    (end_time - t_sustained_start)))
                        else:
                            if comm_rank == 0:
                                print(
                                    "COMPLETED: training loss for epoch {} (of {}) is {}, time {:.3f}, r_sust {:.3f}"
                                    .format(
                                        epoch, num_epochs, train_loss,
                                        time.time() - start_time,
                                        1e-12 * flops * num_steps_per_epoch /
                                        (end_time - t_sustained_start)))

                        #evaluation loop
                        eval_loss = 0.
                        eval_steps = 0
                        nvtx.RangePush("Eval Loop", 7)
                        while True:
                            try:
                                #construct feed dict
                                _, tmp_loss, val_model_predictions, val_model_labels, val_model_filenames = sess.run(
                                    [
                                        iou_update_op,
                                        (loss
                                         if per_rank_output else loss_avg),
                                        prediction, next_elem[1], next_elem[3]
                                    ],
                                    feed_dict={handle: val_handle})

                                #print some images
                                if comm_rank == 0 and not disable_imsave:
                                    if have_imsave:
                                        imsave(
                                            image_dir + '/test_pred_epoch' +
                                            str(epoch) + '_estep' +
                                            str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.png',
                                            np.argmax(
                                                val_model_predictions[0, ...],
                                                axis=2) * 100)
                                        imsave(
                                            image_dir + '/test_label_epoch' +
                                            str(epoch) + '_estep' +
                                            str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.png',
                                            val_model_labels[0, ...] * 100)
                                        imsave(
                                            image_dir +
                                            '/test_combined_epoch' +
                                            str(epoch) + '_estep' +
                                            str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.png', colormap[
                                                val_model_labels[0, ...],
                                                np.argmax(
                                                    val_model_predictions[0,
                                                                          ...],
                                                    axis=2)])
                                    else:
                                        np.savez(
                                            image_dir + '/test_epoch' +
                                            str(epoch) + '_estep' +
                                            str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.npz',
                                            prediction=np.argmax(
                                                val_model_predictions[0, ...],
                                                axis=2) * 100,
                                            label=val_model_labels[0, ...] *
                                            100,
                                            filename=val_model_filenames[0])

                                eval_loss += tmp_loss
                                eval_steps += 1
                            except tf.errors.OutOfRangeError:
                                eval_steps = np.max([eval_steps, 1])
                                eval_loss /= eval_steps
                                if per_rank_output:
                                    print(
                                        "COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}"
                                        .format(comm_rank, epoch, num_epochs,
                                                eval_loss))
                                else:
                                    if comm_rank == 0:
                                        print(
                                            "COMPLETED: evaluation loss for epoch {} (of {}) is {}"
                                            .format(epoch, num_epochs,
                                                    eval_loss))
                                if per_rank_output:
                                    iou_score = sess.run(iou_op)
                                    print(
                                        "COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}"
                                        .format(comm_rank, epoch, num_epochs,
                                                iou_score))
                                else:
                                    iou_score = sess.run(iou_avg)
                                    if comm_rank == 0:
                                        print(
                                            "COMPLETED: evaluation IoU for epoch {} (of {}) is {}"
                                            .format(epoch, num_epochs,
                                                    iou_score))
                                sess.run(iou_reset_op)
                                sess.run(val_init_op,
                                         feed_dict={handle: val_handle})
                                break
                        nvtx.RangePop()  # Eval Loop

                        #reset counters
                        epoch += 1
                        step = 0
                        t_sustained_start = time.time()

                        nvtx.RangePop()  # Epoch
                        nvtx.RangePush("Epoch", epoch)

                except tf.errors.OutOfRangeError:
                    break

            nvtx.RangePop()  # Epoch
            nvtx.RangePop()  # Training Loop

        # write any cached traces to disk
        if tracing is not None:
            tracing_hook.write_traces()
Example #29
0
if __name__ == '__main__':

    args = get_args()

    if args.mixed_precision:
        policy = tf.keras.mixed_precision.Policy("mixed_float16")
        tf.keras.mixed_precision.set_global_policy(policy)

    # OMPI_COMM_WORLD_RANK is the global MPI rank; using it to pin the GPU
    # assumes a single-node run (multi-node jobs should use the local rank)
    local_rank = os.getenv("OMPI_COMM_WORLD_RANK")
    os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank)

    hvd.init()

    dense_variables, vocabulary_tensors, samples, labels = generate_data(args)

    sok_loss_list = run_sok_model(args, dense_variables, vocabulary_tensors,
                                  samples, labels)
    # average the per-rank SOK losses across all ranks
    for i in range(args.iter_num):
        sok_loss_list[i] = hvd.allreduce(sok_loss_list[i])

    if hvd.local_rank() == 0:
        tf_loss_list = run_tf_model(args, dense_variables, vocabulary_tensors,
                                    samples, labels)

    if hvd.local_rank() == 0:
        for i in range(args.iter_num):
            print('Iteration: {}, sok={}, tf={}'.format(
                i, sok_loss_list[i], tf_loss_list[i]))
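
The loop above averages each per-iteration loss across ranks so that rank 0 can compare it against the single-process TF baseline; with no `average`/`op` argument, `hvd.allreduce` defaults to averaging. A tiny eager-mode illustration (TF2, as the mixed-precision API above implies):

import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()
local_loss = tf.constant(float(hvd.rank()))  # rank i contributes the value i
avg_loss = hvd.allreduce(local_loss)         # mean over ranks by default
# with N ranks, every rank now sees (0 + 1 + ... + N-1) / N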
Example #30
0
def hvd_barrier():
    hvd.allreduce(tf.random.normal([1]))
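
This works as a barrier because an allreduce cannot complete until every rank has posted its tensor; the reduced value itself is discarded. Older Horovod releases expose no explicit barrier op, so a dummy allreduce is the usual workaround. A hypothetical usage sketch (`save_checkpoint`/`load_checkpoint` are placeholder helpers, and eager execution is assumed so the allreduce runs immediately):

if hvd.rank() == 0:
    save_checkpoint()  # placeholder: only rank 0 writes
hvd_barrier()          # no rank proceeds until all ranks reach this line
load_checkpoint()      # placeholder: now safe for every rank to read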
Example #31
0
def BatchNorm(inputs,
              axis=None,
              training=None,
              momentum=0.9,
              epsilon=1e-5,
              center=True,
              scale=True,
              beta_initializer=tf.zeros_initializer(),
              gamma_initializer=tf.ones_initializer(),
              virtual_batch_size=None,
              data_format='channels_last',
              internal_update=False,
              sync_statistics=None):
    """
    Almost equivalent to `tf.layers.batch_normalization`, but different (and more powerful)
    in the following:

    1. Accepts an alternative `data_format` option when `axis` is None. For 2D input, this argument will be ignored.
    2. Default value for `momentum` and `epsilon` is different.
    3. Default value for `training` is automatically obtained from tensorpack's `TowerContext`, but can be overwritten.
    4. Support the `internal_update` option, which cover more use cases than the standard collection-based update.
    5. Support the `sync_statistics` option, which is very useful in small-batch models.

    Args:
        internal_update (bool): if False, add EMA update ops to
          `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies.
          They are very similar in speed, but `internal_update=True` is recommended and can be helpful when:

          1. BatchNorm is used inside dynamic control flow.
             The collection-based update does not support dynamic control flows.
          2. BatchNorm layer is sometimes unused (e.g., when you have two networks to train alternatively).
             Putting all update ops into a single collection will waste a lot of compute.

          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
        sync_statistics (str or None): one of None, "nccl", or "horovod".

          By default (None), it uses statistics of the input tensor to normalize.
          This is the standard way BatchNorm was done in most frameworks.

          When set to "nccl", this layer must be used under tensorpack's multi-GPU trainers.
          It uses the aggregated statistics of the whole batch (across all GPUs) to normalize.

          When set to "horovod", this layer must be used under tensorpack's :class:`HorovodTrainer`.
          It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize.
          Note that on a single machine this is significantly slower than the "nccl" implementation.

          If not None, the per-GPU E[x] and E[x^2] are averaged among all GPUs to compute
          the global mean & variance. Therefore each GPU needs to have the same batch size.

          The synchronization is based on the current variable scope + the name of the layer
          (`BatchNorm('name', input)`). Therefore, you need to make sure that:

          1. The BatchNorm layer on different GPUs needs to have the same name, so that
             statistics can be synchronized. If names do not match, this layer will hang.
          2. Different BatchNorm layers in one tower cannot share the same name.
          3. A BatchNorm layer needs to be executed the same number of times by all GPUs.
             If different GPUs execute one BatchNorm layer a different number of times
             (e.g., if some GPUs do not execute it), this layer may hang.

          This option only has an effect in standard training mode.

          This option is also known as "Cross-GPU BatchNorm" as mentioned in:
          `MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222.

          When `sync_statistics` is enabled, `internal_update` will be set to True automatically.
          This is to avoid running `UPDATE_OPS`, which requires synchronization.

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        Combinations of ``training`` and ``ctx.is_training``:

        * ``training == ctx.is_training``: standard BN, EMA are maintained during training
          and used during inference. This is the default.
        * ``training and not ctx.is_training``: still use batch statistics in inference.
        * ``not training and ctx.is_training``: use EMA to normalize in
          training. This is useful when you load a pre-trained BN and
          don't want to fine tune the EMA. EMA will not be updated in
          this case.
    """
    # parse shapes
    data_format = get_data_format(data_format, keras_mode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4], ndims
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    if axis is None:
        if ndims == 2:
            axis = 1
        else:
            axis = 1 if data_format == 'NCHW' else 3
    assert axis in [1, 3], axis
    num_chan = shape[axis]

    # parse training/ctx
    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_tuple()
    freeze_bn_backward = not training and ctx.is_training
    if freeze_bn_backward:
        assert TF_version >= (1, 4), \
            "Fine tuning a BatchNorm model with fixed statistics needs TF>=1.4!"
        if ctx.is_main_training_tower:  # only warn in first tower
            logger.warn(
                "[BatchNorm] Using moving_mean/moving_variance in training.")
        # Using moving_mean/moving_variance in training, which means we
        # loaded a pre-trained BN and only fine-tuning the affine part.

    if sync_statistics is None or not (training and ctx.is_training):
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable({
                'moving_mean': 'mean/EMA',
                'moving_variance': 'variance/EMA'
        }):
            tf_args = dict(
                axis=axis,
                momentum=momentum,
                epsilon=epsilon,
                center=center,
                scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                # https://github.com/tensorflow/tensorflow/issues/10857#issuecomment-410185429
                fused=(ndims == 4 and axis in [1, 3]
                       and not freeze_bn_backward),
                _reuse=tf.get_variable_scope().reuse)
            if TF_version >= (1, 5):
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            use_fp16 = inputs.dtype == tf.float16
            if use_fp16:
                # non-fused does not support fp16; fused does not support all layouts.
                # we made our best guess here
                tf_args['fused'] = True
            layer = tf.layers.BatchNormalization(**tf_args)
            xn = layer.apply(inputs,
                             training=training,
                             scope=tf.get_variable_scope())

        # Maintaining the EMA on only one GPU is OK, even in replicated mode,
        # because the EMA isn't used during training.
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                if isinstance(v, tf.Variable):
                    tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, v)
        if not ctx.is_main_training_tower or internal_update:
            restore_collection(coll_bk)

        if training and internal_update:
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # for backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # for backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta
    else:
        red_axis = [0] if ndims == 2 else (
            [0, 2, 3] if axis == 1 else [0, 1, 2])

        new_shape = None  # don't need to reshape unless ...
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            num_dev = ctx.total
            if num_dev == 1:
                logger.warn(
                    "BatchNorm(sync_statistics='nccl') is used with only one tower!"
                )
            else:
                assert six.PY2 or TF_version >= (1, 10), \
                    "Cross-GPU BatchNorm is only supported in TF>=1.10. " \
                    "Upgrade TF or apply this patch manually: https://github.com/tensorflow/tensorflow/pull/20360"

                if TF_version <= (1, 12):
                    try:
                        from tensorflow.contrib.nccl.python.ops.nccl_ops import _validate_and_load_nccl_so
                    except Exception:
                        pass
                    else:
                        _validate_and_load_nccl_so()
                    from tensorflow.contrib.nccl.ops import gen_nccl_ops
                else:
                    from tensorflow.python.ops import gen_nccl_ops
                shared_name = re.sub('tower[0-9]+/', '',
                                     tf.get_variable_scope().name)
                batch_mean = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
                batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean_square,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean_square') * (1.0 /
                                                                      num_dev)
        elif sync_statistics == 'horovod':
            # Require https://github.com/uber/horovod/pull/331
            import horovod.tensorflow as hvd
            if hvd.size() == 1:
                logger.warn(
                    "BatchNorm(sync_statistics='horovod') is used with only one process!"
                )
            else:
                import horovod
                hvd_version = tuple(map(int, horovod.__version__.split('.')))
                assert hvd_version >= (
                    0, 13,
                    6), "sync_statistics=horovod needs horovod>=0.13.6 !"

                batch_mean = hvd.allreduce(batch_mean, average=True)
                batch_mean_square = hvd.allreduce(batch_mean_square,
                                                  average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            # Using fused_batch_norm(is_training=False) is actually slightly faster,
            # but hopefully this call will be JITed in the future.
            xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var,
                                           tf.reshape(beta, new_shape),
                                           tf.reshape(gamma, new_shape),
                                           epsilon)
        else:
            xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta,
                                           gamma, epsilon)

        if ctx.is_main_training_tower:
            ret = update_bn_ema(xn, batch_mean_vec, batch_var_vec, moving_mean,
                                moving_var, momentum)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,  # for backward-compatibility
            moving_variance=moving_var,
            variance=moving_var)  # for backward-compatibility
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret
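The docstring above relies on the identity Var[x] = E[x^2] - E[x]^2: each GPU
contributes its per-GPU E[x] and E[x^2], and averaging those moments (with equal
per-GPU batch sizes) reproduces the statistics of the full batch. A minimal
NumPy sketch checking that identity; all names here are illustrative and not
part of the example:

import numpy as np

per_gpu = [np.random.randn(8, 3) for _ in range(4)]  # 4 "GPUs", equal batch sizes
mean = np.mean([x.mean(axis=0) for x in per_gpu], axis=0)            # averaged E[x]
mean_sq = np.mean([(x ** 2).mean(axis=0) for x in per_gpu], axis=0)  # averaged E[x^2]
var = mean_sq - mean ** 2

full = np.concatenate(per_gpu, axis=0)  # the full (global) batch
assert np.allclose(mean, full.mean(axis=0))
assert np.allclose(var, full.var(axis=0))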
Example #32
0
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
              center=True, scale=True,
              beta_initializer=tf.zeros_initializer(),
              gamma_initializer=tf.ones_initializer(),
              virtual_batch_size=None,
              data_format='channels_last',
              internal_update=False,
              sync_statistics=None):
    """
    Almost equivalent to `tf.layers.batch_normalization`, but different (and more powerful)
    in the following:

    1. Accepts an alternative `data_format` option when `axis` is None. For 2D input, this argument will be ignored.
    2. Default values for `momentum` and `epsilon` are different.
    3. Default value for `training` is automatically obtained from tensorpack's `TowerContext`, but can be overwritten.
    4. Supports the `internal_update` option, which enables the use of the BatchNorm layer inside conditionals.
    5. Supports the `sync_statistics` option, which is very useful in small-batch models.

    Args:
        internal_update (bool): if False, add EMA update ops to
          `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies.
          They are very similar in speed, but `internal_update=True` can be used
          when you have conditionals in your model, or when you have multiple networks to train.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
        sync_statistics (str or None): one of None, "nccl", or "horovod".

          By default (None), it uses statistics of the input tensor to normalize.
          This is the standard way BatchNorm was done in most frameworks.

          When set to "nccl", this layer must be used under tensorpack's multi-GPU trainers.
          It uses the aggregated statistics of the whole batch (across all GPUs) to normalize.

          When set to "horovod", this layer must be used under tensorpack's :class:`HorovodTrainer`.
          It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize.
          Note that on a single machine this is significantly slower than the "nccl" implementation.

          This implementation averages the per-GPU E[x] and E[x^2] among GPUs to compute
          global mean & variance. Therefore each GPU needs to have the same batch size.

          This option has no effect when not training.

          This option is also known as "Cross-GPU BatchNorm" as mentioned in:
          `MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222.

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        Combinations of ``training`` and ``ctx.is_training``:

        * ``training == ctx.is_training``: standard BN, EMA are maintained during training
          and used during inference. This is the default.
        * ``training and not ctx.is_training``: still use batch statistics in inference.
        * ``not training and ctx.is_training``: use EMA to normalize in
          training. This is useful when you load a pre-trained BN and
          don't want to fine tune the EMA. EMA will not be updated in
          this case.
    """
    # parse shapes
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4], ndims
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        else:
            axis = 1 if data_format == 'NCHW' else 3
    else:
        data_format = 'NCHW' if axis == 1 else 'NHWC'
    num_chan = shape[axis]

    # parse training/ctx
    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_tuple()
    freeze_bn_backward = not training and ctx.is_training
    if freeze_bn_backward:
        assert TF_version >= (1, 4), \
            "Fine tuning a BatchNorm model with fixed statistics needs TF>=1.4!"
        if ctx.is_main_training_tower:  # only warn in first tower
            logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")
        # Using moving_mean/moving_variance in training, which means we
        # loaded a pre-trained BN and only fine-tuning the affine part.

    if sync_statistics is None or not (training and ctx.is_training):
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable(
                {'moving_mean': 'mean/EMA',
                    'moving_variance': 'variance/EMA'}):
            tf_args = dict(
                axis=axis,
                momentum=momentum, epsilon=epsilon,
                center=center, scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                # https://github.com/tensorflow/tensorflow/issues/10857#issuecomment-410185429
                fused=(ndims == 4 and axis in [1, 3] and not freeze_bn_backward),
                _reuse=tf.get_variable_scope().reuse)
            if TF_version >= (1, 5):
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            layer = tf.layers.BatchNormalization(**tf_args)
            xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope())

        # Maintaining the EMA on only one GPU is OK, even in replicated mode,
        # because the EMA isn't used during training.
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                if isinstance(v, tf.Variable):
                    tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, v)
        if not ctx.is_main_training_tower or internal_update:
            restore_collection(coll_bk)

        if training and internal_update:
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # for backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # for backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta
    else:
        red_axis = [0] if ndims == 2 else ([0, 2, 3] if axis == 1 else [0, 1, 2])

        new_shape = None  # don't need to reshape unless ...
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            num_dev = ctx.total
            if num_dev == 1:
                logger.warn("BatchNorm(sync_statistics='nccl') is used with only one tower!")
            else:
                assert six.PY2 or TF_version >= (1, 10), \
                    "Cross-GPU BatchNorm is only supported in TF>=1.10. " \
                    "Upgrade TF or apply this patch manually: https://github.com/tensorflow/tensorflow/pull/20360"

                from tensorflow.contrib.nccl.ops import gen_nccl_ops
                shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name)
                batch_mean = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
                batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean_square,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev)
        elif sync_statistics == 'horovod':
            # Require https://github.com/uber/horovod/pull/331
            import horovod.tensorflow as hvd
            if hvd.size() == 1:
                logger.warn("BatchNorm(sync_statistics='horovod') is used with only one process!")
            else:
                import horovod
                hvd_version = tuple(map(int, horovod.__version__.split('.')))
                assert hvd_version >= (0, 13, 6), "sync_statistics=horovod needs horovod>=0.13.6 !"

                batch_mean = hvd.allreduce(batch_mean, average=True)
                batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            # Using fused_batch_norm(is_training=False) is actually slightly faster,
            # but hopefully this call will be JITed in the future.
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                tf.reshape(beta, new_shape),
                tf.reshape(gamma, new_shape), epsilon)
        else:
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                beta, gamma, epsilon)

        if ctx.is_main_training_tower:
            ret = update_bn_ema(
                xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var,
                momentum, internal_update)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,  # for backward-compatibility
            moving_variance=moving_var,
            variance=moving_var)  # for backward-compatibility
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret
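To make the internal_update distinction above concrete, here is a hedged
TF1-style sketch (toy graph, illustrative names) of the two EMA-update styles:
the collection-based update that the caller must wire up, and the
control-dependency update that travels with the layer output.

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 4])
bn = tf.layers.BatchNormalization()
y = bn.apply(x, training=True)
loss = tf.reduce_mean(tf.square(y))

# internal_update=False: the EMA update ops land in the UPDATE_OPS collection
# and must be run together with the train op.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

# internal_update=True: the output itself depends on the update ops, so merely
# running the output updates the EMA; this also works inside dynamic control flow.
with tf.control_dependencies(bn.updates):
    y_updated = tf.identity(y)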
Example #33
0
    def _bn_train(self, inputs):
        """学習時のBN。"""
        # self.axisを除く軸で平均・分散を算出する
        target_axes = self._get_target_axes(ndims=inputs.shape.rank)
        stat_axes = [
            a for a in range(inputs.shape.rank) if a not in target_axes
        ]

        # compute mean and variance
        x = tf.cast(inputs, tf.float32)
        if tf.version.VERSION.startswith("2.2"):  # workaround
            x = tf.debugging.assert_all_finite(x, "x")
        mean = tf.math.reduce_mean(x, axis=stat_axes)
        squared_mean = tf.math.reduce_mean(tf.math.square(x), axis=stat_axes)
        # if tf.version.VERSION.startswith("2.2"):  # workaround
        mean = tf.debugging.assert_all_finite(mean, "mean")
        squared_mean = tf.debugging.assert_all_finite(squared_mean,
                                                      "squared_mean")

        # Sync BN
        if tk.hvd.initialized():
            import horovod.tensorflow as _hvd

            mean = _hvd.allreduce(mean, op=_hvd.Average)
            squared_mean = _hvd.allreduce(squared_mean, op=_hvd.Average)
        else:
            replica_context = tf.distribute.get_replica_context()
            if replica_context is not None:
                mean = replica_context.all_reduce(tf.distribute.ReduceOp.MEAN,
                                                  mean)
                squared_mean = replica_context.all_reduce(
                    tf.distribute.ReduceOp.MEAN, squared_mean)
            else:
                strategy = tf.distribute.get_strategy()
                mean = strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                       mean,
                                       axis=None)
                squared_mean = strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                               squared_mean,
                                               axis=None)

        var = squared_mean - tf.math.square(mean)
        if tf.version.VERSION.startswith("2.2"):  # workaround
            mean = tf.debugging.assert_all_finite(mean, "reduced mean")
            var = tf.debugging.assert_all_finite(var, "reduced var")

        # exponential moving average:
        # m_new = m_old * momentum + x * (1 - momentum)
        # i.e. m_new - m_old = (x - m_old) * (1 - momentum)
        decay = 1 - self.momentum
        self.add_update([
            self.moving_mean.assign_add(
                (mean - self.moving_mean) * decay,
                read_value=False,
            ),
            self.moving_variance.assign_add(
                (var - self.moving_variance) * decay,
                read_value=False,
            ),
        ])

        # y = (x - mean) / sqrt(var + epsilon) * gamma + beta
        #   = x * gamma / sqrt(var + epsilon) + (beta - mean * gamma / sqrt(var + epsilon))
        #   = x * a + (beta - mean * a)
        a = self.gamma * tf.math.rsqrt(var + self.epsilon)
        b = self.beta - mean * a
        return K.cast(x * a + b, K.dtype(inputs))
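The comments above fold the normalization into a single affine transform
y = x * a + b. A quick NumPy sanity check of that identity, with illustrative
values that are not part of the example:

import numpy as np

x = np.random.randn(8, 3).astype(np.float32)
gamma, beta, eps = 1.5, 0.25, 1e-3
mean, var = x.mean(axis=0), x.var(axis=0)

y_direct = (x - mean) / np.sqrt(var + eps) * gamma + beta
a = gamma / np.sqrt(var + eps)  # gamma * rsqrt(var + eps)
b = beta - mean * a
assert np.allclose(y_direct, x * a + b, atol=1e-5)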
Example #34
0
def main(device, input_path_train, input_path_validation, dummy_data,
         downsampling_fact, downsampling_mode, channels, data_format, label_id,
         weights, image_dir, checkpoint_dir, trn_sz, val_sz, loss_type, model,
         decoder, fs_type, optimizer, batch, batchnorm, num_epochs, dtype,
         disable_checkpoints, disable_imsave, tracing, trace_dir,
         output_sampling, scale_factor, intra_threads, inter_threads):
    #init horovod
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        #not all horovod versions have that implemented
        try:
            comm_local_size = hvd.local_size()
        except Exception:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks".
                  format(comm_size, comm_rank))

    #downsampling? recompute image dims
    image_height = image_height_orig // downsampling_fact
    image_width = image_width_orig // downsampling_fact

    #parameters
    per_rank_output = False
    loss_print_interval = 1

    #session config
    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=inter_threads,  #6
        intra_op_parallelism_threads=intra_threads,  #1
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)
    sess_config.gpu_options.force_gpu_compatible = True

    #get data
    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    train_files = load_data(input_path_train, True, trn_sz, horovod)
    valid_files = load_data(input_path_validation, False, val_sz, horovod)

    #print some stats
    if comm_rank == 0:
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Decoder: {}".format(decoder))
        print("Batch normalization: {}".format(batchnorm))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Loss scale factor: {}".format(scale_factor))
        print("Output sampling target: {}".format(output_sampling))
        #print optimizer parameters
        for k, v in optimizer.items():
            print("Solver Parameters: {k}: {v}".format(k=k, v=v))
        print("Num training samples: {}".format(train_files.shape[0]))
        print("Num validation samples: {}".format(valid_files.shape[0]))
        if dummy_data:
            print("Using synthetic dummy data")
        print("Disable checkpoints: {}".format(disable_checkpoints))
        print("Disable image save: {}".format(disable_imsave))

    #compute epochs and stuff:
    if fs_type == "local":
        num_samples = train_files.shape[0] // comm_local_size
    else:
        num_samples = train_files.shape[0] // comm_size
    print("num_samples: {} batch: {}".format(num_samples, batch))
    num_steps_per_epoch = num_samples // batch
    num_steps = num_epochs * num_steps_per_epoch
    if comm_rank == 0:
        print("Number of steps per epoch: {}".format(num_steps_per_epoch))
        print("Number of steps in total: {}".format(num_steps))
    if per_rank_output:
        print("Rank {} does {} steps per epoch".format(comm_rank,
                                                       num_steps_per_epoch))

    with training_graph.as_default():

        if dummy_data:
            dummy_data_args = dict(batchsize=batch,
                                   data_format=data_format,
                                   dtype=dtype)
            trn_dataset = create_dummy_dataset(n_samples=trn_sz,
                                               num_epochs=num_epochs,
                                               **dummy_data_args)
            val_dataset = create_dummy_dataset(n_samples=val_sz,
                                               num_epochs=1,
                                               **dummy_data_args)
        else:
            #create readers
            trn_reader = h5_input_reader(input_path_train,
                                         channels,
                                         weights,
                                         dtype,
                                         normalization_file="stats.h5",
                                         update_on_read=False,
                                         data_format=data_format,
                                         label_id=label_id,
                                         sample_target=output_sampling)
            val_reader = h5_input_reader(input_path_validation,
                                         channels,
                                         weights,
                                         dtype,
                                         normalization_file="stats.h5",
                                         update_on_read=False,
                                         data_format=data_format,
                                         label_id=label_id)
            #create datasets
            if fs_type == "local":
                trn_dataset = create_dataset(trn_reader,
                                             train_files,
                                             batch,
                                             num_epochs,
                                             comm_local_size,
                                             comm_local_rank,
                                             dtype,
                                             shuffle=True)
                val_dataset = create_dataset(val_reader,
                                             valid_files,
                                             batch,
                                             1,
                                             comm_local_size,
                                             comm_local_rank,
                                             dtype,
                                             shuffle=False)
            else:
                trn_dataset = create_dataset(trn_reader,
                                             train_files,
                                             batch,
                                             num_epochs,
                                             comm_size,
                                             comm_rank,
                                             dtype,
                                             shuffle=True)
                val_dataset = create_dataset(val_reader,
                                             valid_files,
                                             batch,
                                             1,
                                             comm_size,
                                             comm_rank,
                                             dtype,
                                             shuffle=False)

        #create iterators
        handle = tf.placeholder(tf.string,
                                shape=[],
                                name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (dtype, tf.int32, dtype, tf.string),
            ((batch, len(channels), image_height_orig,
              image_width_orig) if data_format == "channels_first" else
             (batch, image_height_orig, image_width_orig, len(channels)),
             (batch, image_height_orig, image_width_orig),
             (batch, image_height_orig, image_width_orig), (batch)))
        next_elem = iterator.get_next()

        #if downsampling, do some preprocessing
        if downsampling_fact != 1:
            if downsampling_mode == "scale":
                #do downsampling
                rand_select = tf.cast(tf.one_hot(tf.random_uniform(
                    (batch, image_height, image_width),
                    minval=0,
                    maxval=downsampling_fact * downsampling_fact,
                    dtype=tf.int32),
                                                 depth=downsampling_fact *
                                                 downsampling_fact,
                                                 axis=-1),
                                      dtype=tf.int32)
                next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format), \
                             tf.reduce_max(tf.multiply(tf.image.extract_image_patches(tf.expand_dims(next_elem[1], axis=-1), \
                                                                                 [1, downsampling_fact, downsampling_fact, 1], \
                                                                                 [1, downsampling_fact, downsampling_fact, 1], \
                                                                                 [1,1,1,1], 'VALID'), rand_select), axis=-1), \
                             tf.squeeze(tf.layers.average_pooling2d(tf.expand_dims(next_elem[2], axis=-1), downsampling_fact, downsampling_fact, 'valid', "channels_last"), axis=-1), \
                             next_elem[3])
            elif downsampling_mode == "center-crop":
                #some parameters
                length = 1. / float(downsampling_fact)
                offset = length / 2.
                boxes = [[offset, offset, offset + length, offset + length]
                         ] * batch
                box_ind = list(range(0, batch))
                crop_size = [image_height, image_width]

                #be careful with data order
                if data_format == "channels_first":
                    next_elem[0] = tf.transpose(next_elem[0],
                                                perm=[0, 2, 3, 1])

                #crop
                next_elem = (tf.image.crop_and_resize(next_elem[0], boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="data_cropping"), \
                             ensure_type(tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[1],axis=-1), boxes, box_ind, crop_size, method='nearest', extrapolation_value=0, name="label_cropping"), axis=-1), tf.int32), \
                             tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[2],axis=-1), boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="weight_cropping"), axis=-1), \
                             next_elem[3])

                #be careful with data order
                if data_format == "channels_first":
                    next_elem[0] = tf.transpose(next_elem[0],
                                                perm=[0, 3, 1, 2])

            else:
                raise ValueError(
                    "Error, downsampling mode {} not supported. Supported are [center-crop, scale]"
                    .format(downsampling_mode))

        #create init handles
        #trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        #val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        #compute the input filter number based on number of channels used
        num_channels = len(channels)
        #set up model
        model = deeplab_v3_plus_generator(num_classes=3,
                                          output_stride=8,
                                          base_architecture=model,
                                          decoder=decoder,
                                          batchnorm=batchnorm,
                                          pre_trained_model=None,
                                          batch_norm_decay=None,
                                          data_format=data_format)

        logit, prediction = model(next_elem[0], True, dtype)

        #set up loss
        loss = None

        #cast the logits to fp32
        logit = ensure_type(logit, tf.float32)
        if loss_type == "weighted":
            #cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1],
                logits=logit,
                weights=w_cast,
                reduction=tf.losses.Reduction.SUM)
            if scale_factor != 1.0:
                loss *= scale_factor

        elif loss_type == "weighted_mean":
            #cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1],
                logits=logit,
                weights=w_cast,
                reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
            if scale_factor != 1.0:
                loss *= scale_factor

        elif loss_type == "focal":
            #one-hot-encode
            labels_one_hot = tf.contrib.layers.one_hot_encoding(
                next_elem[1], 3)
            #cast to FP32
            labels_one_hot = ensure_type(labels_one_hot, tf.float32)
            loss = focal_loss(onehot_labels=labels_one_hot,
                              logits=logit,
                              alpha=1.,
                              gamma=2.)

        else:
            raise ValueError(
                "Error, loss type {} not supported.".format(loss_type))

        #determine flops
        flops = graph_flops.graph_flops(
            format="NHWC" if data_format == "channels_last" else "NCHW",
            verbose=False,
            batch=batch,
            sess_config=sess_config)
        flops *= comm_size
        if comm_rank == 0:
            print('training flops: {:.3f} TF/step'.format(flops * 1e-12))

        #number of trainable parameters
        if comm_rank == 0:
            num_params = get_number_of_trainable_parameters()
            print('number of trainable parameters: {} ({} MB)'.format(
                num_params,
                num_params * (4 if dtype == tf.float32 else 2) * (2**-20)))

        if horovod:
            loss_avg = hvd.allreduce(ensure_type(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)
        tmpl = (loss if per_rank_output else loss_avg)

        #set up global step - keep on CPU
        with tf.device('/device:CPU:0'):
            global_step = tf.train.get_or_create_global_step()

        #set up optimizer
        if optimizer['opt_type'].startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op, lr = get_larc_optimizer(optimizer, loss, global_step,
                                              num_steps_per_epoch, horovod)
        else:
            train_op, lr = get_optimizer(optimizer, loss, global_step,
                                         num_steps_per_epoch, horovod)

        #set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(labels=next_elem[1],
                                                    predictions=tf.argmax(
                                                        prediction, axis=3),
                                                    num_classes=3,
                                                    weights=None,
                                                    metrics_collections=None,
                                                    updates_collections=None,
                                                    name="iou_score")
        iou_reset_op = tf.variables_initializer([
            i for i in tf.local_variables() if i.name.startswith('iou_score/')
        ])

        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)

        if "gpu" in device.lower():
            with tf.device(device):
                mem_usage_ops = [
                    tf.contrib.memory_stats.MaxBytesInUse(),
                    tf.contrib.memory_stats.BytesLimit()
                ]
        #hooks
        #these hooks are essential. regularize the step hook by adding one additional step at the end
        #hooks = [tf.train.StopAtStepHook(last_step=3)]
        #hooks = [tf.train.StopAtStepHook(num_steps=3)]
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]
        nvtx_callback = NVTXHook(skip_n_steps=0, name='TTTTTrain')
        hooks.append(nvtx_callback)
        #bcast init for bcasting the model after start
        if horovod:
            init_bcast = hvd.broadcast_global_variables(0)
        #initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        #checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = 5 * num_steps_per_epoch
            checkpoint_saver = tf.train.Saver(max_to_keep=1000)
            if (not disable_checkpoints):
                hooks.append(
                    tf.train.CheckpointSaverHook(
                        checkpoint_dir=checkpoint_dir,
                        save_steps=checkpoint_save_freq,
                        saver=checkpoint_saver))
            #create image dir if not exists
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)

        #tracing
        if tracing is not None:
            import tracehook
            tracing_hook = tracehook.TraceHook(steps_to_trace=tracing,
                                               cache_traces=True,
                                               trace_dir=trace_dir)
            hooks.append(tracing_hook)
            print("############ tracing enabled")

        # instead of averaging losses over an entire epoch, use a moving
        #  window average
        recent_losses = []
        loss_window_size = 10

        #start session
        with tf.train.MonitoredTrainingSession(config=sess_config,
                                               hooks=hooks) as sess:
            #initialize
            sess.run([init_op, init_local_op])
            #restore from checkpoint:
            if comm_rank == 0 and not disable_checkpoints:
                load_model(sess, checkpoint_saver, checkpoint_dir)
            #broadcast loaded model variables
            if horovod:
                sess.run(init_bcast)
            #create iterator handles
            trn_handle, val_handle = sess.run(
                [trn_handle_string, val_handle_string])
            #init iterators
            sess.run(trn_init_op, feed_dict={handle: trn_handle})
            sess.run(val_init_op, feed_dict={handle: val_handle})

            # figure out what step we're on (it won't be 0 if we are
            #  restoring from a checkpoint) so we can count from there
            train_steps = sess.run([global_step])[0]

            #do the training
            epoch = 1
            step = 1

            prev_mem_usage = 0
            t_sustained_start = time.time()
            r_peak = 0

            #warmup loops
            print("### Warmup for 5 steps")
            start_time = time.time()
            #while not sess.should_stop():
            for _ in range(5):
                #try:
                print('warmup train_steps is {}'.format(train_steps))
                if train_steps == 5:
                    #                    if have_pycuda:
                    #                        pyc.driver.start_profiler()
                    print(train_steps)
                _ = sess.run([train_op], feed_dict={handle: trn_handle})
                #tmp_loss = sess.run([(loss if per_rank_output else loss_avg)],feed_dict={handle: trn_handle})

                if train_steps == 5:
                    #                    if have_pycuda:
                    #                        pyc.driver.stop_profiler()
                    print(train_steps)

                train_steps += 1

            end_time = time.time()
            print("### Warmup time: {:0.2f}".format(end_time - start_time))

            ### Start profiling
            print('Begin training loop')
            #if have_cupy:
            #cupy.cuda.profiler.start()
            #            if have_pycuda:
            #                pyc.driver.start_profiler()
            #while not sess.should_stop():
            for _ in range(1):
                try:
                    print('train_steps is {}'.format(train_steps))
                    if train_steps == 5:
                        if have_pycuda:
                            pyc.driver.start_profiler()
                        print(train_steps)
                    _ = sess.run([tmpl], feed_dict={handle: trn_handle})
                    #                    _ = sess.run([train_op],feed_dict={handle: trn_handle})
                    if train_steps == 5:
                        if have_pycuda:
                            pyc.driver.stop_profiler()
                        print(train_steps)
                    train_steps += 1
                except tf.errors.OutOfRangeError:
                    break


#            if have_pycuda:
#                pyc.driver.stop_profiler()

### End of profiling
#if have_cupy:
#    cupy.cuda.profiler.stop()

# write any cached traces to disk
        if tracing is not None:
            tracing_hook.write_traces()

    print('All done')
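The function above follows the usual Horovod TF1 recipe: hvd.init(), pin one
GPU per local rank, wrap the optimizer, and broadcast the initial variables
from rank 0. A stripped-down, hedged sketch of that recipe with a toy model
(names and hyperparameters are illustrative):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())  # one GPU per process

w = tf.Variable(1.0)
loss = tf.square(w)
global_step = tf.train.get_or_create_global_step()
opt = hvd.DistributedOptimizer(tf.train.GradientDescentOptimizer(0.1 * hvd.size()))
train_op = opt.minimize(loss, global_step=global_step)

hooks = [hvd.BroadcastGlobalVariablesHook(0),  # sync initial weights from rank 0
         tf.train.StopAtStepHook(last_step=100)]
with tf.train.MonitoredTrainingSession(config=config, hooks=hooks) as sess:
    while not sess.should_stop():
        sess.run(train_op)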
Example #35
0
def main():
    script_start = time.time()
    hvd_init()
    mpi_comm = MPI.COMM_WORLD
    args = parse_args()

    if hvd.rank() == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is not None:
        tf.random.set_random_seed(args.seed)
        np.random.seed(args.seed)
        cp.random.seed(args.seed)

    if args.amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
    if "TF_ENABLE_AUTO_MIXED_PRECISION" in os.environ \
       and os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] == "1":
        args.fp16 = False

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)
    final_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.ckpt')

    # Load converted data and get statistics
    train_df = pd.read_pickle(args.data + '/train_ratings.pickle')
    test_df = pd.read_pickle(args.data + '/test_ratings.pickle')
    nb_users, nb_items = train_df.max() + 1

    # Extract train and test feature tensors from dataframe
    pos_train_users = train_df.iloc[:, 0].values.astype(np.int32)
    pos_train_items = train_df.iloc[:, 1].values.astype(np.int32)
    pos_test_users = test_df.iloc[:, 0].values.astype(np.int32)
    pos_test_items = test_df.iloc[:, 1].values.astype(np.int32)
    # Negatives indicator for negatives generation
    neg_mat = np.ones((nb_users, nb_items), dtype=np.bool)
    neg_mat[pos_train_users, pos_train_items] = 0

    # Get the local training/test data
    train_users, train_items, train_labels = get_local_train_data(
        pos_train_users, pos_train_items, args.negative_samples)
    test_users, test_items = get_local_test_data(pos_test_users,
                                                 pos_test_items)

    # Create and run Data Generator in a separate thread
    data_generator = DataGenerator(
        args.seed,
        hvd.rank(),
        nb_users,
        nb_items,
        neg_mat,
        train_users,
        train_items,
        train_labels,
        args.batch_size // hvd.size(),
        args.negative_samples,
        test_users,
        test_items,
        args.valid_users_per_batch,
        args.valid_negative,
    )

    # Create tensorflow session and saver
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if args.xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    sess = tf.Session(config=config)

    # Input tensors
    users = tf.placeholder(tf.int32, shape=(None, ))
    items = tf.placeholder(tf.int32, shape=(None, ))
    labels = tf.placeholder(tf.int32, shape=(None, ))
    is_dup = tf.placeholder(tf.float32, shape=(None, ))
    dropout = tf.placeholder_with_default(args.dropout, shape=())
    # Model ops and saver
    hit_rate, ndcg, eval_op, train_op = ncf_model_ops(
        users,
        items,
        labels,
        is_dup,
        params={
            'fp16': args.fp16,
            'val_batch_size': args.valid_negative + 1,
            'top_k': args.topk,
            'learning_rate': args.learning_rate,
            'beta_1': args.beta1,
            'beta_2': args.beta2,
            'epsilon': args.eps,
            'num_users': nb_users,
            'num_items': nb_items,
            'num_factors': args.factors,
            'mf_reg': 0,
            'layer_sizes': args.layers,
            'layer_regs': [0. for i in args.layers],
            'dropout': dropout,
            'sigmoid': True,
            'loss_scale': args.loss_scale
        },
        mode='TRAIN' if args.mode == 'train' else 'EVAL')
    saver = tf.train.Saver()

    # Accuracy metric tensors
    hr_sum = tf.get_default_graph().get_tensor_by_name(
        'neumf/hit_rate/total:0')
    hr_cnt = tf.get_default_graph().get_tensor_by_name(
        'neumf/hit_rate/count:0')
    ndcg_sum = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/total:0')
    ndcg_cnt = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/count:0')

    # Prepare evaluation data
    data_generator.prepare_eval_data()

    if args.load_checkpoint_path:
        saver.restore(sess, args.load_checkpoint_path)
    else:
        # Manual initialize weights
        sess.run(tf.global_variables_initializer())

    # If test mode, run one eval
    if args.mode == 'test':
        sess.run(tf.local_variables_initializer())
        eval_start = time.time()
        for user_batch, item_batch, dup_batch \
            in zip(data_generator.eval_users, data_generator.eval_items, data_generator.dup_mask):
            sess.run(eval_op,
                     feed_dict={
                         users: user_batch,
                         items: item_batch,
                         is_dup: dup_batch,
                         dropout: 0.0
                     })
        eval_duration = time.time() - eval_start

        # Report results
        hit_rate_sum = sess.run(hvd.allreduce(hr_sum, average=False))
        hit_rate_cnt = sess.run(hvd.allreduce(hr_cnt, average=False))
        ndcg_sum = sess.run(hvd.allreduce(ndcg_sum, average=False))
        ndcg_cnt = sess.run(hvd.allreduce(ndcg_cnt, average=False))

        hit_rate = hit_rate_sum / hit_rate_cnt
        ndcg = ndcg_sum / ndcg_cnt

        if hvd.rank() == 0:
            eval_throughput = pos_test_users.shape[0] * (args.valid_negative +
                                                         1) / eval_duration
            dllogger.log(step=tuple(),
                         data={
                             'eval_throughput': eval_throughput,
                             'eval_time': eval_duration,
                             'hr@10': hit_rate,
                             'ndcg': ndcg
                         })
        return

    # Performance Metrics
    train_times = list()
    eval_times = list()
    # Accuracy Metrics
    first_to_target = None
    time_to_train = 0.0
    best_hr = 0
    best_epoch = 0
    # Buffers for global metrics
    global_hr_sum = np.ones(1)
    global_hr_count = np.ones(1)
    global_ndcg_sum = np.ones(1)
    global_ndcg_count = np.ones(1)
    # Buffers for local metrics
    local_hr_sum = np.ones(1)
    local_hr_count = np.ones(1)
    local_ndcg_sum = np.ones(1)
    local_ndcg_count = np.ones(1)

    # Begin training
    begin_train = time.time()
    for epoch in range(args.epochs):
        # Train for one epoch
        train_start = time.time()
        data_generator.prepare_train_data()
        for user_batch, item_batch, label_batch \
            in zip(data_generator.train_users_batches,
                   data_generator.train_items_batches,
                   data_generator.train_labels_batches):
            sess.run(train_op,
                     feed_dict={
                         users: user_batch.get(),
                         items: item_batch.get(),
                         labels: label_batch.get()
                     })
        train_duration = time.time() - train_start
        # Only log "warm" epochs
        if epoch >= 1:
            train_times.append(train_duration)
        # Evaluate
        if epoch > args.eval_after:
            eval_start = time.time()
            sess.run(tf.local_variables_initializer())
            for user_batch, item_batch, dup_batch \
                in zip(data_generator.eval_users,
                       data_generator.eval_items,
                       data_generator.dup_mask):
                sess.run(eval_op,
                         feed_dict={
                             users: user_batch,
                             items: item_batch,
                             is_dup: dup_batch,
                             dropout: 0.0
                         })
            # Compute local metrics
            local_hr_sum[0] = sess.run(hr_sum)
            local_hr_count[0] = sess.run(hr_cnt)
            local_ndcg_sum[0] = sess.run(ndcg_sum)
            local_ndcg_count[0] = sess.run(ndcg_cnt)
            # Reduce metrics across all workers
            mpi_comm.Reduce(local_hr_count, global_hr_count)
            mpi_comm.Reduce(local_hr_sum, global_hr_sum)
            mpi_comm.Reduce(local_ndcg_count, global_ndcg_count)
            mpi_comm.Reduce(local_ndcg_sum, global_ndcg_sum)
            # Calculate metrics
            hit_rate = global_hr_sum[0] / global_hr_count[0]
            ndcg = global_ndcg_sum[0] / global_ndcg_count[0]

            eval_duration = time.time() - eval_start
            # Only log "warm" epochs
            if epoch >= 1:
                eval_times.append(eval_duration)

            if hvd.rank() == 0:
                dllogger.log(step=(epoch, ),
                             data={
                                 'train_time': train_duration,
                                 'eval_time': eval_duration,
                                 'hr@10': hit_rate,
                                 'ndcg': ndcg
                             })

                # Update summary metrics
                if hit_rate > args.target and first_to_target is None:
                    first_to_target = epoch
                    time_to_train = time.time() - begin_train
                if hit_rate > best_hr:
                    best_hr = hit_rate
                    best_epoch = epoch
                    time_to_best = time.time() - begin_train
                    if hit_rate > args.target:
                        saver.save(sess, final_checkpoint_path)

    # Final Summary
    if hvd.rank() == 0:
        train_times = np.array(train_times)
        train_throughputs = pos_train_users.shape[0] * (args.negative_samples +
                                                        1) / train_times
        eval_times = np.array(eval_times)
        eval_throughputs = pos_test_users.shape[0] * (args.valid_negative +
                                                      1) / eval_times

        dllogger.log(step=tuple(),
                     data={
                         'average_train_time_per_epoch': np.mean(train_times),
                         'average_train_throughput':
                         np.mean(train_throughputs),
                         'average_eval_time_per_epoch': np.mean(eval_times),
                         'average_eval_throughput': np.mean(eval_throughputs),
                         'first_epoch_to_hit': first_to_target,
                         'time_to_train': time_to_train,
                         'time_to_best': time_to_best,
                         'best_hr': best_hr,
                         'best_epoch': best_epoch
                     })
        dllogger.flush()

    sess.close()
    return
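One pattern worth noting in the test-mode path above: the metric numerators and
denominators are all-reduced separately with average=False and divided
afterwards, which stays correct even when ranks evaluate different numbers of
samples (averaging per-rank ratios would not). A minimal hedged sketch, with
placeholder constants standing in for the real per-rank sums:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
local_hits = tf.constant(42.0)    # placeholder per-rank numerator
local_count = tf.constant(100.0)  # placeholder per-rank denominator
global_hits = hvd.allreduce(local_hits, average=False)   # sum across ranks
global_count = hvd.allreduce(local_count, average=False)
hit_rate = global_hits / global_count

with tf.Session() as sess:
    print(sess.run(hit_rate))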
Example #36
0
def all_reduce_fn(tensor):
    return hvd.allreduce(tensor, compression=hvd.Compression.fp16)
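hvd.Compression.fp16 above casts tensors to float16 for transmission and back
afterwards, roughly halving allreduce traffic at some precision cost. The same
knob can be applied to every gradient allreduce through the distributed
optimizer; a hedged sketch, assuming a Horovod version that supports the
compression argument:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
opt = tf.train.AdamOptimizer(1e-3)
# all gradient allreduces now compress to fp16 on the wire
opt = hvd.DistributedOptimizer(opt, compression=hvd.Compression.fp16)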
Example #37
0
def hvd_group_all_reduce(ts):
    import horovod.tensorflow as hvd
    return [hvd.allreduce(t, average=False) for t in ts]
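A possible usage of the helper above; with average=False each returned entry is
the cross-worker sum, not the mean (the tensors here are illustrative). Newer
Horovod releases also provide a fused grouped allreduce, where available:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
ts = [tf.constant(1.0), tf.constant([2.0, 3.0])]
summed = hvd_group_all_reduce(ts)  # one summed tensor per input tensor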
Example #38
0
def train_once(
    sess,
    step,
    ops,
    names=None,
    gen_feed_dict_fn=None,
    deal_results_fn=None,
    interval_steps=100,
    eval_ops=None,
    eval_names=None,
    gen_eval_feed_dict_fn=None,
    deal_eval_results_fn=melt.print_results,
    valid_interval_steps=100,
    print_time=True,
    print_avg_loss=True,
    model_dir=None,
    log_dir=None,
    is_start=False,
    num_steps_per_epoch=None,
    metric_eval_fn=None,
    metric_eval_interval_steps=0,
    summary_excls=None,
    fixed_step=None,  # for epoch only, in case you change batch size
    eval_loops=1,
    learning_rate=None,
    learning_rate_patience=None,
    learning_rate_decay_factor=None,
    num_epochs=None,
    model_path=None,
    use_horovod=False,
):
    use_horovod = 'OMPI_COMM_WORLD_RANK' in os.environ

    #is_start = False # force not to evaluate at first step
    #print('-----------------global_step', sess.run(tf.train.get_or_create_global_step()))
    timer = gezi.Timer()
    if print_time:
        if not hasattr(train_once, 'timer'):
            train_once.timer = Timer()
            train_once.eval_timer = Timer()
            train_once.metric_eval_timer = Timer()

    melt.set_global('step', step)
    epoch = (fixed_step
             or step) / num_steps_per_epoch if num_steps_per_epoch else -1
    if not num_epochs:
        epoch_str = 'epoch:%.3f' % (epoch) if num_steps_per_epoch else ''
    else:
        epoch_str = 'epoch:%.3f/%d' % (
            epoch, num_epochs) if num_steps_per_epoch else ''
    melt.set_global('epoch', '%.2f' % (epoch))

    info = IO()
    stop = False

    if eval_names is None:
        if names:
            eval_names = list(names)

    if names:
        names = ['train/' + x for x in names]

    if eval_names:
        eval_names = ['eval/' + x for x in eval_names]

    is_eval_step = is_start or (valid_interval_steps and step % valid_interval_steps == 0)
    summary_str = []

    eval_str = ''
    if is_eval_step:
        # deal with summary
        if log_dir:
            if not hasattr(train_once, 'summary_op'):
                #melt.print_summary_ops()
                if summary_excls is None:
                    train_once.summary_op = tf.summary.merge_all()
                else:
                    summary_ops = []
                    for op in tf.get_collection(tf.GraphKeys.SUMMARIES):
                        if all(summary_excl not in op.name
                               for summary_excl in summary_excls):
                            summary_ops.append(op)
                    print('filtered summary_ops:')
                    for op in summary_ops:
                        print(op)
                    train_once.summary_op = tf.summary.merge(summary_ops)

                #train_once.summary_train_op = tf.summary.merge_all(key=melt.MonitorKeys.TRAIN)
                train_once.summary_writer = tf.summary.FileWriter(
                    log_dir, sess.graph)

                tf.contrib.tensorboard.plugins.projector.visualize_embeddings(
                    train_once.summary_writer, projector_config)

        # if eval_ops are set, this should have been rank 0

        if eval_ops:
            #if deal_eval_results_fn is None and eval_names is not None:
            #  deal_eval_results_fn = lambda x: melt.print_results(x, eval_names)
            for i in range(eval_loops):
                eval_feed_dict = {} if gen_eval_feed_dict_fn is None else gen_eval_feed_dict_fn(
                )
                #eval_feed_dict.update(feed_dict)

                # if using horovod, let each rank run the same sess.run!
                if not log_dir or train_once.summary_op is None or gezi.env_has(
                        'EVAL_NO_SUMMARY') or use_horovod:
                    #if not log_dir or train_once.summary_op is None:
                    eval_results = sess.run(eval_ops, feed_dict=eval_feed_dict)
                else:
                    eval_results = sess.run(eval_ops + [train_once.summary_op],
                                            feed_dict=eval_feed_dict)
                    summary_str = eval_results[-1]
                    eval_results = eval_results[:-1]
                eval_loss = gezi.get_singles(eval_results)
                #timer_.print()
                eval_stop = False
                if use_horovod:
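                    # dummy allreduce acts as a barrier, keeping all ranks in lockstep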
                    sess.run(hvd.allreduce(tf.constant(0)))

                #if not use_horovod or  hvd.local_rank() == 0:
                # @TODO user print should also use logging as a must ?
                #print(gezi.now_time(), epoch_str, 'eval_step: %d'%step, 'eval_metrics:', end='')
                eval_names_ = melt.adjust_names(eval_loss, eval_names)
                #if not use_horovod or hvd.rank() == 0:
                #  logging.info2('{} eval_step:{} eval_metrics:{}'.format(epoch_str, step, melt.parse_results(eval_loss, eval_names_)))
                eval_str = 'valid:{}'.format(
                    melt.parse_results(eval_loss, eval_names_))

                # if deal_eval_results_fn is not None:
                #   eval_stop = deal_eval_results_fn(eval_results)

                assert len(eval_loss) > 0
                if eval_stop is True:
                    stop = True
                eval_names_ = melt.adjust_names(eval_loss, eval_names)
                if not use_horovod or hvd.rank() == 0:
                    melt.set_global('eval_loss',
                                    melt.parse_results(eval_loss, eval_names_))

        elif interval_steps != valid_interval_steps:
            #print()
            pass

    metric_evaluate = False

    # if metric_eval_fn is not None \
    #   and (is_start \
    #     or (num_steps_per_epoch and step % num_steps_per_epoch == 0) \
    #          or (metric_eval_interval_steps \
    #              and step % metric_eval_interval_steps == 0)):
    #  metric_evaluate = True

    if metric_eval_fn is not None \
      and ((is_start or metric_eval_interval_steps \
           and step % metric_eval_interval_steps == 0) or model_path):
        metric_evaluate = True

    if 'EVFIRST' in os.environ:
        if os.environ['EVFIRST'] == '0':
            if is_start:
                metric_evaluate = False
        else:
            if is_start:
                metric_evaluate = True

    if step == 0 or 'QUICK' in os.environ:
        metric_evaluate = False

    #print('------------1step', step, 'pre metric_evaluate', metric_evaluate, hvd.rank())
    if metric_evaluate:
        if use_horovod:
            print('------------metric evaluate step', step, model_path,
                  hvd.rank())
        if not model_path or 'model_path' not in inspect.getargspec(
                metric_eval_fn).args:
            metric_eval_fn_ = metric_eval_fn
        else:
            metric_eval_fn_ = lambda: metric_eval_fn(model_path=model_path)

        try:
            l = metric_eval_fn_()
            if isinstance(l, tuple):
                num_returns = len(l)
                if num_returns == 2:
                    evaluate_results, evaluate_names = l
                    evaluate_summaries = None
                else:
                    assert num_returns == 3, 'returning 1, 2 or 3 values is ok; 4+ is not'
                    evaluate_results, evaluate_names, evaluate_summaries = l
            else:  #return dict
                evaluate_results, evaluate_names = tuple(zip(*l.items()))
                evaluate_summaries = None
        except Exception:
            logging.info('Ignoring exception from metric_eval_fn:\n',
                         traceback.format_exc())

        if not use_horovod or hvd.rank() == 0:
            #logging.info2('{} valid_step:{} {}:{}'.format(epoch_str, step, 'valid_metrics' if model_path is None else 'epoch_valid_metrics', melt.parse_results(evaluate_results, evaluate_names)))
            logging.info2('{} valid_step:{} {}:{}'.format(
                epoch_str, step, 'valid_metrics',
                melt.parse_results(evaluate_results, evaluate_names)))

        if learning_rate is not None and (learning_rate_patience
                                          and learning_rate_patience > 0):
            assert learning_rate_decay_factor > 0 and learning_rate_decay_factor < 1
            valid_loss = evaluate_results[0]
            if not hasattr(train_once, 'min_valid_loss'):
                train_once.min_valid_loss = valid_loss
                train_once.decay_steps = []
                train_once.patience = 0
            else:
                if valid_loss < train_once.min_valid_loss:
                    train_once.min_valid_loss = valid_loss
                    train_once.patience = 0
                else:
                    train_once.patience += 1
                    logging.info2('{} valid_step:{} patience:{}'.format(
                        epoch_str, step, train_once.patience))

            if learning_rate_patience and train_once.patience >= learning_rate_patience:
                lr_op = ops[1]
                lr = sess.run(lr_op) * learning_rate_decay_factor
                train_once.decay_steps.append(step)
                logging.info2(
                    '{} valid_step:{} learning_rate_decay by *{}, learning_rate_decay_steps={}'
                    .format(epoch_str, step, learning_rate_decay_factor,
                            ','.join(map(str, train_once.decay_steps))))
                sess.run(tf.assign(lr_op, tf.constant(lr, dtype=tf.float32)))
                train_once.patience = 0
                train_once.min_valid_loss = valid_loss

    if ops is not None:
        #if deal_results_fn is None and names is not None:
        #  deal_results_fn = lambda x: melt.print_results(x, names)

        feed_dict = {} if gen_feed_dict_fn is None else gen_feed_dict_fn()
        # NOTICE: ops[2] (the loss) should be a scalar, otherwise results are wrong!
        #print('---------------ops', ops)
        if eval_ops is not None or not log_dir or not hasattr(
                train_once,
                'summary_op') or train_once.summary_op is None or use_horovod:
            feed_dict[K.learning_phase()] = 1
            results = sess.run(ops, feed_dict=feed_dict)
        else:
            ## TODO why below ?
            #try:
            feed_dict[K.learning_phase()] = 1
            results = sess.run(ops + [train_once.summary_op],
                               feed_dict=feed_dict)
            summary_str = results[-1]
            results = results[:-1]
            # except Exception:
            #   logging.info('sess.run(ops + [train_once.summary_op], feed_dict=feed_dict) fail')
            #   results = sess.run(ops, feed_dict=feed_dict)

        #print('------------results', results)
        # #--------trace debug
        # if step == 210:
        #   run_metadata = tf.RunMetadata()
        #   results = sess.run(
        #         ops,
        #         feed_dict=feed_dict,
        #         options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
        #         run_metadata=run_metadata)
        #   from tensorflow.python.client import timeline
        #   trace = timeline.Timeline(step_stats=run_metadata.step_stats)

        #   trace_file = open('timeline.ctf.json', 'w')
        #   trace_file.write(trace.generate_chrome_trace_format())

        # results[0] is assumed to be the train_op, results[1] the learning_rate
        learning_rate = results[1]
        results = results[2:]

        #@TODO should support average loss and other avg evaluations like test..
        if print_avg_loss:
            if not hasattr(train_once, 'avg_loss'):
                train_once.avg_loss = AvgScore()
            # assume results[0] is the train_op return value, results[1] the loss
            loss = gezi.get_singles(results)
            train_once.avg_loss.add(loss)

        steps_per_second = None
        instances_per_second = None
        hours_per_epoch = None
        #step += 1
        #if is_start or interval_steps and step % interval_steps == 0:
        interval_ok = not use_horovod or hvd.local_rank() == 0
        if interval_steps and step % interval_steps == 0 and interval_ok:
            train_average_loss = train_once.avg_loss.avg_score()
            if print_time:
                duration = timer.elapsed()
                duration_str = 'duration:{:.2f} '.format(duration)
                melt.set_global('duration', '%.2f' % duration)
                info.write(duration_str)
                elapsed = train_once.timer.elapsed()
                steps_per_second = interval_steps / elapsed
                batch_size = melt.batch_size()
                num_gpus = melt.num_gpus()
                instances_per_second = interval_steps * batch_size / elapsed
                gpu_info = '' if num_gpus <= 1 else ' gpus:[{}]'.format(
                    num_gpus)
                if num_steps_per_epoch is None:
                    epoch_time_info = ''
                else:
                    hours_per_epoch = num_steps_per_epoch / interval_steps * elapsed / 3600
                    epoch_time_info = '1epoch:[{:.2f}h]'.format(
                        hours_per_epoch)
                info.write(
                    'elapsed:[{:.2f}] batch_size:[{}]{} batches/s:[{:.2f}] insts/s:[{:.2f}] {} lr:[{:.6f}]'
                    .format(elapsed, batch_size, gpu_info, steps_per_second,
                            instances_per_second, epoch_time_info,
                            learning_rate))

            if print_avg_loss:
                #info.write('train_avg_metrics:{} '.format(melt.value_name_list_str(train_average_loss, names)))
                names_ = melt.adjust_names(train_average_loss, names)
                #info.write('train_avg_metric:{} '.format(melt.parse_results(train_average_loss, names_)))
                info.write(' train:{} '.format(
                    melt.parse_results(train_average_loss, names_)))
                #info.write('train_avg_loss: {} '.format(train_average_loss))
            info.write(eval_str)
            #print(gezi.now_time(), epoch_str, 'train_step:%d'%step, info.getvalue(), end=' ')
            logging.info2('{} {} {}'.format(epoch_str, 'step:%d' % step,
                                            info.getvalue()))

            if deal_results_fn is not None:
                stop = deal_results_fn(results)

    summary_strs = gezi.to_list(summary_str)
    if metric_evaluate:
        if evaluate_summaries is not None:
            summary_strs += evaluate_summaries

    if step > 1:
        if is_eval_step:
            # deal with summary
            if log_dir:
                summary = tf.Summary()
                if eval_ops is None:
                    if train_once.summary_op is not None:
                        for summary_str in summary_strs:
                            train_once.summary_writer.add_summary(
                                summary_str, step)
                else:
                    for summary_str in summary_strs:
                        train_once.summary_writer.add_summary(
                            summary_str, step)
                    suffix = 'valid' if not eval_names else ''
                    # loss/valid
                    melt.add_summarys(summary,
                                      eval_results,
                                      eval_names_,
                                      suffix=suffix)

                if ops is not None:
                    try:
                        # loss/train_avg
                        melt.add_summarys(summary,
                                          train_average_loss,
                                          names_,
                                          suffix='train_avg')
                    except Exception:
                        pass
                    ##optimizer has done this also
                    melt.add_summary(summary, learning_rate, 'learning_rate')
                    melt.add_summary(summary,
                                     melt.batch_size(),
                                     'batch_size',
                                     prefix='other')
                    melt.add_summary(summary,
                                     melt.epoch(),
                                     'epoch',
                                     prefix='other')
                    if steps_per_second:
                        melt.add_summary(summary,
                                         steps_per_second,
                                         'steps_per_second',
                                         prefix='perf')
                    if instances_per_second:
                        melt.add_summary(summary,
                                         instances_per_second,
                                         'instances_per_second',
                                         prefix='perf')
                    if hours_per_epoch:
                        melt.add_summary(summary,
                                         hours_per_epoch,
                                         'hours_per_epoch',
                                         prefix='perf')

                if metric_evaluate:
                    #melt.add_summarys(summary, evaluate_results, evaluate_names, prefix='eval')
                    prefix = 'step_eval'
                    if model_path:
                        prefix = 'eval'
                        if not hasattr(train_once, 'epoch_step'):
                            train_once.epoch_step = 1
                        else:
                            train_once.epoch_step += 1
                        step = train_once.epoch_step
                    # eval/loss eval/auc ..
                    melt.add_summarys(summary,
                                      evaluate_results,
                                      evaluate_names,
                                      prefix=prefix)

                train_once.summary_writer.add_summary(summary, step)
                train_once.summary_writer.flush()
            return stop
        elif metric_evaluate and log_dir:
            summary = tf.Summary()
            for summary_str in summary_strs:
                train_once.summary_writer.add_summary(summary_str, step)
            #summary.ParseFromString(evaluate_summaries)
            summary_writer = train_once.summary_writer
            prefix = 'step_eval'
            if model_path:
                prefix = 'eval'
                if not hasattr(train_once, 'epoch_step'):
                    ## TODO.. restart will get 1 again..
                    #epoch_step = tf.Variable(0, trainable=False, name='epoch_step')
                    #epoch_step += 1
                    #train_once.epoch_step = sess.run(epoch_step)
                    valid_interval_epochs = 1.
                    try:
                        valid_interval_epochs = FLAGS.valid_interval_epochs
                    except Exception:
                        pass
                    train_once.epoch_step = 1 if melt.epoch() <= 1 else int(
                        int(melt.epoch() * 10) /
                        int(valid_interval_epochs * 10))
                    logging.info('train_once epoch start step is',
                                 train_once.epoch_step)
                else:
                    #epoch_step += 1
                    train_once.epoch_step += 1
                step = train_once.epoch_step
            #melt.add_summarys(summary, evaluate_results, evaluate_names, prefix='eval')
            melt.add_summarys(summary,
                              evaluate_results,
                              evaluate_names,
                              prefix=prefix)
            summary_writer.add_summary(summary, step)
            summary_writer.flush()
Example #39
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.horovod:
        hvd.init()
    if FLAGS.use_fp16:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    validate_flags_or_throw(bert_config)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
    hvd_rank = 0
    hvd_local_rank = 0

    config = tf.ConfigProto()
    learning_rate = FLAGS.learning_rate
    if FLAGS.horovod:

        tf.logging.info("Multi-GPU training with TF Horovod")
        tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(),
                        hvd.rank())
        global_batch_size = FLAGS.train_batch_size * hvd.size(
        ) * FLAGS.num_accumulation_steps
        learning_rate = learning_rate * hvd.size()
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        hvd_local_rank = hvd.local_rank()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps
        if master_process else None,
        keep_checkpoint_max=1)

    if master_process:
        tf.logging.info("***** Configuaration *****")
        for key in FLAGS.__flags.keys():
            tf.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
        tf.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    training_hooks.append(
        LogTrainRunHook(global_batch_size, hvd_rank,
                        FLAGS.save_checkpoints_steps))

    # Prepare Training Data
    if FLAGS.do_train:
        train_examples = read_squad_examples(
            input_file=FLAGS.train_file,
            is_training=True,
            version_2_with_negative=FLAGS.version_2_with_negative)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Pre-shuffle the input to avoid having to make a very large shuffle
        # buffer in the `input_fn`.
        rng = random.Random(12345)
        rng.shuffle(train_examples)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.local_size())
            ]
            num_examples_per_local_rank = len(
                train_examples) // hvd.local_size()
            remainder = len(train_examples) % hvd.local_size()
            if hvd.local_rank() < remainder:
                start_index = hvd.local_rank() * (num_examples_per_local_rank +
                                                  1)
                end_index = start_index + num_examples_per_local_rank + 1
            else:
                start_index = hvd.local_rank(
                ) * num_examples_per_local_rank + remainder
                end_index = start_index + (num_examples_per_local_rank)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                hvd=None if not FLAGS.horovod else hvd,
                                use_fp16=FLAGS.use_fp16)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:

        # We write to a temporary file to avoid storing very large constant tensors
        # in memory.
        train_writer = FeatureWriter(filename=tmp_filenames[hvd_local_rank],
                                     is_training=True)
        convert_examples_to_features(
            examples=train_examples[start_index:end_index],
            tokenizer=tokenizer,
            max_seq_length=FLAGS.max_seq_length,
            doc_stride=FLAGS.doc_stride,
            max_query_length=FLAGS.max_query_length,
            is_training=True,
            output_fn=train_writer.process_feature,
            verbose_logging=FLAGS.verbose_logging)
        train_writer.close()

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num orig examples = %d", end_index - start_index)
        tf.logging.info("  Num split examples = %d", train_writer.num_features)
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        tf.logging.info("  LR = %f", learning_rate)
        del train_examples
        if FLAGS.horovod:
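            # allreduce of a dummy constant acts as a barrier: no rank starts
            # training until every rank has finished writing its tf_record shard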
            barrier = hvd.allreduce(tf.constant(0))
            with tf.Session(config=config) as sess:
                sess.run(barrier)

        train_input_fn = input_fn_builder(
            input_file=tmp_filenames,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn,
                        hooks=training_hooks,
                        max_steps=num_train_steps)
        train_time_elapsed = time.time() - train_start_time
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (
            num_train_steps - training_hooks[-1].skipped
        ) * global_batch_size * 1.0 / train_time_wo_overhead

        if master_process:
            tf.logging.info("-----------------------------")
            tf.logging.info("Total Training Time = %0.2f for Sentences = %d",
                            train_time_elapsed,
                            num_train_steps * global_batch_size)
            tf.logging.info(
                "Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                train_time_wo_overhead,
                (num_train_steps - training_hooks[-1].skipped) *
                global_batch_size)
            tf.logging.info(
                "Throughput Average (sentences/sec) with overhead = %0.2f",
                avg_sentences_per_second)
            tf.logging.info("Throughput Average (sentences/sec) = %0.2f",
                            ss_sentences_per_second)
            tf.logging.info("-----------------------------")

    if FLAGS.export_trtis and master_process:
        export_model(estimator, FLAGS.output_dir, FLAGS.init_checkpoint)

    if FLAGS.do_predict and master_process:
        eval_examples = read_squad_examples(
            input_file=FLAGS.predict_file,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative)

        # Perform evaluation on subset, useful for profiling
        if FLAGS.num_eval_iterations is not None:
            eval_examples = eval_examples[:FLAGS.num_eval_iterations *
                                          FLAGS.predict_batch_size]

        eval_writer = FeatureWriter(filename=os.path.join(
            FLAGS.output_dir, "eval.tf_record"),
                                    is_training=False)
        eval_features = []

        def append_feature(feature):
            eval_features.append(feature)
            eval_writer.process_feature(feature)

        convert_examples_to_features(examples=eval_examples,
                                     tokenizer=tokenizer,
                                     max_seq_length=FLAGS.max_seq_length,
                                     doc_stride=FLAGS.doc_stride,
                                     max_query_length=FLAGS.max_query_length,
                                     is_training=False,
                                     output_fn=append_feature,
                                     verbose_logging=FLAGS.verbose_logging)
        eval_writer.close()

        tf.logging.info("***** Running predictions *****")
        tf.logging.info("  Num orig examples = %d", len(eval_examples))
        tf.logging.info("  Num split examples = %d", len(eval_features))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = input_fn_builder(
            input_file=eval_writer.filename,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        all_results = []
        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        eval_start_time = time.time()
        for result in estimator.predict(predict_input_fn,
                                        yield_single_examples=True,
                                        hooks=eval_hooks):
            if len(all_results) % 1000 == 0:
                tf.logging.info("Processing example: %d" % (len(all_results)))
            unique_id = int(result["unique_ids"])
            start_logits = [float(x) for x in result["start_logits"].flat]
            end_logits = [float(x) for x in result["end_logits"].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        eval_time_elapsed = time.time() - eval_start_time
        eval_time_wo_overhead = eval_hooks[-1].total_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
        num_sentences = (eval_hooks[-1].count -
                         eval_hooks[-1].skipped) * FLAGS.predict_batch_size

        avg = np.mean(time_list)
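        # time_list is sorted ascending, so the max over the first p% of entries
        # is the p-th percentile latency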
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.logging.info("-----------------------------")
        tf.logging.info("Total Inference Time = %0.2f for Sentences = %d",
                        eval_time_elapsed,
                        eval_hooks[-1].count * FLAGS.predict_batch_size)
        tf.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead,
            (eval_hooks[-1].count - eval_hooks[-1].skipped) *
            FLAGS.predict_batch_size)
        tf.logging.info("Summary Inference Statistics")
        tf.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
        tf.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                        cf_50 * 1000)
        tf.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                        cf_90 * 1000)
        tf.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                        cf_95 * 1000)
        tf.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                        cf_99 * 1000)
        tf.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                        cf_100 * 1000)
        tf.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.logging.info("Throughput Average (sentences/sec) = %0.2f",
                        ss_sentences_per_second)
        tf.logging.info("-----------------------------")

        output_prediction_file = os.path.join(FLAGS.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(FLAGS.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                                 "null_odds.json")

        write_predictions(eval_examples, eval_features, all_results,
                          FLAGS.n_best_size, FLAGS.max_answer_length,
                          FLAGS.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file)
Example #40
0
def train(gamma, double_q, n_step_q, exp_fraction, final_eps, kp_type,
          colour_input, patch_sizes, lsp_layers, batch_size, num_iters,
          learning_starts, train_freq, kpt_encoder_type, kpt_cnn_channels,
          agent_size, learning_rate, max_grad_norm, mask_threshold, tau,
          window_size, ckpts_prefix, ckpt_load_dir, vis_load, test_every,
          mp_num_steps, img_size, replay_buffer_size, seed, noise_type, _run):

    model_init_start = time.time()
    process_seed = seed + hvd.local_rank()

    # init Gym environments
    train_env = make_env(mode="train", seed=process_seed)
    if hvd.local_rank() == 0:  # eval only on 1 node (horovod)
        eval_env = make_env(mode="eval", seed=20 * (process_seed + 1))
    n_actions = train_env.action_space.n

    # build models
    vision_model_dict = build_vision_model()
    agent_model_dict = build_agent_model(n_actions=n_actions,
                                         kpt_cnn_channels=kpt_cnn_channels)
    target_agent_model_dict = build_agent_model(
        n_actions=n_actions, kpt_cnn_channels=kpt_cnn_channels)

    # Horovod: adjust learning rate based on number of GPUs.
    optimizer = get_optimizer(learning_rate=learning_rate * hvd.size())

    # setting up ckpts for all the modules
    query_ckpt, attn_ckpt, pos_enc_ckpt, node_enc_ckpt, \
    scene_ckpt, kpt_enc_ckpt = None, None, None, None, None, None

    policy_ckpt = tf.train.Checkpoint(optimizer=optimizer,
                                      model=agent_model_dict["agent_net"])

    kpt_enc_ckpt = tf.train.Checkpoint(optimizer=optimizer,
                                       model=agent_model_dict["kpt_encoder"])
    if kpt_encoder_type == "gnn":
        node_enc_ckpt = tf.train.Checkpoint(optimizer=optimizer,
                                            model=agent_model_dict["node_enc"])
        pos_enc_ckpt = tf.train.Checkpoint(optimizer=optimizer,
                                           model=agent_model_dict["pos_net"])

    # load pre-trained vision module
    vision_model_dict = load_vision_model(vision_model_dict, kp_type,
                                          colour_input, batch_size, lsp_layers,
                                          patch_sizes, ckpt_load_dir, vis_load)

    if hvd.local_rank() == 0:
        print("initializing models and env took %4.5f s" %
              (time.time() - model_init_start))

    def train_step(inputs):
        # Minimize the TD error on a batch sampled from replay buffer.
        with tf.GradientTape() as tape:
            step_loss, extra = q_learning(
                vision_model_dict, agent_model_dict, target_agent_model_dict,
                inputs, batch_size, kp_type, agent_size, mask_threshold,
                patch_sizes, kpt_encoder_type, mp_num_steps, img_size,
                lsp_layers, window_size, gamma, double_q, n_step_q)
        w_update_start = time.time()
        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        # collecting trainable params of all modules
        params = []
        for agent_model in agent_model_dict.values():
            params = params + list(agent_model.trainable_variables)

        # compute grads
        grads = tape.gradient(step_loss, params)
        # apply grad clipping
        grads, global_norm = tf.clip_by_global_norm(grads,
                                                    clip_norm=max_grad_norm)
        # update agent
        optimizer.apply_gradients(zip(grads, params))
        # print("grad comp + weight updates take %4.5f" % (time.time() - w_update_start))
        return step_loss, extra

    # load weights using var assignment
    source_vars, target_vars = update_target_networks(agent_model_dict,
                                                      target_agent_model_dict,
                                                      tau)

    # init replay buffer
    data_spec = (specs.TensorSpec([84, 84, 3], tf.int32, 'obs_tm1'),
                 specs.TensorSpec([1], tf.int32, 'a_tm1'),
                 specs.TensorSpec([1], tf.float32, 'r_tm1'),
                 specs.TensorSpec([2], tf.float32, 'begin_end'))
    # each process has its own smaller replay_buffer
    replay_buffer = EpisodicReplayBuffer(
        capacity=int(replay_buffer_size),
        buffer_size=8,
        dataset_drop_remainder=False,
        data_spec=data_spec,
        begin_episode_fn=lambda x: bool(x[3][0, 0]),
        end_episode_fn=lambda x: bool(x[3][0, 1]))

    # create tf.Dataset object from replay_buffer and sample
    rb_ds = replay_buffer.as_dataset(sample_batch_size=batch_size,
                                     num_steps=window_size + n_step_q + 1)

    # dataset iterator sampling trajectories from replay_buffer
    episode_ids = replay_buffer.create_episode_ids(1)
    rb_ds = rb_ds.prefetch(buffer_size=AUTOTUNE)
    rb_iterator = iter(rb_ds)

    episode_rewards = [0.0]
    obs = train_env.reset()
    reset = False

    # lists for logging exp results
    eps = 0.1
    episode_timestep = 0
    exploration = exploration_policy(num_iters, exp_fraction, final_eps)
    avg_td_error = 0.0
    # init lstm_agent state
    c_tm1 = tf.Variable(tf.zeros((1, agent_size)), trainable=False)
    h_tm1 = tf.Variable(tf.zeros((1, agent_size)), trainable=False)
    best_eval_score = -float("inf")

    # TRAINING LOOP
    for t in range(int(num_iters)):
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        if t == 0:
            hvd.broadcast_variables(source_vars, root_rank=0)
            hvd.broadcast_variables(target_vars, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        online_step_start = time.time()

        # convert obs to float and scale to 0-1
        obs_float = np.asarray(obs[None, :, :, :], dtype=np.float32) / 255.0
        # sometimes add distractors
        if noise_type != "none":
            obs_float = add_noise(obs_float[0, :, :, :], noise_type)
            obs_float = obs_float[None, :, :, :]
        # exploration
        update_eps = tf.constant(exploration.value(t))

        # compute forward pass of input obs over vision + attention modules
        bottom_up_masks, encoder_features, kpt_centers = vision_forward_pass(
            obs_float, vision_model_dict, lsp_layers, kp_type, patch_sizes,
            img_size)

        # compute keypoint encodings

        bottom_up_features = encode_keypoints(
            bottom_up_masks,
            encoder_features,
            kpt_centers,
            mask_threshold,
            kp_type,
            kpt_encoder_type,
            mp_num_steps,
            q_learn=False,
            pos_net=agent_model_dict.get("pos_net"),
            node_encoder=agent_model_dict.get("node_enc"),
            kpt_encoder=agent_model_dict.get(
                "kpt_encoder"))  # passes None if not available

        # agent step
        action, h_t, c_t = agent_model_dict["agent_net"].step(
            bottom_up_features, [h_tm1, c_tm1],
            update_eps,
            training=True,
            stochastic=True)
        # env step
        new_obs, rew, done, _ = train_env.step(action)

        episode_timestep = episode_timestep + 1
        episode_rewards[-1] += rew

        # store transitions in replay buffer
        store_exp_start = time.time()
        # making data_tuple compatible for add_batch() method
        obs = img_as_ubyte(np.array(obs_float[0, :, :, :], dtype=float))
        action = np.array(action, dtype=np.int32)
        rew = np.array(rew, ndmin=1, dtype=np.float32)
        end = np.array(done, ndmin=1, dtype=np.float32)
        begin = np.array(reset, ndmin=1, dtype=np.float32)
        begin_end = np.concatenate((begin, end), axis=0)
        # converting from
        values = (obs, action, rew, begin_end)
        values_batched = tf.nest.map_structure(lambda b: tf.stack([b]), values)
        # add batch of transitions of episode_ids to replay_buffer
        episode_ids = replay_buffer.add_batch(values_batched, episode_ids)

        obs = new_obs
        h_tm1 = h_t
        c_tm1 = c_t
        reset = False
        # episode termination
        if done:
            # saving cumulative returns at the end of the episode
            print("Episode Return: %3.3f" % (episode_rewards[-1]))
            print(episode_ids.numpy(), update_eps.numpy())
            obs = train_env.reset()
            episode_timestep = 0
            # reset lstm state at episode end
            c_tm1 = tf.Variable(tf.zeros((1, agent_size)), trainable=False)
            h_tm1 = tf.Variable(tf.zeros((1, agent_size)), trainable=False)
            episode_rewards.append(0.0)
            reset = True

        # Q_LEARNING UPDATES BEGIN
        if t > learning_starts and t % train_freq == 0:
            batch_q_start = time.time()
            # sample a batch of trajectories from replay_buffer for recurrent-dqn
            inputs = next(rb_iterator)
            step_loss, extra = train_step(inputs)
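            # average the scalar loss across workers so the logged value
            # reflects all ranks, not just rank 0's shard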
            step_loss = hvd.allreduce(step_loss)

            # soft-update target networks
            update_start = time.time()
            source_vars, target_vars = update_target_networks(
                agent_model_dict, target_agent_model_dict, tau)
            # print("Target network updates take %4.5f" % (time.time() - update_start))
            td_error = tf.reduce_mean(hvd.allreduce(extra.td_error), axis=0)

            if hvd.local_rank() == 0:
                print(
                    "Iteration: %5d Step loss: %4.4f, TD_error: %3.4f took %4.5f s"
                    % (t, step_loss, td_error, time.time() - batch_q_start))

                # logging step losses to sacred
                add_sacred_log("train.t",
                               int((t - learning_starts) / train_freq), _run)
                add_sacred_log("train.step_loss", float(step_loss), _run)
                add_sacred_log("train.step_td_error", float(td_error), _run)

            avg_td_error = avg_td_error + np.abs(td_error)
        # VALIDATION/CKPT
        if t > learning_starts and t % test_every == 0:
            # trigger evaluation run on only 1 node
            if hvd.local_rank() == 0:
                eval_start = time.time()
                mean_ep_rew, var_ep_rew, _, _ = eval_step(
                    eval_env, vision_model_dict, agent_model_dict)
                avg_td_error = avg_td_error / float(
                    (t - learning_starts) / train_freq)

                print(
                    "Evaluation after: %5d steps avg_ep_return: %4.5f running_avg_td_error: %4.5f took %4.5f s"
                    % (t, mean_ep_rew, avg_td_error, time.time() - eval_start))

                # logging avg. episodic rewards to sacred
                add_sacred_log("test.t", int(
                    (t - learning_starts) / train_freq), _run)
                add_sacred_log("test.mean_ep_return", float(mean_ep_rew), _run)
                add_sacred_log("test.var_ep_return", float(var_ep_rew), _run)
                add_sacred_log("test.avg_td_error", float(avg_td_error), _run)

                avg_td_error = 0.0

                # ckpt model based on eval-run scores
                if mean_ep_rew > 0.95 * best_eval_score:
                    best_eval_score = mean_ep_rew
                    # Horovod: save checkpoints only on worker 0 to prevent other workers from
                    # corrupting it.
                    policy_ckpt.save(ckpts_prefix + '_agent_net')
                    kpt_enc_ckpt.save(ckpts_prefix + '_kpt_encoder')
                    if kpt_encoder_type == "gnn":
                        node_enc_ckpt.save(ckpts_prefix + '_node_enc')
                        pos_enc_ckpt.save(ckpts_prefix + '_pos_net')

    if hvd.local_rank() == 0:
        print("Training complete!!!")
Example #41
0
def BatchNorm(inputs,
              axis=None,
              training=None,
              momentum=0.9,
              epsilon=1e-5,
              center=True,
              scale=True,
              beta_initializer=tf.zeros_initializer(),
              gamma_initializer=tf.ones_initializer(),
              virtual_batch_size=None,
              data_format='channels_last',
              internal_update=False,
              sync_statistics=None):
    """
    Almost equivalent to `tf.layers.batch_normalization`, but different (and more powerful)
    in the following:

    1. Accepts an alternative `data_format` option when `axis` is None. For 2D input, this argument will be ignored.
    2. Default value for `momentum` and `epsilon` is different.
    3. Default value for `training` is automatically obtained from tensorpack's `TowerContext`, but can be overwritten.
    4. Support the `internal_update` option, which enables the use of BatchNorm layer inside conditionals.
    5. Support the `sync_statistics` option, which is very useful in small-batch models.

    Args:
        internal_update (bool): if False, add EMA update ops to
          `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies.
          They are very similar in speed, but `internal_update=True` can be used
          when you have conditionals in your model, or when you have multiple networks to train.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
        sync_statistics (str or None): one of None, "nccl", or "horovod".

          By default (None), it uses statistics of the input tensor to normalize.
          This is the standard way BatchNorm was done in most frameworks.

          When set to "nccl", this layer must be used under tensorpack's multi-GPU trainers.
          It uses the aggregated statistics of the whole batch (across all GPUs) to normalize.

          When set to "horovod", this layer must be used under tensorpack's :class:`HorovodTrainer`.
          It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize.
          Note that on a single machine this is significantly slower than the "nccl" implementation.

          This implementation averages the per-GPU E[x] and E[x^2] among GPUs to compute
          global mean & variance. Therefore each GPU needs to have the same batch size.

          This option has no effect when not training.

          This option is also known as "Cross-GPU BatchNorm" as mentioned in:
          `MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222.

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        Combinations of ``training`` and ``ctx.is_training``:

        * ``training == ctx.is_training``: standard BN, EMA are maintained during training
          and used during inference. This is the default.
        * ``training and not ctx.is_training``: still use batch statistics in inference.
        * ``not training and ctx.is_training``: use EMA to normalize in
          training. This is useful when you load a pre-trained BN and
          don't want to fine tune the EMA. EMA will not be updated in
          this case.
    """
    # parse shapes
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4], ndims
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        else:
            axis = 1 if data_format == 'NCHW' else 3
    else:
        data_format = 'NCHW' if axis == 1 else 'NHWC'
    num_chan = shape[axis]

    # parse training/ctx
    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_tuple()
    if not training and ctx.is_training:
        assert TF_version >= (1, 4), \
            "Fine tuning a BatchNorm model with fixed statistics is only " \
            "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
        if ctx.is_main_training_tower:  # only warn in first tower
            logger.warn(
                "[BatchNorm] Using moving_mean/moving_variance in training.")
        # Using moving_mean/moving_variance in training, which means we
        # loaded a pre-trained BN and only fine-tuning the affine part.

    if sync_statistics is None or not (training and ctx.is_training):
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable({
                'moving_mean': 'mean/EMA',
                'moving_variance': 'variance/EMA'
        }):
            tf_args = dict(axis=axis,
                           momentum=momentum,
                           epsilon=epsilon,
                           center=center,
                           scale=scale,
                           beta_initializer=beta_initializer,
                           gamma_initializer=gamma_initializer,
                           fused=(ndims == 4 and axis in [1, 3]),
                           _reuse=tf.get_variable_scope().reuse)
            if TF_version >= (1, 5):
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            layer = tf.layers.BatchNormalization(**tf_args)
            xn = layer.apply(inputs,
                             training=training,
                             scope=tf.get_variable_scope())

        # maintaining EMA on only one GPU is OK, even in replicated mode,
        # because EMA isn't used during training
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                add_model_variable(v)
        if not ctx.is_main_training_tower or internal_update:
            restore_collection(coll_bk)

        if training and internal_update:
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # for backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # for backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta
    else:
        red_axis = [0] if ndims == 2 else (
            [0, 2, 3] if axis == 1 else [0, 1, 2])

        new_shape = None  # don't need to reshape unless ...
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            if six.PY3 and TF_version <= (1, 9) and ctx.is_main_training_tower:
                logger.warn(
                    "A TensorFlow bug will cause cross-GPU BatchNorm to fail. "
                    "Apply this patch: https://github.com/tensorflow/tensorflow/pull/20360"
                )

            from tensorflow.contrib.nccl.ops import gen_nccl_ops
            shared_name = re.sub('tower[0-9]+/', '',
                                 tf.get_variable_scope().name)
            num_dev = ctx.total
            if num_dev == 1:
                logger.warn(
                    "BatchNorm(sync_statistics='nccl') is used with only one tower!"
                )
            else:
                batch_mean = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
                batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean_square,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean_square') * (1.0 /
                                                                      num_dev)
        elif sync_statistics == 'horovod':
            # Require https://github.com/uber/horovod/pull/331
            import horovod.tensorflow as hvd
            batch_mean = hvd.allreduce(batch_mean, average=True)
            batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            # Using fused_batch_norm(is_training=False) is actually slightly faster,
            # but hopefully this call will be JITed in the future.
            xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var,
                                           tf.reshape(beta, new_shape),
                                           tf.reshape(gamma, new_shape),
                                           epsilon)
        else:
            xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta,
                                           gamma, epsilon)

        if ctx.is_main_training_tower:
            ret = update_bn_ema(xn, batch_mean_vec, batch_var_vec, moving_mean,
                                moving_var, momentum, internal_update)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,  # for backward-compatibility
            moving_variance=moving_var,
            variance=moving_var)  # for backward-compatibility
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret
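The "horovod" branch above reduces first and second moments rather than mean and variance directly. A standalone sketch of that identity (the function name is illustrative; it assumes equal per-rank batch sizes, as the docstring requires):

import tensorflow as tf
import horovod.tensorflow as hvd

def cross_rank_moments(x, red_axis):
    # E[x] and E[x^2] are linear in the data, so averaging them across ranks
    # and forming var = E[x^2] - E[x]^2 yields the global batch statistics.
    mean = hvd.allreduce(tf.reduce_mean(x, axis=red_axis), average=True)
    mean_sq = hvd.allreduce(tf.reduce_mean(tf.square(x), axis=red_axis),
                            average=True)
    return mean, mean_sq - tf.square(mean)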
Example #42
0
    def set_model(self, model):
        self.model = model
        self.graph = tf.Graph()
        with self.graph.as_default():
            #Horovod added: Normal workflow
            config1 = tf.ConfigProto(log_device_placement=False)
            config1.gpu_options.allow_growth = True
            config1.gpu_options.visible_device_list = str(hvd.local_rank())
            self.sess = tf.Session(config=config1)
            #Horovod end
            with self.sess.as_default():
                initializer = tf.contrib.layers.xavier_initializer(
                    uniform=True)
                with tf.variable_scope("model",
                                       reuse=None,
                                       initializer=initializer):
                    self.trainModel = self.model(config=self)
                    #Horovod added: Vary the learning rate, dist optimizer
                    if self.optimizer is not None:
                        pass
                    elif self.opt_method == "Adagrad" or self.opt_method == "adagrad":
                        self.optimizer = tf.train.AdagradOptimizer(
                            learning_rate=self.alpha * hvd.size(),
                            initial_accumulator_value=1e-20)
                    elif self.opt_method == "Adadelta" or self.opt_method == "adadelta":
                        self.optimizer = tf.train.AdadeltaOptimizer(
                            self.alpha * hvd.size())
                    elif self.opt_method == "Adam" or self.opt_method == "adam":
                        self.optimizer = tf.train.AdamOptimizer(self.alpha *
                                                                hvd.size())
                    else:
                        self.optimizer = tf.train.GradientDescentOptimizer(
                            self.alpha * hvd.size() * self.sync_after)

                    ################################################################
                    # Fetch a list of our network's trainable parameters.
                    self.trainable_vars = tf.trainable_variables()
                    #print("Shape of trainable vars: {}".format(np.array(self.trainable_vars)))

                    # Create variables to store accumulated gradients
                    self.accumulators = [
                        tf.Variable(tf.zeros_like(tv.initialized_value()),
                                    trainable=False)
                        for tv in self.trainable_vars
                    ]
                    #print("Shape of accumulators: {}".format(np.array(self.accumulators)))

                    # Count how many micro-batch gradients have been
                    # accumulated since the last apply step.
                    self.accumulation_counter = tf.Variable(0.0,
                                                            trainable=False)

                    # Compute gradients; grad_pairs holds (gradient, variable)
                    # pairs for every trainable variable.
                    self.grad_pairs = self.optimizer.compute_gradients(
                        self.trainModel.loss, self.trainable_vars)

                    # Create ops that add each variable's gradient to its
                    # accumulator, skipping parameters with no gradient.
                    self.accumulate_ops = [
                        accumulator.assign_add(grad)
                        for accumulator, (grad, var) in zip(
                            self.accumulators, self.grad_pairs)
                        if grad is not None
                    ]

                    # The final accumulation operation is to increment the counter
                    self.accumulate_ops.append(
                        self.accumulation_counter.assign_add(1.0))

                    # The accumulated gradients are averaged (divided by the
                    # counter and, across workers, allreduced) and applied in
                    # train_step(); note that apply_gradients takes a list of
                    # (grad, var) pairs.

                    # Accumulators must be zeroed once the accumulated
                    # gradient has been applied.
                    self.zero_ops = [
                        accumulator.assign(tf.zeros_like(tv))
                        for accumulator, tv in zip(self.accumulators,
                                                   self.trainable_vars)
                    ]

                    # Add one last op for zeroing the counter
                    self.zero_ops.append(self.accumulation_counter.assign(0.0))
                    ################################################################

                    # The standard Horovod workflow would instead wrap the
                    # optimizer:
                    # self.dist_optimizer = hvd.DistributedOptimizer(self.optimizer)
                    # self.train_op = self.dist_optimizer.minimize(self.trainModel.loss)
                    # Horovod end
                # A dummy allreduce that acts as a synchronization barrier
                # between workers.
                self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
                # Only rank 0 creates a Saver, so checkpoints are written once.
                if hvd.rank() == 0:
                    self.saver = tf.train.Saver()
                self.sess.run(tf.global_variables_initializer())

                # Horovod added: broadcast rank 0's initial variable values so
                # all workers start from identical weights.
                self.sess.run(hvd.broadcast_global_variables(0))
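The example above hand-rolls gradient accumulation, but its commented-out `DistributedOptimizer` lines point at the much shorter standard Horovod TF1 workflow. A minimal, self-contained sketch of that canonical pattern follows; `build_loss` is a hypothetical placeholder for model construction, not a function from the original code.

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Pin each process to one GPU, selected by its local rank.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

loss = build_loss()  # hypothetical: returns a scalar loss tensor

# Scale the learning rate by the number of workers, then let
# DistributedOptimizer average gradients across workers each step.
opt = tf.train.AdamOptimizer(0.001 * hvd.size())
opt = hvd.DistributedOptimizer(opt)
train_op = opt.minimize(loss)

with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    # Broadcast rank 0's initial variables to all other workers.
    sess.run(hvd.broadcast_global_variables(0))
    for _ in range(1000):
        sess.run(train_op)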
Example #43
def allreduce(backend, value, name, average):
    return _eval(backend,
                 hvd.allreduce(tf.constant(value, name=name), average=average))
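The `_eval` helper used above is not defined in the snippet. A minimal sketch of a plausible implementation, assuming `backend` is a Keras-backend-like module exposing `get_session()` in TF1-style Keras (this is a guess at the helper, not its actual source):

import tensorflow as tf

def _eval(backend, tensor):
    # Run the op through the backend's session when one is available.
    if hasattr(backend, 'get_session'):
        return backend.get_session().run(tensor)
    # Otherwise fall back to a plain TF1 session.
    with tf.Session() as sess:
        return sess.run(tensor)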
Example #44
    def train_step(self, batch_h, batch_t, batch_r, batch_y, counter):
        # Zero the gradient accumulators and the micro-batch counter.
        self.sess.run(self.zero_ops)
        allreduce_loss = 0.0
        def micro_batch(arr, i):
            # The i-th micro-batch concatenates a batch_size slice from the
            # first half of the buffer with the matching slice from the
            # second half, which starts at offset allreduce_batch_size.
            lo, hi = i * self.batch_size, (i + 1) * self.batch_size
            return np.append(arr[lo:hi],
                             arr[self.allreduce_batch_size + lo:
                                 self.allreduce_batch_size + hi])

        for i in range(self.sync_after):
            feed_dict = {
                self.trainModel.batch_h: micro_batch(batch_h, i),
                self.trainModel.batch_t: micro_batch(batch_t, i),
                self.trainModel.batch_r: micro_batch(batch_r, i),
                self.trainModel.batch_y: micro_batch(batch_y, i),
            }
            # Accumulate this micro-batch's gradients and track its loss.
            _, c = self.sess.run([self.accumulate_ops, self.trainModel.loss],
                                 feed_dict=feed_dict)
            allreduce_loss += c
        self.track_loss.append((counter, allreduce_loss))
        # Synchronize all workers before timing the gradient exchange.
        self.sess.run(self.barrier)
        st1 = time.time()

        # Average the accumulated gradients by the micro-batch counter and,
        # when more than one worker is running, allreduce the averages
        # across workers.
        averaged_gradients = []
        with tf.name_scope("Allreduce"):
            for (accumulator, (grad, var)) in zip(self.accumulators,
                                                  self.grad_pairs):
                if accumulator is not None:
                    avg_grad = accumulator / self.accumulation_counter
                    if hvd.size() > 1:
                        avg_grad = hvd.allreduce(avg_grad)
                    averaged_gradients.append((avg_grad, var))
                else:
                    averaged_gradients.append((None, var))

        if counter % 200 == 0:
            print('Averaging gradients for this step took: {} secs'.format(
                time.time() - st1))

        st2 = time.time()
        # Note: apply_gradients builds new graph ops on every call; see the
        # sketch after this example for a version that builds the op once.
        self.sess.run(self.optimizer.apply_gradients(averaged_gradients))

        if counter % 200 == 0:
            print('Applying gradients for this step took: {} secs'.format(
                time.time() - st2))

        return allreduce_loss
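As noted in the comments, both the allreduce and the apply_gradients ops above are constructed inside train_step, so the TensorFlow graph grows on every call. A minimal sketch of building the apply op once at setup time instead, reusing the attribute names from Examples #42 and #44 (the method name build_apply_op is illustrative, not from the original code):

import tensorflow as tf
import horovod.tensorflow as hvd

def build_apply_op(self):
    # Build the averaged-gradient apply op once, so train_step only has
    # to sess.run it instead of adding new nodes on every call.
    averaged = []
    with tf.name_scope("Allreduce"):
        for accumulator, (grad, var) in zip(self.accumulators,
                                            self.grad_pairs):
            avg = accumulator / self.accumulation_counter
            if hvd.size() > 1:
                avg = hvd.allreduce(avg)
            averaged.append((avg, var))
    self.apply_op = self.optimizer.apply_gradients(averaged)

# In train_step, applying the gradients then reduces to:
#     self.sess.run(self.apply_op)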