Example #1
    def after_run(self, run_context, run_values):
        self.global_step = run_values.results[0] + 1

        if hvd.size() > 1 and len(run_values.results) == 2:
            if run_values.results[1] > 0:
                run_context.request_stop()
        elif self.signal_recieved:
            run_context.request_stop()
Example #2
    def before_run(self, run_context):
        fetches = [tf.train.get_global_step()]
        feed_dict = None

        if hvd.size() > 1 and (self.global_step % self.sync_freq) == 0:
            fetches += [self.allreduce_op]
            feed_dict = {self.input_op: int(self.signal_recieved)}
            
        return tf.train.SessionRunArgs(fetches, feed_dict=feed_dict)
Example #3
    def _get_session_config(mode,
                            use_xla,
                            use_dali,
                            use_cpu,
                            gpu_memory_fraction,
                            gpu_id=0):

        if mode not in ["train", 'validation', 'benchmark', 'inference']:
            raise ValueError(
                "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')"
                % mode)

        config = tf.ConfigProto()
        if not use_cpu:
            # Limit available GPU memory (tune the size)
            if use_dali:
                gpu_options = tf.GPUOptions(
                    per_process_gpu_memory_fraction=gpu_memory_fraction)
                config = tf.ConfigProto(gpu_options=gpu_options)
                config.gpu_options.allow_growth = False
            else:
                config.gpu_options.allow_growth = True

            config.allow_soft_placement = True
            config.log_device_placement = False

            if hvd.size() > 1:
                config.gpu_options.visible_device_list = str(hvd.local_rank())
            else:
                config.gpu_options.visible_device_list = str(gpu_id)

            config.gpu_options.force_gpu_compatible = True  # Force pinned memory

        if use_xla:
            config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

        if mode == 'train':
            if not use_cpu:
                config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
                config.inter_op_parallelism_threads = max(
                    2, (multiprocessing.cpu_count() // max(hvd.size(), 8) - 2))

        return config
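A minimal usage sketch, not part of the original listing: the returned tf.ConfigProto is typically attached to the Estimator via tf.estimator.RunConfig. This assumes the method is called from another method of the same runner class; `model_fn` and `model_dir` are placeholders, not names from the snippets.

    # Sketch only: wire the session config into a TF 1.x Estimator.
    # `model_fn` and `model_dir` are placeholders, not names from the snippets.
    session_config = self._get_session_config(
        mode='train',
        use_xla=True,
        use_dali=False,
        use_cpu=False,
        gpu_memory_fraction=0.7,
        gpu_id=0)

    run_config = tf.estimator.RunConfig(
        model_dir=model_dir,
        session_config=session_config)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=model_dir, config=run_config)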
Example #4
    def __init__(self,
                 filenames,
                 idx_filenames,
                 height,
                 width,
                 batch_size,
                 num_threads,
                 dtype=tf.uint8,
                 dali_cpu=True,
                 deterministic=False,
                 training=False):
        device_id = hvd.local_rank()
        shard_id = hvd.rank()
        num_gpus = hvd.size()
        pipe = HybridPipe(tfrec_filenames=filenames,
                          tfrec_idx_filenames=idx_filenames,
                          height=height,
                          width=width,
                          batch_size=batch_size,
                          num_threads=num_threads,
                          device_id=device_id,
                          shard_id=shard_id,
                          num_gpus=num_gpus,
                          deterministic=deterministic,
                          dali_cpu=dali_cpu,
                          training=training)

        daliop = dali_tf.DALIIterator()

        with tf.device("/gpu:0"):
            self.images, self.labels = daliop(pipeline=pipe,
                                              shapes=[(batch_size, height,
                                                       width, 3),
                                                      (batch_size, 1)],
                                              dtypes=[tf.float32, tf.int64],
                                              device_id=device_id)
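A hedged usage sketch: the wrapper's class name is not shown in the snippet, so `DALIPreprocessor` is used here purely for illustration. Each Horovod worker builds its own pipeline, and an Estimator input_fn can simply return the pre-built tensors.

    def dali_input_fn():
        # Hypothetical class name for the __init__ shown above.
        preprocessor = DALIPreprocessor(
            filenames=filenames,
            idx_filenames=idx_filenames,
            height=224,
            width=224,
            batch_size=64,
            num_threads=4,
            dali_cpu=False,
            deterministic=False,
            training=True)
        # Sharding is handled inside the pipeline via hvd.rank()/hvd.size(),
        # so every worker sees a distinct slice of the TFRecord shards.
        return preprocessor.images, preprocessor.labels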
Example #5
    def begin(self):
        if hvd.size() > 1:
            with tf.device("/cpu:0"):
                self.input_op = tf.placeholder(tf.int32, shape=())
                self.allreduce_op = hvd.hvd_global_object.allreduce(
                    self.input_op, op=hvd.hvd_global_object.Sum, name="signal_handler_all_reduce")
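Examples #1, #2, and #5 read like methods of a single tf.train.SessionRunHook (plausibly the TrainingPartitionHook appended in Example #6): begin() builds a CPU-side allreduce once, before_run() periodically feeds the local stop flag into it, and after_run() requests a stop as soon as any worker has raised the flag. A minimal skeleton of such a hook, with attribute names taken from the snippets and the signal wiring as an assumption:

    import signal

    class SignalStopHook(tf.train.SessionRunHook):
        """Sketch: propagate a termination signal to every Horovod worker."""

        def __init__(self, sync_freq=10):
            self.sync_freq = sync_freq          # how often to allreduce the flag
            self.global_step = 0
            self.signal_recieved = False        # spelling kept from the snippets
            signal.signal(signal.SIGTERM, self._on_signal)

        def _on_signal(self, signum, frame):
            self.signal_recieved = True

        # begin(), before_run() and after_run() would reuse the bodies shown
        # in Examples #5, #2 and #1 above.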
Example #6
    def train(self,
              iter_unit,
              num_iter,
              run_iter,
              batch_size,
              warmup_steps=50,
              weight_decay=1e-4,
              lr_init=0.1,
              lr_warmup_epochs=5,
              momentum=0.9,
              log_every_n_steps=1,
              loss_scale=256,
              label_smoothing=0.0,
              mixup=0.0,
              use_cosine_lr=False,
              use_static_loss_scaling=False,
              is_benchmark=False,
              quantize=False,
              symmetric=False,
              quant_delay=0,
              finetune_checkpoint=None,
              use_final_conv=False,
              use_qdq=False):

        if iter_unit not in ["epoch", "batch"]:
            raise ValueError(
                '`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])'
                % iter_unit)

        if self.run_hparams.data_dir is None and not is_benchmark:
            raise ValueError('`data_dir` must be specified for training!')

        if self.run_hparams.use_tf_amp or self.run_hparams.dtype == tf.float16:
            if use_static_loss_scaling:
                os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
            else:
                os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
        else:
            use_static_loss_scaling = False  # Make sure it hasn't been set to True on FP32 training

        num_gpus = hvd.size()
        global_batch_size = batch_size * num_gpus

        if self.run_hparams.data_dir is not None:
            filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
                data_dir=self.run_hparams.data_dir,
                mode="train",
                iter_unit=iter_unit,
                num_iter=num_iter,
                global_batch_size=global_batch_size,
            )

            steps_per_epoch = num_steps / num_epochs

        else:
            num_epochs = 1
            num_steps = num_iter
            steps_per_epoch = num_steps
            num_decay_steps = num_steps
            num_samples = num_steps * batch_size

        if run_iter == -1:
            run_iter = num_steps
        else:
            run_iter = steps_per_epoch * run_iter if iter_unit == "epoch" else run_iter

        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            idx_filenames = runner_utils.parse_dali_idx_dataset(
                data_idx_dir=self.run_hparams.data_idx_dir, mode="train")

        training_hooks = []

        if hvd.rank() == 0:
            print('Starting Model Training...')
            print("Training Epochs", num_epochs)
            print("Total Steps", num_steps)
            print("Steps per Epoch", steps_per_epoch)
            print("Decay Steps", num_decay_steps)
            print("Weight Decay Factor", weight_decay)
            print("Init Learning Rate", lr_init)
            print("Momentum", momentum)
            print("Num GPUs", num_gpus)
            print("Per-GPU Batch Size", batch_size)

            if is_benchmark:
                self.training_logging_hook = hooks.BenchmarkLoggingHook(
                    global_batch_size=global_batch_size,
                    warmup_steps=warmup_steps,
                    logging_steps=log_every_n_steps)
            else:
                self.training_logging_hook = hooks.TrainingLoggingHook(
                    global_batch_size=global_batch_size,
                    num_steps=num_steps,
                    num_samples=num_samples,
                    num_epochs=num_epochs,
                    steps_per_epoch=steps_per_epoch,
                    logging_steps=log_every_n_steps)
            training_hooks.append(self.training_logging_hook)

        if hvd.size() > 1:
            bcast_hook = hvd.hvd_global_object.BroadcastGlobalVariablesHook(0)
            training_hooks.append(bcast_hook)

        training_hooks.append(hooks.PrefillStagingAreasHook())
        training_hooks.append(hooks.TrainingPartitionHook())

        estimator_params = {
            'batch_size': batch_size,
            'steps_per_epoch': steps_per_epoch,
            'num_gpus': num_gpus,
            'momentum': momentum,
            'lr_init': lr_init,
            'lr_warmup_epochs': lr_warmup_epochs,
            'weight_decay': weight_decay,
            'loss_scale': loss_scale,
            'apply_loss_scaling': use_static_loss_scaling,
            'label_smoothing': label_smoothing,
            'mixup': mixup,
            'num_decay_steps': num_decay_steps,
            'use_cosine_lr': use_cosine_lr,
            'use_final_conv': use_final_conv,
            'quantize': quantize,
            'use_qdq': use_qdq,
            'symmetric': symmetric,
            'quant_delay': quant_delay
        }

        if finetune_checkpoint:
            estimator_params['finetune_checkpoint'] = finetune_checkpoint

        image_classifier = self._get_estimator(
            mode='train',
            run_params=estimator_params,
            use_xla=self.run_hparams.use_xla,
            use_dali=self.run_hparams.use_dali,
            gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
            gpu_id=self.run_hparams.gpu_id)

        def training_data_fn():

            if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
                if hvd.rank() == 0:
                    print("Using DALI input... ")

                return data_utils.get_dali_input_fn(
                    filenames=filenames,
                    idx_filenames=idx_filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=True,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=False
                    if self.run_hparams.seed is None else True)

            elif self.run_hparams.data_dir is not None:

                return data_utils.get_tfrecords_input_fn(
                    filenames=filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=True,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=False
                    if self.run_hparams.seed is None else True)

            else:
                if hvd.rank() == 0:
                    print("Using Synthetic Data ...")
                return data_utils.get_synth_input_fn(
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    num_channels=self.run_hparams.n_channels,
                    data_format=self.run_hparams.input_format,
                    num_classes=self.run_hparams.n_classes,
                    dtype=self.run_hparams.dtype,
                )

        try:
            current_step = image_classifier.get_variable_value("global_step")
        except ValueError:
            current_step = 0

        run_iter = max(0, min(run_iter, num_steps - current_step))
        print("Current step:", current_step)

        if run_iter > 0:
            try:
                image_classifier.train(
                    input_fn=training_data_fn,
                    steps=run_iter,
                    hooks=training_hooks,
                )
            except KeyboardInterrupt:
                print("Keyboard interrupt")

        if hvd.rank() == 0:
            if run_iter > 0:
                print('Ending Model Training ...')
                train_throughput = self.training_logging_hook.mean_throughput.value()
                dllogger.log(data={'train_throughput': train_throughput},
                             step=tuple())
            else:
                print('Model has already been trained for the required number of steps. Skipping.')
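Example #9 below shows the tail of the call site for this train() method; a condensed sketch of such an invocation (runner construction and most arguments omitted, flag names taken from Example #9) could look like:

    # Condensed sketch; only flags that also appear in Example #9 are used.
    runner.train(iter_unit=FLAGS.iter_unit,
                 num_iter=FLAGS.num_iter,
                 run_iter=-1,                 # -1 -> train for the full num_steps
                 batch_size=FLAGS.batch_size,
                 warmup_steps=FLAGS.warmup_steps,
                 log_every_n_steps=FLAGS.display_every,
                 mixup=FLAGS.mixup,
                 use_cosine_lr=FLAGS.cosine_lr,
                 is_benchmark=FLAGS.mode == 'training_benchmark')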
Example #7
def _get_global_batch_size(worker_batch_size):
    return worker_batch_size * hvd.size()
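For example, with a per-worker batch size of 256 on 8 Horovod processes, the effective batch size per optimizer step is 2048:

    # 8 workers x 256 samples each -> 2048 samples per step
    global_batch_size = _get_global_batch_size(256)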
Example #8
    def __call__(self, features, labels, mode, params):

        if mode == tf.estimator.ModeKeys.TRAIN:
            mandatory_params = [
                "batch_size", "lr_init", "num_gpus", "steps_per_epoch", "momentum", "weight_decay", "loss_scale",
                "label_smoothing"
            ]
            for p in mandatory_params:
                if p not in params:
                    raise RuntimeError("Parameter {} is missing.".format(p))

        if mode == tf.estimator.ModeKeys.TRAIN and not self.model_hparams.use_dali:

            with tf.device('/cpu:0'):
                # Stage inputs on the host
                cpu_prefetch_op, (features, labels) = self._stage([features, labels])

            if not self.model_hparams.use_cpu:
                with tf.device('/gpu:0'):
                    # Stage inputs to the device
                    gpu_prefetch_op, (features, labels) = self._stage([features, labels])

        main_device = "/gpu:0" if not self.model_hparams.use_cpu else "/cpu:0"
        with tf.device(main_device):

            if features.dtype != self.model_hparams.dtype:
                features = tf.cast(features, self.model_hparams.dtype)

            # Subtract mean per channel
            # and enforce values between [-1, 1]
            if not self.model_hparams.use_dali:
                features = normalized_inputs(features)

            mixup = 0
            eta = 0

            if mode == tf.estimator.ModeKeys.TRAIN:
                eta = params['label_smoothing']
                mixup = params['mixup']

            if mode != tf.estimator.ModeKeys.PREDICT:
                n_cls = self.model_hparams.n_classes
                one_hot_smoothed_labels = tf.one_hot(labels, n_cls, 
                        on_value=1 - eta + eta / n_cls, off_value=eta / n_cls)
                if mixup != 0:

                    print("Using mixup training with beta=", params['mixup'])
                    beta_distribution = tf.distributions.Beta(params['mixup'], params['mixup'])

                    feature_coefficients = beta_distribution.sample(sample_shape=[params['batch_size'], 1, 1, 1])

                    reversed_feature_coefficients = tf.subtract(
                        tf.ones(shape=feature_coefficients.shape), feature_coefficients
                    )

                    rotated_features = tf.reverse(features, axis=[0])

                    features = feature_coefficients * features + reversed_feature_coefficients * rotated_features

                    label_coefficients = tf.squeeze(feature_coefficients, axis=[2, 3])

                    rotated_labels = tf.reverse(one_hot_smoothed_labels, axis=[0])

                    reversed_label_coefficients = tf.subtract(
                        tf.ones(shape=label_coefficients.shape), label_coefficients
                    )

                    one_hot_smoothed_labels = label_coefficients * one_hot_smoothed_labels + reversed_label_coefficients * rotated_labels

            # Update Global Step
            global_step = tf.train.get_or_create_global_step()
            tf.identity(global_step, name="global_step_ref")

            tf.identity(features, name="features_ref")

            if mode == tf.estimator.ModeKeys.TRAIN:
                tf.identity(labels, name="labels_ref")

            probs, logits = self.build_model(
                features,
                training=mode == tf.estimator.ModeKeys.TRAIN,
                reuse=False,
                use_final_conv=params['use_final_conv']
            )

            if params['use_final_conv']:
                logits = tf.squeeze(logits, axis=[-2, -1])

            y_preds = tf.argmax(logits, axis=1, output_type=tf.int32)

            # Check the output dtypes: probs and logits must be FP32 during training
            assert (probs.dtype == tf.float32)
            assert (logits.dtype == tf.float32)
            assert (y_preds.dtype == tf.int32)

            tf.identity(logits, name="logits_ref")
            tf.identity(probs, name="probs_ref")
            tf.identity(y_preds, name="y_preds_ref")

            #if mode == tf.estimator.ModeKeys.TRAIN:
            #
            #    assert (len(tf.trainable_variables()) == 161)
            #
            #else:
            #
            #    assert (len(tf.trainable_variables()) == 0)

            if mode == tf.estimator.ModeKeys.TRAIN and params['quantize']:
                dllogger.log(data={"QUANTIZATION AWARE TRAINING ENABLED": True}, step=tuple())
                if params['symmetric']:
                    dllogger.log(data={"MODE": "USING SYMMETRIC MODE"}, step=tuple())
                    tf.contrib.quantize.experimental_create_training_graph(
                        tf.get_default_graph(),
                        symmetric=True,
                        use_qdq=params['use_qdq'],
                        quant_delay=params['quant_delay']
                    )
                else:
                    dllogger.log(data={"MODE": "USING ASSYMETRIC MODE"}, step=tuple())
                    tf.contrib.quantize.create_training_graph(
                        tf.get_default_graph(), quant_delay=params['quant_delay'], use_qdq=params['use_qdq']
                    )

            # Fix for restoring variables during fine-tuning of Resnet
            if 'finetune_checkpoint' in params.keys():
                train_vars = tf.trainable_variables()
                train_var_dict = {}
                for var in train_vars:
                    train_var_dict[var.op.name] = var
                dllogger.log(data={"Restoring variables from checkpoint": params['finetune_checkpoint']}, step=tuple())
                tf.train.init_from_checkpoint(params['finetune_checkpoint'], train_var_dict)

        if mode == tf.estimator.ModeKeys.PREDICT:

            predictions = {'classes': y_preds, 'probabilities': probs}

            return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions,
                export_outputs={'predict': tf.estimator.export.PredictOutput(predictions)}
            )

        else:

            with tf.device(main_device):

                if mode == tf.estimator.ModeKeys.TRAIN:
                    acc_top1 = tf.nn.in_top_k(predictions=logits, targets=labels, k=1)
                    acc_top5 = tf.nn.in_top_k(predictions=logits, targets=labels, k=5)

                else:
                    acc_top1, acc_top1_update_op = tf.metrics.mean(
                        tf.nn.in_top_k(predictions=logits, targets=labels, k=1)
                    )
                    acc_top5, acc_top5_update_op = tf.metrics.mean(
                        tf.nn.in_top_k(predictions=logits, targets=labels, k=5)
                    )

                tf.identity(acc_top1, name="acc_top1_ref")
                tf.identity(acc_top5, name="acc_top5_ref")

                predictions = {
                    'classes': y_preds,
                    'probabilities': probs,
                    'accuracy_top1': acc_top1,
                    'accuracy_top5': acc_top5
                }

                cross_entropy = tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=one_hot_smoothed_labels)

                assert (cross_entropy.dtype == tf.float32)
                tf.identity(cross_entropy, name='cross_entropy_loss_ref')

                def loss_filter_fn(name):
                    """we don't need to compute L2 loss for BN and bias (eq. to add a cste)"""
                    return all(
                        [
                            tensor_name not in name.lower()
                            # for tensor_name in ["batchnorm", "batch_norm", "batch_normalization", "bias"]
                            for tensor_name in ["batchnorm", "batch_norm", "batch_normalization"]
                        ]
                    )

                filtered_params = [tf.cast(v, tf.float32) for v in tf.trainable_variables() if loss_filter_fn(v.name)]

                if len(filtered_params) != 0:

                    l2_loss_per_vars = [tf.nn.l2_loss(v) for v in filtered_params]
                    l2_loss = tf.multiply(tf.add_n(l2_loss_per_vars), params["weight_decay"])

                else:
                    l2_loss = tf.zeros(shape=(), dtype=tf.float32)

                assert (l2_loss.dtype == tf.float32)
                tf.identity(l2_loss, name='l2_loss_ref')

                total_loss = tf.add(cross_entropy, l2_loss, name="total_loss")

                assert (total_loss.dtype == tf.float32)
                tf.identity(total_loss, name='total_loss_ref')

                tf.summary.scalar('cross_entropy', cross_entropy)
                tf.summary.scalar('l2_loss', l2_loss)
                tf.summary.scalar('total_loss', total_loss)

                if mode == tf.estimator.ModeKeys.TRAIN:

                    with tf.device("/cpu:0"):

                        learning_rate = learning_rate_scheduler(
                            lr_init=params["lr_init"],
                            lr_warmup_epochs=params["lr_warmup_epochs"],
                            global_step=global_step,
                            batch_size=params["batch_size"],
                            num_batches_per_epoch=params["steps_per_epoch"],
                            num_decay_steps=params["num_decay_steps"],
                            num_gpus=params["num_gpus"],
                            use_cosine_lr=params["use_cosine_lr"]
                        )

                    tf.identity(learning_rate, name='learning_rate_ref')
                    tf.summary.scalar('learning_rate', learning_rate)

                    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params["momentum"])

                    if params["apply_loss_scaling"]:
                        optimizer = FixedLossScalerOptimizer(optimizer, scale=params["loss_scale"])

                    if hvd.size() > 1:
                        optimizer = hvd.hvd_global_object.DistributedOptimizer(optimizer)

                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if mode != tf.estimator.ModeKeys.TRAIN:
                        update_ops += [acc_top1_update_op, acc_top5_update_op]

                    deterministic = True
                    gate_gradients = (tf.compat.v1.train.Optimizer.GATE_OP if deterministic else tf.compat.v1.train.Optimizer.GATE_NONE)

                    backprop_op = optimizer.minimize(total_loss, gate_gradients=gate_gradients, global_step=global_step)

                    if self.model_hparams.use_dali:
                        train_ops = tf.group(backprop_op, update_ops, name='train_ops')
                    elif self.model_hparams.use_cpu:
                        train_ops = tf.group(
                            backprop_op, cpu_prefetch_op, update_ops, name='train_ops'
                        )
                    else:
                        train_ops = tf.group(
                            backprop_op, cpu_prefetch_op, gpu_prefetch_op, update_ops, name='train_ops'
                        )

                    return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_ops)

                elif mode == tf.estimator.ModeKeys.EVAL:
                    eval_metrics = {
                        "top1_accuracy": (acc_top1, acc_top1_update_op),
                        "top5_accuracy": (acc_top5, acc_top5_update_op)
                    }

                    return tf.estimator.EstimatorSpec(
                        mode=mode, predictions=predictions, loss=total_loss, eval_metric_ops=eval_metrics
                    )

                else:
                    raise NotImplementedError('Unknown mode {}'.format(mode))
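A hedged sketch of how a __call__-style model function like this one is handed to tf.estimator.Estimator: the model class name, `run_config`, and `model_dir` are placeholders, while the params dict is the estimator_params assembled in Example #6.

    # `ResnetModel` is a placeholder for the class that defines __call__ above.
    model = ResnetModel(model_hparams)

    estimator = tf.estimator.Estimator(
        model_fn=model,               # invoked as model(features, labels, mode, params)
        model_dir=model_dir,
        config=run_config,
        params=estimator_params)      # dict assembled in Example #6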
Example #9
                     loss_scale=FLAGS.static_loss_scale,
                     label_smoothing=FLAGS.label_smoothing,
                     mixup=FLAGS.mixup,
                     use_static_loss_scaling=(FLAGS.static_loss_scale != -1),
                     use_cosine_lr=FLAGS.cosine_lr,
                     is_benchmark=FLAGS.mode == 'training_benchmark',
                     use_final_conv=FLAGS.use_final_conv,
                     quantize=FLAGS.quantize,
                     symmetric=FLAGS.symmetric,
                     quant_delay=FLAGS.quant_delay,
                     use_qdq=FLAGS.use_qdq,
                     finetune_checkpoint=FLAGS.finetune_checkpoint)

    if FLAGS.mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark']:

        if FLAGS.mode == 'inference_benchmark' and hvd.size() > 1:
            raise NotImplementedError(
                "Only single GPU inference is implemented.")

        elif hvd.rank() == 0:
            runner.evaluate(iter_unit=FLAGS.iter_unit if FLAGS.mode != "train_and_evaluate" else "epoch",
                            num_iter=FLAGS.num_iter if FLAGS.mode != "train_and_evaluate" else 1,
                            warmup_steps=FLAGS.warmup_steps,
                            batch_size=FLAGS.batch_size,
                            log_every_n_steps=FLAGS.display_every,
                            is_benchmark=FLAGS.mode == 'inference_benchmark',
                            export_dir=FLAGS.export_dir,
                            quantize=FLAGS.quantize,
                            symmetric=FLAGS.symmetric,
                            use_final_conv=FLAGS.use_final_conv,
                            use_qdq=FLAGS.use_qdq)