Example #1
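    # Model function of a ResNet Estimator class. Assumed surrounding context
    # (not shown in this snippet): `import tensorflow as tf`,
    # `import horovod.tensorflow as hvd`, and project helpers `hvd_utils`,
    # `normalized_inputs`, `learning_rate_scheduler`, `FixedLossScalerOptimizer`.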
    def __call__(self, features, labels, mode, params):

        if mode == tf.estimator.ModeKeys.TRAIN:
            mandatory_params = ["batch_size", "lr_init", "num_gpus", "steps_per_epoch",
                                "momentum", "weight_decay", "loss_scale", "label_smoothing"]
            for p in mandatory_params:
                if p not in params:
                    raise RuntimeError("Parameter {} is missing.".format(p))

        if mode == tf.estimator.ModeKeys.TRAIN and not self.model_hparams.use_dali:

            with tf.device('/cpu:0'):
                # Stage inputs on the host
                cpu_prefetch_op, (features, labels) = self._stage([features, labels])

            with tf.device('/gpu:0'):
                # Stage inputs to the device
                gpu_prefetch_op, (features, labels) = self._stage([features, labels])
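            # Together the two staging areas pipeline the input feed: while
            # the GPU consumes batch N, batch N+1 is in flight on the
            # host->device copy and batch N+2 is being staged on the host.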

        with tf.device("/gpu:0"):

            if features.dtype != self.model_hparams.dtype:
                features = tf.cast(features, self.model_hparams.dtype)

            # Subtract mean per channel
            # and enforce values between [-1, 1]
            if not self.model_hparams.use_dali:
                features = normalized_inputs(features)

            mixup = 0
            eta = 0
            
            if mode == tf.estimator.ModeKeys.TRAIN:        
                eta = params['label_smoothing']
                mixup = params['mixup']
                
            if mode != tf.estimator.ModeKeys.PREDICT:
                # Label smoothing: the true class keeps 1 - eta of the
                # probability mass, and eta is spread uniformly over all
                # 1001 classes (1000 ImageNet classes + background).
                one_hot_smoothed_labels = tf.one_hot(labels, 1001,
                                                     on_value=1 - eta + eta / 1001,
                                                     off_value=eta / 1001)
                if mixup != 0:
                    print("Using mixup training with beta=", params['mixup'])

                    # Draw a mixing coefficient lambda ~ Beta(beta, beta) per
                    # example and form convex combinations of the batch with
                    # itself in reversed order (see the standalone sketch
                    # after this example).
                    beta_distribution = tf.distributions.Beta(params['mixup'], params['mixup'])
                    feature_coefficients = beta_distribution.sample(sample_shape=[params['batch_size'], 1, 1, 1])
                    reversed_feature_coefficients = tf.subtract(tf.ones(shape=feature_coefficients.shape), feature_coefficients)
                    rotated_features = tf.reverse(features, axis=[0])
                    features = feature_coefficients * features + reversed_feature_coefficients * rotated_features

                    # Blend the smoothed one-hot labels with the same
                    # coefficients, squeezed from [N, 1, 1, 1] to [N, 1] so
                    # they broadcast over the class dimension.
                    label_coefficients = tf.squeeze(feature_coefficients, axis=[2, 3])
                    rotated_labels = tf.reverse(one_hot_smoothed_labels, axis=[0])
                    reversed_label_coefficients = tf.subtract(tf.ones(shape=label_coefficients.shape), label_coefficients)
                    one_hot_smoothed_labels = label_coefficients * one_hot_smoothed_labels + reversed_label_coefficients * rotated_labels
                
                
            # Update Global Step
            global_step = tf.train.get_or_create_global_step()
            tf.identity(global_step, name="global_step_ref")

            tf.identity(features, name="features_ref")
            
            if mode == tf.estimator.ModeKeys.TRAIN:
                tf.identity(labels, name="labels_ref")

            probs, logits = self.build_model(
                features,
                training=mode == tf.estimator.ModeKeys.TRAIN,
                reuse=False
            )

            y_preds = tf.argmax(logits, axis=1, output_type=tf.int32)

            # Check output dtypes: these should be FP32 even when training in mixed precision
            assert (probs.dtype == tf.float32)
            assert (logits.dtype == tf.float32)
            assert (y_preds.dtype == tf.int32)

            tf.identity(logits, name="logits_ref")
            tf.identity(probs, name="probs_ref")
            tf.identity(y_preds, name="y_preds_ref")

            #if mode == tf.estimator.ModeKeys.TRAIN:
            #    
            #    assert (len(tf.trainable_variables()) == 161)
            #
            #else:
            #    
            #    assert (len(tf.trainable_variables()) == 0)


        if mode == tf.estimator.ModeKeys.PREDICT:

            predictions = {'classes': y_preds, 'probabilities': probs}

            return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions,
                export_outputs={'predict': tf.estimator.export.PredictOutput(predictions)}
            )

        else:

            with tf.device("/gpu:0"):

                if mode == tf.estimator.ModeKeys.TRAIN:
                    # In training these are per-example boolean tensors rather
                    # than streaming means; no metric state is needed here.
                    acc_top1 = tf.nn.in_top_k(predictions=logits, targets=labels, k=1)
                    acc_top5 = tf.nn.in_top_k(predictions=logits, targets=labels, k=5)

                else:
                    acc_top1, acc_top1_update_op = tf.metrics.mean(tf.nn.in_top_k(predictions=logits, targets=labels, k=1))
                    acc_top5, acc_top5_update_op = tf.metrics.mean(tf.nn.in_top_k(predictions=logits, targets=labels, k=5))

                tf.identity(acc_top1, name="acc_top1_ref")
                tf.identity(acc_top5, name="acc_top5_ref")

                predictions = {
                    'classes': y_preds,
                    'probabilities': probs,
                    'accuracy_top1': acc_top1,
                    'accuracy_top5': acc_top5
                }
                
                cross_entropy = tf.losses.softmax_cross_entropy(
                    logits=logits, onehot_labels=one_hot_smoothed_labels)

                assert (cross_entropy.dtype == tf.float32)
                tf.identity(cross_entropy, name='cross_entropy_loss_ref')

                def loss_filter_fn(name):
                    """Skip L2 regularization for batch-norm (and optionally bias) parameters, which gain nothing from weight decay."""
                    return all([
                        tensor_name not in name.lower()
                        # for tensor_name in ["batchnorm", "batch_norm", "batch_normalization", "bias"]
                        for tensor_name in ["batchnorm", "batch_norm", "batch_normalization"]
                    ])

                filtered_params = [tf.cast(v, tf.float32) for v in tf.trainable_variables() if loss_filter_fn(v.name)]

                if len(filtered_params) != 0:

                    l2_loss_per_vars = [tf.nn.l2_loss(v) for v in filtered_params]
                    l2_loss = tf.multiply(tf.add_n(l2_loss_per_vars), params["weight_decay"])

                else:
                    l2_loss = tf.zeros(shape=(), dtype=tf.float32)

                assert (l2_loss.dtype == tf.float32)
                tf.identity(l2_loss, name='l2_loss_ref')

                total_loss = tf.add(cross_entropy, l2_loss, name="total_loss")

                assert (total_loss.dtype == tf.float32)
                tf.identity(total_loss, name='total_loss_ref')

                tf.summary.scalar('cross_entropy', cross_entropy)
                tf.summary.scalar('l2_loss', l2_loss)
                tf.summary.scalar('total_loss', total_loss)
                
                if mode == tf.estimator.ModeKeys.TRAIN:

                    with tf.device("/cpu:0"):

                        learning_rate = learning_rate_scheduler(
                            lr_init=params["lr_init"],
                            lr_warmup_epochs=params["lr_warmup_epochs"],
                            global_step=global_step,
                            batch_size=params["batch_size"],
                            num_batches_per_epoch=params["steps_per_epoch"],
                            num_decay_steps=params["num_decay_steps"],
                            num_gpus=params["num_gpus"],
                            use_cosine_lr=params["use_cosine_lr"]
                        )

                    tf.identity(learning_rate, name='learning_rate_ref')
                    tf.summary.scalar('learning_rate', learning_rate)

                    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params["momentum"])

                    if params["apply_loss_scaling"]:
                        optimizer = FixedLossScalerOptimizer(optimizer, scale=params["loss_scale"])

                    if hvd_utils.is_using_hvd():
                        optimizer = hvd.DistributedOptimizer(optimizer)

                    # Batch-norm moving-average updates are registered in
                    # UPDATE_OPS and must run alongside the train step
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    
                    deterministic = True
                    # GATE_OP waits for all of an op's gradients to be computed
                    # before they are used, avoiding nondeterministic races;
                    # GATE_NONE trades that for maximal parallelism
                    gate_gradients = (tf.train.Optimizer.GATE_OP if deterministic else tf.train.Optimizer.GATE_NONE)

                    backprop_op = optimizer.minimize(total_loss, gate_gradients=gate_gradients, global_step=global_step)

                    
                    if self.model_hparams.use_dali:
                        train_ops = tf.group(backprop_op, update_ops, name='train_ops')
                    else:
                        train_ops = tf.group(backprop_op, cpu_prefetch_op, gpu_prefetch_op, update_ops, name='train_ops')

                    return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_ops)

                elif mode == tf.estimator.ModeKeys.EVAL:
                    eval_metrics = {
                        "top1_accuracy": (acc_top1, acc_top1_update_op),
                        "top5_accuracy": (acc_top5, acc_top5_update_op)
                    }

                    return tf.estimator.EstimatorSpec(
                        mode=mode,
                        predictions=predictions,
                        loss=total_loss,
                        eval_metric_ops=eval_metrics
                    )

                else:
                    raise NotImplementedError('Unknown mode {}'.format(mode))
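
The label-smoothing and mixup arithmetic above is easy to sanity-check in isolation. Below is a minimal NumPy sketch of the same math (hypothetical helpers, not part of the source): eta probability mass is spread uniformly over the classes, lambda is drawn from Beta(beta, beta) per example, and each feature/label pair is blended with its counterpart in the reversed batch.

    import numpy as np

    def smooth_labels(labels, num_classes=1001, eta=0.1):
        # eta of the probability mass is spread uniformly over all classes
        one_hot = np.full((labels.size, num_classes), eta / num_classes)
        one_hot[np.arange(labels.size), labels] = 1 - eta + eta / num_classes
        return one_hot

    def mixup_batch(features, one_hot_labels, beta, rng=None):
        rng = rng or np.random.default_rng()
        # lambda ~ Beta(beta, beta), one coefficient per example, shaped to
        # broadcast over [N, H, W, C] features
        lam = rng.beta(beta, beta, size=(features.shape[0], 1, 1, 1))
        mixed_features = lam * features + (1 - lam) * features[::-1]
        lam_2d = lam.reshape(-1, 1)  # [N, 1] broadcasts over classes
        mixed_labels = lam_2d * one_hot_labels + (1 - lam_2d) * one_hot_labels[::-1]
        return mixed_features, mixed_labels

With eta = 0 and mixup disabled, both reduce to plain one-hot targets, matching the eta = mixup = 0 defaults the model function sets for non-training modes.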
Example #2
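    # A variant of the same model function. In addition to the context assumed
    # for Example #1, this one references `os`, `ResnetModel._stage`, and the
    # TF 1.x TensorRT integration (likely `from tensorflow.contrib import
    # tensorrt as trt`).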
    def __call__(self, features, labels, mode, params):

        # print(params)

        if mode == tf.estimator.ModeKeys.TRAIN:

            if "batch_size" not in params.keys():
                raise RuntimeError("Parameter `batch_size` is missing...")

            if "learning_rate_init" not in params.keys():
                raise RuntimeError("Parameter `learning_rate` is missing...")

            if "num_gpus" not in params.keys():
                raise RuntimeError("Parameter `num_gpus` is missing...")

            if "steps_per_epoch" not in params.keys():
                raise RuntimeError("Parameter `steps_per_epoch` is missing...")

            if "momentum" not in params.keys():
                raise RuntimeError("Parameter `momentum` is missing...")

            if "weight_decay" not in params.keys():
                raise RuntimeError("Parameter `weight_decay` is missing...")

            if "loss_scale" not in params.keys():
                raise RuntimeError("Parameter `loss_scale` is missing...")

        if mode == tf.estimator.ModeKeys.TRAIN:
            with tf.device('/cpu:0'):
                # Stage inputs on the host
                cpu_prefetch_op, (features, labels) = ResnetModel._stage(
                    [features, labels])

            with tf.device('/gpu:0'):
                # Stage inputs to the device
                gpu_prefetch_op, (features, labels) = ResnetModel._stage(
                    [features, labels])

        with tf.device("/gpu:0"):

            if True:  # TF-TRT path disabled; restore `not params['use_trt']` to re-enable the branch below

                if features.dtype != self.model_hparams.dtype:
                    features = tf.cast(features, self.model_hparams.dtype)

                # Subtract mean per channel
                # and enforce values between [-1, 1]
                # features = normalized_inputs(features)

                # Update Global Step
                global_step = tf.train.get_or_create_global_step()
                tf.identity(global_step, name="global_step_ref")

                # tf.identity(features, name="features_ref")
                # tf.identity(labels, name="labels_ref")

                probs, logits = self.build_model(
                    features,
                    training=mode == tf.estimator.ModeKeys.TRAIN,
                    reuse=False)

            else:

                trt_graph = trt.create_inference_graph(
                    input_graph_def=None,
                    outputs=None,
                    input_saved_model_dir=os.path.join(
                        self.model_hparams.model_dir, '1554216247'),
                    input_saved_model_tags=['serve'],
                    max_batch_size=params["batch_size"],
                    max_workspace_size_bytes=1 << 20,
                    precision_mode="FP32")

                for node in trt_graph.node:
                    print(node.name)

                y_preds = tf.import_graph_def(
                    trt_graph,
                    return_elements=['resnet50_v1.5/output/softmax:0'])

                predictions = {'classes': y_preds[0]}

                return tf.estimator.EstimatorSpec(
                    mode=tf.estimator.ModeKeys.PREDICT,
                    predictions=predictions)

            y_preds = tf.argmax(logits, axis=1, output_type=tf.int32)

            # Check output dtypes: these should be FP32 even when training in mixed precision
            assert (probs.dtype == tf.float32)
            assert (logits.dtype == tf.float32)
            assert (y_preds.dtype == tf.int32)

            tf.identity(logits, name="logits_ref")
            tf.identity(probs, name="probs_ref")
            tf.identity(y_preds, name="y_preds_ref")

            if mode == tf.estimator.ModeKeys.TRAIN:

                assert (len(tf.trainable_variables()) == 161)

            else:

                assert (len(tf.trainable_variables()) == 0)

        if mode == tf.estimator.ModeKeys.PREDICT:

            predictions = {'classes': y_preds, 'probabilities': probs}

            return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions,
                export_outputs={
                    'predict': tf.estimator.export.PredictOutput(predictions)
                })

        else:

            with tf.device("/gpu:0"):

                if mode == tf.estimator.ModeKeys.TRAIN:
                    acc_top1 = tf.nn.in_top_k(predictions=logits,
                                              targets=labels,
                                              k=1)
                    acc_top5 = tf.nn.in_top_k(predictions=logits,
                                              targets=labels,
                                              k=5)

                else:
                    acc_top1, acc_top1_update_op = tf.metrics.mean(
                        tf.nn.in_top_k(predictions=logits, targets=labels,
                                       k=1))
                    acc_top5, acc_top5_update_op = tf.metrics.mean(
                        tf.nn.in_top_k(predictions=logits, targets=labels,
                                       k=5))

                tf.identity(acc_top1, name="acc_top1_ref")
                tf.identity(acc_top5, name="acc_top5_ref")

                predictions = {
                    'classes': y_preds,
                    'probabilities': probs,
                    'accuracy_top1': acc_top1,
                    'accuracy_top5': acc_top5
                }

                cross_entropy = tf.losses.sparse_softmax_cross_entropy(
                    logits=logits, labels=labels)

                assert (cross_entropy.dtype == tf.float32)
                tf.identity(cross_entropy, name='cross_entropy_loss_ref')

                def loss_filter_fn(name):
                    """Skip L2 regularization for batch-norm (and optionally bias) parameters, which gain nothing from weight decay."""
                    return all([
                        tensor_name not in name.lower()
                        # for tensor_name in ["batchnorm", "batch_norm", "batch_normalization", "bias"]
                        for tensor_name in
                        ["batchnorm", "batch_norm", "batch_normalization"]
                    ])

                filtered_params = [
                    tf.cast(v, tf.float32) for v in tf.trainable_variables()
                    if loss_filter_fn(v.name)
                ]

                if len(filtered_params) != 0:

                    l2_loss_per_vars = [
                        tf.nn.l2_loss(v) for v in filtered_params
                    ]
                    l2_loss = tf.multiply(tf.add_n(l2_loss_per_vars),
                                          params["weight_decay"])

                else:
                    l2_loss = tf.zeros(shape=(), dtype=tf.float32)

                assert (l2_loss.dtype == tf.float32)
                tf.identity(l2_loss, name='l2_loss_ref')

                total_loss = tf.add(cross_entropy, l2_loss, name="total_loss")

                assert (total_loss.dtype == tf.float32)
                tf.identity(total_loss, name='total_loss_ref')

                tf.summary.scalar('cross_entropy', cross_entropy)
                tf.summary.scalar('l2_loss', l2_loss)
                tf.summary.scalar('total_loss', total_loss)

                if mode == tf.estimator.ModeKeys.TRAIN:

                    with tf.device("/cpu:0"):

                        learning_rate = learning_rate_scheduler(
                            learning_rate_init=params["learning_rate_init"],
                            global_step=global_step,
                            batch_size=params["batch_size"],
                            num_batches_per_epoch=params["steps_per_epoch"],
                            num_gpus=params["num_gpus"])

                    tf.identity(learning_rate, name='learning_rate_ref')
                    tf.summary.scalar('learning_rate', learning_rate)

                    optimizer = tf.train.MomentumOptimizer(
                        learning_rate=learning_rate,
                        momentum=params["momentum"])

                    if params["apply_loss_scaling"]:
                        optimizer = FixedLossScalerOptimizer(
                            optimizer, scale=params["loss_scale"])

                    if hvd_utils.is_using_hvd():
                        optimizer = hvd.DistributedOptimizer(optimizer)

                    # Batch-norm moving-average updates are registered in
                    # UPDATE_OPS and must run alongside the train step
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

                    deterministic = True
                    gate_gradients = (tf.train.Optimizer.GATE_OP
                                      if deterministic else
                                      tf.train.Optimizer.GATE_NONE)

                    backprop_op = optimizer.minimize(
                        total_loss,
                        gate_gradients=gate_gradients,
                        global_step=global_step)

                    train_ops = tf.group(backprop_op,
                                         cpu_prefetch_op,
                                         gpu_prefetch_op,
                                         update_ops,
                                         name='train_ops')

                    return tf.estimator.EstimatorSpec(mode=mode,
                                                      loss=total_loss,
                                                      train_op=train_ops)

                elif mode == tf.estimator.ModeKeys.EVAL:

                    eval_metrics = {
                        "top1_accuracy": (acc_top1, acc_top1_update_op),
                        "top5_accuracy": (acc_top5, acc_top5_update_op)
                    }

                    return tf.estimator.EstimatorSpec(
                        mode=mode,
                        predictions=predictions,
                        loss=total_loss,
                        eval_metric_ops=eval_metrics)

                else:
                    raise NotImplementedError('Unknown mode {}'.format(mode))
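
Both examples stage inputs through a `_stage` helper that is not shown in this section. A plausible minimal sketch, reconstructed from the call sites above under the assumption that it wraps the TF 1.x `tf.contrib.staging.StagingArea` API (the returned put op is what gets grouped into `train_ops`):

    import tensorflow as tf

    def _stage(tensors):
        """Stage a list of tensors on the device chosen by the caller."""
        stage_area = tf.contrib.staging.StagingArea(
            dtypes=[t.dtype for t in tensors],
            shapes=[t.get_shape() for t in tensors])

        # put() must run once per step -- the caller groups the returned op
        # into `train_ops`; get() dequeues the batch staged on the previous step
        put_op = stage_area.put(tensors)
        get_tensors = stage_area.get()

        return put_op, get_tensors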