def get_model_fn(features, labels, mode, params):
    """Returns a function that will build the TargetedLearning framework."""
    """Model body.

	Args:
		features: a list of tensors
		labels: a list of tensors
		mode: ModeKeys.TRAIN or EVAL
		params: Hyperparameters suitable for tuning
	Returns:
		A EstimatorSpec object.
	"""
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    weight_decay = params.weight_decay
    out_lr = 0.1  #params.learning_rate

    train_features = features[0]
    train_labels = labels[0]
    if is_training:
        val_features = features[1]
        val_labels = labels[1]
    else:
        val_features = features[0]
        val_labels = labels[0]

    # channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
    # on CPU. The exception is Intel MKL on CPU which is optimal with
    # channels_last.
    num_gpus = len(utils.get_available_gpus())
    data_format = params.data_format
    if not data_format:
        if num_gpus == 0:
            data_format = 'channels_last'
        else:
            data_format = 'channels_first'

    train_op = []

    # Building the base model
    with tf.compat.v1.variable_scope('base_model') as var_scope:
        if params.dataset == 'mnist':
            base_model = model.BilevelLenet(num_class=params.num_class)
        else:
            base_model = model.BilevelResNet(resnet_size=params.num_layers,
                                             num_classes=params.num_class,
                                             resnet_version=params.version)
        base_model_logits = base_model(train_features, is_training)
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS, var_scope.name)
        extra_update_ops = base_model.get_updates_for(train_features)
        update_ops.extend(extra_update_ops)
        # Get the params of the model
        base_model_params = tf.compat.v1.trainable_variables(
            scope=var_scope.name)

        # Set initial weights
        class_init = np.array([[1.0 / params.num_class]
                               for _ in range(params.num_class)
                               ]).astype(np.float32)
        class_weights = tf.compat.v1.get_variable('class_weight',
                                                  initializer=class_init)

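        # Per-example weight: the one-hot labels (batch x num_class) multiplied
        # by class_weights (num_class x 1) selects each example's class weight.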
        weight = tf.matmul(
            tf.cast(
                tf.one_hot(train_labels,
                           len(class_init),
                           on_value=1,
                           off_value=0), tf.float32), class_weights)

        # Get the loss of the main model
        base_model_loss, base_model_preds = _loss_fn(
            base_model_logits,
            tf.one_hot(train_labels, params.num_class, on_value=1,
                       off_value=0))
        base_model_loss_reduced = tf.reduce_mean(
            tf.squeeze(weight) * base_model_loss) + weight_decay * tf.add_n(
                [tf.nn.l2_loss(v) for v in base_model_params])

    # Define the outer model's logits, which is the bilevel model
    with tf.compat.v1.variable_scope(
            'bilevel_model', reuse=tf.compat.v1.AUTO_REUSE) as var_scope1:
        base_model.perturb_model_weights(base_model_loss_reduced,
                                         params.learning_rate, var_scope.name)
        target_logits = base_model(val_features, False)
        target_params = tf.compat.v1.trainable_variables(scope=var_scope1.name)
        target_loss, target_preds = _loss_fn(
            target_logits,
            tf.one_hot(val_labels, params.num_class, on_value=1, off_value=0))
        target_loss = tf.reduce_mean(target_loss) + weight_decay * tf.add_n(
            [tf.nn.l2_loss(v) for v in target_params])

    # Calculate the gradients with respect to the class weights and normalize it
    class_weight_gradient = tf.gradients(target_loss, class_weights)
    update_class_weights = tf.clip_by_value(class_weights -
                                            out_lr * class_weight_gradient[0],
                                            clip_value_min=0.0,
                                            clip_value_max=100.0)
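    # Renormalize the clipped weights so they sum to one; the small epsilon
    # guards against division by zero if every weight is clipped to zero.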
    sum_class_weights = tf.reduce_sum(update_class_weights) + 2e-12
    update_class_weights /= sum_class_weights

    # Update the weight every n steps.
    weight_update_hook = utils.WeightUpdateHook1(
        class_weights,
        update_class_weights,
        every_n_steps=10,
        log_every_n_step=params.log_freq)

    # Calculate the base model grads
    base_model_grads = tf.gradients(base_model_loss_reduced, base_model_params)
    base_model_gradvars = zip(base_model_grads, base_model_params)

    boundaries = [
        params.num_batches_per_epoch * x
        for x in np.array([91, 136, 182], dtype=np.int64)
    ]
    staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.001]]
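    # Piecewise-constant schedule: drop the learning rate by 10x at epochs 91,
    # 136 and 182 (expressed in global steps via num_batches_per_epoch).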

    learning_rate = tf.compat.v1.train.piecewise_constant(
        tf.compat.v1.train.get_global_step(), boundaries, staged_lr)

    # Define optimizer
    optimizer = tf.compat.v1.train.MomentumOptimizer(
        learning_rate=learning_rate, momentum=params.momentum)
    # optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
    train_op.append(
        optimizer.apply_gradients(
            base_model_gradvars,
            global_step=tf.compat.v1.train.get_global_step()))

    # Calculate metrics
    target_accuracy = tf.compat.v1.metrics.accuracy(val_labels,
                                                    target_preds['classes'])
    accuracy = tf.compat.v1.metrics.accuracy(train_labels,
                                             base_model_preds['classes'])
    # The following metrics are for the binary classification scenario.
    # They should be adapted for multiclass classification tasks.
    if params.num_class == 2:
        train_labels_mask = tf.cast(train_labels, tf.bool)
        inverse_train_labels_mask = tf.cast(
            tf.math.logical_not(train_labels_mask), tf.float32)
        inverse_prediction_mask = tf.cast(
            tf.math.logical_not(tf.cast(base_model_preds['classes'], tf.bool)),
            tf.float32)
        recall_minor = tf.compat.v1.metrics.recall(inverse_train_labels_mask,
                                                   inverse_prediction_mask)
        recall_major = tf.compat.v1.metrics.recall(train_labels,
                                                   base_model_preds['classes'])
        precision_minor = tf.compat.v1.metrics.precision(
            inverse_train_labels_mask, inverse_prediction_mask)
        metrics = {
            'obj/accuracy': accuracy,
            'metrics/recall_minor': recall_minor,
            'metrics/recall_major': recall_major,
            'metrics/precision_minor': precision_minor
        }
    else:
        metrics = {'obj/accuracy': accuracy}

    examples_sec_hook = utils.ExamplesPerSecondHook(
        params.train_batch_size, every_n_steps=params.log_freq)

    tensors_to_log = {
        'Target loss': target_loss,
        'Main loss': base_model_loss_reduced,
        'Target accuracy': target_accuracy[1],
        'Main accuracy': accuracy[1],
        'learning_rates': learning_rate,
        'step': tf.compat.v1.train.get_global_step()
    }

    logging_hook = tf.estimator.LoggingTensorHook(tensors=tensors_to_log,
                                                  every_n_iter=params.log_freq)
    train_hooks = [weight_update_hook, logging_hook, examples_sec_hook]

    train_op.extend(update_ops)
    train_op = tf.group(*train_op)

    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=target_preds,
                                      loss=base_model_loss_reduced,
                                      train_op=train_op,
                                      training_hooks=train_hooks,
                                      eval_metric_ops=metrics)
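

# Usage sketch (an assumption, not part of the original example): a model_fn
# with the (features, labels, mode, params) signature above is normally handed
# straight to tf.estimator.Estimator, which forwards `params` to each call.
# The `hparams` object and `model_dir` below are hypothetical placeholders.
def make_targeted_learning_estimator(hparams, model_dir):
    return tf.estimator.Estimator(model_fn=get_model_fn,
                                  model_dir=model_dir,
                                  params=hparams)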
# Example #2
    def model_fn(features, labels, mode):
        """Inception_Resnet_V2 model body.
        Support single host, one or more GPU training. Parameter distribution can
        be either one of the following scheme.
        1. CPU is the parameter server and manages gradient updates.
        2. Parameters are distributed evenly across all GPUs, and the first GPU
        manages gradient updates.
        Args:
        features: a list of tensors, one for each tower
        labels: a list of tensors, one for each tower
        mode: ModeKeys.TRAIN or EVAL
        Returns:
        A EstimatorSpec object.
        """
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        tower_features = features
        tower_labels = labels
        tower_losses = []
        tower_gradvars = []
        tower_preds = []

        # channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
        # on CPU. The exception is Intel MKL on CPU which is optimal with
        # channels_last.
        data_format = None
        if not data_format:
            if GPU_COUNT == 0:
                data_format = 'channels_last'
            else:
                data_format = 'channels_first'

        if GPU_COUNT == 0:
            num_devices = 1
            device_type = 'cpu'
        else:
            num_devices = GPU_COUNT
            device_type = 'gpu'

        for i in range(num_devices):
            worker_device = '/{}:{}'.format(device_type, i)
            if VARIABLE_STRATEGY == 'CPU':
                device_setter = utils.local_device_setter(
                    worker_device=worker_device)
            elif VARIABLE_STRATEGY == 'GPU':
                device_setter = utils.local_device_setter(
                    ps_device_type='gpu',
                    worker_device=worker_device,
                    ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                        GPU_COUNT, tf.contrib.training.byte_size_load_fn))
            with tf.variable_scope('', reuse=bool(i != 0)):
                with tf.name_scope('tower_%d' % i) as name_scope:
                    with tf.device(device_setter):
                        loss, gradvars, preds = tower_fn(is_training, tower_features[i],
                                                         tower_labels and tower_labels[i], num_classes)
                        tower_losses.append(loss)
                        tower_gradvars.append(gradvars)
                        tower_preds.append(preds)
                        if i == 0:
                            # Only trigger batch_norm moving mean and variance update from
                            # the 1st tower. Ideally, we should grab the updates from all
                            # towers but these stats accumulate extremely fast so we can
                            # ignore the other stats from the other towers without
                            # significant detriment.
                            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                                           name_scope)
        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            # Now compute global loss and gradients.
            gradvars = []
            with tf.name_scope('gradient_averaging'):
                all_grads = {}
                for grad, var in itertools.chain(*tower_gradvars):
                    if grad is not None:
                        all_grads.setdefault(var, []).append(grad)
                for var, grads in six.iteritems(all_grads):
                    # Average gradients on the same device as the variables
                    # to which they apply.
                    with tf.device(var.device):
                        if len(grads) == 1:
                            avg_grad = grads[0]
                        else:
                            avg_grad = tf.multiply(
                                tf.add_n(grads), 1. / len(grads))
                    gradvars.append((avg_grad, var))

            # Device that runs the ops to apply global gradient updates.
            consolidation_device = '/gpu:0' if VARIABLE_STRATEGY == 'GPU' else '/cpu:0'
            with tf.device(consolidation_device):
                loss = tf.reduce_mean(tower_losses, name='loss')

                examples_sec_hook = utils.ExamplesPerSecondHook(
                    BATCH_SIZE, every_n_steps=10)

                global_step = tf.train.get_global_step()

                learning_rate = tf.constant(LEARNING_RATE)

                tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}

                logging_hook = tf.train.LoggingTensorHook(
                    tensors=tensors_to_log, every_n_iter=100)

                initializer_hook = utils.IteratorInitializerHook()

                train_hooks = [initializer_hook, logging_hook, examples_sec_hook]

                optimizer = tf.train.MomentumOptimizer(
                    learning_rate=LEARNING_RATE, momentum=MOMENTUM)

                # Create single grouped train op
                train_op = [
                    optimizer.apply_gradients(gradvars, global_step=global_step)
                ]
                train_op.extend(update_ops)
                train_op = tf.group(*train_op)

                predictions = {
                    'classes':
                        tf.concat([p['classes'] for p in tower_preds], axis=0),
                    'probabilities':
                        tf.concat([p['probabilities'] for p in tower_preds],
                                  axis=0)
                }
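                # Concatenate the per-tower labels so they line up with the
                # concatenated per-tower predictions above.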
                stacked_labels = tf.concat(labels, axis=0)
                metrics = {
                    'accuracy':
                        tf.metrics.accuracy(stacked_labels, predictions['classes'])
                }

            return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions,
                loss=loss,
                train_op=train_op,
                training_hooks=train_hooks,
                eval_metric_ops=metrics)
        else:
            predictions = {
                'classes':
                    tf.concat([p['classes'] for p in tower_preds], axis=0),
                'probabilities':
                    tf.concat([p['probabilities'] for p in tower_preds],
                              axis=0),
                'features': tf.concat([feature for feature in features], axis=0)
            }
            return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions)
    def _hg_model_fn(features, labels, mode, params):
        """ HG model body.

    Support single host, one or more GPU training. Parameter distribution can
    be either one of the following scheme.
    1. CPU is the parameter server and manages gradient updates.
    2. Parameters are distributed evenly across all GPUs, and the first GPU
       manages gradient updates.

    Args:
      features: a list of tensors, one for each tower
      labels: a list of tensors, one for each tower
      mode: ModeKeys.TRAIN or EVAL
      params: Hyperparameters suitable for tuning
    Returns:
      A EstimatorSpec object.
    """
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        weight_decay = params.weight_decay
        momentum = params.momentum
        decay_factor = params.decay_factor
        decay_step = params.decay_step
        init_learning_rate = params.init_learning_rate
        num_stacks = params.num_stacks
        num_joints = params.num_joints

        tower_features = features
        if mode == tf.estimator.ModeKeys.PREDICT:
            if num_gpus < 1:
                tower_labels = [None]
            else:
                tower_labels = [None for i in range(num_gpus)]
        else:
            tower_labels = labels

        tower_losses = []
        tower_gradvars = []
        tower_preds = []

        # channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
        # on CPU. The exception is Intel MKL on CPU which is optimal with
        # channels_last.
        data_format = params.data_format
        if not data_format:
            if num_gpus == 0:
                data_format = 'channels_last'
            else:
                data_format = 'channels_first'

        if num_gpus == 0:
            num_devices = 1
            device_type = 'cpu'
        else:
            num_devices = num_gpus
            device_type = 'gpu'

        for i in range(num_devices):
            worker_device = '/{}:{}'.format(device_type, i)
            if variable_strategy == 'CPU':
                device_setter = utils.local_device_setter(
                    worker_device=worker_device)
            elif variable_strategy == 'GPU':
                device_setter = utils.local_device_setter(
                    ps_device_type='gpu',
                    worker_device=worker_device,
                    ps_strategy=tf.contrib.training.
                    GreedyLoadBalancingStrategy(
                        num_gpus, tf.contrib.training.byte_size_load_fn))
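            # Split the global batch evenly across the available devices.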
            if mode == tf.estimator.ModeKeys.TRAIN:
                batch_size = params.train_batch_size / num_devices
            else:
                batch_size = params.eval_batch_size / num_devices

            with tf.variable_scope('hg', reuse=bool(i != 0)):
                with tf.name_scope('tower_%d' % i) as name_scope:
                    with tf.device(device_setter):
                        loss, gradvars, preds = _tower_fn(
                            mode, weight_decay, tower_features[i][0],
                            tower_labels[i], data_format,
                            params.batch_norm_decay, params.batch_norm_epsilon,
                            params.num_stacks, params.num_out, params.n_low,
                            params.num_joints, batch_size, params.seq_length)
                        tower_losses.append(loss)
                        tower_gradvars.append(gradvars)
                        tower_preds.append(preds)
                        if i == 0:
                            # Only trigger batch_norm moving mean and variance update from
                            # the 1st tower. Ideally, we should grab the updates from all
                            # towers but these stats accumulate extremely fast so we can
                            # ignore the other stats from the other towers without
                            # significant detriment.
                            update_ops = tf.get_collection(
                                tf.GraphKeys.UPDATE_OPS, name_scope)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:

            # Now compute global loss and gradients.
            gradvars = []
            with tf.name_scope('gradient_averaging'):
                all_grads = {}
                for grad, var in itertools.chain(*tower_gradvars):
                    if grad is not None:
                        all_grads.setdefault(var, []).append(grad)
                for var, grads in six.iteritems(all_grads):
                    # Average gradients on the same device as the variables
                    # to which they apply.
                    with tf.device(var.device):
                        if len(grads) == 1:
                            avg_grad = grads[0]
                        else:
                            avg_grad = tf.multiply(tf.add_n(grads),
                                                   1. / len(grads))
                    gradvars.append((avg_grad, var))

            # Device that runs the ops to apply global gradient updates.
            consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
            with tf.device(consolidation_device):

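                # Decay the learning rate by decay_factor every decay_step
                # global steps (staircase schedule).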
                learning_rate = tf.train.exponential_decay(
                    init_learning_rate,
                    tf.train.get_global_step(),
                    decay_step,
                    decay_factor,
                    staircase=True,
                    name='learning_rate')

                loss = tf.reduce_mean(tower_losses, name='loss')

                examples_sec_hook = utils.ExamplesPerSecondHook(
                    params.train_batch_size, every_n_steps=10)

                tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}

                logging_hook = tf.train.LoggingTensorHook(
                    tensors=tensors_to_log, every_n_iter=100)

                train_hooks = [logging_hook, examples_sec_hook]

                optimizer = tf.train.RMSPropOptimizer(
                    learning_rate=learning_rate)

                if params.sync:
                    optimizer = tf.train.SyncReplicasOptimizer(
                        optimizer, replicas_to_aggregate=num_workers)
                    sync_replicas_hook = optimizer.make_session_run_hook(
                        params.is_chief)
                    train_hooks.append(sync_replicas_hook)

                # Create single grouped train op
                train_op = [
                    optimizer.apply_gradients(
                        gradvars, global_step=tf.train.get_global_step())
                ]

                train_op.extend(update_ops)
                train_op = tf.group(*train_op)

                predictions = {
                    'heatmaps':
                    tf.concat([p['heatmaps'] for p in tower_preds], axis=0),
                    'images':
                    tf.concat([i for i in tower_features], axis=0)
                }
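                # During evaluation, compute a per-joint score from the
                # predicted and ground-truth heatmaps via _pck_hm and report
                # its mean as the evaluation metric.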
                if mode == tf.estimator.ModeKeys.EVAL:
                    hm = predictions['heatmaps']
                    stacked_labels = tf.concat(labels[0][0][0], axis=0)

                    gt_labels = tf.transpose(stacked_labels, [1, 0, 3, 4, 2])

                    joint_accur = []
                    for j in range(params.seq_length):
                        for i in range(params.num_joints):
                            joint_accur.append(
                                _pck_hm(hm[j, :, -1, :, :,
                                           i], gt_labels[j, :, :, :, i],
                                        params.eval_batch_size / num_devices))
                    accuracy = tf.stack(joint_accur)
                    metrics = {'Mean Pixel Error': tf.metrics.mean(accuracy)}
                    tf.logging.info('Accuracy op computed')
                else:
                    metrics = None

        else:
            train_op = None
            loss = None
            train_hooks = None
            metrics = None
            predictions = {
                'heatmaps': tf.concat([p['heatmaps'] for p in tower_preds],
                                      axis=0),
                'images': tf.concat([i for i in tower_features], axis=0)
            }

        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=train_hooks,
                                          eval_metric_ops=metrics)
# Example #4
    def _model_fn(features, labels, mode, params):
        """Resnet model body.

    Support single host, one or more GPU training. Parameter distribution can
    be either one of the following scheme.
    1. CPU is the parameter server and manages gradient updates.
    2. Parameters are distributed evenly across all GPUs, and the first GPU
       manages gradient updates.

    Args:
      features: a list of tensors, one for each tower
      labels: a list of tensors, one for each tower
      mode: ModeKeys.TRAIN or EVAL
      params: Hyperparameters suitable for tuning
    Returns:
      A EstimatorSpec object.
    """
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        weight_decay = params.weight_decay
        momentum = params.momentum

        tower_features = features
        tower_labels = labels
        tower_losses = []
        tower_gradvars = []
        tower_preds = []

        # channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
        # on CPU. The exception is Intel MKL on CPU which is optimal with
        # channels_last.
        data_format = params.data_format
        if not data_format:
            if num_gpus == 0:
                data_format = 'channels_last'
            else:
                data_format = 'channels_first'

        if num_gpus == 0:
            num_devices = 1
            device_type = 'cpu'
        else:
            num_devices = num_gpus
            device_type = 'gpu'

        for i in range(num_devices):
            worker_device = '/{}:{}'.format(device_type, i)
            if variable_strategy == 'CPU':
                device_setter = utils.local_device_setter(
                    worker_device=worker_device)
            elif variable_strategy == 'GPU':
                device_setter = utils.local_device_setter(
                    ps_device_type='gpu',
                    worker_device=worker_device,
                    ps_strategy=tf.contrib.training.
                    GreedyLoadBalancingStrategy(
                        num_gpus, tf.contrib.training.byte_size_load_fn))
            with tf.variable_scope(params.model_name, reuse=bool(i != 0)):
                with tf.name_scope('tower_%d' % i) as name_scope:
                    with tf.device(device_setter):
                        loss, gradvars, preds = _tower_fn(
                            is_training, params.dp_keep_prob, weight_decay,
                            tower_features[i], tower_labels[i], data_format,
                            params.num_layers, params.batch_norm_decay,
                            params.batch_norm_epsilon, params)
                        tower_losses.append(loss)
                        tower_gradvars.append(gradvars)
                        tower_preds.append(preds)
                        if i == 0:
                            # Only trigger batch_norm moving mean and variance update from
                            # the 1st tower. Ideally, we should grab the updates from all
                            # towers but these stats accumulate extremely fast so we can
                            # ignore the other stats from the other towers without
                            # significant detriment.
                            update_ops = tf.get_collection(
                                tf.GraphKeys.UPDATE_OPS, name_scope)

        # Now compute global loss and gradients.
        gradvars = []
        with tf.name_scope('gradient_averaging'):
            all_grads = {}
            for grad, var in itertools.chain(*tower_gradvars):
                if grad is not None:
                    all_grads.setdefault(var, []).append(grad)
            for var, grads in six.iteritems(all_grads):
                # Average gradients on the same device as the variables
                # to which they apply.
                with tf.device(var.device):
                    if len(grads) == 1:
                        avg_grad = grads[0]
                    else:
                        avg_grad = tf.multiply(tf.add_n(grads),
                                               1. / len(grads))
                gradvars.append((avg_grad, var))

        # Device that runs the ops to apply global gradient updates.
        consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
        with tf.device(consolidation_device):
            # Suggested learning rate scheduling from
            # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
            num_batches_per_epoch = imagenet.ImageNetDataSet.num_examples_per_epoch(
                'train') // (params.train_batch_size * num_workers)
            boundaries = [
                num_batches_per_epoch * x
                for x in np.array([30, 60, 90], dtype=np.int64)
            ]
            staged_lr = [
                params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]
            ]

            learning_rate = tf.train.piecewise_constant(
                tf.train.get_global_step(), boundaries, staged_lr)

            loss = tf.reduce_mean(tower_losses, name='loss')

            examples_sec_hook = utils.ExamplesPerSecondHook(
                params.train_batch_size, every_n_steps=10)
            train_hooks = [examples_sec_hook]

            #optimizer = tf.train.MomentumOptimizer(
            #    learning_rate=learning_rate, momentum=momentum)
            optimizer = tf.train.AdamOptimizer()

            if params.sync:
                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer, replicas_to_aggregate=num_workers)
                sync_replicas_hook = optimizer.make_session_run_hook(
                    params.is_chief)
                train_hooks.append(sync_replicas_hook)

            # Create single grouped train op
            train_op = [
                optimizer.apply_gradients(
                    gradvars, global_step=tf.train.get_global_step())
            ]
            train_op.extend(update_ops)
            train_op = tf.group(*train_op)

            predictions = {
                'classes':
                tf.concat([p['classes'] for p in tower_preds], axis=0),
                'probabilities':
                tf.concat([p['probabilities'] for p in tower_preds], axis=0)
            }
            stacked_labels = tf.concat(labels, axis=0)
            metrics = {
                'accuracy':
                tf.metrics.accuracy(stacked_labels, predictions['classes'])
            }
            tensors_to_log = {
                'learning_rate': learning_rate,
                'loss': loss,
                'acc': metrics['accuracy'][0]
            }
            logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                                      every_n_iter=100)
            train_hooks.append(logging_hook)

        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=train_hooks,
                                          eval_metric_ops=metrics)
# Example #5
    def _linearregression_model_fn_sync(features, labels, mode, params):
        """Resnet model body.

    Support single host, one or more GPU training. Parameter distribution can
    be either one of the following scheme.
    1. CPU is the parameter server and manages gradient updates.
    2. Parameters are distributed evenly across all GPUs, and the first GPU
       manages gradient updates.

    Args:
      features: a list of tensors, one for each tower
      labels: a list of tensors, one for each tower
      mode: ModeKeys.TRAIN or EVAL
      params: Hyperparameters suitable for tuning
    Returns:
      A EstimatorSpec object.
    """
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        weight_decay = params.weight_decay

        features = features[0:num_gpus]
        labels = labels[0:num_gpus]
        tower_features = features
        tower_labels = labels
        tower_losses = []
        tower_gradvars = []
        tower_preds = []

        if num_gpus == 0:
            num_devices = 1
            device_type = 'cpu'
        else:
            num_devices = num_gpus
            device_type = 'gpu'

        for i in range(num_devices):
            worker_device = '/{}:{}'.format(device_type, i)
            if variable_strategy == 'CPU':
                device_setter = utils.local_device_setter(
                    worker_device=worker_device)
            elif variable_strategy == 'GPU':
                device_setter = utils.local_device_setter(
                    ps_device_type='gpu',
                    worker_device=worker_device,
                    ps_strategy=tf.contrib.training.
                    GreedyLoadBalancingStrategy(
                        num_gpus, tf.contrib.training.byte_size_load_fn))
            with tf.variable_scope('LinearRegression',
                                   reuse=bool(i != 0)) as var_scope:
                with tf.name_scope('tower_%d' % i) as name_scope:
                    with tf.device(device_setter):
                        loss, gradvars, preds = _tower_fn(
                            is_training, weight_decay, tower_features[i],
                            tower_labels[i], params.feature_dim,
                            var_scope.name, params.problem)
                        tower_losses.append(loss)
                        tower_gradvars.append(gradvars)
                        tower_preds.append(preds)

        # Now compute global loss and gradients.
        gradvars = []
        with tf.name_scope('gradient_averaging'):
            all_grads = {}
            for grad, var in itertools.chain(*tower_gradvars):
                if grad is not None:
                    all_grads.setdefault(var, []).append(grad)
            for var, grads in six.iteritems(all_grads):
                # Average gradients on the same device as the variables
                # to which they apply.
                with tf.device(var.device):
                    if len(grads) == 1:
                        avg_grad = grads[0]
                    else:
                        avg_grad = tf.multiply(tf.add_n(grads),
                                               1. / len(grads))
                gradvars.append((avg_grad, var))

        # Device that runs the ops to apply global gradient updates.
        consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
        with tf.device(consolidation_device):
            loss = tf.reduce_mean(tower_losses, name='loss')

            examples_sec_hook = utils.ExamplesPerSecondHook(
                params.train_batch_size, every_n_steps=100)

            tensors_to_log = {'loss': loss}

            logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                                      every_n_iter=100)

            train_hooks = [logging_hook, examples_sec_hook]

            # optimizer = tf.train.GradientDescentOptimizer(learning_rate=params.learning_rate)
            optimizer = tf.train.AdamOptimizer(
                learning_rate=params.learning_rate)

            if params.run_type == 'sync':
                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer, replicas_to_aggregate=num_workers)
                sync_replicas_hook = optimizer.make_session_run_hook(
                    params.is_chief)
                train_hooks.append(sync_replicas_hook)

            # Create single grouped train op
            train_op = [
                optimizer.apply_gradients(
                    gradvars, global_step=tf.train.get_global_step())
            ]

            train_op = tf.group(*train_op)

        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=train_hooks)
# Example #6
    def _linearregression_model_fn_local(features, labels, mode, params):
        """

    Args:
      features: a list of tensors, one for each tower
      labels: a list of tensors, one for each tower
      mode: ModeKeys.TRAIN or EVAL
      params: Hyperparameters suitable for tuning
    Returns:
      A EstimatorSpec object.
    """
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        weight_decay = params.weight_decay

        # features = features[0:num_gpus]
        # labels = labels[0:num_gpus]
        tower_features = features
        tower_labels = labels
        tower_losses = []
        tower_ops = []
        tower_preds = []
        var_scopes = []

        if num_gpus == 0:
            num_devices = 1
            device_type = 'cpu'
        else:
            num_devices = num_gpus
            device_type = 'gpu'

        for i in range(num_devices):
            worker_device = '/{}:{}'.format(device_type, i)
            if variable_strategy == 'CPU':
                device_setter = utils.local_device_setter(
                    worker_device=worker_device)
                # device_setter = tf.train.replica_device_setter(
                #     worker_device=worker_device)
            elif variable_strategy == 'GPU':
                device_setter = utils.local_device_setter(
                    ps_device_type='gpu',
                    worker_device=worker_device,
                    ps_strategy=tf.contrib.training.
                    GreedyLoadBalancingStrategy(
                        num_gpus, tf.contrib.training.byte_size_load_fn))
                # device_setter = tf.train.replica_device_setter(
                #     ps_device=worker_device,
                #     worker_device=worker_device
                # )
            with tf.variable_scope(
                    'LinearRegression_{}'.format(i)) as var_scope:
                with tf.name_scope('tower_%d' % i) as name_scope:
                    with tf.device(device_setter):
                        loss, gradvars, preds = _tower_fn(
                            is_training, weight_decay, tower_features[i],
                            tower_labels[i], params.feature_dim,
                            var_scope.name, params.problem)
                        var_scopes.append(var_scope.name)

                        tower_losses.append(loss)
                        # tower_gradvars.append(gradvars)
                        tower_preds.append(preds)

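                        # Each tower builds its own optimizer and applies its
                        # own gradients; cross-tower consistency is handled by
                        # the SyncHook added below (when run_type is 'multi').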
                        global_step = tf.cast(tf.train.get_global_step(),
                                              tf.float32)
                        lr = params.learning_rate
                        # optimizer = tf.train.GradientDescentOptimizer(learning_rate=params.learning_rate)
                        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
                        # optimizer = tf.train.MomentumOptimizer(learning_rate=params.learning_rate,momentum=0.97)

                        # Create single grouped train op
                        train_op = [
                            optimizer.apply_gradients(
                                gradvars,
                                global_step=tf.train.get_global_step(),
                                name='apply_gradient_tower_{}'.format(i))
                        ]
                        tower_ops.append(train_op)

        # Device that runs the ops to apply global gradient updates.
        consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
        with tf.device(consolidation_device):

            examples_sec_hook = utils.ExamplesPerSecondHook(
                params.train_batch_size * (1 + params.redundancy),
                every_n_steps=100)
            loss = tf.reduce_mean(tower_losses, name='loss')
            tensors_to_log = {'loss': loss}
            logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                                      every_n_iter=100)
            train_hooks = [logging_hook, examples_sec_hook]
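            # In 'multi' mode, periodically synchronize the per-tower variable
            # scopes; with params.adaptive the sync intervals are rescaled per
            # communication round instead of staying fixed at sync_step.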
            if params.run_type == 'multi':
                if params.adaptive:
                    alpha = 2 / (params.num_comm +
                                 1) * (params.train_steps /
                                       (params.num_comm * params.sync_step))
                    local_updates = [
                        params.sync_step * (1 + alpha * i)
                        for i in range(params.num_comm + 1)
                    ]
                    sync_hook = utils.SyncHook(scopes=var_scopes,
                                               every_n_steps=params.sync_step,
                                               adaptive=local_updates)
                else:
                    sync_hook = utils.SyncHook(scopes=var_scopes,
                                               every_n_steps=params.sync_step)
                train_hooks.append(sync_hook)

            train_ops = tf.group(*tower_ops)

        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_ops,
                                          training_hooks=train_hooks)