Example 1
def projection(inputs, channels, is_training, data_format):
    """1x1 projection (as in ResNet) followed by batch normalization and ReLU."""
    with tf.variable_scope('projection'):
        net = base_ops.conv_bn_relu(inputs, 1, channels, is_training,
                                    data_format)

    return net
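The base_ops.conv_bn_relu helper used throughout these examples is not shown. Below is a minimal sketch of what such a helper could look like with the TF 1.x layers API, assuming the usual convolution, batch normalization, ReLU ordering (an illustration, not the library's actual implementation):

import tensorflow as tf

# Hypothetical stand-in for base_ops.conv_bn_relu (an assumption, not the real code):
# a same-padded convolution, batch norm tied to the training flag, then ReLU.
def conv_bn_relu(inputs, kernel_size, filters, is_training, data_format):
    net = tf.layers.conv2d(inputs, filters, kernel_size, padding='same',
                           use_bias=False, data_format=data_format)
    bn_axis = 3 if data_format == 'channels_last' else 1
    net = tf.layers.batch_normalization(net, axis=bn_axis, training=is_training)
    return tf.nn.relu(net)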
Example 2
def build_model(features, spec, config, mode=tf.estimator.ModeKeys.TRAIN):
    """Builds the model from the input features."""
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    if config['data_format'] == 'channels_last':
        channel_axis = 3
    elif config['data_format'] == 'channels_first':
        # Currently this is not well supported
        channel_axis = 1
    else:
        raise ValueError('invalid data_format')

    # Store auxiliary activations at increasing depths of the network. The
    # first activation occurs immediately after the stem and the others
    # immediately follow each stack.
    aux_activations = []

    # Initial stem convolution
    with tf.variable_scope('stem'):
        net = base_ops.conv_bn_relu(features, 3, config['stem_filter_size'],
                                    is_training, config['data_format'])
        aux_activations.append(net)

    for stack_num in range(config['num_stacks']):
        channels = net.get_shape()[channel_axis].value

        # Downsample at start (except first)
        if stack_num > 0:
            net = tf.layers.max_pooling2d(inputs=net,
                                          pool_size=(2, 2),
                                          strides=(2, 2),
                                          padding='same',
                                          data_format=config['data_format'])

            # Double output channels each time we downsample
            channels *= 2

        with tf.variable_scope('stack{}'.format(stack_num)):
            for module_num in range(config['num_modules_per_stack']):
                with tf.variable_scope('module{}'.format(module_num)):
                    net = build_module(spec,
                                       inputs=net,
                                       channels=channels,
                                       is_training=is_training)
            aux_activations.append(net)

    # Global average pool
    if config['data_format'] == 'channels_last':
        net = tf.reduce_mean(net, [1, 2])
    elif config['data_format'] == 'channels_first':
        net = tf.reduce_mean(net, [2, 3])
    else:
        raise ValueError('invalid data_format')

    # Fully-connected layer to labels
    logits = tf.layers.dense(inputs=net, units=config['num_labels'])
    return logits
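build_model reads a handful of keys from the config dictionary it is given; the sketch below lists those keys with illustrative values (only the key names come from the code above, the concrete values are assumptions):

# Illustrative config for build_model; keys are taken from the code above,
# the values are assumptions for the sake of example.
config = {
    'data_format': 'channels_last',
    'stem_filter_size': 128,
    'num_stacks': 3,
    'num_modules_per_stack': 3,
    'num_labels': 10,
}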
Example 3
def nasbench_tensorflow_model_builder(model_spec,
                                      config,
                                      in_shape,
                                      is_training=True):
    if config["data_format"] == "channels_last":
        channel_axis = 3
    else:
        raise ValueError("only channels_last is supported")

    # setup inputs
    features = tf.placeholder(tf.float32, shape=in_shape, name="g_input")

    # build the stem
    with tf.variable_scope("stem"):
        net = base_ops.conv_bn_relu(features, 3, config["stem_filter_size"],
                                    is_training, config["data_format"])

    # Build stacks
    for stack_num in range(config["num_stacks"]):
        channels = net.get_shape()[channel_axis].value

        # Downsample at start (except first)
        if stack_num > 0:
            net = tf.layers.max_pooling2d(
                inputs=net,
                pool_size=(2, 2),
                strides=(2, 2),
                padding="same",
                data_format=config["data_format"],
            )

            # Double output channels each time we downsample
            channels *= 2

        with tf.variable_scope("stack{}".format(stack_num)):
            for module_num in range(config["num_modules_per_stack"]):
                with tf.variable_scope("module{}".format(module_num)):
                    net = model_builder.build_module(
                        model_spec,
                        inputs=net,
                        channels=channels,
                        is_training=is_training,
                    )

    # Global average pool
    if config["data_format"] == "channels_last":
        net = tf.reduce_mean(net, [1, 2])
    elif config["data_format"] == "channels_first":
        net = tf.reduce_mean(net, [2, 3])
    else:
        raise ValueError("invalid data_format")

    # Fully-connected layer to labels
    logits = tf.layers.dense(inputs=net, units=config["num_labels"])

    return features, logits
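Because the channel count is re-read from the tensor at the start of each stack and doubled after every downsample, the per-stack widths grow geometrically from the stem width. A small self-contained sketch of that progression (stem width and stack count are assumptions matching the config sketch after Example 2):

# Illustrative only: per-stack channel counts implied by the doubling logic above.
stem_filter_size = 128  # assumption
num_stacks = 3          # assumption

channels = stem_filter_size
for stack_num in range(num_stacks):
    if stack_num > 0:
        channels *= 2   # doubled after each 2x2 max-pool downsample
    print('stack %d uses %d channels' % (stack_num, channels))
# -> stack 0 uses 128, stack 1 uses 256, stack 2 uses 512 channels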
Example 4
    def model_fn(features, labels, mode, params):
        """Builds the model from the input features."""
        del params  # Unused
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        # Store auxiliary activations at increasing depths of the network. The
        # first activation occurs immediately after the stem and the others
        # immediately follow each stack.
        aux_activations = []

        # Initial stem convolution
        with tf.variable_scope('stem'):
            net = base_ops.conv_bn_relu(features, 3,
                                        config['stem_filter_size'],
                                        is_training, config['data_format'])
            aux_activations.append(net)

        for stack_num in range(config['num_stacks']):
            channels = net.get_shape()[channel_axis].value

            # Downsample at start (except first)
            if stack_num > 0:
                net = tf.layers.max_pooling2d(
                    inputs=net,
                    pool_size=(2, 2),
                    strides=(2, 2),
                    padding='same',
                    data_format=config['data_format'])

                # Double output channels each time we downsample
                channels *= 2

            with tf.variable_scope('stack{}'.format(stack_num)):
                for module_num in range(config['num_modules_per_stack']):
                    with tf.variable_scope('module{}'.format(module_num)):
                        net = build_module(spec,
                                           inputs=net,
                                           channels=channels,
                                           is_training=is_training)
                aux_activations.append(net)

        # Global average pool
        if config['data_format'] == 'channels_last':
            net = tf.reduce_mean(net, [1, 2])
        elif config['data_format'] == 'channels_first':
            net = tf.reduce_mean(net, [2, 3])
        else:
            raise ValueError('invalid data_format')

        # Fully-connected layer to labels
        logits = tf.layers.dense(inputs=net, units=config['num_labels'])

        if mode == tf.estimator.ModeKeys.PREDICT and not config['use_tpu']:
            # It is a known limitation of Estimator that the labels
            # are not passed during PREDICT mode when running on CPU/GPU
            # (https://github.com/tensorflow/tensorflow/issues/17824), thus we cannot
            # compute the loss or anything dependent on it (i.e., the gradients).
            loss = tf.constant(0.0)
        else:
            if config['use_KD']:
                imitation_lmb = config['imitation_lmb']
                temperature = config['temperature']
                loss_soft = tf.keras.losses.KLD(
                    tf.math.log_softmax(logits / temperature),
                    tf.math.softmax(labels[:, 1:] / temperature))
                loss_soft = tf.math.reduce_mean(loss_soft)
                loss_soft *= (temperature**2.0)

                loss_ce = tf.losses.softmax_cross_entropy(
                    onehot_labels=tf.one_hot(
                        tf.dtypes.cast(labels[:, 0], tf.int32),
                        config['num_labels']),
                    logits=logits)

                loss = ((1.0 - imitation_lmb) * loss_ce +
                        imitation_lmb * loss_soft)

            else:
                loss = tf.losses.softmax_cross_entropy(
                    onehot_labels=tf.one_hot(tf.dtypes.cast(labels, tf.int32),
                                             config['num_labels']),
                    logits=logits)

            loss += config['weight_decay'] * tf.add_n(
                [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

        # Use inference mode to compute some useful metrics on a fixed sample
        # Due to the batch being sharded on TPU, these metrics should be run on CPU
        # only to ensure that the metrics are computed on the whole batch. We add a
        # leading dimension because PREDICT expects batch-shaped tensors.
        if mode == tf.estimator.ModeKeys.PREDICT:
            parameter_norms = {
                'param:' + tensor.name: tf.expand_dims(tf.norm(tensor, ord=2),
                                                       0)
                for tensor in tf.trainable_variables()
            }

            # Compute gradients of all parameters and the input simultaneously
            all_params_names = []
            all_params_tensors = []
            for tensor in tf.trainable_variables():
                all_params_names.append('param_grad_norm:' + tensor.name)
                all_params_tensors.append(tensor)
            all_params_names.append('input_grad_norm')
            all_params_tensors.append(features)

            grads = tf.gradients(loss, all_params_tensors)

            param_gradient_norms = {}
            for name, grad in list(zip(all_params_names, grads))[:-1]:
                if grad is not None:
                    param_gradient_norms[name] = (tf.expand_dims(
                        tf.norm(grad, ord=2), 0))
                else:
                    param_gradient_norms[name] = (tf.expand_dims(
                        tf.constant(0.0), 0))

            if grads[-1] is not None:
                input_grad_norm = tf.sqrt(
                    tf.reduce_sum(tf.square(grads[-1]), axis=[1, 2, 3]))
            else:
                input_grad_norm = tf.expand_dims(tf.constant(0.0), 0)

            covariance_matrices = {
                'cov_matrix_%d' % i: tf.expand_dims(_covariance_matrix(aux), 0)
                for i, aux in enumerate(aux_activations)
            }

            predictions = {
                'logits': logits,
                'loss': tf.expand_dims(loss, 0),
                'input_grad_norm': input_grad_norm,
            }
            predictions.update(parameter_norms)
            predictions.update(param_gradient_norms)
            predictions.update(covariance_matrices)

            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   predictions=predictions)

        if mode == tf.estimator.ModeKeys.TRAIN:
            global_step = tf.train.get_or_create_global_step()
            base_lr = config['learning_rate']
            if config['use_tpu']:
                base_lr *= config['tpu_num_shards']

            if config['lr_decay_method'] == 'COSINE_BY_STEP':
                total_steps = int(config['train_epochs'] * num_train_images /
                                  config['batch_size'])
                progress_fraction = tf.cast(global_step,
                                            tf.float32) / total_steps
                learning_rate = (0.5 * base_lr *
                                 (1 + tf.cos(np.pi * progress_fraction)))

            elif config['lr_decay_method'] == 'COSINE_BY_TIME':
                # Requires training_time.limit hooks to be added to Estimator
                elapsed_time = tf.cast(training_time.get_total_time(),
                                       dtype=tf.float32)
                progress_fraction = elapsed_time / config['train_seconds']
                learning_rate = (0.5 * base_lr *
                                 (1 + tf.cos(np.pi * progress_fraction)))

            elif config['lr_decay_method'] == 'STEPWISE':
                # Drop the LR at 1/2, 2/3, and 5/6 of the total training steps
                total_steps = (config['train_epochs'] * num_train_images /
                               config['batch_size'])
                boundaries = [
                    int(0.5 * total_steps),
                    int(0.667 * total_steps),
                    int(0.833 * total_steps)
                ]
                values = [
                    1.0 * base_lr, 0.1 * base_lr, 0.01 * base_lr,
                    0.0001 * base_lr
                ]
                learning_rate = tf.train.piecewise_constant(
                    global_step, boundaries, values)

            else:
                raise ValueError('invalid lr_decay_method')

            # Set LR to 0 for step 0 to initialize the weights without training
            learning_rate = tf.where(tf.equal(global_step, 0), 0.0,
                                     learning_rate)

            if "optimizer" in config and config["optimizer"] == 'Adam':
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=learning_rate,
                    momentum=config['momentum'],
                )
                #epsilon=1.0)
            elif "optimizer" in config and config["optimizer"] == 'SGD':
                optimizer = tf.train.GradientDescentOptimizer(
                    learning_rate=learning_rate)
            elif "optimizer" in config and config["optimizer"] == 'Momentum':
                optimizer = tf.train.MomentumOptimizer(
                    learning_rate=learning_rate,
                    momentum=config['momentum'],
                )
            else:
                optimizer = tf.train.RMSPropOptimizer(
                    learning_rate=learning_rate,
                    momentum=config['momentum'],
                    epsilon=1.0)

            if config['use_tpu']:
                optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

            # Update ops required for batch norm moving variables
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(loss, global_step)

            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   loss=loss,
                                                   train_op=train_op)

        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(labels, logits):
                predictions = tf.argmax(logits, axis=1)
                if config['use_KD']:
                    accuracy = tf.metrics.accuracy(
                        tf.dtypes.cast(labels[:, 0], tf.int32), predictions)
                else:
                    accuracy = tf.metrics.accuracy(labels, predictions)

                return {'accuracy': accuracy}

            eval_metrics = (metric_fn, [labels, logits])

            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   loss=loss,
                                                   eval_metrics=eval_metrics)
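The COSINE_BY_STEP branch above anneals the learning rate as 0.5 * base_lr * (1 + cos(pi * progress_fraction)). A plain-NumPy sketch of that schedule, for illustration only (step counts and base rate are made up):

import numpy as np

# Cosine-by-step schedule as used above, expressed outside the TF graph.
def cosine_lr(step, total_steps, base_lr):
    progress = step / float(total_steps)
    return 0.5 * base_lr * (1.0 + np.cos(np.pi * progress))

cosine_lr(0, 1000, 0.1)     # 0.1 at step 0 (the code above then forces step 0 to LR 0)
cosine_lr(500, 1000, 0.1)   # 0.05 halfway through training
cosine_lr(1000, 1000, 0.1)  # ~0.0 at the final step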