def get_model_fn(features, labels, mode, params):
    """Model body of the TargetedLearning framework.

    Args:
        features: a list of tensors
        labels: a list of tensors
        mode: ModeKeys.TRAIN or EVAL
        params: Hyperparameters suitable for tuning
    Returns:
        An EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    weight_decay = params.weight_decay
    out_lr = 0.1  # params.learning_rate

    train_features = features[0]
    train_labels = labels[0]
    if is_training:
        val_features = features[1]
        val_labels = labels[1]
    else:
        val_features = features[0]
        val_labels = labels[0]

    # channels first (NCHW) is normally optimal on GPU and channels last
    # (NHWC) on CPU. The exception is Intel MKL on CPU which is optimal with
    # channels_last.
    num_gpus = len(utils.get_available_gpus())
    data_format = params.data_format
    if not data_format:
        if num_gpus == 0:
            data_format = 'channels_last'
        else:
            data_format = 'channels_first'

    train_op = []

    # Build the base model.
    with tf.compat.v1.variable_scope('base_model') as var_scope:
        if params.dataset == 'mnist':
            base_model = model.BilevelLenet(num_class=params.num_class)
        else:
            base_model = model.BilevelResNet(resnet_size=params.num_layers,
                                             num_classes=params.num_class,
                                             resnet_version=params.version)
        base_model_logits = base_model(train_features, is_training)
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS, var_scope.name)
        extra_update_ops = base_model.get_updates_for(train_features)
        update_ops.extend(extra_update_ops)

        # Get the params of the model.
        base_model_params = tf.compat.v1.trainable_variables(
            scope=var_scope.name)

        # Set initial weights.
        class_init = np.array([[1.0 / params.num_class]
                               for _ in range(params.num_class)
                               ]).astype(np.float32)
        class_weights = tf.compat.v1.get_variable('class_weight',
                                                  initializer=class_init)
        weight = tf.matmul(
            tf.cast(
                tf.one_hot(train_labels, len(class_init), on_value=1,
                           off_value=0), tf.float32), class_weights)

        # Get the loss of the main model.
        base_model_loss, base_model_preds = _loss_fn(
            base_model_logits,
            tf.one_hot(train_labels, params.num_class, on_value=1,
                       off_value=0))
        base_model_loss_reduced = tf.reduce_mean(
            tf.squeeze(weight) * base_model_loss) + weight_decay * tf.add_n(
                [tf.nn.l2_loss(v) for v in base_model_params])

    # Define the outer model's logits, which is the bilevel model.
    with tf.compat.v1.variable_scope(
            'bilevel_model', reuse=tf.compat.v1.AUTO_REUSE) as var_scope1:
        base_model.perturb_model_weights(base_model_loss_reduced,
                                         params.learning_rate,
                                         var_scope.name)
        target_logits = base_model(val_features, False)
        target_params = tf.compat.v1.trainable_variables(
            scope=var_scope1.name)
        target_loss, target_preds = _loss_fn(
            target_logits,
            tf.one_hot(val_labels, params.num_class, on_value=1, off_value=0))
        target_loss = tf.reduce_mean(target_loss) + weight_decay * tf.add_n(
            [tf.nn.l2_loss(v) for v in target_params])

    # Calculate the gradient with respect to the class weights and
    # normalize it.
    class_weight_gradient = tf.gradients(target_loss, class_weights)
    update_class_weights = tf.clip_by_value(
        class_weights - out_lr * class_weight_gradient[0],
        clip_value_min=0.0,
        clip_value_max=100.0)
    sum_class_weights = tf.reduce_sum(update_class_weights) + 2e-12
    update_class_weights /= sum_class_weights

    # Update the class weights every n steps.
    weight_update_hook = utils.WeightUpdateHook1(
        class_weights,
        update_class_weights,
        every_n_steps=10,
        log_every_n_step=params.log_freq)

    # Calculate the base model grads.
    base_model_grads = tf.gradients(base_model_loss_reduced,
                                    base_model_params)
    base_model_gradvars = zip(base_model_grads, base_model_params)

    boundaries = [
        params.num_batches_per_epoch * x
        for x in np.array([91, 136, 182], dtype=np.int64)
    ]
    staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.001]]
    learning_rate = tf.compat.v1.train.piecewise_constant(
        tf.compat.v1.train.get_global_step(), boundaries, staged_lr)

    # Define the optimizer.
    optimizer = tf.compat.v1.train.MomentumOptimizer(
        learning_rate=learning_rate, momentum=params.momentum)
    # optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)

    train_op.append(
        optimizer.apply_gradients(
            base_model_gradvars,
            global_step=tf.compat.v1.train.get_global_step()))

    # Calculate metrics.
    target_accuracy = tf.compat.v1.metrics.accuracy(val_labels,
                                                    target_preds['classes'])
    accuracy = tf.compat.v1.metrics.accuracy(train_labels,
                                             base_model_preds['classes'])

    # The following metrics are for the binary classification scenario.
    # They should be adapted for multiclass classification tasks.
    if params.num_class == 2:
        train_labels_mask = tf.cast(train_labels, tf.bool)
        inverse_train_labels_mask = tf.cast(
            tf.math.logical_not(train_labels_mask), tf.float32)
        inverse_prediction_mask = tf.cast(
            tf.math.logical_not(tf.cast(base_model_preds['classes'],
                                        tf.bool)), tf.float32)
        recall_minor = tf.compat.v1.metrics.recall(inverse_train_labels_mask,
                                                   inverse_prediction_mask)
        recall_major = tf.compat.v1.metrics.recall(
            train_labels, base_model_preds['classes'])
        precision_minor = tf.compat.v1.metrics.precision(
            inverse_train_labels_mask, inverse_prediction_mask)
        metrics = {
            'obj/accuracy': accuracy,
            'metrics/recall_minor': recall_minor,
            'metrics/recall_major': recall_major,
            'metrics/precision_minor': precision_minor
        }
    else:
        metrics = {'obj/accuracy': accuracy}

    examples_sec_hook = utils.ExamplesPerSecondHook(
        params.train_batch_size, every_n_steps=params.log_freq)
    tensors_to_log = {
        'Target loss': target_loss,
        'Main loss': base_model_loss_reduced,
        'Target accuracy': target_accuracy[1],
        'Main accuracy': accuracy[1],
        'learning_rates': learning_rate,
        'step': tf.compat.v1.train.get_global_step()
    }
    logging_hook = tf.estimator.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=params.log_freq)
    train_hooks = [weight_update_hook, logging_hook, examples_sec_hook]

    train_op.extend(update_ops)
    train_op = tf.group(*train_op)

    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=target_preds,
                                      loss=base_model_loss_reduced,
                                      train_op=train_op,
                                      training_hooks=train_hooks,
                                      eval_metric_ops=metrics)
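# Usage sketch (an assumption, not part of the original training pipeline):
# get_model_fn follows the standard tf.estimator model_fn contract, so it can
# be wired into an Estimator directly. All hyperparameter values, the
# model_dir path and make_train_input_fn below are hypothetical placeholders;
# `params` only needs to be an attribute-style object, since the model_fn
# reads fields such as params.weight_decay.
def _example_build_targeted_learning_estimator():
    """Minimal, hypothetical wiring of get_model_fn into an Estimator."""
    import types
    hparams = types.SimpleNamespace(
        weight_decay=1e-4, learning_rate=0.1, momentum=0.9,
        dataset='mnist', num_class=10, num_layers=32, version=2,
        data_format=None, num_batches_per_epoch=500,
        train_batch_size=128, log_freq=100)
    estimator = tf.estimator.Estimator(model_fn=get_model_fn,
                                       model_dir='/tmp/targeted_learning',
                                       params=hparams)
    # make_train_input_fn (hypothetical) must yield ([train_batch, val_batch],
    # [train_labels, val_labels]) so that features[0]/features[1] and
    # labels[0]/labels[1] index as the model_fn expects, e.g.:
    # estimator.train(input_fn=make_train_input_fn(), max_steps=10000)
    return estimator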
def model_fn(features, labels, mode):
    """Inception_Resnet_V2 model body.

    Supports single-host training with one or more GPUs. Parameter
    distribution can follow either of the following schemes:
    1. The CPU is the parameter server and manages gradient updates.
    2. Parameters are distributed evenly across all GPUs, and the first GPU
       manages gradient updates.

    Args:
        features: a list of tensors, one for each tower
        labels: a list of tensors, one for each tower
        mode: ModeKeys.TRAIN or EVAL
    Returns:
        An EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    tower_features = features
    tower_labels = labels
    tower_losses = []
    tower_gradvars = []
    tower_preds = []

    # channels first (NCHW) is normally optimal on GPU and channels last
    # (NHWC) on CPU. The exception is Intel MKL on CPU which is optimal with
    # channels_last.
    data_format = None
    if not data_format:
        if GPU_COUNT == 0:
            data_format = 'channels_last'
        else:
            data_format = 'channels_first'

    if GPU_COUNT == 0:
        num_devices = 1
        device_type = 'cpu'
    else:
        num_devices = GPU_COUNT
        device_type = 'gpu'

    for i in range(num_devices):
        worker_device = '/{}:{}'.format(device_type, i)
        if VARIABLE_STRATEGY == 'CPU':
            device_setter = utils.local_device_setter(
                worker_device=worker_device)
        elif VARIABLE_STRATEGY == 'GPU':
            device_setter = utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    GPU_COUNT, tf.contrib.training.byte_size_load_fn))
        with tf.variable_scope('', reuse=bool(i != 0)):
            with tf.name_scope('tower_%d' % i) as name_scope:
                with tf.device(device_setter):
                    loss, gradvars, preds = tower_fn(
                        is_training, tower_features[i],
                        tower_labels and tower_labels[i], num_classes)
                    tower_losses.append(loss)
                    tower_gradvars.append(gradvars)
                    tower_preds.append(preds)
                    if i == 0:
                        # Only trigger batch_norm moving mean and variance
                        # update from the 1st tower. Ideally, we should grab
                        # the updates from all towers but these stats
                        # accumulate extremely fast so we can ignore the
                        # other stats from the other towers without
                        # significant detriment.
                        update_ops = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS, name_scope)

    if mode == 'train' or mode == 'eval':
        # Now compute global loss and gradients.
        gradvars = []
        with tf.name_scope('gradient_ing'):
            all_grads = {}
            for grad, var in itertools.chain(*tower_gradvars):
                if grad is not None:
                    all_grads.setdefault(var, []).append(grad)
            for var, grads in six.iteritems(all_grads):
                # Average gradients on the same device as the variables
                # to which they apply.
                with tf.device(var.device):
                    if len(grads) == 1:
                        avg_grad = grads[0]
                    else:
                        avg_grad = tf.multiply(tf.add_n(grads),
                                               1. / len(grads))
                gradvars.append((avg_grad, var))

        # Device that runs the ops to apply global gradient updates.
        consolidation_device = ('/gpu:0' if VARIABLE_STRATEGY == 'GPU'
                                else '/cpu:0')
        with tf.device(consolidation_device):
            loss = tf.reduce_mean(tower_losses, name='loss')
            examples_sec_hook = utils.ExamplesPerSecondHook(
                BATCH_SIZE, every_n_steps=10)
            global_step = tf.train.get_global_step()
            learning_rate = tf.constant(LEARNING_RATE)
            tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}
            logging_hook = tf.train.LoggingTensorHook(
                tensors=tensors_to_log, every_n_iter=100)
            initializer_hook = utils.IteratorInitializerHook()
            train_hooks = [initializer_hook, logging_hook, examples_sec_hook]
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=LEARNING_RATE, momentum=MOMENTUM)

            # Create single grouped train op.
            train_op = [
                optimizer.apply_gradients(gradvars, global_step=global_step)
            ]
            train_op.extend(update_ops)
            train_op = tf.group(*train_op)

            predictions = {
                'classes':
                    tf.concat([p['classes'] for p in tower_preds], axis=0),
                'probabilities':
                    tf.concat([p['probabilities'] for p in tower_preds],
                              axis=0)
            }
            stacked_labels = tf.concat(labels, axis=0)
            metrics = {
                'accuracy':
                    tf.metrics.accuracy(stacked_labels,
                                        predictions['classes'])
            }

        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=train_hooks,
                                          eval_metric_ops=metrics)
    else:
        predictions = {
            'classes':
                tf.concat([p['classes'] for p in tower_preds], axis=0),
            'probabilities':
                tf.concat([p['probabilities'] for p in tower_preds], axis=0),
            'features':
                tf.concat([feature for feature in features], axis=0)
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
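# Illustration (toy sketch, not part of the original pipeline): the
# 'gradient_ing' scope above is the usual in-graph replication pattern of
# grouping each tower's (grad, var) pairs by variable and averaging them.
# The same bookkeeping with plain NumPy arrays and made-up tower gradients:
def _example_average_tower_gradients():
    """Averages per-variable gradients collected from two fake towers."""
    tower_gradvars = [
        [(np.array([1.0, 2.0]), 'w'), (np.array([0.5]), 'b')],  # tower 0
        [(np.array([3.0, 4.0]), 'w'), (np.array([1.5]), 'b')],  # tower 1
    ]
    all_grads = {}
    for grad, var in itertools.chain(*tower_gradvars):
        if grad is not None:
            all_grads.setdefault(var, []).append(grad)
    gradvars = [(np.add.reduce(grads) / len(grads), var)
                for var, grads in all_grads.items()]
    # gradvars == [(array([2., 3.]), 'w'), (array([1.]), 'b')]
    return gradvars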
def _hg_model_fn(features, labels, mode, params):
    """HG model body.

    Supports single-host training with one or more GPUs. Parameter
    distribution can follow either of the following schemes:
    1. The CPU is the parameter server and manages gradient updates.
    2. Parameters are distributed evenly across all GPUs, and the first GPU
       manages gradient updates.

    Args:
        features: a list of tensors, one for each tower
        labels: a list of tensors, one for each tower
        mode: ModeKeys.TRAIN or EVAL
        params: Hyperparameters suitable for tuning
    Returns:
        An EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    weight_decay = params.weight_decay
    momentum = params.momentum
    decay_factor = params.decay_factor
    decay_step = params.decay_step
    init_learning_rate = params.init_learning_rate
    num_stacks = params.num_stacks
    num_joints = params.num_joints

    tower_features = features
    if mode == tf.estimator.ModeKeys.PREDICT:
        if num_gpus < 1:
            tower_labels = [None]
        else:
            tower_labels = [None for i in range(num_gpus)]
    else:
        tower_labels = labels

    tower_losses = []
    tower_gradvars = []
    tower_preds = []

    # channels first (NCHW) is normally optimal on GPU and channels last
    # (NHWC) on CPU. The exception is Intel MKL on CPU which is optimal with
    # channels_last.
    data_format = params.data_format
    if not data_format:
        if num_gpus == 0:
            data_format = 'channels_last'
        else:
            data_format = 'channels_first'

    if num_gpus == 0:
        num_devices = 1
        device_type = 'cpu'
    else:
        num_devices = num_gpus
        device_type = 'gpu'

    for i in range(num_devices):
        worker_device = '/{}:{}'.format(device_type, i)
        if variable_strategy == 'CPU':
            device_setter = utils.local_device_setter(
                worker_device=worker_device)
        elif variable_strategy == 'GPU':
            device_setter = utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_gpus, tf.contrib.training.byte_size_load_fn))
        if mode == tf.estimator.ModeKeys.TRAIN:
            batch_size = params.train_batch_size / num_devices
        else:
            batch_size = params.eval_batch_size / num_devices
        with tf.variable_scope('hg', reuse=bool(i != 0)):
            with tf.name_scope('tower_%d' % i) as name_scope:
                with tf.device(device_setter):
                    loss, gradvars, preds = _tower_fn(
                        mode, weight_decay, tower_features[i][0],
                        tower_labels[i], data_format,
                        params.batch_norm_decay, params.batch_norm_epsilon,
                        params.num_stacks, params.num_out, params.n_low,
                        params.num_joints, batch_size, params.seq_length)
                    tower_losses.append(loss)
                    tower_gradvars.append(gradvars)
                    tower_preds.append(preds)
                    if i == 0:
                        # Only trigger batch_norm moving mean and variance
                        # update from the 1st tower. Ideally, we should grab
                        # the updates from all towers but these stats
                        # accumulate extremely fast so we can ignore the
                        # other stats from the other towers without
                        # significant detriment.
                        update_ops = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS, name_scope)

    if (mode == tf.estimator.ModeKeys.TRAIN or
            mode == tf.estimator.ModeKeys.EVAL):
        # Now compute global loss and gradients.
        gradvars = []
        with tf.name_scope('gradient_averaging'):
            all_grads = {}
            for grad, var in itertools.chain(*tower_gradvars):
                if grad is not None:
                    all_grads.setdefault(var, []).append(grad)
            for var, grads in six.iteritems(all_grads):
                # Average gradients on the same device as the variables
                # to which they apply.
                with tf.device(var.device):
                    if len(grads) == 1:
                        avg_grad = grads[0]
                    else:
                        avg_grad = tf.multiply(tf.add_n(grads),
                                               1. / len(grads))
                gradvars.append((avg_grad, var))

        # Device that runs the ops to apply global gradient updates.
        consolidation_device = ('/gpu:0' if variable_strategy == 'GPU'
                                else '/cpu:0')
        with tf.device(consolidation_device):
            learning_rate = tf.train.exponential_decay(
                init_learning_rate,
                tf.train.get_global_step(),
                decay_step,
                decay_factor,
                staircase=True,
                name='learning_rate')
            loss = tf.reduce_mean(tower_losses, name='loss')
            examples_sec_hook = utils.ExamplesPerSecondHook(
                params.train_batch_size, every_n_steps=10)
            tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}
            logging_hook = tf.train.LoggingTensorHook(
                tensors=tensors_to_log, every_n_iter=100)
            train_hooks = [logging_hook, examples_sec_hook]

            optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
            if params.sync:
                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer, replicas_to_aggregate=num_workers)
                sync_replicas_hook = optimizer.make_session_run_hook(
                    params.is_chief)
                train_hooks.append(sync_replicas_hook)

            # Create single grouped train op.
            train_op = [
                optimizer.apply_gradients(
                    gradvars, global_step=tf.train.get_global_step())
            ]
            train_op.extend(update_ops)
            train_op = tf.group(*train_op)

            predictions = {
                'heatmaps':
                    tf.concat([p['heatmaps'] for p in tower_preds], axis=0),
                'images':
                    tf.concat([i for i in tower_features], axis=0)
            }
            if mode == tf.estimator.ModeKeys.EVAL:
                hm = predictions['heatmaps']
                stacked_labels = tf.concat(labels[0][0][0], axis=0)
                gt_labels = tf.transpose(stacked_labels, [1, 0, 3, 4, 2])
                joint_accur = []
                for j in range(params.seq_length):
                    for i in range(params.num_joints):
                        joint_accur.append(
                            _pck_hm(hm[j, :, -1, :, :, i],
                                    gt_labels[j, :, :, :, i],
                                    params.eval_batch_size / num_devices))
                accuracy = tf.stack(joint_accur)
                metrics = {'Mean Pixel Error': tf.metrics.mean(accuracy)}
                tf.logging.info('Accuracy op computed')
            else:
                metrics = None
    else:
        train_op = None
        loss = None
        train_hooks = None
        metrics = None
        predictions = {
            'heatmaps':
                tf.concat([p['heatmaps'] for p in tower_preds], axis=0),
            'images':
                tf.concat([i for i in tower_features], axis=0)
        }

    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      loss=loss,
                                      train_op=train_op,
                                      training_hooks=train_hooks,
                                      eval_metric_ops=metrics)
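# Sanity-check sketch (standalone, not part of the original pipeline):
# tf.train.exponential_decay with staircase=True drops the rate by
# decay_factor once every decay_step steps, i.e.
# lr = init_lr * decay_factor ** (step // decay_step).
# The hyperparameter values below are illustrative only.
def _example_staircase_decay(step, init_lr=2.5e-4, decay_step=10000,
                             decay_factor=0.96):
    """Returns init_lr * decay_factor ** floor(step / decay_step)."""
    return init_lr * decay_factor ** (step // decay_step)

# _example_staircase_decay(0)      -> 2.5e-4 (first interval)
# _example_staircase_decay(9999)   -> 2.5e-4 (still before the first drop)
# _example_staircase_decay(10000)  -> 2.4e-4 (one decay applied)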
def _model_fn(features, labels, mode, params):
    """Resnet model body.

    Supports single-host training with one or more GPUs. Parameter
    distribution can follow either of the following schemes:
    1. The CPU is the parameter server and manages gradient updates.
    2. Parameters are distributed evenly across all GPUs, and the first GPU
       manages gradient updates.

    Args:
        features: a list of tensors, one for each tower
        labels: a list of tensors, one for each tower
        mode: ModeKeys.TRAIN or EVAL
        params: Hyperparameters suitable for tuning
    Returns:
        An EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    weight_decay = params.weight_decay
    momentum = params.momentum

    tower_features = features
    tower_labels = labels
    tower_losses = []
    tower_gradvars = []
    tower_preds = []

    # channels first (NCHW) is normally optimal on GPU and channels last
    # (NHWC) on CPU. The exception is Intel MKL on CPU which is optimal with
    # channels_last.
    data_format = params.data_format
    if not data_format:
        if num_gpus == 0:
            data_format = 'channels_last'
        else:
            data_format = 'channels_first'

    if num_gpus == 0:
        num_devices = 1
        device_type = 'cpu'
    else:
        num_devices = num_gpus
        device_type = 'gpu'

    for i in range(num_devices):
        worker_device = '/{}:{}'.format(device_type, i)
        if variable_strategy == 'CPU':
            device_setter = utils.local_device_setter(
                worker_device=worker_device)
        elif variable_strategy == 'GPU':
            device_setter = utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_gpus, tf.contrib.training.byte_size_load_fn))
        with tf.variable_scope(params.model_name, reuse=bool(i != 0)):
            with tf.name_scope('tower_%d' % i) as name_scope:
                with tf.device(device_setter):
                    loss, gradvars, preds = _tower_fn(
                        is_training, params.dp_keep_prob, weight_decay,
                        tower_features[i], tower_labels[i], data_format,
                        params.num_layers, params.batch_norm_decay,
                        params.batch_norm_epsilon, params)
                    tower_losses.append(loss)
                    tower_gradvars.append(gradvars)
                    tower_preds.append(preds)
                    if i == 0:
                        # Only trigger batch_norm moving mean and variance
                        # update from the 1st tower. Ideally, we should grab
                        # the updates from all towers but these stats
                        # accumulate extremely fast so we can ignore the
                        # other stats from the other towers without
                        # significant detriment.
                        update_ops = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS, name_scope)

    # Now compute global loss and gradients.
    gradvars = []
    with tf.name_scope('gradient_averaging'):
        all_grads = {}
        for grad, var in itertools.chain(*tower_gradvars):
            if grad is not None:
                all_grads.setdefault(var, []).append(grad)
        for var, grads in six.iteritems(all_grads):
            # Average gradients on the same device as the variables
            # to which they apply.
            with tf.device(var.device):
                if len(grads) == 1:
                    avg_grad = grads[0]
                else:
                    avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
            gradvars.append((avg_grad, var))

    # Device that runs the ops to apply global gradient updates.
    consolidation_device = ('/gpu:0' if variable_strategy == 'GPU'
                            else '/cpu:0')
    with tf.device(consolidation_device):
        # Suggested learning rate scheduling from
        # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
        num_batches_per_epoch = imagenet.ImageNetDataSet.num_examples_per_epoch(
            'train') // (params.train_batch_size * num_workers)
        boundaries = [
            num_batches_per_epoch * x
            for x in np.array([30, 60, 90], dtype=np.int64)
        ]
        staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]]
        learning_rate = tf.train.piecewise_constant(
            tf.train.get_global_step(), boundaries, staged_lr)

        loss = tf.reduce_mean(tower_losses, name='loss')

        examples_sec_hook = utils.ExamplesPerSecondHook(
            params.train_batch_size, every_n_steps=10)
        # train_hooks must exist before the optional sync-replicas hook is
        # appended below.
        train_hooks = [examples_sec_hook]

        # optimizer = tf.train.MomentumOptimizer(
        #     learning_rate=learning_rate, momentum=momentum)
        optimizer = tf.train.AdamOptimizer()

        if params.sync:
            optimizer = tf.train.SyncReplicasOptimizer(
                optimizer, replicas_to_aggregate=num_workers)
            sync_replicas_hook = optimizer.make_session_run_hook(
                params.is_chief)
            train_hooks.append(sync_replicas_hook)

        # Create single grouped train op.
        train_op = [
            optimizer.apply_gradients(
                gradvars, global_step=tf.train.get_global_step())
        ]
        train_op.extend(update_ops)
        train_op = tf.group(*train_op)

        predictions = {
            'classes':
                tf.concat([p['classes'] for p in tower_preds], axis=0),
            'probabilities':
                tf.concat([p['probabilities'] for p in tower_preds], axis=0)
        }
        stacked_labels = tf.concat(labels, axis=0)
        metrics = {
            'accuracy':
                tf.metrics.accuracy(stacked_labels, predictions['classes'])
        }
        tensors_to_log = {
            'learning_rate': learning_rate,
            'loss': loss,
            'acc': metrics['accuracy'][0]
        }
        logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                                  every_n_iter=100)
        train_hooks.append(logging_hook)

    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      loss=loss,
                                      train_op=train_op,
                                      training_hooks=train_hooks,
                                      eval_metric_ops=metrics)
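# Plain-Python sketch (illustrative, not part of the original pipeline) of
# the piecewise-constant schedule built above; base_lr and
# num_batches_per_epoch are made-up values. tf.train.piecewise_constant
# keeps the earlier value while global_step <= boundary, which bisect_left
# reproduces.
def _example_piecewise_lr(global_step, base_lr=0.1,
                          num_batches_per_epoch=1000):
    """Staged learning rate for the 30/60/90 epoch boundaries."""
    import bisect
    boundaries = [num_batches_per_epoch * x for x in (30, 60, 90)]
    staged_lr = [base_lr * x for x in [1, 0.1, 0.01, 0.002]]
    return staged_lr[bisect.bisect_left(boundaries, global_step)]

# _example_piecewise_lr(0)      -> 0.1
# _example_piecewise_lr(30000)  -> 0.1  (boundary itself keeps the old rate)
# _example_piecewise_lr(30001)  -> 0.01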
def _linearregression_model_fn_sync(features, labels, mode, params):
    """Linear regression model body.

    Supports single-host training with one or more GPUs. Parameter
    distribution can follow either of the following schemes:
    1. The CPU is the parameter server and manages gradient updates.
    2. Parameters are distributed evenly across all GPUs, and the first GPU
       manages gradient updates.

    Args:
        features: a list of tensors, one for each tower
        labels: a list of tensors, one for each tower
        mode: ModeKeys.TRAIN or EVAL
        params: Hyperparameters suitable for tuning
    Returns:
        An EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    weight_decay = params.weight_decay

    features = features[0:num_gpus]
    labels = labels[0:num_gpus]
    tower_features = features
    tower_labels = labels
    tower_losses = []
    tower_gradvars = []
    tower_preds = []

    if num_gpus == 0:
        num_devices = 1
        device_type = 'cpu'
    else:
        num_devices = num_gpus
        device_type = 'gpu'

    for i in range(num_devices):
        worker_device = '/{}:{}'.format(device_type, i)
        if variable_strategy == 'CPU':
            device_setter = utils.local_device_setter(
                worker_device=worker_device)
        elif variable_strategy == 'GPU':
            device_setter = utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_gpus, tf.contrib.training.byte_size_load_fn))
        with tf.variable_scope('LinearRegression',
                               reuse=bool(i != 0)) as var_scope:
            with tf.name_scope('tower_%d' % i) as name_scope:
                with tf.device(device_setter):
                    loss, gradvars, preds = _tower_fn(
                        is_training, weight_decay, tower_features[i],
                        tower_labels[i], params.feature_dim, var_scope.name,
                        params.problem)
                    tower_losses.append(loss)
                    tower_gradvars.append(gradvars)
                    tower_preds.append(preds)

    # Now compute global loss and gradients.
    gradvars = []
    with tf.name_scope('gradient_averaging'):
        all_grads = {}
        for grad, var in itertools.chain(*tower_gradvars):
            if grad is not None:
                all_grads.setdefault(var, []).append(grad)
        for var, grads in six.iteritems(all_grads):
            # Average gradients on the same device as the variables
            # to which they apply.
            with tf.device(var.device):
                if len(grads) == 1:
                    avg_grad = grads[0]
                else:
                    avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
            gradvars.append((avg_grad, var))

    # Device that runs the ops to apply global gradient updates.
    consolidation_device = ('/gpu:0' if variable_strategy == 'GPU'
                            else '/cpu:0')
    with tf.device(consolidation_device):
        loss = tf.reduce_mean(tower_losses, name='loss')
        examples_sec_hook = utils.ExamplesPerSecondHook(
            params.train_batch_size, every_n_steps=100)
        tensors_to_log = {'loss': loss}
        logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                                  every_n_iter=100)
        train_hooks = [logging_hook, examples_sec_hook]

        # optimizer = tf.train.GradientDescentOptimizer(
        #     learning_rate=params.learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)

        if params.run_type == 'sync':
            optimizer = tf.train.SyncReplicasOptimizer(
                optimizer, replicas_to_aggregate=num_workers)
            sync_replicas_hook = optimizer.make_session_run_hook(
                params.is_chief)
            train_hooks.append(sync_replicas_hook)

        # Create single grouped train op.
        train_op = [
            optimizer.apply_gradients(
                gradvars, global_step=tf.train.get_global_step())
        ]
        train_op = tf.group(*train_op)

    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      training_hooks=train_hooks)
def _linearregression_model_fn_local(features, labels, mode, params):
    """Linear regression model body with per-tower (local) updates.

    Args:
        features: a list of tensors, one for each tower
        labels: a list of tensors, one for each tower
        mode: ModeKeys.TRAIN or EVAL
        params: Hyperparameters suitable for tuning
    Returns:
        An EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    weight_decay = params.weight_decay

    # features = features[0:num_gpus]
    # labels = labels[0:num_gpus]
    tower_features = features
    tower_labels = labels
    tower_losses = []
    tower_ops = []
    tower_preds = []
    var_scopes = []

    if num_gpus == 0:
        num_devices = 1
        device_type = 'cpu'
    else:
        num_devices = num_gpus
        device_type = 'gpu'

    for i in range(num_devices):
        worker_device = '/{}:{}'.format(device_type, i)
        if variable_strategy == 'CPU':
            device_setter = utils.local_device_setter(
                worker_device=worker_device)
            # device_setter = tf.train.replica_device_setter(
            #     worker_device=worker_device)
        elif variable_strategy == 'GPU':
            device_setter = utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_gpus, tf.contrib.training.byte_size_load_fn))
            # device_setter = tf.train.replica_device_setter(
            #     ps_device=worker_device,
            #     worker_device=worker_device)
        with tf.variable_scope('LinearRegression_{}'.format(i)) as var_scope:
            with tf.name_scope('tower_%d' % i) as name_scope:
                with tf.device(device_setter):
                    loss, gradvars, preds = _tower_fn(
                        is_training, weight_decay, tower_features[i],
                        tower_labels[i], params.feature_dim, var_scope.name,
                        params.problem)
                    var_scopes.append(var_scope.name)
                    tower_losses.append(loss)
                    # tower_gradvars.append(gradvars)
                    tower_preds.append(preds)

        global_step = tf.cast(tf.train.get_global_step(), tf.float32)
        lr = params.learning_rate
        # optimizer = tf.train.GradientDescentOptimizer(
        #     learning_rate=params.learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        # optimizer = tf.train.MomentumOptimizer(
        #     learning_rate=params.learning_rate, momentum=0.97)

        # Create a per-tower train op; the ops are grouped below.
        train_op = [
            optimizer.apply_gradients(
                gradvars,
                global_step=tf.train.get_global_step(),
                name='apply_gradient_tower_{}'.format(i))
        ]
        tower_ops.append(train_op)

    # Device that runs the ops to apply global gradient updates.
    consolidation_device = ('/gpu:0' if variable_strategy == 'GPU'
                            else '/cpu:0')
    with tf.device(consolidation_device):
        examples_sec_hook = utils.ExamplesPerSecondHook(
            params.train_batch_size * (1 + params.redundancy),
            every_n_steps=100)
        loss = tf.reduce_mean(tower_losses, name='loss')
        tensors_to_log = {'loss': loss}
        logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                                  every_n_iter=100)
        train_hooks = [logging_hook, examples_sec_hook]

        if params.run_type == 'multi':
            if params.adaptive:
                alpha = 2 / (params.num_comm + 1) * (
                    params.train_steps / (params.num_comm * params.sync_step))
                local_updates = [
                    params.sync_step * (1 + alpha * i)
                    for i in range(params.num_comm + 1)
                ]
                sync_hook = utils.SyncHook(scopes=var_scopes,
                                           every_n_steps=params.sync_step,
                                           adaptive=local_updates)
            else:
                sync_hook = utils.SyncHook(scopes=var_scopes,
                                           every_n_steps=params.sync_step)
            train_hooks.append(sync_hook)

        train_ops = tf.group(*tower_ops)

    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_ops,
                                      training_hooks=train_hooks)
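# Worked example (made-up numbers, not from any config) of the adaptive sync
# schedule computed above when params.adaptive is set: alpha stretches the
# base sync_step so that successive synchronization intervals grow linearly.
def _example_adaptive_sync_schedule(sync_step=100, num_comm=4,
                                    train_steps=2000):
    """Returns the list of per-round local-update intervals."""
    alpha = 2 / (num_comm + 1) * (train_steps / (num_comm * sync_step))
    local_updates = [sync_step * (1 + alpha * i)
                     for i in range(num_comm + 1)]
    # With the defaults above: alpha == 2.0 and
    # local_updates == [100.0, 300.0, 500.0, 700.0, 900.0]; the intervals
    # sum to train_steps + sync_step * (num_comm + 1).
    return local_updates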