def testCreateOnecloneWithPS(self):
  g = tf.Graph()
  with g.as_default():
    tf.set_random_seed(0)
    tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
    tf_labels = tf.constant(self._labels, dtype=tf.float32)
    model_fn = BatchNormClassifier
    model_args = (tf_inputs, tf_labels)
    deploy_config = model_deploy.DeploymentConfig(num_clones=1,
                                                  num_ps_tasks=1)
    self.assertEqual(framework.get_variables(), [])
    clones = model_deploy.create_clones(deploy_config, model_fn, model_args)
    self.assertEqual(len(framework.get_variables()), 5)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    self.assertEqual(len(update_ops), 2)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
    total_loss, grads_and_vars = model_deploy.optimize_clones(clones,
                                                              optimizer)
    self.assertEqual(len(grads_and_vars), len(tf.trainable_variables()))
    self.assertEqual(total_loss.op.name, 'total_loss')
    for g, v in grads_and_vars:
      self.assertDeviceEqual(g.device, '/job:worker/device:GPU:0')
      self.assertDeviceEqual(v.device, '/job:ps/task:0/CPU:0')
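# The BatchNormClassifier tests in this file assume a model function that is
# not shown here. The sketch below is an assumption (it mirrors the TF-Slim
# test fixture and assumes slim = tf.contrib.slim), not the verbatim fixture;
# it is included because it is consistent with the assertions above: 5
# variables (beta, moving_mean, moving_variance from batch_norm, plus weights
# and biases from fully_connected), 2 UPDATE_OPS for the moving statistics,
# and an output op named '.../fully_connected/Sigmoid'.
def BatchNormClassifier(inputs, labels, scope=None, reuse=None):
  with tf.variable_scope(scope, 'BatchNormClassifier', [inputs, labels],
                         reuse=reuse):
    # Batch-normalize the inputs; this creates the moving-average update ops.
    inputs = slim.batch_norm(inputs, decay=0.1)
    predictions = slim.fully_connected(inputs, 1,
                                       activation_fn=tf.sigmoid,
                                       scope='fully_connected')
    # Register a single loss so create_clones/optimize_clones have something
    # to aggregate.
    tf.losses.log_loss(labels, predictions)
    return predictions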
def testCreateMulticloneWithPS(self):
  g = tf.Graph()
  with g.as_default():
    tf.set_random_seed(0)
    tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
    tf_labels = tf.constant(self._labels, dtype=tf.float32)
    model_fn = BatchNormClassifier
    clone_args = (tf_inputs, tf_labels)
    deploy_config = model_deploy.DeploymentConfig(num_clones=2,
                                                  num_ps_tasks=2)
    self.assertEqual(framework.get_variables(), [])
    clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
    self.assertEqual(len(framework.get_variables()), 5)
    for i, v in enumerate(framework.get_variables()):
      t = i % 2
      self.assertDeviceEqual(v.device, '/job:ps/task:%d/device:CPU:0' % t)
      self.assertDeviceEqual(v.device, v.value().device)
    self.assertEqual(len(clones), 2)
    for i, clone in enumerate(clones):
      self.assertEqual(
          clone.outputs.op.name,
          'clone_%d/BatchNormClassifier/fully_connected/Sigmoid' % i)
      self.assertEqual(clone.scope, 'clone_%d/' % i)
      self.assertDeviceEqual(clone.device, '/job:worker/device:GPU:%d' % i)
def testCreateOnecloneWithPS(self):
  g = tf.Graph()
  with g.as_default():
    tf.set_random_seed(0)
    tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
    tf_labels = tf.constant(self._labels, dtype=tf.float32)
    model_fn = BatchNormClassifier
    clone_args = (tf_inputs, tf_labels)
    deploy_config = model_deploy.DeploymentConfig(num_clones=1,
                                                  num_ps_tasks=1)
    self.assertEqual(framework.get_variables(), [])
    clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
    self.assertEqual(len(clones), 1)
    clone = clones[0]
    self.assertEqual(clone.outputs.op.name,
                     'BatchNormClassifier/fully_connected/Sigmoid')
    self.assertDeviceEqual(clone.device, '/job:worker/device:GPU:0')
    self.assertEqual(clone.scope, '')
    self.assertEqual(len(framework.get_variables()), 5)
    for v in framework.get_variables():
      self.assertDeviceEqual(v.device, '/job:ps/task:0/CPU:0')
      self.assertDeviceEqual(v.device, v.value().device)
def testCreateMulticlone(self):
  g = tf.Graph()
  with g.as_default():
    tf.set_random_seed(0)
    tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
    tf_labels = tf.constant(self._labels, dtype=tf.float32)
    model_fn = BatchNormClassifier
    clone_args = (tf_inputs, tf_labels)
    num_clones = 4
    deploy_config = model_deploy.DeploymentConfig(num_clones=num_clones)
    self.assertEqual(framework.get_variables(), [])
    clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
    self.assertEqual(len(framework.get_variables()), 5)
    for v in framework.get_variables():
      self.assertDeviceEqual(v.device, 'CPU:0')
      self.assertDeviceEqual(v.value().device, 'CPU:0')
    self.assertEqual(len(clones), num_clones)
    for i, clone in enumerate(clones):
      self.assertEqual(
          clone.outputs.op.name,
          'clone_%d/BatchNormClassifier/fully_connected/Sigmoid' % i)
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, clone.scope)
      self.assertEqual(len(update_ops), 2)
      self.assertEqual(clone.scope, 'clone_%d/' % i)
      self.assertDeviceEqual(clone.device, 'GPU:%d' % i)
def testCreateLogisticClassifier(self):
  g = tf.Graph()
  with g.as_default():
    tf.set_random_seed(0)
    tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
    tf_labels = tf.constant(self._labels, dtype=tf.float32)
    model_fn = LogisticClassifier
    clone_args = (tf_inputs, tf_labels)
    deploy_config = model_deploy.DeploymentConfig(num_clones=1)
    self.assertEqual(framework.get_variables(), [])
    clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
    clone = clones[0]
    self.assertEqual(len(framework.get_variables()), 2)
    for v in framework.get_variables():
      self.assertDeviceEqual(v.device, 'CPU:0')
      self.assertDeviceEqual(v.value().device, 'CPU:0')
    self.assertEqual(clone.outputs.op.name,
                     'LogisticClassifier/fully_connected/Sigmoid')
    self.assertEqual(clone.scope, '')
    self.assertDeviceEqual(clone.device, 'GPU:0')
    self.assertEqual(len(tf.losses.get_losses()), 1)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    self.assertEqual(update_ops, [])
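# testCreateLogisticClassifier above expects 2 variables, 1 loss, and no
# UPDATE_OPS. A plain fully connected logistic model such as the following
# sketch satisfies those assertions; like the BatchNormClassifier sketch, it
# is an assumption (it mirrors the TF-Slim test fixture and assumes
# slim = tf.contrib.slim), not the verbatim fixture.
def LogisticClassifier(inputs, labels, scope=None, reuse=None):
  with tf.variable_scope(scope, 'LogisticClassifier', [inputs, labels],
                         reuse=reuse):
    # A single fully connected layer: creates only weights and biases, and no
    # update ops since there is no batch normalization.
    predictions = slim.fully_connected(inputs, 1,
                                       activation_fn=tf.sigmoid,
                                       scope='fully_connected')
    tf.losses.log_loss(labels, predictions)
    return predictions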
def train(create_tensor_dict_fn_list, create_model_fn, train_config, master,
          task, num_clones, worker_replicas, clone_on_cpu, ps_tasks,
          worker_job_name, is_chief, train_dir):
  """Training function for detection models.

  Args:
    create_tensor_dict_fn_list: a list of functions, each creating a tensor
      input dictionary (one per input source).
    create_model_fn: a function that creates a DetectionModel and generates
      losses.
    train_config: a train_pb2.TrainConfig protobuf.
    master: BNS name of the TensorFlow master to use.
    task: The task id of this training instance.
    num_clones: The number of clones to run per machine.
    worker_replicas: The number of worker replicas to train with.
    clone_on_cpu: True if clones should be forced to run on CPU.
    ps_tasks: Number of parameter server tasks.
    worker_job_name: Name of the worker job.
    is_chief: Whether this replica is the chief replica.
    train_dir: Directory to write checkpoints and training summaries to.
  """
  # The DetectionModel instance is needed below to build the restore map when
  # fine-tuning from a checkpoint.
  detection_model = create_model_fn()
  data_augmentation_options = [
      preprocessor_builder.build(step)
      for step in train_config.data_augmentation_options]

  with tf.Graph().as_default():
    # Build a configuration specifying multi-GPU and multi-replicas.
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=num_clones,
        clone_on_cpu=clone_on_cpu,
        replica_id=task,
        num_replicas=worker_replicas,
        num_ps_tasks=ps_tasks,
        worker_job_name=worker_job_name)

    # Place the global step on the device storing the variables.
    with tf.device(deploy_config.variables_device()):
      global_step = tf.train.create_global_step()

    with tf.device(deploy_config.inputs_device()), \
        tf.name_scope('Input'):
      input_queue_list = []
      for i, create_tensor_dict_fn in enumerate(create_tensor_dict_fn_list):
        input_queue_list.append(
            _create_input_queue(
                train_config.batch_size[i] // num_clones,
                create_tensor_dict_fn,
                train_config.batch_queue_capacity,
                train_config.num_batch_queue_threads,
                train_config.prefetch_queue_capacity,
                data_augmentation_options))

    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    global_summaries = set([])

    model_fn = functools.partial(_create_losses,
                                 create_model_fn=create_model_fn)
    clones = model_deploy.create_clones(deploy_config, model_fn,
                                        [input_queue_list])
    first_clone_scope = clones[0].scope

    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by model_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    with tf.device(deploy_config.optimizer_device()), \
        tf.name_scope('Optimizer'):
      training_optimizer = optimizer_builder.build(train_config.optimizer,
                                                   global_summaries)

    sync_optimizer = None
    if train_config.sync_replicas:
      training_optimizer = tf.train.SyncReplicasOptimizer(
          training_optimizer,
          replicas_to_aggregate=train_config.replicas_to_aggregate,
          total_num_replicas=train_config.worker_replicas)
      sync_optimizer = training_optimizer

    # Create ops required to initialize the model from a given checkpoint.
    init_fn = None
    if train_config.fine_tune_checkpoint:
      var_map = detection_model.restore_map(
          from_detection_checkpoint=train_config.from_detection_checkpoint)
      available_var_map = (
          variables_helper.get_variables_available_in_checkpoint(
              var_map, train_config.fine_tune_checkpoint))
      init_saver = tf.train.Saver(available_var_map)

      def initializer_fn(sess):
        init_saver.restore(sess, train_config.fine_tune_checkpoint)

      init_fn = initializer_fn

    with tf.device(deploy_config.optimizer_device()), \
        tf.variable_scope('OptimizeClones'):
      total_loss, grads_and_vars = model_deploy.optimize_clones(
          clones, training_optimizer, regularization_losses=None)
      total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.')

      # Optionally multiply bias gradients by
      # train_config.bias_grad_multiplier.
      if train_config.bias_grad_multiplier:
        biases_regex_list = [r'.*bias(?:es)?', r'.*beta']
        grads_and_vars = variables_helper.multiply_gradients_matching_regex(
            grads_and_vars,
            biases_regex_list,
            multiplier=train_config.bias_grad_multiplier)

      # Optionally freeze some layers by setting their gradients to zero.
      if train_config.freeze_variables:
        grads_and_vars = variables_helper.freeze_gradients_matching_regex(
            grads_and_vars, train_config.freeze_variables)

      # Optionally clip gradients.
      if train_config.gradient_clipping_by_norm > 0:
        with tf.name_scope('clip_grads'):
          grads_and_vars = tf.contrib.training.clip_gradient_norms(
              grads_and_vars, train_config.gradient_clipping_by_norm)

      # Create gradient updates.
      grad_updates = training_optimizer.apply_gradients(
          grads_and_vars, global_step=global_step)
      update_ops.append(grad_updates)

      update_op = tf.group(*update_ops)
      with tf.control_dependencies([update_op]):
        train_tensor = tf.identity(total_loss, name='train_op')

    # Add summaries.
    for (grad, var) in grads_and_vars:
      var_name = var.op.name
      grad_name = 'grad/' + var_name
      global_summaries.add(tf.summary.histogram(grad_name, grad))
      global_summaries.add(tf.summary.histogram(var_name, var))
    # for model_var in tf.contrib.framework.get_model_variables():
    #   global_summaries.add(
    #       tf.summary.histogram(model_var.op.name, model_var))
    for loss_tensor in tf.losses.get_losses():
      global_summaries.add(
          tf.summary.scalar(loss_tensor.op.name, loss_tensor))
    global_summaries.add(
        tf.summary.scalar('TotalLoss', tf.losses.get_total_loss()))

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or
    # _gather_clone_loss().
    summaries |= set(
        tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
    summaries |= global_summaries

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    # Soft placement allows ops without a GPU implementation to fall back to
    # the CPU.
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)

    # Save checkpoints regularly.
    keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours
    saver = tf.train.Saver(
        keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)

    scaffold = tf.train.Scaffold(init_fn=init_fn,
                                 summary_op=summary_op,
                                 saver=saver)
    stop_hook = tf.train.StopAtStepHook(
        num_steps=(train_config.num_steps if train_config.num_steps
                   else None))
    profile_hook = profile_session_run_hooks.ProfileAtStepHook(
        at_step=200, checkpoint_dir=train_dir)
    tf.contrib.training.train(
        train_tensor,
        train_dir,
        master=master,
        is_chief=is_chief,
        scaffold=scaffold,
        hooks=[stop_hook, profile_hook],
        chief_only_hooks=None,
        save_checkpoint_secs=train_config.save_checkpoint_secs,
        save_summaries_steps=train_config.save_summaries_steps,
        config=session_config)
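# A minimal single-machine usage sketch for train(). The builder modules and
# config field names used here (model_builder, input_reader_builder, the
# pipeline config objects) are assumptions about the surrounding codebase and
# mirror the object_detection builders; the literal values are illustrative
# only. It shows how the arguments map onto a local, one-GPU run with no
# parameter servers.
def _example_local_run(train_config, model_config, input_configs):
  # create_model_fn must return a fresh DetectionModel each call.
  create_model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=True)
  # One tensor-dict factory per input source, matching
  # create_tensor_dict_fn_list.
  create_tensor_dict_fn_list = [
      functools.partial(input_reader_builder.build, input_config)
      for input_config in input_configs]
  train(create_tensor_dict_fn_list,
        create_model_fn,
        train_config,
        master='',            # in-process master for local training
        task=0,
        num_clones=1,
        worker_replicas=1,
        clone_on_cpu=False,
        ps_tasks=0,           # no parameter servers on a single machine
        worker_job_name='lonely_worker',
        is_chief=True,
        train_dir='/tmp/train_example')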