# Single-GPU baseline training loop (original TF 1.x queue-runner API).
def train():
  """Train CIFAR-100 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-100.
    images, labels = cifar100.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar100.inference(images)

    # Calculate loss.
    loss = cifar100.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar100.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners that feed the input pipeline.
    tf.train.start_queue_runners(sess=sess)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print(format_str % (datetime.now(), step, loss_value,
                            examples_per_sec, sec_per_batch))

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
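# NOTE: These scripts assume imports and flag definitions roughly as below.
# This is a minimal sketch modeled on the TensorFlow CIFAR-10 tutorial; the
# exact defaults and docstrings in the original files may differ.
from __future__ import print_function

import logging
import os
import time
from datetime import datetime

import numpy as np
import tensorflow as tf
from six.moves import xrange

import cifar100

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar100_train',
                           """Directory where to write event logs and """
                           """checkpoints.""")
tf.app.flags.DEFINE_string('data_dir', '/tmp/cifar100_data',
                           """Path to the CIFAR-100 data directory.""")
tf.app.flags.DEFINE_integer('max_steps', 1000000,
                            """Number of batches to run.""")
tf.app.flags.DEFINE_integer('batch_size', 128,
                            """Number of images to process in a batch.""")
tf.app.flags.DEFINE_integer('log_frequency', 10,
                            """How often to log results to the console.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            """Whether to log device placement.""")
tf.app.flags.DEFINE_integer('num_gpus', 1,
                            """How many GPUs to use (multi-GPU variant).""")
tf.app.flags.DEFINE_string('ps_hosts', '',
                           """Comma-separated ps host:port pairs.""")
tf.app.flags.DEFINE_string('worker_hosts', '',
                           """Comma-separated worker host:port pairs.""")
tf.app.flags.DEFINE_string('job_name', '', """Either 'ps' or 'worker'.""")
tf.app.flags.DEFINE_integer('task_index', 0, """Index within the job.""")

# Used by the distributed variant's logger.info calls (assumed setup).
logger = logging.getLogger(__name__)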
def train(): """Train CIFAR-100 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.contrib.framework.get_or_create_global_step() # Get images and labels for CIFAR-100. # Force input pipeline to CPU:0 to avoid operations sometimes ending up on # GPU and resulting in a slow down. with tf.device('/cpu:0'): images, labels = cifar100.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = cifar100.inference(images) # Calculate loss. loss = cifar100.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar100.train(loss, global_step) class _LoggerHook(tf.train.SessionRunHook): """Logs loss and runtime.""" def begin(self): self._step = -1 self._start_time = time.time() def before_run(self, run_context): self._step += 1 return tf.train.SessionRunArgs(loss) # Asks for loss value. def after_run(self, run_context, run_values): if self._step % FLAGS.log_frequency == 0: current_time = time.time() duration = current_time - self._start_time self._start_time = current_time loss_value = run_values.results self._last_loss = loss_value examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration sec_per_batch = float(duration / FLAGS.log_frequency) format_str = ( '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch)) def last_loss(self): return self._last_loss loghook = _LoggerHook() with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.train_dir, hooks=[ tf.train.StopAtStepHook(last_step=FLAGS.max_steps), tf.train.NanTensorHook(loss), loghook ], config=tf.ConfigProto(log_device_placement=FLAGS. log_device_placement)) as mon_sess: t1 = time.time() while not mon_sess.should_stop(): mon_sess.run(train_op) t2 = time.time() print('spent %f seconds to train %d step' % (t2 - t1, FLAGS.max_steps)) print('spent %f seconds to train %d step' % (t2 - t1, FLAGS.max_steps)) print('last loss value: %.2f ' % loghook.last_loss())
def train(): """Train CIFAR-100 for a number of steps.""" output = open('output_data/output_' + str(time.time()) + '.txt', 'w') with tf.Graph().as_default(): global_step = tf.train.get_or_create_global_step() # Get images and labels for CIFAR-100. # Force input pipeline to CPU:0 to avoid operations sometimes ending up on # GPU and resulting in a slow down. with tf.device('/cpu:0'): images, labels = cifar100.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logitsA,logitsB = cifar100.inference(images) # Calculate loss. lossA = cifar100.loss(logitsA, labels) lossB = cifar100.loss(logitsB, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_opA = cifar100.train(lossA, global_step) train_opB = cifar100.train(lossB, global_step) class _LoggerHook(tf.train.SessionRunHook): """Logs loss and runtime.""" def begin(self): self._step = -1 self._start_time = time.time() def before_run(self, run_context): self._step += 1 return tf.train.SessionRunArgs(lossA) # Asks for loss value. def after_run(self, run_context, run_values): if self._step % FLAGS.log_frequency == 0: current_time = time.time() duration = current_time - self._start_time self._start_time = current_time loss_value = run_values.results examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration sec_per_batch = float(duration / FLAGS.log_frequency) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch)) print((str(self._step) + '\t' + str(loss_value) + '\n'), file=output) with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.train_dir, hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps), tf.train.NanTensorHook(lossA), tf.train.NanTensorHook(lossB), _LoggerHook()], config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) as mon_sess: file_writer = tf.summary.FileWriter('tb-logs/', mon_sess.graph) while not mon_sess.should_stop(): print("stepA") mon_sess.run(train_opA) print("stepB") mon_sess.run(train_opB) output.close()
def train(): """Train CIFAR-100 for a number of steps.""" with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to count the number of train() calls. This equals the # number of batches processed * FLAGS.num_gpus. global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) # Calculate the learning rate schedule. num_batches_per_epoch = (cifar100.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size) decay_steps = int(num_batches_per_epoch * cifar100.NUM_EPOCHS_PER_DECAY) # Decay the learning rate exponentially based on the number of steps. lr = tf.train.exponential_decay(cifar100.INITIAL_LEARNING_RATE, global_step, decay_steps, cifar100.LEARNING_RATE_DECAY_FACTOR, staircase=True) # Create an optimizer that performs gradient descent. opt = tf.train.GradientDescentOptimizer(lr) # Get images and labels for CIFAR-100. images, labels = cifar100.distorted_inputs() #batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue( # [images, labels], capacity=2 * FLAGS.num_gpus) # Calculate the gradients for each model tower. tower_grads = [] with tf.variable_scope(tf.get_variable_scope()): for i in xrange(FLAGS.num_gpus): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % (cifar100.TOWER_NAME, i)) as scope: # Dequeues one batch for the GPU #image_batch, label_batch = batch_queue.dequeue() # Calculate the loss for one tower of the CIFAR model. This function # constructs the entire CIFAR model but shares the variables across # all towers. #loss = tower_loss(scope, image_batch, label_batch) loss = tower_loss(scope, images, labels) # Reuse variables for the next tower. tf.get_variable_scope().reuse_variables() # Retain the summaries from the final tower. summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) # Calculate the gradients for the batch of data on this CIFAR tower. grads = opt.compute_gradients(loss) # Keep track of the gradients across all towers. tower_grads.append(grads) # We must calculate the mean of each gradient. Note that this is the # synchronization point across all towers. grads = average_gradients(tower_grads) # Add a summary to track the learning rate. summaries.append(tf.summary.scalar('learning_rate', lr)) # Add histograms for gradients. for grad, var in grads: if grad is not None: summaries.append( tf.summary.histogram(var.op.name + '/gradients', grad)) # Apply the gradients to adjust the shared variables. apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) # Add histograms for trainable variables. for var in tf.trainable_variables(): summaries.append(tf.summary.histogram(var.op.name, var)) # Track the moving averages of all trainable variables. variable_averages = tf.train.ExponentialMovingAverage( cifar100.MOVING_AVERAGE_DECAY, global_step) variables_averages_op = variable_averages.apply( tf.trainable_variables()) # Group all updates to into a single train op. train_op = tf.group(apply_gradient_op, variables_averages_op) # Create a saver. saver = tf.train.Saver(tf.global_variables()) # Build the summary operation from the last tower summaries. summary_op = tf.summary.merge(summaries) # Build an initialization operation to run below. init = tf.global_variables_initializer() # Start running operations on the Graph. allow_soft_placement must be set to # True to build towers on GPU, as some of the ops do not have GPU # implementations. 
sess = tf.Session(config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) t1 = time.time() for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus examples_per_sec = num_examples_per_step / duration sec_per_batch = duration / FLAGS.num_gpus format_str = ( '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) t2 = time.time() print('spent %f seconds to train %d step' % (t2 - t1, FLAGS.max_steps)) print('spent %f seconds to train %d step' % (t2 - t1, FLAGS.max_steps)) print('last loss value: %.2f ' % loss_value)
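# NOTE: The multi-GPU variant above calls tower_loss() and average_gradients(),
# which are defined elsewhere in the file. Minimal sketches, modeled on the
# TensorFlow CIFAR-10 multi-GPU tutorial; the actual definitions may differ.
def tower_loss(scope, images, labels):
  """Builds one tower of the model and returns its total loss."""
  # Build the inference Graph; variables are shared across towers via
  # reuse_variables() in the caller.
  logits = cifar100.inference(images)

  # Build the loss ops, then assemble the losses for this tower only.
  _ = cifar100.loss(logits, labels)
  losses = tf.get_collection('losses', scope)
  return tf.add_n(losses, name='total_loss')

def average_gradients(tower_grads):
  """Averages gradients over all towers; this is the synchronization point."""
  average_grads = []
  # tower_grads is a list (one entry per tower) of lists of (gradient,
  # variable) pairs; zip(*tower_grads) groups the pairs for each variable.
  for grad_and_vars in zip(*tower_grads):
    grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
    grad = tf.reduce_mean(tf.concat(axis=0, values=grads), 0)
    # The variable is shared across towers, so take it from the first tower.
    average_grads.append((grad, grad_and_vars[0][1]))
  return average_grads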
# Distributed variant: parameter servers plus synchronous replica workers.
def train():
  """Train CIFAR-100 on a cluster of parameter servers and workers."""
  print('FLAGS.data_dir: %s' % FLAGS.data_dir)

  ps_hosts = FLAGS.ps_hosts.split(",")
  worker_hosts = FLAGS.worker_hosts.split(",")

  # Create a cluster from the parameter server and worker hosts.
  cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
  server = tf.train.Server(cluster,
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_index)
  if FLAGS.job_name == 'ps':
    # Parameter servers only serve variables; block here forever.
    server.join()

  is_chief = (FLAGS.task_index == 0)

  with tf.device(
      tf.train.replica_device_setter(
          worker_device="/job:worker/task:%d" % FLAGS.task_index,
          ps_device="/job:ps/task:0",
          cluster=cluster)):
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0), trainable=False)

    # Get images and labels for CIFAR-100.
    images, labels = cifar100.distorted_inputs()

    num_workers = len(worker_hosts)
    num_replicas_to_aggregate = num_workers

    logits = cifar100.inference(images)

    # Calculate loss.
    loss = cifar100.loss(logits, labels)

    # Calculate the learning rate schedule.
    num_batches_per_epoch = (cifar100.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                             FLAGS.batch_size)
    decay_steps = int(num_batches_per_epoch * cifar100.NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(cifar100.INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    cifar100.LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)

    # Retain the summaries at the chief worker.
    if is_chief:
      summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
      # Add a summary to track the learning rate.
      summaries.append(tf.summary.scalar('learning_rate', lr))

    # Create an optimizer that performs gradient descent, wrapped so that
    # gradients from all workers are aggregated before each update.
    opt = tf.train.GradientDescentOptimizer(lr)
    opt = tf.train.SyncReplicasOptimizer(
        opt,
        replicas_to_aggregate=num_replicas_to_aggregate,
        total_num_replicas=num_workers,
        #use_locking=True)
        use_locking=False)

    # Calculate the gradients for the batch.
    grads = opt.compute_gradients(loss)

    # Add histograms for gradients at the chief worker.
    if is_chief:
      for grad, var in grads:
        if grad is not None:
          summaries.append(
              tf.summary.histogram(var.op.name + '/gradients', grad))

    # Apply gradients to the variables.
    train_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    if is_chief:
      for var in tf.trainable_variables():
        summaries.append(tf.summary.histogram(var.op.name, var))

    #variable_averages = tf.train.ExponentialMovingAverage(
    #    cifar100.MOVING_AVERAGE_DECAY, global_step)
    #variables_averages_op = variable_averages.apply(tf.trainable_variables())
    #train_op = tf.group(train_op, variables_averages_op)

    if is_chief:
      # Build the summary operation at the chief worker.
      summary_op = tf.summary.merge(summaries)
      chief_queue_runner = opt.get_chief_queue_runner()
      init_token_op = opt.get_init_tokens_op()

    # Build an initialization operation to run below.
    init_op = tf.global_variables_initializer()

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables())

    sv = tf.train.Supervisor(is_chief=is_chief,
                             global_step=global_step,
                             init_op=init_op)

    # Start running operations on the Graph. allow_soft_placement must be
    # set to True as some of the ops do not have GPU implementations.
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)
    with sv.prepare_or_wait_for_session(server.target,
                                        config=sess_config) as sess:
      # Start the input queue runners; the chief also starts the sync queue
      # runner and runs the init token op.
      queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
      sv.start_queue_runners(sess, queue_runners)
      if is_chief:
        sv.start_queue_runners(sess, [chief_queue_runner])
        sess.run(init_token_op)

      # Open the summary writer.
      summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

      t1 = time.time()
      for step in xrange(FLAGS.max_steps):
        start_time = time.time()
        _, loss_value = sess.run([train_op, loss])
        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size * num_workers
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = duration / num_workers

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print(format_str % (datetime.now(), step, loss_value,
                              examples_per_sec, sec_per_batch))

        if step % 100 == 0 and is_chief:
          summary_str = sess.run(summary_op)
          summary_writer.add_summary(summary_str, step)

        # Save the model checkpoint periodically at the chief worker.
        if (step % 1000 == 0 or (step + 1) == FLAGS.max_steps) and is_chief:
          checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
          saver.save(sess, checkpoint_path, global_step=step)
      t2 = time.time()
      print('spent %f seconds to train %d steps' % (t2 - t1, FLAGS.max_steps))
      logger.info('spent %f seconds to train %d steps' %
                  (t2 - t1, FLAGS.max_steps))
      logger.info('last loss value: %.2f' % loss_value)
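# NOTE: A hypothetical launch sequence for the distributed variant; the
# script name and host:port values below are illustrative, not taken from
# the original source.
#
#   # Parameter server (blocks in server.join()):
#   python cifar100_train.py --job_name=ps --task_index=0 \
#       --ps_hosts=ps0:2222 --worker_hosts=worker0:2222,worker1:2222
#
#   # Chief worker (task_index 0 writes summaries and checkpoints):
#   python cifar100_train.py --job_name=worker --task_index=0 \
#       --ps_hosts=ps0:2222 --worker_hosts=worker0:2222,worker1:2222
#
#   # Second worker:
#   python cifar100_train.py --job_name=worker --task_index=1 \
#       --ps_hosts=ps0:2222 --worker_hosts=worker0:2222,worker1:2222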