def train():
    """Build the low-rank CIFAR-10 training graph and run it until stopped.

    The network is built with ``cifar10.inference(images=..., r=low_ranks)``
    and driven by a ``MonitoredTrainingSession`` that uses ``FLAGS.train_dir``
    as its checkpoint directory and requests a stop at ``FLAGS.max_steps``.
    """
    with tf.Graph().as_default():
        step_tensor = tf.train.get_or_create_global_step()

        # The input pipeline is pinned to the CPU.
        with tf.device('/cpu:0'):
            batch_images, batch_labels = cifar10.distorted_inputs()

        batch_logits = cifar10.inference(images=batch_images, r=low_ranks)
        total_loss = cifar10.loss(logits=batch_logits, labels=batch_labels)
        train_step = cifar10.train(total_loss, step_tensor)

        # Dump every trainable variable for inspection.
        for variable in tf.trainable_variables():
            print(variable)

        # Non-zero count of the last entry in the 'sparse_components'
        # collection. The op is only added to the graph; nothing below
        # fetches it.
        nonzero = tf.count_nonzero(tf.get_collection('sparse_components')[-1])

        class _LoggerHook(tf.train.SessionRunHook):
            """Print loss and throughput every FLAGS.log_frequency runs."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Ask the session to fetch the loss with every run call.
                return tf.train.SessionRunArgs(total_loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return
                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now
                throughput = FLAGS.log_frequency * FLAGS.batch_size / elapsed
                per_batch = float(elapsed / FLAGS.log_frequency)
                template = ('%s: step %d, loss = %.6f (%.1f examples/sec;'
                            '%.3f sec/batch)')
                print(template % (datetime.now(), self._step,
                                  run_values.results, throughput, per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(total_loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_step)
def main(_):
    """Train the CIFAR-10 model for ``FLAGS.max_step`` steps in a plain Session.

    Every 100 steps the loss is printed and, when the graph defines any
    summaries, they are written to ``FLAGS.log_dir``.

    Args:
        _: unused positional argument.
    """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()
        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)
            # merge_all() returns None when no summaries were registered.
            summary = tf.summary.merge_all()
            writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
            print('Training started')
            try:
                for i in range(FLAGS.max_step):
                    start_time = time.time()
                    _, loss_value = sess.run([train_op, loss])
                    duration = time.time() - start_time
                    if i % 100 == 0:
                        print('Step %6d: loss = %.2f (%.3f sec)' %
                              (i, loss_value, duration))
                        if summary is not None:
                            summary_str = sess.run(summary)
                            writer.add_summary(summary_str, i)
                            writer.flush()
            finally:
                # Always stop the input threads and release the event file,
                # even when a training step raises.
                coord.request_stop()
                coord.join(threads)
                writer.close()
def evaluate():
    """Eval CIFAR-10 for a number of steps.

    Builds the evaluation graph once, then calls ``eval_once`` repeatedly,
    sleeping ``FLAGS.eval_interval_secs`` between passes, until
    ``FLAGS.run_once`` is true.
    """
    with tf.Graph().as_default() as graph:
        use_test_split = FLAGS.eval_data == 'test'
        images, labels = cifar10.distorted_inputs(eval_data=use_test_split)

        logits = cifar10.inference(images)
        top_k_op = tf.nn.in_top_k(logits, labels, 1)
        # Softmax-normalised predictions, handed to eval_once with the labels.
        preds = tf.nn.softmax(logits, name='predictions')

        # Restore from the variable names produced by the moving-average
        # helper for the learned variables.
        averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        saver = tf.train.Saver(averages.variables_to_restore())

        summary_op = tf.merge_all_summaries()
        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, graph)

        eval_once(saver, summary_writer, top_k_op, summary_op, preds, labels)
        while not FLAGS.run_once:
            time.sleep(FLAGS.eval_interval_secs)
            eval_once(saver, summary_writer, top_k_op, summary_op, preds,
                      labels)
def train():
    """Train CIFAR-10 for a number of steps.

    Alongside the training graph a second inference graph is built on the
    evaluation inputs. Every 10 steps the loss is printed, a checkpoint is
    written and ``cifar10.accuracy`` is called on the test predictions;
    summaries are written every 100 steps.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Training and evaluation input pipelines.
        images, labels = cifar10.distorted_inputs()
        test_images, test_labels = cifar10.inputs(eval_data=True)

        # Inference on both pipelines; the second call is made in test mode.
        logits = cifar10.inference(images)
        test_predictions = cifar10.inference(test_images, test=True)

        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()
        init_op = tf.initialize_all_variables()

        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_config)
        sess.run(init_op)

        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            tick = time.time()
            _, loss_value = sess.run([train_op, loss])
            elapsed = time.time() - tick

            on_tenth_step = step % 10 == 0
            if on_tenth_step:
                print ('loss '+str(loss_value))

            if step % 100 == 0:
                summary_writer.add_summary(sess.run(summary_op), step)

            # Checkpoint every 10 steps and on the final step.
            if on_tenth_step or (step + 1) == FLAGS.max_steps:
                saver.save(sess,
                           os.path.join(FLAGS.train_dir, 'model.ckpt'),
                           global_step=step)

            # Evaluation hook on the test predictions.
            if on_tenth_step:
                cifar10.accuracy(test_predictions, test_labels)
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

    Returns:
       Tensor of shape [] containing the total loss for a batch of data
    """
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)

    # Called for its graph side effects only; the tower total is assembled
    # from the 'losses' collection below.
    cifar10.loss(logits, labels)

    tower_losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(tower_losses, name='total_loss')

    # Strip the 'tower_[0-9]/' prefix so summary names are identical across
    # towers in a multi-GPU session.
    tower_prefix = '%s_[0-9]*/' % cifar10.TOWER_NAME
    for tensor in tower_losses + [total_loss]:
        tf.summary.scalar(re.sub(tower_prefix, '', tensor.op.name), tensor)

    return total_loss
def train():
    """Train CIFAR-10 for a number of steps, logging batch accuracy.

    Note that this variant passes the labels to ``cifar10.inference`` and
    that the logging hook fetches the accuracy tensor, not the loss.
    """
    with tf.Graph().as_default():
        step_tensor = tf.contrib.framework.get_or_create_global_step()

        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images, labels)
        loss = cifar10.loss(logits, labels)
        accuracy = cifar10.accuracy(logits, labels)
        train_op = cifar10.train(loss, step_tensor)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs accuracy and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Ask the session to fetch the accuracy with every run call.
                return tf.train.SessionRunArgs(accuracy)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return
                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                accuracy_value = run_values.results
                examples_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)

                template = (
                    '%s: step %d, accuracy = %.4f, (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(template % (datetime.now(), self._step, accuracy_value,
                                  examples_per_sec, sec_per_batch))

        hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=hooks,
                config=config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def evaluate():
    """Eval CIFAR-10 for a number of steps.

    Builds the evaluation graph and performs a single ``eval_once`` pass.
    """
    with tf.Graph().as_default() as graph:
        # Input pipeline pinned to the CPU.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)

        # Top-1 membership test of the labels against the logits.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)

        # Restore from the variable names produced by the moving-average
        # helper for the learned variables.
        averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        saver = tf.train.Saver(averages.variables_to_restore())

        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, graph)

        eval_once(saver, summary_writer, top_k_op, summary_op)
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    Inputs come from ``cifar10.distorted_inputs(FLAGS.contrastcase)`` and the
    network is built with ``cifar10.inference1_2``.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

    Returns:
       Tensor of shape [] containing the total loss for a batch of data
    """
    images, labels = cifar10.distorted_inputs(FLAGS.contrastcase)
    logits = cifar10.inference1_2(images)

    # Only the graph side effects of the loss builder are needed here.
    cifar10.loss(logits, labels)

    # Sum of every loss term collected for this tower.
    per_tower = tf.get_collection('losses', scope)
    total_loss = tf.add_n(per_tower, name='total_loss')

    # One scalar summary per loss term plus the total, named without the
    # 'tower_[0-9]/' prefix.
    prefix_pattern = '%s_[0-9]*/' % cifar10.TOWER_NAME
    summarised = per_tower + [total_loss]
    for term in summarised:
        clean_name = re.sub(prefix_pattern, '', term.op.name)
        tf.summary.scalar(clean_name, term)

    return total_loss
def train():
    """Train the CIFAR-10 model, printing loss and accuracy every 10 runs.

    Uses the module-level ``train_dir`` as checkpoint directory and requests
    a stop at the module-level ``max_step``.
    """
    with tf.Graph().as_default():
        step_tensor = tf.train.get_or_create_global_step()

        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images, train = True)
        loss = cifar10.loss(logits, labels)
        accuracy = cifar10.accuracy(logits, labels)
        train_op = cifar10.train(loss, step_tensor)

        class _LoggerHook(tf.train.SessionRunHook):
            """Fetch loss and accuracy with every run and print periodically."""

            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs([loss, accuracy])

            def after_run(self, run_context, run_values):
                if self._step % 10 != 0:
                    return
                loss_value, acc_value = run_values.results
                template = ('step %d, loss = %.2f, accuracy = %.2f ')
                print (template % (self._step, loss_value, acc_value))

        hooks = [
            tf.train.StopAtStepHook(last_step=max_step),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        config = tf.ConfigProto(log_device_placement=False)
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=train_dir,
                hooks=hooks,
                config=config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """Train CIFAR-10 for a number of steps.

    Every 10 steps a checkpoint is written to ``FLAGS.train_dir`` and the
    loss, the top-1 training accuracy accumulated since the previous report,
    and the elapsed time are printed.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)
        top_k_op = tf.nn.in_top_k(logits, labels, 1)

        saver = tf.train.Saver(tf.all_variables())
        init = tf.initialize_all_variables()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)

        true_count = 0
        # Number of batches folded into true_count since the last report.
        # The first report (step 0) covers one batch, later ones cover ten,
        # so the denominator must follow the real count instead of assuming
        # ten.
        batches_counted = 0
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value, precisions = sess.run([train_op, loss, top_k_op])
            true_count += np.sum(precisions)
            batches_counted += 1

            if step % 10 == 0:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

                duration = time.time() - start_time
                # float() keeps this a true division when the module does
                # not enable it (integer operands under Python 2).
                accuracy = float(true_count) / (
                    FLAGS.batch_size * batches_counted)
                print(' step %d, loss = %.3f, acc = %.3f, dur = %.2f' %
                      (step, loss_value, accuracy, duration))
                true_count = 0
                batches_counted = 0
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

    Returns:
       Tensor of shape [] containing the total loss for a batch of data
    """
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)

    # Only the graph side effects of the loss builder are needed; the tower
    # total is assembled from the 'losses' collection.
    cifar10.loss(logits, labels)

    tower_losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(tower_losses, name='total_loss')
    tracked = tower_losses + [total_loss]

    # Moving averages (decay 0.9) of every tracked loss.
    averager = tf.train.ExponentialMovingAverage(0.9, name='avg')
    update_averages = averager.apply(tracked)

    # Two summaries per loss: the raw value tagged '(raw)' and the averaged
    # value under the plain name, both without the 'tower_[0-9]/' prefix.
    tower_prefix = '%s_[0-9]*/' % cifar10.TOWER_NAME
    for tensor in tracked:
        display_name = re.sub(tower_prefix, '', tensor.op.name)
        tf.scalar_summary(display_name +' (raw)', tensor)
        tf.scalar_summary(display_name, averager.average(tensor))

    # Evaluating the returned tensor also runs the moving-average update.
    with tf.control_dependencies([update_averages]):
        total_loss = tf.identity(total_loss)
    return total_loss
def tower_loss(scope):
    """Return the summed 'losses' collection entries for one tower.

    Args:
      scope: scope filter passed to ``tf.get_collection`` for 'losses'.

    Returns:
      The ``tf.add_n`` of the collected losses, named 'total_loss'.
    """
    images, labels = cifar10.distorted_inputs()
    # The loss builder is invoked for its graph side effects only.
    cifar10.loss(cifar10.inference(images), labels)
    return tf.add_n(tf.get_collection('losses', scope), name = 'total_loss')
def train():
    """Train the model on the custom data set in ``./train_data``.

    Inference is built with 128 output classes instead of the usual 10.
    """
    with tf.Graph().as_default():
        step_tensor = tf.contrib.framework.get_or_create_global_step()

        # Force the input pipeline onto CPU:0 to avoid operations sometimes
        # ending up on GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs(
                custom_data_dir="./train_data")

        # ccen: there're 128 classes instead of 10
        logits = cifar10.inference(images, 128)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, step_tensor)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Ask the session to fetch the loss with every run call.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return
                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                examples_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)
                template = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(template % (datetime.now(), self._step,
                                  run_values.results, examples_per_sec,
                                  sec_per_batch))

        hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=hooks,
                config=config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """Train CIFAR-10 for a number of steps.

    Logs the loss every 10 steps, writes a checkpoint every 1000 steps and on
    the final step, and calls ``evaluate()`` after each checkpoint.
    """
    with tf.Graph().as_default():
        step_tensor = tf.contrib.framework.get_or_create_global_step()

        # Force the input pipeline onto CPU:0 to avoid operations sometimes
        # ending up on GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, step_tensor)

        saver = tf.train.Saver(tf.global_variables())
        init_op = tf.global_variables_initializer()

        # allow_soft_placement must be True to build towers on GPU, as some
        # of the ops do not have GPU implementations.
        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_config)
        sess.run(init_op)

        tf.train.start_queue_runners(sess=sess)

        for step in xrange(FLAGS.max_steps):
            tick = time.time()
            _, loss_value = sess.run([train_op, loss])
            elapsed = time.time() - tick

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / elapsed
                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print (template % (datetime.now(), step, loss_value,
                                   examples_per_sec, elapsed))

            is_last_step = (step + 1) == FLAGS.max_steps
            if step % 1000 == 0 or is_last_step:
                saver.save(sess,
                           os.path.join(FLAGS.train_dir, 'model.ckpt'),
                           global_step=step)
                # NOTE(review): evaluate() is assumed to belong to this
                # checkpoint branch; confirm against the intended layout.
                evaluate()
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    This variant attaches no summaries and no moving averages; it only sums
    the tower's entries of the 'losses' collection.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

    Returns:
       Tensor of shape [] containing the total loss for a batch of data
    """
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)

    # The loss builder is run for its graph side effects; its return value
    # is not used because the total is assembled below.
    cifar10.loss(logits, labels)

    tower_terms = tf.get_collection('losses', scope)
    return tf.add_n(tower_terms, name='total_loss')
def train():
    """Build the training graph and inspect its trainable variables.

    No training step is executed: after initialisation the names of all
    trainable variables are printed, the first one is evaluated, and the
    Python type of the result is printed.
    """
    with tf.Graph().as_default():
        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)

        step_counter = tf.Variable(0, trainable=False)
        train_op = cifar10.train(loss, global_step=step_counter)

        summary_op = tf.merge_all_summaries()
        init_op = tf.global_variables_initializer()

        sess = tf.Session()
        sess.run(init_op)
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        variables = tf.trainable_variables()
        print([v.name for v in variables])

        first_value = sess.run(variables[0])
        print(type(first_value))
def train():
    """Train CIFAR-10 for a number of steps, logging train and test accuracy.

    Two inference graphs sharing variables are built, one on the distorted
    training inputs and one on the evaluation inputs. A boolean placeholder
    selects which accuracy tensor feeds the "accuracy" summary, so the same
    merged summary op is written to both the train and the test writer every
    ``FLAGS.log_frequency`` steps.
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Selects, per run call, which accuracy tensor `accuracy` yields.
        train_flag = tf.placeholder(tf.bool, shape=())
        trX, trY = cifar10.distorted_inputs()
        teX, teY = cifar10.inputs(eval_data=True)

        # Training branch.
        logits = cifar10.inference(trX)
        tr_acc = cifar10.accuracy(logits, trY)[1]
        print(tr_acc, "tr_acc\n")
        loss = cifar10.loss(logits, trY)
        train_op = cifar10.train(loss, global_step)

        # Evaluation branch, reusing the variables created above.
        tf.get_variable_scope().reuse_variables()
        eval_logits = cifar10.inference(teX)
        te_acc = cifar10.accuracy(eval_logits, teY)[1]
        print(te_acc, "te_acc\n")

        accuracy = tf.cond(train_flag, lambda: tr_acc, lambda: te_acc)
        tf.summary.scalar("accuracy", accuracy)
        merged = tf.summary.merge_all()

        train_writer = tf.summary.FileWriter('tmp/cifar10/train')
        test_writer = tf.summary.FileWriter('tmp/cifar10/test')
        print("Training Starts")

        config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        config.gpu_options.allow_growth = True
        hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
        ]
        format_str = ('%s: step %d, loss = %.2f, test accuracy = %.2f, '
                      'train accuracy = %.2f')

        try:
            # The context manager closes the session when training ends.
            with tf.train.MonitoredTrainingSession(
                    hooks=hooks, config=config) as mon_sess:
                step = -1
                while not mon_sess.should_stop():
                    step += 1
                    _, loss_value = mon_sess.run([train_op, loss])
                    if step % FLAGS.log_frequency != 0:
                        continue
                    # A hook may have requested a stop during the training
                    # run; calling run() again would raise RuntimeError.
                    if mon_sess.should_stop():
                        break
                    # Separate names keep the graph tensors tr_acc / te_acc
                    # from being shadowed by fetched values.
                    tr_acc_value, summary = mon_sess.run(
                        [accuracy, merged], feed_dict={train_flag: True})
                    train_writer.add_summary(summary, step)
                    te_acc_value, summary = mon_sess.run(
                        [accuracy, merged], feed_dict={train_flag: False})
                    test_writer.add_summary(summary, step)
                    print (format_str % (datetime.now(), step, loss_value,
                                         te_acc_value, tr_acc_value))
        finally:
            # Flush and release both event files.
            train_writer.close()
            test_writer.close()
def train():
    """Train CIFAR-10 for a number of steps.

    Experimental variant that splits one training step into two partial runs:
    the gradients returned by ``cifar10.train_part1`` are fetched first, then
    fed back through placeholders into the op built by ``cifar10.train_part2``.
    """
    g1 = tf.Graph()
    with g1.as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # `grads` is iterated below as (gradient, variable) pairs.
        grads = cifar10.train_part1(loss, global_step)
        only_gradients = [g for g, _ in grads]
        only_vars = [v for _, v in grads]

        # One float placeholder per gradient, shaped like that gradient and
        # paired with the gradient's variable.
        placeholder_gradients = []
        #with tf.device("/gpu:0"):
        for grad_var in grads:
            placeholder_gradients.append(
                (tf.placeholder('float', shape=grad_var[0].get_shape()), grad_var[1]))

        # Initial feed: zeros for every gradient placeholder.
        feed_dict = {}
        for i, grad_var in enumerate(grads):
            feed_dict[placeholder_gradients[i][0]] = np.zeros(
                placeholder_gradients[i][0].shape)

        train_op = cifar10.train_part2(global_step, placeholder_gradients)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # The placeholders are the tensors declared as feeds of the partial run.
        feeds = []
        print("Reached here")
        for i, grad_var in enumerate(grads):
            feeds.append(placeholder_gradients[i][0])

        # Partial Run
        print("Reached here", len(feeds))
        for x in feeds:
            print(x, )
        h = sess.partial_run_setup([only_gradients, train_op], feeds)
        print("Reached here")

        # NOTE(review): the handle `h` is set up once but used on each of the
        # 10 iterations, and the placeholders are fed in both partial_run
        # calls -- confirm against the partial_run API that this is allowed.
        # The inner loop also rebinds `i`.
        for i in xrange(10):
            res_grads = sess.partial_run(h, only_gradients, feed_dict=feed_dict)
            # Feed the freshly computed gradients back through the placeholders.
            feed_dict = {}
            for i, grad_var in enumerate(res_grads):
                feed_dict[placeholder_gradients[i][0]] = res_grads[i]
            res_train_op = sess.partial_run(h, train_op, feed_dict=feed_dict)
def train():
    """Train CIFAR-10 for a number of steps.

    Logs the loss every 10 steps, writes summaries every 100 steps and saves
    a checkpoint to ``FLAGS.train_dir`` every 1000 steps and on the final
    step. The session, the input threads and the summary writer are released
    when training finishes or fails.

    Raises:
        AssertionError: if the loss becomes NaN.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()
        init = tf.initialize_all_variables()

        # Parenthesised so the two adjacent literals always form one string,
        # however the statement is wrapped.
        format_str = ("%s: step %d, loss = %.2f (%.1f examples/sec; %.3f "
                      "sec/batch)")

        config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        with tf.Session(config=config) as sess:
            sess.run(init)

            # A coordinator lets the input threads be stopped cleanly before
            # the session closes.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                    sess.graph)
            try:
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()
                    _, loss_value = sess.run([train_op, loss])
                    duration = time.time() - start_time

                    assert not np.isnan(loss_value), \
                        "Model diverged with loss = NaN"

                    if step % 10 == 0:
                        num_examples_per_step = FLAGS.batch_size
                        examples_per_sec = num_examples_per_step / duration
                        sec_per_batch = float(duration)
                        print(format_str % (datetime.now(), step, loss_value,
                                            examples_per_sec, sec_per_batch))

                    if step % 100 == 0:
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                    # Save the model checkpoint periodically.
                    if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_path = os.path.join(FLAGS.train_dir,
                                                       "model.ckpt")
                        saver.save(sess, checkpoint_path, global_step=step)
            finally:
                coord.request_stop()
                coord.join(threads)
                summary_writer.close()
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.contrib.framework.get_or_create_global_step() # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) class _LoggerHook(tf.train.SessionRunHook): """Logs loss and runtime.""" def begin(self): self._step = -1 def before_run(self, run_context): self._step += 1 self._start_time = time.time() return tf.train.SessionRunArgs(loss) # Asks for loss value. def after_run(self, run_context, run_values): duration = time.time() - self._start_time loss_value = run_values.results if self._step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ( '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch)) global step_no step_no = self._step with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.train_dir, hooks=[ tf.train.StopAtStepHook(last_step=FLAGS.max_steps), tf.train.NanTensorHook(loss), _LoggerHook() ], config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement, inter_op_parallelism_threads=4, intra_op_parallelism_threads=0)) as mon_sess: while not mon_sess.should_stop(): mon_sess.run(train_op) """run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
def train():
    """Train on the CIFAR-10 data set for multiple steps."""
    with tf.Graph().as_default():
        step_tensor = tf.contrib.framework.get_or_create_global_step()

        # Pin the data distortion pipeline to CPU:0.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Forward pass; inference is defined in cifar10.py.
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, step_tensor)

        class _LoggerHook(tf.train.SessionRunHook):
            """Log loss and runtime."""

            def __init__(self):
                # State is initialised at construction; begin() is a no-op.
                self._start_time = time.time()
                self._step = -1

            def begin(self):
                pass

            def before_run(self, run_context):
                self._step += 1
                # Add the loss op to every Session.run() call.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return
                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                examples_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)
                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print(template % (datetime.now(), self._step,
                                  run_values.results, examples_per_sec,
                                  sec_per_batch))

        hooks = [
            # Stop after the configured number of steps.
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            # Watch the loss and stop training if it becomes NaN.
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        # log_device_placement controls whether device assignment is logged.
        config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=hooks,
                config=config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    The network is selected by ``FLAGS.model``. Besides the total, the sums
    of the loss terms whose op names contain 'l1_loss' and 'weight_loss' are
    tracked and summarised.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

    Returns:
       Tensor of shape [] containing the total loss for a batch of data

    Raises:
      Exception: if ``FLAGS.model`` is not one of the known model names.
    """
    images, labels = cifar10.distorted_inputs()

    # Pick the inference builder; every builder except the small CNN is
    # called in training mode.
    model_name = FLAGS.model
    if model_name == 'small_cnn':
        logits = cifar10.inference_small(images)
    else:
        training_builders = {
            'resnet_18': 'inference_resnet',
            'lcnn_resnet_18': 'inference_lcnn_resnet',
            'resnet_lcnn_hybrid': 'inference_resnet_lcnn_hybrid',
        }
        if model_name not in training_builders:
            raise Exception('Unknown model type: {}'.format(FLAGS.model))
        build = getattr(cifar10, training_builders[model_name])
        logits = build(images, is_train=True)

    # Only the graph side effects of the loss builder are needed; the tower
    # total is assembled from the 'losses' collection.
    cifar10.loss(logits, labels)

    tower_losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(tower_losses, name='total_loss')

    def _sum_named(fragment, name):
        # The leading 0. keeps add_n valid when no loss name matches.
        matching = [t for t in tower_losses if fragment in t.op.name]
        return tf.add_n([0.] + matching, name=name)

    l1_loss = _sum_named('l1_loss', 'l1_sum_loss')
    l2_loss = _sum_named('weight_loss', 'l2_sum_loss')
    all_losses = tower_losses + [total_loss, l1_loss, l2_loss]

    # Moving averages (decay 0.9) of every tracked loss.
    averager = tf.train.ExponentialMovingAverage(0.9, name='avg')
    update_averages = averager.apply(all_losses)

    # Two summaries per loss: the raw value tagged '(raw)' and the averaged
    # value under the plain name, both without the 'tower_[0-9]/' prefix.
    tower_prefix = '%s_[0-9]*/' % cifar10.TOWER_NAME
    for tensor in all_losses:
        display_name = re.sub(tower_prefix, '', tensor.op.name)
        tf.scalar_summary(display_name +' (raw)', tensor)
        tf.scalar_summary(display_name, averager.average(tensor))

    # For correct batch norm execution the collected update ops must run
    # whenever the returned loss is evaluated.
    updates = tf.group(*tf.get_collection(tf.GraphKeys.UPDATE_OPS))
    with tf.control_dependencies([updates, update_averages]):
        total_loss = tf.identity(total_loss)
    return total_loss
def build_model_func():
    """Build the CIFAR-10 input, inference and loss ops and return the loss."""
    images, labels = cifar10.distorted_inputs()
    return cifar10.loss(cifar10.inference(images), labels)
def train():
    """Train CIFAR-10 for a number of steps.

    Runs the train op inside a ``MonitoredTrainingSession`` that uses
    ``FLAGS.train_dir`` as its checkpoint directory, requests a stop at
    ``FLAGS.max_steps`` and prints the loss every ``FLAGS.log_frequency``
    runs.
    """
    with tf.Graph().as_default():
        step_tensor = tf.train.get_or_create_global_step()

        # Force the input pipeline onto CPU:0 to avoid operations sometimes
        # ending up on GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, step_tensor)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Ask the session to fetch the loss with every run call.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return
                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                examples_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)
                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print (template % (datetime.now(), self._step,
                                   run_values.results, examples_per_sec,
                                   sec_per_batch))

        hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=hooks,
                config=config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

    Returns:
      Tensor of shape [] containing the total loss for a batch of data
    """
    # Both network variants read the same input pipeline; only the model
    # module that builds the logits differs.
    images, labels = cifar10.distorted_inputs()
    model = MyModel if tfFLAGS.network == 1 else MyModel2
    logits, fc1_w, fc1_b, fc2_w, fc2_b = model.inference(images)

    # Cross-entropy style loss from the cifar10 helper.
    loss = cifar10.loss(logits, labels)

    # L2 regularization for the fully connected parameters, summed in the
    # order fc1_w, fc1_b, fc2_w, fc2_b.
    regularizers = tf.nn.l2_loss(fc1_w)
    for fc_param in (fc1_b, fc2_w, fc2_b):
        regularizers = regularizers + tf.nn.l2_loss(fc_param)

    # NOTE(review): this regularized `loss` is never returned nor added to the
    # 'losses' collection here, so it may not reach `total_loss` -- confirm.
    loss = loss + 5e-4 * regularizers

    # Second call kept from the original: builds the loss ops again.
    # NOTE(review): if cifar10.loss registers into the 'losses' collection,
    # this registers a duplicate entry -- verify against cifar10.loss.
    cifar10.loss(logits, labels)

    # Collect the losses registered under this tower's scope and sum them.
    losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(losses, name='total_loss')

    # Strip the 'tower_[0-9]/' prefix so summaries share one name across
    # towers, then attach a scalar summary to every loss and the total.
    tower_prefix = '%s_[0-9]*/' % tfFLAGS.TOWER_NAME
    for tensor in losses + [total_loss]:
        tf.summary.scalar(re.sub(tower_prefix, '', tensor.op.name), tensor)

    return total_loss
def train():
    """Train CIFAR-10 for a number of steps.

    Builds the model in a fresh graph and runs the training op in a monitored
    session until a hook requests a stop.

    Side effect: the logging hook stores the index of the most recently
    completed step in the module-level global ``step_no``.
    """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime every 10 steps."""

            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                global step_no
                duration = time.time() - self._start_time
                loss_value = run_values.results
                if self._step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))
                # Publish the latest step index for code outside the hook.
                step_no = self._step

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                       tf.train.NanTensorHook(loss),
                       _LoggerHook()],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement,
                    inter_op_parallelism_threads=4,
                    intra_op_parallelism_threads=0)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)

        # NOTE: the original ended with a string-quoted (disabled) tracing
        # snippet that is truncated in this file; kept here as a comment so
        # the block parses:
        #   run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
def train():
    """Train the ``cifar10.inference3model`` network in the module-level graph ``g``.

    Builds inputs, logits, loss and the training op inside ``g``, reports
    graph statistics through ``stats_graph``, then trains in a monitored
    session (checkpointing every 60 seconds) until a hook requests a stop.
    """
    with g.as_default():
        step_tensor = tf.train.get_or_create_global_step()

        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference3model(images)
        stats_graph(g)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, step_tensor)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and throughput every ``FLAGS.log_frequency`` steps."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Return whatever should be inspected during the run; several
                # tensors can be passed as a list, e.g. [loss, accuracy].
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return
                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                # `results` mirrors what before_run() requested: the loss
                # here; it would be a list if a list had been requested.
                loss_value = run_values.results
                examples_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)

                print('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                      % (datetime.now(), self._step, loss_value,
                         examples_per_sec, sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                save_checkpoint_secs=60,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """Train CIFAR-10 for a number of steps.

    Builds the model in a fresh graph and runs the training op in a monitored
    session until a hook (e.g. the ``FLAGS.max_steps`` stop hook) requests a
    stop. Loss and throughput are printed every ``FLAGS.log_frequency`` steps.
    """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get (image, label) sample pairs for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build the graph that computes logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate the loss.
        loss = cifar10.loss(logits, labels)

        # Build the graph that trains the model with one batch of examples
        # and updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime"""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Request the loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    # Fixed: the timer lives on the hook instance and the
                    # logging interval on FLAGS; the bare names `_start_time`
                    # and `log_frequency` raised NameError.
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = (
                        FLAGS.batch_size * FLAGS.log_frequency / duration)
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %d, loss = %.2f(%.1f examples/sec, %.3f sec/batch)'
                    )
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """CIFAR-10 training function.

    Builds the model in a fresh graph and runs the training op in a monitored
    session until a hook requests a stop.
    """
    with tf.Graph().as_default():
        step_tensor = tf.contrib.framework.get_or_create_global_step()

        # Read images and labels from the data set.
        images, labels = cifar10.distorted_inputs()

        # Model predictions, their loss, and the op that trains on one batch
        # and updates the parameters.
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, step_tensor)

        class _LoggerHook(tf.train.SessionRunHook):
            """Record loss and run time."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Fetch the loss value with every run.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return
                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                loss_value = run_values.results
                examples_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)

                template = (
                    '%s: step %d, loss = %.5f (%.1f examples/sec; %.3f sec/batch)'
                )
                print(template % (datetime.now(), self._step, loss_value,
                                  examples_per_sec, sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """
    Train CIFAR-10 for a number of steps.

    Builds the model in a fresh graph and runs the training op in a monitored
    session until a hook requests a stop.

    :return: None
    """
    with tf.Graph().as_default():
        step_tensor = tf.train.get_or_create_global_step()

        # Fetch the CIFAR-10 images and labels on the CPU so it concentrates
        # on feeding input and the GPU does not stall waiting for data.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Predictions and their loss.
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)

        train_op = cifar10.train(loss, step_tensor)

        class _LoggerHook(tf.train.SessionRunHook):
            """Log loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return
                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                loss_value = run_values.results
                examples_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)

                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print(template % (datetime.now(), self._step, loss_value,
                                  examples_per_sec, sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """Train CIFAR-10 in segments, evaluating after each segment.

    The graph is built once. For every stop point ``0, log_frequency,
    2 * log_frequency, ... <= max_steps`` a new monitored session is opened
    on ``FLAGS.train_dir`` with a stop hook at that step, and afterwards
    both ``cifar10_eval.evaluate()`` and ``evaluate()`` are called.
    """
    with tf.Graph().as_default():
        step_tensor = tf.train.get_or_create_global_step()

        # Keep the input pipeline on CPU:0 so its ops do not sometimes end up
        # on the GPU and slow training down.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Predictions, loss, and the op that performs one training step.
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, step_tensor)

        class _LoggerHook(tf.train.SessionRunHook):
            """Counts steps and refreshes a timestamp; prints nothing."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return
                self._start_time = time.time()

        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        stop_points = range(0, FLAGS.max_steps + 1, FLAGS.log_frequency)
        for step in stop_points:
            print(str(step))
            segment_hooks = [
                tf.train.StopAtStepHook(last_step=step),
                tf.train.NanTensorHook(loss),
                _LoggerHook(),
            ]
            with tf.train.MonitoredTrainingSession(
                    checkpoint_dir=FLAGS.train_dir,
                    hooks=segment_hooks,
                    config=session_config) as mon_sess:
                while not mon_sess.should_stop():
                    mon_sess.run(train_op)

            # evaluate test data
            cifar10_eval.evaluate()
            # evaluate train data
            evaluate()
def train():
    """Train CIFAR-10 for a number of steps, logging batch accuracy.

    Besides the loss, the logging hook fetches logits and labels on every run
    and prints the training-batch accuracy every
    ``FLAGS.print_eval_interval`` steps.
    """
    with tf.Graph().as_default():
        step_tensor = tf.contrib.framework.get_or_create_global_step()

        # Inputs, predictions, loss, and the op performing one training step.
        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, step_tensor)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss, train accuracy and runtime."""

            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                # Ask for loss, raw predictions and their targets.
                return tf.train.SessionRunArgs([loss, logits, labels])

            def after_run(self, run_context, run_values):
                elapsed = time.time() - self._start_time
                loss_value, predictions, targets = run_values.results
                if self._step % FLAGS.print_eval_interval != 0:
                    return

                examples_per_sec = FLAGS.batch_size / elapsed
                sec_per_batch = float(elapsed)
                # Share of samples whose arg-max prediction equals the target.
                hits = np.argmax(predictions, 1) == targets
                accuracy = 100.0 * np.mean(hits)

                template = ('%s: step %d, loss = %.2f, train acc = %.1f%%, '
                            '(%.1f examples/sec; %.3f sec/batch)')
                print(template % (datetime.now(), self._step, loss_value,
                                  accuracy, examples_per_sec, sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                save_checkpoint_secs=FLAGS.save_checkpoint_secs,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """Start this process's role in a parameter-server / worker cluster.

    A ``ps`` task only joins its server. A ``worker`` task waits 10 seconds,
    lets the chief (task 0) recreate ``FLAGS.train_dir``, and then begins
    building the graph under a replica device setter.

    NOTE(review): the visible body stops after printing the input shape; the
    rest of the worker graph appears to be missing from this file.
    """
    global updated_batch_size_num
    global passed_info
    global shall_update

    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print ('PS hosts are: %s' % ps_hosts)
    print ('Worker hosts are: %s' % worker_hosts)
    issync = FLAGS.sync

    # server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
    #                          job_name = FLAGS.job_name,
    #                          task_index=FLAGS.task_id)
    cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    # batchSizeManager = BatchSizeManager(FLAGS.batch_size, len(worker_hosts))

    if FLAGS.job_name == 'ps':
        # rpcServer = batchSizeManager.create_rpc_server(ps_hosts[0].split(':')[0])
        # rpcServer.serve()
        server.join()
        return
    if FLAGS.job_name != "worker":
        return

    time.sleep(10)
    # rpcClient = batchSizeManager.create_rpc_client(ps_hosts[0].split(':')[0])

    is_chief = (FLAGS.task_index == 0)
    if is_chief:
        # The chief starts from an empty training directory.
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)

    # modified by faye
    # device_setter = tf.train.replica_device_setter(ps_tasks=1)
    # with tf.device('/job:worker/task:%d' % FLAGS.task_index):
    #     with tf.device(device_setter):
    #         global_step = tf.Variable(0, trainable=False)
    placement = tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % FLAGS.task_index,
        cluster=cluster)
    with tf.device(placement):
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0),
            trainable=False)

        decay_steps = 50000*350.0/FLAGS.batch_size

        # The batch size is fed at run time through this scalar placeholder.
        batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size')
        images, labels = cifar10.distorted_inputs(batch_size)
        print('zx0')
        print(images.get_shape().as_list())
def train():
    """Train CIFAR-10 on several GPUs, one model tower per GPU.

    Variables live on the CPU. Each GPU tower computes gradients on its own
    batch, the per-tower gradients are combined with ``average_gradients``
    and applied once per step. Progress is printed every 10 steps and a
    checkpoint is written every 1000 steps and on the final step.

    Relies on the module-level names ``batch_size``, ``num_gpus`` and
    ``max_steps``.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0),
            trainable=False)

        # Staircase exponential learning-rate decay.
        num_batches_per_epoch = (
            cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size)
        decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)
        lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        cifar10.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)
        opt = tf.train.GradientDescentOptimizer(lr)

        images, labels = cifar10.distorted_inputs()
        # Prefetch queue in front of the towers (the original author reports
        # roughly a 5x speed-up from using it).
        batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
            [images, labels], capacity=2 * FLAGS.num_gpus)

        tower_grads = []
        for i in range(num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
                    # Fixed: every tower previously received the same
                    # `images`/`labels` tensors and the prefetch queue was
                    # never read; dequeue a separate batch per tower.
                    image_batch, label_batch = batch_queue.dequeue()
                    loss = tower_loss(scope, image_batch, label_batch)
                    # Later towers reuse the variables of the first one.
                    tf.get_variable_scope().reuse_variables()
                    grads = opt.compute_gradients(loss)
                    tower_grads.append(grads)

        grads = average_gradients(tower_grads)
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # tf.global_variables() replaces the deprecated tf.all_variables().
        saver = tf.train.Saver(tf.global_variables())
        init = tf.global_variables_initializer()
        # Soft placement lets ops without a GPU kernel fall back to the CPU.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)

        for step in range(max_steps):
            start_time = time.time()
            # `loss` is the loss of the last tower built above.
            _, loss_value = sess.run([apply_gradient_op, loss])
            duration = time.time() - start_time

            if step % 10 == 0:
                num_examples_per_step = batch_size * num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / num_gpus

                format_str = ('step %d, loss= %.2f (%.1f example/sec; %.3f'
                              'sec/batch)')
                print(format_str % (step, loss_value, examples_per_sec,
                                    sec_per_batch))

            if step % 1000 == 0 or (step + 1) == max_steps:
                # NOTE(review): the checkpoint prefix is the bare path '/' --
                # confirm this is the intended save location.
                saver.save(sess, '/', global_step=step)
def train():
    """Smoke-test the CIFAR-10 input pipeline.

    Pulls 10 batches from ``cifar10.distorted_inputs()`` and prints the shape
    of each image batch. The session is closed and the queue-runner threads
    are stopped and joined even if fetching a batch fails.
    """
    with tf.Graph().as_default():
        images, labels = cifar10.distorted_inputs()

        # Fixed: the session was never closed; the context manager releases it.
        with tf.Session() as sess:
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            try:
                for _ in range(10):
                    im, label = sess.run([images, labels])
                    print(im.shape)
            finally:
                # Always stop and join the input threads, also on error.
                coord.request_stop()
                coord.join(threads)
def train():
    """Train CIFAR-10 for a number of steps.

    Builds the model in a fresh graph and runs the training op in a monitored
    session until a hook requests a stop. Loss and throughput are printed
    every ``FLAGS.log_frequency`` steps.
    """
    # Build everything below in a new graph used as the default graph.
    with tf.Graph().as_default():
        # Step counter handed to cifar10.train.
        global_step = tf.train.get_or_create_global_step()

        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Inference model.
        logits = cifar10.inference(images)

        loss = cifar10.loss(logits, labels)

        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """
            logs loss and runtime
            """

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(
                    self,
                    run_context,  # pylint: disable=unused-argument
                    run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    example_per_sec = (
                        FLAGS.log_frequency * FLAGS.batch_size / duration)
                    sec_per_batch = float(duration / FLAGS.log_frequency)
                    format_str = (
                        '%s: step %d,loss=%.2f (%.1f example/sec);%.3f sec/batch'
                    )
                    # Fixed: the template used to be printed without its
                    # values, so the log showed the raw format string.
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        example_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def run_training():
    """Train the CIFAR-10 model with the customized (L-BFGS) optimizer.

    The model is built on placeholders; the optimizer interface receives the
    real input tensors as ``data_fetches`` together with the placeholders.
    Any exception raised while starting the queue runners or optimizing is
    reported to the coordinator, which re-raises it when the threads are
    joined.
    """
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        global_step = tf.Variable(0, trainable=False)

        # Input pipeline stays on the CPU.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size, cifar10.IMAGE_SIZE)
        logits = cifar10.inference(images_placeholder)
        losses_dict = cifar10.loss(logits, labels_placeholder)
        moving_averages_op = cifar10.add_summaries_and_moving_avgs(
            losses_dict, global_step)

        lbfgs_optimizer = customized_optimizer.CustomizedOptimizerInterface(
            global_step=global_step,
            loss_dict=losses_dict,
            data_fetches=[images, labels],
            data_placeholders=(images_placeholder, labels_placeholder),
            maxiter=FLAGS.max_steps)

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=25)
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement,
                gpu_options=tf.GPUOptions(
                    per_process_gpu_memory_fraction=0.4))) as sess:
            sess.run(init)

            coordinator = tf.train.Coordinator()
            # Fixed: bind `threads` before the try so the join below cannot
            # hit an unbound name when start_queue_runners itself raises.
            threads = []
            try:
                threads = tf.train.start_queue_runners(sess=sess,
                                                       coord=coordinator)
                lbfgs_optimizer.minimize(session=sess,
                                         moving_averages_op=moving_averages_op,
                                         summary_op=summary_op,
                                         saver=saver,
                                         step_callback=step_callback)
            except Exception as e:
                # Hand the error to the coordinator; join() re-raises it.
                coordinator.request_stop(e)

            coordinator.request_stop()
            coordinator.join(threads, stop_grace_period_secs=10)
def train():
    """Train the fixed-point CIFAR-10 model for ``FLAGS.max_steps`` steps.

    Runs in an interactive session: writes a training summary every step,
    saves a checkpoint every 10 steps, runs the fixed-point update ops every
    5 steps, and prints the loss every step.
    """
    sess = tf.InteractiveSession()
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    # Get images and labels for CIFAR-10.
    with tf.device('/cpu:0'):
        images, labels = cifar10.distorted_inputs()

    # Define all the fixed point variables used later.
    cifar10.initialize_fix_point_variables()

    # Logits, loss and the op that trains on one batch of examples.
    logits = cifar10.inference(images)
    loss = cifar10.loss(logits, labels)
    train_op = cifar10.train(loss, 0.05)

    # Ops that update the fixed point conversion parameters when needed.
    update_fix_pt_ops = cifar10.update_fix_point_accuracy()

    # Merge all summaries; writers go under FLAGS.log_dir.
    merged_summary = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')

    # Initialise variables, then create the checkpoint saver.
    tf.global_variables_initializer().run()
    saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    # Needed on an interactive session so it doesn't hang.
    tf.train.start_queue_runners()

    for i in range(FLAGS.max_steps):
        fetched = sess.run([merged_summary, train_op])
        train_writer.add_summary(fetched[0], i)

        if i % 10 == 0:
            saver.save(sess, FLAGS.log_dir + '/checkpoint', global_step=i)
        if i % 5 == 0:
            sess.run([update_fix_pt_ops])
        # loss.eval() runs the graph again for this print.
        print('Step: %s, Loss: %s' % (i, loss.eval()))

    train_writer.close()
def train():
    """Train CIFAR-10 for ``FLAGS.max_steps`` steps in a plain session.

    Prints loss and throughput every 10 steps, writes summaries every 100
    steps, and saves a checkpoint every 1000 steps and on the final step.
    Aborts through an assertion if the loss becomes NaN.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Inputs, predictions, loss and the training op.
        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()
        init = tf.initialize_all_variables()

        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_config)
        sess.run(init)

        # The input pipeline is queue based; start its runner threads.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                graph_def=sess.graph_def)

        for step in xrange(FLAGS.max_steps):
            started = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - started

            assert not np.isnan(loss_value), "Model diverged with loss = NaN"

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / duration
                sec_per_batch = float(duration)
                template = "%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)"
                print(template % (datetime.now(), step, loss_value,
                                  examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_writer.add_summary(sess.run(summary_op), step)

            is_final_step = (step + 1) == FLAGS.max_steps
            if step % 1000 == 0 or is_final_step:
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                saver.save(sess, checkpoint_path, global_step=step)
def train_vgg16():
    """Run the network from ``inference_op`` through ``time_tensorflow_run``.

    Two runs are made on CIFAR-10 input batches: the prediction alone
    ("Forward", keep probability 1.0) and the gradients of an L2 loss on
    ``fc8`` with respect to the parameters ``p`` ("Forward-backward", keep
    probability 0.5).
    """
    with tf.Graph().as_default():
        image_size = 224  # input image size

        # Random input to check that the graph runs at all:
        # images = tf.Variable(tf.random_normal([batch_size, image_size, image_size, 3], dtype=tf.float32, stddev=1e-1))
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        keep_prob = tf.placeholder(tf.float32)
        prediction, softmax, fc8, p = inference_op(images, keep_prob)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        time_tensorflow_run(sess, prediction, {keep_prob: 1.0}, "Forward")

        # Simulate a training step: use an L2 loss on fc8 as the objective
        # and take its gradients with respect to all model parameters.
        objective = tf.nn.l2_loss(fc8)
        grad = tf.gradients(objective, p)
        time_tensorflow_run(sess, grad, {keep_prob: 0.5}, "Forward-backward")
def train():
    """Train CIFAR-10 for a number of steps.

    Runs ``FLAGS.max_steps`` steps in a plain session: prints loss and
    throughput every 10 steps, writes summaries every 100 steps, and saves a
    checkpoint every 1000 steps and on the final step. Aborts through an
    assertion if the loss becomes NaN.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Inputs, predictions, loss, and the op performing one training step.
        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        # Saver over all variables and the merged summary op.
        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()

        # # Visualize conv1 features
        # with tf.variable_scope('conv1') as scope_conv:
        #   #tf.get_variable_scope().reuse_variables()
        #   scope_conv.reuse_variables()
        #   weights = tf.get_variable('weights')
        #   grid_x = grid_y = 8   # to get a square grid for 64 conv1 features
        #   grid = put_kernels_on_grid (weights, (grid_y, grid_x))
        #   tf.image_summary('conv1/features', grid, max_images=1)

        init = tf.initialize_all_variables()

        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_config)
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                graph_def=sess.graph_def)

        for step in xrange(FLAGS.max_steps):
            started = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - started

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / float(duration)
                sec_per_batch = float(duration)
                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print(template % (datetime.now(), step, loss_value,
                                  examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_writer.add_summary(sess.run(summary_op), step)

            # Save the model checkpoint periodically and at the end.
            is_final_step = (step + 1) == FLAGS.max_steps
            if step % 1000 == 0 or is_final_step:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def train(lambs):
    """Train CIFAR-10 for a number of steps, resuming from a checkpoint.

    Args:
      lambs: value converted to a float32 constant and passed to
        ``infor.loss`` as its third argument.

    If ``FLAGS.checkpoint_dir`` holds a checkpoint, the variables are restored
    from it and the step counter continues from the number embedded in the
    checkpoint file name; otherwise the variables are freshly initialised and
    training starts at step 0. Prints every 10 steps, writes summaries every
    100 steps and saves a checkpoint every 1000 steps.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        lambs = tf.constant(lambs, dtype=tf.float32)
        loss = infor.loss(logits, labels, lambs)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op, lr_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Assuming model_checkpoint_path looks something like:
            #   /my-favorite-path/cifar10_train/model.ckpt-0,
            # extract global_step from it.
            # Fixed: convert to int; the raw string broke `step <= max_steps`
            # and `step % 10` below.
            previous_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            print ('Start training from previous')
        else:
            print('Strart training from step 0')
            previous_step = 0
            # Fixed: initialise only when nothing was restored; running the
            # initializer after restore overwrote the restored weights.
            init = tf.initialize_all_variables()
            sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                graph_def=sess.graph_def)

        step = previous_step
        while step <= FLAGS.max_steps:
            start_time = time.time()
            _, loss_value, lr_value = sess.run([train_op, loss, lr_op])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f, lr_value = %.4f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, lr_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

            step += 1
def train():
    """Train a model for a number of steps.

    Restores from ``FLAGS.checkpoint_dir`` when a checkpoint exists and
    continues from the step number in its file name; otherwise initialises
    the variables and starts at step 0. Precision and per-category accuracy
    are printed every step, a progress line every 10 steps, summaries are
    written every 100 steps and a checkpoint is saved every 1000 steps and on
    the final step.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for a segmentation model.
        images, labels, ground_truth = cifar10.distorted_inputs()
        tf.histogram_summary('label_hist/with_ignore', labels)
        tf.histogram_summary('label_hist/ground_truth', ground_truth)

        # Build the logits predictions from the inference model.
        print("before inference")
        print(images.get_shape())
        logits, nr_params = cifar10.inference(images)
        print("nr_params: "+str(nr_params) )
        print("after inference")

        # Loss on the labels; accuracy metrics on the ground truth.
        loss = cifar10.loss(logits, labels)
        accuracy, precision, cat_accs = cifar10.accuracy(logits, ground_truth)

        # Op that trains on one batch and updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())

        # tf.image_summary('images2', images)
        print (logits)
        # tf.image_summary('predictions', logits)

        summary_op = tf.merge_all_summaries()
        init = tf.initialize_all_variables()

        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_config)

        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            # Restore, then read the step from a path that looks like
            #   /my-favorite-path/cifar10_train/model.ckpt-0
            saver.restore(sess, ckpt.model_checkpoint_path)
            checkpoint_name = ckpt.model_checkpoint_path.split('/')[-1]
            start_step = int(checkpoint_name.split('-')[-1])
        else:
            print('No checkpoint file found')
            print('Initializing new model')
            sess.run(init)
            start_step = 0

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        fetches = [train_op, loss, accuracy, precision, cat_accs]
        for step in xrange(start_step, FLAGS.max_steps):
            started = time.time()
            (_, loss_value, accuracy_value,
             precision_value, cat_accs_val) = sess.run(fetches)
            duration = time.time() - started

            print (precision_value)
            print (cat_accs_val)

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            # precision_value = [0 if np.isnan(p) else p for p in precision_value]
            # print (precision_value)

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / duration
                sec_per_batch = float(duration)
                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)\n Accuracy = %.4f, mean average precision = %.4f')
                print (template % (datetime.now(), step, loss_value,
                                   examples_per_sec, sec_per_batch,
                                   accuracy_value, np.mean(precision_value)))

            if step % 100 == 0:
                summary_writer.add_summary(sess.run(summary_op), step)

                # Hand-built summary with the raw metric values of this step.
                summary = tf.Summary()
                summary.value.add(tag='Accuracy (raw)',
                                  simple_value=float(accuracy_value))
                for i, s in enumerate(CLASSES):
                    summary.value.add(tag="precision/"+s+" (raw)",
                                      simple_value=float(precision_value[i]))
                    summary.value.add(tag="accs/"+s+" (raw)",
                                      simple_value=float(cat_accs_val[i]))
                # summary.value.add(tag='Human precision (raw)', simple_value=float(precision_value))
                summary_writer.add_summary(summary, step)
                print("hundred steps")

            # Save the model checkpoint periodically and at the end.
            is_final_step = (step + 1) == FLAGS.max_steps
            if step % 1000 == 0 or is_final_step:
                print("thousand steps")
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def main(_):
    """Run one task of a distributed CIFAR-10 training cluster.

    The cluster layout comes from the comma-separated ``FLAGS.ps_hosts`` and
    ``FLAGS.worker_hosts``. A task whose ``FLAGS.job_name`` is ``"ps"`` just
    joins its server. A ``"worker"`` task builds the CIFAR-10 training graph
    and runs the training op inside a ``MonitoredTrainingSession`` until
    ``FLAGS.max_steps`` is reached. Any other job name does nothing after the
    server has been created.
    """

    class _LoggerHook(tf.train.SessionRunHook):
        """Logs loss and runtime."""

        def begin(self):
            self._step = -1

        def before_run(self, run_context):
            self._step += 1
            self._start_time = time.time()
            # `loss` is looked up in the enclosing function when the hook
            # runs; it is only defined on the worker path below.
            return tf.train.SessionRunArgs(loss)  # Asks for loss value.

        def after_run(self, run_context, run_values):
            elapsed = time.time() - self._start_time
            if self._step % 10 != 0:
                return

            current_loss = run_values.results
            throughput = FLAGS.batch_size / elapsed
            template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
            print(template % (datetime.now(), self._step, current_loss,
                              throughput, float(elapsed)))

    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({
        "ps": FLAGS.ps_hosts.split(","),
        "worker": FLAGS.worker_hosts.split(","),
    })

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
        return
    if FLAGS.job_name != "worker":
        return

    # Assigns ops to the local worker by default.
    worker_device = "/job:worker/task:%d" % FLAGS.task_index
    device_setter = tf.train.replica_device_setter(
        worker_device=worker_device, cluster=cluster)
    with tf.device(device_setter):
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build inference Graph.
        logits = cifar10.inference(images)

        # Build the portion of the Graph calculating the losses.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

    # The StopAtStepHook handles stopping after running given steps.
    hooks = [
        tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
        _LoggerHook(),
    ]

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    session = tf.train.MonitoredTrainingSession(
        master=server.target,
        is_chief=(FLAGS.task_index == 0),
        checkpoint_dir=FLAGS.train_dir,
        save_checkpoint_secs=60,
        hooks=hooks)
    with session as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step asynchronously.
            # See `tf.train.SyncReplicasOptimizer` for additional details on
            # how to perform *synchronous* training.
            # mon_sess.run handles AbortedError in case of preempted PS.
            mon_sess.run(train_op)
def train():
    """Train CIFAR-10 for a number of steps.

    The network is ``MyModel`` when ``tfFLAGS.network == 1`` and ``MyModel2``
    otherwise. The loss from ``cifar10.loss`` is extended with an L2 penalty
    (weight 5e-4) on the two fully connected layers' weights and biases.
    Before training starts, the number of parameters of every trainable
    variable is printed. Training then runs on CPU only (``device_count``
    sets GPU to 0) until ``tfFLAGS.max_steps``, logging every
    ``tfFLAGS.log_frequency`` steps and checkpointing to
    ``tfFLAGS.train_dir``.
    """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10. The input pipeline is built
        # exactly once; building it again per model branch would only add a
        # second, unused copy of the input ops to the graph.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # selected inference model.
        model = MyModel if tfFLAGS.network == 1 else MyModel2
        logits, fc1_w, fc1_b, fc2_w, fc2_b = model.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # L2 regularization for the fully connected parameters.
        regularizers = (tf.nn.l2_loss(fc1_w) + tf.nn.l2_loss(fc1_b) +
                        tf.nn.l2_loss(fc2_w) + tf.nn.l2_loss(fc2_b))
        # Add the regularization term to the loss.
        loss += 5e-4 * regularizers

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % tfFLAGS.log_frequency == 0:
                    # Timing covers all steps since the previous log line.
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = (tfFLAGS.log_frequency *
                                        tfFLAGS.batch_size / duration)
                    sec_per_batch = float(duration / tfFLAGS.log_frequency)

                    format_str = ('%s: step %d, loss = %.2f (%.1f '
                                  'examples/sec; %.3f sec/batch)')
                    print_(format_str % (datetime.now(), self._step,
                                         loss_value, examples_per_sec,
                                         sec_per_batch))

        # Labels for the trainable variables, in creation order.
        texts = ['conv1:', 'conv1Biases:', 'conv2:', 'conv2Biases:',
                 'local3:', 'local3Biases:', 'local4:', 'local4Biases:',
                 'softmax:', 'softmaxBiases:']
        total_parameters = 0
        for index, variable in enumerate(tf.trainable_variables()):
            variable_parameters = 1
            for dim in variable.get_shape():
                variable_parameters *= dim.value
            # A model with more trainable variables than labels falls back to
            # the variable's own name instead of failing with an IndexError.
            if index < len(texts):
                label = texts[index]
            else:
                label = variable.op.name + ':'
            print('Number of hidden parameters of ' + label,
                  variable_parameters)
            total_parameters += variable_parameters
        print('Total Number of hidden parameters:', total_parameters)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=tfFLAGS.train_dir,
                hooks=[tf.train.StopAtStepHook(last_step=tfFLAGS.max_steps),
                       tf.train.NanTensorHook(loss),
                       _LoggerHook()],
                config=tf.ConfigProto(
                    device_count={'GPU': 0},
                    log_device_placement=tfFLAGS.log_device_placement)
        ) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def main_fun(argv, ctx):
    """Train CIFAR-10 on one node after starting a cluster server for it.

    Args:
        argv: command-line arguments; installed as ``sys.argv`` so that the
            TensorFlow flags defined below are parsed from them.
        ctx: node context object, passed through to
            ``TFNode.start_cluster_server``.

    ``FLAGS.train_dir`` is deleted if it exists and recreated before training.
    Training runs in a ``MonitoredTrainingSession`` until ``FLAGS.max_steps``,
    stopping on a NaN loss and logging every 10 steps.
    """
    import tensorflow as tf
    import cifar10

    sys.argv = argv
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                        """Directory where to write event logs """
                        """and checkpoint.""")
    flags.DEFINE_integer('max_steps', 1000000,
                         """Number of batches to run.""")
    flags.DEFINE_boolean('log_device_placement', False,
                         """Whether to log device placement.""")
    flags.DEFINE_boolean('rdma', False,
                         """Whether to use rdma.""")

    # Always start from an empty training directory.
    train_dir = FLAGS.train_dir
    if tf.gfile.Exists(train_dir):
        tf.gfile.DeleteRecursively(train_dir)
    tf.gfile.MakeDirs(train_dir)

    # Neither return value is used below; the call is kept for its effect.
    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

    # Train CIFAR-10 for a number of steps.
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Input pipeline -> inference model -> loss -> training op.
        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                elapsed = time.time() - self._start_time
                if self._step % 10 != 0:
                    return

                current_loss = run_values.results
                throughput = FLAGS.batch_size / elapsed
                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                            '%.3f sec/batch)')
                print(template % (datetime.now(), self._step, current_loss,
                                  throughput, float(elapsed)))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """Train CIFAR-10 on ``FLAGS.num_gpus`` GPUs for a number of steps.

    One model tower is built per GPU with shared variables. Each tower
    dequeues its own batch from a prefetch queue, the per-tower gradients are
    averaged by ``average_gradients`` and applied with a gradient descent
    optimizer whose learning rate decays exponentially (staircase) with the
    global step. Moving averages of all trainable variables are maintained as
    part of the training op.

    Progress is printed every 10 steps, summaries are written to
    ``FLAGS.train_dir`` every 100 steps, and a checkpoint is saved there every
    1000 steps and after the final step (``FLAGS.max_steps``).

    Raises:
        AssertionError: if the loss becomes NaN.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals
        # the number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        cifar10.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.GradientDescentOptimizer(lr)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()
        batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
            [images, labels], capacity=2 * FLAGS.num_gpus)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope(
                            '%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
                        # Dequeues one batch for the GPU.
                        image_batch, label_batch = batch_queue.dequeue()

                        # Calculate the loss for one tower of the CIFAR
                        # model. This function constructs the entire CIFAR
                        # model but shares the variables across all towers.
                        loss = tower_loss(scope, image_batch, label_batch)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(
                            tf.GraphKeys.SUMMARIES, scope)

                        # Calculate the gradients for the batch of data on
                        # this CIFAR tower.
                        grads = opt.compute_gradients(loss)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations. The session is a context manager so that it is
        # closed even when training aborts (e.g. on a NaN loss).
        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)
        with tf.Session(config=session_config) as sess:
            sess.run(init)

            # Start the queue runners under a coordinator so the input
            # threads can be stopped and joined once training is over.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                                   sess.graph)
            try:
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()
                    # `loss` is the loss of the last tower built above.
                    _, loss_value = sess.run([train_op, loss])
                    duration = time.time() - start_time

                    assert not np.isnan(loss_value), \
                        'Model diverged with loss = NaN'

                    if step % 10 == 0:
                        num_examples_per_step = (FLAGS.batch_size *
                                                 FLAGS.num_gpus)
                        examples_per_sec = num_examples_per_step / duration
                        sec_per_batch = duration / FLAGS.num_gpus

                        format_str = ('%s: step %d, loss = %.2f (%.1f '
                                      'examples/sec; %.3f sec/batch)')
                        print(format_str % (datetime.now(), step, loss_value,
                                            examples_per_sec, sec_per_batch))

                    if step % 100 == 0:
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                    # Save the model checkpoint periodically.
                    if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_path = os.path.join(FLAGS.train_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_path, global_step=step)
            finally:
                # Flush pending events and shut the input threads down.
                summary_writer.close()
                coord.request_stop()
                coord.join(threads)
def train():
    """Train CIFAR-10 for a number of steps.

    Builds the input pipeline, inference graph, loss and training op through
    the ``cifar10`` module, initializes all variables and runs
    ``FLAGS.max_steps`` training steps.

    Progress is printed every 10 steps, summaries are written (and flushed)
    to ``FLAGS.train_dir`` every 100 steps, and a checkpoint is saved there
    every 1000 steps and after the final step.

    Raises:
        AssertionError: if the loss becomes NaN.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Show which cifar10 module is actually being used.
        print("------------------- train calling interference ---------------------")
        print(cifar10.__file__)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # The session is used as a context manager so that it is closed even
        # when training aborts (e.g. on a NaN loss).
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        with tf.Session(config=session_config) as sess:
            sess.run(init)

            # Start the queue runners under a coordinator so the input
            # threads can be stopped and joined once training is over.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                    sess.graph)
            try:
                for step in xrange(FLAGS.max_steps):
                    # TODO: feed images and labels manually before this
                    # sess.run():
                    #   1. have the Cifar10 dataset in memory
                    #   2. create a mini-batch
                    #   3. set the placeholders/vars to the mini-batch data
                    #   4. run one forward-backward step
                    start_time = time.time()
                    _, loss_value = sess.run([train_op, loss])
                    duration = time.time() - start_time

                    assert not np.isnan(loss_value), \
                        'Model diverged with loss = NaN'

                    if step % 10 == 0:
                        num_examples_per_step = FLAGS.batch_size
                        examples_per_sec = num_examples_per_step / duration
                        sec_per_batch = float(duration)

                        format_str = ('%s: step %d, loss = %.2f (%.1f '
                                      'examples/sec; %.3f sec/batch)')
                        print(format_str % (datetime.now(), step, loss_value,
                                            examples_per_sec, sec_per_batch))

                    # Summary cadence restored to every 100 steps; the more
                    # frequent write was a temporary debugging change.
                    if step % 100 == 0:
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Save the model checkpoint periodically.
                    if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_path = os.path.join(FLAGS.train_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_path, global_step=step)
            finally:
                # Flush pending events and shut the input threads down.
                summary_writer.close()
                coord.request_stop()
                coord.join(threads)