def run_training():
    """Train MNIST until the input pipeline is exhausted.

    Builds the model in a fresh graph, starts the queue-runner threads and
    keeps running ``train_op`` until ``tf.errors.OutOfRangeError`` is raised
    or the coordinator requests a stop.  Progress is printed every 100 steps.
    """
    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Input images and labels.
        images, labels = inputs(train=True,
                                batch_size=FLAGS.batch_size,
                                num_epochs=FLAGS.num_epochs)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)

        # Add to the Graph the loss calculation.
        loss = mnist.loss(logits, labels)

        # Add to the Graph operations that train the model.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # The op for initializing the variables.
        init_op = tf.initialize_all_variables()

        # Using the session as a context manager guarantees it is closed even
        # when a training step raises something other than OutOfRangeError.
        with tf.Session() as sess:
            # Initialize the variables (the trained variables and the
            # epoch counter).
            sess.run(init_op)

            # Start input enqueue threads.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            step = 0
            try:
                while not coord.should_stop():
                    start_time = time.time()

                    # Run one step of the model.  The value returned for
                    # `train_op` is discarded; only the loss is kept.
                    _, loss_value = sess.run([train_op, loss])

                    duration = time.time() - start_time

                    # Print an overview fairly often.
                    if step % 100 == 0:
                        print("Step %d: loss = %.2f (%.3f sec)" %
                              (step, loss_value, duration))
                    step += 1
            except tf.errors.OutOfRangeError:
                print("Done training for %d epochs, %d steps." %
                      (FLAGS.num_epochs, step))
            finally:
                # When done, ask the threads to stop.
                coord.request_stop()

            # Wait for threads to finish before the session is closed.
            coord.join(threads)
def run_training():
    """Run the MNIST training loop until the input pipeline raises OutOfRangeError."""
    with tf.Graph().as_default():
        # Batches of images and labels produced by the input pipeline.
        batch_images, batch_labels = inputs(
            train=True,
            batch_size=FLAGS.batch_size,
            num_epochs=FLAGS.num_epochs)

        # Model: predictions -> loss -> optimisation step.
        predictions = mnist.inference(batch_images, FLAGS.hidden1,
                                      FLAGS.hidden2)
        loss_op = mnist.loss(predictions, batch_labels)
        optimize_op = mnist.training(loss_op, FLAGS.learning_rate)

        # One op that initialises both global and local variables.
        initializer = tf.group(tf.global_variables_initializer(),
                               tf.local_variables_initializer())

        with tf.Session() as sess:
            sess.run(initializer)
            try:
                steps_done = 0
                # The loop has no exit condition of its own; it ends when
                # sess.run raises OutOfRangeError.
                while True:
                    tick = time.time()
                    # The value returned for the optimisation op is not needed.
                    _, current_loss = sess.run([optimize_op, loss_op])
                    elapsed = time.time() - tick

                    # Report progress every 100th step.
                    if not steps_done % 100:
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (steps_done, current_loss, elapsed))
                    steps_done += 1
            except tf.errors.OutOfRangeError:
                print('Done training for %d epochs, %d steps.' %
                      (FLAGS.num_epochs, steps_done))
def run_training():
    """Restore the latest checkpoint, if any, and evaluate MNIST on the test set.

    Despite its name this variant performs no training steps: it rebuilds the
    graph, initialises the variables, restores the checkpoint found in
    ``FLAGS.train_dir`` when one exists, and runs ``do_eval`` on the test data.
    When no checkpoint is found the freshly initialised model is evaluated.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)

        # The loss and training ops are never run here.
        # NOTE(review): presumably they are built so that the graph defines
        # the same variables as the checkpointed training graph - confirm.
        loss = mnist.loss(logits, labels_placeholder)
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Create a saver used to restore training checkpoints.
        saver = tf.train.Saver(tf.all_variables())

        # Op that initializes the variables.
        init = tf.initialize_all_variables()

        # The context manager releases the session's resources on exit,
        # including when restore or evaluation raises.
        with tf.Session() as sess:
            sess.run(init)

            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                print('...no checkpoint found...')

            # Evaluate against the test set.
            print('Test Data Eval:')
            do_eval(sess, eval_correct, images_placeholder,
                    labels_placeholder, data_sets.test)
def run_training():
    """Train MNIST for ``FLAGS.max_steps`` steps.

    Every 100 steps the loss is printed and a summary is written; every 1000
    steps (and on the last step) a checkpoint is saved and the model is
    evaluated on the training, validation and test sets.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.merge_all_summaries()

        # Add the variable initializer Op.
        init = tf.initialize_all_variables()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # The context manager closes the session even if a step fails.
        with tf.Session() as sess:
            # Instantiate a SummaryWriter to output summaries and the Graph.
            summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                    sess.graph)
            try:
                # Run the Op to initialize the variables.
                sess.run(init)

                # Start the training loop.
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()

                    # Fill a feed dictionary with the actual set of images
                    # and labels for this particular training step.
                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # Run one step of the model.  The value returned for
                    # `train_op` is discarded; only the loss is kept.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    duration = time.time() - start_time

                    # Write the summaries and print an overview fairly often.
                    if step % 100 == 0:
                        # Print status to stdout.
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file.
                        summary_str = sess.run(summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Save a checkpoint and evaluate the model periodically.
                    if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_file = os.path.join(FLAGS.train_dir,
                                                       'checkpoint')
                        saver.save(sess, checkpoint_file, global_step=step)
                        # Evaluate against the training set.
                        print('Training Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.train)
                        # Evaluate against the validation set.
                        print('Validation Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.validation)
                        # Evaluate against the test set.
                        print('Test Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.test)
            finally:
                # Flush pending events and release the events file.
                summary_writer.close()
def run_training():
    """Train MNIST with an additional clustering op for ``FLAGS.max_steps`` steps.

    Returns:
        The largest value returned by ``do_eval`` on the test set over all
        periodic evaluations (0 if no evaluation ran).
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.  The data is read from a fresh temporary directory.
    train_dir = tempfile.mkdtemp()
    data_sets = input_data.read_data_sets(train_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs()

        # Build a Graph that computes predictions from the inference model.
        logits, clustering_loss, kmeans_training_op = inference(
            images_placeholder, FLAGS.num_clusters, FLAGS.hidden1,
            FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # One op that applies the gradient step and the clustering update.
        train_op = tf.group(mnist.training(loss, FLAGS.learning_rate),
                            kmeans_training_op)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Add the variable initializer Op.
        init = tf.initialize_all_variables()

        # The context manager closes the session on every exit path.
        with tf.Session() as sess:
            # The initializer is run with a 5000-example feed.
            # NOTE(review): presumably the clustering variables are
            # initialized from the fed images - confirm in inference().
            feed_dict = fill_feed_dict(data_sets.train,
                                       images_placeholder,
                                       labels_placeholder,
                                       batch_size=5000)
            sess.run(init, feed_dict=feed_dict)

            # Start the training loop.
            max_test_prec = 0
            for step in xrange(FLAGS.max_steps):
                start_time = time.time()

                # Fill a feed dictionary with the actual set of images and
                # labels for this particular training step.
                feed_dict = fill_feed_dict(data_sets.train,
                                           images_placeholder,
                                           labels_placeholder,
                                           FLAGS.batch_size)

                # Run one step of the model.
                _, loss_value, clustering_loss_value = sess.run(
                    [train_op, loss, clustering_loss], feed_dict=feed_dict)

                duration = time.time() - start_time
                if step % 100 == 0:
                    # Print status to stdout.
                    print('Step %d: loss = %.2f, clustering_loss = %.2f (%.3f sec)' % (
                        step, loss_value, clustering_loss_value, duration))

                # Evaluate the model periodically and on the last step.
                if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                    # Evaluate against the training set.
                    print('Training Data Eval:')
                    do_eval(sess, eval_correct, images_placeholder,
                            labels_placeholder, data_sets.train)
                    # Evaluate against the validation set.
                    print('Validation Data Eval:')
                    do_eval(sess, eval_correct, images_placeholder,
                            labels_placeholder, data_sets.validation)
                    # Evaluate against the test set.
                    print('Test Data Eval:')
                    test_prec = do_eval(sess, eval_correct,
                                        images_placeholder,
                                        labels_placeholder, data_sets.test)
                    max_test_prec = max(max_test_prec, test_prec)

            return max_test_prec
def run_training():
    """Train MNIST with an additional clustering op for ``FLAGS.max_steps`` steps.

    Returns:
        The largest value returned by ``do_eval`` on the test set over all
        periodic evaluations (0 if no evaluation ran).
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.  The data is read from a fresh temporary directory.
    train_dir = tempfile.mkdtemp()
    data_sets = input_data.read_data_sets(train_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs()

        # Build a Graph that computes predictions from the inference model.
        logits, clustering_loss, kmeans_training_op = inference(
            images_placeholder, FLAGS.num_clusters, FLAGS.hidden1,
            FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # One op that applies the gradient step and the clustering update.
        train_op = tf.group(mnist.training(loss, FLAGS.learning_rate),
                            kmeans_training_op)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Add the variable initializer Op.
        init = tf.initialize_all_variables()

        # Create a session for running Ops on the Graph; the context manager
        # makes sure it is closed when the function returns or raises.
        with tf.Session() as sess:
            # The initializer is run with a 5000-example feed.
            # NOTE(review): presumably the clustering variables are
            # initialized from the fed images - confirm in inference().
            feed_dict = fill_feed_dict(data_sets.train,
                                       images_placeholder,
                                       labels_placeholder,
                                       batch_size=5000)
            # Run the Op to initialize the variables.
            sess.run(init, feed_dict=feed_dict)

            # Start the training loop.
            max_test_prec = 0
            for step in xrange(FLAGS.max_steps):
                start_time = time.time()

                # Fill a feed dictionary with the actual set of images and
                # labels for this particular training step.
                feed_dict = fill_feed_dict(data_sets.train,
                                           images_placeholder,
                                           labels_placeholder,
                                           FLAGS.batch_size)

                # Run one step of the model.
                _, loss_value, clustering_loss_value = sess.run(
                    [train_op, loss, clustering_loss], feed_dict=feed_dict)

                duration = time.time() - start_time
                if step % 100 == 0:
                    # Print status to stdout.
                    print(
                        'Step %d: loss = %.2f, clustering_loss = %.2f (%.3f sec)'
                        % (step, loss_value, clustering_loss_value, duration))

                # Evaluate the model periodically and on the last step.
                if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                    # Evaluate against the training set.
                    print('Training Data Eval:')
                    do_eval(sess, eval_correct, images_placeholder,
                            labels_placeholder, data_sets.train)
                    # Evaluate against the validation set.
                    print('Validation Data Eval:')
                    do_eval(sess, eval_correct, images_placeholder,
                            labels_placeholder, data_sets.validation)
                    # Evaluate against the test set.
                    print('Test Data Eval:')
                    test_prec = do_eval(sess, eval_correct,
                                        images_placeholder,
                                        labels_placeholder, data_sets.test)
                    max_test_prec = max(max_test_prec, test_prec)

            return max_test_prec
def run_training():
    """Train MNIST for ``FLAGS.max_steps`` steps.

    Every 100 steps the loss is printed and summaries are written to
    ``FLAGS.log_dir``; every 1000 steps (and on the last step) a checkpoint
    is saved and the model is evaluated on all three data splits.
    """
    # Load the data sets.
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)

    # Build everything in a fresh default Graph.
    with tf.Graph().as_default():
        # Configure the graph.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)
        loss = mnist.loss(logits, labels_placeholder)
        train_op = mnist.training(loss, FLAGS.learning_rate)
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Merge all summaries into a single tensor.
        summary = tf.summary.merge_all()

        # Variable initializer.
        init = tf.global_variables_initializer()

        # Saver for writing checkpoints.
        saver = tf.train.Saver()

        # The context manager closes the session even if a step fails.
        with tf.Session() as sess:
            # Writer that outputs the merged summaries and the graph.
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
            try:
                # Initialize the variables.
                sess.run(init)

                # Training loop: FLAGS.max_steps iterations.
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()

                    # Fetch the data for this iteration.
                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # The value returned for `train_op` is discarded.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    duration = time.time() - start_time

                    # Every 100 steps print the loss and record summaries.
                    if step % 100 == 0:
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        summary_str = sess.run(summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Every 1000 steps (and on the last step) save a
                    # checkpoint and evaluate the model.
                    if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_file = os.path.join(FLAGS.log_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_file, global_step=step)
                        # Evaluate against the training set.
                        print('Training Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.train)
                        # Evaluate against the validation set.
                        print('Validation Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.validation)
                        # Evaluate against the test set.
                        print('Test Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.test)
            finally:
                # Flush pending events and release the events file.
                summary_writer.close()
def run_training():
    """Train MNIST for a number of steps.

    Runs ``FLAGS.max_steps`` training steps.  Every 100 steps the loss is
    printed and a summary is written to ``FLAGS.log_dir``; every 1000 steps
    (and on the last step) a checkpoint is saved and the model is evaluated
    on the training, validation and test sets.
    """
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)

    with tf.Graph().as_default():
        # Graph construction: placeholders, model, loss, training and
        # evaluation ops.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)
        loss = mnist.loss(logits, labels_placeholder)
        train_op = mnist.training(loss, FLAGS.learning_rate)
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        summary = tf.summary.merge_all()
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        # The context manager closes the session even if a step fails.
        with tf.Session() as sess:
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
            try:
                sess.run(init)

                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()

                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # The value returned for `train_op` is discarded.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    duration = time.time() - start_time

                    if step % 100 == 0:
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file.
                        summary_str = sess.run(summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Periodic checkpoint and evaluation.
                    if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_file = os.path.join(FLAGS.log_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_file, global_step=step)
                        print('Training Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.train)
                        print('Validation Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.validation)
                        print('Test Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.test)
            finally:
                # Flush pending events and release the events file.
                summary_writer.close()
def run_training():
    """Train MNIST for a number of steps, controllable through the bot.

    Runs up to ``FLAGS.max_steps`` training steps.  Every 100 steps the loss
    is printed and a summary is written; every 1000 steps (and on the last
    step) a checkpoint is saved, the model is evaluated, and the module-level
    ``bot`` object is serviced: its status is updated, a stop request ends
    training, loss histories are extended and a requested learning-rate
    change is applied.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        # BOT: the learning rate is a variable so the bot can update it.
        learning_rate = tf.Variable(FLAGS.learning_rate, trainable=False)
        train_op = mnist.training(loss, learning_rate)
        bot.lr = FLAGS.learning_rate

        # The assign op is built once, fed through a placeholder.  Creating
        # a new tf.assign per change would grow the graph, and an assign op
        # that is never run leaves the variable untouched.
        new_lr_placeholder = tf.placeholder(learning_rate.dtype.base_dtype,
                                            shape=[])
        update_lr_op = tf.assign(learning_rate, new_lr_placeholder)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.summary.merge_all()

        # Add the variable initializer Op.
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # The context manager closes the session on every exit path,
        # including the early `break` requested by the bot.
        with tf.Session() as sess:
            # Instantiate a SummaryWriter to output summaries and the Graph.
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
            try:
                # Run the Op to initialize the variables.
                sess.run(init)

                # Start the training loop.
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()

                    # Fill a feed dictionary with the actual set of images
                    # and labels for this particular training step.
                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # Run one step of the model.  The value returned for
                    # `train_op` is discarded; only the loss is kept.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    duration = time.time() - start_time

                    # Write the summaries and print an overview fairly often.
                    if step % 100 == 0:
                        # Print status to stdout.
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file.
                        summary_str = sess.run(summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Save a checkpoint and evaluate the model periodically.
                    # NOTE(review): the bot handling below is placed inside
                    # this branch because it uses values (message_trn,
                    # val_loss_value) that are only defined here - confirm
                    # against the original layout.
                    if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_file = os.path.join(FLAGS.log_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_file, global_step=step)

                        # Print step number:
                        print("step: {}".format(step))

                        # Evaluate against the training set.  The result of
                        # do_eval is concatenated to a string header.
                        print('Training Data Eval:')
                        message_trn = 'Training Data Eval: \n' + do_eval(
                            sess, eval_correct, images_placeholder,
                            labels_placeholder, data_sets.train)

                        # Evaluate against the validation set.
                        print('Validation Data Eval:')
                        message_val = 'Validation Data Eval:\n' + do_eval(
                            sess, eval_correct, images_placeholder,
                            labels_placeholder, data_sets.validation)

                        # Evaluate validation loss
                        val_loss_value = sess.run(
                            loss,
                            feed_dict=fill_feed_dict(data_sets.validation,
                                                     images_placeholder,
                                                     labels_placeholder))

                        # Evaluate against the test set.
                        print('Test Data Eval:')
                        message_tst = 'Test Data Eval:\n' + do_eval(
                            sess, eval_correct, images_placeholder,
                            labels_placeholder, data_sets.test)

                        ## BOT: handling of all bot commands ##
                        # Prepare bot update message
                        message = "\n".join([
                            "step: {}".format(step + 1),
                            message_trn,
                            message_val,
                            message_tst,
                        ])
                        bot.set_status(message)

                        # Send update message
                        if bot.verbose:
                            bot.send_message(message)

                        # Stop training command from bot
                        if bot.stop_train_flag:
                            bot.send_message('Training stopped!')
                            print(
                                'Training Stopped! Stop command sent via Telegram bot.'
                            )
                            break

                        # Update bot's loss history (for /plot command)
                        bot.loss_hist.append(loss_value)
                        bot.val_loss_hist.append(val_loss_value)

                        # Modify learning rate via bot
                        if bot.modify_lr != 1:
                            curr_lr = sess.run(learning_rate)
                            new_lr = curr_lr * bot.modify_lr
                            # Actually run the assignment so the following
                            # steps train with the new rate.
                            sess.run(update_lr_op,
                                     feed_dict={new_lr_placeholder: new_lr})
                            message = '\nStep %05d: setting learning rate to %f.' % (
                                step + 1, new_lr)
                            print(message)
                            bot.send_message(message)
                            bot.modify_lr = 1
                            bot.lr = new_lr
            finally:
                # Flush pending events and release the events file.
                summary_writer.close()
def _build_tensor_infos(tensors_by_key):
    """Return ``(names, signatures)`` for a mapping of key -> tensor.

    ``names`` maps each key to the tensor's name; ``signatures`` maps each
    key to a ``TensorInfo`` carrying that name and the tensor's dtype enum.
    """
    names = {}
    signatures = {}
    for key, tensor in tensors_by_key.items():
        info = meta_graph_pb2.TensorInfo()
        info.name = tensor.name
        info.dtype = tensor.dtype.as_datatype_enum
        names[key] = tensor.name
        signatures[key] = info
    return names, signatures


def run_training():
    """Train MNIST for a number of steps and export a SavedModel.

    Runs ``FLAGS.max_steps`` training steps with periodic summaries,
    checkpoints and evaluation, then writes a SavedModel with a predict
    signature under ``FLAGS.model_dir``.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST. If input_path is specified, download the data from GCS
    # to the folder expected by read_data_sets.
    # NOTE(review): the temporary directory is never removed.
    data_dir = tempfile.mkdtemp()
    if FLAGS.input_path:
        files = [
            os.path.join(FLAGS.input_path, file_name)
            for file_name in INPUT_FILES
        ]
        subprocess.check_call(['gsutil', '-m', '-q', 'cp', '-r'] + files +
                              [data_dir])
    data_sets = input_data.read_data_sets(data_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels and mark as input.
        placeholders = placeholder_inputs()
        keys_placeholder, images_placeholder, labels_placeholder = placeholders
        inputs, input_signatures = _build_tensor_infos({
            'key': keys_placeholder,
            'image': images_placeholder,
        })
        tf.add_to_collection('inputs', json.dumps(inputs))

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # To be able to extract the id, we need to add the identity function.
        keys = tf.identity(keys_placeholder)

        # The prediction will be the index in logits with the highest score.
        # We also use a softmax operation to produce a probability
        # distribution over all possible digits.
        prediction = tf.argmax(logits, 1)
        scores = tf.nn.softmax(logits)

        # Mark the outputs.
        outputs, output_signatures = _build_tensor_infos({
            'key': keys,
            'prediction': prediction,
            'scores': scores,
        })
        tf.add_to_collection('outputs', json.dumps(outputs))

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary operation based on the TF collection of
        # Summaries.  Remove this once Tensorflow 0.12 is standard.
        try:
            summary_op = tf.contrib.deprecated.merge_all_summaries()
        except AttributeError:
            summary_op = tf.merge_all_summaries()

        # Add the variable initializer Op.
        init = tf.initialize_all_variables()

        # Create a saver for writing legacy training checkpoints.
        saver = tf.train.Saver()

        # The session is needed for the export as well, so the export happens
        # inside the context manager, which closes the session afterwards.
        with tf.Session() as sess:
            # Instantiate a SummaryWriter to output summaries and the Graph.
            # Remove this once Tensorflow 0.12 is standard.
            try:
                summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                                       sess.graph)
            except AttributeError:
                summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                        sess.graph)

            try:
                # Run the Op to initialize the variables.
                sess.run(init)

                # Start the training loop.
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()

                    # Fill a feed dictionary with the actual set of images
                    # and labels for this particular training step.
                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # Run one step of the model.  The value returned for
                    # `train_op` is discarded; only the loss is kept.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    duration = time.time() - start_time

                    # Write the summaries and print an overview fairly often.
                    if step % 100 == 0:
                        # Print status to stdout.
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file.
                        summary_str = sess.run(summary_op,
                                               feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Save a checkpoint and evaluate the model periodically.
                    if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_file = os.path.join(FLAGS.train_dir,
                                                       'checkpoint')
                        saver.save(sess, checkpoint_file, global_step=step)
                        # Evaluate against the training set.
                        print('Training Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.train)
                        # Evaluate against the validation set.
                        print('Validation Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.validation)
                        # Evaluate against the test set.
                        print('Test Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.test)
            finally:
                # Flush pending events and release the events file.
                summary_writer.close()

            # Export the trained model as a SavedModel.
            file_io.create_dir(FLAGS.model_dir)

            predict_signature_def = signature_def_utils.build_signature_def(
                input_signatures, output_signatures,
                signature_constants.PREDICT_METHOD_NAME)

            # Create a saver for writing SavedModel training checkpoints.
            saved_model_path = os.path.join(FLAGS.model_dir, 'saved_model')
            build = builder.SavedModelBuilder(saved_model_path)
            logging.debug('Saved model path %s', saved_model_path)
            build.add_meta_graph_and_variables(
                sess, [tag_constants.SERVING],
                signature_def_map={
                    signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                        predict_signature_def
                },
                assets_collection=tf.get_collection(
                    tf.GraphKeys.ASSET_FILEPATHS))
            build.save()
def run_training():
    """Train MNIST for a number of steps as one worker of a cluster.

    The cluster layout is taken from ``FLAGS.ps_hosts``,
    ``FLAGS.worker_hosts`` and ``FLAGS.task_index``.  The worker connects to
    its own ``grpc://`` address through a ``tf.train.Supervisor`` session and
    runs ``FLAGS.max_steps`` steps with periodic logging and evaluation.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)

    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    task_index = FLAGS.task_index
    master = "grpc://" + worker_hosts[task_index]
    logs_path = os.path.join(FLAGS.log_dir, str(task_index))

    # Describe the cluster.  No server is started here; the session below
    # connects to the address in `master`.
    cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})

    # Between-graph replication
    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):
        # count the number of updates
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0),
            trainable=False)

        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = async_training(loss, FLAGS.learning_rate, global_step)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Add the variable initializer Op.
        init_op = tf.global_variables_initializer()

    # NOTE(review): the Supervisor is assumed to be created outside the
    # device scope - confirm against the original layout.
    sv = tf.train.Supervisor(is_chief=(task_index == 0),
                             global_step=global_step,
                             init_op=init_op)

    with sv.prepare_or_wait_for_session(master) as sess:
        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.summary.FileWriter(logs_path, sess.graph)
        try:
            # Start the training loop.
            for step in xrange(FLAGS.max_steps):
                start_time = time.time()

                # Fill a feed dictionary with the actual set of images and
                # labels for this particular training step.
                feed_dict = fill_feed_dict(data_sets.train,
                                           images_placeholder,
                                           labels_placeholder)

                # Run one step of the model.  The value returned for
                # `train_op` is discarded; the loss and the serialized
                # summary are kept.
                _, loss_value, summary = sess.run(
                    [train_op, loss, summary_op], feed_dict=feed_dict)

                duration = time.time() - start_time

                # Write the summaries and print an overview fairly often.
                if step % 100 == 0:
                    # Print status to stdout.
                    print('Step %d: loss = %.2f (%.3f sec)' %
                          (step, loss_value, duration))
                    # Update the events file.
                    summary_writer.add_summary(summary, step)
                    summary_writer.flush()

                # Evaluate the model periodically and on the last step.
                if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                    # Evaluate against the training set.
                    print('Training Data Eval:')
                    do_eval(sess, eval_correct, images_placeholder,
                            labels_placeholder, data_sets.train)
                    # Evaluate against the validation set.
                    print('Validation Data Eval:')
                    do_eval(sess, eval_correct, images_placeholder,
                            labels_placeholder, data_sets.validation)
                    # Evaluate against the test set.
                    print('Test Data Eval:')
                    do_eval(sess, eval_correct, images_placeholder,
                            labels_placeholder, data_sets.test)
        finally:
            # Flush pending events and release the events file.
            summary_writer.close()
def run_training():
    """Train MNIST from the queue-based input pipeline with per-epoch validation.

    After every ``num_examples // batch_size`` training steps the elapsed
    time and last training loss are printed, and the loss is evaluated on a
    fixed random sample of ``batch_size`` validation images.
    """
    with tf.Graph().as_default():
        # Training batches come from the input pipeline.
        images, labels = inputs(train=True,
                                batch_size=cfg.FLAGS.batch_size,
                                nb_epochs=cfg.FLAGS.nb_epochs)

        logits = mnist.inference(images, cfg.FLAGS.hidden1,
                                 cfg.FLAGS.hidden2)
        loss = mnist.loss(logits, labels)
        train_op = mnist.training(loss, cfg.FLAGS.learning_rate)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # The context manager closes the session even when a step raises
        # something other than OutOfRangeError.
        with tf.Session() as sess:
            sess.run(init_op)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            data_sets = mnist_datasets.read_data_sets(
                cfg.FLAGS.train_dir,
                dtype=tf.uint8,
                reshape=False,
                validation_size=cfg.FLAGS.validation_size)

            nb_train_samples = data_sets.train.num_examples

            # Prepare the validation data as flat numpy arrays.
            image_valid_np = data_sets.validation.images.reshape(
                (cfg.FLAGS.validation_size, mnist.IMAGE_PIXELS))
            label_valid_np = data_sets.validation.labels

            # Draw batch_size distinct validation examples so the sample
            # has the same batch size as the training batches.
            idx_valid = np.random.choice(cfg.FLAGS.validation_size,
                                         cfg.FLAGS.batch_size,
                                         replace=False)
            image_valid_np = image_valid_np[idx_valid, :]
            # Scale pixel values to [-0.5, 0.5].
            # NOTE(review): presumably mirrors the preprocessing done inside
            # inputs() - confirm.
            image_valid_np = image_valid_np * (1. / 255) - 0.5
            label_valid_np = label_valid_np[idx_valid]

            step = 0
            epoch_idx = 0
            try:
                start_time = time.time()
                while not coord.should_stop():
                    _, loss_value = sess.run([train_op, loss])
                    step += 1

                    if step >= nb_train_samples // cfg.FLAGS.batch_size:
                        epoch_idx += 1
                        end_time = time.time()
                        duration = end_time - start_time
                        print(
                            'Training Epoch {}, Step {}: loss = {:.02f} ({:.03f} sec)'
                            .format(epoch_idx, step, loss_value, duration))
                        start_time = end_time  # restart the epoch timer
                        step = 0  # reset the per-epoch step counter

                        # Loss on the validation sample: the tensors
                        # returned by inputs() are fed directly for this run.
                        loss_valid_value = sess.run(
                            loss,
                            feed_dict={
                                images: image_valid_np,
                                labels: label_valid_np,
                            })
                        print('Validation Epoch {}: loss = {:.02f}'.format(
                            epoch_idx, loss_valid_value))
            except tf.errors.OutOfRangeError:
                print('Done training for epoch {}, {} steps'.format(
                    epoch_idx, step))
            finally:
                coord.request_stop()

            # Wait for the input threads before the session is closed.
            coord.join(threads)
def run_training():
    """Train MNIST for a number of epochs with full-trace profiling.

    The training set is preloaded into graph constants and served through
    ``slice_input_producer`` / ``shuffle_batch``. Every training step runs
    with ``FULL_TRACE`` and overwrites ``timeline.json`` with the Chrome
    trace of that step. Summaries are written for every step; a checkpoint
    is saved every 1000 steps and once more when the input pipeline is
    exhausted (``tf.errors.OutOfRangeError``).

    ``step`` advances exactly once per training iteration, so the logged
    step numbers, the checkpoint cadence and the final step count match the
    number of iterations actually executed.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        with tf.name_scope('input'):
            # Input data, pin to CPU because rest of pipeline is CPU-only
            with tf.device('/cpu:0'):
                input_images = tf.constant(data_sets.train.images)
                input_labels = tf.constant(data_sets.train.labels)

            image, label = tf.train.slice_input_producer(
                [input_images, input_labels],
                num_epochs=FLAGS.num_epochs,
                capacity=3000000)
            label = tf.cast(label, tf.int32)
            images, labels = tf.train.shuffle_batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                capacity=3000000,
                num_threads=24,
                min_after_dequeue=300)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create the op for initializing variables (local variables included).
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # Create a session for running Ops on the Graph; the context manager
        # closes it on every exit path.
        with tf.Session() as sess:
            # Run the Op to initialize the variables.
            sess.run(init_op)

            # Instantiate a FileWriter to output summaries and the Graph.
            # tf.summary.FileWriter belongs to the same tf.summary API as
            # merge_all() above; tf.train.SummaryWriter is its removed
            # predecessor.
            summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()

            # Start input enqueue threads.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            # And then after everything is built, start the training loop.
            step = 0
            try:
                while not coord.should_stop():
                    start_time = time.time()

                    # Run one step of the model, collecting trace metadata.
                    _, loss_value = sess.run([train_op, loss],
                                             options=run_options,
                                             run_metadata=run_metadata)

                    # Dump the trace of this step (overwritten every step).
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)

                    duration = time.time() - start_time

                    # Write the summaries and print an overview. The modulus
                    # of 1 logs every step.
                    if step % 1 == 0:
                        # Print status to stdout.
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file.
                        # NOTE: this extra run dequeues another batch.
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                    # Save a checkpoint periodically.
                    if (step + 1) % 1000 == 0:
                        print('Saving')
                        saver.save(sess, FLAGS.train_dir, global_step=step)

                    # Single increment per iteration.
                    step += 1
            except tf.errors.OutOfRangeError:
                print('Saving')
                saver.save(sess, FLAGS.train_dir, global_step=step)
                print('Done training for %d epochs, %d steps.' %
                      (FLAGS.num_epochs, step))
            finally:
                # When done, ask the threads to stop.
                coord.request_stop()
                # Wait for threads to finish.
                coord.join(threads)
                summary_writer.close()
def run_training():
    """Train MNIST for ``FLAGS.max_steps`` steps using feed dictionaries.

    Builds the inference/loss/training/evaluation graph on placeholders,
    then feeds one training batch per step. Every 100 steps the loss is
    printed and summaries are written; every 1000 steps and on the last
    step a checkpoint is saved and the model is evaluated on the training,
    validation and test sets.

    The session and the summary writer are released on every exit path.
    """
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)

    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.summary.merge_all()

        # Add the variable initializer Op.
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create a session for running Ops on the Graph; the context manager
        # closes it when training finishes or fails.
        with tf.Session() as sess:
            # Instantiate a FileWriter to output summaries and the Graph.
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
            try:
                # After everything is built, initialize the variables.
                sess.run(init)

                # Start the training loop.
                for step in range(FLAGS.max_steps):
                    start_time = time.time()

                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # The value fetched for train_op is discarded; only the
                    # loss is kept.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    duration = time.time() - start_time

                    # Write the summaries and print an overview fairly often.
                    if step % 100 == 0:
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        summary_str = sess.run(summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Save a checkpoint and evaluate the model periodically.
                    if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_file = os.path.join(FLAGS.log_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_file, global_step=step)
                        # Evaluate against the training set.
                        print('Training Data Eval.')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.train)
                        # Evaluate against the validation set.
                        print('Validation Data Eval.')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.validation)
                        # Evaluate against the test set.
                        print('Test Data Eval.')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.test)
            finally:
                summary_writer.close()
def run_training():
    """Train MNIST for a number of steps.

    Runs ``FLAGS.max_steps`` feed-dict training steps. Every 100 steps the
    loss is printed and summaries are appended to the events file; every
    1000 steps and on the final step a checkpoint is written and the model
    is evaluated on the training, validation and test sets.

    The session and the summary writer are released on every exit path.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    # The fake_data flag is used for unit-testing purposes and may be safely
    # ignored by the reader.
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    # A tf.Graph is a collection of ops that may be executed together as a
    # group. Most TensorFlow uses will only need to rely on the single
    # default graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.summary.merge_all()

        # Add the variable initializer Op.
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create a session for running Ops on the Graph. The empty parameter
        # to session indicates that this code will attach to (or create if
        # not yet created) the default local session. Using it as a context
        # manager guarantees that it is closed.
        with tf.Session() as sess:
            # Instantiate a SummaryWriter to output summaries and the Graph.
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
            try:
                # And then after everything is built:
                # Run the Op to initialize the variables.
                sess.run(init)

                # Start the training loop after initializing.
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()

                    # Fill a feed dictionary with the actual set of images
                    # and labels for this particular training step.
                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # Run one step of the model. train_op is an Operation
                    # with no output value, so its element in the returned
                    # tuple is None and is discarded. The loss may become
                    # NaN if the model diverges, so it is captured for
                    # logging.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    duration = time.time() - start_time

                    # Write the summaries and print an overview fairly often.
                    if step % 100 == 0:
                        # Print status to stdout.
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file: each evaluation of the
                        # summary passed to add_summary() appends new values.
                        summary_str = sess.run(summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Save a checkpoint and evaluate the model periodically.
                    # Training might later be resumed by reloading the
                    # parameters with tf.train.Saver.restore. More careful
                    # setups would sequester data_sets.test until after
                    # hyperparameter tuning.
                    if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_file = os.path.join(FLAGS.log_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_file, global_step=step)
                        # Evaluate against the training set.
                        print('Training Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.train)
                        # Evaluate against the validation set.
                        print('Validation Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.validation)
                        # Evaluate against the test set.
                        print('Test Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.test)
            finally:
                summary_writer.close()
def run_training():
    """Train MNIST for a number of steps and export the model.

    Besides the usual inference/loss/training graph, the names of the input
    tensors (key, image) and output tensors (key, prediction, scores) are
    stored as JSON strings in the graph collections ``'inputs'`` and
    ``'outputs'``. Training runs for ``FLAGS.max_steps`` steps with periodic
    logging, checkpointing and evaluation, then the session is saved under
    ``FLAGS.model_dir`` with the prefix ``'export'``.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST. The data is read from a freshly created temporary
    # directory.
    data_sets = input_data.read_data_sets(tempfile.mkdtemp(), FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels and mark as input.
        placeholders = placeholder_inputs()
        keys_placeholder, images_placeholder, labels_placeholder = placeholders
        inputs = {'key': keys_placeholder.name, 'image': images_placeholder.name}
        tf.add_to_collection('inputs', json.dumps(inputs))

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # To be able to extract the id, we need to add the identity function.
        keys = tf.identity(keys_placeholder)

        # The prediction will be the index in logits with the highest score.
        # We also use a softmax operation to produce a probability distribution
        # over all possible digits.
        prediction = tf.argmax(logits, 1)
        scores = tf.nn.softmax(logits)

        # Mark the outputs.
        outputs = {'key': keys.name, 'prediction': prediction.name, 'scores': scores.name}
        tf.add_to_collection('outputs', json.dumps(outputs))

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Add the variable initializer Op.
        init = tf.initialize_all_variables()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create a session for running Ops on the Graph.
        # NOTE(review): the session and the summary writer are never closed
        # in this function.
        sess = tf.Session()

        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        # And then after everything is built:

        # Run the Op to initialize the variables.
        sess.run(init)

        # Start the training loop.
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()

            # Fill a feed dictionary with the actual set of images and labels
            # for this particular training step.
            feed_dict = fill_feed_dict(data_sets.train, images_placeholder, labels_placeholder)

            # Run one step of the model. The return values are the activations
            # from the `train_op` (which is discarded) and the `loss` Op. To
            # inspect the values of your Ops or variables, you may include them
            # in the list passed to sess.run() and the value tensors will be
            # returned in the tuple from the call.
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            # Write the summaries and print an overview fairly often.
            if step % 100 == 0:
                # Print status to stdout.
                print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
                # Update the events file.
                summary_str = sess.run(summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            # Save a checkpoint and evaluate the model periodically
            # (every 1000 steps and on the final step).
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_file = os.path.join(FLAGS.train_dir, 'checkpoint')
                saver.save(sess, checkpoint_file, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.train)
                # Evaluate against the validation set.
                print('Validation Data Eval:')
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.validation)
                # Evaluate against the test set.
                print('Test Data Eval:')
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.test)

        # Export the model so that it can be loaded and used later for predictions.
        # NOTE(review): assumed to run once, after the training loop -- confirm.
        file_io.create_dir(FLAGS.model_dir)
        saver.save(sess, os.path.join(FLAGS.model_dir, 'export'))
def run_training():
    """Train MNIST for a number of epochs from preloaded data.

    The training set is embedded in the graph as constants and served
    through ``slice_input_producer`` / ``batch``. Training runs until the
    input pipeline raises ``tf.errors.OutOfRangeError`` (after
    ``FLAGS.num_epochs`` epochs) or the coordinator asks to stop. Every 100
    steps the loss is printed and summaries are written; every 1000 steps
    and at the end of training a checkpoint is saved.

    ``step`` advances exactly once per training iteration, so the logged
    step numbers and the final step count match the iterations executed.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        with tf.name_scope('input'):
            # Input data, pin to CPU because rest of pipeline is CPU-only
            with tf.device('/cpu:0'):
                input_images = tf.constant(data_sets.train.images)
                input_labels = tf.constant(data_sets.train.labels)

            image, label = tf.train.slice_input_producer(
                [input_images, input_labels], num_epochs=FLAGS.num_epochs)
            label = tf.cast(label, tf.int32)
            images, labels = tf.train.batch(
                [image, label], batch_size=FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create the op for initializing variables.
        init_op = tf.initialize_all_variables()

        # Create a session for running Ops on the Graph; the context manager
        # closes it on every exit path.
        with tf.Session() as sess:
            # Run the Op to initialize the variables.
            sess.run(init_op)

            # Instantiate a SummaryWriter to output summaries and the Graph.
            summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                    sess.graph)

            # Start input enqueue threads.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            # And then after everything is built, start the training loop.
            step = 0
            try:
                while not coord.should_stop():
                    start_time = time.time()

                    # Run one step of the model.
                    _, loss_value = sess.run([train_op, loss])

                    duration = time.time() - start_time

                    # Write the summaries and print an overview fairly often.
                    if step % 100 == 0:
                        # Print status to stdout.
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file.
                        # NOTE: this extra run dequeues another batch.
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                    # Save a checkpoint periodically.
                    if (step + 1) % 1000 == 0:
                        print('Saving')
                        saver.save(sess, FLAGS.train_dir, global_step=step)

                    # Single increment per iteration.
                    step += 1
            except tf.errors.OutOfRangeError:
                print('Saving')
                saver.save(sess, FLAGS.train_dir, global_step=step)
                print('Done training for %d epochs, %d steps.' %
                      (FLAGS.num_epochs, step))
            finally:
                # When done, ask the threads to stop.
                coord.request_stop()
                # Wait for threads to finish.
                coord.join(threads)
                summary_writer.close()
def run_training():
    """Train MNIST for ``FLAGS.max_steps`` steps using feed dictionaries.

    Every 100 steps the loss is printed and summaries are written; every
    1000 steps and on the last step a checkpoint is saved and the model is
    evaluated on the training, validation and test sets.

    The session and the summary writer are released on every exit path.
    """
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default graph.
    with tf.Graph().as_default():
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add the loss calculation to the graph.
        loss = mnist.loss(logits, labels_placeholder)

        # Add the ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the op that compares the logits to the labels for evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary tensor from the TF collection of summaries.
        summary = tf.summary.merge_all()

        # Variable initializer.
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Open a session on the graph; the context manager closes it.
        with tf.Session() as sess:
            # Instantiate a FileWriter to output summaries and the graph.
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
            try:
                sess.run(init)

                # Start the training loop.
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()

                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # The value fetched for train_op is discarded.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    duration = time.time() - start_time

                    # Write the summaries and print an overview fairly often.
                    if step % 100 == 0:
                        print('Step %d:loss = %.0f(%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file.
                        summary_str = sess.run(summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Save a checkpoint and evaluate the model periodically.
                    if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_file = os.path.join(FLAGS.log_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_file, global_step=step)
                        # Evaluate against the training set.
                        print('Training Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.train)
                        # Evaluate against the validation set.
                        print('Validation Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.validation)
                        # Evaluate against the test set.
                        print('Test Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.test)
            finally:
                summary_writer.close()
def run_training():
    """Train MNIST for a number of steps.

    Feeds one training batch per step for ``FLAGS.max_steps`` steps. Every
    100 steps the loss is printed and summaries are saved; every 1000 steps
    and on the last step a checkpoint is written and the model is evaluated
    on the training, validation and test sets.

    The session and the summary writer are released on every exit path.
    """
    # Load the MNIST training, validation and test sets.
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)

    # Create the graph and make it the default graph.
    with tf.Graph().as_default():
        # Placeholders for the input images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Model output.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Model loss.
        loss = mnist.loss(logits, labels_placeholder)

        # Training op.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Evaluation op.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Merge all summaries.
        summary = tf.summary.merge_all()

        # Op that initializes all variables.
        init = tf.global_variables_initializer()

        # Saver used to write checkpoints (a Saver can be restricted to a
        # subset of variables; here it is created with defaults).
        saver = tf.train.Saver()

        # Open the session; the context manager closes it.
        with tf.Session() as sess:
            # Write the graph to the log directory.
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
            try:
                # Initialize the variables.
                sess.run(init)

                # Training loop.
                for step in xrange(FLAGS.max_steps):
                    # Record the start time.
                    start_time = time.time()

                    # Build the feed_dict for this step.
                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # Run one training step and fetch the loss.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    # Time spent on this step.
                    duration = time.time() - start_time

                    # Every 100 steps print training info and save summaries.
                    if step % 100 == 0:
                        # Print status to stdout.
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file.
                        summary_str = sess.run(summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        # Flush buffered summaries to disk.
                        summary_writer.flush()

                    # Every 1000 steps (and on the last step) save a
                    # checkpoint and evaluate the model.
                    if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_file = os.path.join(FLAGS.log_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_file, global_step=step)
                        # Evaluate on the training set, passing the
                        # eval_correct op.
                        print('Training Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.train)
                        # Evaluate on the validation set.
                        print('Validation Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.validation)
                        # Evaluate on the test set.
                        print('Test Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.test)
            finally:
                summary_writer.close()
def run_training():
    """Train MNIST for a number of steps, then export projector embeddings.

    After the feed-dict training loop (logging every 100 steps, checkpoint
    and evaluation every 1000 steps and on the last step), the hidden-layer
    activations of the training, validation and test sets are stored as
    variables and saved under ``<log_dir>/embed/<name>`` together with a
    projector config, a sprite image and a ``labels.tsv`` metadata file.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.summary.merge_all()

        # Add the variable initializer Op.
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create a session for running Ops on the Graph.
        # NOTE(review): the session is never closed in this function.
        sess = tf.Session()

        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        # And then after everything is built:

        # Run the Op to initialize the variables.
        sess.run(init)

        # Start the training loop.
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()

            # Fill a feed dictionary with the actual set of images and labels
            # for this particular training step.
            feed_dict = fill_feed_dict(data_sets.train, images_placeholder, labels_placeholder)

            # Run one step of the model. The return values are the activations
            # from the `train_op` (which is discarded) and the `loss` Op. To
            # inspect the values of your Ops or variables, you may include them
            # in the list passed to sess.run() and the value tensors will be
            # returned in the tuple from the call.
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            # Write the summaries and print an overview fairly often.
            if step % 100 == 0:
                # Print status to stdout.
                print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
                # Update the events file.
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            # Save a checkpoint and evaluate the model periodically.
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.train)
                # Evaluate against the validation set.
                print('Validation Data Eval:')
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.validation)
                # Evaluate against the test set.
                print('Test Data Eval:')
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.test)

        # Compute embeddings and save them.
        # Side length of one sprite thumbnail; assumes IMAGE_PIXELS is a
        # perfect square (square images) -- TODO confirm.
        thumbnail_size = int(np.sqrt(mnist.IMAGE_PIXELS))
        for data_set, name in [(data_sets.train, 'train'), (data_sets.validation, 'validation'), (data_sets.test, 'test')]:
            output_path = os.path.join(FLAGS.log_dir, 'embed', name)
            print('Computing %s Embedding' % name)
            # With the extra True argument do_eval returns the images, labels
            # and per-batch hidden-layer activations -- semantics of the flag
            # are defined in do_eval; verify there.
            (all_images, all_labels, hidden1_vectors, hidden2_vectors) = do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_set, True)
            embed_tensors = []
            # NOTE(review): rebinding summary_writer drops the previous
            # writer without closing it.
            summary_writer = tf.summary.FileWriter(output_path, sess.graph)
            config = projector.ProjectorConfig()
            for layer, embed_vectors in enumerate(
                    [hidden1_vectors, hidden2_vectors]):
                # Flatten the list of per-batch activations into one 2-D
                # array with one row per example.
                embed_tensor = tf.Variable(np.array(embed_vectors).reshape(
                    len(embed_vectors) * embed_vectors[0].shape[0], -1), name=('%s_layer_%s' % (name, layer)))
                embed_tensors.append(embed_tensor)
                sess.run(embed_tensor.initializer)
                embedding = config.embeddings.add()
                embedding.tensor_name = embed_tensor.name
                embedding.metadata_path = os.path.join(output_path, 'labels.tsv')
                embedding.sprite.image_path = os.path.join(
                    output_path, 'sprite.png')
                embedding.sprite.single_image_dim.extend(
                    [thumbnail_size, thumbnail_size])
            # NOTE(review): the statements below are assumed to run once per
            # data set, after both layers are registered -- confirm nesting.
            projector.visualize_embeddings(summary_writer, config)
            result = sess.run(embed_tensors)
            # This saver only covers the embedding variables; `layer` (index
            # of the last layer) is passed as the global step.
            saver = tf.train.Saver(embed_tensors)
            saver.save(sess, os.path.join(output_path, 'model.ckpt'), layer)

            # Make sprite and labels.
            images = np.array(all_images).reshape(
                -1, thumbnail_size, thumbnail_size).astype(np.float32)
            sprite = images_to_sprite(images)
            scipy.misc.imsave(os.path.join(output_path, 'sprite.png'), sprite)
            all_labels = np.array(all_labels).flatten()
            # One metadata row per example: zero-padded index and class label.
            metadata_file = open(os.path.join(output_path, 'labels.tsv'), 'w')
            metadata_file.write('Name\tClass\n')
            for ll in xrange(len(all_labels)):
                metadata_file.write('%06d\t%d\n' % (ll, all_labels[ll]))
            metadata_file.close()
def run_training():
    """Train MNIST for a number of steps.

    Runs ``FLAGS.max_steps`` feed-dict training steps. Every 100 steps the
    loss is printed and summaries are written; every 1000 steps and on the
    last step a checkpoint is saved and the model is evaluated on the
    training, validation and test sets.

    The session and the summary writer are released on every exit path.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    # fake_data is a flag used for unit tests; it can be ignored here.
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    # The with-statement makes the operations below apply to this graph as
    # the default graph. In most cases a single tf.Graph instance is enough,
    # so as_default() is all that is needed.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # The computation graph is built with the functions in mnist.py.

        # Build a Graph that computes predictions from the inference model.
        # 1) inference(): the network to be trained.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        # 2) loss(): adds the loss ops to the graph.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        # 3) training(): adds the optimisation that minimises the given
        # loss; the optimisation method is chosen inside that function.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        # This defines how the predictions are scored against the labels.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.summary.merge_all()

        # Add the variable initializer Op (created now, run later).
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create a session for running Ops on the Graph, once every required
        # op has been built. An empty argument list attaches to the default
        # local session. The context manager guarantees it is closed.
        with tf.Session() as sess:
            # Instantiate a SummaryWriter to output summaries and the Graph.
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
            try:
                # And then after everything is built:
                # Run the Op to initialize the variables.
                sess.run(init)

                # Start the training loop.
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()

                    # Fill a feed dictionary with the actual set of images
                    # and labels for this particular training step.
                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # Run one step of the model. Two fetches are passed to
                    # run(), so two values come back. train_op is a training
                    # Operation without an output, so None is returned for
                    # it and discarded; the loss value is kept.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    duration = time.time() - start_time

                    # Write the summaries and print an overview fairly often.
                    if step % 100 == 0:
                        # Print status to stdout.
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file.
                        summary_str = sess.run(summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Save a checkpoint and evaluate the model periodically.
                    if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_file = os.path.join(FLAGS.log_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_file, global_step=step)
                        # Evaluate against the training set.
                        print('Training Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.train)
                        # Evaluate against the validation set.
                        print('Validation Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.validation)
                        # Evaluate against the test set.
                        print('Test Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.test)
            finally:
                summary_writer.close()
def run_train(self):
    """Train the MNIST model from the queue-based input pipeline.

    Builds the graph, restores or initialises the session through
    ``sessionRun``, optionally wraps it in a tfdbg session, and trains until
    more than 3000 steps have been run or the input pipeline is exhausted.
    Summaries are written every step; every 100 steps the current step is
    stored in the ``cur_step`` variable and the model is saved.

    Raises:
        ValueError: if both ``--debug`` and ``--tensorboard_debug_address``
            are set. Raised before any graph or session is created.
    """
    flags = self.FLAGS
    # Validate the flags first so that a bad combination cannot leave an
    # open session behind.
    if flags.debug and flags.tensorboard_debug_address:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")

    num_epochs = 500  # shared by the input pipeline and the final message
    ckpt_path = flags.ckpt_dir + DIR_NAME

    with tf.Graph().as_default():
        is_train = tf.placeholder(tf.bool, name="isTrain")
        images, labels = inputs(train=is_train, batch_size=100,
                                num_epochs=num_epochs)
        logits = mnist.inference(images, 128, 32)
        loss = mnist.loss(logits, labels)
        tf.summary.scalar('loss', loss)
        train_op = mnist.training(loss, 0.01)
        evaluation = mnist.evaluation(logits, labels)
        tf.summary.scalar('evaluation', evaluation)

        # Step counter stored in the graph so it is saved with the model.
        cur_step = tf.Variable(0, name='cur_step')
        # Build the assign op once; creating it inside the loop would add a
        # new op to the graph on every save.
        step_input = tf.placeholder(tf.int32, shape=[], name='cur_step_input')
        store_step = cur_step.assign(step_input)

        summary = tf.summary.merge_all()
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        saver = tf.train.Saver()

        sess = tf.Session()
        self._graph = sess.graph
        sess.run(init_op)
        self.write_graph_to_file()
        # sessionRun returns the session to continue with -- presumably
        # after restoring a checkpoint from ckpt_path; see its definition.
        sess = sessionRun(saver, sess, ckpt_path)

        if flags.debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(
                sess, ui_type=flags.ui_type)
        elif flags.tensorboard_debug_address:
            sess = tf_debug.TensorBoardDebugWrapperSession(
                sess, flags.tensorboard_debug_address)

        summary_writer = tf.summary.FileWriter(flags.log_dir, sess.graph)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Resume counting from the step stored in the graph.
        start = sess.run(cur_step)
        step = start
        try:
            while not coord.should_stop():
                start_time = time.time()
                _, loss_value, prd, summary_str = sess.run(
                    [train_op, loss, evaluation, summary],
                    feed_dict={is_train: True})
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()
                print(step)
                duration = time.time() - start_time

                if step % 100 == 0:
                    # Persist the step counter, then save the model.
                    sess.run(store_step, feed_dict={step_input: step})
                    saveModel(saver, sess, step, ckpt_path)
                    print("Step :%d: loss=%.2f (%.3f sec)  evaluation:%s" %
                          (step, loss_value, duration, prd))

                step += 1
                if step > 3000:
                    # Final evaluation with the pipeline in non-training mode.
                    prd = sess.run(evaluation, feed_dict={is_train: False})
                    print("loss value:%s   evaluation:%s" % (loss_value, prd))
                    break
        except tf.errors.OutOfRangeError:
            # Report the epoch count the input pipeline was configured with.
            print("Done training for %d epochs, %d steps." %
                  (num_epochs, step))
        finally:
            coord.request_stop()
            coord.join(threads)
            summary_writer.close()
            sess.close()
def run_training():
    """Train MNIST for a number of steps.

    The data is read from a fresh temporary directory; when
    ``FLAGS.input_path`` is set the files named in ``INPUT_FILES`` are first
    copied there with ``gsutil``.  A summary is written every 100 steps, and
    a checkpoint plus an evaluation on the training, validation and test
    sets is produced every 1000 steps and at the final step.

    Raises:
        subprocess.CalledProcessError: if the ``gsutil`` copy fails.
    """
    # NOTE(review): this temporary directory is never removed; confirm that
    # the data sets keep no reference to the files before cleaning it up.
    data_dir = tempfile.mkdtemp()
    if FLAGS.input_path:
        files = [os.path.join(FLAGS.input_path, file_name)
                 for file_name in INPUT_FILES]
        subprocess.check_call(
            ['gsutil', '-m', '-q', 'cp', '-r'] + files + [data_dir])
    data_sets = input_data.read_data_sets(data_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # The try/except pairs below keep the code working on TensorFlow
        # releases older than 0.12; drop them once 0.12 is standard.
        try:
            summary_op = tf.contrib.deprecated.merge_all_summaries()
        except AttributeError:
            summary_op = tf.merge_all_summaries()

        try:
            init = tf.global_variables_initializer()
        except AttributeError:
            init = tf.initialize_all_variables()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # The context manager closes the session on every exit path.
        with tf.Session() as sess:
            try:
                summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                                       sess.graph)
            except AttributeError:
                summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                        sess.graph)

            try:
                # Run the Op to initialize the variables.
                sess.run(init)

                # Start the training loop.
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()

                    # Fill a feed dictionary with the actual set of images
                    # and labels for this particular training step.
                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # The value fetched for `train_op` is discarded; only
                    # the loss is of interest.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    duration = time.time() - start_time

                    # Write the summaries and print an overview fairly often.
                    if step % 100 == 0:
                        print('Step %d: loss = %.2f (%.3f sec)'
                              % (step, loss_value, duration))
                        summary_str = sess.run(summary_op,
                                               feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Save a checkpoint and evaluate the model periodically.
                    if ((step + 1) % 1000 == 0
                            or (step + 1) == FLAGS.max_steps):
                        checkpoint_file = os.path.join(FLAGS.train_dir,
                                                       'checkpoint')
                        saver.save(sess, checkpoint_file, global_step=step)
                        # Evaluate against the training set.
                        print('Training Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.train)
                        # Evaluate against the validation set.
                        print('Validation Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.validation)
                        # Evaluate against the test set.
                        print('Test Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.test)
            finally:
                # Release the events file even if training fails part-way.
                summary_writer.close()
def run_training():
    """Run this worker's part of between-graph replicated MNIST training.

    The cluster layout and this task's index are read from FLAGS.  The
    session comes from a Supervisor connected to this worker's gRPC address,
    and summaries are written to a per-task sub-directory of FLAGS.log_dir.
    """
    parameter_servers = FLAGS.ps_hosts.split(',')
    workers = FLAGS.worker_hosts.split(',')
    task_index = FLAGS.task_index
    master = "grpc://" + workers[task_index]
    logs_path = os.path.join(FLAGS.log_dir, str(task_index))

    # Describe the cluster; no server is started in this function.
    cluster = tf.train.ClusterSpec({'ps': parameter_servers,
                                    'worker': workers})

    # Between-graph replication.
    device_fn = tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)
    with tf.device(device_fn):
        # Counter for the number of updates.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0),
            trainable=False)

        images, labels = inputs(train=True,
                                batch_size=FLAGS.batch_size,
                                num_epochs=FLAGS.num_epochs)

        # Predictions, loss and training operations of the model.
        logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)
        loss = mnist.loss(logits, labels)
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # One operation that evaluates every registered summary.
        summary_op = tf.summary.merge_all()

        # Initializer for both global and local variables.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

    sv = tf.train.Supervisor(is_chief=(task_index == 0),
                             global_step=global_step,
                             init_op=init_op)

    with sv.prepare_or_wait_for_session(master) as sess:
        # Every machine writes its own log.
        summary_writer = tf.summary.FileWriter(logs_path, sess.graph)

        coord = sv.coord
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            step = 0
            while True:
                if coord.should_stop():
                    break
                tic = time.time()
                _, loss_value, summary = sess.run(
                    [train_op, loss, summary_op])
                duration = time.time() - tic

                # Print an overview fairly often.
                if step % 100 == 0:
                    print('Step %d: loss = %.2f (%.3f sec)'
                          % (step, loss_value, duration))
                    summary_writer.add_summary(summary, step)
                    summary_writer.flush()
                step += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.'
                  % (FLAGS.num_epochs, step))
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)
        sess.close()

    sv.stop()
    print("done")
def run_training():
    """Train MNIST for a number of steps and trace one step.

    Training runs until the input pipeline raises OutOfRangeError.  Step 500
    is executed with full tracing and its timeline is written to
    ``timeline.reader-1thread.json`` in Chrome trace format.
    """
    trace_step = 500
    trace_path = 'timeline.reader-1thread.json'

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Input images and labels.
        images, labels = inputs(train=True, batch_size=FLAGS.batch_size,
                                num_epochs=FLAGS.num_epochs)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)

        # Add to the Graph the loss calculation.
        loss = mnist.loss(logits, labels)

        # Add to the Graph operations that train the model.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # The op for initializing the variables.
        init_op = tf.group(tf.initialize_all_variables(),
                           tf.initialize_local_variables())

        # Create a session for running operations in the Graph.
        sess = tf.Session()

        # Initialize the variables (the trained variables and the
        # epoch counter).
        sess.run(init_op)

        # Start input enqueue threads.
        print("Queue runners: %s" % ([qr.name for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)]))
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Give the input queue time to fill before the first step.
        time.sleep(15)

        run_metadata = tf.RunMetadata()

        try:
            step = 0
            while not coord.should_stop():
                start_time = time.time()

                if step == trace_step:
                    _, loss_value = sess.run(
                        [train_op, loss],
                        options=tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE),
                        run_metadata=run_metadata)
                    # Imported lazily: only needed for the traced step.
                    from tensorflow.python.client import timeline
                    trace = timeline.Timeline(
                        step_stats=run_metadata.step_stats)
                    # The context manager flushes and closes the file; the
                    # bare open() used before leaked the handle.
                    with open(trace_path, 'w') as trace_file:
                        trace_file.write(
                            trace.generate_chrome_trace_format())
                else:
                    # The value fetched for `train_op` is discarded.
                    _, loss_value = sess.run([train_op, loss])

                duration = time.time() - start_time

                # Print an overview fairly often.
                if step % 100 == 0:
                    print('Step %d: loss = %.2f (%.3f sec)'
                          % (step, loss_value, duration))
                step += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.'
                  % (FLAGS.num_epochs, step))
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()
            try:
                # Wait for threads to finish.
                coord.join(threads)
            finally:
                # Close the session even if a thread error is re-raised.
                sess.close()
def run_training():
    """Train MNIST for a number of steps and export the learned weights.

    The data source is selected by the module-level ``DATASET`` value.  A
    summary is written once per epoch, and a checkpoint plus an evaluation
    on all three data sets is produced every 10000 steps and at the final
    step.  After training, the weights and biases of the three hidden layers
    are pickled to ``pretrain.pkl`` in the order
    ``[W_1, b_1, W_2, b_2, W_3, b_3]``.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    if DATASET == 'pickle':
        data_sets = mnist_dataset.read_data_sets(FLAGS.train_dir,
                                                 data_dir='../mnist.pkl')
    elif DATASET == 'keras':
        data_sets = mnist_dataset.read_data_sets(FLAGS.train_dir, keras=True)
    else:
        data_sets = input_data.read_data_sets(FLAGS.train_dir)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2,
                           FLAGS.hidden3)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = training(loss)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.merge_all_summaries()

        # Add the variable initializer Op.
        init = tf.initialize_all_variables()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create a session for running Ops on the Graph.
        sess = tf.Session()

        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        # Run the Op to initialize the variables.
        sess.run(init)

        # Never let this drop to zero: a batch size larger than the training
        # set would otherwise make the modulo below divide by zero.
        steps_per_epoch = max(
            1, data_sets.train.num_examples // FLAGS.batch_size)

        # Start the training loop.
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()

            # Fill a feed dictionary with the actual set of images and labels
            # for this particular training step.
            feed_dict = fill_feed_dict(data_sets.train,
                                       images_placeholder,
                                       labels_placeholder)

            # The value fetched for `train_op` is discarded.
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            # Write the summaries and print an overview once per epoch.
            if step % steps_per_epoch == 0:
                print('Epoch %d: loss = %.2f (%.3f sec)'
                      % (step // steps_per_epoch, loss_value, duration))
                # Update the events file.
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            # Save a checkpoint and evaluate the model periodically.
            if (step + 1) % 10000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_file = os.path.join(FLAGS.train_dir, 'checkpoint')
                saver.save(sess, checkpoint_file, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.train)
                # Evaluate against the validation set.
                print('Validation Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.validation)
                # Evaluate against the test set.
                print('Test Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.test)

        # Export the weights: look up each layer's existing variables by
        # scope name (reuse=True) and read their current values.
        layer_shapes = [
            ("hidden1", IMAGE_PIXELS, FLAGS.hidden1),
            ("hidden2", FLAGS.hidden1, FLAGS.hidden2),
            ("hidden3", FLAGS.hidden2, FLAGS.hidden3),
        ]
        learned = []
        for scope_name, fan_in, fan_out in layer_shapes:
            with tf.variable_scope(scope_name, reuse=True):
                W = tf.get_variable("weights", [fan_in, fan_out])
                b = tf.get_variable("biases", [fan_out])
                learned.append(W.eval(sess))
                learned.append(b.eval(sess))

        # The context manager closes the file so the pickle is fully
        # flushed to disk; the bare open() used before leaked the handle.
        with open("pretrain.pkl", "wb") as out:
            cPickle.dump(learned, out)
def run_training():
    """Train MNIST from the queue-based input pipeline.

    Runs training steps until the input pipeline signals that the requested
    number of epochs is exhausted, printing the loss every 100 steps.
    """
    # Everything below is built into a fresh default Graph.
    with tf.Graph().as_default():
        # Batched input tensors.
        image_batch, label_batch = inputs(train=True,
                                          batch_size=FLAGS.batch_size,
                                          num_epochs=FLAGS.num_epochs)

        # Predictions of the inference model.
        logits = mnist.inference(image_batch, FLAGS.hidden1, FLAGS.hidden2)

        # Loss calculation.
        loss = mnist.loss(logits, label_batch)

        # Operations that train the model.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Initializer for the trained variables and the epoch counter.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        session = tf.Session()
        session.run(init_op)

        # Start the threads that feed the input queues.
        coordinator = tf.train.Coordinator()
        workers = tf.train.start_queue_runners(sess=session,
                                               coord=coordinator)

        try:
            step = 0
            while True:
                if coordinator.should_stop():
                    break
                tic = time.time()

                # Only the loss is kept; the value fetched for `train_op`
                # is discarded.
                _, loss_value = session.run([train_op, loss])

                duration = time.time() - tic

                # Print an overview fairly often.
                if step % 100 == 0:
                    print('Step %d: loss = %.2f (%.3f sec)'
                          % (step, loss_value, duration))
                step += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.'
                  % (FLAGS.num_epochs, step))
        finally:
            # Ask the input threads to stop.
            coordinator.request_stop()

        # Wait for the input threads to finish.
        coordinator.join(workers)
        session.close()
def main(_):
    """Train MNIST with fed placeholders.

    Progress and a summary are reported every 100 steps; a checkpoint and the
    precision on the training, validation and test sets are produced every
    1000 steps and at the final step.
    """
    data_sets = input_data.read_data_sets(FLAGS.data_dir, FLAGS.fake_data)

    # The model lives in its own graph.
    graph = tf.Graph()
    with graph.as_default():
        # Placeholders that receive one batch per step.
        images_placeholder = tf.placeholder(
            tf.float32, shape=(FLAGS.batch_size, mnist.IMAGE_PIXELS))
        labels_placeholder = tf.placeholder(
            tf.int32, shape=(FLAGS.batch_size, ))

        # Model: inference, loss, training and evaluation ops.
        logits = mnist.inference(images_placeholder,
                                 FLAGS.hidden1, FLAGS.hidden2)
        loss = mnist.loss(logits, labels_placeholder)
        train_op = mnist.training(loss, FLAGS.learning_rate)
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Reporting, initialization and checkpointing ops.
        summary = tf.merge_all_summaries()
        init = tf.initialize_all_variables()
        saver = tf.train.Saver()

    # Initialize and run the training loop.
    with tf.Session(graph=graph) as sess:
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        sess.run(init)

        final_step = FLAGS.max_steps
        for step in range(final_step):
            tic = time.time()

            # One batch of MNIST images and labels for the placeholders.
            feed_dict = fill_feed_dict(data_sets.train,
                                       images_placeholder,
                                       labels_placeholder)

            # Run the update; only the loss value is kept.
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            duration = time.time() - tic

            # Report progress and write the TensorBoard events.
            if step % 100 == 0:
                print('Step {}: loss = {} ({} sec)'.format(
                    step, loss_value, duration))
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            completed = step + 1
            if completed % 1000 == 0 or completed == FLAGS.max_steps:
                checkpoint_file = os.path.join(FLAGS.train_dir, 'checkpoint')
                saver.save(sess, checkpoint_file, global_step=step)

                # Precision against the training, validation and test sets.
                print('Training precision: ', end='')
                evaluate(sess, eval_correct, images_placeholder,
                         labels_placeholder, data_sets.train)
                print('Validation precision: ', end='')
                evaluate(sess, eval_correct, images_placeholder,
                         labels_placeholder, data_sets.validation)
                print('Test precision: ', end='')
                evaluate(sess, eval_correct, images_placeholder,
                         labels_placeholder, data_sets.test)
'Must divide evenly into the dataset sizes.') flags.DEFINE_string('train_dir', 'data', 'Directory to put the training data.') flags.DEFINE_boolean('fake_data', False, 'If true, uses fake data ' 'for unit testing.') ## Download data and unpack ## data_sets is a custom DataSet data type data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data) ## Initialize graph and start drawing on it with tf.Graph().as_default(): ## Prepare inputs and placeholders images_placeholder = tf.placeholder(tf.float32, shape=(FLAGS.batch_size, mnist.IMAGE_PIXELS)) labels_placeholder = tf.placeholder(tf.int32, shape=(FLAGS.batch_size)) ## mnist.inference() builds feed-forward portion of graph ## It takes the images placeholder and two integers, each representing the ## number of neurons for the respective hidden layers and returns logits logits = mnist.inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2) loss = mnist.loss(logits, labels_placeholder) train_op = mnist.training(loss, FLAGS.learning_rate) eval_correct = mnist.evaluation(logits, labels_placeholder) ## Initialize variables, run session, and write summary writer data summary_op = tf.merge_all_summaries() init = tf.initialize_all_variables() sess = tf.Session() summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) sess.run(init)
def downpour_training_distributed_op(self):
    """Start this process's cluster role and, for workers, train the model.

    A parameter-server task (``job_name == "ps"``) blocks in
    ``server.join()``.  A worker task builds the MNIST model, then trains
    under a Supervisor until the supervisor asks to stop or the global step
    reaches 1000.
    """
    FLAGS = self.flags.FLAGS

    # NOTE(review): the cluster is hard-coded and every task is listed on
    # localhost:2222.  The hosts were apparently meant to come from
    # --ps_hosts / --worker_hosts (comma-separated) - confirm.
    cluster = tf.train.ClusterSpec({
        "ps": ["localhost:2222"],
        "worker": ["localhost:2222", "localhost:2222"]
    })
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        # Parameter servers only host variables; all of them are updated.
        server.join()
    elif FLAGS.job_name == "worker":
        # Assign operations to the local worker by default.
        # NOTE(review): no cluster is passed to replica_device_setter here,
        # so variables may not be placed on the ps job - confirm intent.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/replica:%d/task:%d/cpu:%d"
                    % (0, FLAGS.task_index, 0))):
            # Build the model.
            images_placeholder, labels_placeholder = self.placeholder_inputs(
                FLAGS.batch_size)
            logits = mnist.inference(images_placeholder,
                                     FLAGS.hidden1,
                                     FLAGS.hidden2)
            loss = mnist.loss(logits, labels_placeholder)

            # Variable that tracks the global step.  `trainable` must be the
            # boolean False: the string 'False' is truthy and would mark the
            # counter as a trainable variable.
            global_step = tf.Variable(0, name='global_step', trainable=False)

            # Add a scalar summary for the snapshot loss.
            tf.summary.scalar('loss', loss)

            # Gradient descent optimizer with the given learning rate.
            optimizer = tf.train.GradientDescentOptimizer(
                FLAGS.learning_rate)

            # Apply the gradients that minimize the loss; this also
            # increments global_step once per update.
            train_op = optimizer.minimize(loss, global_step=global_step)

            saver = tf.train.Saver()
            # Same summary API family as tf.summary.scalar above.
            summary_op = tf.summary.merge_all()
            init_op = tf.initialize_all_variables()

        # Create a "supervisor", which oversees the training process.
        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 logdir=FLAGS.train_log,
                                 init_op=init_op,
                                 summary_op=summary_op,
                                 saver=saver,
                                 global_step=global_step,
                                 save_model_secs=600)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target,
                                config=tf.ConfigProto(
                                    allow_soft_placement=True,
                                    log_device_placement=True)) as sess:
            # Loop until the supervisor shuts down or 1000 steps have
            # completed.
            step = 0
            while not sv.should_stop() and step < 1000:
                feed_dict = self.fill_feed_dict(self.data_set.train,
                                                images_placeholder,
                                                labels_placeholder)

                # Fetch the global step, not the loss: `step` drives the
                # loop bound above and must count updates.
                _, step = sess.run([train_op, global_step],
                                   feed_dict=feed_dict)

        # Ask for all the services to stop.
        sv.stop()
def run_training():
    """Train MNIST for a number of epochs from data preloaded into variables.

    The whole training set is stored in two non-trainable variables and
    sliced into batches by the input pipeline.  A summary is written every
    100 steps and a checkpoint is saved every 1000 steps and when the input
    pipeline is exhausted.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        with tf.name_scope('input'):
            # Placeholders are used only to initialize the variables below.
            images_initializer = tf.placeholder(
                dtype=data_sets.train.images.dtype,
                shape=data_sets.train.images.shape)
            labels_initializer = tf.placeholder(
                dtype=data_sets.train.labels.dtype,
                shape=data_sets.train.labels.shape)
            # collections=[] keeps these variables out of every collection,
            # so they are neither saved nor covered by the generic
            # initializers; they are initialized explicitly further down.
            input_images = tf.Variable(
                images_initializer, trainable=False, collections=[])
            input_labels = tf.Variable(
                labels_initializer, trainable=False, collections=[])

            image, label = tf.train.slice_input_producer(
                [input_images, input_labels], num_epochs=FLAGS.num_epochs)
            label = tf.cast(label, tf.int32)
            images, labels = tf.train.batch(
                [image, label], batch_size=FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Initialize local variables as well: the epoch counter created by
        # `num_epochs` is a local variable, and the input threads fail if it
        # is left uninitialized.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # Create a session for running Ops on the Graph.
        sess = tf.Session()

        # Run the Op to initialize the variables.
        sess.run(init_op)
        sess.run(input_images.initializer,
                 feed_dict={images_initializer: data_sets.train.images})
        sess.run(input_labels.initializer,
                 feed_dict={labels_initializer: data_sets.train.labels})

        # Writer from the same tf.summary API family as merge_all above.
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # And then after everything is built, start the training loop.
        try:
            step = 0
            while not coord.should_stop():
                start_time = time.time()

                # Run one step of the model.
                _, loss_value = sess.run([train_op, loss])

                duration = time.time() - start_time

                # Write the summaries and print an overview fairly often.
                if step % 100 == 0:
                    # Print status to stdout.
                    print('Step %d: loss = %.2f (%.3f sec)'
                          % (step, loss_value, duration))
                    # Update the events file.
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                # Save a checkpoint periodically.
                if (step + 1) % 1000 == 0:
                    print('Saving')
                    saver.save(sess, FLAGS.train_dir, global_step=step)

                # Exactly one increment per iteration; the extra increment
                # that used to sit in the logging branch skipped a step
                # number every 100 steps.
                step += 1
        except tf.errors.OutOfRangeError:
            print('Saving')
            saver.save(sess, FLAGS.train_dir, global_step=step)
            print('Done training for %d epochs, %d steps.'
                  % (FLAGS.num_epochs, step))
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()
            try:
                # Wait for threads to finish.
                coord.join(threads)
            finally:
                # Flush pending events and release the session even when a
                # thread error is re-raised by join().
                summary_writer.close()
                sess.close()
def downpour_training_local_op(self):
    """Validation baseline: train the fully connected network locally.

    The loss is printed and a summary written every 100 steps; every 1000
    steps the model is evaluated on the training, validation and test sets.
    """
    # Everything is built into a fresh default Graph.
    with tf.Graph().as_default():
        FLAGS = self.flags.FLAGS
        images_placeholder, labels_placeholder = self.placeholder_inputs(
            FLAGS.batch_size)

        # Inference.
        logits = mnist.inference(images_placeholder,
                                 FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Loss computed from the logits.
        loss = mnist.loss(logits, labels_placeholder)

        # Training operation for that loss.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Merged summary.
        summary = tf.merge_all_summaries()

        # Op that compares the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Variable initializer.
        init = tf.initialize_all_variables()

        sess = tf.Session()

        # Writer for the summaries and the Graph.
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        sess.run(init)

        step_count = FLAGS.max_steps + 1
        for step in range(step_count):
            # The loss is inspected along the way as a local benchmark for
            # the fully connected network.
            tic = time.time()
            feed_dict = self.fill_feed_dict(self.data_set.train,
                                            images_placeholder,
                                            labels_placeholder)

            # Only the loss is kept; the value fetched for `train_op` is
            # discarded.
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - tic

            # Write the summaries and print an overview fairly often.
            if step % 100 == 0:
                print('Step %d: loss = %.2f (%.3f sec)'
                      % (step, loss_value, duration))
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            # Evaluate the model periodically.
            if step % 1000 != 0:
                continue
            print('Training Data Eval:')
            self.do_eval(sess, eval_correct, images_placeholder,
                         labels_placeholder, self.data_set.train)
            # Evaluate against the validation set.
            print('Validation Data Eval:')
            self.do_eval(sess, eval_correct, images_placeholder,
                         labels_placeholder, self.data_set.validation)
            # Evaluate against the test set.
            print('Test Data Eval:')
            self.do_eval(sess, eval_correct, images_placeholder,
                         labels_placeholder, self.data_set.test)
def run_training():
    """Train MNIST for a number of steps and dump a trace of one step.

    Training runs until the input pipeline raises OutOfRangeError.  Step 500
    is executed with full tracing; its run metadata is written to
    ``run_metadata.pbtxt`` and its timeline to
    ``timeline.reader-1thread.json`` in Chrome trace format.
    """
    trace_step = 500
    metadata_path = "run_metadata.pbtxt"
    trace_path = 'timeline.reader-1thread.json'

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Input images and labels.
        images, labels = inputs(train=True, batch_size=FLAGS.batch_size,
                                num_epochs=FLAGS.num_epochs)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)

        # Add to the Graph the loss calculation.
        loss = mnist.loss(logits, labels)

        # Add to the Graph operations that train the model.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # The op for initializing the variables.
        init_op = tf.group(tf.initialize_all_variables(),
                           tf.initialize_local_variables())

        # Create a session for running operations in the Graph.
        sess = tf.Session()

        # Initialize the variables (the trained variables and the
        # epoch counter).
        sess.run(init_op)

        # Start input enqueue threads.
        print("Queue runners: %s" % ([qr.name for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)]))
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Give the input queue time to fill before the first step.
        time.sleep(15)

        run_metadata = tf.RunMetadata()

        try:
            step = 0
            while not coord.should_stop():
                start_time = time.time()

                if step == trace_step:
                    _, loss_value = sess.run(
                        [train_op, loss],
                        options=tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE),
                        run_metadata=run_metadata)
                    with open(metadata_path, "w") as out:
                        out.write(str(run_metadata))

                    # Imported lazily: only needed for the traced step.
                    from tensorflow.python.client import timeline
                    trace = timeline.Timeline(
                        step_stats=run_metadata.step_stats)
                    # The context manager flushes and closes the file; the
                    # bare open() used before leaked the handle.
                    with open(trace_path, 'w') as trace_file:
                        trace_file.write(
                            trace.generate_chrome_trace_format())
                else:
                    # The value fetched for `train_op` is discarded.
                    _, loss_value = sess.run([train_op, loss])

                duration = time.time() - start_time

                # Print an overview fairly often.
                if step % 100 == 0:
                    print('Step %d: loss = %.2f (%.3f sec)'
                          % (step, loss_value, duration))
                step += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.'
                  % (FLAGS.num_epochs, step))
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()
            try:
                # Wait for threads to finish.
                coord.join(threads)
            finally:
                # Close the session even if a thread error is re-raised.
                sess.close()
def run_training():
    """Train MNIST for ``CONFIG.epoch`` epochs with a progress bar.

    When ``FLAGS.c`` is set, training continues from a checkpoint found in
    ``FLAGS.log_dir``.  Every ``CONFIG.eval_every_n_steps`` steps and at the
    final step a summary and a checkpoint are written and the model is
    evaluated on the training, validation and test sets.
    """
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir)
    # math.ceil returns a float on Python 2, which xrange() rejects.
    max_steps = int(math.ceil(
        CONFIG.epoch * data_sets.train.num_examples / CONFIG.batch_size))
    checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')

    with tf.Graph().as_default():
        images_placeholder, labels_placeholder = placeholder_inputs(
            CONFIG.batch_size)
        logits = mnist.inference(images_placeholder,
                                 CONFIG.size_hidden_1,
                                 CONFIG.size_hidden_2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, CONFIG.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.summary.merge_all()

        # Add the variable initializer Op.
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create a session for running Ops on the Graph.
        sess = tf.Session()

        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        # Initialize first and restore afterwards: running the initializer
        # after the restore would overwrite the restored weights.
        sess.run(init)
        if FLAGS.c:
            # Checkpoints are saved with a "-<step>" suffix, so look up the
            # most recent one; fall back to the bare path if none is
            # registered.
            restore_path = (tf.train.latest_checkpoint(FLAGS.log_dir)
                            or checkpoint_file)
            saver.restore(sess, restore_path)

        progbar = Progbar(target=CONFIG.eval_every_n_steps)
        for step in xrange(max_steps):
            feed_dict = fill_feed_dict(data_sets.train,
                                       images_placeholder,
                                       labels_placeholder)
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            progbar.update((step % CONFIG.eval_every_n_steps) + 1,
                           [("Loss", loss_value)], force=True)

            # Save a checkpoint and evaluate the model periodically.
            if ((step + 1) % CONFIG.eval_every_n_steps == 0
                    or (step + 1) == max_steps):
                print("Total : ",
                      int((step + 1) / CONFIG.eval_every_n_steps),
                      "/",
                      int(math.ceil(max_steps / CONFIG.eval_every_n_steps)))
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()
                saver.save(sess, checkpoint_file, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.train)
                # Evaluate against the validation set.
                print('Validation Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.validation)
                # Evaluate against the test set.
                print('Test Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.test)
                # Start a fresh progress bar for the next evaluation window.
                progbar = Progbar(target=CONFIG.eval_every_n_steps)
def main(unused_argv):
    """Run one task (parameter server or worker) of a distributed MNIST job.

    The cluster is described by the comma-separated ``FLAGS.ps_hosts`` and
    ``FLAGS.worker_hosts``; ``FLAGS.job_name`` and ``FLAGS.task_index``
    select which task this process is. A ``ps`` task only joins its server.
    A worker builds the model, wraps an Adam optimizer in
    ``tf.train.SyncReplicasOptimizer`` and trains inside a
    ``tf.train.Supervisor`` managed session until the supervisor asks it to
    stop or the input pipeline raises ``OutOfRangeError``.

    Args:
        unused_argv: Ignored; accepted so the function can serve as an
            app entry point.

    Raises:
        ValueError: If ``FLAGS.job_name`` or ``FLAGS.task_index`` is unset.
    """
    print(FLAGS)
    if FLAGS.job_name is None or FLAGS.job_name == "":
        raise ValueError("Must specify an explicit `job_name`")
    if FLAGS.task_index is None or FLAGS.task_index == "":
        raise ValueError("Must specify an explicit `task_index`")

    print("job name = %s" % FLAGS.job_name)
    print("task index = %d" % FLAGS.task_index)

    # Construct the cluster and start the server.
    ps_spec = FLAGS.ps_hosts.split(",")
    worker_spec = FLAGS.worker_hosts.split(",")

    # Get the number of workers.
    num_workers = len(worker_spec)

    cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
        return

    is_chief = (FLAGS.task_index == 0)
    worker_device = "/job:worker/task:%d" % (FLAGS.task_index)

    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the
    # workers.
    with tf.device(
            tf.train.replica_device_setter(worker_device=worker_device,
                                           cluster=cluster)):
        global_step = tf.contrib.framework.get_or_create_global_step()

        images, labels = inputs(train=True,
                                batch_size=FLAGS.batch_size,
                                num_epochs=FLAGS.num_epochs)
        logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)
        loss = mnist.loss(logits, labels)
        tf.summary.scalar(loss.op.name, loss)

        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

        # Default to aggregating one gradient from every worker per update.
        if FLAGS.replicas_to_aggregate is None:
            replicas_to_aggregate = num_workers
        else:
            replicas_to_aggregate = FLAGS.replicas_to_aggregate

        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=replicas_to_aggregate,
            total_num_replicas=num_workers,
            name="mnist_sync_replicas")
        train_op = opt.minimize(loss, global_step=global_step)

        if is_chief:
            # Initial token and chief queue runners required by the
            # sync_replicas mode.
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        my_summary_op = tf.summary.merge_all()

    # summary_op=None: summaries are computed manually in the loop below and
    # handed to the supervisor through summary_computed().
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=FLAGS.train_dir,
                             summary_op=None,
                             init_op=init_op,
                             recovery_wait_secs=1,
                             global_step=global_step,
                             save_model_secs=60,
                             save_summaries_secs=60)

    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 log_device_placement=False)

    with sv.managed_session(master=server.target,
                            config=sess_config) as sess:
        start_time = time.time()
        step = 1

        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)

        if is_chief:
            # Chief worker will start the chief queue runner and call the
            # init op.
            sv.start_queue_runners(sess, [chief_queue_runner])
            sess.run(sync_init_op)

        try:
            while not sv.should_stop():
                if step % 100 == 0:
                    # Compute the summary every 100 local steps.
                    _, loss_value, global_step_value, summ = sess.run(
                        [train_op, loss, global_step, my_summary_op])

                    if is_chief:
                        sv.summary_computed(sess, summ)

                    duration = time.time() - start_time
                    # Guard against a zero global step so the progress
                    # report can never raise ZeroDivisionError.
                    batches_done = max(global_step_value * num_workers, 1)
                    sec_per_batch = duration / batches_done

                    format_str = (
                        "After %d training steps (%d global steps), "
                        "loss on training batch is %g.  "
                        "(%.3f sec/batch)")
                    print(format_str % (step, global_step_value,
                                        loss_value, sec_per_batch))
                else:
                    # Train normally.
                    _, loss_value, global_step_value = sess.run(
                        [train_op, loss, global_step])
                step += 1
        except tf.errors.OutOfRangeError:
            # OutOfRangeError is thrown when epoch limit per
            # tf.train.limit_epochs is reached.
            print('Caught OutOfRangeError. Stopping Training.')
def run_training():
    """Train MNIST for a number of steps and export the final model.

    Downloads/reads the data sets into a fresh temporary directory, builds
    the graph around keyed placeholders, and records the input and output
    tensor names as JSON in the ``'inputs'`` / ``'outputs'`` graph
    collections. Runs ``FLAGS.max_steps`` training steps, logging summaries
    every 100 steps and checkpointing plus evaluating every 1000 steps and on
    the last step. Finally saves the model under ``FLAGS.model_dir``.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = input_data.read_data_sets(tempfile.mkdtemp(), FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels and mark as input.
        placeholders = placeholder_inputs()
        keys_placeholder, images_placeholder, labels_placeholder = placeholders
        input_names = {
            'key': keys_placeholder.name,
            'image': images_placeholder.name,
        }
        tf.add_to_collection('inputs', json.dumps(input_names))

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder,
                                 FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # To be able to extract the id, we need to add the identity function.
        keys = tf.identity(keys_placeholder)

        # The prediction will be the index in logits with the highest score.
        # We also use a softmax operation to produce a probability
        # distribution over all possible digits.
        prediction = tf.argmax(logits, 1)
        scores = tf.nn.softmax(logits)

        # Mark the outputs.
        output_names = {
            'key': keys.name,
            'prediction': prediction.name,
            'scores': scores.name,
        }
        tf.add_to_collection('outputs', json.dumps(output_names))

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Add the variable initializer Op.
        init = tf.initialize_all_variables()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # The context manager closes the session on every exit path.
        with tf.Session() as sess:
            # Instantiate a SummaryWriter to output summaries and the Graph.
            summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                    sess.graph)
            try:
                # And then after everything is built:
                # Run the Op to initialize the variables.
                sess.run(init)

                # Start the training loop.
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()

                    # Fill a feed dictionary with the actual set of images
                    # and labels for this particular training step.
                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # Run one step of the model. The value returned for
                    # `train_op` is discarded; only the loss is kept.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    duration = time.time() - start_time

                    # Write the summaries and print an overview fairly often.
                    if step % 100 == 0:
                        # Print status to stdout.
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file.
                        summary_str = sess.run(summary_op,
                                               feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Save a checkpoint and evaluate the model periodically.
                    if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_file = os.path.join(FLAGS.train_dir,
                                                       'checkpoint')
                        saver.save(sess, checkpoint_file, global_step=step)
                        # Evaluate against the training set.
                        print('Training Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.train)
                        # Evaluate against the validation set.
                        print('Validation Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.validation)
                        # Evaluate against the test set.
                        print('Test Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.test)

                # Export the model so that it can be loaded and used later
                # for predictions.
                file_io.create_dir(FLAGS.model_dir)
                saver.save(sess, os.path.join(FLAGS.model_dir, 'export'))
            finally:
                # Flush pending events and release the event file.
                summary_writer.close()
def run_training():
    """Train MNIST for a number of steps.

    Reads the data sets from ``FLAGS.train_dir``, builds the graph with the
    ``mnist`` helpers and runs ``FLAGS.max_steps`` training steps. Every 100
    steps it prints the loss and writes a summary; every 1000 steps and on
    the last step it saves a checkpoint inside ``FLAGS.train_dir`` and calls
    ``do_eval`` on the train, validation and test splits.
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder,
                                 FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Checkpoint prefix INSIDE the training directory. Passing the
        # directory itself as the prefix writes "<train_dir>-<step>" files
        # next to the directory instead of into it.
        checkpoint_file = os.path.join(FLAGS.train_dir, 'checkpoint')

        # The context manager closes the session on every exit path.
        with tf.Session() as sess:
            # Run the Op to initialize the variables.
            init = tf.initialize_all_variables()
            sess.run(init)

            # Instantiate a SummaryWriter to output summaries and the Graph.
            summary_writer = tf.train.SummaryWriter(
                FLAGS.train_dir, graph_def=sess.graph_def)
            try:
                # And then after everything is built, start the training loop.
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()

                    # Fill a feed dictionary with the actual set of images
                    # and labels for this particular training step.
                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)

                    # Run one step of the model. The value returned for
                    # `train_op` is discarded; only the loss is kept.
                    _, loss_value = sess.run([train_op, loss],
                                             feed_dict=feed_dict)

                    duration = time.time() - start_time

                    # Write the summaries and print an overview fairly often.
                    if step % 100 == 0:
                        # Print status to stdout.
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file.
                        summary_str = sess.run(summary_op,
                                               feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)

                    # Save a checkpoint and evaluate the model periodically.
                    if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        saver.save(sess, checkpoint_file, global_step=step)
                        # Evaluate against the training set.
                        print('Training Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.train)
                        # Evaluate against the validation set.
                        print('Validation Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.validation)
                        # Evaluate against the test set.
                        print('Test Data Eval:')
                        do_eval(sess, eval_correct, images_placeholder,
                                labels_placeholder, data_sets.test)
            finally:
                # Flush pending events and release the event file.
                summary_writer.close()
def run_training():
    """Train MNIST from the queue-based input and report validation loss.

    Builds the model on the tensors returned by ``inputs(train=True, ...)``
    and trains until the coordinator requests a stop or the input pipeline
    raises ``OutOfRangeError``. After every
    ``num_train_examples // batch_size`` steps it prints the training loss
    and the loss on one fixed, randomly sampled validation batch of
    ``cfg.FLAGS.batch_size`` examples, then resets the step counter.
    """
    with tf.Graph().as_default():
        images, labels = inputs(train=True,
                                batch_size=cfg.FLAGS.batch_size,
                                nb_epochs=cfg.FLAGS.nb_epochs)
        logits = mnist.inference(images,
                                 cfg.FLAGS.hidden1,
                                 cfg.FLAGS.hidden2)
        loss = mnist.loss(logits, labels)
        train_op = mnist.training(loss, cfg.FLAGS.learning_rate)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # Load the data sets before any session or thread is started, so a
        # failure here cannot leave queue-runner threads running.
        data_sets = mnist_datasets.read_data_sets(
            cfg.FLAGS.train_dir,
            dtype=tf.uint8,
            reshape=False,
            validation_size=cfg.FLAGS.validation_size)
        nb_train_samples = data_sets.train.num_examples
        steps_per_epoch = nb_train_samples // cfg.FLAGS.batch_size

        # Prepare one validation batch as plain numpy arrays.
        image_valid_np = data_sets.validation.images.reshape(
            (cfg.FLAGS.validation_size, mnist.IMAGE_PIXELS))
        label_valid_np = data_sets.validation.labels
        # Sample exactly batch_size rows so the batch fits the graph input.
        idx_valid = np.random.choice(cfg.FLAGS.validation_size,
                                     cfg.FLAGS.batch_size,
                                     replace=False)
        # Rescale pixel values with the (1/255 - 0.5) preprocessing.
        image_valid_np = image_valid_np[idx_valid, :] * (1. / 255) - 0.5
        label_valid_np = label_valid_np[idx_valid]
        valid_feed = {images: image_valid_np, labels: label_valid_np}

        # The context manager closes the session on every exit path.
        with tf.Session() as sess:
            sess.run(init_op)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            step = 0
            epoch_idx = 0
            try:
                start_time = time.time()
                while not coord.should_stop():
                    _, loss_value = sess.run([train_op, loss])
                    step += 1
                    if step >= steps_per_epoch:
                        epoch_idx += 1
                        end_time = time.time()
                        duration = end_time - start_time
                        print('Training Epoch {}, Step {}: loss = {:.02f} ({:.03f} sec)'
                              .format(epoch_idx, step, loss_value, duration))
                        start_time = end_time  # restart the epoch timer
                        step = 0  # reset the per-epoch step counter

                        # Loss on the validation batch; the feed overrides
                        # the queue-fed input tensors for this run only.
                        loss_valid_value = sess.run(loss,
                                                    feed_dict=valid_feed)
                        print('Validation Epoch {}: loss = {:.02f}'
                              .format(epoch_idx, loss_valid_value))
            except tf.errors.OutOfRangeError:
                print('Done training for epoch {}, {} steps'
                      .format(epoch_idx, step))
            finally:
                # Stop and join the input threads on every exit path, not
                # only after a clean end of training.
                coord.request_stop()
                coord.join(threads)