def main(unused_argv):
    """Run a Trainer inside a tracked experiment named "PCL RL".

    Args:
        unused_argv: Ignored.
    """
    logging.set_verbosity(logging.INFO)
    pcl_trainer = Trainer()

    # Credentials are placeholders and must be replaced with real values.
    credentials = {
        "owner_id": "your-owner-id",
        "project_token": "your-project-token",
    }
    tracking_project = ml.TensorFlowProject(**credentials)

    with tracking_project.create_experiment("PCL RL") as tracked_run:
        pcl_trainer.run(tracked_run)
def train(hps, datasets):
    """Build the LFADS model in "train" mode and run its training loop.

    The work happens inside a tracked experiment named "LFADS"; the
    experiment object is handed to ``model.train_model``.

    Args:
        hps: The hyperparameters. ``hps.do_reset_learning_rate`` is read here
            and the object is forwarded to ``build_model``.
        datasets: A dictionary of data dictionaries, i.e. a
            name (string) -> data dictionary mapping (see top of lfads.py).
    """
    tracker = ml.TensorFlowProject(
        owner_id="your-owner-id",
        project_token="your-project-token",
    )

    with tracker.create_experiment("LFADS") as tracked_run:
        lfads_model = build_model(hps, kind="train", datasets=datasets)

        if hps.do_reset_learning_rate:
            # Re-run the initializer of the learning-rate variable in the
            # current default session.
            default_session = tf.get_default_session()
            default_session.run(lfads_model.learning_rate.initializer)

        lfads_model.train_model(datasets, tracked_run)
def run_training():
    """Train MNIST for a number of epochs, validate, then test.

    Builds the classification graph, starts a session, and reports the
    training, validation and test phases to MissingLinkAI through an
    experiment context.
    """
    data_sets = input_data.read_data_sets(INPUT_DATA_DIR)

    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder = tf.placeholder(
            tf.float32, shape=(BATCH_SIZE, IMAGE_PIXELS))
        labels_placeholder = tf.placeholder(tf.int32, shape=(BATCH_SIZE,))

        # Build a Graph that computes predictions from the inference model.
        logits = inference(images_placeholder, HIDDEN1_UNITS, HIDDEN2_UNITS)

        # Add to the Graph the Ops for loss calculation.
        labels = tf.to_int64(labels_placeholder)
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=logits, name='xentropy')
        loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')

        # Add to the Graph the Ops that calculate and apply gradients.
        optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)
        global_step = tf.Variable(0, name='global_step', trainable=False)
        train_op = optimizer.minimize(loss, global_step=global_step)

        # Add the Op to compare the logits to the labels during evaluation.
        correct = tf.nn.in_top_k(logits, labels_placeholder, 1)
        eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))

        # Initialize the graph
        init = tf.global_variables_initializer()
        session = tf.Session()
        session.run(init)

        # Now that our neural net is ready, let's integrate MissingLinkAI SDK
        # and start the training!

        # Create a project manager with credentials to communicate with
        # MissingLinkAI's backend
        missinglink_project = missinglink.TensorFlowProject(
            OWNER_ID, PROJECT_TOKEN)

        # Create an experiment as a context manager so MissingLinkAI can
        # monitor the progress of the experiment.
        with missinglink_project.create_experiment(
                display_name='MNIST multilayer perception',
                description='Two fully connected hidden layers') as experiment:
            NUM_SAMPLE = 2000
            NUM_BATCHES = int(NUM_SAMPLE / BATCH_SIZE)

            for epoch in experiment.epoch_loop(10):
                for batch in experiment.batch_loop(NUM_BATCHES):
                    feed_dict = fill_feed_dict(
                        data_sets.train, images_placeholder,
                        labels_placeholder)

                    # Use `experiment.train` scope before the `session.run`
                    # which runs the optimizer to let the SDK know it should
                    # collect the metrics as training metrics.
                    with experiment.train(
                            monitored_metrics={'loss': loss,
                                               'acc': eval_correct}):
                        # Note that you only need to provide the optimizer
                        # op. The SDK will automatically run the metric
                        # tensors provided in the `experiment.train` context
                        # (and `experiment` context).
                        _, loss_value = session.run(
                            [train_op, loss], feed_dict=feed_dict)

                # Validate the model with the validation dataset
                with experiment.validation(
                        monitored_metrics={'loss': loss,
                                           'acc': eval_correct}):
                    do_eval(session, eval_correct, images_placeholder,
                            labels_placeholder, data_sets.validation)

            # Use `experiment.test` to manage the testing loop.
            # The placeholders have a fixed batch dimension of BATCH_SIZE, so
            # the number of test iterations is the number of whole batches in
            # the test split, not the number of examples.
            total_test_iterations = data_sets.test.num_examples // BATCH_SIZE
            with experiment.test(
                    total_test_iterations,
                    expected=labels_placeholder,
                    predicted=logits):
                for _ in range(total_test_iterations):
                    # Feed test data and run an evaluation op only; running
                    # `train_op` here would keep updating the weights during
                    # the test phase.
                    test_feed_dict = fill_feed_dict(
                        data_sets.test, images_placeholder,
                        labels_placeholder)
                    session.run(eval_correct, feed_dict=test_feed_dict)
from __future__ import print_function import tensorflow as tf import numpy as np from ops import * from data import * from net import * from utils import * import os import time import missinglink OWNER_ID = '605f9f21-80b1-ddd5-b2ed-ecdd662f035c' PROJECT_TOKEN = 'hwotoGzZzPqwaZiL' missinglink_project = missinglink.TensorFlowProject(OWNER_ID, PROJECT_TOKEN) flags = tf.app.flags conf = flags.FLAGS class Solver(object): def __init__(self): self.device_id = conf.device_id self.train_dir = conf.train_dir self.samples_dir = conf.samples_dir if not os.path.exists(self.train_dir): os.makedirs(self.train_dir) if not os.path.exists(self.samples_dir): os.makedirs(self.samples_dir) #datasets params
# This example builds a small neural network made of two fully connected
# layers, then wires in the MissingLink SDK so that training, validation and
# testing can be monitored remotely.
import argparse
import math
import os

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

# Remember the current TensorFlow log level, then lower verbosity to ERROR.
old_v = tf.logging.get_verbosity()
tf.logging.set_verbosity(tf.logging.ERROR)

# NOTE(review): missinglink is imported only after the verbosity change;
# presumably deliberate -- confirm before moving it up with the other imports.
import missinglink

missinglink_project = missinglink.TensorFlowProject(project='6201689270910976')

# --- Input parameters ---
NUM_CLASSES = 10  # MNIST has 10 classes: the digits 0 through 9.
IMAGE_SIZE = 28  # MNIST images are always 28x28 pixels.
IMAGE_PIXELS = IMAGE_SIZE ** 2

# --- Network parameters ---
HIDDEN1_UNITS = 128
HIDDEN2_UNITS = 32

# --- Training parameters ---
LEARNING_RATE = 0.01
MAX_STEPS = 2000
BATCH_SIZE = 100
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/mnist/fully_connected_feed.py
#
# This example builds a small neural network made of two fully connected
# layers, then wires in the MissingLink SDK so that training, validation and
# testing can be monitored remotely.
import argparse
import math
import os

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import missinglink

# No credentials are passed to the project constructor here.
missinglink_project = missinglink.TensorFlowProject()

# --- Input parameters ---
NUM_CLASSES = 10  # MNIST has 10 classes: the digits 0 through 9.
IMAGE_SIZE = 28  # MNIST images are always 28x28 pixels.
IMAGE_PIXELS = IMAGE_SIZE ** 2

# --- Network parameters ---
HIDDEN1_UNITS = 128
HIDDEN2_UNITS = 32

# --- Training parameters ---
LEARNING_RATE = 0.01
MAX_STEPS = 9000
BATCH_SIZE = 100
X_train, X_test = standard_scale(mnist.train.images, mnist.test.images) n_samples = int(mnist.train.num_examples) training_epochs = 20 batch_size = 128 display_step = 1 autoencoder = AdditiveGaussianNoiseAutoencoder( n_input=784, n_hidden=200, transfer_function=tf.nn.softplus, optimizer=tf.train.AdamOptimizer(learning_rate=0.001), scale=0.01) project = ml.TensorFlowProject(owner_id="your-owner-id", project_token="your-project-token") avg_cost = 0. with project.create_experiment("Autoencoder", custom_metrics={'avg cost': lambda: avg_cost}, monitored_metrics={'cost': autoencoder.cost }) as experiment: for epoch in experiment.epoch_loop(training_epochs): avg_cost = 0. total_batch = int(n_samples / batch_size) # Loop over all batches for i in experiment.batch_loop(total_batch): batch_xs = get_random_block_from_data(X_train, batch_size) # Fit training using batch data with experiment.train():
def main(_):
    """Train and evaluate a small convolutional classifier under an experiment.

    With ``FLAGS.self_test`` set, data comes from ``fake_data()`` and a single
    epoch is run; otherwise the four idx-ubyte archives are fetched with
    ``maybe_download()`` and the first ``VALIDATION_SIZE`` training examples
    are held out for validation. The model has two convolution/pooling stages
    followed by two fully connected layers. Training, validation and test
    metrics are reported through an experiment named "mnist".

    Args:
        _: Unused.
    """
    if FLAGS.self_test:
        print('Running self-test.')
        train_data, train_labels = fake_data(256)
        validation_data, validation_labels = fake_data(EVAL_BATCH_SIZE)
        test_data, test_labels = fake_data(EVAL_BATCH_SIZE)
        num_epochs = 1
    else:
        # Get the data.
        train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
        train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
        test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
        test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')

        # Extract it into numpy arrays.
        train_data = extract_data(train_data_filename, 60000)
        train_labels = extract_labels(train_labels_filename, 60000)
        test_data = extract_data(test_data_filename, 10000)
        test_labels = extract_labels(test_labels_filename, 10000)

        # Generate a validation set.
        validation_data = train_data[:VALIDATION_SIZE, ...]
        validation_labels = train_labels[:VALIDATION_SIZE]
        train_data = train_data[VALIDATION_SIZE:, ...]
        train_labels = train_labels[VALIDATION_SIZE:]
        num_epochs = NUM_EPOCHS
    train_size = train_labels.shape[0]

    # This is where training samples and labels are fed to the graph.
    # These placeholder nodes will be fed a batch of training data at each
    # training step using the {feed_dict} argument to the Run() call below.
    train_data_node = tf.placeholder(
        data_type(),
        shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
    train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE, ))
    eval_data = tf.placeholder(
        data_type(),
        shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))

    # The variables below hold all the trainable weights. They are passed an
    # initial value which will be assigned when we call:
    # {tf.global_variables_initializer().run()}
    conv1_weights = tf.Variable(
        tf.truncated_normal(
            [5, 5, NUM_CHANNELS, 32],  # 5x5 filter, depth 32.
            stddev=0.1,
            seed=SEED,
            dtype=data_type()))
    conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type()))
    conv2_weights = tf.Variable(
        tf.truncated_normal([5, 5, 32, 64],
                            stddev=0.1,
                            seed=SEED,
                            dtype=data_type()))
    conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type()))
    fc1_weights = tf.Variable(  # fully connected, depth 512.
        tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512],
                            stddev=0.1,
                            seed=SEED,
                            dtype=data_type()))
    fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type()))
    fc2_weights = tf.Variable(
        tf.truncated_normal([512, NUM_LABELS],
                            stddev=0.1,
                            seed=SEED,
                            dtype=data_type()))
    fc2_biases = tf.Variable(
        tf.constant(0.1, shape=[NUM_LABELS], dtype=data_type()))

    # We will replicate the model structure for the training subgraph, as well
    # as the evaluation subgraphs, while sharing the trainable parameters.
    def model(data, train=False):
        """The Model definition.

        Args:
            data: Input tensor fed to the first convolution.
            train: When true, dropout is applied to the hidden layer.

        Returns:
            The output of the second fully connected layer (no softmax).
        """
        # 2D convolution, with 'SAME' padding (i.e. the output feature map has
        # the same size as the input). Note that {strides} is a 4D array whose
        # shape matches the data layout: [image index, y, x, depth].
        conv = tf.nn.conv2d(data,
                            conv1_weights,
                            strides=[1, 1, 1, 1],
                            padding='SAME')
        # Bias and rectified linear non-linearity.
        relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))
        # Max pooling. The kernel size spec {ksize} also follows the layout of
        # the data. Here we have a pooling window of 2, and a stride of 2.
        pool = tf.nn.max_pool(relu,
                              ksize=[1, 2, 2, 1],
                              strides=[1, 2, 2, 1],
                              padding='SAME')
        conv = tf.nn.conv2d(pool,
                            conv2_weights,
                            strides=[1, 1, 1, 1],
                            padding='SAME')
        relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
        pool = tf.nn.max_pool(relu,
                              ksize=[1, 2, 2, 1],
                              strides=[1, 2, 2, 1],
                              padding='SAME')
        # Reshape the feature map cuboid into a 2D matrix to feed it to the
        # fully connected layers.
        pool_shape = pool.get_shape().as_list()
        reshape = tf.reshape(
            pool,
            [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])
        # Fully connected layer. Note that the '+' operation automatically
        # broadcasts the biases.
        hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)
        # Add a 50% dropout during training only. Dropout also scales
        # activations such that no rescaling is needed at evaluation time.
        if train:
            hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
        return tf.matmul(hidden, fc2_weights) + fc2_biases

    # Training computation: logits + cross-entropy loss.
    logits = model(train_data_node, True)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=train_labels_node, logits=logits))

    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                    tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
    # Add the regularization term to the loss.
    loss += 5e-4 * regularizers

    # Optimizer: set up a variable that's incremented once per batch and
    # controls the learning rate decay.
    batch = tf.Variable(0, dtype=data_type())
    # Decay once per epoch, using an exponential schedule starting at 0.01.
    learning_rate = tf.train.exponential_decay(
        0.01,  # Base learning rate.
        batch * BATCH_SIZE,  # Current index into the dataset.
        train_size,  # Decay step.
        0.95,  # Decay rate.
        staircase=True)
    # Use simple momentum for the optimization.
    optimizer = tf.train.MomentumOptimizer(learning_rate,
                                           0.9).minimize(loss,
                                                         global_step=batch)

    # Predictions for the current training minibatch.
    train_prediction = tf.nn.softmax(logits)

    # Predictions for the test and validation, which we'll compute less often.
    eval_prediction = tf.nn.softmax(model(eval_data))

    # Small utility function to evaluate a dataset by feeding batches of data to
    # {eval_data} and pulling the results from {eval_predictions}.
    # Saves memory and enables this to run on smaller GPUs.
    def eval_in_batches(data, sess):
        """Get all predictions for a dataset by running it in small batches.

        Args:
            data: Array of examples; its first dimension is the example count.
            sess: Session used to run ``eval_prediction``.

        Returns:
            A float32 numpy array of shape (size, NUM_LABELS).

        Raises:
            ValueError: If the dataset has fewer than EVAL_BATCH_SIZE examples.
        """
        size = data.shape[0]
        if size < EVAL_BATCH_SIZE:
            raise ValueError("batch size for evals larger than dataset: %d" %
                             size)
        predictions = numpy.ndarray(shape=(size, NUM_LABELS),
                                    dtype=numpy.float32)
        for begin in xrange(0, size, EVAL_BATCH_SIZE):
            end = begin + EVAL_BATCH_SIZE
            if end <= size:
                predictions[begin:end, :] = sess.run(
                    eval_prediction,
                    feed_dict={eval_data: data[begin:end, ...]})
            else:
                # Final partial batch: evaluate the last EVAL_BATCH_SIZE rows
                # and keep only the tail that has not been filled in yet.
                batch_predictions = sess.run(
                    eval_prediction,
                    feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
                predictions[begin:, :] = batch_predictions[begin - size:, :]
        return predictions

    project = ml.TensorFlowProject(owner_id="your-owner-id",
                                   project_token="your-project-token")
    with project.create_experiment("mnist") as experiment:
        # Create a local session to run the training.
        start_time = time.time()
        expected = tf.Variable(test_labels, name="expected")
        # NOTE(review): the result of this call is discarded; the initializer
        # that actually runs is the one created inside the session below.
        tf.global_variables_initializer()
        with tf.Session() as sess:
            # Run all the initializers to prepare the trainable parameters.
            tf.global_variables_initializer().run()
            print('Initialized!')
            # Loop through training steps.
            for step in experiment.loop(
                    max_iterations=int(num_epochs * train_size) // BATCH_SIZE):
                # Compute the offset of the current minibatch in the data.
                # Note that we could use better randomization across epochs.
                offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)
                batch_data = train_data[offset:(offset + BATCH_SIZE), ...]
                batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
                # This dictionary maps the batch data (as a numpy array) to the
                # node in the graph it should be fed to.
                feed_dict = {
                    train_data_node: batch_data,
                    train_labels_node: batch_labels
                }
                # Run the optimizer to update weights.
                sess.run(optimizer, feed_dict=feed_dict)
                # print some extra information once reach the evaluation frequency
                if step % EVAL_FREQUENCY == 0:
                    # Read by the 'error rate' lambda below; starts at 0 and
                    # is overwritten once the predictions are available.
                    minibatch_error = 0.
                    # fetch some extra nodes' data
                    with experiment.train(monitored_metrics={
                            'loss': loss,
                            'learning rate': learning_rate
                    }, custom_metrics={
                            'error rate': lambda: minibatch_error
                    }):
                        l, lr, predictions = sess.run(
                            [loss, learning_rate, train_prediction],
                            feed_dict=feed_dict)
                        elapsed_time = time.time() - start_time
                        start_time = time.time()
                        print('Step %d (epoch %.2f), %.1f ms' %
                              (step, float(step) * BATCH_SIZE / train_size,
                               1000 * elapsed_time / EVAL_FREQUENCY))
                        print('Minibatch loss: %.3f, learning rate: %.6f' %
                              (l, lr))
                        minibatch_error = error_rate(predictions, batch_labels)
                        print('Minibatch error: %.1f%%' % minibatch_error)
                    # Read by the 'validation error' lambda below.
                    validation_error = 0.
                    with experiment.validation(
                            custom_metrics={
                                'validation error': lambda: validation_error
                            }):
                        validation_error = error_rate(
                            eval_in_batches(validation_data, sess),
                            validation_labels)
                        print('Validation error: %.1f%%' % validation_error)
                    sys.stdout.flush()
            # Finally print the result!
            with experiment.test(expected=expected, predicted=eval_prediction):
                test_error = error_rate(eval_in_batches(test_data, sess),
                                        test_labels)
                print('Test error: %.1f%%' % test_error)
                if FLAGS.self_test:
                    print('test_error', test_error)
                    assert test_error == 0.0, 'expected 0.0 test_error, got %.2f' % (
                        test_error, )
import time
import tensorflow as tf
import missinglink

# Demo script: evaluates a constant graph once per batch and reports a
# synthetic "loss" metric to a MissingLink experiment.

NUM_EPOCHS = 4
NUM_BATCHES = 10

missinglink_project = missinglink.TensorFlowProject(
    project_token='KzcbCxZWjewiqxCi')

# Build a graph.
a = tf.constant(5.0)
b = tf.constant(6.0)
c = a * b

with missinglink_project.create_experiment() as experiment:
    for epoch in experiment.epoch_loop(NUM_EPOCHS):
        for batch in experiment.batch_loop(NUM_BATCHES):
            time.sleep(0.5)

            with experiment.train():
                # Launch the graph in a session. The session is still created
                # inside the `experiment.train` scope, but the context manager
                # now closes it at the end of every batch instead of leaking
                # one open session per iteration.
                with tf.Session() as sess:
                    # Evaluate the tensor `c`.
                    print('sess run result', sess.run(c))

                # Synthetic metric derived from the loop counters only.
                loss = batch * 0.1 - epoch * 0.05
                print('loss', loss)
                experiment.add_metric('loss', loss)
def main(unused_argv):
    """Load the data, build a Trainer and run it inside a tracked experiment.

    Args:
        unused_argv: Ignored.
    """
    training_set, validation_set = data_utils.get_data()

    # Third Trainer argument: the square of data_utils.IMAGE_NEW_SIZE.
    squared_image_size = data_utils.IMAGE_NEW_SIZE ** 2
    memory_trainer = Trainer(training_set, validation_set, squared_image_size)

    tracking_project = ml.TensorFlowProject(
        owner_id="your-owner-id",
        project_token="your-project-token",
    )
    with tracking_project.create_experiment(
            display_name="Learning to remeber") as tracked_run:
        memory_trainer.run(tracked_run)
def train():
    """Train the entropy coder model selected by ``FLAGS.model``.

    Reads the input and training configs, builds the model graph inside a
    tracked experiment named "Entropy Coder", and runs training steps under a
    ``tf.train.Supervisor`` until the supervisor signals a stop.

    Raises:
        ValueError: If ``FLAGS.train_dir``, ``FLAGS.task`` or ``FLAGS.model``
            is None, or if the created model has no ``train_op``.
    """
    if FLAGS.train_dir is None:
        raise ValueError('Parameter train_dir must be provided')
    if FLAGS.task is None:
        raise ValueError('Parameter task must be provided')
    if FLAGS.model is None:
        raise ValueError('Parameter model must be provided')

    input_config_string = config_helper.GetConfigString(FLAGS.input_config)
    input_config = config_helper.InputConfig(input_config_string)

    # Training parameters.
    train_config_string = config_helper.GetConfigString(FLAGS.train_config)
    train_config = config_helper.TrainConfig(train_config_string)

    batch_size = train_config.batch_size
    initial_learning_rate = train_config.learning_rate
    decay_rate = train_config.decay_rate
    samples_per_decay = train_config.samples_per_decay

    # Parameters for learning-rate decay.
    # The formula is decay_rate ** floor(steps / decay_steps).
    decay_steps = samples_per_decay / batch_size
    decay_steps = max(decay_steps, 1)

    # The first stored code provides the code shape: height, width, depth.
    first_code = code_loader.ReadFirstCode(input_config.data)
    first_code_height = (
        first_code.features.feature['code_shape'].int64_list.value[0])
    first_code_width = (
        first_code.features.feature['code_shape'].int64_list.value[1])
    max_bit_depth = (
        first_code.features.feature['code_shape'].int64_list.value[2])
    print('Maximum code depth: {}'.format(max_bit_depth))

    project = ml.TensorFlowProject(owner_id="your-owner-id",
                                   project_token="your-project-token")
    with project.create_experiment("Entropy Coder") as experiment:
        with tf.Graph().as_default():
            ps_ops = [
                "Variable", "VariableV2", "AutoReloadVariable", "VarHandleOp"
            ]
            with tf.device(
                    tf.train.replica_device_setter(FLAGS.ps_tasks,
                                                   ps_ops=ps_ops)):
                codes = code_loader.LoadBinaryCode(input_config=input_config,
                                                   batch_size=batch_size)
                if input_config.unique_code_size:
                    print('Input code size: {} x {}'.format(
                        first_code_height, first_code_width))
                    codes.set_shape([
                        batch_size, first_code_height, first_code_width,
                        max_bit_depth
                    ])
                else:
                    codes.set_shape([batch_size, None, None, max_bit_depth])
                codes_effective_shape = tf.shape(codes)

                global_step = tf.contrib.framework.create_global_step()

                # Apply learning-rate decay.
                learning_rate = tf.train.exponential_decay(
                    learning_rate=initial_learning_rate,
                    global_step=global_step,
                    decay_steps=decay_steps,
                    decay_rate=decay_rate,
                    staircase=True)
                tf.summary.scalar('Learning Rate', learning_rate)
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=learning_rate, epsilon=1.0)

                # Create the entropy coder model.
                model = model_factory.GetModelRegistry().CreateModel(
                    FLAGS.model)
                model_config_string = config_helper.GetConfigString(
                    FLAGS.model_config)
                model.Initialize(global_step, optimizer, model_config_string)
                model.BuildGraph(codes)

            summary_op = tf.summary.merge_all()

            # Verify that the model can actually be trained.
            if model.train_op is None:
                raise ValueError('Input model {} is not trainable'.format(
                    FLAGS.model))

            # We disable the summary thread run by Supervisor class by passing
            # summary_op=None. We still pass save_summaries_secs because it is
            # used by the global step counter thread.
            is_chief = (FLAGS.task == 0)
            sv = tf.train.Supervisor(
                logdir=FLAGS.train_dir,
                is_chief=is_chief,
                global_step=global_step,
                # saver=model.saver,
                summary_op=None,
                save_summaries_secs=120,
                save_model_secs=600,
                recovery_wait_secs=30)

            sess = sv.PrepareSession(FLAGS.master)
            sv.StartQueueRunners(sess)

            step = sess.run(global_step)
            print('Trainer initial step: {}.'.format(step))

            # Once everything has been setup properly, save the configs.
            if is_chief:
                config_helper.SaveConfig(FLAGS.train_dir, 'input_config.json',
                                         input_config_string)
                config_helper.SaveConfig(FLAGS.train_dir, 'model_config.json',
                                         model_config_string)
                config_helper.SaveConfig(FLAGS.train_dir, 'train_config.json',
                                         train_config_string)

            # Train the model.
            next_summary_time = time.time()
            for step in experiment.loop(
                    condition=lambda _: not sv.ShouldStop()):
                feed_dict = None

                # Once in a while, update the summaries on the chief worker.
                if is_chief and next_summary_time < time.time():
                    summary_str = sess.run(summary_op, feed_dict=feed_dict)
                    sv.SummaryComputed(sess, summary_str)
                    next_summary_time = time.time(
                    ) + sv.save_summaries_secs
                else:
                    tf_tensors = {
                        'train': model.train_op,
                        'code_length': model.average_code_length
                    }
                    with experiment.train(monitored_metrics={
                            'avg code length': model.average_code_length
                    }):
                        np_tensors = sess.run(tf_tensors, feed_dict=feed_dict)
                    # Call form of print: valid on Python 3 and prints the
                    # same single value on Python 2, matching the other
                    # print(...) calls in this function.
                    print(np_tensors['code_length'])

            sv.Stop()
--train_vggish=False \ --checkpoint /path/to/model/checkpoint """ from __future__ import print_function from random import shuffle import numpy as np import tensorflow as tf import vggish_input import vggish_params import vggish_slim import missinglink project = missinglink.TensorFlowProject(owner_id="your-owner-id", project_token="your-project-token") flags = tf.app.flags slim = tf.contrib.slim flags.DEFINE_integer( 'num_batches', 30, 'Number of batches of examples to feed into the model. Each batch is of ' 'variable size and contains shuffled examples of each class of audio.') flags.DEFINE_boolean( 'train_vggish', True, 'If Frue, allow VGGish parameters to change during training, thus ' 'fine-tuning VGGish. If False, VGGish parameters are fixed, thus using ' 'VGGish as a fixed feature extractor.')
def run(target, cluster_spec, is_chief, train_steps, eval_steps, job_dir,
        train_files, eval_files, train_batch_size, eval_batch_size,
        learning_rate, eval_frequency, first_layer_size, num_layers,
        scale_factor, num_epochs, export_format):
    """Run the training and evaluation graph.

    Training runs inside a tracked experiment named 'Census'. On the chief,
    an evaluation graph and hook are also built and the model is exported
    from the latest checkpoint once training ends.

    Args:
      target (string): Tensorflow server target
      cluster_spec: Cluster description passed to
                    tf.train.replica_device_setter(cluster=...)
      is_chief (bool): Boolean flag to specify a chief server
      train_steps (int): Maximum number of training steps; None means no
                         step limit
      eval_steps (int): Number of steps to run evaluation for at each
                        checkpoint. If eval_steps is None, evaluation will
                        run for 1 epoch.
      job_dir (string): Output dir for checkpoint and summary
      train_files (string): List of CSV files to read train data
      eval_files (string): List of CSV files to read eval data
      train_batch_size (int): Batch size for training
      eval_batch_size (int): Batch size for evaluation
      learning_rate (float): Learning rate for Gradient Descent
      eval_frequency (int): Run evaluation frequency every n training steps.
                            Do not evaluate too frequently otherwise you will
                            pay for performance and do not evaluate too
                            infrequently otherwise you will not know how soon
                            to stop training. Use default values to start with
      first_layer_size (int): Size of the first DNN layer
      num_layers (int): Number of hidden layers in the DNN
      scale_factor (float): Decay rate for the size of hidden layers
      num_epochs (int): Maximum number of training data epochs on which to
                        train
      export_format (str): One of 'JSON', 'CSV' or 'EXAMPLE'. The input format
                           for the output saved_model binary.
    """
    # Calculate the number of hidden units: layer i gets
    # first_layer_size * scale_factor**i units, with a floor of 2.
    hidden_units = [
        max(2, int(first_layer_size * scale_factor**i))
        for i in range(num_layers)
    ]

    # If the server is chief which is `master`
    # In between graph replication Chief is one node in
    # the cluster with extra responsibility and by default
    # is worker task zero. We have assigned master as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on
    # distributed TensorFlow and motivation about chief.
    if is_chief:
        tf.logging.info("Created DNN hidden units {}".format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():
            # Features and label tensors
            features, labels = model.input_fn(
                eval_files,
                num_epochs=None if eval_steps else 1,
                batch_size=eval_batch_size,
                shuffle=False)
            # Accuracy and AUROC metrics
            # model.model_fn returns the dict when EVAL mode
            metric_dict = model.model_fn(model.EVAL,
                                         features.copy(),
                                         labels,
                                         hidden_units=hidden_units,
                                         learning_rate=learning_rate)

        hooks = [
            EvaluationRunHook(
                job_dir,
                metric_dict,
                evaluation_graph,
                eval_frequency,
                eval_steps=eval_steps,
            )
        ]
    else:
        hooks = []

    # Create a new graph and specify that as default
    with tf.Graph().as_default():
        # Placement of ops on devices using replica device setter
        # which automatically places the parameters on the `ps` server
        # and the `ops` on the workers
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            # Features and label tensors as read using filename queue
            features, labels = model.input_fn(train_files,
                                              num_epochs=num_epochs,
                                              batch_size=train_batch_size)

            # Returns the training graph and global step tensor
            loss, train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        project = ml.TensorFlowProject(owner_id="your-owner-id",
                                       project_token="your-project-token")
        with project.create_experiment(display_name='Census') as experiment:
            # Creates a MonitoredSession for training
            # MonitoredSession is a Session-like object that handles
            # initialization, recovery and hooks
            # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
            with tf.train.MonitoredTrainingSession(
                    master=target,
                    is_chief=is_chief,
                    checkpoint_dir=job_dir,
                    hooks=hooks,
                    save_checkpoint_secs=20,
                    save_summaries_steps=50) as session:
                # Global step to keep track of global number of steps
                # particularly in distributed setting
                step = global_step_tensor.eval(session=session)

                # Run the training graph which returns the step number as
                # tracked by the global step tensor.
                # When train epochs is reached, session.should_stop() will be
                # true.
                # NOTE(review): the stop condition compares the value that
                # experiment.loop passes to the lambda (`i`) with train_steps;
                # the `step` fetched from global_step_tensor is not part of
                # the condition -- confirm this is intended.
                for _ in experiment.loop(condition=lambda i: (
                        train_steps is None or i < train_steps) and not session
                                         .should_stop()):
                    with experiment.train(monitored_metrics={'loss': loss}):
                        step, _ = session.run([global_step_tensor, train_op])

        # Find the filename of the latest saved checkpoint file
        latest_checkpoint = tf.train.latest_checkpoint(job_dir)

        # Only perform this if chief
        if is_chief:
            build_and_run_exports(latest_checkpoint, job_dir,
                                  model.SERVING_INPUT_FUNCTIONS[export_format],
                                  hidden_units)
def train(dataset):
    """Train on dataset for a number of steps.

    Builds one model tower per GPU (``FLAGS.num_gpus``), averages the tower
    gradients, and runs ``FLAGS.max_steps`` training steps inside a tracked
    experiment, writing summaries and checkpoints to ``FLAGS.train_dir``.

    Args:
        dataset: Object providing ``num_examples_per_epoch()`` and
            ``num_classes()``; it is also passed to
            ``image_processing.distorted_inputs``.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals
        # the number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.RMSPropOptimizer(lr,
                                        RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        # Get images and labels for ImageNet and split the batch across GPUs.
        assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
            'Batch size must be divisible by number of GPUs')
        # NOTE(review): split_batch_size is not referenced again in this
        # function.
        split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

        # Override the number of preprocessing threads to account for the
        # increased number of GPU towers.
        num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
        images, labels = image_processing.distorted_inputs(
            dataset, num_preprocess_threads=num_preprocess_threads)

        # Snapshot the summaries that exist before any tower is built.
        input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Number of classes in the Dataset label set plus 1.
        # Label 0 is reserved for an (unused) background class.
        num_classes = dataset.num_classes() + 1

        # Split the batch of images and labels for towers.
        images_splits = tf.split(axis=0,
                                 num_or_size_splits=FLAGS.num_gpus,
                                 value=images)
        labels_splits = tf.split(axis=0,
                                 num_or_size_splits=FLAGS.num_gpus,
                                 value=labels)

        # Calculate the gradients for each model tower.
        tower_grads = []
        reuse_variables = None
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' %
                                   (inception.TOWER_NAME, i)) as scope:
                    # Force all Variables to reside on the CPU.
                    with slim.arg_scope([slim.variables.variable],
                                        device='/cpu:0'):
                        # Calculate the loss for one tower of the ImageNet
                        # model. This function constructs the entire ImageNet
                        # model but shares the variables across all towers.
                        loss = _tower_loss(images_splits[i], labels_splits[i],
                                           num_classes, scope,
                                           reuse_variables)

                    # Reuse variables for the next tower.
                    reuse_variables = True

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                  scope)

                    # Retain the Batch Normalization updates operations only
                    # from the final tower. Ideally, we should grab the updates
                    # from all towers but these stats accumulate extremely fast
                    # so we can ignore the other stats from the other towers
                    # without significant detriment.
                    batchnorm_updates = tf.get_collection(
                        slim.ops.UPDATE_OPS_COLLECTION, scope)

                    # Calculate the gradients for the batch of data on this
                    # ImageNet tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = _average_gradients(tower_grads)

        # Add the summaries for the input processing and global_step.
        summaries.extend(input_summaries)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than need be but we
        # employ this for backward-compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            inception.MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Optionally restore weights from a pre-trained checkpoint.
        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=sess.graph)

        project = missinglink.TensorFlowProject(
            owner_id="your-owner-id", project_token="your-project-token")
        with project.create_experiment(display_name='Inception with flowers',
                                       optimizer=opt) as experiment:
            for step in experiment.loop(FLAGS.max_steps):
                start_time = time.time()
                # `loss` here is the loss tensor of the last tower built in
                # the loop above.
                with experiment.train(monitored_metrics={'loss': loss}):
                    _, loss_value = sess.run([train_op, loss])
                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                # Log throughput every 10 steps.
                if step % 10 == 0:
                    examples_per_sec = FLAGS.batch_size / float(duration)
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, duration))

                # Write summaries every 100 steps.
                if step % 100 == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)