def input_fn():
  tf.logging.info("Loading MNIST into memory.")
  return mnist.load_mnist(data_dir,
                          num_epochs=num_epochs,
                          batch_size=64,
                          flatten_images=False,
                          use_fake_data=use_fake_data)
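# Note: the functions in this file assume module-level imports along the lines
# of `import os`, `import tensorflow as tf`, `import kfac`, plus the example
# helpers `mnist` and `data_reader` (exact import paths are assumptions, since
# these functions are excerpted from a larger module).
#
# `input_fn` above closes over `data_dir`, `num_epochs`, and `use_fake_data`
# from an enclosing scope, so on its own it is not runnable. The sketch below
# is a hypothetical driver showing how such an input_fn is typically defined
# inside a training function and handed to a tf.estimator-style loop; the
# `model_fn` and `model_dir` arguments are assumptions, not part of this file.
def _train_with_estimator_sketch(data_dir, num_epochs, model_fn, model_dir,
                                 use_fake_data=False):
  """Hypothetical driver wiring `input_fn` into a tf.estimator.Estimator."""

  def input_fn():
    tf.logging.info("Loading MNIST into memory.")
    return mnist.load_mnist(data_dir,
                            num_epochs=num_epochs,
                            batch_size=64,
                            flatten_images=False,
                            use_fake_data=use_fake_data)

  # `model_fn` is assumed to build the ConvNet (e.g. via `build_model`) and
  # return a tf.estimator.EstimatorSpec.
  estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir)
  estimator.train(input_fn=input_fn)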
def train_mnist_single_machine(data_dir, num_epochs, use_fake_data=False):
  """Train a ConvNet on MNIST.

  Args:
    data_dir: string. Directory to read MNIST examples from.
    num_epochs: int. Number of passes to make over the training set.
    use_fake_data: bool. If True, generate a synthetic dataset.

  Returns:
    accuracy of model on the final minibatch of training data.
  """
  # Load a dataset.
  tf.logging.info("Loading MNIST into memory.")
  examples, labels = mnist.load_mnist(data_dir,
                                      num_epochs=num_epochs,
                                      batch_size=128,
                                      use_fake_data=use_fake_data,
                                      flatten_images=False)

  # Build a ConvNet.
  layer_collection = kfac.LayerCollection()
  loss, accuracy = build_model(
      examples, labels, num_labels=10, layer_collection=layer_collection)

  # Fit model.
  return minimize_loss_single_machine(loss, accuracy, layer_collection)
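# `build_model` is referenced throughout this file but defined elsewhere. The
# sketch below is a minimal illustration, not the actual implementation, of
# how a small ConvNet's layers and loss can be registered with a
# `kfac.LayerCollection` so K-FAC can build the corresponding Fisher blocks.
# The layer shapes, variable names, and hyperparameters here are assumptions.
def _build_model_sketch(examples, labels, num_labels, layer_collection):
  """Builds a tiny one-conv-layer classifier and registers it with K-FAC."""
  # A single 5x5 conv layer followed by a dense softmax classifier.
  conv_w = tf.get_variable("conv_w", shape=[5, 5, 1, 16])
  conv_b = tf.get_variable("conv_b", shape=[16])
  preactivations = tf.nn.conv2d(
      examples, conv_w, strides=[1, 1, 1, 1], padding="SAME") + conv_b
  activations = tf.nn.relu(preactivations)

  flat = tf.reshape(activations, [-1, 28 * 28 * 16])
  fc_w = tf.get_variable("fc_w", shape=[28 * 28 * 16, num_labels])
  fc_b = tf.get_variable("fc_b", shape=[num_labels])
  logits = tf.matmul(flat, fc_w) + fc_b

  loss = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                     logits=logits))
  accuracy = tf.reduce_mean(
      tf.cast(tf.equal(tf.cast(labels, tf.int64),
                       tf.argmax(logits, axis=1)), tf.float32))

  # Register each parameterized layer and the predictive distribution so that
  # K-FAC knows which Fisher approximation to use for each block.
  layer_collection.register_conv2d((conv_w, conv_b), [1, 1, 1, 1], "SAME",
                                   examples, preactivations)
  layer_collection.register_fully_connected((fc_w, fc_b), flat, logits)
  layer_collection.register_categorical_predictive_distribution(logits)
  return loss, accuracy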
def train_mnist_multitower(data_dir, num_epochs, num_towers,
                           use_fake_data=True):
  """Train a ConvNet on MNIST.

  Args:
    data_dir: string. Directory to read MNIST examples from.
    num_epochs: int. Number of passes to make over the training set.
    num_towers: int. Number of CPU towers to split the minibatch across.
    use_fake_data: bool. If True, generate a synthetic dataset.

  Returns:
    accuracy of model on the final minibatch of training data.
  """
  # Load a dataset.
  tower_batch_size = 128
  batch_size = tower_batch_size * num_towers
  tf.logging.info(
      ("Loading MNIST into memory. Using batch_size = %d = %d towers * %d "
       "tower batch size.") % (batch_size, num_towers, tower_batch_size))
  examples, labels = mnist.load_mnist(data_dir,
                                      num_epochs=num_epochs,
                                      batch_size=batch_size,
                                      use_fake_data=use_fake_data,
                                      flatten_images=False)

  # Split minibatch across towers.
  examples = tf.split(examples, num_towers)
  labels = tf.split(labels, num_towers)

  # Build a ConvNet. Each tower's layers will be added to the LayerCollection.
  layer_collection = kfac.LayerCollection()
  tower_results = []
  for tower_id in range(num_towers):
    with tf.device("/cpu:%d" % tower_id):
      with tf.name_scope("tower%d" % tower_id):
        with tf.variable_scope(tf.get_variable_scope(), reuse=(tower_id > 0)):
          tf.logging.info("Building tower %d." % tower_id)
          tower_results.append(
              build_model(examples[tower_id], labels[tower_id], 10,
                          layer_collection))
  losses, accuracies = zip(*tower_results)

  # Average across towers.
  loss = tf.reduce_mean(losses)
  accuracy = tf.reduce_mean(accuracies)

  # Fit model.
  session_config = tf.ConfigProto(
      allow_soft_placement=False,
      device_count={"CPU": num_towers})
  return minimize_loss_single_machine(
      loss, accuracy, layer_collection, session_config=session_config)
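# `minimize_loss_single_machine` is called above and below but defined
# elsewhere. The sketch that follows is a minimal, assumed implementation
# built from the same `kfac.PeriodicInvCovUpdateKfacOpt` constructor arguments
# and module constants (_INVERT_EVERY, _COV_UPDATE_EVERY, _REPORT_EVERY) that
# appear in `train_mnist_multitower` further down; the hyperparameters and the
# `device`/`session_config` handling are assumptions, not the canonical code.
def _minimize_loss_single_machine_sketch(loss, accuracy, layer_collection,
                                         device=None, session_config=None):
  """Trains with K-FAC, periodically refreshing covariance/inverse estimates."""
  devices = [device] if device else None
  g_step = tf.train.get_or_create_global_step()
  optimizer = kfac.PeriodicInvCovUpdateKfacOpt(
      invert_every=_INVERT_EVERY,
      cov_update_every=_COV_UPDATE_EVERY,
      learning_rate=0.0001,
      cov_ema_decay=0.95,
      damping=0.001,
      layer_collection=layer_collection,
      placement_strategy="round_robin",
      cov_devices=devices,
      inv_devices=devices,
      momentum=0.9)
  train_op = optimizer.minimize(loss, global_step=g_step)

  accuracy_ = None
  with tf.train.MonitoredTrainingSession(config=session_config) as sess:
    while not sess.should_stop():
      global_step_, loss_, accuracy_, _ = sess.run(
          [g_step, loss, accuracy, train_op])
      if global_step_ % _REPORT_EVERY == 0:
        tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
                        global_step_, loss_, accuracy_)
  return accuracy_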
def train_mnist_single_machine(data_dir,
                               num_epochs,
                               use_fake_data=False,
                               device=None,
                               manual_op_exec=False):
  """Train a ConvNet on MNIST.

  Args:
    data_dir: string. Directory to read MNIST examples from.
    num_epochs: int. Number of passes to make over the training set.
    use_fake_data: bool. If True, generate a synthetic dataset.
    device: string or None. The covariance and inverse update ops are run on
      this device. If empty or None, the default device will be used.
      (Default: None)
    manual_op_exec: bool. If True, `minimize_loss_single_machine_manual` is
      called for training, which handles inverse and covariance computation
      manually. This is shown only for illustrative purposes. Otherwise
      `minimize_loss_single_machine` is called, which relies on
      `PeriodicInvCovUpdateKfacOpt` for op placement and execution.

  Returns:
    accuracy of model on the final minibatch of training data.
  """
  # Load a dataset.
  tf.logging.info("Loading MNIST into memory.")
  examples, labels = mnist.load_mnist(data_dir,
                                      num_epochs=num_epochs,
                                      batch_size=128,
                                      use_fake_data=use_fake_data,
                                      flatten_images=False)

  # Build a ConvNet.
  layer_collection = kfac.LayerCollection()
  loss, accuracy = build_model(examples, labels, num_labels=10,
                               layer_collection=layer_collection)

  # Fit model.
  if manual_op_exec:
    return minimize_loss_single_machine_manual(loss, accuracy,
                                               layer_collection, device=device)
  else:
    return minimize_loss_single_machine(loss, accuracy, layer_collection,
                                        device=device)
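# `minimize_loss_single_machine_manual` (the manual_op_exec=True path above)
# is defined elsewhere. The sketch below illustrates the general idea only:
# instead of letting `PeriodicInvCovUpdateKfacOpt` schedule the covariance and
# inverse updates, the training loop runs them explicitly. The use of
# `kfac.KfacOptimizer` and `make_vars_and_create_op_thunks()` reflects the
# library's thunk-based API as best understood here; treat the exact method
# names, device handling, and hyperparameters as assumptions.
def _minimize_loss_single_machine_manual_sketch(loss, accuracy,
                                                layer_collection, device=None):
  """Manually interleaves covariance/inverse updates with training steps."""
  g_step = tf.train.get_or_create_global_step()
  optimizer = kfac.KfacOptimizer(
      learning_rate=0.0001,
      cov_ema_decay=0.95,
      damping=0.001,
      layer_collection=layer_collection,
      momentum=0.9)

  # An empty device string leaves placement to TensorFlow's defaults.
  with tf.device(device or ""):
    # Each thunk, when called, creates the op that refreshes one factor.
    cov_update_thunks, inv_update_thunks = (
        optimizer.make_vars_and_create_op_thunks())
    cov_update_op = tf.group(*(thunk() for thunk in cov_update_thunks))
    inv_update_op = tf.group(*(thunk() for thunk in inv_update_thunks))

  train_op = optimizer.minimize(loss, global_step=g_step)

  accuracy_ = None
  with tf.train.MonitoredTrainingSession() as sess:
    while not sess.should_stop():
      # Refresh covariance estimates every step, inverses only periodically.
      sess.run(cov_update_op)
      global_step_, loss_, accuracy_, _ = sess.run(
          [g_step, loss, accuracy, train_op])
      if global_step_ % _INVERT_EVERY == 0:
        sess.run(inv_update_op)
      if global_step_ % _REPORT_EVERY == 0:
        tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
                        global_step_, loss_, accuracy_)
  return accuracy_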
def train_mnist_distributed(task_id,
                            num_worker_tasks,
                            num_ps_tasks,
                            master,
                            data_dir,
                            num_epochs,
                            use_fake_data=False):
  """Train a ConvNet on MNIST.

  Args:
    task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
    num_worker_tasks: int. Number of workers in this distributed training
      setup.
    num_ps_tasks: int. Number of parameter servers holding variables.
    master: string. IP and port of TensorFlow runtime process.
    data_dir: string. Directory to read MNIST examples from.
    num_epochs: int. Number of passes to make over the training set.
    use_fake_data: bool. If True, generate a synthetic dataset.

  Returns:
    accuracy of model on the final minibatch of training data.
  """
  # Load a dataset.
  tf.logging.info("Loading MNIST into memory.")
  examples, labels = mnist.load_mnist(data_dir,
                                      num_epochs=num_epochs,
                                      batch_size=128,
                                      use_fake_data=use_fake_data,
                                      flatten_images=False)

  # Build a ConvNet.
  layer_collection = kfac.LayerCollection()
  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
    loss, accuracy = build_model(
        examples, labels, num_labels=10, layer_collection=layer_collection)

  # Fit model.
  checkpoint_dir = None if data_dir is None else os.path.join(data_dir, "kfac")
  return minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks,
                                   master, checkpoint_dir, loss, accuracy,
                                   layer_collection)
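# `train_mnist_distributed` expects to be launched once per worker task as
# part of a TensorFlow cluster. A minimal, hypothetical launcher is sketched
# below; the host:port addresses, cluster sizes, and data directory are
# assumptions, and `minimize_loss_distributed` itself is defined elsewhere.
# (Parameter-server tasks would instead start a tf.train.Server with
# job_name="ps" and call server.join().)
def _run_distributed_worker_sketch(task_id):
  """Starts one worker of an assumed 2-worker / 1-ps cluster and trains."""
  cluster = tf.train.ClusterSpec({
      "worker": ["localhost:2222", "localhost:2223"],  # assumed addresses
      "ps": ["localhost:2224"],
  })
  server = tf.train.Server(cluster, job_name="worker", task_index=task_id)
  return train_mnist_distributed(
      task_id=task_id,
      num_worker_tasks=2,
      num_ps_tasks=1,
      master=server.target,
      data_dir="/tmp/mnist",  # assumed path
      num_epochs=5)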
def load_mnist(batch_size):
  """Creates an MNIST dataset and wraps it inside a cached data reader.

  Args:
    batch_size: Scalar placeholder variable which needs to be fed to read
      variable-sized training data.

  Returns:
    cached_reader: `data_reader.CachedDataReader` instance which wraps the
      MNIST dataset.
    training_batch: Tensor of shape `[batch_size, 784]`, MNIST training images.
  """
  # Create an MNIST data batch with the maximum training batch size.
  data_set = mnist.load_mnist(FLAGS.data_dir,
                              num_epochs=FLAGS.num_epochs,
                              batch_size=_BATCH_SIZE,
                              flatten_images=True)
  # Wrap the data set in a cached reader, which provides variable-sized
  # training batches and caches the batch it has read.
  cached_reader = data_reader.CachedDataReader(data_set, _BATCH_SIZE)
  return cached_reader, cached_reader(batch_size)[0]
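# Usage sketch for `load_mnist` above: the cached reader holds a batch of
# `_BATCH_SIZE` examples, and the scalar `batch_size` placeholder lets each
# training step consume a smaller, variable-sized slice of that cached batch.
# The placeholder name and example feed values are assumptions.
def _cached_reader_usage_sketch():
  """Shows how the scalar batch-size placeholder pairs with the cached reader."""
  batch_size = tf.placeholder(shape=(), dtype=tf.int32, name="batch_size")
  cached_reader, training_batch = load_mnist(batch_size)
  # Later, inside a (Monitored)Session, each step feeds the batch size it
  # wants, e.g. sess.run(train_op, feed_dict={batch_size: 32}), and the reader
  # serves that many examples from its cached maximum-size batch.
  return cached_reader, training_batch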
def train_mnist_distributed_sync_replicas(task_id,
                                          is_chief,
                                          num_worker_tasks,
                                          num_ps_tasks,
                                          master,
                                          data_dir,
                                          num_epochs,
                                          op_strategy,
                                          use_fake_data=False):
  """Train a ConvNet on MNIST using the sync replicas optimizer.

  Args:
    task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
    is_chief: `boolean`, `True` if the worker is the chief worker.
    num_worker_tasks: int. Number of workers in this distributed training
      setup.
    num_ps_tasks: int. Number of parameter servers holding variables.
    master: string. IP and port of TensorFlow runtime process.
    data_dir: string. Directory to read MNIST examples from.
    num_epochs: int. Number of passes to make over the training set.
    op_strategy: `string`, strategy to run the covariance and inverse ops.
      If op_strategy == `chief_worker`, covariance and inverse update ops are
      run on the chief worker; otherwise they are run on dedicated workers.
    use_fake_data: bool. If True, generate a synthetic dataset.

  Returns:
    accuracy of model on the final minibatch of training data.

  Raises:
    ValueError: If `op_strategy` is not in ["chief_worker",
      "dedicated_workers"].
  """
  # Load a dataset.
  tf.logging.info("Loading MNIST into memory.")
  examples, labels = mnist.load_mnist(data_dir,
                                      num_epochs=num_epochs,
                                      batch_size=128,
                                      use_fake_data=use_fake_data,
                                      flatten_images=False)

  # Build a ConvNet.
  layer_collection = kfac.LayerCollection()
  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
    loss, accuracy = build_model(examples, labels, num_labels=10,
                                 layer_collection=layer_collection)

  # Fit model.
  checkpoint_dir = None if data_dir is None else os.path.join(data_dir, "kfac")
  if op_strategy == "chief_worker":
    return distributed_grads_only_and_ops_chief_worker(
        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
        checkpoint_dir, loss, accuracy, layer_collection)
  elif op_strategy == "dedicated_workers":
    return distributed_grads_and_ops_dedicated_workers(
        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
        checkpoint_dir, loss, accuracy, layer_collection)
  else:
    raise ValueError("Only supported op strategies are: {}, {}".format(
        "chief_worker", "dedicated_workers"))
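# The only caller-visible difference between the two op strategies above is
# the `op_strategy` string; the rest of the call is identical. A tiny wrapper
# illustrating the choice (cluster sizes and the data path are assumptions and
# would normally come from command-line flags):
def _train_sync_replicas_sketch(task_id, is_chief, master,
                                run_ops_on_chief=True):
  """Selects where covariance/inverse ops run and starts sync-replicas training."""
  op_strategy = "chief_worker" if run_ops_on_chief else "dedicated_workers"
  return train_mnist_distributed_sync_replicas(
      task_id=task_id,
      is_chief=is_chief,
      num_worker_tasks=4,      # assumed cluster size
      num_ps_tasks=1,
      master=master,
      data_dir="/tmp/mnist",   # assumed path
      num_epochs=5,
      op_strategy=op_strategy)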
def train_mnist_multitower(data_dir, num_epochs, num_towers, devices,
                           use_fake_data=True, session_config=None):
  """Train a ConvNet on MNIST.

  Training data is split equally among the towers. Each tower computes the
  loss on its own batch of data, and the losses are aggregated on the CPU.
  The model variables are placed on the first tower. The covariance and
  inverse update ops and variables are placed on the specified devices in a
  round-robin manner.

  Args:
    data_dir: string. Directory to read MNIST examples from.
    num_epochs: int. Number of passes to make over the training set.
    num_towers: int. Number of towers.
    devices: list of strings. List of devices on which to place the towers.
    use_fake_data: bool. If True, generate a synthetic dataset.
    session_config: None or tf.ConfigProto. Configuration for the training
      session.

  Returns:
    accuracy of model on the final minibatch of training data.
  """
  num_towers = 1 if not devices else len(devices)

  # Load a dataset.
  tower_batch_size = 128
  batch_size = tower_batch_size * num_towers
  tf.logging.info(
      ("Loading MNIST into memory. Using batch_size = %d = %d towers * %d "
       "tower batch size.") % (batch_size, num_towers, tower_batch_size))
  examples, labels = mnist.load_mnist(data_dir,
                                      num_epochs=num_epochs,
                                      batch_size=batch_size,
                                      use_fake_data=use_fake_data,
                                      flatten_images=False)

  # Split minibatch across towers.
  examples = tf.split(examples, num_towers)
  labels = tf.split(labels, num_towers)

  # Build a ConvNet. Each tower's layers will be added to the LayerCollection.
  layer_collection = kfac.LayerCollection()
  tower_results = []
  for tower_id in range(num_towers):
    with tf.device(devices[tower_id]):
      with tf.name_scope("tower%d" % tower_id):
        with tf.variable_scope(tf.get_variable_scope(), reuse=(tower_id > 0)):
          tf.logging.info("Building tower %d." % tower_id)
          tower_results.append(
              build_model(examples[tower_id], labels[tower_id], 10,
                          layer_collection,
                          register_layers=(tower_id == num_towers - 1)))
  losses, accuracies = zip(*tower_results)

  # Average across towers.
  loss = tf.reduce_mean(losses)
  accuracy = tf.reduce_mean(accuracies)

  # Fit model.
  g_step = tf.train.get_or_create_global_step()
  optimizer = kfac.PeriodicInvCovUpdateKfacOpt(
      invert_every=_INVERT_EVERY,
      cov_update_every=_COV_UPDATE_EVERY,
      learning_rate=0.0001,
      cov_ema_decay=0.95,
      damping=0.001,
      layer_collection=layer_collection,
      placement_strategy="round_robin",
      cov_devices=devices,
      inv_devices=devices,
      momentum=0.9)
  train_op = optimizer.minimize(loss, global_step=g_step)

  tf.logging.info("Starting training.")
  accuracy_ = None
  with tf.train.MonitoredTrainingSession(config=session_config) as sess:
    while not sess.should_stop():
      global_step_, loss_, accuracy_, _ = sess.run(
          [g_step, loss, accuracy, train_op])

      if global_step_ % _REPORT_EVERY == 0:
        tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
                        global_step_, loss_, accuracy_)
  return accuracy_
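# Example invocation of `train_mnist_multitower` above (the data path and
# device names are assumptions): two GPU towers with 128 examples each, with
# soft placement enabled so ops lacking a GPU kernel fall back to the CPU.
def _run_multitower_example():
  """Trains the multitower ConvNet on two GPUs with a 256-example global batch."""
  session_config = tf.ConfigProto(allow_soft_placement=True)
  return train_mnist_multitower(
      data_dir="/tmp/mnist",          # assumed path
      num_epochs=5,
      num_towers=2,                   # overridden to len(devices) internally
      devices=["/gpu:0", "/gpu:1"],   # assumed devices
      use_fake_data=False,
      session_config=session_config)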