Example #1
def main(unused_argv):
    logging.set_verbosity(logging.INFO)
    trainer = Trainer()

    project = ml.TensorFlowProject(owner_id="your-owner-id",
                                   project_token="your-project-token")

    with project.create_experiment("PCL RL") as experiment:
        trainer.run(experiment)
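
# The `ml` alias above (used in several snippets below) is assumed to be the
# MissingLink SDK imported as:
#
#   import missinglink as ml
#
# `Trainer` belongs to the surrounding project and is not shown here.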
Example #2
def train(hps, datasets):
    """Train the LFADS model.

  Args:
    hps: The dictionary of hyperparameters.
    datasets: A dictionary of data dictionaries.  The dataset dict is simply a
      name(string)-> data dictionary mapping (See top of lfads.py).
  """
    project = ml.TensorFlowProject(owner_id="your-owner-id",
                                   project_token="your-project-token")

    with project.create_experiment("LFADS") as experiment:
        model = build_model(hps, kind="train", datasets=datasets)
        if hps.do_reset_learning_rate:
            sess = tf.get_default_session()
            sess.run(model.learning_rate.initializer)

        model.train_model(datasets, experiment)
Example #3
def run_training():
    """Train MNIST for a number of steps."""
    data_sets = input_data.read_data_sets(INPUT_DATA_DIR)

    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder = tf.placeholder(tf.float32, shape=(BATCH_SIZE, IMAGE_PIXELS))
        labels_placeholder = tf.placeholder(tf.int32, shape=(BATCH_SIZE,))

        # Build a Graph that computes predictions from the inference model.
        logits = inference(images_placeholder, HIDDEN1_UNITS, HIDDEN2_UNITS)

        # Add to the Graph the Ops for loss calculation.
        labels = tf.to_int64(labels_placeholder)
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits, name='xentropy')
        loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')

        # Add to the Graph the Ops that calculate and apply gradients.
        optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)
        global_step = tf.Variable(0, name='global_step', trainable=False)
        train_op = optimizer.minimize(loss, global_step=global_step)

        # Add the Op to compare the logits to the labels during evaluation.
        correct = tf.nn.in_top_k(logits, labels_placeholder, 1)
        eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))

        # Initialize the graph
        init = tf.global_variables_initializer()
        session = tf.Session()
        session.run(init)

        # Now that our neural net is ready, let's integrate MissingLinkAI SDK and start the training!

        # Create a project manager with credentials to communicate with MissingLinkAI's backend
        missinglink_project = missinglink.TensorFlowProject(OWNER_ID, PROJECT_TOKEN)

        # Create an experiment as a context manager so MissingLinkAI can monitor the
        # progress of the experiment.
        with missinglink_project.create_experiment(
                display_name='MNIST multilayer perceptron',
                description='Two fully connected hidden layers') as experiment:

            NUM_SAMPLE = 2000
            NUM_BATCHES = int(NUM_SAMPLE / BATCH_SIZE)

            for epoch in experiment.epoch_loop(10):
                for batch in experiment.batch_loop(NUM_BATCHES):
                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder, labels_placeholder)

                    # Use `experiment.train` scope before the `session.run` which runs the optimizer
                    # to let the SDK know it should collect the metrics as training metrics.
                    with experiment.train(
                        monitored_metrics={'loss': loss, 'acc': eval_correct}):
                        # Note that you only need to provide the optimizer op. The SDK will automatically run the metric
                        # tensors provided in the `experiment.train` context (and `experiment` context).
                        _, loss_value = session.run([train_op, loss], feed_dict=feed_dict)

                # Validate the model with the validation dataset
                with experiment.validation(
                    monitored_metrics={'loss': loss, 'acc': eval_correct}):
                    do_eval(session, eval_correct, images_placeholder,
                            labels_placeholder, data_sets.validation)

            # Use the `experiment.test` context manager for the testing loop.
            total_test_iterations = data_sets.test.num_examples

            with experiment.test(
                total_test_iterations,
                expected=labels_placeholder,
                predicted=logits):
                do_eval(session, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.test)
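
# The snippet above assumes the `fill_feed_dict` and `do_eval` helpers from
# the TensorFlow MNIST tutorial it is adapted from; `fill_feed_dict` looks
# roughly like this:
def fill_feed_dict(data_set, images_pl, labels_pl):
    """Fill a feed_dict with the next batch of images and labels."""
    images_feed, labels_feed = data_set.next_batch(BATCH_SIZE)
    return {images_pl: images_feed, labels_pl: labels_feed}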
Example #4
from __future__ import print_function

import tensorflow as tf
import numpy as np
from ops import *
from data import *
from net import *
from utils import *
import os
import time

import missinglink

OWNER_ID = '605f9f21-80b1-ddd5-b2ed-ecdd662f035c'
PROJECT_TOKEN = 'hwotoGzZzPqwaZiL'
missinglink_project = missinglink.TensorFlowProject(OWNER_ID, PROJECT_TOKEN)

flags = tf.app.flags
conf = flags.FLAGS


class Solver(object):
    def __init__(self):
        self.device_id = conf.device_id
        self.train_dir = conf.train_dir
        self.samples_dir = conf.samples_dir
        if not os.path.exists(self.train_dir):
            os.makedirs(self.train_dir)
        if not os.path.exists(self.samples_dir):
            os.makedirs(self.samples_dir)
        #datasets params
Example #5
# In this example, we will build a simple neural network with 2 fully connected layers.
# We will then integrate MissingLink SDK in order to remotely monitor our training, validation
# and testing process.

import os
import math
import argparse

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
old_v = tf.logging.get_verbosity()
tf.logging.set_verbosity(tf.logging.ERROR)

import missinglink

missinglink_project = missinglink.TensorFlowProject(project='6201689270910976')

# Input params
NUM_CLASSES = 10  # The MNIST dataset has 10 classes, representing the digits 0 through 9.
IMAGE_SIZE = 28  # The MNIST images are always 28x28 pixels.
IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE

# Network params
HIDDEN1_UNITS = 128
HIDDEN2_UNITS = 32

# Training params
LEARNING_RATE = 0.01
MAX_STEPS = 2000
BATCH_SIZE = 100
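
# Example #5 ends with its hyperparameters. A minimal, hypothetical sketch of
# how they might be wired into an experiment, reusing the `epoch_loop` /
# `batch_loop` API demonstrated in Example #3:
def sketch_training():
    data_sets = input_data.read_data_sets('/tmp/mnist')
    num_batches = data_sets.train.num_examples // BATCH_SIZE
    with missinglink_project.create_experiment('MNIST sketch') as experiment:
        for _ in experiment.epoch_loop(MAX_STEPS // num_batches):
            for _ in experiment.batch_loop(num_batches):
                # Build the feed dict and run the training op here,
                # exactly as in Example #3.
                pass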
Example #6
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/mnist/fully_connected_feed.py
#
# In this example, we will build a simple neural network with 2 fully connected layers.
# We will then integrate MissingLink SDK in order to remotely monitor our training, validation
# and testing process.

import os
import math
import argparse

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import missinglink

missinglink_project = missinglink.TensorFlowProject()

# Input params
NUM_CLASSES = 10  # The MNIST dataset has 10 classes, representing the digits 0 through 9.
IMAGE_SIZE = 28  # The MNIST images are always 28x28 pixels.
IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE

# Network params
HIDDEN1_UNITS = 128
HIDDEN2_UNITS = 32

# Training params
LEARNING_RATE = 0.01
MAX_STEPS = 9000
BATCH_SIZE = 100
Example #7
X_train, X_test = standard_scale(mnist.train.images, mnist.test.images)

n_samples = int(mnist.train.num_examples)
training_epochs = 20
batch_size = 128
display_step = 1

autoencoder = AdditiveGaussianNoiseAutoencoder(
    n_input=784,
    n_hidden=200,
    transfer_function=tf.nn.softplus,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    scale=0.01)

project = ml.TensorFlowProject(owner_id="your-owner-id",
                               project_token="your-project-token")

avg_cost = 0.
with project.create_experiment("Autoencoder",
                               custom_metrics={'avg cost': lambda: avg_cost},
                               monitored_metrics={'cost': autoencoder.cost
                                                  }) as experiment:
    for epoch in experiment.epoch_loop(training_epochs):
        avg_cost = 0.
        total_batch = int(n_samples / batch_size)
        # Loop over all batches
        for i in experiment.batch_loop(total_batch):
            batch_xs = get_random_block_from_data(X_train, batch_size)

            # Fit training using batch data
            with experiment.train():
                # Fit one batch; partial_fit returns the batch cost, which is
                # accumulated into the epoch's average cost.
                cost = autoencoder.partial_fit(batch_xs)
                avg_cost += cost / n_samples * batch_size
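
# `standard_scale`, `get_random_block_from_data`, and
# `AdditiveGaussianNoiseAutoencoder` are helpers from the TensorFlow models
# autoencoder example, and `mnist` is assumed to be a loaded MNIST dataset.
# Note how `avg cost` is wired up as a custom metric via a lambda: the SDK
# invokes the lambda at reporting time, so it sees the value `avg_cost`
# currently holds, while `cost` is a graph tensor the SDK runs itself.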
Example #8
def main(_):
    if FLAGS.self_test:
        print('Running self-test.')
        train_data, train_labels = fake_data(256)
        validation_data, validation_labels = fake_data(EVAL_BATCH_SIZE)
        test_data, test_labels = fake_data(EVAL_BATCH_SIZE)
        num_epochs = 1
    else:
        # Get the data.
        train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
        train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
        test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
        test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')

        # Extract it into numpy arrays.
        train_data = extract_data(train_data_filename, 60000)
        train_labels = extract_labels(train_labels_filename, 60000)
        test_data = extract_data(test_data_filename, 10000)
        test_labels = extract_labels(test_labels_filename, 10000)

        # Generate a validation set.
        validation_data = train_data[:VALIDATION_SIZE, ...]
        validation_labels = train_labels[:VALIDATION_SIZE]
        train_data = train_data[VALIDATION_SIZE:, ...]
        train_labels = train_labels[VALIDATION_SIZE:]
        num_epochs = NUM_EPOCHS
    train_size = train_labels.shape[0]

    # This is where training samples and labels are fed to the graph.
    # These placeholder nodes will be fed a batch of training data at each
    # training step using the {feed_dict} argument to the Run() call below.
    train_data_node = tf.placeholder(data_type(),
                                     shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE,
                                            NUM_CHANNELS))
    train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE, ))
    eval_data = tf.placeholder(data_type(),
                               shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE,
                                      NUM_CHANNELS))

    # The variables below hold all the trainable weights. They are passed an
    # initial value which will be assigned when we call:
    # {tf.global_variables_initializer().run()}
    conv1_weights = tf.Variable(
        tf.truncated_normal(
            [5, 5, NUM_CHANNELS, 32],  # 5x5 filter, depth 32.
            stddev=0.1,
            seed=SEED,
            dtype=data_type()))
    conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type()))
    conv2_weights = tf.Variable(
        tf.truncated_normal([5, 5, 32, 64],
                            stddev=0.1,
                            seed=SEED,
                            dtype=data_type()))
    conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type()))
    fc1_weights = tf.Variable(  # fully connected, depth 512.
        tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512],
                            stddev=0.1,
                            seed=SEED,
                            dtype=data_type()))
    fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type()))
    fc2_weights = tf.Variable(
        tf.truncated_normal([512, NUM_LABELS],
                            stddev=0.1,
                            seed=SEED,
                            dtype=data_type()))
    fc2_biases = tf.Variable(
        tf.constant(0.1, shape=[NUM_LABELS], dtype=data_type()))

    # We will replicate the model structure for the training subgraph, as well
    # as the evaluation subgraphs, while sharing the trainable parameters.
    def model(data, train=False):
        """The Model definition."""
        # 2D convolution, with 'SAME' padding (i.e. the output feature map has
        # the same size as the input). Note that {strides} is a 4D array whose
        # shape matches the data layout: [image index, y, x, depth].
        conv = tf.nn.conv2d(data,
                            conv1_weights,
                            strides=[1, 1, 1, 1],
                            padding='SAME')
        # Bias and rectified linear non-linearity.
        relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))
        # Max pooling. The kernel size spec {ksize} also follows the layout of
        # the data. Here we have a pooling window of 2, and a stride of 2.
        pool = tf.nn.max_pool(relu,
                              ksize=[1, 2, 2, 1],
                              strides=[1, 2, 2, 1],
                              padding='SAME')
        conv = tf.nn.conv2d(pool,
                            conv2_weights,
                            strides=[1, 1, 1, 1],
                            padding='SAME')
        relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
        pool = tf.nn.max_pool(relu,
                              ksize=[1, 2, 2, 1],
                              strides=[1, 2, 2, 1],
                              padding='SAME')
        # Reshape the feature map cuboid into a 2D matrix to feed it to the
        # fully connected layers.
        pool_shape = pool.get_shape().as_list()
        reshape = tf.reshape(
            pool,
            [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])
        # Fully connected layer. Note that the '+' operation automatically
        # broadcasts the biases.
        hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)
        # Add a 50% dropout during training only. Dropout also scales
        # activations such that no rescaling is needed at evaluation time.
        if train:
            hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
        return tf.matmul(hidden, fc2_weights) + fc2_biases

    # Training computation: logits + cross-entropy loss.
    logits = model(train_data_node, True)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=train_labels_node, logits=logits))

    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                    tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
    # Add the regularization term to the loss.
    loss += 5e-4 * regularizers

    # Optimizer: set up a variable that's incremented once per batch and
    # controls the learning rate decay.
    batch = tf.Variable(0, dtype=data_type())
    # Decay once per epoch, using an exponential schedule starting at 0.01.
    learning_rate = tf.train.exponential_decay(
        0.01,  # Base learning rate.
        batch * BATCH_SIZE,  # Current index into the dataset.
        train_size,  # Decay step.
        0.95,  # Decay rate.
        staircase=True)
    # Use simple momentum for the optimization.
    optimizer = tf.train.MomentumOptimizer(learning_rate,
                                           0.9).minimize(loss,
                                                         global_step=batch)

    # Predictions for the current training minibatch.
    train_prediction = tf.nn.softmax(logits)

    # Predictions for the test and validation, which we'll compute less often.
    eval_prediction = tf.nn.softmax(model(eval_data))

    # Small utility function to evaluate a dataset by feeding batches of data to
    # {eval_data} and pulling the results from {eval_predictions}.
    # Saves memory and enables this to run on smaller GPUs.
    def eval_in_batches(data, sess):
        """Get all predictions for a dataset by running it in small batches."""
        size = data.shape[0]
        if size < EVAL_BATCH_SIZE:
            raise ValueError("batch size for evals larger than dataset: %d" %
                             size)
        predictions = numpy.ndarray(shape=(size, NUM_LABELS),
                                    dtype=numpy.float32)
        for begin in xrange(0, size, EVAL_BATCH_SIZE):
            end = begin + EVAL_BATCH_SIZE
            if end <= size:
                predictions[begin:end, :] = sess.run(
                    eval_prediction,
                    feed_dict={eval_data: data[begin:end, ...]})
            else:
                batch_predictions = sess.run(
                    eval_prediction,
                    feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
                predictions[begin:, :] = batch_predictions[begin - size:, :]
        return predictions

    project = ml.TensorFlowProject(owner_id="your-owner-id",
                                   project_token="your-project-token")

    with project.create_experiment("mnist") as experiment:
        # Create a local session to run the training.
        start_time = time.time()

        expected = tf.Variable(test_labels, name="expected")

        with tf.Session() as sess:
            # Run all the initializers to prepare the trainable parameters.
            tf.global_variables_initializer().run()
            print('Initialized!')
            # Loop through training steps.
            for step in experiment.loop(
                    max_iterations=int(num_epochs * train_size) // BATCH_SIZE):
                # Compute the offset of the current minibatch in the data.
                # Note that we could use better randomization across epochs.
                offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)
                batch_data = train_data[offset:(offset + BATCH_SIZE), ...]
                batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
                # This dictionary maps the batch data (as a numpy array) to the
                # node in the graph it should be fed to.
                feed_dict = {
                    train_data_node: batch_data,
                    train_labels_node: batch_labels
                }
                # Run the optimizer to update weights.
                sess.run(optimizer, feed_dict=feed_dict)
                # print some extra information once reach the evaluation frequency
                if step % EVAL_FREQUENCY == 0:
                    minibatch_error = 0.
                    # fetch some extra nodes' data
                    with experiment.train(monitored_metrics={
                            'loss': loss,
                            'learning rate': learning_rate
                    },
                                          custom_metrics={
                                              'error rate':
                                              lambda: minibatch_error
                                          }):
                        l, lr, predictions = sess.run(
                            [loss, learning_rate, train_prediction],
                            feed_dict=feed_dict)
                        elapsed_time = time.time() - start_time
                        start_time = time.time()
                        print('Step %d (epoch %.2f), %.1f ms' %
                              (step, float(step) * BATCH_SIZE / train_size,
                               1000 * elapsed_time / EVAL_FREQUENCY))
                        print('Minibatch loss: %.3f, learning rate: %.6f' %
                              (l, lr))
                        minibatch_error = error_rate(predictions, batch_labels)
                        print('Minibatch error: %.1f%%' % minibatch_error)

                    validation_error = 0.
                    with experiment.validation(
                            custom_metrics={
                                'validation error': lambda: validation_error
                            }):
                        validation_error = error_rate(
                            eval_in_batches(validation_data, sess),
                            validation_labels)
                        print('Validation error: %.1f%%' % validation_error)
                    sys.stdout.flush()

            # Finally print the result!
            with experiment.test(expected=expected, predicted=eval_prediction):
                test_error = error_rate(eval_in_batches(test_data, sess),
                                        test_labels)
                print('Test error: %.1f%%' % test_error)
            if FLAGS.self_test:
                print('test_error', test_error)
                assert test_error == 0.0, 'expected 0.0 test_error, got %.2f' % (
                    test_error, )
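
# Example #8 relies on an `error_rate` helper it does not show; in the
# TensorFlow convolutional MNIST tutorial it is based on, the helper looks
# roughly like this:
def error_rate(predictions, labels):
    """Return the error rate, in percent, given one-hot predictions."""
    return 100.0 - (
        100.0 * numpy.sum(numpy.argmax(predictions, 1) == labels) /
        predictions.shape[0])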
Example #9
import time

import tensorflow as tf
import missinglink

NUM_EPOCHS = 4
NUM_BATCHES = 10

missinglink_project = missinglink.TensorFlowProject(
    project_token='KzcbCxZWjewiqxCi')

# Build a graph.
a = tf.constant(5.0)
b = tf.constant(6.0)
c = a * b

with missinglink_project.create_experiment() as experiment:
    for epoch in experiment.epoch_loop(NUM_EPOCHS):
        for batch in experiment.batch_loop(NUM_BATCHES):
            time.sleep(0.5)
            with experiment.train():
                # Launch the graph in a session.
                sess = tf.Session()

                # Evaluate the tensor `c`.
                print('sess run result', sess.run(c))
                loss = batch * 0.1 - epoch * 0.05
                print('loss', loss)
                experiment.add_metric('loss', loss)
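
# `experiment.add_metric` reports a plain Python value directly. That is
# handy when a metric, like the toy `loss` above, is computed outside the
# TensorFlow graph; for graph tensors, `monitored_metrics` (as in the other
# examples) lets the SDK run and record them automatically.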
Example #10
def main(unused_argv):
  train_data, valid_data = data_utils.get_data()
  trainer = Trainer(train_data, valid_data, data_utils.IMAGE_NEW_SIZE ** 2)
  project = ml.TensorFlowProject(owner_id="your-owner-id", project_token="your-project-token")
  with project.create_experiment(display_name="Learning to remeber") as experiment:
    trainer.run(experiment)
Example #11
def train():
    if FLAGS.train_dir is None:
        raise ValueError('Parameter train_dir must be provided')
    if FLAGS.task is None:
        raise ValueError('Parameter task must be provided')
    if FLAGS.model is None:
        raise ValueError('Parameter model must be provided')

    input_config_string = config_helper.GetConfigString(FLAGS.input_config)
    input_config = config_helper.InputConfig(input_config_string)

    # Training parameters.
    train_config_string = config_helper.GetConfigString(FLAGS.train_config)
    train_config = config_helper.TrainConfig(train_config_string)

    batch_size = train_config.batch_size
    initial_learning_rate = train_config.learning_rate
    decay_rate = train_config.decay_rate
    samples_per_decay = train_config.samples_per_decay

    # Parameters for learning-rate decay.
    # The formula is decay_rate ** floor(steps / decay_steps).
    decay_steps = samples_per_decay / batch_size
    decay_steps = max(decay_steps, 1)

    first_code = code_loader.ReadFirstCode(input_config.data)
    first_code_height = (
        first_code.features.feature['code_shape'].int64_list.value[0])
    first_code_width = (
        first_code.features.feature['code_shape'].int64_list.value[1])
    max_bit_depth = (
        first_code.features.feature['code_shape'].int64_list.value[2])
    print('Maximum code depth: {}'.format(max_bit_depth))

    project = ml.TensorFlowProject(owner_id="your-owner-id",
                                   project_token="your-project-token")

    with project.create_experiment("Entropy Coder") as experiment:
        with tf.Graph().as_default():
            ps_ops = [
                "Variable", "VariableV2", "AutoReloadVariable", "VarHandleOp"
            ]
            with tf.device(
                    tf.train.replica_device_setter(FLAGS.ps_tasks,
                                                   ps_ops=ps_ops)):
                codes = code_loader.LoadBinaryCode(input_config=input_config,
                                                   batch_size=batch_size)
                if input_config.unique_code_size:
                    print('Input code size: {} x {}'.format(
                        first_code_height, first_code_width))
                    codes.set_shape([
                        batch_size, first_code_height, first_code_width,
                        max_bit_depth
                    ])
                else:
                    codes.set_shape([batch_size, None, None, max_bit_depth])
                codes_effective_shape = tf.shape(codes)

                global_step = tf.contrib.framework.create_global_step()

                # Apply learning-rate decay.
                learning_rate = tf.train.exponential_decay(
                    learning_rate=initial_learning_rate,
                    global_step=global_step,
                    decay_steps=decay_steps,
                    decay_rate=decay_rate,
                    staircase=True)
                tf.summary.scalar('Learning Rate', learning_rate)
                optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                                   epsilon=1.0)

                # Create the entropy coder model.
                model = model_factory.GetModelRegistry().CreateModel(
                    FLAGS.model)
                model_config_string = config_helper.GetConfigString(
                    FLAGS.model_config)
                model.Initialize(global_step, optimizer, model_config_string)
                model.BuildGraph(codes)

                summary_op = tf.summary.merge_all()

                # Verify that the model can actually be trained.
                if model.train_op is None:
                    raise ValueError('Input model {} is not trainable'.format(
                        FLAGS.model))

                # We disable the summary thread run by Supervisor class by passing
                # summary_op=None. We still pass save_summaries_secs because it is used by
                # the global step counter thread.
                is_chief = (FLAGS.task == 0)
                sv = tf.train.Supervisor(
                    logdir=FLAGS.train_dir,
                    is_chief=is_chief,
                    global_step=global_step,
                    # saver=model.saver,
                    summary_op=None,
                    save_summaries_secs=120,
                    save_model_secs=600,
                    recovery_wait_secs=30)

                sess = sv.PrepareSession(FLAGS.master)
                sv.StartQueueRunners(sess)

                step = sess.run(global_step)
                print('Trainer initial step: {}.'.format(step))

                # Once everything has been setup properly, save the configs.
                if is_chief:
                    config_helper.SaveConfig(FLAGS.train_dir,
                                             'input_config.json',
                                             input_config_string)
                    config_helper.SaveConfig(FLAGS.train_dir,
                                             'model_config.json',
                                             model_config_string)
                    config_helper.SaveConfig(FLAGS.train_dir,
                                             'train_config.json',
                                             train_config_string)

                # Train the model.
                next_summary_time = time.time()
                for step in experiment.loop(
                        condition=lambda _: not sv.ShouldStop()):
                    feed_dict = None

                    # Once in a while, update the summaries on the chief worker.
                    if is_chief and next_summary_time < time.time():
                        summary_str = sess.run(summary_op, feed_dict=feed_dict)
                        sv.SummaryComputed(sess, summary_str)
                        next_summary_time = time.time() + sv.save_summaries_secs
                    else:
                        tf_tensors = {
                            'train': model.train_op,
                            'code_length': model.average_code_length
                        }
                        with experiment.train(monitored_metrics={
                                'avg code length':
                                model.average_code_length
                        }):
                            np_tensors = sess.run(tf_tensors,
                                                  feed_dict=feed_dict)
                        print(np_tensors['code_length'])

                sv.Stop()
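
# `experiment.loop(condition=...)` iterates while the callable returns True
# for the current step, so here the Supervisor's ShouldStop() decides when
# training ends rather than a fixed iteration count.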
Example #12
                                --train_vggish=False \
                                --checkpoint /path/to/model/checkpoint
"""

from __future__ import print_function

from random import shuffle

import numpy as np
import tensorflow as tf

import vggish_input
import vggish_params
import vggish_slim
import missinglink
project = missinglink.TensorFlowProject(owner_id="your-owner-id",
                                        project_token="your-project-token")

flags = tf.app.flags
slim = tf.contrib.slim

flags.DEFINE_integer(
    'num_batches', 30,
    'Number of batches of examples to feed into the model. Each batch is of '
    'variable size and contains shuffled examples of each class of audio.')

flags.DEFINE_boolean(
    'train_vggish', True,
    'If True, allow VGGish parameters to change during training, thus '
    'fine-tuning VGGish. If False, VGGish parameters are fixed, thus using '
    'VGGish as a fixed feature extractor.')
Example #13
def run(target, cluster_spec, is_chief, train_steps, eval_steps, job_dir,
        train_files, eval_files, train_batch_size, eval_batch_size,
        learning_rate, eval_frequency, first_layer_size, num_layers,
        scale_factor, num_epochs, export_format):
    """Run the training and evaluation graph.
  Args:
    target (string): Tensorflow server target
    is_chief (bool): Boolean flag to specify a chief server
    train_steps (int): Maximum number of training steps
    eval_steps (int): Number of steps to run evaluation for at each checkpoint.
      if eval_steps is None, evaluation will run for 1 epoch.
    job_dir (string): Output dir for checkpoint and summary
    train_files (string): List of CSV files to read train data
    eval_files (string): List of CSV files to read eval data
    train_batch_size (int): Batch size for training
    eval_batch_size (int): Batch size for evaluation
    learning_rate (float): Learning rate for Gradient Descent
    eval_frequency (int): Run evaluation every n training steps.
      Do not evaluate too frequently, or you will pay a performance cost;
      do not evaluate too infrequently, or you will not know how soon to
      stop training. Start with the default values.
    first_layer_size (int): Size of the first DNN layer
    num_layers (int): Number of hidden layers in the DNN
    scale_factor (float): Decay rate for the size of hidden layers
    num_epochs (int): Maximum number of training data epochs on which to train
    export_format (str): One of 'JSON', 'CSV' or 'EXAMPLE'. The input format
      for the exported saved_model binary.
  """

    # Calculate the number of hidden units
    hidden_units = [
        max(2, int(first_layer_size * scale_factor**i))
        for i in range(num_layers)
    ]

    # If the server is the chief (the `master` task):
    # in between-graph replication, the chief is the one node in the
    # cluster with extra responsibility, and by default it is worker
    # task zero. Here the master is assigned as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on
    # distributed TensorFlow and the motivation for having a chief.
    if is_chief:
        tf.logging.info("Created DNN hidden units {}".format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():

            # Features and label tensors
            features, labels = model.input_fn(
                eval_files,
                num_epochs=None if eval_steps else 1,
                batch_size=eval_batch_size,
                shuffle=False)
            # Accuracy and AUROC metrics
            # model.model_fn returns the dict when EVAL mode
            metric_dict = model.model_fn(model.EVAL,
                                         features.copy(),
                                         labels,
                                         hidden_units=hidden_units,
                                         learning_rate=learning_rate)

        hooks = [
            EvaluationRunHook(
                job_dir,
                metric_dict,
                evaluation_graph,
                eval_frequency,
                eval_steps=eval_steps,
            )
        ]
    else:
        hooks = []

    # Create a new graph and specify that as default
    with tf.Graph().as_default():
        # Placement of ops on devices using replica device setter
        # which automatically places the parameters on the `ps` server
        # and the `ops` on the workers
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):

            # Features and label tensors as read using filename queue
            features, labels = model.input_fn(train_files,
                                              num_epochs=num_epochs,
                                              batch_size=train_batch_size)

            # Returns the training graph and global step tensor
            loss, train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

            project = ml.TensorFlowProject(owner_id="your-owner-id",
                                           project_token="your-project-token")

        with project.create_experiment(display_name='Census') as experiment:

            # Creates a MonitoredSession for training
            # MonitoredSession is a Session-like object that handles
            # initialization, recovery and hooks
            # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
            with tf.train.MonitoredTrainingSession(
                    master=target,
                    is_chief=is_chief,
                    checkpoint_dir=job_dir,
                    hooks=hooks,
                    save_checkpoint_secs=20,
                    save_summaries_steps=50) as session:
                # Global step to keep track of global number of steps particularly in
                # distributed setting
                step = global_step_tensor.eval(session=session)

                # Run the training graph which returns the step number as tracked by
                # the global step tensor.
                # When train epochs is reached, session.should_stop() will be true.
                for _ in experiment.loop(
                        condition=lambda i: (train_steps is None or i < train_steps)
                        and not session.should_stop()):
                    with experiment.train(monitored_metrics={'loss': loss}):
                        step, _ = session.run([global_step_tensor, train_op])

        # Find the filename of the latest saved checkpoint file
        latest_checkpoint = tf.train.latest_checkpoint(job_dir)

        # Only perform this if chief
        if is_chief:
            build_and_run_exports(latest_checkpoint, job_dir,
                                  model.SERVING_INPUT_FUNCTIONS[export_format],
                                  hidden_units)
Example #14
def train(dataset):
    """Train on dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.RMSPropOptimizer(lr,
                                        RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        # Get images and labels for ImageNet and split the batch across GPUs.
        assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
            'Batch size must be divisible by number of GPUs')
        split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

        # Override the number of preprocessing threads to account for the increased
        # number of GPU towers.
        num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
        images, labels = image_processing.distorted_inputs(
            dataset, num_preprocess_threads=num_preprocess_threads)

        input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Number of classes in the Dataset label set plus 1.
        # Label 0 is reserved for an (unused) background class.
        num_classes = dataset.num_classes() + 1

        # Split the batch of images and labels for towers.
        images_splits = tf.split(axis=0,
                                 num_or_size_splits=FLAGS.num_gpus,
                                 value=images)
        labels_splits = tf.split(axis=0,
                                 num_or_size_splits=FLAGS.num_gpus,
                                 value=labels)

        # Calculate the gradients for each model tower.
        tower_grads = []
        reuse_variables = None
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' %
                                   (inception.TOWER_NAME, i)) as scope:
                    # Force all Variables to reside on the CPU.
                    with slim.arg_scope([slim.variables.variable],
                                        device='/cpu:0'):
                        # Calculate the loss for one tower of the ImageNet model. This
                        # function constructs the entire ImageNet model but shares the
                        # variables across all towers.
                        loss = _tower_loss(images_splits[i], labels_splits[i],
                                           num_classes, scope, reuse_variables)

                    # Reuse variables for the next tower.
                    reuse_variables = True

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                  scope)

                    # Retain the Batch Normalization updates operations only from the
                    # final tower. Ideally, we should grab the updates from all towers
                    # but these stats accumulate extremely fast so we can ignore the
                    # other stats from the other towers without significant detriment.
                    batchnorm_updates = tf.get_collection(
                        slim.ops.UPDATE_OPS_COLLECTION, scope)

                    # Calculate the gradients for the batch of data on this ImageNet
                    # tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = _average_gradients(tower_grads)

        # Add summaries for the input processing and global_step.
        summaries.extend(input_summaries)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than it needs to be, but
        # we employ it for backward compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            inception.MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=sess.graph)

        project = missinglink.TensorFlowProject(
            owner_id="your-owner-id", project_token="your-project-token")

        with project.create_experiment(display_name='Inception with flowers',
                                       optimizer=opt) as experiment:
            for step in experiment.loop(FLAGS.max_steps):
                start_time = time.time()
                with experiment.train(monitored_metrics={'loss': loss}):
                    _, loss_value = sess.run([train_op, loss])
                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                if step % 10 == 0:
                    examples_per_sec = FLAGS.batch_size / float(duration)
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, duration))

                if step % 100 == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)