Example #1
# Shared imports for both examples; Deep (the model definition) and HOOKS
# (custom SessionRunHook implementations) are local project modules.
import time

import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import Deep
import HOOKS
def DistributedTrain(FLAGS, task_index, TFCluster, TFServer, QueueHook):
    is_chief = (task_index == 0)
    master = TFServer.target
    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=TFCluster)):
        # Import data
        mnist = input_data.read_data_sets(FLAGS.data_dir + "-" +
                                          str(task_index),
                                          one_hot=True)
        # Placeholder for images
        x = tf.placeholder(tf.float32, [None, 784])
        # Placeholder for labels (one-hot vectors)
        y_ = tf.placeholder(tf.float32, [None, 10])
        # Build the deep model
        y_conv, keep_prob = Deep.deepnn(x)
        # Define loss and optimizer
        SummaryDictionary = {}
        global_step = tf.train.get_or_create_global_step()
        with tf.name_scope('loss'):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                labels=y_, logits=y_conv)
        cross_entropy = tf.reduce_mean(cross_entropy)
        SummaryDictionary.update({'cross_entropy': cross_entropy})
        with tf.name_scope('adam_optimizer'):
            Optimizer = tf.train.AdamOptimizer(1e-4)
            SyncOptimizer = tf.train.SyncReplicasOptimizer(
                Optimizer,
                replicas_to_aggregate=FLAGS.workers,
                total_num_replicas=FLAGS.workers,
                use_locking=True,
                name="sync_replicas")
            train_step = SyncOptimizer.minimize(
                cross_entropy,
                global_step=global_step,
                aggregation_method=tf.AggregationMethod.ADD_N)
        # Define metric
        with tf.name_scope('accuracy'):
            correct_prediction = tf.equal(tf.argmax(y_conv, 1),
                                          tf.argmax(y_, 1))
            correct_prediction = tf.cast(correct_prediction, tf.float32)
        accuracy = tf.reduce_mean(correct_prediction)
        SummaryDictionary.update({'AccuracyMetric': accuracy})
    # Add summaries for TensorBoard visualization.
    ListOfSummaries = []
    for key in SummaryDictionary:
        ListOfSummaries.append(tf.summary.scalar(key, SummaryDictionary[key]))
    MergedSummaryOperation = tf.summary.merge(ListOfSummaries)

    # Split the global batch size evenly across workers
    BatchSize = int(FLAGS.batch_size / FLAGS.workers)
    SummarySteps = 100
    hooks = [QueueHook]
    # Hook that coordinates the synchronous replicas
    sync_replicas_hook = SyncOptimizer.make_session_run_hook(is_chief,
                                                             num_tokens=0)
    hooks.append(sync_replicas_hook)
    # Stop training once the global step reaches FLAGS.Iterations
    hooks.append(tf.train.StopAtStepHook(last_step=FLAGS.Iterations))
    if is_chief:
        # Sample a training subset for validation summaries
        TrainingLength = mnist.train.images.shape[0]
        idx = np.random.randint(TrainingLength, size=BatchSize)
        TrainingImagesValidation = mnist.train.images[idx]
        TrainingLabelsValidation = mnist.train.labels[idx]
        hooks.append(
            HOOKS.NewSummarySaverHook(MergedSummaryOperation,
                                      FLAGS.model_dir + "/Training",
                                      SummarySteps,
                                      FLAGS.Iterations,
                                      features=x,
                                      labels=y_,
                                      dropout=keep_prob,
                                      batchx=TrainingImagesValidation,
                                      batchy=TrainingLabelsValidation,
                                      dropout_value=1.0))
        hooks.append(
            HOOKS.FinalSummaryHook(SummaryDictionary, x, y_, keep_prob,
                                   mnist.test.images, mnist.test.labels, 1.0,
                                   FLAGS.Iterations, "Testing"))
    tick = time.time()
    Config = None
    with tf.train.MonitoredTrainingSession(master=master,
                                           is_chief=is_chief,
                                           hooks=hooks,
                                           checkpoint_dir=FLAGS.model_dir,
                                           save_checkpoint_secs=None,
                                           save_summaries_steps=None,
                                           save_summaries_secs=None,
                                           config=Config) as sess:
        step = sess.run(global_step)
        while not sess.should_stop():
            batch = mnist.train.next_batch(BatchSize)
            _, step = sess.run([train_step, global_step],
                               feed_dict={
                                   x: batch[0],
                                   y_: batch[1],
                                   keep_prob: 0.5
                               })
        tack = time.time()
    print("Training Time: " + str(tack - tick))
Example #2
def train(_):
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
    # Placeholder for images
    x = tf.placeholder(tf.float32, [None, 784])
    # Placeholder for labels (one-hot vectors)
    y_ = tf.placeholder(tf.float32, [None, 10])
    # Build the deep model
    y_conv, keep_prob = Deep.deepnn(x)
    # Define loss and optimizer
    global_step = tf.train.get_or_create_global_step()
    SummaryDictionary = {}
    with tf.name_scope('loss'):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
                                                                logits=y_conv)
    cross_entropy = tf.reduce_mean(cross_entropy)
    SummaryDictionary.update({'cross_entropy': cross_entropy})
    # Define the optimizer
    with tf.name_scope('adam_optimizer'):
        Optimizer = tf.train.AdamOptimizer(1e-4)
        train_step = Optimizer.minimize(cross_entropy, global_step=global_step)
    # Define metric
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
        correct_prediction = tf.cast(correct_prediction, tf.float32)
    accuracy = tf.reduce_mean(correct_prediction)
    SummaryDictionary.update({'AccuracyMetric': accuracy})

    # Add summaries for TensorBoard visualization.
    ListOfSummaries = []
    for key in SummaryDictionary:
        ListOfSummaries.append(tf.summary.scalar(key, SummaryDictionary[key]))
    MergedSummaryOperation = tf.summary.merge(ListOfSummaries)

    BatchSize = FLAGS.batch_size
    SummarySteps = 100
    hooks = []
    # Sample a training subset for validation summaries during training
    TrainingLength = mnist.train.images.shape[0]
    idx = np.random.randint(TrainingLength, size=BatchSize)
    TrainingImagesValidation = mnist.train.images[idx]
    TrainingLabelsValidation = mnist.train.labels[idx]

    # Create hooks for saving summaries and logging
    hooks.append(
        HOOKS.NewSummarySaverHook(MergedSummaryOperation,
                                  FLAGS.model_dir,
                                  SummarySteps,
                                  FLAGS.Iterations,
                                  features=x,
                                  labels=y_,
                                  dropout=keep_prob,
                                  batchx=TrainingImagesValidation,
                                  batchy=TrainingLabelsValidation,
                                  dropout_value=1.0))
    """
	hooks.append(HOOKS.FinalSummaryHook(SummaryDictionary,x, y_, keep_prob, mnist.train.images, mnist.train.labels, 1.0, FLAGS.Iterations,"Training"))
	hooks.append(HOOKS.NewSummarySaverHook(MergedSummaryOperation, FLAGS.model_dir+"/Testing", SummarySteps, FLAGS.Iterations,
		features=x,labels=y_,dropout=keep_prob,batchx=mnist.test.images, batchy=mnist.test.labels, dropout_value=1.0)
	)
	"""
    hooks.append(
        HOOKS.FinalSummaryHook(SummaryDictionary, x, y_, keep_prob,
                               mnist.test.images, mnist.test.labels, 1.0,
                               FLAGS.Iterations, "Testing"))

    tick = time.time()
    hooks.append(tf.train.StopAtStepHook(last_step=FLAGS.Iterations))
    with tf.train.MonitoredTrainingSession(master="",
                                           is_chief=True,
                                           hooks=hooks,
                                           checkpoint_dir=FLAGS.model_dir,
                                           save_checkpoint_secs=None,
                                           save_summaries_steps=None) as sess:
        step = sess.run(global_step)
        while not sess.should_stop():
            batch = mnist.train.next_batch(BatchSize)
            _, step = sess.run([train_step, global_step],
                               feed_dict={
                                   x: batch[0],
                                   y_: batch[1],
                                   keep_prob: 0.5
                               })
        # Stop the timer once the session signals it should stop
        tack = time.time()
    print("Training Time: " + str(tack - tick))