def DistributedTrain(FLAGS, task_index, TFCluster, TFServer, QueueHook):
    """Run synchronous distributed training of the MNIST deep model.

    Builds the graph under a replica device setter, wraps Adam in a
    SyncReplicasOptimizer so gradients from all workers are aggregated
    before each update, and drives the loop with a MonitoredTrainingSession.
    Only the chief (task 0) attaches the summary/evaluation hooks.

    Args:
        FLAGS: parsed flags; uses data_dir, model_dir, batch_size,
            workers and Iterations.
        task_index: integer index of this worker task; task 0 is chief.
        TFCluster: tf.train.ClusterSpec describing the cluster.
        TFServer: tf.train.Server for this task; its target is used as
            the session master.
        QueueHook: extra session hook supplied by the caller
            (presumably for input-queue shutdown — confirm with caller).
    """
    is_chief = (task_index == 0)
    master = TFServer.target
    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=TFCluster)):
        # Each worker reads its own per-task copy of the dataset.
        mnist = input_data.read_data_sets(
            FLAGS.data_dir + "-" + str(task_index), one_hot=True)
        # Placeholders: flattened 28x28 images and one-hot labels.
        x = tf.placeholder(tf.float32, [None, 784])
        y_ = tf.placeholder(tf.float32, [None, 10])
        # Build the deep model.
        y_conv, keep_prob = Deep.deepnn(x)

        # Scalars to be exported as TensorBoard summaries.
        SummaryDictionary = {}
        global_step = tf.train.get_or_create_global_step()

        with tf.name_scope('loss'):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                labels=y_, logits=y_conv)
            cross_entropy = tf.reduce_mean(cross_entropy)
        SummaryDictionary.update({'cross_entropy': cross_entropy})

        with tf.name_scope('adam_optimizer'):
            Optimizer = tf.train.AdamOptimizer(1e-4)
            # Aggregate gradients from every worker before applying a
            # single update (fully synchronous training).
            SyncOptimizer = tf.train.SyncReplicasOptimizer(
                Optimizer,
                replicas_to_aggregate=FLAGS.workers,
                total_num_replicas=FLAGS.workers,
                use_locking=True,
                name="sync_replicas")
            train_step = SyncOptimizer.minimize(
                cross_entropy,
                global_step=global_step,
                aggregation_method=tf.AggregationMethod.ADD_N)

        with tf.name_scope('accuracy'):
            correct_prediction = tf.equal(
                tf.argmax(y_conv, 1), tf.argmax(y_, 1))
            correct_prediction = tf.cast(correct_prediction, tf.float32)
            accuracy = tf.reduce_mean(correct_prediction)
        SummaryDictionary.update({'AccuracyMetric': accuracy})

        # One scalar summary per tracked metric, merged for TensorBoard.
        ListOfSummaries = [
            tf.summary.scalar(key, SummaryDictionary[key])
            for key in SummaryDictionary
        ]
        MergedSummaryOperation = tf.summary.merge(ListOfSummaries)

        # Split the global batch size evenly across the workers.
        BatchSize = int(FLAGS.batch_size / FLAGS.workers)
        SummarySteps = 100

        hooks = [QueueHook]
        # Token-queue coordination hook required by SyncReplicasOptimizer.
        sync_replicas_hook = SyncOptimizer.make_session_run_hook(
            is_chief, num_tokens=0)
        hooks.append(sync_replicas_hook)
        # FIX: the first positional argument of StopAtStepHook is
        # num_steps ("run this many MORE steps"), so after a checkpoint
        # restore the job would run FLAGS.Iterations extra steps. Use
        # last_step, consistent with the single-machine train().
        hooks.append(tf.train.StopAtStepHook(last_step=FLAGS.Iterations))

        if is_chief:
            # Only the chief saves summaries; evaluate on a fixed random
            # training subset so curves are comparable across steps.
            TrainingLength = mnist.train.images.shape[0]
            idx = np.random.randint(TrainingLength, size=BatchSize)
            TrainingImagesValidation = mnist.train.images[idx]
            TrainingLabelsValidation = mnist.train.labels[idx]
            hooks.append(
                HOOKS.NewSummarySaverHook(MergedSummaryOperation,
                                          FLAGS.model_dir + "/Training",
                                          SummarySteps,
                                          FLAGS.Iterations,
                                          features=x,
                                          labels=y_,
                                          dropout=keep_prob,
                                          batchx=TrainingImagesValidation,
                                          batchy=TrainingLabelsValidation,
                                          dropout_value=1.0))
            # Final evaluation on the full test set (dropout disabled).
            hooks.append(
                HOOKS.FinalSummaryHook(SummaryDictionary, x, y_, keep_prob,
                                       mnist.test.images, mnist.test.labels,
                                       1.0, FLAGS.Iterations, "Testing"))

        tick = time.time()
        Config = None
        # Checkpointing and built-in summaries are disabled; the custom
        # hooks above own all summary writing.
        with tf.train.MonitoredTrainingSession(
                master=master,
                is_chief=is_chief,
                hooks=hooks,
                checkpoint_dir=FLAGS.model_dir,
                save_checkpoint_secs=None,
                save_summaries_steps=None,
                save_summaries_secs=None,
                config=Config) as sess:
            step = sess.run(global_step)
            # Hooks stop the loop; keep_prob 0.5 enables dropout while training.
            while not sess.should_stop():
                batch = mnist.train.next_batch(BatchSize)
                _, step = sess.run([train_step, global_step],
                                   feed_dict={
                                       x: batch[0],
                                       y_: batch[1],
                                       keep_prob: 0.5
                                   })
        tack = time.time()
        print("Training Time: " + str(tack - tick))
def train(_):
    """Single-machine training entry point for the MNIST deep model.

    Reads the dataset from FLAGS.data_dir, builds the network with
    Deep.deepnn, trains with Adam until FLAGS.Iterations global steps
    inside a MonitoredTrainingSession, and prints the wall-clock
    training time. Summary writing is handled entirely by custom hooks.
    """
    # Input data plus placeholders: flattened 28x28 images, one-hot labels.
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
    x = tf.placeholder(tf.float32, [None, 784])
    y_ = tf.placeholder(tf.float32, [None, 10])
    y_conv, keep_prob = Deep.deepnn(x)

    global_step = tf.train.get_or_create_global_step()
    # Scalars to be exported as TensorBoard summaries.
    SummaryDictionary = {}

    with tf.name_scope('loss'):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=y_, logits=y_conv)
        cross_entropy = tf.reduce_mean(cross_entropy)
    SummaryDictionary['cross_entropy'] = cross_entropy

    with tf.name_scope('adam_optimizer'):
        train_step = tf.train.AdamOptimizer(1e-4).minimize(
            cross_entropy, global_step=global_step)

    with tf.name_scope('accuracy'):
        hits = tf.cast(
            tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1)), tf.float32)
        accuracy = tf.reduce_mean(hits)
    SummaryDictionary['AccuracyMetric'] = accuracy

    # One scalar summary per tracked metric, merged for TensorBoard.
    summary_ops = [
        tf.summary.scalar(name, tensor)
        for name, tensor in SummaryDictionary.items()
    ]
    MergedSummaryOperation = tf.summary.merge(summary_ops)

    BatchSize = FLAGS.batch_size
    SummarySteps = 100

    # Fixed random training subset used for validation summaries during
    # training, so the curves are comparable across steps.
    num_train = mnist.train.images.shape[0]
    sample_idx = np.random.randint(num_train, size=BatchSize)
    val_images = mnist.train.images[sample_idx]
    val_labels = mnist.train.labels[sample_idx]

    hooks = [
        HOOKS.NewSummarySaverHook(MergedSummaryOperation,
                                  FLAGS.model_dir,
                                  SummarySteps,
                                  FLAGS.Iterations,
                                  features=x,
                                  labels=y_,
                                  dropout=keep_prob,
                                  batchx=val_images,
                                  batchy=val_labels,
                                  dropout_value=1.0),
        # Final evaluation on the full test set (dropout disabled).
        HOOKS.FinalSummaryHook(SummaryDictionary, x, y_, keep_prob,
                               mnist.test.images, mnist.test.labels, 1.0,
                               FLAGS.Iterations, "Testing"),
    ]

    tick = time.time()
    hooks.append(tf.train.StopAtStepHook(last_step=FLAGS.Iterations))
    # Checkpointing and built-in summaries are disabled; the custom hooks
    # above own all summary writing.
    with tf.train.MonitoredTrainingSession(master="",
                                           is_chief=True,
                                           hooks=hooks,
                                           checkpoint_dir=FLAGS.model_dir,
                                           save_checkpoint_secs=None,
                                           save_summaries_steps=None) as sess:
        step = sess.run(global_step)
        # StopAtStepHook ends the loop; keep_prob 0.5 enables dropout.
        while not sess.should_stop():
            images, labels = mnist.train.next_batch(BatchSize)
            _, step = sess.run([train_step, global_step],
                               feed_dict={
                                   x: images,
                                   y_: labels,
                                   keep_prob: 0.5
                               })
    tack = time.time()
    print("Training Time: " + str(tack - tick))