# Imports assumed by the snippets below. `cifar10`, `cifar10_input`, and
# `cifar10_eval` are the TensorFlow models CIFAR-10 tutorial modules;
# `utils` and the module-level `parser` come from the surrounding project.
import os
import pickle
import re
import time
from datetime import datetime

import numpy as np
import tensorflow as tf

import cifar10
import cifar10_eval
import cifar10_input
import utils


def tower_loss(self, scope, images, labels):
  """Calculate the total loss on a single tower running the CIFAR model.

  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'.
    images: Images. 4D tensor of shape [batch_size, height, width, 3].
    labels: Labels. 1D tensor of shape [batch_size].

  Returns:
    Tensor of shape [] containing the total loss for a batch of data.
  """
  # Build inference Graph.
  logits = cifar10.inference(images)

  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = cifar10.loss(logits, labels)

  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)

  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')

  # Attach a scalar summary to all individual losses and the total loss; do
  # the same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU
    # training session. This helps the clarity of presentation on TensorBoard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    tf.summary.scalar(loss_name, l)

  return total_loss
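# For context: tower_loss() is meant to be called once per GPU inside a
# variable-scope loop, so that tf.get_collection('losses', scope) picks up
# only that tower's losses. A sketch of that calling pattern, following the
# stock cifar10_multi_gpu_train.py layout; `num_gpus`, `opt` (an existing
# optimizer), `average_gradients`, and the input batches are assumptions
# about the surrounding training script, not shown in these snippets.
tower_grads = []
with tf.variable_scope(tf.get_variable_scope()):
  for i in range(num_gpus):
    with tf.device('/gpu:%d' % i):
      with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
        # Each call builds a full copy of the model; variables are shared
        # across towers by reusing the enclosing variable scope.
        loss = tower_loss(scope, image_batch, label_batch)  # or self.tower_loss(...) if kept as a method
        tf.get_variable_scope().reuse_variables()
        tower_grads.append(opt.compute_gradients(loss))
grads = average_gradients(tower_grads)  # assumed helper from the tutorial
train_op = opt.apply_gradients(grads)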
def train(self, name, X_train, Y_train, X_val, Y_val, num_epoch=10):
  """Train the model built by buildModel() on (X_train, Y_train)."""
  sess = tf.Session()
  sess.run(tf.global_variables_initializer())

  num_training = len(X_train)
  batch_size = cifar10.FLAGS.batch_size
  step = 0
  losses = []
  accuracies = []
  print('-' * 5 + ' Start training ' + '-' * 5)
  # Y_train = tf.one_hot(Y_train, 10)
  # Y_val = tf.one_hot(Y_val, 10)

  for epoch in range(num_epoch):
    print('train for epoch %d' % epoch)
    for i in range(num_training // batch_size):
      X_ = X_train[i * batch_size:(i + 1) * batch_size]
      Y_ = Y_train[i * batch_size:(i + 1) * batch_size]

      # Run one optimization step. The graph (train_op, loss, accuracy) was
      # already built once in buildModel(); rebuilding it on every iteration,
      # as the original code did, keeps growing the graph and leaks memory.
      feed_dict = {self.images: X_, self.labels: Y_}
      fetches = [self.train_op, self.loss, self.accuracy]
      _, loss, accuracy = sess.run(fetches, feed_dict=feed_dict)

      losses.append(loss)
      accuracies.append(accuracy)
      # if step % self.log_step == 0:
      #   print('iteration (%d): loss = %.3f, accuracy = %.3f' %
      #         (step, loss, accuracy))
      step += 1

    # Print validation results
    print('validation for epoch %d' % epoch)
    # val_accuracy = self.evaluate(sess, X_val, Y_val)
    # print('- epoch %d: validation accuracy = %.3f' % (epoch, val_accuracy))

  saver = tf.train.Saver()
  model_path = saver.save(sess, name + '.ckpt')
  return model_path
def buildModel(self):
  # Placeholders
  self.images = tf.placeholder(tf.float32, [None, 32, 32, 3])
  self.labels = tf.placeholder(tf.int64, [None])
  self.global_step = tf.contrib.framework.get_or_create_global_step()

  # Build a Graph that computes the logits predictions from the
  # inference model.
  self.logits = cifar10.inference(self.images)

  # Calculate loss.
  self.loss = cifar10.loss(self.logits, self.labels)

  # Accuracy: fraction of the batch where argmax(logits) matches the label.
  predict = tf.argmax(self.logits, 1)
  correct = tf.equal(predict, self.labels)
  self.accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

  # Build a Graph that trains the model with one batch of examples and
  # updates the model parameters.
  self.train_op = cifar10.train(self.loss, self.global_step)
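# buildModel() and train() read like methods of a small wrapper class
# around the tutorial graph. A hypothetical driver sketch; the
# `Cifar10Model` class name and the already-loaded data arrays are
# assumptions, not part of the original code.
model = Cifar10Model()                # assumed: __init__ calls self.buildModel()
model.train('cifar10_model',          # checkpoint name prefix
            X_train, Y_train,         # float32 [N, 32, 32, 3], int64 [N]
            X_val, Y_val, num_epoch=10)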
def evaluate(eval_data, eval_dir, model_dir, client):
  """Eval CIFAR-10 for a number of steps."""
  FLAGS = parser.parse_args()
  with tf.Graph().as_default() as g:
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.inputs(eval_data=eval_data, client=client)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate predictions.
    predict = tf.argmax(logits, 1, output_type=tf.int32)
    correct = tf.equal(predict, labels)
    accuracy_op = tf.reduce_mean(tf.cast(correct, tf.float32))
    top_k_op = tf.nn.in_top_k(logits, labels, 1)
    loss = cifar10.loss(logits, labels)

    # Restore the moving average version of the learned variables for eval.
    variable_averages = tf.train.ExponentialMovingAverage(
        cifar10.MOVING_AVERAGE_DECAY)
    variables_to_restore = variable_averages.variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(eval_dir, g)

    while True:
      eval_once(saver, summary_writer, top_k_op, accuracy_op, summary_op,
                loss, model_dir, eval_dir)
      if FLAGS.run_once:
        break
      time.sleep(FLAGS.eval_interval_secs)
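# evaluate() above relies on an eval_once() helper that is not shown in
# these snippets. A minimal sketch of what it presumably does, modeled on
# the stock cifar10_eval.py from the TensorFlow models repository; the
# extra accuracy_op/loss/eval_dir parameters match the call site above,
# but their exact use is an assumption.
def eval_once(saver, summary_writer, top_k_op, accuracy_op, summary_op,
              loss, model_dir, eval_dir):
  with tf.Session() as sess:
    # Restore the most recent checkpoint written by the trainer.
    ckpt = tf.train.get_checkpoint_state(model_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
      print('No checkpoint file found in %s' % model_dir)
      return
    saver.restore(sess, ckpt.model_checkpoint_path)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
      num_iter = 10000 // 128  # assumed: test-set size / batch size
      true_count = 0
      total = 0
      for _ in range(num_iter):
        predictions = sess.run(top_k_op)  # bool vector, one entry per image
        true_count += np.sum(predictions)
        total += predictions.shape[0]
      print('precision @ 1 = %.3f' % (float(true_count) / total))
    finally:
      coord.request_stop()
      coord.join(threads, stop_grace_period_secs=10)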
def train(client):
  """Train CIFAR-10 for a number of steps."""
  FLAGS = parser.parse_args()
  model_dir = modelDir(FLAGS.train_dir, client)
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    # Force input pipeline to CPU:0 to avoid operations sometimes ending up
    # on GPU and resulting in a slow down.
    with tf.device('/cpu:0'):
      images, labels = cifar10.distorted_inputs(client)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1
        self._start_time = time.time()

      def before_run(self, run_context):
        self._step += 1
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        if self._step % FLAGS.log_frequency == 0:
          current_time = time.time()
          duration = current_time - self._start_time
          self._start_time = current_time

          loss_value = run_values.results
          examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
          sec_per_batch = float(duration / FLAGS.log_frequency)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                        '%.3f sec/batch)')
          print(format_str % (datetime.now(), self._step, loss_value,
                              examples_per_sec, sec_per_batch))

      def end(self, mon_sess):
        # Snapshot the trained weights once the session ends. Note that
        # SessionRunHook.end() return values are discarded by
        # MonitoredTrainingSession, so these weights are only printed here.
        weights = []
        for t in tf.trainable_variables():
          print(t)
          weights.append(t.eval(session=mon_sess))
        print(weights)
        return weights

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=model_dir,
        hooks=[
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook()
        ],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
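# Since MonitoredTrainingSession discards the return value of
# SessionRunHook.end(), a caller that actually needs the final weights has
# to read them off the hook instance after the session closes. A small
# sketch of that pattern; the hook name and surrounding graph (train_op)
# are assumptions.
class _WeightsHook(tf.train.SessionRunHook):
  def __init__(self):
    self.final_weights = None

  def end(self, session):
    # Evaluate every trainable variable once training has finished.
    self.final_weights = [t.eval(session=session)
                          for t in tf.trainable_variables()]

weights_hook = _WeightsHook()
with tf.train.MonitoredTrainingSession(hooks=[weights_hook]) as mon_sess:
  while not mon_sess.should_stop():
    mon_sess.run(train_op)
trained_weights = weights_hook.final_weights  # available after the session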
def Evaluate(load_path, eval_dir, network_parameters=None,
             num_testing_images=10000, save_mistakes=False):
  """Evaluate CIFAR-10.

  Args:
    load_path: path to load trained parameters from.
    eval_dir: directory where the evaluation accuracy is written.
    network_parameters: parameters for defining and training the network;
      built with buildNetwork() when not supplied.
    num_testing_images: the number of images we will evaluate on.
    save_mistakes: if True, also collect and return the mistaken predictions.

  Returns:
    A (accuracy, mistakes) tuple; mistakes is None unless save_mistakes.
  """
  if network_parameters is None:
    network_parameters = buildNetwork()
  batch_size = 100

  # Like for training, we need a session for executing the TensorFlow graph.
  with tf.Graph().as_default(), tf.Session() as sess:
    # Create the basic CIFAR-10 model.
    images, labels = cifar10_input.inputs(True, "./data/cifar10-batches-bin",
                                          batch_size)
    logits, _, _ = utils.BuildNetwork(images, network_parameters)
    softmax = tf.nn.softmax(logits)
    loss = cifar10.loss(logits, labels)  # built for completeness; not fetched below

    # Load the variables.
    ckpt_state = tf.train.get_checkpoint_state(load_path)
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
      raise ValueError("No model checkpoint to eval at %s\n" % load_path)
    saver = tf.train.Saver()
    saver.restore(sess, ckpt_state.model_checkpoint_path)
    coord = tf.train.Coordinator()
    _ = tf.train.start_queue_runners(sess=sess, coord=coord)

    total_examples = 0
    correct_predictions = 0
    image_index = 0
    mistakes = []
    for _ in range((num_testing_images + batch_size - 1) // batch_size):
      predictions, label_values = sess.run([softmax, labels])

      # Count how many were predicted correctly.
      for prediction, label_value in zip(predictions, label_values):
        total_examples += 1
        if np.argmax(prediction) == label_value:
          correct_predictions += 1
        elif save_mistakes:
          mistakes.append({
              "index": image_index,
              "label": label_value,
              "pred": np.argmax(prediction)
          })
        image_index += 1

    coord.request_stop()
    acc = float(correct_predictions) / float(total_examples)
    print("Test accuracy: ", acc)
    write(eval_dir, acc)
    return (acc, mistakes if save_mistakes else None)
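# The write(eval_dir, acc) call above refers to a helper that is not shown
# in these snippets. A hypothetical implementation consistent with how it
# is used; the file name and format are assumptions.
def write(eval_dir, acc):
  # Append the latest test accuracy to a log file under eval_dir so that
  # successive evaluation rounds can be compared.
  with open(os.path.join(eval_dir, "accuracy.txt"), "a") as f:
    f.write("%f\n" % acc)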
def train(self):
  """Train CIFAR-10 for a number of steps."""
  client = self.index
  FLAGS = parser.parse_args()
  model_dir = self.modelDir(FLAGS.train_dir, client)
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    # Force input pipeline to CPU:0 to avoid operations sometimes ending up
    # on GPU and resulting in a slow down.
    with tf.device('/cpu:0'):
      images, labels = cifar10.distorted_inputs(client)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)
    saver = tf.train.Saver()

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime, and persists the per-client loss curve."""

      def begin(self):
        self._step = -1
        self._start_time = time.time()
        self.loss = []

      def before_run(self, run_context):
        self._step += 1
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        if self._step % FLAGS.log_frequency == 0:
          current_time = time.time()
          duration = current_time - self._start_time
          self._start_time = current_time

          loss_value = run_values.results
          self.loss.append(loss_value)
          examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
          sec_per_batch = float(duration / FLAGS.log_frequency)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                        '%.3f sec/batch)')
          print(format_str % (datetime.now(), self._step, loss_value,
                              examples_per_sec, sec_per_batch))

      def end(self, mon_sess):
        # Append this round's loss curve to the client's pickle so curves
        # from successive training rounds accumulate in one file.
        filename = FLAGS.loss_dir + "loss_client" + str(client) + ".pkl"
        old_loss = []
        if os.path.exists(filename):
          with open(filename, 'rb') as rfp:
            old_loss = pickle.load(rfp)
        old_loss.append(self.loss)
        with open(filename, "wb") as fp:  # pickling
          pickle.dump(old_loss, fp)

        cifar10_eval.evaluate(True,
                              "./models/cifar/eval/client" + str(client),
                              FLAGS.train_dir + str(client), client)

        # `weights` is assumed to be a module-level list shared with the
        # federated-averaging code; snapshot the trained variables into it.
        for t in tf.trainable_variables():
          weights.append(t.eval(session=mon_sess))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=model_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      # Debug check (left from development): compare the first filter of the
      # first trainable variable against the distributed central weights.
      t = tf.trainable_variables()[0]
      if len(central_weights) > 0:
        d = central_weights[0][0][0][0] == t.eval(mon_sess)[0][0][0]
        # print("weights", d)
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
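# The central_weights comparison above implies a federated setup where a
# server distributes averaged weights back to each client, but the load
# step is never shown. One hedged way to do it in this graph style is with
# placeholder-backed assign ops; all names here are assumptions. Note the
# assign ops must be built before MonitoredTrainingSession is created,
# because it finalizes the graph.
assign_phs, assign_ops = [], []
for var in tf.trainable_variables():
  ph = tf.placeholder(var.dtype, shape=var.get_shape())
  assign_phs.append(ph)
  assign_ops.append(tf.assign(var, ph))

# Later, inside the session, once central_weights (a list of numpy arrays,
# one per trainable variable) has arrived from the server:
feed = dict(zip(assign_phs, central_weights))
mon_sess.run(assign_ops, feed_dict=feed)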