def get_tfrecords(self):
    """Create the three MNIST input pipelines used by this object.

    ``mnist.inputs`` is called three times with ``self.batch_size``:
    once for ``'train'`` (all records; the second return value is
    discarded), once for ``'train_labeled'`` (partial records) and once
    for ``'test'``.

    Returns:
        A 3-tuple ``((xtrain_l, ytrain_l), xtrain, (xtest, ytest))``.
    """
    from mnist import inputs

    # Full training set: only the first return value is kept.
    all_train_x, _ = inputs(self.batch_size, 'train')
    # Partial ("*_l") subset of the training records.
    labeled_x, labeled_y = inputs(self.batch_size, 'train_labeled')
    test_x, test_y = inputs(self.batch_size, 'test')

    labeled_pair = (labeled_x, labeled_y)
    test_pair = (test_x, test_y)
    return labeled_pair, all_train_x, test_pair
def main(train_dir, batch_size, num_batches, log_dir, checkpoint_dir=None):
    """Evaluate the model built by ``network`` with the slim evaluation loop.

    Args:
        train_dir: Passed through to ``inputs`` as its first argument.
        batch_size: Passed through to ``inputs``.
        num_batches: Passed through to ``inputs``.
        log_dir: Directory handed to ``evaluation_loop`` as its log
            directory; also the checkpoint directory when
            ``checkpoint_dir`` is ``None``.
        checkpoint_dir: Directory to read checkpoints from. Defaults to
            ``log_dir``.
    """
    checkpoint_dir = log_dir if checkpoint_dir is None else checkpoint_dir

    # NOTE(review): the literal False presumably selects the non-training
    # input pipeline -- confirm against ``inputs``.
    images, labels = inputs(train_dir, False, batch_size, num_batches)
    logits, total_loss = network(images, labels)
    tf.summary.scalar('loss', total_loss)

    # Collapse the per-class scores to a single predicted class index.
    predicted_classes = tf.to_int32(tf.argmax(logits, 1))
    batch_accuracy = slim.metrics.accuracy(predicted_classes, labels)
    tf.summary.scalar('accuracy', batch_accuracy)

    # Streaming metrics compute the "running" value, e.g. running accuracy.
    streaming_metrics = {
        'accuracy': slim.metrics.streaming_accuracy(predicted_classes, labels),
    }
    metrics_to_values, metrics_to_updates = (
        slim.metrics.aggregate_metric_map(streaming_metrics))

    # One scalar summary per streaming metric.
    for metric_name, metric_value in metrics_to_values.items():
        tf.summary.scalar(metric_name, metric_value)

    update_ops = list(metrics_to_updates.values())
    merged_summaries = tf.summary.merge_all()

    # Evaluate every 30 seconds.
    slim.evaluation.evaluation_loop(
        '',
        checkpoint_dir,
        log_dir,
        num_evals=1,
        eval_op=update_ops,
        summary_op=merged_summaries,
        eval_interval_secs=30,
        max_number_of_evaluations=100000000)
def main(train_dir, batch_size, num_batches, log_dir, checkpoint_dir=None):
    """Evaluate the ``lenet`` model with the slim evaluation loop.

    Uses the ``tf.summary`` API and Python-3-compatible dict iteration so
    that this block agrees with the other training/evaluation entry points
    in this file.

    Args:
        train_dir: Passed through to ``inputs`` as its first argument.
        batch_size: Passed through to ``inputs``.
        num_batches: Passed through to ``inputs`` and used as the number
            of evaluation steps (``num_evals``) per evaluation run.
        log_dir: Directory handed to ``evaluation_loop`` as its log
            directory; also the checkpoint directory when
            ``checkpoint_dir`` is ``None``.
        checkpoint_dir: Directory to read checkpoints from. Defaults to
            ``log_dir``.
    """
    if checkpoint_dir is None:
        checkpoint_dir = log_dir

    # NOTE(review): the literal False presumably selects the non-training
    # input pipeline -- confirm against ``inputs``.
    images, labels = inputs(train_dir, False, batch_size, num_batches)

    predictions = lenet(images)
    # Collapse the per-class scores to a single predicted class index.
    predictions = tf.to_int32(tf.argmax(predictions, 1))

    tf.summary.scalar('accuracy', slim.metrics.accuracy(predictions, labels))

    # Streaming metrics compute the "running" value across batches.
    metrics_to_values, metrics_to_updates = slim.metrics.aggregate_metric_map({
        "streaming_mse": slim.metrics.streaming_mean_squared_error(
            predictions, labels),
    })

    # One scalar summary per streaming metric. ``items()`` works on both
    # Python 2 and 3, unlike ``iteritems()``.
    for metric_name, metric_value in metrics_to_values.items():
        tf.summary.scalar(metric_name, metric_value)

    # Evaluate every 30 seconds. ``eval_op`` gets a concrete list rather
    # than a dict view.
    slim.evaluation.evaluation_loop(
        '',
        checkpoint_dir,
        log_dir,
        num_evals=num_batches,
        eval_op=list(metrics_to_updates.values()),
        summary_op=tf.summary.merge_all(),
        eval_interval_secs=30)
def train():
    """Train the MNIST model in a plain ``tf.Session`` with queue runners.

    Reads batches from ``train_img.tfrecords``, runs ``FLAGS.max_step``
    training steps and, every 100 steps, prints the loss and writes a
    checkpoint named ``model.ckpt-<step>`` under ``FLAGS.train_dir``.

    The queue-runner threads are always asked to stop and joined, even
    when a training step raises, so a failure does not leave input
    threads running.
    """
    images, labels = mnist.inputs(
        ['train_img.tfrecords'],
        mnist.TRAIN_EXAMPLES_NUM,
        FLAGS.batch_size,
        shuffle=True)
    global_step = tf.train.get_or_create_global_step()

    # The second output of ``inference`` is not needed for training.
    logits, _ = mnist.inference(images, training=True)
    loss = mnist.loss(logits, labels)
    train_op = mnist.train(loss, global_step)

    saver = tf.train.Saver()
    ckpt = os.path.join(FLAGS.train_dir, 'model.ckpt')

    with tf.Session() as sess:
        init_op = tf.group(
            tf.local_variables_initializer(),
            tf.global_variables_initializer())
        sess.run(init_op)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess, coord=coord)
        try:
            for i in range(1, FLAGS.max_step + 1):
                # Only the loss is fetched; fetching the predictions and
                # labels as well was unused debugging overhead.
                _, train_loss = sess.run([train_op, loss])
                if i % 100 == 0:
                    print('step: {}, loss: {}'.format(i, train_loss))
                    saver.save(sess, ckpt, global_step=i)
        finally:
            # Always shut the input threads down, including on error.
            coord.request_stop()
            coord.join(threads)
def main(train_dir, batch_size, num_batches, log_dir):
    """Train the model built by ``network`` with plain gradient descent.

    Args:
        train_dir: Passed through to ``inputs`` as its first argument.
        batch_size: Passed through to ``inputs``.
        num_batches: Passed through to ``inputs``.
        log_dir: Directory handed to ``slim.learning.train``; summaries
            and checkpoints are both saved on a 10-second interval.
    """
    # NOTE(review): the literal True presumably selects the training
    # input pipeline -- confirm against ``inputs``.
    images, labels = inputs(train_dir, True, batch_size, num_batches)

    # Only the loss is needed here; the predictions are ignored.
    _, total_loss = network(images, labels)
    tf.summary.scalar('loss', total_loss)

    sgd = tf.train.GradientDescentOptimizer(0.001)
    train_op = slim.learning.create_train_op(
        total_loss, sgd, summarize_gradients=True)

    slim.learning.train(
        train_op,
        log_dir,
        save_summaries_secs=10,
        save_interval_secs=10)
def evaluation():
    """Repeatedly evaluate the model on ``./validation_img.tfrecords``.

    Builds the inference graph in non-training mode, creates a top-1
    correctness op and calls ``eval_once`` with it. Unless
    ``FLAGS.run_once`` is set, the evaluation repeats forever with a pause
    of ``FLAGS.eval_interval_secs`` seconds between runs.
    """
    images, labels = mnist.inputs(
        ['./validation_img.tfrecords'],
        mnist.VALIDATION_EXAMPLES_NUM,
        batch_size=FLAGS.batch_size,
        shuffle=False)

    # The second output of ``inference`` is not used for evaluation.
    logits, _ = mnist.inference(images, training=False)
    # Per-example flag: is the true label the top-1 prediction?
    top_k_op = tf.nn.in_top_k(logits, labels, 1)
    saver = tf.train.Saver()

    # Always evaluate at least once; keep going only when not in
    # run-once mode.
    eval_once(saver, top_k_op)
    while not FLAGS.run_once:
        time.sleep(FLAGS.eval_interval_secs)
        eval_once(saver, top_k_op)
def main(train_dir, batch_size, num_batches, log_dir):
    """Train the model built by ``network`` with Nesterov momentum.

    Args:
        train_dir: Passed through to ``inputs`` as its first argument.
        batch_size: Passed through to ``inputs``.
        num_batches: Passed through to ``inputs``.
        log_dir: Directory handed to ``slim.learning.train``; summaries
            and checkpoints are both saved on a 20-second interval.
    """
    # NOTE(review): the literal True presumably selects the training
    # input pipeline -- confirm against ``inputs``.
    images, labels = inputs(train_dir, True, batch_size, num_batches)

    # Only the loss is needed here; the predictions are ignored.
    _, total_loss = network(images, labels)
    tf.summary.scalar('loss', total_loss)

    nesterov = tf.train.MomentumOptimizer(
        learning_rate=0.025,
        momentum=0.9,
        use_nesterov=True)
    train_op = slim.learning.create_train_op(
        total_loss, nesterov, summarize_gradients=True)

    slim.learning.train(
        train_op,
        log_dir,
        save_summaries_secs=20,
        save_interval_secs=20)
def main(train_dir, batch_size, num_batches, log_dir):
    """Train the ``lenet`` model with RMSProp and softmax cross-entropy.

    Uses ``tf.summary.scalar`` (instead of the removed
    ``tf.scalar_summary``) so that this block agrees with the other
    training entry points in this file.

    Args:
        train_dir: Passed through to ``inputs`` as its first argument.
        batch_size: Passed through to ``inputs``.
        num_batches: Passed through to ``inputs``.
        log_dir: Directory handed to ``slim.learning.train``; summaries
            are saved on a 20-second interval.
    """
    # NOTE(review): the literal True presumably selects the training
    # input pipeline -- confirm against ``inputs``.
    images, labels = inputs(
        train_dir, True, batch_size, num_batches, one_hot_labels=True)

    predictions = lenet(images)

    # Registers the cross-entropy loss in slim's loss collection;
    # ``get_total_loss`` then gathers everything registered there.
    slim.losses.softmax_cross_entropy(predictions, labels)
    total_loss = slim.losses.get_total_loss()
    tf.summary.scalar('loss', total_loss)

    optimizer = tf.train.RMSPropOptimizer(0.001, 0.9)
    train_op = slim.learning.create_train_op(
        total_loss, optimizer, summarize_gradients=True)

    slim.learning.train(train_op, log_dir, save_summaries_secs=20)
def train():
    """Train the MNIST network inside a ``MonitoredTrainingSession``.

    Builds the input pipeline, model, loss, accuracy and train op, then
    runs the train op until a hook requests a stop. Training stops at
    ``FLAGS.max_steps`` or when the loss becomes NaN. Checkpoints are
    written to ``FLAGS.train_dir`` every 60 seconds.

    Variable initialisation and checkpoint restoration are left entirely
    to ``MonitoredTrainingSession``. No initializer op is run by hand: an
    initializer created before the model is built would cover only the
    global step, and running it inside the session would reset that step
    after a checkpoint restore.
    """
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Fetch an (image, label) batch pair.
    image_batch, label_batch = mnist.inputs('train')

    # Flatten the labels to rank 1. Per the original author's note the
    # loss is sparse softmax cross-entropy, which needs
    # rank(labels) == rank(logits) - 1. Using -1 avoids hard-coding the
    # batch size.
    label_batch = tf.reshape(label_batch, [-1])

    # Add a trailing depth axis: [batch, row, col] -> [batch, row, col, 1].
    expand_image_batch = tf.expand_dims(image_batch, -1)

    # Build the model and compute the logits for each batch.
    logits = mnist.inference(expand_image_batch, dropout=0.5)
    loss = mnist.loss(logits=logits, labels=label_batch)
    accuracy = mnist.train_accuracy(logits, label_batch)
    train_op = mnist.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
        """Log loss, accuracy and throughput every few steps."""

        def begin(self):
            self._step = -1
            self._start_time = time.time()

        def before_run(self, run_context):
            self._step += 1
            # Request the tensors whose values are read in after_run.
            return tf.train.SessionRunArgs([loss, accuracy])

        def after_run(self, run_context, run_values):
            if self._step % FLAGS.log_frequency == 0:
                _current_time = time.time()
                duration = _current_time - self._start_time
                self._start_time = _current_time

                # Values requested in before_run.
                loss_value, accuracy_value = run_values.results

                # Throughput over the last log_frequency steps.
                examples_per_sec = (
                    FLAGS.batch_size * FLAGS.log_frequency / duration)
                sec_per_batch = float(duration / FLAGS.log_frequency)

                # Timestamp: step, loss, accuracy (examples/sec, sec/batch).
                format_str = (
                    '%s: step %d, loss=%.2f, accuracy=%.2f(%.1f examples/sec, %.3f sec/batch)'
                )
                print(format_str % (datetime.now(), self._step, loss_value,
                                    accuracy_value, examples_per_sec,
                                    sec_per_batch))

    # The session checkpoints every 60 s (save_checkpoint_secs) and uses
    # its default summary-saving schedule.
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.train_dir,
            hooks=[
                tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                tf.train.NanTensorHook(loss),
                _LoggerHook()
            ],
            save_checkpoint_secs=60,
            config=tf.ConfigProto(log_device_placement=False,
                                  allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(train_op)
def evaluate():
    """Restore the latest checkpoint and print precision @ 1 on test data.

    Builds the test input pipeline and inference graph, restores the most
    recent checkpoint found in ``FLAGS.checkpoint_dir`` and runs enough
    batches to cover ``FLAGS.num_examples`` examples. Prints a message and
    returns early when no checkpoint is found.
    """
    image_batch, label_batch = mnist.inputs(data_type='test')
    # Add a trailing depth axis to the images.
    image_batch = tf.expand_dims(image_batch, -1)
    # Flatten labels to rank 1; -1 avoids hard-coding the batch size.
    label_batch = tf.reshape(label_batch, [-1])

    # inference is called without a dropout argument; keep_prob is fed
    # explicitly as 1.0 below.
    logits = mnist.inference(image_batch)

    # Per-example flag: is the true label the top-1 prediction?
    top_k_op = tf.nn.in_top_k(logits, label_batch, 1)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Restore the model from the latest checkpoint.
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print(ckpt.model_checkpoint_path)
        else:
            print('No Checkpoint File Found!')
            return

        graph = tf.get_default_graph()
        keep_prob = graph.get_tensor_by_name('keep_prob:0')

        # Start the queue runners that feed the input pipeline.
        coord = tf.train.Coordinator()
        try:
            threads = []
            for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
                threads.extend(
                    qr.create_threads(sess, coord, daemon=True, start=True))

            # Round up so a trailing partial batch is still covered. The
            # explicit float conversion keeps this correct under Python 2
            # integer division as well.
            num_iter = int(
                math.ceil(float(FLAGS.num_examples) / FLAGS.batch_size))
            total_example_cnt = num_iter * FLAGS.batch_size

            true_cnt = 0  # number of correct predictions
            step = 0
            while step < num_iter and not coord.should_stop():
                predictions = sess.run(top_k_op, feed_dict={keep_prob: 1.0})
                true_cnt += np.sum(predictions)
                step += 1

            # Precision @ 1, forced to true division.
            precision = float(true_cnt) / total_example_cnt
            print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))
        except Exception as e:
            # Hand the error to the coordinator; join() re-raises it.
            coord.request_stop(e)

        coord.request_stop()
        # Give the input threads 10 seconds to stop after the request.
        coord.join(threads, stop_grace_period_secs=10)