def _tower_fn(is_training, images, score_maps, geo_maps, training_masks, reuse_variables=None):
    # Build inference graph
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
        f_score, f_geometry = model.model(images, is_training=True)
    model_loss = model.loss(score_maps, f_score, geo_maps, f_geometry, training_masks)
    total_loss = tf.add_n([model_loss] + tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    # add summary
    summaries = None
    if reuse_variables is None:
        image_sum = tf.summary.image('input', images)
        score_sum = tf.summary.image('score_map', score_maps)
        f_score_sum = tf.summary.image('score_map_pred', f_score * 255)
        geo_sum = tf.summary.image('geo_map_0', geo_maps[:, :, :, 0:1])
        f_geo_sum = tf.summary.image('geo_map_0_pred', f_geometry[:, :, :, 0:1])
        mask_sum = tf.summary.image('training_masks', training_masks)
        loss1_sum = tf.summary.scalar('model_loss', model_loss)
        loss_sum = tf.summary.scalar('total_loss', total_loss)
        summaries = [image_sum, score_sum, f_score_sum, geo_sum, f_geo_sum,
                     mask_sum, loss1_sum, loss_sum]

    model_params = tf.trainable_variables()
    tower_grad = tf.gradients(total_loss, model_params)

    return total_loss, zip(tower_grad, model_params), summaries
def tower_loss(images, score_maps, geo_maps, training_masks, reuse_variables=None):
    # Build inference graph
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
        f_score, f_geometry = model.model(images, is_training=True)
    model_loss = model.loss(score_maps, f_score, geo_maps, f_geometry, training_masks)
    total_loss = tf.add_n([model_loss] + tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    # add summary
    if reuse_variables is None:
        tf.summary.image('input', images)
        tf.summary.image('score_map', score_maps)
        tf.summary.image('score_map_pred', f_score * 255)
        tf.summary.image('geo_map_0', geo_maps[:, :, :, 0:1])
        tf.summary.image('geo_map_0_pred', f_geometry[:, :, :, 0:1])
        tf.summary.image('training_masks', training_masks)
        tf.summary.scalar('model_loss', model_loss)
        tf.summary.scalar('total_loss', total_loss)

    return total_loss, model_loss
def run_training(args):
    with tf.Graph().as_default():
        # data_train, data_validation, im_size = data.load_data_random(args.n_images, im_size=(256,256), light_size=(8,8))
        # data_train, data_validation, im_size = data.load_data_smooth(args.n_images, im_size=(256,256), light_size=(8,8))
        # data_train, data_validation, im_size = data.load_data_grid(args.n_images, im_size=(256,256), light_size=(8,8))
        # data_train, data_validation = data.load_Tgray_mat(args.n_images)
        data_train, data_validation, im_size = data.load_Green_mat(args.n_images)

        X_tensor = tf.placeholder(tf.float32, shape=(None, data.INPUT_DIM), name="input")
        yt_tensor = tf.placeholder(tf.float32, shape=(None, data.OUTPUT_DIM), name="output")
        y_tensor = model.inference(X_tensor, n_units=15, output_dim=data.OUTPUT_DIM)
        loss_tensor = model.loss(y_tensor, yt_tensor)
        error_tensor = model.training_error(loss_tensor, yt_tensor)
        train_op = model.training(loss_tensor, args.learning_rate)

        config = tf.ConfigProto(device_count={'GPU': 0})
        if args.gpu:
            config = tf.ConfigProto()

        init = tf.initialize_all_variables()
        saver = tf.train.Saver()
        sess = tf.Session(config=config)
        sess.run(init)

        # show_image(data_train[0,...,-2], im_size)
        show_image(data_train[0, ..., -1], im_size)
        # y_ = run_inference(sess, X_tensor, y_tensor, data_train[0,...,:-1])
        # show_image(y_[:,0], im_size)

        for step in range(args.max_steps):
            X_data, yt_data = data.split_input_output(
                data.next_batch_images(data_train, args.batch_size))
            # print(X_data.min(axis=0))
            # print(X_data.max(axis=0))
            # print(yt_data.min(axis=0))
            # print(yt_data.max(axis=0))
            feed_dict = {X_tensor: X_data, yt_tensor: yt_data}
            _, loss_value, error = sess.run(
                [train_op, loss_tensor, error_tensor], feed_dict=feed_dict)

            if step % 5 == 0:
                epoch = step * args.batch_size / data_train.shape[0]
                print('Step %d (epoch %.2f): loss = %.2f (error = %.3f)' %
                      (step, epoch, loss_value, error))
                # y_ = run_inference(sess, X, y_tensor, (0.5, 0.5), data.TGRAY_SIZE)
                # show_image(y_[:,0], data.TGRAY_SIZE)
            if (step + 1) % 5 == 0:
                y_ = run_inference(sess, X_tensor, y_tensor, data_train[0, ..., :-1])
                # y_ = run_inference(sess, X_tensor, y_tensor, X_data[:im_size[0]*im_size[1]])
                # show_image(y_[:,0], im_size)
                write_image(y_[:, 0], im_size, 'results/green-%i.jpg' % step)
def train():
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()
        print("Global step", global_step)

        images, labels = model.distorted_inputs()
        logits = model.inference(images)
        loss = model.loss(logits, labels)
        train_op = model.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                duration = time.time() - self._start_time
                loss_value = run_values.results
                if self._step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                       tf.train.NanTensorHook(loss),
                       _LoggerHook()],
                config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, labels = get_batch(train_data, train_labels, corpus.train_seq_lens, i)
        hidden = model.init_hidden(args.batch_size)
        model.zero_grad()
        output, _ = model(data, hidden)
        mask = (data >= 0).float()
        loss, _ = model.loss(output, labels, mask)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        image, label = input.get_input(LABEL_PATH, LABEL_FORMAT, IMAGE_PATH, IMAGE_FORMAT)
        logits = model.inference(image)
        loss = model.loss(logits, label)
        train_op = model.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()
        init = tf.initialize_all_variables()

        sess = tf.Session(config=tf.ConfigProto(log_device_placement=input.FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(input.FLAGS.train_dir, graph_def=sess.graph_def)

        for step in xrange(input.FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), "Model diverged with loss = NaN"

            if step % 1 == 0:
                num_examples_per_step = input.FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = "%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)"
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 10 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 25 == 0:
                checkpoint_path = os.path.join(input.FLAGS.train_dir, "model.ckpt")
                saver.save(sess, checkpoint_path, global_step=step)
def main(argv=None):
    labels_data = labels_json()
    tf.Variable(labels_data, trainable=False, name='labels')

    batch_size = 128
    files = [os.path.join(FLAGS.datadir, f)
             for f in os.listdir(os.path.join(FLAGS.datadir)) if f.endswith('.tfrecords')]
    images, labels = inputs(batch_size, files)
    logits = model.inference(images, len(json.loads(labels_data)) + 1)
    losses = model.loss(logits, labels)
    train_op = model.train(losses)
    summary_op = tf.summary.merge_all()

    saver = tf.train.Saver(tf.global_variables(), max_to_keep=21)
    with tf.Session() as sess:
        summary_writer = tf.summary.FileWriter(FLAGS.logdir, graph=sess.graph)
        restore_or_initialize(sess)

        tf.train.start_queue_runners(sess=sess)
        for step in range(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, losses])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            format_str = '%s: step %d, loss = %.5f (%.3f sec/batch)'
            print(format_str % (datetime.now(), step, loss_value, duration))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if step % 250 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.logdir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step,
                           write_meta_graph=False, write_state=False)
def __init__(self, model_dir=None, gpu_fraction=0.7):
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.per_process_gpu_memory_fraction = gpu_fraction
    self.sess = tf.Session(config=config)

    self.imgs_ph, self.bn, self.output_tensors, self.pred_labels, self.pred_locs = model.model(self.sess)
    total_boxes = self.pred_labels.get_shape().as_list()[1]
    self.positives_ph, self.negatives_ph, self.true_labels_ph, self.true_locs_ph, self.total_loss, self.class_loss, self.loc_loss = \
        model.loss(self.pred_labels, self.pred_locs, total_boxes)

    out_shapes = [out.get_shape().as_list() for out in self.output_tensors]
    c.out_shapes = out_shapes
    c.defaults = model.default_boxes(out_shapes)

    # variables in model are already initialized, so only initialize those declared after
    with tf.variable_scope("optimizer"):
        self.global_step = tf.Variable(0)
        self.lr_ph = tf.placeholder(tf.float32, shape=[])
        self.optimizer = tf.train.AdamOptimizer(1e-3).minimize(
            self.total_loss, global_step=self.global_step)
    new_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope="optimizer")
    self.sess.run(tf.initialize_variables(new_vars))

    if model_dir is None:
        model_dir = FLAGS.model_dir
    ckpt = tf.train.get_checkpoint_state(model_dir)
    self.saver = tf.train.Saver()
    if ckpt and ckpt.model_checkpoint_path:
        self.saver.restore(self.sess, ckpt.model_checkpoint_path)
        print("restored %s" % ckpt.model_checkpoint_path)
def train():
    tr, va, te = read_dataset('../mnist.pkl.gz')
    binarizer = LabelBinarizer().fit(range(10))

    x = tf.placeholder(tf.float32, [None, 784])
    y = tf.placeholder(tf.float32, [None, 10])
    keep_prob = tf.placeholder(tf.float32)

    preds = model.inference(x, keep_prob)
    loss, total_loss = model.loss(preds, y)
    acc = model.evaluation(preds, y)
    # learning rate: 0.1
    train_op = model.training(total_loss, 0.1)

    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)

    for i in xrange(10000):
        batch_xs, batch_ys = tr.next_batch(50)
        if i % 100 == 0:
            train_acc = acc.eval(feed_dict={
                x: batch_xs, y: binarizer.transform(batch_ys), keep_prob: 1.0}, session=sess)
            print "step: {0}, training accuracy {1}".format(i, train_acc)
            validation_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, va, sess)
            print("Validation accuracy : {0}".format(validation_accuracy))
        train_op.run(feed_dict={
            x: batch_xs, y: binarizer.transform(batch_ys), keep_prob: 0.5}, session=sess)

    test_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, te, sess)
    print("Test accuracy : ", test_accuracy)
def tower_loss(images, score_maps, geo_maps, training_masks, labels, reuse_variables=None):
    # Build inference graph
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
        f_score, f_geometry = model.model(images, is_training=True)
        f_dat = labels
    model_loss = model.loss(score_maps, f_score, geo_maps, f_geometry, training_masks)
    # total_loss = tf.add_n([model_loss] + 0.7*sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)))
    total_loss = sum([model_loss]) + reg_constant * sum(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    # add summary
    if reuse_variables is None:
        tf.summary.image('input', images)
        tf.summary.image('score_map', score_maps)
        tf.summary.image('score_map_pred', f_score * 255)
        tf.summary.image('geo_map_0', geo_maps[:, :, :, 0:1])
        tf.summary.image('geo_map_0_pred', f_geometry[:, :, :, 0:1])
        tf.summary.image('training_masks', training_masks)
        # tf.summary.image('weight_vis', [v for v in tf.trainable_variables() if 'resnet_v1_50' in v.name][0])
        tf.summary.scalar('model_loss', model_loss)
        tf.summary.scalar('total_loss', total_loss)

    return total_loss, model_loss, f_score, f_geometry, f_dat
def model_fn(features, labels, mode, params):
    # `mode` and `params` are referenced below, so they are taken as arguments
    # (the standard tf.estimator model_fn signature).
    tf.keras.backend.set_learning_phase(True)
    predictions = model.build(features)

    loss = None
    train_op = None
    eval_metric_ops = None
    if mode in [tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.TRAIN]:
        loss_dict, total_loss = model.loss(predictions, labels)
        loss = total_loss
        eval_metric_ops = eval_metric_operate(loss_dict)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = get_train_op(features, labels, total_loss, params)

    export_outputs = {
        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            tf.estimator.export.PredictOutput(predictions)
    }

    spec = tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=eval_metric_ops,
                                      export_outputs=export_outputs)
    return spec
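# --- Hedged usage sketch (not part of the snippet above) ---
# Shows how a model_fn like this is typically handed to tf.estimator.Estimator.
# `make_input_fn`, the model_dir path, and the params dict are placeholders/assumptions.
def make_estimator():
    return tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir='/tmp/model_dir',       # placeholder checkpoint/summary directory
        params={'learning_rate': 1e-3})   # forwarded to model_fn as `params`

# estimator = make_estimator()
# estimator.train(input_fn=make_input_fn(tf.estimator.ModeKeys.TRAIN), max_steps=1000)
# estimator.evaluate(input_fn=make_input_fn(tf.estimator.ModeKeys.EVAL))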
def run_training():
    data_dir = 'D:/WCsPy/data/train/'
    log_dir = 'saves'
    image, label = inputData.get_files(data_dir)
    image_batches, label_batches = inputData.get_batches(image, label, 32, 32, 16, 20)
    print(image_batches.shape)
    p = model.mmodel(image_batches, 16)
    cost = model.loss(p, label_batches)
    train_op = model.training(cost, 0.001)
    acc = model.get_accuracy(p, label_batches)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver()
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        for step in np.arange(1000):
            print(step)
            if coord.should_stop():
                break
            _, train_acc, train_loss = sess.run([train_op, acc, cost])
            print("loss:{} accuracy:{}".format(train_loss, train_acc))
            if step % 100 == 0:
                check = os.path.join(log_dir, "model.ckpt")
                saver.save(sess, check, global_step=step)
    except tf.errors.OutOfRangeError:
        print("Done!!!")
    finally:
        coord.request_stop()
    coord.join(threads)
    sess.close()
def tower_loss(scope, images, labels):
    """Calculate the total loss on a single tower running the model.

    Args:
        scope: unique prefix string identifying the tower, e.g. 'tower_0'
        images: Images. 4D tensor of shape [batch_size, height, width, 1].
        labels: Labels. 4D tensor of shape [batch_size, height, width, 1].

    Returns:
        Tensor of shape [] containing the total loss for a batch of data
    """
    # Build inference Graph.
    resize_images = model.inference(images)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = model.loss(resize_images, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    tf.summary.scalar('total_loss', total_loss)

    return total_loss
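# --- Hedged sketch (an assumption, not the project's actual model.loss) ---
# The tower pattern above only works if model.loss registers its terms in the
# 'losses' graph collection; a minimal regression-style loss doing that could be:
def loss(predictions, targets):
    l2_loss = tf.reduce_mean(tf.squared_difference(predictions, targets), name='l2_loss')
    tf.add_to_collection('losses', l2_loss)  # picked up by tf.get_collection('losses', scope)
    return l2_loss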
def __init__(self, model_dir=None):
    self.sess = tf.Session()

    self.imgs_ph, self.bn, self.output_tensors, self.pred_labels, self.pred_locs = model.model(self.sess)
    total_boxes = self.pred_labels.get_shape().as_list()[1]
    self.positives_ph, self.negatives_ph, self.true_labels_ph, self.true_locs_ph, self.total_loss, self.class_loss, self.loc_loss = \
        model.loss(self.pred_labels, self.pred_locs, total_boxes)

    out_shapes = [out.get_shape().as_list() for out in self.output_tensors]
    c.out_shapes = out_shapes
    c.defaults = model.default_boxes(out_shapes)

    # variables in model are already initialized, so only initialize those declared after
    with tf.variable_scope("optimizer"):
        self.global_step = tf.Variable(0)
        self.lr_ph = tf.placeholder(tf.float32)
        self.optimizer = tf.train.AdamOptimizer(1e-3).minimize(self.total_loss, global_step=self.global_step)
    new_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="optimizer")
    init = tf.variables_initializer(new_vars)
    self.sess.run(init)

    if model_dir is None:
        model_dir = FLAGS.model_dir
    ckpt = tf.train.get_checkpoint_state(model_dir)
    self.saver = tf.train.Saver()
    if ckpt and ckpt.model_checkpoint_path:
        self.saver.restore(self.sess, ckpt.model_checkpoint_path)
        print("restored %s" % ckpt.model_checkpoint_path)
def train(tfrecord_file, train_dir, batch_size, num_epochs):
    _, vectors, labels = data_loader.inputs([tfrecord_file], batch_size=batch_size,
                                            num_threads=16, capacity=batch_size * 4,
                                            min_after_dequeue=batch_size * 2,
                                            num_epochs=num_epochs, is_training=True)
    loss = model.loss(vectors, labels)
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Create training op with dependencies on update ops for batch norm
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = tf.train.AdamOptimizer(learning_rate=0.001). \
            minimize(loss, global_step=global_step)

    # Create training supervisor to manage model logging and saving
    sv = tf.train.Supervisor(logdir=train_dir, global_step=global_step,
                             save_summaries_secs=60, save_model_secs=600)

    with sv.managed_session() as sess:
        while not sv.should_stop():
            _, loss_out, step_out = sess.run([train_op, loss, global_step])
            if step_out % 100 == 0:
                print('Step {}: Loss {}'.format(step_out, loss_out))
def train(): """Train datasets for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for model. images, labels = model.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = model.inference(images) # Calculate loss. loss = model.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = model.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.global_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.summary.merge_all() # Build an initialization operation to run below. init = tf.global_variables_initializer() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement))# log_device_placement=True,该参数表示程序会将运行每一个操作的设备输出到屏幕 sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.summary.FileWriter(FLAGS.train_dir, graph_def=sess.graph_def) for step in range(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def eval(module, test_iter, args, write_to_file=False):
    mode = module.training
    module.eval()

    correct = 0
    total = 0
    loss_tot = 0
    eval_results = {}
    predictions = []
    for batch in tqdm(test_iter):
        scores = module.forward(batch.text)
        loss = model.loss(scores, batch.label)
        loss_tot += loss.item()
        preds = scores.argmax(1).squeeze()
        correct += sum((preds == batch.label)).item()
        total += batch.text.shape[0]
        if write_to_file:
            predictions += list(preds.cpu().numpy())

    eval_results['loss'] = loss_tot / len(test_iter)
    eval_results['accuracy'] = correct / total

    # Write predictions to file.
    if write_to_file:
        write_predictions(predictions, args, eval_results)

    module.train(mode)
    return eval_results
def setup_refine_model(input_images, depth_maps, depth_maps_sigma, keep_conv, keep_hidden):
    print("refine train.")
    if USE_ORIGINAL_MODEL:
        coarse = original_model.globalDepthMap(input_images, keep_conv, trainable=False)
        # coarse7, coarse6, coarse5, coarse3 = original_model.globalDepthMap(input_images, keep_conv, trainable=False)
        logits = original_model.localDepthMap(input_images, coarse, keep_conv, keep_hidden)
        loss = original_model.loss(logits, depth_maps, depth_maps_sigma)
        # c7 = tf.Print(coarse7, [coarse7], summarize=100)
        # c6 = tf.Print(coarse6, [coarse6], summarize=100)
        # c5 = tf.Print(coarse5, [coarse5], summarize=100)
        # c3 = tf.Print(coarse3, [coarse3], summarize=100)
        # logits, f3_d, f3, f2, f1_d, f1, pf1 = original_model.localDepthMap(images, coarse, keep_conv, keep_hidden)
        # o_p_logits = tf.Print(logits, [logits], summarize=100)
        # o_p_f3_d = tf.Print(f3_d, [f3_d], "fine3_dropout", summarize=100)
        # o_p_f3 = tf.Print(f3, [f3], "fine3", summarize=100)
        # o_p_f2 = tf.Print(f2, [f2], "fine2", summarize=100)
        # o_p_f1_d = tf.Print(f1_d, [f1_d], "fine1_dropout", summarize=100)
        # o_p_f1 = tf.Print(f1, [f1], "fine1", summarize=100)
        # o_p_pf1 = tf.Print(pf1, [pf1], "pre_fine1", summarize=100)
    else:
        coarse = maurice_model.globalDepthMap(input_images, keep_conv, trainable=False)
        logits = maurice_model.localDepthMap(input_images, coarse, keep_conv, keep_hidden)
        loss = maurice_model.loss(logits, depth_maps, depth_maps_sigma)
    return logits, loss
def train():
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = model.distorted_inputs()
        logits = model.inference(images)
        loss = model.loss(logits, labels)
        train_op = model.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if FLAGS.resume_training and ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            current_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
        else:
            current_step = 0
            init = tf.initialize_all_variables()
            sess.run(init)

        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(SUMMARY_DIR, graph_def=sess.graph_def)

        for step in xrange(current_step, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f '
                              '(%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 50 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if step % 100 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def train(): with tf.Graph().as_default(), tf.device("/gpu:0"): global_step = tf.Variable(0, trainable=False) dirs_gray, dirs_color = make_data_directory_list() gray_images, color_images = input.read_dirs(dirs_gray, dirs_color, is_train=True) inferenced = model.inference(gray_images) raw_loss, total_loss = model.loss(inferenced, color_images) train_op = get_train_op(raw_loss, total_loss, global_step) summary_op = tf.merge_all_summaries() #saver = tf.train.Saver(tf.all_variables()) saver = tf.train.Saver(tf.trainable_variables()) init = tf.initialize_all_variables() sess = tf.Session(config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement)) sess.run(init) ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) if ckpt and ckpt.model_checkpoint_path: print "restore from {}".format(ckpt.model_checkpoint_path) saver.restore(sess, ckpt.model_checkpoint_path) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) for step in xrange(FLAGS.max_steps): start_time = time.time() _, value_raw_loss, value_total_loss = sess.run([train_op, raw_loss, total_loss]) duration = time.time() - start_time if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, raw_loss = %.2f, total_loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, value_raw_loss, value_total_loss, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.train.get_or_create_global_step() print(global_step) # Get images and labels for CIFAR-10. # Force input pipeline to CPU:0 to avoid operations sometimes ending up on # GPU and resulting in a slow down. images, labels = cifar10_input.distorted_inputs() print(images) print(labels) # Build a Graph that computes the logits predictions from the # inference model. logits = model.inference(images) # Calculate loss. loss = model.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = model.train(loss, global_step) print(logits) class _LoggerHook(tf.train.SessionRunHook): """Logs loss and runtime.""" def begin(self): self._step = -1 self._start_time = time.time() def before_run(self, run_context): self._step += 1 return tf.train.SessionRunArgs(loss) # Asks for loss value. def after_run(self, run_context, run_values): if self._step % FLAGS.log_frequency == 0: current_time = time.time() duration = current_time - self._start_time self._start_time = current_time loss_value = run_values.results examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration sec_per_batch = float(duration / FLAGS.log_frequency) format_str = ( '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch)) with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.train_dir, hooks=[ tf.train.StopAtStepHook(last_step=FLAGS.max_steps), _LoggerHook() ]) as mon_sess: while not mon_sess.should_stop(): mon_sess.run(train_op)
def train(): """Train SUN3D for a number of steps.""" with tf.Graph().as_default(), tf.device('/gpu:1'): global_step = tf.contrib.framework.get_or_create_global_step() # Get images and labels for SUN3D. images, depths = model.inputs() # Build a Graph that computes the logits predictions from the # inference model. phase_train = True scores = model.inference(images, phase_train) # Calculate loss. loss = model.loss(scores, depths) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = model.train(loss, global_step) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.96) class _LoggerHook(tf.train.SessionRunHook): """Logs loss and runtime.""" def begin(self): self._step = -1 def before_run(self, run_context): self._step += 1 self._start_time = time.time() return tf.train.SessionRunArgs(loss) # Asks for loss value. def after_run(self, run_context, run_values): duration = time.time() - self._start_time loss_value = run_values.results if self._step % 10 == 0: num_examples_per_step = BATCH_SIZE examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ( '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch)) with tf.train.MonitoredTrainingSession( checkpoint_dir=TRAIN_LOG, hooks=[ tf.train.StopAtStepHook(last_step=NUM_ITER), tf.train.NanTensorHook(loss), _LoggerHook() ], config=tf.ConfigProto( allow_soft_placement=True, gpu_options=gpu_options, log_device_placement=LOG_DEVICE_PLACEMENT)) as mon_sess: while not mon_sess.should_stop(): print(mon_sess.run(loss)) mon_sess.run(train_op)
def pgd_conv(x, y, images_pl, labels_pl, logits_pl, exp_config, sess,
             eps=None, step_alpha=None, epochs=None, sizes=None, weights=None):
    mask_tensor_shape = [1] + list(exp_config.image_size)

    # compute loss
    loss = model.loss(logits_pl,
                      labels_pl,
                      nlabels=exp_config.nlabels,
                      loss_type=exp_config.loss_type,
                      weight_decay=exp_config.weight_decay)

    crafting_input = x.copy()
    crafting_output = crafting_input
    # crafting_target = y.copy()
    for i in range(epochs):
        grad_pl, = tf.gradients(loss, images_pl)
        grad = sess.run([grad_pl], feed_dict={images_pl: crafting_input, labels_pl: y})[0]
        assert grad is not None
        added = np.sign(grad)
        step_output = crafting_input + step_alpha * added
        total_adv = step_output - x
        total_adv = np.clip(total_adv, -eps, eps)
        crafting_output = x + total_adv
        crafting_input = crafting_output

    added = crafting_output - x
    print('PGD DONE')

    for i in range(epochs * 2):
        temp = tf.nn.conv2d(input=added, filter=weights[0], padding='SAME', data_format='NHWC')
        for j in range(len(sizes) - 1):
            temp = temp + tf.nn.conv2d(input=added, filter=weights[j + 1], padding='SAME', data_format='NHWC')
        temp = temp / float(len(sizes))  # average over multiple convolutions
        temp = temp.eval(session=sess)
        grad_pl, = tf.gradients(loss, images_pl)
        grad = sess.run([grad_pl], feed_dict={images_pl: temp, labels_pl: y})[0]
        assert grad is not None
        del temp
        added = added + step_alpha * np.sign(grad)
        added = np.clip(added, -eps, eps)
    print('SMOOTH PGD1 DONE')

    temp = tf.nn.conv2d(input=added, filter=weights[0], padding='SAME', data_format='NHWC')
    for j in range(len(sizes) - 1):
        temp = temp + tf.nn.conv2d(input=added, filter=weights[j + 1], padding='SAME', data_format='NHWC')
    temp = temp / float(len(sizes))
    temp = temp.eval(session=sess)
    crafting_output = x + temp
    del temp
    print('SMOOTH PGD2 DONE')
    return crafting_output
def get_attack_batch(model_name, count):
    if not os.path.exists(FGSM_DIR):
        os.makedirs(FGSM_DIR)

    tf.reset_default_graph()

    # computational graph
    img_batch = tf.placeholder(tf.float32, shape=[None, 28 * 28], name='img_batch')
    label_batch = tf.placeholder(tf.float32, shape=[None, 10], name='labels_batch')
    out = model.cnn(img_batch)
    logits = out.get('logits')
    probabilities = out.get('probabilities')
    loss = model.loss(label_batch, logits)

    img_batch_val, label_batch_val = data.get_test_batch(count)
    classes_batch_val = np.argmax(label_batch_val, axis=1)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, pp + MODEL_DIR + os.sep + 'model_' + model_name + '.ckpt')

        gradients = tf.gradients(loss, img_batch)
        grad_vals, probabilities_val = sess.run([gradients, probabilities],
                                                feed_dict={
                                                    img_batch: img_batch_val,
                                                    label_batch: label_batch_val
                                                })
        grad_vals_sign = np.sign(grad_vals[0]) * 1.0 / 255.
        assigned_classes = np.argmax(probabilities_val, axis=1)

        original_images = []
        successful_attacks = []
        for i, grad in enumerate(grad_vals_sign):
            if assigned_classes[i] != classes_batch_val[i]:
                # classification should have been correct
                continue
            epss = np.arange(0., 100., 1)  # epsilon values
            attacks = [img_batch_val[i] + grad * 1 * x for x in epss]
            attacks = np.clip(attacks, 0, 1)  # clip image pixels to [0,1]
            # run classification on attacks
            probabilities_val = sess.run(probabilities, feed_dict={img_batch: attacks})
            best_attack = get_first_successful(probabilities_val, attacks)
            if best_attack is not None:
                successful_attacks.append(best_attack)
                original_images.append(img_batch_val[i])

    log_attacks(original_images, successful_attacks)
    return original_images, successful_attacks
def run_net(Xs, Ys, YsBoW=None):
    Xs, Ys = torch.from_numpy(Xs).to(args.device), torch.from_numpy(Ys).to(args.device)
    if (args.mt):
        YsBoW = torch.from_numpy(YsBoW).to(args.device)
    N, H, W = Xs.size()[0], Xs.size()[1], Xs.size()[2]
    Xs = Xs.unsqueeze(dim=1)  # .view(N, 1, H, W)

    if (args.mt):
        fVectorSp, Ys_predBoW, Ys_pred = modelSp(Xs, modelType=args.mtType)
    else:
        fVectorSp, Ys_pred = modelSp(Xs)

    loss = model.loss(Ys_pred, Ys)
    if (args.mt):
        lossBoW = model.loss(Ys_predBoW, YsBoW)
        return loss, lossBoW, Ys_pred.cpu().data.numpy(), Ys_predBoW.cpu().data.numpy(), fVectorSp
    else:
        return loss, Ys_pred.cpu().data.numpy(), fVectorSp
def __init__(self, datafold, adam_rate=0.0001, batch_size=256, n_epochs=30, penalty_intensity=0.05):
    path_folder = PATH_TO_DATA + 'datafold_' + str(datafold) + '/'

    train_csv = pd.read_csv(path_folder + "train_set.csv")
    self.training_set_size = len(train_csv)
    self.train_tf_records_path = path_folder + 'train_256_3d.tfrecords'

    test_csv = pd.read_csv(path_folder + "test_set.csv")
    self.test_set_size = len(test_csv)
    self.test_tf_records_path = path_folder + 'test_256_3d.tfrecords'

    self.adam_rate = adam_rate
    self.batch_size = batch_size
    self.n_epochs = n_epochs
    self.penalty_intensity = penalty_intensity
    print("adam_rate: " + str(adam_rate))
    print("batch_size: " + str(batch_size))
    print("n_epochs: " + str(n_epochs))
    print("penalty_intensity: " + str(penalty_intensity))

    self.logdir = path_folder + '/logs_3D_CNN_LR_' + str(adam_rate) + '_BS_' + \
        str(batch_size) + '_L2_' + str(penalty_intensity) + '/'
    self.tensorboard_n_checkpoint = self.logdir + 'tensorboard_n_checkpoint/'
    self.chkpt = self.tensorboard_n_checkpoint + 'model.ckpt'

    with tf.variable_scope('3D_CNN'):
        self.X = tf.placeholder(
            tf.float32,
            [None, MODIFIED_SIZE, MODIFIED_SIZE, MODIFIED_SIZE, NUM_CHANNEL],
            name='X')
        self.y = tf.placeholder(tf.float32, [None, OUTPUT_SIZE], name='y')
        self.keep_rate = tf.placeholder(tf.float32)

        score = inference(self.X, self.keep_rate, OUTPUT_SIZE)
        softmax = tf.nn.softmax(score)

        self.cost = loss(score, self.y, self.penalty_intensity)
        self.optimizer = tf.train.AdamOptimizer(self.adam_rate).minimize(
            self.cost, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))

        self.preds = tf.equal(tf.argmax(softmax, axis=1), tf.argmax(self.y, axis=1))
        self.accuracy = tf.reduce_mean(tf.cast(self.preds, tf.float32))

        self.cost_summary = tf.summary.scalar(name='Cost', tensor=self.cost)
        self.accuracy_summary = tf.summary.scalar(name='Accuracy', tensor=self.accuracy)
        self.summary = tf.summary.merge_all()
def train():
    # Dataset paths
    print("start")
    image_dir = r'E:\VOC2013\JPEGImages/'  # my dir -- 20170727-csq
    xml_dir = r'E:\VOC2013\Annotations/'

    # Load images and label parameters
    imagesname, labels, w_h_s, number = reader.input_data()
    label_data = reader.data_normalizer(labels, w_h_s, number)
    label_32 = tf.cast(label_data, tf.float32)
    print(label_32)
    print("start2")
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    image_batch = []
    image_batch, label_batch = reader.get_batch(imagesname, label_data, number)
    print(image_batch)
    # tensor = np.array(image_batch)

    sess = tf.Session()
    coord = tf.train.Coordinator()

    ps1, ps2, ps3 = model.model(image_batch, True)
    scale1, scale2, scale3 = model.scales(ps1, ps2, ps3, True)
    loss = model.loss(scale1, label_batch)
    tf.squeeze(loss, 2)
    print(loss)
    tf.summary.scalar('loss', loss)
    train_op = model.op(loss, 0.01)

    summary_op = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        q = tf.train.start_queue_runners(sess=sess, coord=coord)
        # sess.run(tensor)
        for step in range(10000):
            # sess.run([ps1, ps2, ps3])
            # sess.run([scale1, scale2, scale3])
            op, loss_result = sess.run([train_op, loss])
            if step % 50 == 0:
                print(step)
                print(loss_result)
                summary_str = sess.run(summary_op)
                train_writer.add_summary(summary_str, step)
            if step % 2000 == 0 or (step + 1) == 10000:
                # Save the model every 2000 steps; checkpoints go to checkpoint_path
                checkpoint_path = os.path.join(logs_train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
        print('finish')
def train():
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()
        images, labels = input.createBatch(tfrecords_name, batch_size)
        logits = model.inference(images, batch_size, n_classes)
        loss = model.loss(logits, labels)
        accuracy = model.evaluation(logits, labels)
        train_op = model.trainning(loss, learning_rate, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # if self._step % log_frequency == 0:
                #     print(self.run(accuracy))
                #     print("step %d, accuracy = %.2f" % (self._step, accuracy))
                return tf.train.SessionRunArgs([loss, accuracy])  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    [loss_value, accuracy_value] = run_values.results
                    # accuracy_value = run_context.accuracy
                    examples_per_sec = log_frequency * batch_size / duration
                    sec_per_batch = float(duration / log_frequency)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))
                    print('Accuracy = %.2f' % accuracy_value)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=max_steps),
                    tf.train.NanTensorHook(loss),
                    tf.train.SummarySaverHook(save_steps=5,
                                              output_dir=board_dir,
                                              summary_op=tf.summary.merge_all()),
                    _LoggerHook()
                ]) as mon_sess:
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=mon_sess, coord=coord)
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
                # print('dont stop')
            coord.request_stop()
            coord.join(threads)
def run_net(Xs, Ys, YsBoW=None):
    Xs, Ys = torch.from_numpy(Xs).to(args.device), torch.from_numpy(Ys).to(args.device)
    if (args.mt):
        YsBoW = torch.from_numpy(YsBoW).to(args.device)
    N, H, W = Xs.size()[0], Xs.size()[1], Xs.size()[2]
    Xs = Xs.unsqueeze(dim=1)  # .view(N, 1, H, W)

    if (args.attn):
        Ys_pred, attn_weights = network(Xs)
    elif (args.mt):
        Ys_predBoW, Ys_pred = network(Xs)
    else:
        Ys_pred = network(Xs)

    loss = model.loss(Ys_pred, Ys)
    # pdb.set_trace()
    if (args.mt):
        lossBoW = model.loss(Ys_predBoW, YsBoW)

    if (args.attn):
        return loss, Ys_pred.cpu().data.numpy(), attn_weights
    elif (args.mt):
        return loss, lossBoW, Ys_pred.cpu().data.numpy(), Ys_predBoW.cpu().data.numpy()
    else:
        return loss, Ys_pred.cpu().data.numpy()
def run_training(): train_dir = "D:\新建文件夹\python foot/train/" log_train_dir = "D:\新建文件夹\python foot/train_savenet/" vadiation_dir = 'D:\新建文件夹\python foot/valiation/' train, train_labels = pre_process.get_files(train_dir) train_batch, train_label_batch = pre_process.get_batch( train, train_labels, IMG_W, IMG_H, BATCH_SIZE, CAPACITY) train_logits = model.inference(train_batch, BATCH_SIZE, N_CLASSES) train_loss = model.loss(train_logits, train_label_batch) train_op = model.training(train_loss, LEARNING_RATE) train_acc = model.evalution(train_logits, train_label_batch) summary_op = tf.summary.merge_all( ) #merge_all 可以将所有summary全部保存到磁盘,以便tensorboard显示。 # 一般这一句就可显示训练时的各种信息。 #vadiation, vadiation_labels = pre_process.get_files(vadiation_dir) #vadiation_batch, vadiation_label_batch = pre_process.get_batch(vadiation, vadiation_labels, IMG_W,IMG_H,BATCH_SIZE, CAPACITY) #vadiation_logits = model.inference(vadiation_batch, BATCH_SIZE, N_CLASSES) #vadiation_loss = model.loss(vadiation_logits, vadiation_label_batch) #vadiation_acc = model.evalution(vadiation_logits, vadiation_label_batch) sess = tf.Session() train_writer = tf.summary.FileWriter(log_train_dir, sess.graph) #指定一个文件用来保存图 saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) # Coordinator 和 start_queue_runners 监控 queue 的状态,不停的入队出队 coord = tf.train.Coordinator( ) #https://blog.csdn.net/weixin_42052460/article/details/80714539 threads = tf.train.start_queue_runners(sess=sess, coord=coord) try: for step in np.arange(STEP): if coord.should_stop(): break _, tra_loss, tra_acc = sess.run([train_op, train_loss, train_acc]) if step % 50 == 0: #%.2f表示输出浮点数并保留两位小数。%%表示直接输出一个% print("step %d, train loss = %.2f, train accuracy = %.2f%%" % (step, tra_loss, tra_acc * 100.0)) summary_str = sess.run(summary_op) train_writer.add_summary(summary_str, step) #????????????? if step % 2000 == 0 or (step + 1) == STEP: # 每隔2000步保存一下模型,模型保存在 checkpoint_path 中 print( "step %d, vadiation loss = %.2f, vadiation accuracy = %.2f%%" % (step, vadiation_loss, vadiation_acc * 100.0)) checkpoint_path = os.path.join(log_train_dir, "model.ckpt") saver.save(sess, checkpoint_path, global_step=step) except tf.errors.OutOfRangeError: print('Done training -- epoch limit reached') finally: coord.request_stop() coord.join(threads) sess.close()
def train():
    train, validation = data.get_nameset()

    with tf.Graph().as_default():
        img1_placeholder, img2_placeholder, flo_placeholder = model.placeholder_inputs()
        predict6, predict5, predict4, predict3, predict2 = model.inference(
            img1_placeholder, img2_placeholder)
        loss = model.loss(predict6, predict5, predict4, predict3, predict2, flo_placeholder)

        # Define global_step once so the decayed learning rate and the optimizer
        # increment the same variable (the original defined it twice).
        global_step = tf.Variable(0, name='global_step', trainable=False)
        learning_rate = tf.train.exponential_decay(initial_learning_rate,
                                                   global_step,
                                                   decay_steps=200000,
                                                   decay_rate=0.1)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)

        summary = tf.summary.merge_all()
        init = tf.initialize_all_variables()
        saver = tf.train.Saver()
        sess = tf.Session()
        train_timer = Timer()

        sess.run(init)

        for step in xrange(max_steps):
            train_timer.tic()
            feed_dict = model.fill_feed_dict(train, img1_placeholder,
                                             img2_placeholder, flo_placeholder)
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            train_timer.toc()

            if step % 100 == 0:
                if step % 20 == 0:
                    log_str = ('{} Epoch: {}, Step: {}, Learning rate: {},'
                               ' Loss: {:5.3f}\nSpeed: {:.3f}s/iter, Remain: {}').format(
                                   datetime.datetime.now().strftime('%m/%d %H:%M:%S'),
                                   train.epochs_completed,
                                   int(step),
                                   learning_rate.eval(session=sess),
                                   loss_value,
                                   train_timer.average_time,
                                   train_timer.remain(step, max_steps))
                    print log_str

                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            if (step + 1) % 1000 == 0 or (step + 1) == max_steps:
                checkpoint_file = os.path.join(log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=step)
                print('Validation Data Eval:')
                run_val(sess, img1_placeholder, img2_placeholder, flo_placeholder,
                        loss, validation)
def eval_h5(conf, ckpt):
    """
    Evaluate the model.

    Args:
        conf: configuration dictionary
        ckpt: restore from ckpt
    """
    cw = conf["cw"]
    mb_size = conf["mb_size"]
    path_tmp = conf["path_tmp"]
    n_epochs = conf["n_epochs"]
    iw = conf["iw"]
    grad_norm_thresh = conf["grad_norm_thresh"]

    # Prepare data
    tr_stream, te_stream = tools.prepare_data(conf)
    n_tr = tr_stream.dataset.num_examples
    n_te = te_stream.dataset.num_examples

    with tf.Graph().as_default(), tf.device("/cpu:0" if FLAGS.dev_assign else None):
        # Placeholders
        Xs = [tf.placeholder(tf.float32, [None, iw, iw, 1], name="X_%02d" % i)
              for i in range(FLAGS.num_gpus)]
        Ys = [tf.placeholder(tf.float32, [None, iw - 2 * cw, iw - 2 * cw, 1], name="Y_%02d" % i)
              for i in range(FLAGS.num_gpus)]

        # Calculate the gradients for each model tower
        tower_grads = []
        y_splits = []
        for i in range(FLAGS.num_gpus):
            with tf.device(("/gpu:%d" % i) if FLAGS.dev_assign else None):
                with tf.name_scope("%s_%02d" % (FLAGS.tower_name, i)) as scope:
                    # Calculate the loss for one tower. This function constructs
                    # the entire model but shares the variables across all towers.
                    y_split = model.inference(Xs[i], conf)
                    y_splits.append(y_split)
                    total_loss = model.loss(y_split, Ys[i], conf, scope)

                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

        y = tf.concat(0, y_splits, name="y")

        # Tensorflow boilerplate
        sess, saver, summ_writer, summ_op = tools.tf_boilerplate(None, conf, ckpt)

        # Evaluation
        psnr_tr = eval_epoch(Xs, Ys, y, sess, tr_stream, cw)
        psnr_te = eval_epoch(Xs, Ys, y, sess, te_stream, cw)
        print("approx psnr_tr=%.3f" % psnr_tr)
        print("approx psnr_te=%.3f" % psnr_te)

        tr_stream.close()
        te_stream.close()
def _tower_loss(images, labels, num_classes, scope, reuse_variables=None):
    """Calculate the total loss on a single tower running the ImageNet model.

    We perform 'batch splitting'. This means that we cut up a batch across
    multiple GPUs.

    Args:
        images: Images. 5D tensor of size [cfg.TRAIN.MINIBATCH, cfg.TRAIN.SEGMENT_NUM,
            cfg.TRAIN.IMAGE_HEIGHT, cfg.TRAIN.IMAGE_WIDTH, cfg.TRAIN.INPUT_CHS].
        labels: 1-D integer Tensor of [cfg.TRAIN.MINIBATCH].
        num_classes: number of classes
        scope: unique prefix string identifying the ImageNet tower, e.g. 'tower_0'.

    Returns:
        Tensor of shape [] containing the total loss for a batch of data
    """
    # Build inference Graph.
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
        logits = inception.inference(images, num_classes, for_training=True, scope=scope)

    split_batch_size = tf.shape(images)[0]
    inception.loss(logits, labels, batch_size=split_batch_size)

    losses = tf.get_collection(tf.GraphKeys.LOSSES, scope)
    regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    total_loss = tf.add_n(losses + regularization_losses, name='total_loss')

    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    loss_averages_op = loss_averages.apply(losses + [total_loss])

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        loss_name = re.sub('%s_[0-9]*/' % inception.TOWER_NAME, '', l.op.name)
        tf.summary.scalar(loss_name + '_raw', l)
        tf.summary.scalar(loss_name, loss_averages.average(l))

    with tf.control_dependencies([loss_averages_op]):
        total_loss = tf.identity(total_loss)

    return logits, total_loss
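# --- Hedged sketch of the batch-splitting caller described in the docstring ---
# Assumes the caller holds a full minibatch in `images`/`labels`, has created an
# optimizer `opt`, and knows `num_gpus`; none of these names come from the snippet above.
def build_towers(images, labels, num_classes, num_gpus, opt):
    images_splits = tf.split(images, num_or_size_splits=num_gpus, axis=0)
    labels_splits = tf.split(labels, num_or_size_splits=num_gpus, axis=0)
    tower_grads, reuse = [], None
    for i in range(num_gpus):
        with tf.device('/gpu:%d' % i), tf.name_scope('tower_%d' % i) as scope:
            _, total_loss = _tower_loss(images_splits[i], labels_splits[i],
                                        num_classes, scope, reuse_variables=reuse)
            reuse = True  # share variables with the remaining towers
            tower_grads.append(opt.compute_gradients(total_loss))
    return tower_grads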
def train(self, epoch_idx, batch_size, max_norm):
    logger, model, data = self.logger, self.model, self.data
    logger.info('At %d-th epoch with lr %f.', epoch_idx,
                self.optimizer.param_groups[0]['lr'])
    model.train()
    nb_train_batch = ceil(data.nb_train / batch_size)
    for src, src_mask, trg, _ in tqdm(
            data.train_batch_sample(batch_size), total=nb_train_batch):
        out = model(src, src_mask, trg)
        loss = model.loss(out, trg[1:])
        self.optimizer.zero_grad()
        loss.backward()
        if max_norm > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        logger.debug('loss %f with total grad norm %f', loss,
                     util.grad_norm(model.parameters()))
        self.optimizer.step()
def train(conf, ckpt=False):
    """
    Train model for a number of steps.

    Args:
        conf: configuration dictionary
        ckpt: restore from ckpt
    """
    cw = conf["cw"]
    mb_size = conf["mb_size"]
    path_tmp = conf["path_tmp"]
    n_epochs = conf["n_epochs"]
    iw = conf["iw"]
    grad_norm_thresh = conf["grad_norm_thresh"]

    tools.reset_tmp(path_tmp, ckpt)

    # Prepare data
    tr_stream, te_stream = tools.prepare_data(conf)
    n_tr = tr_stream.dataset.num_examples
    n_te = te_stream.dataset.num_examples

    with tf.Graph().as_default(), tf.device("/cpu:0" if FLAGS.dev_assign else None):
        # Exponential decay learning rate
        global_step = tf.get_variable("global_step", [],
                                      initializer=tf.constant_initializer(0),
                                      dtype=tf.int32,
                                      trainable=False)
        lr = tools.exp_decay_lr(global_step, n_tr, conf)

        # Create an optimizer that performs gradient descent
        opt = tf.train.AdamOptimizer(lr)

        # Placeholders
        Xs = [tf.placeholder(tf.float32, [None, iw, iw, 1], name="X_%02d" % i)
              for i in range(FLAGS.num_gpus)]
        Ys = [tf.placeholder(tf.float32, [None, iw - 2 * cw, iw - 2 * cw, 1], name="Y_%02d" % i)
              for i in range(FLAGS.num_gpus)]

        # Calculate the gradients for each model tower
        tower_grads = []
        y_splits = []
        for i in range(FLAGS.num_gpus):
            with tf.device(("/gpu:%d" % i) if FLAGS.dev_assign else None):
                with tf.name_scope("%s_%02d" % (FLAGS.tower_name, i)) as scope:
                    # Calculate the loss for one tower. This function constructs
                    # the entire model but shares the variables across all towers.
                    y_split, model_vars = model.inference(Xs[i], conf)
                    y_splits.append(y_split)
                    total_loss = model.loss(y_split, model_vars, Ys[i], conf["l2_reg"], scope)

                    # Calculate the gradients for the batch of data on this tower.
                    gvs = opt.compute_gradients(total_loss)

                    # Optionally clip gradients.
                    if grad_norm_thresh > 0:
                        gvs = tools.clip_by_norm(gvs, grad_norm_thresh)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(gvs)

                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    # Retain the summaries from the final tower.
                    summs = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

        y = tf.concat(0, y_splits, name="y")

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        gvs = tools.average_gradients(tower_grads)

        # Apply the gradients to adjust the shared variables.
        apply_grad_op = opt.apply_gradients(gvs, global_step=global_step)

        # Add a summary to track the learning rate.
        summs.append(tf.scalar_summary("learning_rate", lr))

        # Add histograms for gradients.
        for g, v in gvs:
            if g:
                v_name = re.sub("%s_[0-9]*/" % FLAGS.tower_name, "", v.op.name)
                summs.append(tf.histogram_summary(v_name + "/gradients", g))

        # Tensorflow boilerplate
        sess, saver, summ_writer, summ_op = tools.tf_boilerplate(summs, conf, ckpt)

        # Baseline error
        # bpsnr_tr = tools.baseline_psnr(tr_stream)
        # bpsnr_te = tools.baseline_psnr(te_stream)
        # print('approx baseline psnr_tr=%.3f' % bpsnr_tr)
        # print('approx baseline psnr_te=%.3f' % bpsnr_te)

        # Train
        format_str = ("%s| %04d PSNR=%.3f (%.3f) (F+B: %.1fex/s; %.1fs/batch) "
                      "(F: %.1fex/s; %.1fs/batch)")
        # step = 0
        step = sess.run(global_step)
        for epoch in range(n_epochs):
            print("--- Epoch %d ---" % epoch)

            # Training
            for X_c, y_c in tr_stream.get_epoch_iterator():
                if X_c.shape[0] < FLAGS.num_gpus:
                    continue
                y_c = y_c[:, cw:-cw, cw:-cw]
                chunk_size = X_c.shape[0]
                gpu_chunk = chunk_size // FLAGS.num_gpus
                dict_input1 = [(Xs[i], X_c[i * gpu_chunk:
                                           ((i + 1) * gpu_chunk) if (i != FLAGS.num_gpus - 1)
                                           else chunk_size])
                               for i in range(FLAGS.num_gpus)]
                dict_input2 = [(Ys[i], y_c[i * gpu_chunk:
                                           ((i + 1) * gpu_chunk) if (i != FLAGS.num_gpus - 1)
                                           else chunk_size])
                               for i in range(FLAGS.num_gpus)]
                feed = dict(dict_input1 + dict_input2)

                start_time = time.time()
                sess.run(apply_grad_op, feed_dict=feed)
                duration_tr = time.time() - start_time

                if step % 40 == 0:
                    feed2 = dict(dict_input1)

                    start_time = time.time()
                    y_eval = sess.run(y, feed_dict=feed2)
                    duration_eval = time.time() - start_time

                    psnr = tools.eval_psnr(y_c, y_eval)
                    bl_psnr = tools.eval_psnr(y_c, X_c[:, cw:-cw, cw:-cw])
                    ex_per_step_tr = mb_size * FLAGS.num_gpus / duration_tr
                    ex_per_step_eval = mb_size * FLAGS.num_gpus / duration_eval
                    print(format_str % (datetime.now().time(), step, psnr, bl_psnr,
                                        ex_per_step_tr,
                                        float(duration_tr / FLAGS.num_gpus),
                                        ex_per_step_eval,
                                        float(duration_eval / FLAGS.num_gpus)))

                if step % 50 == 0:
                    summ_str = sess.run(summ_op, feed_dict=feed)
                    summ_writer.add_summary(summ_str, step)

                if step % 150 == 0:
                    saver.save(sess, os.path.join(path_tmp, "ckpt"), global_step=step)

                step += 1

            # Evaluation
            # psnr_tr = eval_epoch(Xs, Ys, y, sess, tr_stream, cw)
            # psnr_te = eval_epoch(Xs, Ys, y, sess, te_stream, cw)
            # print('approx psnr_tr=%.3f' % psnr_tr)
            # print('approx psnr_te=%.3f' % psnr_te)

            saver.save(sess, os.path.join(path_tmp, "ckpt"), global_step=step)

        saver.save(sess, os.path.join(path_tmp, "ckpt"), global_step=step)

        tr_stream.close()
        te_stream.close()
def run_training(): """ Train the Classy model for a number of steps """ with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for runway images, labels = rw.inputs(FLAGS.batch_size, NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN) # Batch normalization if FLAGS.batch_norm: phase_train = tf.Variable(True, trainable=False, dtype=tf.bool) images = batch_norm(images, 3, phase_train=phase_train) # Build a Graph that computes the logits predictions from the # inference model. logits = cl.inference(images, keep_prob=FLAGS.keep_prob, overlap_pool=FLAGS.overlap_pool) # Calculate loss. loss = cl.loss(logits, labels) # Calculate accuracy accuracy = cl.accuracy(logits, labels) cl.add_accuracy_summaries(accuracy) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = train(loss, global_step) # Create a saver. Store 2 files per epoch, plus 2 for the beginning and end of training saver = tf.train.Saver(tf.all_variables(), max_to_keep=FLAGS.num_epochs*2+2) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) # start the summary writer summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) # start the training! accuracies = [] losses = [] steps_per_epoch = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size) steps_per_checkpoint = int(steps_per_epoch / 2) max_steps = FLAGS.num_epochs * steps_per_epoch for step in range(max_steps): start_time = time.time() _, loss_value, acc_value = sess.run([train_op, loss, accuracy]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' losses.append(loss_value) accuracies.append(acc_value) if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f, train_acc = %.2f, (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, acc_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) np.save(os.path.join(FLAGS.train_dir, 'tr_losses'), np.array(losses)) np.save(os.path.join(FLAGS.train_dir, 'tr_accuracies'), np.array(accuracies)) # Save the model checkpoint periodically. if step % steps_per_checkpoint == 0 or (step + 1) == max_steps or _shutdown: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if _shutdown: break print('Classy training finished!')
# Specify the scope that is written out to the TensorBoard graph
with tf.Graph().as_default():
    # Tensor for the images (holds any number (None) of images with IMAGE_PIXELS = 28*28*3 dimensions)
    images_placeholder = tf.placeholder("float", shape=(None, IMAGE_PIXELS))
    # Tensor for the labels (holds any number (None) of NUM_CLASSES(=3)-dimensional labels)
    labels_placeholder = tf.placeholder("float", shape=(None, NUM_CLASSES))
    # Placeholder for the dropout keep probability
    keep_prob = tf.placeholder("float")

    # Build the model by calling inference()
    logits = model.inference(images_placeholder, keep_prob)
    # Compute the loss by calling loss()
    loss_value = model.loss(logits, labels_placeholder)
    # Call training() to train and adjust the model parameters
    train_op = model.training(loss_value, FLAGS.learning_rate)
    # Compute the accuracy
    acc = model.accuracy(logits, labels_placeholder)

    # Prepare for saving checkpoints
    saver = tf.train.Saver()
    # Create a Session (TensorFlow computations must run inside a Session)
    sess = tf.Session()
    # Initialize the variables (first thing to do after starting a Session)
    sess.run(tf.global_variables_initializer())
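    # The snippet above only builds the graph; the loop that feeds the
    # placeholders is not shown. A minimal sketch of such a loop, assuming
    # train_image / train_label arrays and FLAGS.max_steps, FLAGS.batch_size
    # are defined elsewhere (names are assumptions for illustration):
    for step in range(FLAGS.max_steps):
        for i in range(len(train_image) // FLAGS.batch_size):
            batch = FLAGS.batch_size * i
            # Feed one mini-batch and run a training step with dropout enabled.
            sess.run(train_op, feed_dict={
                images_placeholder: train_image[batch:batch + FLAGS.batch_size],
                labels_placeholder: train_label[batch:batch + FLAGS.batch_size],
                keep_prob: 0.5})
        # Evaluate accuracy on the training set with dropout disabled.
        train_accuracy = sess.run(acc, feed_dict={
            images_placeholder: train_image,
            labels_placeholder: train_label,
            keep_prob: 1.0})
        print("step %d, training accuracy %g" % (step, train_accuracy))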
def run_training():
    """ Train the Listnr model for a number of steps """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get frames and labels
        # tr_frames_t, tr_labels_t = tm.inputs(FLAGS.batch_size)
        # ts_frames_t, ts_labels_t = tm.inputs(FLAGS.batch_size, train=False)
        # frames, labels = placeholder_inputs()
        frames, labels = tm.inputs(FLAGS.batch_size, NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = md.inference(frames)

        # Calculate loss.
        loss = md.loss(logits, labels)

        # calculate accuracy
        accuracy = md.accuracy(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=FLAGS.num_epochs)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        # run the training
        steps_per_epoch = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size)
        max_steps = FLAGS.num_epochs * steps_per_epoch
        losses_epochs = []
        losses_batches = []
        accuracies_epochs = []
        accuracies_batches = []
        for step in range(max_steps + 1):
            start_time = time.time()
            _, loss_value, acc_value = sess.run([train_op, loss, accuracy])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 100 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f, train_acc = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, acc_value,
                                    examples_per_sec, sec_per_batch))

                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                losses_batches.append(loss_value)
                accuracies_batches.append(acc_value)

            # Save the model checkpoint periodically.
            if (step - 1) % steps_per_epoch == 0 or (step + 1) == max_steps or _shutdown:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
                #accuracies_epochs.append(np.mean(accuracies_batches))
                #losses_epochs.append(np.mean(losses_batches))

                # save accuracy and loss
                np.save(os.path.join(FLAGS.train_dir, 'tr_loss'), np.array(losses_batches))
                np.save(os.path.join(FLAGS.train_dir, 'tr_accuracy'), np.array(accuracies_batches))
                print('Saving model: ', (step - 1) / steps_per_epoch)

            if _shutdown:
                break

        print('Listnr training finished!')
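# md.accuracy above is project code that is not shown here. A common way to
# define such an op (a sketch under that assumption, not necessarily the
# author's implementation) is the fraction of examples whose top prediction
# matches the integer label:
def accuracy_sketch(logits, labels):
    """Mean top-1 accuracy over a batch of logits and integer labels."""
    correct = tf.nn.in_top_k(logits, labels, 1)
    return tf.reduce_mean(tf.cast(correct, tf.float32))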
def run_training():
    """Train MNIST for a number of steps."""
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = tf_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = model.inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = model.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = model.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = model.evaluation(logits, labels_placeholder)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create a session for running Ops on the Graph.
        sess = tf.Session()

        # Run the Op to initialize the variables.
        init = tf.initialize_all_variables()
        sess.run(init)

        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, graph_def=sess.graph_def)

        # And then after everything is built, start the training loop.
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()

            # Fill a feed dictionary with the actual set of images and labels
            # for this particular training step.
            feed_dict = fill_feed_dict(data_sets.train, images_placeholder, labels_placeholder)

            # Run one step of the model. The return values are the activations
            # from the `train_op` (which is discarded) and the `loss` Op. To
            # inspect the values of your Ops or variables, you may include them
            # in the list passed to sess.run() and the value tensors will be
            # returned in the tuple from the call.
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            # Write the summaries and print an overview fairly often.
            if step % 100 == 0:
                # Print status to stdout.
                print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
                # Update the events file.
                summary_str = sess.run(summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)

            # Save a checkpoint and evaluate the model periodically.
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                saver.save(sess, FLAGS.train_dir, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.train)
                # Evaluate against the validation set.
                print('Validation Data Eval:')
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.validation)
                # Evaluate against the test set.
                print('Test Data Eval:')
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.test)
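# fill_feed_dict and do_eval above are helpers in the style of the TensorFlow
# MNIST tutorial; they are not shown in this snippet. A sketch consistent with
# that tutorial (details of the author's versions may differ):
def fill_feed_dict(data_set, images_pl, labels_pl):
    """Fill a feed_dict with the next batch from `data_set`."""
    images_feed, labels_feed = data_set.next_batch(FLAGS.batch_size, FLAGS.fake_data)
    return {images_pl: images_feed, labels_pl: labels_feed}

def do_eval(sess, eval_correct, images_pl, labels_pl, data_set):
    """Run one evaluation pass over `data_set` and print precision@1."""
    true_count = 0
    steps_per_epoch = data_set.num_examples // FLAGS.batch_size
    num_examples = steps_per_epoch * FLAGS.batch_size
    for _ in xrange(steps_per_epoch):
        feed_dict = fill_feed_dict(data_set, images_pl, labels_pl)
        true_count += sess.run(eval_correct, feed_dict=feed_dict)
    print('  Num examples: %d  Num correct: %d  Precision @ 1: %0.04f' %
          (num_examples, true_count, float(true_count) / num_examples))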
def train():
    '''
    Train
    '''
    with tf.Graph().as_default():
        # Global step counter
        global_step = tf.Variable(0, trainable=False)

        # NYU Dataset V2 original size(480 x 640 x 3) -> crop -> (460 x 620 x 3)
        image_input = ImageInput('./data/nyu_depth_v2_labeled.mat')
        print("the number of train data: %d" % (len(image_input.images)))

        images = tf.placeholder(tf.float32, [None, FLAGS.crop_size_height, FLAGS.crop_size_width, FLAGS.image_depth])
        depths = tf.placeholder(tf.float32, [None, 1, 55, 74])
        invalid_depths = tf.placeholder(tf.float32, [None, 1, 55, 74])
        keep_conv = tf.placeholder(tf.float32)
        keep_hidden = tf.placeholder(tf.float32)

        # Output of the graph
        if FLAGS.refine_train:
            print("refine train.")
            logits = model.inference_refine(images, keep_conv, keep_hidden)
        else:
            print("coarse train.")
            logits = model.inference(images, keep_conv, keep_hidden)

        # Loss, computed from the graph output and the labels
        loss = model.loss(logits, depths, invalid_depths)
        # Training op
        train_op = op.train(loss, global_step)
        # Summaries
        summary_op = tf.merge_all_summaries()
        # Initialization op
        init_op = tf.initialize_all_variables()

        # Session
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=LOG_DEVICE_PLACEMENT))
        # saver
        #saver = tf.train.Saver(tf.all_variables())
        sess.run(init_op)

        # Save coarse and refine parameters separately
        coarse_params = {}
        refine_params = {}
        if FLAGS.refine_train:
            for variable in tf.all_variables():
                variable_name = variable.name
                print("parameter: %s" % (variable_name))
                if variable_name.find("/") < 0 or variable_name.count("/") != 1:
                    print("ignore.")
                    continue
                scope, name = variable_name.split("/")
                target, _ = name.split(":")
                if variable_name.find('coarse') >= 0:
                    print("coarse parameter: %s" % (variable_name))
                    coarse_params[variable_name] = variable
                if variable_name.find('fine') >= 0:
                    print("refine parameter: %s" % (variable_name))
                    refine_params[variable_name] = variable
        else:
            for variable in tf.trainable_variables():
                variable_name = variable.name
                print("parameter: %s" % (variable_name))
                if variable_name.find("/") < 0 or variable_name.count("/") != 1:
                    print("ignore.")
                    continue
                scope, name = variable_name.split("/")
                target, _ = name.split(":")
                if variable_name.find('coarse') >= 0:
                    print("coarse parameter: %s" % (variable_name))
                    coarse_params[variable_name] = variable
                if variable_name.find('fine') >= 0:
                    print("refine parameter: %s" % (variable_name))
                    refine_params[variable_name] = variable

        # define saver
        saver_coarse = tf.train.Saver(coarse_params)
        saver_refine = tf.train.Saver(refine_params)

        # fine tune
        if FLAGS.fine_tune:
            # load coarse parameters
            coarse_ckpt = tf.train.get_checkpoint_state(COARSE_DIR)
            if coarse_ckpt and coarse_ckpt.model_checkpoint_path:
                print("Pretrained coarse Model Loading.")
                saver_coarse.restore(sess, coarse_ckpt.model_checkpoint_path)
                print("Pretrained coarse Model Restored.")
            else:
                print("No Pretrained coarse Model.")
            # load refine parameters
            refine_ckpt = tf.train.get_checkpoint_state(REFINE_DIR)
            if refine_ckpt and refine_ckpt.model_checkpoint_path:
                print("Pretrained refine Model Loading.")
                saver_refine.restore(sess, refine_ckpt.model_checkpoint_path)
                print("Pretrained refine Model Restored.")
            else:
                print("No Pretrained refine Model.")

        # TODO train coarse or refine (change trainable)
        #if not FLAGS.coarse_train:
        #    for val in coarse_params:
        #        print val
        #if not FLAGS.refine_train:
        #    for val in coarse_params:
        #        print val

        # train refine
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # debug
        # Set up the summary writer
        #summary_writer = tf.train.SummaryWriter(TRAIN_DIR, graph_def=sess.graph_def)
        #batches = image_input.get_batches(FLAGS.batch_size)
        #d = np.asarray(batches[0][0])
        #print d.shape
        #a = np.asarray(batches[0][1])
        #print a.shape
        #logits_val, logits_fine_val, loss_value = sess.run([logits, logits_fine, loss], feed_dict={images: batches[0][0], depths: batches[0][1], invalid_depths: batches[0][2], keep_conv: 1.0, keep_hidden: 1.0})
        #print len(logits_val[0])
        #print len(logits_fine_val[0])
        #print loss_value

        # Repeat training up to MAX_STEPS
        for step in xrange(MAX_STEPS):
            start_time = time.time()
            previous_time = start_time
            index = 0

            batches = image_input.get_batches(FLAGS.batch_size)
            vals = image_input.get_validation()
            for batch in batches:
                train = batch[0]
                depth = batch[1]
                ignore_depth = batch[2]
                _, loss_value = sess.run([train_op, loss], feed_dict={images: train, depths: depth, invalid_depths: ignore_depth, keep_conv: 0.8, keep_hidden: 0.5})
                if index % 10 == 0:
                    end_time = time.time()
                    duration = end_time - previous_time
                    num_examples_per_step = BATCH_SIZE * 10
                    examples_per_sec = num_examples_per_step / duration
                    print("%s: %d[epoch]: %d[iteration]: train loss %f: %d[examples/iteration]: %f[examples/sec]: %f[sec/iteration]" % (datetime.now(), step, index, loss_value, num_examples_per_step, examples_per_sec, duration))
                    assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

                if index % 50 == 0:
                    output_vec, cost_value = sess.run([logits, loss], feed_dict={images: vals[0], depths: vals[1], invalid_depths: vals[2], keep_conv: 1.0, keep_hidden: 1.0})
                    print("%s: %d[epoch]: %d[iteration]: validation loss: %f" % (datetime.now(), step, index, cost_value))
                    if index % 100 == 0:
                        output_dir = "predicts_%05d_%08d" % (step, index)
                        print("predicts output: %s" % output_dir)
                        data_feed_inputs_nyu.output_predict(output_vec, output_dir)

                previous_time = end_time
                index += 1

            # if index % 100 == 0:
            #     pass
            #     summary_str = sess.run(summary_op, feed_dict={images: train, labels: label, keep_conv: 0.8, keep_hidden: 0.5})
            #     # Write the summaries
            #     summary_writer.add_summary(summary_str, step)

            # if step % 5 == 0 or (step * 1) == MAX_STEPS:
            if FLAGS.refine_train:
                refine_checkpoint_path = REFINE_DIR + '/model.ckpt'
                saver_refine.save(sess, refine_checkpoint_path, global_step=step)
            else:
                coarse_checkpoint_path = COARSE_DIR + '/model.ckpt'
                saver_coarse.save(sess, coarse_checkpoint_path, global_step=step)

        coord.request_stop()
        coord.join(threads)
        sess.close()
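# model.loss above combines the predicted and ground-truth depth maps with the
# invalid_depths mask, but its body is not shown. A hedged sketch of one
# plausible form, a masked scale-invariant depth loss in the spirit of
# Eigen et al. (the author's model.loss may differ):
def masked_scale_invariant_loss_sketch(logits, depths, invalid_depths):
    """Scale-invariant L2 depth loss over valid pixels only (55x74 maps)."""
    logits_flat = tf.reshape(logits, [-1, 55 * 74])
    depths_flat = tf.reshape(depths, [-1, 55 * 74])
    mask_flat = tf.reshape(invalid_depths, [-1, 55 * 74])
    # Zero out contributions from masked (invalid) pixels.
    d = (logits_flat - depths_flat) * mask_flat
    n = 55.0 * 74.0
    sum_d = tf.reduce_sum(d, 1)
    sum_d2 = tf.reduce_sum(tf.square(d), 1)
    return tf.reduce_mean(sum_d2 / n - 0.5 * tf.square(sum_d) / tf.square(n))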
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.get_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits, fc1_w, fc2_w, fc1_b, fc2_b = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # L2 regularization for the fully connected parameters.
        regularizers = (tf.nn.l2_loss(fc1_w) + tf.nn.l2_loss(fc1_b) +
                        tf.nn.l2_loss(fc2_w) + tf.nn.l2_loss(fc2_b))

        # Add the regularization term to the loss.
        loss += 5e-4 * regularizers

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                       tf.train.NanTensorHook(loss),
                       _LoggerHook()],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement,
                    allow_soft_placement=True)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
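# cifar10.loss above is defined elsewhere in the project. In the standard
# CIFAR-10 tutorial it is the mean sparse softmax cross-entropy over the
# batch; a sketch consistent with that (the project's version may instead add
# the result to a losses collection):
def cifar10_loss_sketch(logits, labels):
    """Mean cross-entropy loss for integer class labels."""
    labels = tf.cast(labels, tf.int64)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits, name='cross_entropy_per_example')
    return tf.reduce_mean(cross_entropy, name='cross_entropy')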
def train():
    print "Building training graph ..."
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-FLAGS.init_scale, FLAGS.init_scale)
        with tf.variable_scope("char-rnn", initializer=initializer):
            keep_prob = tf.placeholder(dtype=tf.float32, shape=[], name='keep_prob')
            cell = model.build_cell(keep_prob)

            inputs = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size, FLAGS.num_steps], name='inputs')
            targets = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size, FLAGS.num_steps], name='targets')
            lr = tf.placeholder(dtype=tf.float32, shape=[], name='learning_rate')
            initial_state = tf.placeholder(dtype=tf.float32, shape=[FLAGS.batch_size, cell.state_size], name='initial_state')

            logits, final_state = model.predict(inputs, cell, initial_state, keep_prob)
            loss = model.loss(logits, targets)
            train_op = model.train_batch(loss, lr)

        # create saver and summary
        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()

        sess = tf.Session()
        sess.run(tf.initialize_all_variables())
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, graph_def=sess.graph_def)

        # load data
        print "Loading data ..."
        reader = text_input.TextReader(os.path.join(FLAGS.data_dir, FLAGS.data_file))
        reader.prepare_data()
        train_loader = text_input.DataLoader(os.path.join(FLAGS.data_dir, 'train.cPickle'), FLAGS.batch_size, FLAGS.num_steps)
        test_loader = text_input.DataLoader(os.path.join(FLAGS.data_dir, 'test.cPickle'), FLAGS.batch_size, FLAGS.num_steps)

        total_steps = FLAGS.num_epochs * train_loader.num_batch
        save_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        zero_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32).eval(session=sess)
        global_step = 0

        def eval(sess, loader, state):
            test_loss = 0.
            for _ in xrange(loader.num_batch):
                x_batch, y_batch = loader.next_batch()
                feed = {inputs: x_batch, targets: y_batch, keep_prob: 1., initial_state: state}
                state, loss_value = sess.run([final_state, loss], feed_dict=feed)
                test_loss += loss_value
            return test_loss / loader.num_batch

        # training
        for epoch in xrange(FLAGS.num_epochs):
            current_lr = FLAGS.init_lr * (FLAGS.lr_decay ** (max(epoch - FLAGS.decay_after + 1, 0)))
            state = zero_state
            training_loss = 0.
            for _ in xrange(train_loader.num_batch):
                global_step += 1
                start_time = time.time()
                x_batch, y_batch = train_loader.next_batch()
                feed = {inputs: x_batch, targets: y_batch, keep_prob: (1. - FLAGS.dropout), lr: current_lr, initial_state: state}
                state, loss_value, _ = sess.run([final_state, loss, train_op], feed_dict=feed)
                duration = time.time() - start_time
                training_loss += loss_value

                if global_step % FLAGS.log_steps == 0:
                    format_str = ('%s: step %d/%d (epoch %d/%d), loss = %.2f (%.3f sec/batch), lr: %.5f')
                    print(format_str % (datetime.now(), global_step, total_steps,
                                        epoch + 1, FLAGS.num_epochs, loss_value, duration, current_lr))

                if global_step % FLAGS.summary_steps == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, global_step)

            if epoch % FLAGS.save_epochs == 0:
                saver.save(sess, save_path, global_step)
            train_loader.reset_pointer()

            # epoch summary
            training_loss /= train_loader.num_batch
            summary_writer.add_summary(_summary_for_scalar('training_loss', training_loss), global_step)
            test_loss = eval(sess, test_loader, zero_state)
            test_loader.reset_pointer()
            summary_writer.add_summary(_summary_for_scalar('test_loss', test_loss), global_step)
            print("Epoch %d: training_loss = %.2f, test_loss = %.2f" % (epoch + 1, training_loss, test_loss))
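# _summary_for_scalar above builds a summary protobuf outside the graph so a
# Python float can be logged directly. A minimal version of that common idiom
# (the author's helper likely looks very similar):
def _summary_for_scalar(name, value):
    """Wrap a Python scalar in a tf.Summary proto for SummaryWriter.add_summary."""
    return tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=float(value))])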
def train():
    with tf.Graph().as_default():
        # Global step counter
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

        dataset = DataSet()

        # get trainsets
        print("The number of train images: %d" % dataset.cnt_samples(FLAGS.tfcsv))
        images, labels = dataset.csv_inputs(FLAGS.tfcsv, FLAGS.batch_size, distorted=True)
        images_debug = datasets.debug(images)

        # get testsets
        #test_cnt = dataset.cnt_samples(FLAGS.testcsv)
        test_cnt = 100
        #test_cnt = 5
        print("The number of test images: %d" % test_cnt)
        images_test, labels_test = dataset.test_inputs(FLAGS.testcsv, test_cnt)
        images_test_debug = datasets.debug(images_test)

        input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

        num_classes = FLAGS.num_classes
        restore_logits = not FLAGS.fine_tune

        # inference
        # logits is a tuple (logits, auxiliary_logits, predictions)
        # logits: output of final layer, auxiliary_logits: output of hidden layer, softmax: predictions
        logits = model.inference(images, num_classes, for_training=True, restore_logits=restore_logits)
        logits_test = model.inference(images_test, num_classes, for_training=False, restore_logits=restore_logits, reuse=True, dropout_keep_prob=1.0)

        # loss
        model.loss(logits, labels, batch_size=FLAGS.batch_size)
        model.loss_test(logits_test, labels_test, batch_size=test_cnt)
        losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
        losses_test = tf.get_collection(slim.losses.LOSSES_COLLECTION_TEST)

        # Calculate the total loss for the current tower.
        regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = tf.add_n(losses + regularization_losses, name='total_loss')
        #total_loss = tf.add_n(losses, name='total_loss')
        total_loss_test = tf.add_n(losses_test, name='total_loss_test')

        # Compute the moving average of all individual losses and the total loss.
        loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
        loss_averages_op = loss_averages.apply(losses + [total_loss])

        loss_averages_test = tf.train.ExponentialMovingAverage(0.9, name='avg_test')
        loss_averages_op_test = loss_averages_test.apply(losses_test + [total_loss_test])

        print "=" * 10
        print "loss length:"
        print len(losses)
        print len(losses_test)
        print "=" * 10

        # for l in losses + [total_loss]:
        #     # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        #     # session. This helps the clarity of presentation on TensorBoard.
        #     loss_name = re.sub('%s_[0-9]*/' % model.TOWER_NAME, '', l.op.name)
        #     # Name each loss as '(raw)' and name the moving average version of the loss
        #     # as the original loss name.
        #     tf.scalar_summary(loss_name + ' (raw)', l)
        #     tf.scalar_summary(loss_name, loss_averages.average(l))

        # loss used to calculate gradients
        # with tf.control_dependencies([loss_averages_op]):
        total_loss = tf.identity(total_loss)
        tf.scalar_summary("loss", total_loss)

        with tf.control_dependencies([loss_averages_op_test]):
            total_loss_test = tf.identity(total_loss_test)
        tf.scalar_summary("loss_eval", total_loss_test)

        # Reuse variables for the next tower.
        #tf.get_variable_scope().reuse_variables()

        # Retain the summaries from the final tower.
        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)

        # Retain the Batch Normalization updates operations only from the
        # final tower. Ideally, we should grab the updates from all towers
        # but these stats accumulate extremely fast so we can ignore the
        # other stats from the other towers without significant detriment.
        batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)

        # add input summaries
        # summaries.extend(input_summaries)

        # train_operation and operation summaries
        train_op = train_operation.train(total_loss, global_step, summaries, batchnorm_updates)

        # trainable variables's summary
        #for var in tf.trainable_variables():
        #    summaries.append(tf.histogram_summary(var.op.name, var))

        # saver
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        #summary_op = tf.merge_summary(summaries)
        summary_op = tf.merge_all_summaries()

        # initialization
        init = tf.initialize_all_variables()

        # session
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' % (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        summary_writer = tf.train.SummaryWriter(
            FLAGS.train_dir,
            graph_def=sess.graph.as_graph_def(add_shapes=True))

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, logits_eval, loss_value, labels_eval, images_debug_eval = sess.run([train_op, logits[0], total_loss, labels, images_debug])
            duration = time.time() - start_time

            dataset.output_images(images_debug_eval, "debug", "train")

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / float(duration)
                format_str = ('train %s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, examples_per_sec, duration))

            if step % 100 == 0:
                print("predict:")
                print type(logits_eval)
                print logits_eval.shape
                print logits_eval.argmax(1)
                print("target:")
                print labels_eval

                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                test_start_time = time.time()
                logits_test_eval, total_loss_test_val, labels_test_eval, images_test_debug_eval = sess.run([logits_test[0], total_loss_test, labels_test, images_test_debug])
                test_duration = time.time() - test_start_time

                dataset.output_images(images_test_debug_eval, "debug_test", "test")

                print("test predict:")
                print type(logits_test_eval)
                print logits_test_eval.shape
                print logits_test_eval.argmax(1)
                print("test target:")
                print labels_test_eval

                test_examples_per_sec = test_cnt / float(test_duration)
                format_str_test = ('test %s: step %d, loss = %.2f, (%.1f examples/sec; %.3f sec/batch)')
                print(format_str_test % (datetime.now(), step, total_loss_test_val, test_examples_per_sec, test_duration))

            # Save the model checkpoint periodically.
            if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

        coord.request_stop()
        coord.join(threads)
        sess.close()