def build_optimizer(global_step):
    """Build the CLI-specified optimizer, log the learning rate and enable
    learning rate decay if specified.
    Args:
        global_step: integer tensor, the current training step
    Returns:
        optimizer: initialized tf.Optimizer object
    """
    # Extract the initial learning rate
    initial_lr = float(ARGS.optimizer_args['learning_rate'])

    if ARGS.lr_decay:
        # Decay the learning rate exponentially based on the number of steps.
        steps_per_decay = STEPS_PER_EPOCH * ARGS.lr_decay_epochs
        learning_rate = tf.train.exponential_decay(initial_lr,
                                                   global_step,
                                                   steps_per_decay,
                                                   ARGS.lr_decay_factor,
                                                   staircase=True)
        # Update the learning rate parameter of the optimizer
        ARGS.optimizer_args['learning_rate'] = learning_rate
    else:
        learning_rate = tf.constant(initial_lr)

    # Log the learning rate
    tf_log(tf.summary.scalar('learning_rate', learning_rate))

    # Instantiate the optimizer
    optimizer = getattr(tf.train, ARGS.optimizer)(**ARGS.optimizer_args)
    return optimizer
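# Usage sketch (illustrative only; the flag values below are hypothetical,
# not taken from the real CLI parser). With ARGS populated as
#
#   ARGS.optimizer = 'MomentumOptimizer'
#   ARGS.optimizer_args = {'learning_rate': 1e-2, 'momentum': 0.9}
#
# build_optimizer resolves tf.train.MomentumOptimizer via getattr and
# instantiates it with the (possibly decayed) learning rate:
#
#   global_step = tf.Variable(0, trainable=False, name='global_step')
#   optimizer = build_optimizer(global_step)
#   train_op = optimizer.minimize(loss, global_step=global_step)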
def log_io(inputs, outputs=None):
    """Log a batch of input and output images.
    Args:
        inputs: tensor with shape [batch_size, height, width, depth]
        outputs: if present, must have the same shape as inputs
    """
    with tf.variable_scope('visualization'):
        grid_side = math.floor(math.sqrt(ARGS.batch_size))
        inputs = put_kernels_on_grid(
            tf.transpose(inputs, perm=(1, 2, 3, 0))[:, :, :, 0:grid_side**2],
            grid_side)

        if outputs is None:
            tf_log(tf.summary.image('inputs', inputs, max_outputs=1))
            return

        inputs = tf.pad(inputs, [[0, 0], [0, 0], [0, 10], [0, 0]])
        outputs = put_kernels_on_grid(
            tf.transpose(outputs, perm=(1, 2, 3, 0))[:, :, :, 0:grid_side**2],
            grid_side)
        tf_log(
            tf.summary.image('input_output',
                             tf.concat([inputs, outputs], axis=2),
                             max_outputs=1))
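# Shape walk-through (illustrative, assuming NHWC inputs and batch_size = 64):
# a [64, h, w, c] batch is transposed to [h, w, c, 64], only the first
# grid_side**2 = 8**2 = 64 samples are kept, and put_kernels_on_grid tiles
# them into a single 8x8 mosaic that tf.summary.image logs as one picture.
# The 10-pixel tf.pad acts as a visual separator between the input grid and
# the output grid when both are concatenated side by side along the width.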
def train():
    """Train the model.
    Returns:
        best validation error. Saves the best model.
    """
    best_validation_error_value = float('inf')

    with tf.Graph().as_default(), tf.device(TRAIN_DEVICE):
        global_step = tf.Variable(0, trainable=False, name="global_step")

        # Get images and labels for CIFAR-10.
        images, _ = DATASET.distorted_inputs(BATCH_SIZE)

        # Build a Graph that computes the reconstruction predictions from the
        # inference model.
        is_training_, reconstructions = MODEL.get(images,
                                                  train_phase=True,
                                                  l2_penalty=L2_PENALTY)

        # Display original images next to reconstructed images.
        with tf.variable_scope("visualization"):
            grid_side = math.floor(math.sqrt(BATCH_SIZE))
            inputs = put_kernels_on_grid(
                tf.transpose(images,
                             perm=(1, 2, 3, 0))[:, :, :, 0:grid_side**2],
                grid_side)
            outputs = put_kernels_on_grid(
                tf.transpose(reconstructions,
                             perm=(1, 2, 3, 0))[:, :, :, 0:grid_side**2],
                grid_side)
            tf_log(
                tf.summary.image('input_output',
                                 tf.concat([inputs, outputs], axis=2),
                                 max_outputs=1))

        # Calculate loss.
        loss = MODEL.loss(reconstructions, images)

        # Reconstruction error placeholder and summary.
        error_ = tf.placeholder(tf.float32, shape=())
        error = tf.summary.scalar('error', error_)

        if LR_DECAY:
            # Decay the learning rate exponentially based on the number of steps.
            learning_rate = tf.train.exponential_decay(INITIAL_LR,
                                                       global_step,
                                                       STEPS_PER_DECAY,
                                                       LR_DECAY_FACTOR,
                                                       staircase=True)
        else:
            learning_rate = tf.constant(INITIAL_LR)
        tf_log(tf.summary.scalar('learning_rate', learning_rate))
        train_op = OPTIMIZER.minimize(loss, global_step=global_step)

        # Create the train saver.
        variables = variables_to_save([global_step])
        train_saver = tf.train.Saver(variables, max_to_keep=2)
        # Create the best model saver.
        best_saver = tf.train.Saver(variables, max_to_keep=1)

        # Read the collection after every op has added its own
        # summaries to the train_summaries collection.
        train_summaries = tf.summary.merge(
            tf.get_collection_ref(MODEL_SUMMARIES))

        # Build an initialization operation to run below.
        init = tf.variables_initializer(tf.global_variables() +
                                        tf.local_variables())

        # Start running operations on the Graph.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            sess.run(init)

            # Start the queue runners with a coordinator
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            if not RESTART:  # continue from the saved checkpoint
                # restore previous session if exists
                checkpoint = tf.train.latest_checkpoint(LOG_DIR)
                if checkpoint:
                    train_saver.restore(sess, checkpoint)
                else:
                    print("[I] Unable to restore from checkpoint")

            train_log = tf.summary.FileWriter(
                os.path.join(LOG_DIR, str(InputType.train)),
                graph=sess.graph)
            validation_log = tf.summary.FileWriter(
                os.path.join(LOG_DIR, str(InputType.validation)),
                graph=sess.graph)

            # Extract previous global step value
            old_gs = sess.run(global_step)

            # Restart from where we were
            for step in range(old_gs, MAX_STEPS):
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss],
                                         feed_dict={is_training_: True})
                duration = time.time() - start_time

                if np.isnan(loss_value):
                    print('Model diverged with loss = NaN')
                    break

                # update logs every 10 iterations
                if step % 10 == 0:
                    num_examples_per_step = BATCH_SIZE
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = ('{}: step {}, loss = {:.2f} '
                                  '({:.1f} examples/sec; {:.3f} sec/batch)')
                    print(
                        format_str.format(datetime.now(), step, loss_value,
                                          examples_per_sec, sec_per_batch))

                    # log train error and summaries
                    train_error_summary_line, train_summary_line = sess.run(
                        [error, train_summaries],
                        feed_dict={
                            error_: loss_value,
                            is_training_: True
                        })
                    train_log.add_summary(train_error_summary_line,
                                          global_step=step)
                    train_log.add_summary(train_summary_line,
                                          global_step=step)

                # Save the model checkpoint at the end of every epoch
                # evaluate train and validation performance
                if (step > 0 and step % STEPS_PER_EPOCH == 0) or (
                        step + 1) == MAX_STEPS:
                    checkpoint_path = os.path.join(LOG_DIR, 'model.ckpt')
                    train_saver.save(sess, checkpoint_path, global_step=step)

                    # validation error
                    validation_error_value = evaluate.error(
                        LOG_DIR,
                        MODEL,
                        DATASET,
                        InputType.validation,
                        device=EVAL_DEVICE)

                    summary_line = sess.run(
                        error, feed_dict={error_: validation_error_value})
                    validation_log.add_summary(summary_line, global_step=step)

                    print('{} ({}): train error = {} validation error = {}'.
                          format(datetime.now(), int(step / STEPS_PER_EPOCH),
                                 loss_value, validation_error_value))

                    if validation_error_value < best_validation_error_value:
                        best_validation_error_value = validation_error_value
                        best_saver.save(sess,
                                        os.path.join(BEST_MODEL_DIR,
                                                     'model.ckpt'),
                                        global_step=step)
            # end of for

            validation_log.close()
            train_log.close()

            # When done, ask the threads to stop.
            coord.request_stop()
            # Wait for threads to finish.
            coord.join(threads)

    return best_validation_error_value
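# Note on outputs (illustrative summary of the flow above): periodic
# checkpoints are written to LOG_DIR/model.ckpt-<step>, the checkpoint with
# the lowest validation error so far is saved to BEST_MODEL_DIR, and the
# function returns that best validation error once MAX_STEPS is reached or
# the loss diverges to NaN.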
def classifier():
    """Trains the classifier and saves the best model, i.e. the model with
    the highest validation accuracy.
    """
    best_va = 0.0

    with tf.Graph().as_default(), tf.device(ARGS.train_device):
        global_step = tf.Variable(0, trainable=False, name='global_step')

        # Get images and labels
        with tf.device('/cpu:0'):
            images, labels = DATASET.distorted_inputs(ARGS.batch_size)
        log_io(images)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        is_training_, logits = MODEL.get(images,
                                         DATASET.num_classes,
                                         train_phase=True,
                                         l2_penalty=ARGS.l2_penalty)

        # Calculate loss.
        loss = MODEL.loss(logits, labels)
        tf_log(tf.summary.scalar('loss', loss))

        # Create optimizer and log learning rate
        optimizer = build_optimizer(global_step)
        train_op = optimizer.minimize(loss,
                                      global_step=global_step,
                                      var_list=variables_to_train(
                                          ARGS.trainable_scopes))

        train_accuracy = metrics.accuracy_op(logits, labels)

        # General validation summary
        accuracy_value_ = tf.placeholder(tf.float32, shape=())
        accuracy_summary = tf.summary.scalar('accuracy', accuracy_value_)

        # Read the collection after every op has added its own
        # summaries to the train_summaries collection.
        train_summaries = tf.summary.merge(
            tf.get_collection_ref(MODEL_SUMMARIES))

        # Build an initialization operation to run below.
        init = [
            tf.variables_initializer(tf.global_variables() +
                                     tf.local_variables()),
            tf.tables_initializer()
        ]

        # Start running operations on the Graph.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            sess.run(init)

            # Start the queue runners with a coordinator
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            # Create the savers.
            train_saver, best_saver = build_train_savers([global_step])
            restore_or_restart(sess, global_step)
            train_log, validation_log = build_loggers(sess.graph)

            # Extract previous global step value
            old_gs = sess.run(global_step)

            # Restart from where we were
            for step in range(old_gs, MAX_STEPS):
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss],
                                         feed_dict={is_training_: True})
                duration = time.time() - start_time

                if np.isnan(loss_value):
                    print('Model diverged with loss = NaN')
                    break

                # update logs every 10 iterations
                if step % STEPS_PER_LOG == 0:
                    examples_per_sec = ARGS.batch_size / duration
                    sec_per_batch = float(duration)

                    format_str = ('{}: step {}, loss = {:.4f} '
                                  '({:.1f} examples/sec; {:.3f} sec/batch)')
                    print(
                        format_str.format(datetime.now(), step, loss_value,
                                          examples_per_sec, sec_per_batch))

                    # log train values
                    summary_lines = sess.run(train_summaries,
                                             feed_dict={is_training_: True})
                    train_log.add_summary(summary_lines, global_step=step)

                # Save the model checkpoint at the end of every epoch
                # evaluate train and validation performance
                if (step > 0 and step % STEPS_PER_EPOCH == 0) or (
                        step + 1) == MAX_STEPS:
                    checkpoint_path = os.path.join(LOG_DIR, 'model.ckpt')
                    train_saver.save(sess, checkpoint_path, global_step=step)

                    # validation accuracy
                    va_value = eval_model(LOG_DIR, InputType.validation)
                    summary_line = sess.run(
                        accuracy_summary,
                        feed_dict={accuracy_value_: va_value})
                    validation_log.add_summary(summary_line, global_step=step)

                    # train accuracy
                    ta_value = sess.run(train_accuracy,
                                        feed_dict={is_training_: False})
                    summary_line = sess.run(
                        accuracy_summary,
                        feed_dict={accuracy_value_: ta_value})
                    train_log.add_summary(summary_line, global_step=step)

                    print(
                        '{} ({}): train accuracy = {:.3f} validation accuracy = {:.3f}'
                        .format(datetime.now(), int(step / STEPS_PER_EPOCH),
                                ta_value, va_value))

                    # save best model
                    if va_value > best_va:
                        best_va = va_value
                        best_saver.save(sess,
                                        os.path.join(BEST_MODEL_DIR,
                                                     'model.ckpt'),
                                        global_step=step)
            # end of for

            validation_log.close()
            train_log.close()

            # When done, ask the threads to stop.
            coord.request_stop()
            # Wait for threads to finish.
            coord.join(threads)
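# Fine-tuning note (illustrative): var_list=variables_to_train(
# ARGS.trainable_scopes) restricts the minimize() update to the variables
# selected by the --trainable_scopes flag; the exact flag format depends on
# the CLI parser, but a hypothetical
#
#   --trainable_scopes=softmax_linear
#
# would train only the variables defined under that scope while keeping the
# rest of the network frozen, which is the usual transfer-learning setup.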
def detector():
    """Trains the detector and saves the best model, i.e. the model with
    the highest IoU.
    """
    best_iou = 0.0

    with tf.Graph().as_default(), tf.device(ARGS.train_device):
        global_step = tf.Variable(0, trainable=False, name='global_step')

        with tf.device('/cpu:0'):
            images, ground_truth = DATASET.distorted_inputs(ARGS.batch_size)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        # predictions has shape:
        #   [batch_size, n, m, num_bboxes, 4 + num_classes]
        # 4 = coords
        # n & m = 1 when training and the input has the shape expected by
        # the network.
        is_training_, predictions = MODEL.get(images,
                                              DATASET.num_classes,
                                              train_phase=True,
                                              l2_penalty=ARGS.l2_penalty)

        # Calculate loss.
        loss = MODEL.loss(predictions, ground_truth)
        tf_log(tf.summary.scalar('loss', loss))

        # Reshape predictions in order to be useful in training.
        predictions = tf.squeeze(predictions, axis=[1, 2])
        angle = predictions[:, :1]
        logits = predictions[:, 1:]

        # Reshape ground truth in order to be useful in training.
        ground_truth = tf.squeeze(ground_truth, axis=[1])
        real_angle = ground_truth[:, :1]
        labels = tf.cast(ground_truth[:, 1], tf.int32)
        # add dimension to real angle, in order to get a tensor with shape:
        # [batch_size, 1=num_bboxes, 1=angle]

        log_io(images)

        # Create optimizer and log learning rate
        optimizer = build_optimizer(global_step)
        train_op = optimizer.minimize(loss,
                                      global_step=global_step,
                                      var_list=variables_to_train(
                                          ARGS.trainable_scopes))

        #iou_value_ = tf.placeholder(tf.float32, shape=())
        #iou_summary = tf.summary.scalar('iou', iou_value_)

        # Train accuracy op
        train_accuracy = metrics.accuracy_op(logits, labels)

        # General validation summary
        accuracy_value_ = tf.placeholder(tf.float32, shape=())
        accuracy_summary = tf.summary.scalar('accuracy', accuracy_value_)

        with tf.variable_scope("angle_distance"):
            # Minimal angular difference, in [0, 180] degrees.
            angle_distance = tf.reduce_mean(
                180. - tf.abs(
                    tf.mod(tf.abs(real_angle - angle), 360.) - 180.))
            tf_log(tf.summary.scalar('angle_distance', angle_distance))

        # Read the collection after every op has added its own
        # summaries to the train_summaries collection.
        train_summaries = tf.summary.merge(
            tf.get_collection_ref(MODEL_SUMMARIES))

        # Build an initialization operation to run below.
        init = [
            tf.variables_initializer(tf.global_variables() +
                                     tf.local_variables()),
            tf.tables_initializer()
        ]

        # Start running operations on the Graph.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            sess.run(init)

            # Start the queue runners with a coordinator
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            restore_or_restart(sess, global_step)

            # Create the savers.
            train_saver, best_saver = build_train_savers([global_step])
            train_log, validation_log = build_loggers(sess.graph)

            # Extract previous global step value
            old_gs = sess.run(global_step)

            # Restart from where we were
            for step in range(old_gs, MAX_STEPS):
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss],
                                         feed_dict={is_training_: True})
                duration = time.time() - start_time

                if np.isnan(loss_value):
                    print('Model diverged with loss = NaN')
                    break

                # update logs every 10 iterations
                if step % STEPS_PER_LOG == 0:
                    examples_per_sec = ARGS.batch_size / duration
                    sec_per_batch = float(duration)

                    format_str = ('{}: step {}, loss = {:.4f} '
                                  '({:.1f} examples/sec; {:.3f} sec/batch)')
                    print(
                        format_str.format(datetime.now(), step, loss_value,
                                          examples_per_sec, sec_per_batch))

                    # log train values
                    summary_lines = sess.run(train_summaries,
                                             feed_dict={is_training_: True})
                    train_log.add_summary(summary_lines, global_step=step)

                # Save the model checkpoint at the end of every epoch
                # evaluate train and validation performance
                if (step > 0 and step % STEPS_PER_EPOCH == 0) or (
                        step + 1) == MAX_STEPS:
                    checkpoint_path = os.path.join(LOG_DIR, 'model.ckpt')
                    train_saver.save(sess, checkpoint_path, global_step=step)

                    # train metrics
                    ta_value = sess.run(train_accuracy,
                                        feed_dict={is_training_: False})
                    summary_line = sess.run(
                        accuracy_summary,
                        feed_dict={accuracy_value_: ta_value})
                    train_log.add_summary(summary_line, global_step=step)

                    # TODO: validation metrics
                    print('{} ({}): train acc: {:.3f}'.format(
                        datetime.now(), int(step / STEPS_PER_EPOCH),
                        ta_value))

                    # TODO: save best model
                    #if validation_iou_value > best_iou:
                    #    best_iou = validation_iou_value
                    #    best_saver.save(
                    #        sess,
                    #        os.path.join(BEST_MODEL_DIR, 'model.ckpt'),
                    #        global_step=step)
            # end of for

            validation_log.close()
            train_log.close()

            # When done, ask the threads to stop.
            coord.request_stop()
            # Wait for threads to finish.
            coord.join(threads)
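# Worked check of the angle_distance summary above (illustrative): for
# real_angle = 350 and angle = 10, |350 - 10| mod 360 = 340 and
# 180 - |340 - 180| = 20, i.e. the metric reports the 20 degree gap across
# the 0/360 wrap-around instead of the naive 340 degree difference.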