def generate_output(mode):
    """
    Run inference on 'test.jpg' with the trained model and display the result.

    Loads the class-ID -> sign-name mapping from 'signnames.csv', restores the
    checkpoint at 'model/model.ckpt', halves the test image in each dimension,
    runs run_inference on it, prints the output shape and the elapsed time
    (resize + inference), and passes the result to show().

    Args:
        mode: Forwarded unchanged to run_inference.

    Raises:
        IOError: If 'test.jpg' cannot be read by OpenCV.
    """
    # First, load mapping from integer class ID to sign name string
    sign_map = {}
    with open('signnames.csv', 'r') as f:
        for line in f:
            # Strip only the newline: slicing off the last character would
            # truncate the name when the final line has no trailing newline.
            line = line.rstrip('\n')
            if not line:
                continue  # tolerate blank lines (e.g. at end of file)
            # Split on the first comma only, so the name keeps any later commas
            sign_id, sign_name = line.split(',', 1)
            sign_map[int(sign_id)] = sign_name
    sign_map[0] = 'background'  # class ID 0 reserved for background class

    # Launch the graph
    path = 'model/model.ckpt'
    with tf.Graph().as_default(), tf.Session() as sess:
        # "Instantiate" neural network, get relevant tensors
        model = SSDModel()

        # Load trained model
        saver = tf.train.Saver()
        print('Restoring previously trained model at %s' % path)
        saver.restore(sess, path)

        image_orig = cv2.imread('test.jpg', cv2.IMREAD_COLOR)
        if image_orig is None:
            # cv2.imread signals failure by returning None instead of raising
            raise IOError('Cannot read image: test.jpg')

        t = time.time()
        height, width = image_orig.shape[:2]
        # cv2.resize takes the target size as (width, height)
        image_orig = cv2.resize(image_orig, (width // 2, height // 2))
        image = run_inference(image_orig, model, sess, mode, sign_map)
        print(image.shape)
        print(time.time() - t)
        show(image)
def run_training():
    """
    Train the SSD model and record train/validation loss for every epoch.

    Reads the pickled data from 'data_prep_<IMG_W>x<IMG_H>.p', splits it into
    train/validation subsets with train_test_split, runs NUM_EPOCH epochs and,
    when SAVE_MODEL is set, writes the checkpoint to MODEL_SAVE_PATH and the
    loss history to 'loss_history.p'. With RESUME set, the model and the
    previous loss history are restored first.

    Returns:
        Tuple (test_loss, loss_history). test_loss is a 0.0 placeholder (no
        test set is evaluated yet); loss_history is a list of
        (train_loss, valid_loss) tuples, one per epoch.
    """
    # Load the prepared training data
    data_path = 'data_prep_%sx%s.p' % (IMG_W, IMG_H)
    with open(data_path, mode='rb') as data_file:
        train = pickle.load(data_file)

    # Format the data: dict keys are the inputs, the per-key entries hold labels
    image_files = list(train.keys())
    X_train = np.array(image_files)
    y_train_conf = np.array([train[name]['y_true_conf'] for name in image_files])
    y_train_loc = np.array([train[name]['y_true_loc'] for name in image_files])

    # Train/validation split
    split = train_test_split(
        X_train, y_train_conf, y_train_loc,
        test_size=VALIDATION_SIZE, random_state=1)
    X_train, X_valid, y_train_conf, y_valid_conf, y_train_loc, y_valid_loc = split

    # Launch the graph
    with tf.Graph().as_default(), tf.Session() as sess:
        # "Instantiate" neural network, get relevant tensors
        model = SSDModel()
        x = model['x']
        y_true_conf = model['y_true_conf']
        y_true_loc = model['y_true_loc']
        conf_loss_mask = model['conf_loss_mask']
        is_training = model['is_training']
        optimizer = model['optimizer']
        reported_loss = model['loss']

        # TF saver to save/restore trained model
        saver = tf.train.Saver()

        if not RESUME:
            print('Training model from scratch')
            sess.run(tf.global_variables_initializer())
            # One (train_loss, valid_loss) tuple is appended per epoch
            loss_history = []
        else:
            print('Restoring previously trained model at %s' % MODEL_SAVE_PATH)
            saver.restore(sess, MODEL_SAVE_PATH)
            with open('loss_history.p', 'rb') as history_file:
                loss_history = pickle.load(history_file)

        def epoch_loss(inputs, conf_labels, loc_labels, training):
            """Run one pass over the given data and return the mean batch loss."""
            batches = next_batch(inputs, conf_labels, loc_labels, BATCH_SIZE)
            num_batches = math.ceil(inputs.shape[0] / BATCH_SIZE)
            batch_losses = []
            for _ in range(num_batches):
                images, conf_batch, loc_batch, mask_batch = next(batches)
                feed = {
                    x: images,
                    y_true_conf: conf_batch,
                    y_true_loc: loc_batch,
                    conf_loss_mask: mask_batch,
                    is_training: training,
                }
                if training:
                    # Gradient update (i.e. training step) on the current batch
                    _, batch_loss = sess.run([optimizer, reported_loss], feed_dict=feed)
                else:
                    # Forward pass only
                    batch_loss = sess.run(reported_loss, feed_dict=feed)
                batch_losses.append(batch_loss)  # TODO: Need mAP metric instead of raw loss
            # Rough per-epoch estimate: every batch weighs the same, so a
            # smaller last batch is overweighted
            return np.mean(batch_losses)

        # Record time elapsed for performance check
        last_time = time.time()
        train_start_time = time.time()

        for epoch_number in range(1, NUM_EPOCH + 1):
            train_loss = epoch_loss(X_train, y_train_conf, y_train_loc, True)
            valid_loss = epoch_loss(X_valid, y_valid_conf, y_valid_loc, False)
            loss_history.append((train_loss, valid_loss))

            elapsed = time.time() - last_time
            print('Epoch %d -- Train loss: %.4f, Validation loss: %.4f, Elapsed time: %.2f sec' %
                  (epoch_number, train_loss, valid_loss, elapsed))
            last_time = time.time()

        total_time = time.time() - train_start_time
        minutes, seconds = divmod(total_time, 60)
        print('Total elapsed time: %d min %d sec' % (minutes, seconds))

        test_loss = 0.  # TODO: Add test set

        if SAVE_MODEL:
            # Save model to disk
            save_path = saver.save(sess, MODEL_SAVE_PATH)
            print('Trained model saved at: %s' % save_path)

            # Also save the loss history
            print('Loss history saved at loss_history.p')
            with open('loss_history.p', 'wb') as history_file:
                pickle.dump(loss_history, history_file)

    # Return the placeholder test loss and the per-epoch loss history
    return test_loss, loss_history
y_valid_loc.append(ys_valid_loc) y_test_loc.append(ys_test_loc) X_train = np.squeeze(np.array(X_train),axis=0) X_valid = np.squeeze(np.array(X_valid),axis=0) X_test = np.squeeze(np.array(X_test),axis=0) y_train_conf = np.squeeze(np.array(y_train_conf),axis=0) y_valid_conf = np.squeeze(np.array(y_valid_conf),axis=0) y_test_conf = np.squeeze(np.array(y_test_conf),axis=0) y_train_loc = np.squeeze(np.array(y_train_loc),axis=0) y_valid_loc = np.squeeze(np.array(y_valid_loc),axis=0) y_test_loc = np.squeeze(np.array(y_test_loc),axis=0) # Launch the graph with tf.Graph().as_default(), tf.Session() as sess: # "Instantiate" neural network, get relevant tensors model = SSDModel() x = model['x'] y_true_conf = model['y_true_conf'] y_true_loc = model['y_true_loc'] conf_loss_mask = model['conf_loss_mask'] is_training = model['is_training'] optimizer = model['optimizer'] reported_loss = model['loss'] num_pos = model['num_pos'] # Training process # TF saver to save/restore trained model saver = tf.train.Saver() if RESUME: print('Restoring previously trained model at %s' % MODEL_SAVE_PATH)
if top is not None and len(res) > 2 * top: res = cut_top(res) return cut_top(res) def restore_rects(self, tensors, threshold=None, top=None): #print len(tensors) #print tensors.shape def cut_top(res): res = sorted(res, reverse=True, key=lambda val: val[0]) if top is not None: res = res[:top] return res lr, tb, cls = tensors result = self._restore_rects(lr, tb, cls, self.model.num_poolings, threshold, top) if self.verbose: print result result = cut_top(result) return tuple(r[0] for r in result), tuple(r[1] for r in result) if __name__ == '__main__': ssd_model = SSDModel()
def generate_output(input_files, mode):
    """
    Generate annotated images, videos, or sample images, based on mode.

    Args:
        input_files: Iterable of file paths to process in 'image' and 'video'
            modes. Not used in 'demo' mode, which reads every file in
            'sample_images/' instead.
        mode: One of 'image', 'video' or 'demo'. Also forwarded to
            run_inference.

    Returns:
        None. In 'image'/'video' mode results are written to
        './inference_out/'; in 'demo' mode each result is shown with
        matplotlib. Returns early (after printing an error) if the output
        directory cannot be created because the path is taken by a non-directory.

    Raises:
        ValueError: If mode is not one of the supported values. Raised before
            any file or model is loaded.
    """
    # Fail fast: reject a bad mode before paying for graph build + restore
    if mode not in ('image', 'video', 'demo'):
        raise ValueError('Invalid mode: %s' % mode)

    # First, load mapping from integer class ID to sign name string
    sign_map = {}
    with open('signnames.csv', 'r') as f:
        for line in f:
            # Strip only the newline: slicing off the last character would
            # truncate the name when the final line has no trailing newline.
            line = line.rstrip('\n')
            if not line:
                continue  # tolerate blank lines (e.g. at end of file)
            # Split on the first comma only, so the name keeps any later commas
            sign_id, sign_name = line.split(',', 1)
            sign_map[int(sign_id)] = sign_name
    sign_map[0] = 'background'  # class ID 0 reserved for background class
    logging.info(sign_map)

    # Create output directory 'inference_out/' if needed
    if mode == 'image' or mode == 'video':
        try:
            os.mkdir('./inference_out')
        except FileExistsError:
            # Already being a directory is fine; anything else at that path is not
            if not os.path.isdir('./inference_out'):
                print('Error: Cannot mkdir ./inference_out')
                return

    # Launch the graph
    with tf.Graph().as_default(), tf.Session() as sess:
        # "Instantiate" neural network, get relevant tensors
        model = SSDModel()

        # Load trained model
        saver = tf.train.Saver()
        logging.critical('开始加载已训练模型 %s' % MODEL_SAVE_PATH)
        saver.restore(sess, MODEL_SAVE_PATH)

        if mode == 'image':
            for image_file in input_files:
                print('Running inference on %s' % image_file)
                image_orig = np.asarray(Image.open(image_file))
                image = run_inference(image_orig, model, sess, mode, sign_map)

                # Keep only the file name so the output lands in inference_out/
                plt.imsave('./inference_out/%s' % os.path.basename(image_file), image)
            print('输出文件保存至 inference_out/ 目录')

        elif mode == 'video':
            for video_file in input_files:
                print('Running inference on %s' % video_file)
                video = VideoFileClip(video_file)
                # Apply inference to every frame of the clip
                video = video.fl_image(
                    lambda x: run_inference(x, model, sess, mode, sign_map))

                video.write_videofile(
                    './inference_out/%s' % os.path.basename(video_file), audio=False)
            print('Output saved in inference_out/')

        elif mode == 'demo':
            print('Demo mode: Running inference on images in sample_images/')
            image_files = os.listdir('sample_images/')
            print("-" * 30)

            for image_file in image_files:
                print('Running inference on sample_images/%s' % image_file)
                # NOTE(review): unlike 'image' mode this passes the PIL image
                # itself, not an ndarray - confirm run_inference accepts both.
                image_orig = Image.open('sample_images/' + image_file)
                image = run_inference(image_orig, model, sess, mode, sign_map)
                plt.imshow(image)
                plt.show()
                print("-" * 30)
def main(async_executor=None):
    """Run the MLPerf SSD benchmark: initialize, dry-run, then train and/or validate.

    The stages, in order: MLPerf/horovod/AMP/logger setup, argument validation,
    network and data pipeline construction (depending on ``args.mode``), dry
    runs on dummy data, parameter (re)initialization and loading, and finally
    ``model.train_val`` on the real iterators.

    Args:
        async_executor: Executor handed to SSDModel for asynchronous
            validation. Must not be None when ``args.async_val`` is set;
            otherwise it is forced to None.
    """
    # Setup MLPerf logger
    mllog.config()
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
    # Start MLPerf benchmark
    log_start(key=mlperf_constants.INIT_START, uniq=False)

    # Parse args
    args = parse_args()

    ############################################################################
    # Initialize various libraries (horovod, logger, amp ...)
    ############################################################################
    # Initialize async executor
    if args.async_val:
        assert async_executor is not None, 'Please use ssd_main_async.py to launch with async support'
    else:
        # (Force) disable async validation
        async_executor = None

    # Initialize horovod
    hvd.init()

    # Initialize AMP
    if args.precision == 'amp':
        amp.init(layout_optimization=True)

    # Set MXNET_SAFE_ACCUMULATION=1 if necessary
    if args.precision == 'fp16':
        os.environ["MXNET_SAFE_ACCUMULATION"] = "1"

    # Results folder
    network_name = f'ssd_{args.backbone}_{args.data_layout}_{args.dataset}_{args.data_shape}'
    save_prefix = None
    if args.results:
        save_prefix = os.path.join(args.results, network_name)
    else:
        logging.info(
            "No results folder was provided. The script will not write logs or save weight to disk"
        )

    # Initialize logger
    log_file = None
    if args.results:
        log_file = f'{save_prefix}_{args.mode}_{hvd.rank()}.log'
    setup_logger(level=args.log_level if hvd.local_rank() in args.log_local_ranks else 'CRITICAL',
                 log_file=log_file)

    # Set seed
    args.seed = set_seed_distributed(args.seed)
    ############################################################################

    ############################################################################
    # Validate arguments and print some useful information
    ############################################################################
    logging.info(args)

    assert not (args.resume_from and args.pretrained_backbone), (
        "--resume-from and --pretrained_backbone are "
        "mutually exclusive.")
    assert args.data_shape == 300, "only data_shape=300 is supported at the moment."
    assert args.input_batch_multiplier >= 1, "input_batch_multiplier must be >= 1"
    assert not (hvd.size() == 1 and args.gradient_predivide_factor > 1), (
        "Gradient predivide factor is not supported "
        "with a single GPU")
    if args.data_layout == 'NCHW' or args.precision == 'fp32':
        assert args.bn_group == 1, "Group batch norm doesn't support FP32 data format or NCHW data layout."
        if not args.no_fuse_bn_relu:
            logging.warning((
                "WARNING: fused batch norm relu is only supported with NHWC layout. "
                "A non fused version will be forced."))
            args.no_fuse_bn_relu = True
        if not args.no_fuse_bn_add_relu:
            logging.warning((
                "WARNING: fused batch norm add relu is only supported with NHWC layout. "
                "A non fused version will be forced."))
            args.no_fuse_bn_add_relu = True
    if args.profile_no_horovod and hvd.size() > 1:
        logging.warning(
            "WARNING: hvd.size() > 1, so must IGNORE requested --profile-no-horovod"
        )
        args.profile_no_horovod = False

    logging.info(f'Seed: {args.seed}')
    logging.info(f'precision: {args.precision}')
    if args.precision == 'fp16':
        logging.info(f'loss scaling: {args.fp16_loss_scale}')
    logging.info(f'network name: {network_name}')
    logging.info(f'fuse bn relu: {not args.no_fuse_bn_relu}')
    logging.info(f'fuse bn add relu: {not args.no_fuse_bn_add_relu}')
    logging.info(f'bn group: {args.bn_group}')
    logging.info(f'bn all reduce fp16: {args.bn_fp16}')
    logging.info(f'MPI size: {hvd.size()}')
    logging.info(f'MPI global rank: {hvd.rank()}')
    logging.info(f'MPI local rank: {hvd.local_rank()}')
    logging.info(f'async validation: {args.async_val}')
    ############################################################################

    # TODO(ahmadki): load network and anchors based on args.backbone (JoC)
    # Load network
    net = ssd_300_resnet34_v1_mlperf_coco(
        pretrained_base=False,
        nms_overlap_thresh=args.nms_overlap_thresh,
        nms_topk=args.nms_topk,
        nms_valid_thresh=args.nms_valid_thresh,
        post_nms=args.post_nms,
        layout=args.data_layout,
        fuse_bn_add_relu=not args.no_fuse_bn_add_relu,
        fuse_bn_relu=not args.no_fuse_bn_relu,
        bn_fp16=args.bn_fp16,
        norm_kwargs={'bn_group': args.bn_group})

    # precomputed anchors
    anchors_np = mlperf_xywh_anchors(image_size=args.data_shape,
                                     clip=True,
                                     normalize=True)
    if args.test_anchors and hvd.rank() == 0:
        logging.info(f'Normalized anchors: {anchors_np}')

    # Training mode
    train_net = None
    train_pipeline = None
    trainer_fn = None
    lr_scheduler = None
    if args.mode in ['train', 'train_val']:
        # Training iterator
        num_cropping_iterations = 1
        tfrecords = None
        if args.use_tfrecord:
            # glob returns files in arbitrary order; sort both lists so that
            # each tfrecord is zipped with its own index file (assumes the two
            # sets share the same file stems)
            tfrecord_files = sorted(glob.glob(
                os.path.join(args.tfrecord_root, 'train.*.tfrecord')))
            index_files = sorted(glob.glob(
                os.path.join(args.tfrecord_root, 'train.*.idx')))
            tfrecords = list(zip(tfrecord_files, index_files))
        train_pipeline = get_training_pipeline(
            coco_root=args.coco_root if not args.use_tfrecord else None,
            tfrecords=tfrecords if args.use_tfrecord else None,
            anchors=anchors_np,
            num_shards=hvd.size(),
            shard_id=hvd.rank(),
            device_id=hvd.local_rank(),
            batch_size=args.batch_size * args.input_batch_multiplier,
            dataset_size=args.dataset_size,
            data_layout=args.data_layout,
            data_shape=args.data_shape,
            num_cropping_iterations=num_cropping_iterations,
            num_workers=args.dali_workers,
            fp16=args.precision == 'fp16',
            input_jpg_decode=args.input_jpg_decode,
            hw_decoder_load=args.hw_decoder_load,
            decoder_cache_size=min(
                (100 * 1024 + hvd.size() - 1) // hvd.size(),
                12 * 1024) if args.input_jpg_decode == 'cache' else 0,
            seed=args.seed)
        log_event(key=mlperf_constants.TRAIN_SAMPLES,
                  value=train_pipeline.epoch_size)
        log_event(key=mlperf_constants.MAX_SAMPLES,
                  value=num_cropping_iterations)

        # Training network
        train_net = SSDMultiBoxLoss(net=net,
                                    local_batch_size=args.batch_size,
                                    bulk_last_wgrad=args.bulk_last_wgrad)

        # Trainer function. SSDModel expects a function that takes 1 parameter - HybridBlock
        trainer_fn = functools.partial(
            sgd_trainer,
            learning_rate=args.lr,
            weight_decay=args.weight_decay,
            momentum=args.momentum,
            precision=args.precision,
            fp16_loss_scale=args.fp16_loss_scale,
            gradient_predivide_factor=args.gradient_predivide_factor,
            num_groups=args.horovod_num_groups,
            profile_no_horovod=args.profile_no_horovod)

        # Learning rate scheduler
        lr_scheduler = MLPerfLearningRateScheduler(
            learning_rate=args.lr,
            decay_factor=args.lr_decay_factor,
            decay_epochs=args.lr_decay_epochs,
            warmup_factor=args.lr_warmup_factor,
            warmup_epochs=args.lr_warmup_epochs,
            epoch_size=train_pipeline.epoch_size,
            global_batch_size=args.batch_size * hvd.size())

    # Validation mode
    infer_net = None
    val_iterator = None
    # Both names are read unconditionally further down; define them here so
    # that 'train' mode (which skips this branch) does not hit a NameError.
    val_pipeline = None
    cocoapi_annotation_file = None
    if args.mode in ['infer', 'val', 'train_val']:
        # Validation iterator
        tfrecords = None
        if args.use_tfrecord:
            # Sorted for the same reason as the training tfrecords above
            tfrecord_files = sorted(glob.glob(
                os.path.join(args.tfrecord_root, 'val.*.tfrecord')))
            index_files = sorted(glob.glob(
                os.path.join(args.tfrecord_root, 'val.*.idx')))
            tfrecords = list(zip(tfrecord_files, index_files))
        val_pipeline = get_inference_pipeline(
            coco_root=args.coco_root if not args.use_tfrecord else None,
            tfrecords=tfrecords if args.use_tfrecord else None,
            num_shards=hvd.size(),
            shard_id=hvd.rank(),
            device_id=hvd.local_rank(),
            batch_size=args.eval_batch_size,
            dataset_size=args.eval_dataset_size,
            data_layout=args.data_layout,
            data_shape=args.data_shape,
            num_workers=args.dali_workers,
            fp16=args.precision == 'fp16')
        log_event(key=mlperf_constants.EVAL_SAMPLES,
                  value=val_pipeline.epoch_size)

        # Inference network
        infer_net = COCOInference(net=net,
                                  ltrb=False,
                                  scale_bboxes=True,
                                  score_threshold=0.0)

        # annotations file
        cocoapi_annotation_file = os.path.join(
            args.coco_root, 'annotations', 'bbox_only_instances_val2017.json')

    # Prepare model
    model = SSDModel(net=net,
                     anchors_np=anchors_np,
                     precision=args.precision,
                     fp16_loss_scale=args.fp16_loss_scale,
                     train_net=train_net,
                     trainer_fn=trainer_fn,
                     lr_scheduler=lr_scheduler,
                     metric=mx.metric.Loss(),
                     infer_net=infer_net,
                     async_executor=async_executor,
                     save_prefix=save_prefix,
                     ctx=mx.gpu(hvd.local_rank()))

    # Do a training and validation runs on fake data.
    # this will set layers shape (needed before loading pre-trained backbone),
    # allocate tensors and and cache optimized graph.
    # Training dry run:
    logging.info('Running training dry runs')
    dummy_train_pipeline = get_training_pipeline(
        coco_root=None,
        tfrecords=[('dummy.tfrecord', 'dummy.idx')],
        anchors=anchors_np,
        num_shards=1,
        shard_id=0,
        device_id=hvd.local_rank(),
        batch_size=args.batch_size * args.input_batch_multiplier,
        dataset_size=None,
        data_layout=args.data_layout,
        data_shape=args.data_shape,
        num_workers=args.dali_workers,
        fp16=args.precision == 'fp16',
        seed=args.seed)
    dummy_train_iterator = get_training_iterator(pipeline=dummy_train_pipeline,
                                                 batch_size=args.batch_size)
    for images, box_targets, cls_targets in dummy_train_iterator:
        model.train_step(images=images,
                         box_targets=box_targets,
                         cls_targets=cls_targets)
    # Freeing memory is disabled due a bug in CUDA graphs
    # del dummy_train_pipeline
    # del dummy_train_iterator
    mx.ndarray.waitall()
    logging.info('Done')
    # Validation dry run:
    logging.info('Running inference dry runs')
    dummy_val_pipeline = get_inference_pipeline(
        coco_root=None,
        tfrecords=[('dummy.tfrecord', 'dummy.idx')],
        num_shards=1,
        shard_id=0,
        device_id=hvd.local_rank(),
        batch_size=args.eval_batch_size,
        dataset_size=None,
        data_layout=args.data_layout,
        data_shape=args.data_shape,
        num_workers=args.dali_workers,
        fp16=args.precision == 'fp16')
    dummy_val_iterator = get_inference_iterator(pipeline=dummy_val_pipeline)
    model.infer(data_iterator=dummy_val_iterator, log_interval=None)
    # Freeing memory is disabled due a bug in CUDA graphs
    # del dummy_val_pipeline
    # del dummy_val_iterator
    mx.ndarray.waitall()
    logging.info('Done')

    # re-initialize the model as a precaution in case the dry runs changed the parameters
    model.init_model(force_reinit=True)
    model.zero_grads()
    mx.ndarray.waitall()

    # load saved model or pretrained backbone
    if args.resume_from:
        model.load_parameters(filename=args.resume_from)
    elif args.pretrained_backbone:
        model.load_pretrain_backbone(picklefile_name=args.pretrained_backbone)

    # broadcast parameters
    model.broadcast_params()
    mx.ndarray.waitall()

    if args.test_initialization and hvd.rank() == 0:
        model.print_params_stats(net)

    log_end(key=mlperf_constants.INIT_STOP)

    # Main MLPerf loop (training+validation)
    mpiwrapper.barrier()
    log_start(key=mlperf_constants.RUN_START)
    mpiwrapper.barrier()
    # Real data iterators
    train_iterator = None
    val_iterator = None
    if train_pipeline:
        train_iterator = get_training_iterator(pipeline=train_pipeline,
                                               batch_size=args.batch_size,
                                               synthetic=args.synthetic)
    if val_pipeline:
        val_iterator = get_inference_iterator(pipeline=val_pipeline)
    model_map, epoch = model.train_val(train_iterator=train_iterator,
                                       start_epoch=args.start_epoch,
                                       end_epoch=args.epochs,
                                       val_iterator=val_iterator,
                                       val_interval=args.val_interval,
                                       val_epochs=args.val_epochs,
                                       annotation_file=cocoapi_annotation_file,
                                       target_map=args.target_map,
                                       train_log_interval=args.log_interval,
                                       val_log_interval=args.log_interval,
                                       save_interval=args.save_interval,
                                       cocoapi_threads=args.cocoapi_threads,
                                       profile_start=args.profile_start,
                                       profile_stop=args.profile_stop)
    # A missing/zero mAP or one below the target counts as an aborted run
    status = 'success' if (model_map and model_map >= args.target_map) else 'aborted'
    mx.ndarray.waitall()
    log_end(key=mlperf_constants.RUN_STOP, metadata={"status": status})

    logging.info(f'Rank {hvd.rank()} done. map={model_map} @ epoch={epoch}')
    mx.nd.waitall()
    hvd.shutdown()
def run_training():
    """
    Train the SSD model on chunked training data and record per-epoch losses.

    Merges every 'data_prep_<IMG_W>x<IMG_H>__*.p' chunk into one dict, splits
    it randomly into train/validation subsets (VALIDATION_SIZE is the
    validation fraction), runs NUM_EPOCH epochs and, when SAVE_MODEL is set,
    writes the checkpoint to MODEL_SAVE_PATH and the loss history to
    'loss_history.p'. With RESUME set, the model and the previous loss history
    are restored first.

    Returns:
        Tuple (test_loss, loss_history). test_loss is a 0.0 placeholder (no
        test set is evaluated yet); loss_history is a list of
        (train_loss, valid_loss) tuples, one per epoch.

    Raises:
        FileNotFoundError: If no training data chunk file is found.
    """
    # Load training data - recall training data could be chunked
    chunk_pattern = 'data_prep_%sx%s__*.p' % (IMG_W, IMG_H)
    # Sorted so that, if chunks ever share a key, the winner is deterministic
    chunk_files = sorted(glob.glob(chunk_pattern))
    if not chunk_files:
        # Without data the loop below would "train" on nothing and report NaN
        raise FileNotFoundError('No training data files match %s' % chunk_pattern)

    # Merge chunks in place instead of rebuilding the dict once per chunk.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    data_prep = {}
    for dp_file in chunk_files:
        with open(dp_file, mode='rb') as f:
            data_prep.update(pickle.load(f))

    # Manually do the train/validation split (sklearn train_test_split runs out of memory)
    random_keys = list(data_prep)
    random.shuffle(random_keys)  # random.shuffle() shuffles list *in place*
    num_valid = int(len(random_keys) * VALIDATION_SIZE)
    valid = {k: data_prep[k] for k in random_keys[:num_valid]}
    train = {k: data_prep[k] for k in random_keys[num_valid:]}

    def to_arrays(subset):
        """Turn a {key: labels} dict into (inputs, conf labels, loc labels) arrays."""
        keys = list(subset)
        return (np.array(keys),
                np.array([subset[k]['y_true_conf'] for k in keys]),
                np.array([subset[k]['y_true_loc'] for k in keys]))

    # Format the data, for both train and validation data
    X_train, y_train_conf, y_train_loc = to_arrays(train)
    X_valid, y_valid_conf, y_valid_loc = to_arrays(valid)

    # Launch the graph
    with tf.Graph().as_default(), tf.Session() as sess:
        # "Instantiate" neural network, get relevant tensors
        model = SSDModel()
        x = model['x']
        y_true_conf = model['y_true_conf']
        y_true_loc = model['y_true_loc']
        conf_loss_mask = model['conf_loss_mask']
        is_training = model['is_training']
        optimizer = model['optimizer']
        reported_loss = model['loss']

        # TF saver to save/restore trained model
        saver = tf.train.Saver()

        if RESUME:
            print('Restoring previously trained model at %s' % MODEL_SAVE_PATH)
            saver.restore(sess, MODEL_SAVE_PATH)

            # Restore previous loss history
            with open('loss_history.p', 'rb') as f:
                loss_history = pickle.load(f)
        else:
            print('Training model from scratch')
            # Variable initialization
            sess.run(tf.global_variables_initializer())

            # One (train_loss, valid_loss) tuple is appended per epoch
            loss_history = []

        def run_epoch(inputs, conf_labels, loc_labels, training):
            """Run one pass over the given data and return the mean batch loss."""
            batches = next_batch(inputs, conf_labels, loc_labels, BATCH_SIZE)
            num_batches = math.ceil(inputs.shape[0] / BATCH_SIZE)
            batch_losses = []
            for _ in range(num_batches):
                images, conf_batch, loc_batch, mask_batch = next(batches)
                feed = {
                    x: images,
                    y_true_conf: conf_batch,
                    y_true_loc: loc_batch,
                    conf_loss_mask: mask_batch,
                    is_training: training,
                }
                if training:
                    # Gradient update (i.e. training step) on the current batch
                    _, batch_loss = sess.run([optimizer, reported_loss], feed_dict=feed)
                else:
                    # Forward pass only
                    batch_loss = sess.run(reported_loss, feed_dict=feed)
                batch_losses.append(batch_loss)  # TODO: Need mAP metric instead of raw loss
            # Rough per-epoch estimate: every batch weighs the same, so a
            # smaller last batch is overweighted
            return np.mean(batch_losses)

        # Record time elapsed for performance check
        last_time = time.time()
        train_start_time = time.time()

        # Run NUM_EPOCH epochs of training
        for epoch in range(NUM_EPOCH):
            train_loss = run_epoch(X_train, y_train_conf, y_train_loc, True)
            # Calculate validation loss at the end of the epoch
            valid_loss = run_epoch(X_valid, y_valid_conf, y_valid_loc, False)

            # Record and report train/validation losses for this epoch
            loss_history.append((train_loss, valid_loss))
            print('Epoch %d -- Train loss: %.4f, Validation loss: %.4f, Elapsed time: %.2f sec' %
                  (epoch + 1, train_loss, valid_loss, time.time() - last_time))
            last_time = time.time()

            # Optional per-epoch checkpoint
            if SAVE_MODEL and SAVE_MODEL_EVERY_EPOCH:
                saver.save(sess, MODEL_SAVE_PATH)

        total_time = time.time() - train_start_time
        print('Total elapsed time: %d min %d sec' % (total_time / 60, total_time % 60))

        test_loss = 0.  # TODO: Add test set

        if SAVE_MODEL:
            # Save model to disk
            save_path = saver.save(sess, MODEL_SAVE_PATH)
            print('Trained model saved at: %s' % save_path)

            # Also save the loss history; report it only once it is written
            with open('loss_history.p', 'wb') as f:
                pickle.dump(loss_history, f)
            print('Loss history saved at loss_history.p')

    # Return the placeholder test loss and the per-epoch loss history
    return test_loss, loss_history
'''
Visualize the model using TensorBoard

Builds the model graph and writes it to ./tensorboard_out as a TensorBoard
event file.
'''
import tensorflow as tf
from settings import *
from model import SSDModel

FM_ONLY = False  # Only want to see feature map sizes?

with tf.Graph().as_default(), tf.Session() as sess:
    if FM_ONLY:
        # Only want to see feature map sizes (e.g. loss function and vector concatenation not yet set up)
        if MODEL == 'AlexNet':
            from model import AlexNet as MyModel
        else:
            raise NotImplementedError('Model %s not supported' % MODEL)
        _ = MyModel()
    else:
        # This includes the entire graph, e.g. loss function, optimizer, etc.
        _ = SSDModel()

    tf.summary.merge_all()
    writer = tf.summary.FileWriter('./tensorboard_out', sess.graph)
    try:
        tf.global_variables_initializer().run()
    finally:
        # Closing flushes the pending graph event to disk; without it the
        # event file may be left incomplete when the script exits.
        writer.close()