Ejemplo n.º 1
0
def generate_output(mode):
    """
    Run the trained SSD model on 'test.jpg' and display the annotated result.

    The class-ID -> sign-name mapping is read from 'signnames.csv', the
    checkpoint at 'model/model.ckpt' is restored, and the test image is
    downscaled to half size before being passed to run_inference(). The shape
    of the returned image and the elapsed resize + inference time are printed,
    then the image is handed to show().

    Args:
        mode: forwarded unchanged to run_inference().

    Raises:
        OSError: if 'test.jpg' cannot be read.
    """
    # First, load mapping from integer class ID to sign name string
    sign_map = {}
    with open('signnames.csv', 'r') as f:
        for line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would truncate the final sign name whenever the
            # file does not end with a newline.
            line = line.rstrip('\n')
            if not line:
                continue  # tolerate blank lines (e.g. a trailing empty line)
            sign_id, sign_name = line.split(',')
            sign_map[int(sign_id)] = sign_name
    sign_map[0] = 'background'  # class ID 0 reserved for background class

    # Launch the graph
    path = 'model/model.ckpt'
    with tf.Graph().as_default(), tf.Session() as sess:
        # "Instantiate" neural network, get relevant tensors
        model = SSDModel()

        # Load trained model
        saver = tf.train.Saver()
        print('Restoring previously trained model at %s' % path)
        saver.restore(sess, path)

        image_orig = cv2.imread('test.jpg', cv2.IMREAD_COLOR)
        if image_orig is None:
            # cv2.imread() signals failure by returning None instead of
            # raising; fail here with a clear message rather than with an
            # AttributeError on `.shape` below.
            raise OSError("Could not read image 'test.jpg'")

        # Timing covers the resize and the inference, not model loading
        t = time.time()
        height, width = image_orig.shape[0], image_orig.shape[1]
        image_orig = cv2.resize(image_orig, (width // 2, height // 2))
        image = run_inference(image_orig, model, sess, mode, sign_map)
        print(image.shape)
        print(time.time() - t)
        show(image)
def run_training():
    """
    Train the SSD model and record the train/validation loss of every epoch.

    Steps:
      * load the pickled training data for the configured image size
      * split it into a training and a validation set
      * train for NUM_EPOCH epochs, measuring both losses after each epoch
      * optionally save the model and the loss history to disk

    Returns:
        Tuple (test_loss, loss_history). test_loss is a 0. placeholder (no
        test set is evaluated here). loss_history is a list of
        (train_loss, valid_loss) tuples; when RESUME is set it starts from
        the history previously stored in 'loss_history.p'.
    """
    # Read the prepared training data
    with open('data_prep_%sx%s.p' % (IMG_W, IMG_H), mode='rb') as data_file:
        train = pickle.load(data_file)

    # Turn the dict into parallel arrays: image file names and their labels
    image_files = list(train.keys())
    X_train = np.array(image_files)
    y_train_conf = np.array(
        [train[name]['y_true_conf'] for name in image_files])
    y_train_loc = np.array(
        [train[name]['y_true_loc'] for name in image_files])

    # Train/validation split
    split = train_test_split(X_train,
                             y_train_conf,
                             y_train_loc,
                             test_size=VALIDATION_SIZE,
                             random_state=1)
    X_train, X_valid, y_train_conf, y_valid_conf, y_train_loc, y_valid_loc = split

    # Launch the graph
    with tf.Graph().as_default(), tf.Session() as sess:
        # Build the network and pick out the tensors/ops used below
        model = SSDModel()
        x = model['x']
        y_true_conf = model['y_true_conf']
        y_true_loc = model['y_true_loc']
        conf_loss_mask = model['conf_loss_mask']
        is_training = model['is_training']
        optimizer = model['optimizer']
        reported_loss = model['loss']

        def make_feed(batch, training):
            """Map one generator batch onto the model's placeholders."""
            images, conf_batch, loc_batch, mask_batch = batch
            return {
                x: images,
                y_true_conf: conf_batch,
                y_true_loc: loc_batch,
                conf_loss_mask: mask_batch,
                is_training: training
            }

        # TF saver to save/restore trained model
        saver = tf.train.Saver()

        if not RESUME:
            print('Training model from scratch')
            sess.run(tf.global_variables_initializer())

            # One (train_loss, valid_loss) tuple is appended per epoch
            loss_history = []
        else:
            print('Restoring previously trained model at %s' % MODEL_SAVE_PATH)
            saver.restore(sess, MODEL_SAVE_PATH)

            # Continue from the loss history of the earlier run
            with open('loss_history.p', 'rb') as history_file:
                loss_history = pickle.load(history_file)

        # Timestamps for the per-epoch and the overall elapsed time
        last_time = time.time()
        train_start_time = time.time()

        for epoch in range(NUM_EPOCH):
            # --- Training pass: one gradient update per batch ---
            train_gen = next_batch(X_train, y_train_conf, y_train_loc,
                                   BATCH_SIZE)
            num_batches_train = math.ceil(X_train.shape[0] / BATCH_SIZE)
            train_losses = []
            for _ in range(num_batches_train):
                _, batch_loss = sess.run([optimizer, reported_loss],
                                         feed_dict=make_feed(
                                             next(train_gen), True))
                # TODO: Need mAP metric instead of raw loss
                train_losses.append(batch_loss)

            # Unweighted mean over batches, so a smaller final batch is
            # overweighted; good enough as a rough estimate
            train_loss = np.mean(train_losses)

            # --- Validation pass: forward only ---
            valid_gen = next_batch(X_valid, y_valid_conf, y_valid_loc,
                                   BATCH_SIZE)
            num_batches_valid = math.ceil(X_valid.shape[0] / BATCH_SIZE)
            valid_losses = [
                sess.run(reported_loss,
                         feed_dict=make_feed(next(valid_gen), False))
                for _ in range(num_batches_valid)
            ]
            valid_loss = np.mean(valid_losses)

            # Book-keeping and progress report for this epoch
            loss_history.append((train_loss, valid_loss))
            elapsed = time.time() - last_time
            print('Epoch %d -- Train loss: %.4f, Validation loss: %.4f, Elapsed time: %.2f sec' % \
                  (epoch + 1, train_loss, valid_loss, elapsed))
            last_time = time.time()

        total_time = time.time() - train_start_time
        print('Total elapsed time: %d min %d sec' %
              (total_time / 60, total_time % 60))

        test_loss = 0.  # TODO: Add test set
        '''
		# After training is complete, evaluate accuracy on test set
		print('Calculating test accuracy...')
		test_gen = next_batch(X_test, y_test, BATCH_SIZE)
		test_size = X_test.shape[0]
		test_acc = calculate_accuracy(test_gen, test_size, BATCH_SIZE, accuracy, x, y, keep_prob, sess)
		print('Test acc.: %.4f' % (test_acc,))
		'''

        if SAVE_MODEL:
            # Persist the trained weights
            save_path = saver.save(sess, MODEL_SAVE_PATH)
            print('Trained model saved at: %s' % save_path)

            # Persist the loss history next to the model
            print('Loss history saved at loss_history.p')
            with open('loss_history.p', 'wb') as history_file:
                pickle.dump(loss_history, history_file)

    # Hand back the placeholder test loss together with the loss history
    return test_loss, loss_history
Ejemplo n.º 3
0
		y_valid_loc.append(ys_valid_loc)
     y_test_loc.append(ys_test_loc)   
	X_train = np.squeeze(np.array(X_train),axis=0)
	X_valid = np.squeeze(np.array(X_valid),axis=0)
    X_test = np.squeeze(np.array(X_test),axis=0)
	y_train_conf = np.squeeze(np.array(y_train_conf),axis=0)
	y_valid_conf = np.squeeze(np.array(y_valid_conf),axis=0)
    y_test_conf = np.squeeze(np.array(y_test_conf),axis=0)
	y_train_loc = np.squeeze(np.array(y_train_loc),axis=0)
	y_valid_loc = np.squeeze(np.array(y_valid_loc),axis=0)
    y_test_loc = np.squeeze(np.array(y_test_loc),axis=0)

	# Launch the graph
	with tf.Graph().as_default(), tf.Session() as sess:
		# "Instantiate" neural network, get relevant tensors
		model = SSDModel()
		x = model['x']
		y_true_conf = model['y_true_conf']
		y_true_loc = model['y_true_loc']
		conf_loss_mask = model['conf_loss_mask']
		is_training = model['is_training']
		optimizer = model['optimizer']
		reported_loss = model['loss']
		num_pos = model['num_pos']

		# Training process
		# TF saver to save/restore trained model
		saver = tf.train.Saver()

		if RESUME:
			print('Restoring previously trained model at %s' % MODEL_SAVE_PATH)
Ejemplo n.º 4
0
                    if top is not None and len(res) > 2 * top:
                        res = cut_top(res)

        return cut_top(res)

    def restore_rects(self, tensors, threshold=None, top=None):
        """
        Restore rectangles from the network output and return the best ones.

        Args:
            tensors: triple (lr, tb, cls) that is forwarded to
                self._restore_rects().
            threshold: forwarded unchanged to self._restore_rects().
            top: if not None, keep only the `top` highest-ranked results.

        Returns:
            Two tuples of equal length: the first elements and the second
            elements of the results, ordered by first element, descending.
            The first element is the ranking key (presumably a confidence
            score, the second the rectangle -- confirm against
            _restore_rects()).
        """

        def cut_top(res):
            # Highest ranking key first, then truncate to `top` if requested
            res = sorted(res, reverse=True, key=lambda val: val[0])
            if top is not None:
                res = res[:top]
            return res

        lr, tb, cls = tensors

        result = self._restore_rects(lr, tb, cls, self.model.num_poolings,
                                     threshold, top)
        if self.verbose:
            # Function-call form: valid on Python 3 and prints the same
            # output for a single argument on Python 2
            print(result)

        result = cut_top(result)
        return tuple(r[0] for r in result), tuple(r[1] for r in result)


if __name__ == '__main__':
    # When run as a script, only instantiate the model; nothing else is done
    # with the instance here.
    ssd_model = SSDModel()
def generate_output(input_files, mode):
    """
    Run inference with the trained SSD model and emit annotated output.

    Args:
        input_files: iterable of image paths (mode 'image') or video paths
            (mode 'video'). Not used in 'demo' mode.
        mode: one of
            'image' -- annotate each input image, save it to ./inference_out/
            'video' -- annotate each input video, save it to ./inference_out/
            'demo'  -- annotate and display every file in sample_images/

    Raises:
        ValueError: if `mode` is none of the values above. The check happens
            only after the trained model has been restored.
    """
    # First, load mapping from integer class ID to sign name string
    sign_map = {}
    with open('signnames.csv', 'r') as f:
        for line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would truncate the final sign name whenever the
            # file does not end with a newline.
            line = line.rstrip('\n')
            if not line:
                continue  # tolerate blank lines (e.g. a trailing empty line)
            sign_id, sign_name = line.split(',')
            sign_map[int(sign_id)] = sign_name
    sign_map[0] = 'background'  # class ID 0 reserved for background class
    logging.info(sign_map)

    # Create output directory 'inference_out/' if needed
    if mode in ('image', 'video'):
        try:
            # exist_ok=True avoids the check-then-create race of
            # isdir() + mkdir(); FileExistsError is still raised when the
            # path exists but is not a directory.
            os.makedirs('./inference_out', exist_ok=True)
        except FileExistsError:
            print('Error: Cannot mkdir ./inference_out')
            return

    # Launch the graph
    with tf.Graph().as_default(), tf.Session() as sess:
        # "Instantiate" neural network, get relevant tensors
        model = SSDModel()

        # Load trained model
        saver = tf.train.Saver()
        logging.critical('开始加载已训练模型 %s' % MODEL_SAVE_PATH)
        saver.restore(sess, MODEL_SAVE_PATH)

        if mode == 'image':
            for image_file in input_files:
                print('Running inference on %s' % image_file)
                image_orig = np.asarray(Image.open(image_file))
                image = run_inference(image_orig, model, sess, mode, sign_map)

                # Save under the same file name inside the output directory
                out_name = os.path.basename(image_file)
                plt.imsave('./inference_out/%s' % out_name, image)
            print('输出文件保存至 inference_out/ 目录')

        elif mode == 'video':
            for video_file in input_files:
                print('Running inference on %s' % video_file)
                video = VideoFileClip(video_file)
                # Annotate the clip frame by frame
                video = video.fl_image(
                    lambda frame: run_inference(frame, model, sess, mode,
                                                sign_map))

                out_name = os.path.basename(video_file)
                video.write_videofile('./inference_out/%s' % out_name,
                                      audio=False)
            print('Output saved in inference_out/')

        elif mode == 'demo':
            print('Demo mode: Running inference on images in sample_images/')
            image_files = os.listdir('sample_images/')

            print("-" * 30)
            for image_file in image_files:
                print('Running inference on sample_images/%s' % image_file)
                # NOTE(review): unlike 'image' mode, the PIL image is passed
                # on without np.asarray() -- confirm run_inference() accepts
                # both forms.
                image_orig = Image.open('sample_images/' + image_file)
                image = run_inference(image_orig, model, sess, mode, sign_map)
                plt.imshow(image)
                plt.show()
                print("-" * 30)

        else:
            raise ValueError('Invalid mode: %s' % mode)
Ejemplo n.º 6
0
def _pair_tfrecords(tfrecord_root, split):
    """Return the (tfrecord, index) file pairs of `split` found in `tfrecord_root`."""
    # glob.glob() returns matches in arbitrary order. Sort both lists by the
    # path without its extension so that every record file is zipped with the
    # index file of the same shard.
    def stem(path):
        return os.path.splitext(path)[0]

    tfrecord_files = sorted(glob.glob(
        os.path.join(tfrecord_root, f'{split}.*.tfrecord')), key=stem)
    index_files = sorted(glob.glob(
        os.path.join(tfrecord_root, f'{split}.*.idx')), key=stem)
    return list(zip(tfrecord_files, index_files))


def main(async_executor=None):
    """
    Entry point for SSD training and/or validation with MXNet and Horovod.

    Parses the command-line arguments, initializes Horovod, AMP and logging,
    builds the network, the data pipelines and the SSDModel wrapper, performs
    dry runs on dummy data, optionally loads saved parameters, and finally
    runs model.train_val() and logs the outcome of the MLPerf run.

    Args:
        async_executor: executor for asynchronous validation. Must not be
            None when args.async_val is set; otherwise it is ignored and
            reset to None.
    """
    # Setup MLPerf logger
    mllog.config()
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
    # Start MLPerf benchmark
    log_start(key=mlperf_constants.INIT_START, uniq=False)

    # Parse args
    args = parse_args()

    ############################################################################
    # Initialize various libraries (horovod, logger, amp ...)
    ############################################################################
    # Initialize async executor
    if args.async_val:
        assert async_executor is not None, 'Please use ssd_main_async.py to launch with async support'
    else:
        # (Force) disable async validation
        async_executor = None

    # Initialize horovod
    hvd.init()

    # Initialize AMP
    if args.precision == 'amp':
        amp.init(layout_optimization=True)

    # Set MXNET_SAFE_ACCUMULATION=1 if necessary
    if args.precision == 'fp16':
        os.environ["MXNET_SAFE_ACCUMULATION"] = "1"

    # Results folder
    network_name = f'ssd_{args.backbone}_{args.data_layout}_{args.dataset}_{args.data_shape}'
    save_prefix = None
    if args.results:
        save_prefix = os.path.join(args.results, network_name)
    else:
        logging.info(
            "No results folder was provided. The script will not write logs or save weight to disk"
        )

    # Initialize logger
    log_file = None
    if args.results:
        log_file = f'{save_prefix}_{args.mode}_{hvd.rank()}.log'
    # Ranks that are not listed in args.log_local_ranks only log CRITICAL
    setup_logger(level=args.log_level
                 if hvd.local_rank() in args.log_local_ranks else 'CRITICAL',
                 log_file=log_file)

    # Set seed
    args.seed = set_seed_distributed(args.seed)
    ############################################################################

    ############################################################################
    # Validate arguments and print some useful information
    ############################################################################
    logging.info(args)

    assert not (args.resume_from and args.pretrained_backbone), (
        "--resume-from and --pretrained_backbone are "
        "mutually exclusive.")
    assert args.data_shape == 300, "only data_shape=300 is supported at the moment."
    assert args.input_batch_multiplier >= 1, "input_batch_multiplier must be >= 1"
    assert not (hvd.size() == 1 and args.gradient_predivide_factor > 1), (
        "Gradient predivide factor is not supported "
        "with a single GPU")
    if args.data_layout == 'NCHW' or args.precision == 'fp32':
        assert args.bn_group == 1, "Group batch norm doesn't support FP32 data format or NCHW data layout."
        if not args.no_fuse_bn_relu:
            logging.warning((
                "WARNING: fused batch norm relu is only supported with NHWC layout. "
                "A non fused version will be forced."))
            args.no_fuse_bn_relu = True
        if not args.no_fuse_bn_add_relu:
            logging.warning((
                "WARNING: fused batch norm add relu is only supported with NHWC layout. "
                "A non fused version will be forced."))
            args.no_fuse_bn_add_relu = True
    if args.profile_no_horovod and hvd.size() > 1:
        logging.warning(
            "WARNING: hvd.size() > 1, so must IGNORE requested --profile-no-horovod"
        )
        args.profile_no_horovod = False

    logging.info(f'Seed: {args.seed}')
    logging.info(f'precision: {args.precision}')
    if args.precision == 'fp16':
        logging.info(f'loss scaling: {args.fp16_loss_scale}')
    logging.info(f'network name: {network_name}')
    logging.info(f'fuse bn relu: {not args.no_fuse_bn_relu}')
    logging.info(f'fuse bn add relu: {not args.no_fuse_bn_add_relu}')
    logging.info(f'bn group: {args.bn_group}')
    logging.info(f'bn all reduce fp16: {args.bn_fp16}')
    logging.info(f'MPI size: {hvd.size()}')
    logging.info(f'MPI global rank: {hvd.rank()}')
    logging.info(f'MPI local rank: {hvd.local_rank()}')
    logging.info(f'async validation: {args.async_val}')
    ############################################################################

    # TODO(ahmadki): load network and anchors based on args.backbone (JoC)
    # Load network
    net = ssd_300_resnet34_v1_mlperf_coco(
        pretrained_base=False,
        nms_overlap_thresh=args.nms_overlap_thresh,
        nms_topk=args.nms_topk,
        nms_valid_thresh=args.nms_valid_thresh,
        post_nms=args.post_nms,
        layout=args.data_layout,
        fuse_bn_add_relu=not args.no_fuse_bn_add_relu,
        fuse_bn_relu=not args.no_fuse_bn_relu,
        bn_fp16=args.bn_fp16,
        norm_kwargs={'bn_group': args.bn_group})

    # precomputed anchors
    anchors_np = mlperf_xywh_anchors(image_size=args.data_shape,
                                     clip=True,
                                     normalize=True)
    if args.test_anchors and hvd.rank() == 0:
        logging.info(f'Normalized anchors: {anchors_np}')

    # Training mode
    train_net = None
    train_pipeline = None
    trainer_fn = None
    lr_scheduler = None
    if args.mode in ['train', 'train_val']:
        # Training iterator
        num_cropping_iterations = 1
        tfrecords = (_pair_tfrecords(args.tfrecord_root, 'train')
                     if args.use_tfrecord else None)
        train_pipeline = get_training_pipeline(
            coco_root=args.coco_root if not args.use_tfrecord else None,
            tfrecords=tfrecords,
            anchors=anchors_np,
            num_shards=hvd.size(),
            shard_id=hvd.rank(),
            device_id=hvd.local_rank(),
            batch_size=args.batch_size * args.input_batch_multiplier,
            dataset_size=args.dataset_size,
            data_layout=args.data_layout,
            data_shape=args.data_shape,
            num_cropping_iterations=num_cropping_iterations,
            num_workers=args.dali_workers,
            fp16=args.precision == 'fp16',
            input_jpg_decode=args.input_jpg_decode,
            hw_decoder_load=args.hw_decoder_load,
            decoder_cache_size=min(
                (100 * 1024 + hvd.size() - 1) // hvd.size(), 12 *
                1024) if args.input_jpg_decode == 'cache' else 0,
            seed=args.seed)
        log_event(key=mlperf_constants.TRAIN_SAMPLES,
                  value=train_pipeline.epoch_size)
        log_event(key=mlperf_constants.MAX_SAMPLES,
                  value=num_cropping_iterations)

        # Training network
        train_net = SSDMultiBoxLoss(net=net,
                                    local_batch_size=args.batch_size,
                                    bulk_last_wgrad=args.bulk_last_wgrad)

        # Trainer function. SSDModel expects a function that takes 1 parameter - HybridBlock
        trainer_fn = functools.partial(
            sgd_trainer,
            learning_rate=args.lr,
            weight_decay=args.weight_decay,
            momentum=args.momentum,
            precision=args.precision,
            fp16_loss_scale=args.fp16_loss_scale,
            gradient_predivide_factor=args.gradient_predivide_factor,
            num_groups=args.horovod_num_groups,
            profile_no_horovod=args.profile_no_horovod)

        # Learning rate scheduler
        lr_scheduler = MLPerfLearningRateScheduler(
            learning_rate=args.lr,
            decay_factor=args.lr_decay_factor,
            decay_epochs=args.lr_decay_epochs,
            warmup_factor=args.lr_warmup_factor,
            warmup_epochs=args.lr_warmup_epochs,
            epoch_size=train_pipeline.epoch_size,
            global_batch_size=args.batch_size * hvd.size())

    # Validation mode
    infer_net = None
    val_iterator = None
    # Bound up front: both names are read further down even when the mode is
    # 'train', where the validation branch below never runs.
    val_pipeline = None
    cocoapi_annotation_file = None
    if args.mode in ['infer', 'val', 'train_val']:
        # Validation iterator. Only touch args.tfrecord_root when tfrecords
        # are actually requested.
        tfrecords = (_pair_tfrecords(args.tfrecord_root, 'val')
                     if args.use_tfrecord else None)
        val_pipeline = get_inference_pipeline(
            coco_root=args.coco_root if not args.use_tfrecord else None,
            tfrecords=tfrecords,
            num_shards=hvd.size(),
            shard_id=hvd.rank(),
            device_id=hvd.local_rank(),
            batch_size=args.eval_batch_size,
            dataset_size=args.eval_dataset_size,
            data_layout=args.data_layout,
            data_shape=args.data_shape,
            num_workers=args.dali_workers,
            fp16=args.precision == 'fp16')
        log_event(key=mlperf_constants.EVAL_SAMPLES,
                  value=val_pipeline.epoch_size)

        # Inference network
        infer_net = COCOInference(net=net,
                                  ltrb=False,
                                  scale_bboxes=True,
                                  score_threshold=0.0)

        # annotations file
        cocoapi_annotation_file = os.path.join(
            args.coco_root, 'annotations', 'bbox_only_instances_val2017.json')

    # Prepare model
    model = SSDModel(net=net,
                     anchors_np=anchors_np,
                     precision=args.precision,
                     fp16_loss_scale=args.fp16_loss_scale,
                     train_net=train_net,
                     trainer_fn=trainer_fn,
                     lr_scheduler=lr_scheduler,
                     metric=mx.metric.Loss(),
                     infer_net=infer_net,
                     async_executor=async_executor,
                     save_prefix=save_prefix,
                     ctx=mx.gpu(hvd.local_rank()))

    # Do a training and validation runs on fake data.
    # this will set layers shape (needed before loading pre-trained backbone),
    # allocate tensors and and cache optimized graph.
    # Training dry run:
    logging.info('Running training dry runs')
    dummy_train_pipeline = get_training_pipeline(
        coco_root=None,
        tfrecords=[('dummy.tfrecord', 'dummy.idx')],
        anchors=anchors_np,
        num_shards=1,
        shard_id=0,
        device_id=hvd.local_rank(),
        batch_size=args.batch_size * args.input_batch_multiplier,
        dataset_size=None,
        data_layout=args.data_layout,
        data_shape=args.data_shape,
        num_workers=args.dali_workers,
        fp16=args.precision == 'fp16',
        seed=args.seed)
    dummy_train_iterator = get_training_iterator(pipeline=dummy_train_pipeline,
                                                 batch_size=args.batch_size)
    for images, box_targets, cls_targets in dummy_train_iterator:
        model.train_step(images=images,
                         box_targets=box_targets,
                         cls_targets=cls_targets)
    # Freeing memory is disabled due a bug in CUDA graphs
    # del dummy_train_pipeline
    # del dummy_train_iterator
    mx.ndarray.waitall()
    logging.info('Done')
    # Validation dry run:
    logging.info('Running inference dry runs')
    dummy_val_pipeline = get_inference_pipeline(
        coco_root=None,
        tfrecords=[('dummy.tfrecord', 'dummy.idx')],
        num_shards=1,
        shard_id=0,
        device_id=hvd.local_rank(),
        batch_size=args.eval_batch_size,
        dataset_size=None,
        data_layout=args.data_layout,
        data_shape=args.data_shape,
        num_workers=args.dali_workers,
        fp16=args.precision == 'fp16')
    dummy_val_iterator = get_inference_iterator(pipeline=dummy_val_pipeline)
    model.infer(data_iterator=dummy_val_iterator, log_interval=None)
    # Freeing memory is disabled due a bug in CUDA graphs
    # del dummy_val_pipeline
    # del dummy_val_iterator
    mx.ndarray.waitall()
    logging.info('Done')

    # re-initialize the model as a precaution in case the dry runs changed the parameters
    model.init_model(force_reinit=True)
    model.zero_grads()
    mx.ndarray.waitall()

    # load saved model or pretrained backbone
    if args.resume_from:
        model.load_parameters(filename=args.resume_from)
    elif args.pretrained_backbone:
        model.load_pretrain_backbone(picklefile_name=args.pretrained_backbone)

    # broadcast parameters
    model.broadcast_params()
    mx.ndarray.waitall()

    if args.test_initialization and hvd.rank() == 0:
        model.print_params_stats(net)

    log_end(key=mlperf_constants.INIT_STOP)

    # Main MLPerf loop (training+validation)
    mpiwrapper.barrier()
    log_start(key=mlperf_constants.RUN_START)
    mpiwrapper.barrier()
    # Real data iterators
    train_iterator = None
    val_iterator = None
    if train_pipeline:
        train_iterator = get_training_iterator(pipeline=train_pipeline,
                                               batch_size=args.batch_size,
                                               synthetic=args.synthetic)
    if val_pipeline:
        val_iterator = get_inference_iterator(pipeline=val_pipeline)
    # NOTE(review): in 'train' mode both val_iterator and annotation_file are
    # None -- confirm model.train_val() skips validation in that case.
    model_map, epoch = model.train_val(train_iterator=train_iterator,
                                       start_epoch=args.start_epoch,
                                       end_epoch=args.epochs,
                                       val_iterator=val_iterator,
                                       val_interval=args.val_interval,
                                       val_epochs=args.val_epochs,
                                       annotation_file=cocoapi_annotation_file,
                                       target_map=args.target_map,
                                       train_log_interval=args.log_interval,
                                       val_log_interval=args.log_interval,
                                       save_interval=args.save_interval,
                                       cocoapi_threads=args.cocoapi_threads,
                                       profile_start=args.profile_start,
                                       profile_stop=args.profile_stop)
    # The run counts as a success only if a mAP was produced and it reached
    # the target
    status = 'success' if (model_map
                           and model_map >= args.target_map) else 'aborted'
    mx.ndarray.waitall()
    log_end(key=mlperf_constants.RUN_STOP, metadata={"status": status})

    logging.info(f'Rank {hvd.rank()} done. map={model_map} @ epoch={epoch}')
    mx.nd.waitall()
    hvd.shutdown()
Ejemplo n.º 7
0
def run_training():
    """
    Train the SSD model on chunked training data and track the losses.

    Steps:
      * load and merge every 'data_prep_<W>x<H>__*.p' chunk
      * randomly split the merged data into training and validation sets
      * train for NUM_EPOCH epochs, measuring both losses after each epoch
      * optionally checkpoint the model (and loss history) every epoch
      * optionally save the final model and loss history

    Returns:
        Tuple (test_loss, loss_history). test_loss is a 0. placeholder (no
        test set is evaluated here). loss_history is a list of
        (train_loss, valid_loss) tuples; when RESUME is set it starts from
        the history previously stored in 'loss_history.p'.
    """
    # Load training data - recall training data could be chunked.
    # Merge each chunk into one dict as it is loaded; rebuilding the merged
    # dict with {**a, **b} per chunk would re-copy all earlier entries every
    # time.
    data_prep = {}
    for dp_file in glob.glob('data_prep_%sx%s__*.p' % (IMG_W, IMG_H)):
        with open(dp_file, mode='rb') as f:
            data_prep.update(pickle.load(f))

    # Manually do the train/validation split (sklearn train_test_split runs out of memory)
    num_valid = int(len(data_prep) * VALIDATION_SIZE)
    random_keys = list(data_prep)
    random.shuffle(random_keys)  # random.shuffle() shuffles list *in place*
    valid = {k: data_prep[k] for k in random_keys[:num_valid]}
    train = {k: data_prep[k] for k in random_keys[num_valid:]}

    def to_arrays(subset):
        """Return (image files, y_true_conf, y_true_loc) arrays of `subset`."""
        image_files = list(subset)
        return (np.array(image_files),
                np.array([subset[k]['y_true_conf'] for k in image_files]),
                np.array([subset[k]['y_true_loc'] for k in image_files]))

    # Format the data, for both train and validation data
    X_train, y_train_conf, y_train_loc = to_arrays(train)
    X_valid, y_valid_conf, y_valid_loc = to_arrays(valid)

    # Launch the graph
    with tf.Graph().as_default(), tf.Session() as sess:
        # "Instantiate" neural network, get relevant tensors
        model = SSDModel()
        x = model['x']
        y_true_conf = model['y_true_conf']
        y_true_loc = model['y_true_loc']
        conf_loss_mask = model['conf_loss_mask']
        is_training = model['is_training']
        optimizer = model['optimizer']
        reported_loss = model['loss']

        # Training process
        # TF saver to save/restore trained model
        saver = tf.train.Saver()

        if RESUME:
            print('Restoring previously trained model at %s' % MODEL_SAVE_PATH)
            saver.restore(sess, MODEL_SAVE_PATH)

            # Restore previous loss history
            with open('loss_history.p', 'rb') as f:
                loss_history = pickle.load(f)
        else:
            print('Training model from scratch')
            # Variable initialization
            sess.run(tf.global_variables_initializer())

            # For book-keeping, keep track of training and validation loss over epochs, like such:
            # [(train_loss_epoch1, valid_loss_epoch1), (train_loss_epoch2, valid_loss_epoch2), ...]
            loss_history = []

        # Record time elapsed for performance check
        last_time = time.time()
        train_start_time = time.time()

        # Run NUM_EPOCH epochs of training
        for epoch in range(NUM_EPOCH):
            train_gen = next_batch(X_train, y_train_conf, y_train_loc,
                                   BATCH_SIZE)
            num_batches_train = math.ceil(X_train.shape[0] / BATCH_SIZE)
            losses = []  # list of loss values for book-keeping

            # Run training on each batch
            for _ in range(num_batches_train):
                # Obtain the training data and labels from generator
                images, y_true_conf_gen, y_true_loc_gen, conf_loss_mask_gen = next(
                    train_gen)

                # Perform gradient update (i.e. training step) on current batch
                _, loss = sess.run(
                    [optimizer, reported_loss],
                    feed_dict={
                        x: images,
                        y_true_conf: y_true_conf_gen,
                        y_true_loc: y_true_loc_gen,
                        conf_loss_mask: conf_loss_mask_gen,
                        is_training: True
                    })

                losses.append(
                    loss)  # TODO: Need mAP metric instead of raw loss

            # A rough estimate of loss for this epoch (overweights the last batch)
            train_loss = np.mean(losses)

            # Calculate validation loss at the end of the epoch
            valid_gen = next_batch(X_valid, y_valid_conf, y_valid_loc,
                                   BATCH_SIZE)
            num_batches_valid = math.ceil(X_valid.shape[0] / BATCH_SIZE)
            losses = []
            for _ in range(num_batches_valid):
                images, y_true_conf_gen, y_true_loc_gen, conf_loss_mask_gen = next(
                    valid_gen)

                # Perform forward pass and calculate loss
                loss = sess.run(reported_loss,
                                feed_dict={
                                    x: images,
                                    y_true_conf: y_true_conf_gen,
                                    y_true_loc: y_true_loc_gen,
                                    conf_loss_mask: conf_loss_mask_gen,
                                    is_training: False
                                })
                losses.append(loss)
            valid_loss = np.mean(losses)

            # Record and report train/validation losses for this epoch
            loss_history.append((train_loss, valid_loss))

            # Print losses every epoch
            print('Epoch %d -- Train loss: %.4f, Validation loss: %.4f, Elapsed time: %.2f sec' %\
             (epoch+1, train_loss, valid_loss, time.time() - last_time))
            last_time = time.time()

            if SAVE_MODEL and SAVE_MODEL_EVERY_EPOCH:
                saver.save(sess, MODEL_SAVE_PATH)
                # Keep the loss history in step with the checkpoint: RESUME
                # loads both, so a checkpoint without a matching history
                # would make resuming an interrupted run fail or continue
                # from a stale history.
                with open('loss_history.p', 'wb') as f:
                    pickle.dump(loss_history, f)

        total_time = time.time() - train_start_time
        print('Total elapsed time: %d min %d sec' %
              (total_time / 60, total_time % 60))

        test_loss = 0.  # TODO: Add test set

        if SAVE_MODEL:
            # Save model to disk
            save_path = saver.save(sess, MODEL_SAVE_PATH)
            print('Trained model saved at: %s' % save_path)

            # Also save loss history
            print('Loss history saved at loss_history.p')
            with open('loss_history.p', 'wb') as f:
                pickle.dump(loss_history, f)

    # Return the placeholder test loss and the loss history
    return test_loss, loss_history
Ejemplo n.º 8
0
'''
Visualize the model using TensorBoard
'''
import tensorflow as tf
from settings import *
from model import SSDModel

FM_ONLY = False  # Only want to see feature map sizes?

# Build the graph inside a fresh tf.Graph and write it to ./tensorboard_out
# so that it can be inspected with TensorBoard.
with tf.Graph().as_default(), tf.Session() as sess:
    if FM_ONLY:
        # Only want to see feature map sizes (e.g. loss function and vector concatenation not yet set up)
        if MODEL == 'AlexNet':
            from model import AlexNet as MyModel
        else:
            raise NotImplementedError('Model %s not supported' % MODEL)
        _ = MyModel()
    else:
        # This includes the entire graph, e.g. loss function, optimizer, etc.
        _ = SSDModel()

    tf.summary.merge_all()
    writer = tf.summary.FileWriter('./tensorboard_out', sess.graph)
    tf.global_variables_initializer().run()

    # Flush and close the event file; without this the graph event may still
    # be pending in the writer when the script exits.
    writer.close()