def main(cfg):
    ######################################################################################
    # Create Training Data
    ######################################################################################
    datasets = build_dataset(cfg.data.train)
    tf_datasets = [
        build_dataloader(datasets,
                         cfg.batch_size_per_device,
                         cfg.workers_per_gpu,
                         num_gpus=hvd.size(),
                         dist=True)
    ]
    ######################################################################################
    # Build Model
    ######################################################################################
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    # Pass example through so tensor shapes are defined
    model.CLASSES = datasets.CLASSES
    _ = model(next(iter(tf_datasets[0][0])))
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)
    ######################################################################################
    # Create Model Runner
    ######################################################################################
    runner = sagemaker_runner.Runner(model,
                                     batch_processor,
                                     name=cfg.model_name,
                                     optimizer=cfg.optimizer,
                                     work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level),
                                     amp_enabled=cfg.fp16,
                                     loss_weights=cfg.loss_weights)
    runner.timestamp = int(time())
    ######################################################################################
    # Setup Training Hooks
    ######################################################################################
    runner.register_hook(
        checkpoint.CheckpointHook(interval=cfg.checkpoint_interval,
                                  out_dir=cfg.outputs_path,
                                  s3_dir=None))
    runner.register_hook(
        CocoDistEvalmAPHook(cfg.data.val, interval=cfg.evaluation_interval))
    runner.register_hook(iter_timer.IterTimerHook())
    runner.register_hook(text.TextLoggerHook())
    runner.register_hook(
        visualizer.Visualizer(cfg.data.val, interval=100, top_k=10))
    runner.register_hook(
        tensorboard.TensorboardLoggerHook(log_dir=cfg.outputs_path,
                                          interval=10,
                                          image_interval=100,
                                          s3_dir=None))
    ######################################################################################
    # Run Model
    ######################################################################################
    runner.run(tf_datasets, cfg.workflow, cfg.training_epochs)
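
# A minimal sketch (not part of the original example) of the per-worker sharding that a
# distributed dataloader such as build_dataloader(..., num_gpus=hvd.size(), dist=True) is
# commonly expected to perform with Horovod; `parse_example` is a hypothetical record parser.
def sharded_dataset_sketch(file_pattern, batch_size, parse_example):
    import tensorflow as tf
    import horovod.tensorflow as hvd
    ds = tf.data.Dataset.list_files(file_pattern, shuffle=True, seed=42)
    # give each Horovod worker a disjoint slice of the input files
    ds = ds.shard(num_shards=hvd.size(), index=hvd.rank())
    ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=4,
                       num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.map(parse_example, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds.prefetch(tf.data.AUTOTUNE)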
Example #2
def main():
    args = parse_args()
    num_gpus = len(gpus)  # `gpus` is assumed to be a module-level list of visible GPUs
    cfg = Config.fromfile(args.config)
    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from

    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        total_bs = len(gpus) * cfg.data.imgs_per_gpu
        cfg.optimizer['learning_rate'] = \
            cfg.optimizer['learning_rate'] * total_bs / 8

    # init distributed env first, since logger depends on the dist info.
    init_dist()

    if not gpus:
        distributed = False  # single node single gpu
    else:
        distributed = True

    # create work_dir
    mkdir_or_exist(osp.abspath(cfg.work_dir))
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, '{}.log'.format(timestamp))
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    # log some basic info
    logger.info('Distributed training: {}'.format(distributed))
    logger.info('TF MMDetection Version: {}'.format(__version__))
    logger.info('Config:\n{}'.format(cfg.text))
    logger.info('Tensorflow version: {}'.format(tf.version.VERSION))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}, deterministic: {}'.format(
            args.seed, args.deterministic))
        set_random_seed(args.seed + get_dist_info()[0], deterministic=args.deterministic)

    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)

    # dummy data to init network
    padded_img_side = max(cfg.data.train['scale'])
    img = tf.random.uniform(shape=[padded_img_side, padded_img_side, 3], dtype=tf.float32)
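    # The 11 values below appear to encode (ori_h, ori_w, ori_c, resized_h, resized_w,
    # resized_c, pad_h, pad_w, pad_c, scale_factor, flip); 800 / 465 ≈ 1.7204301 matches
    # the scale_factor slot, but this layout is inferred rather than documented here.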
    img_meta = tf.constant(
        [465., 640., 3., 800., 1101., 3., float(padded_img_side), float(padded_img_side), 3., 1.7204301, 0.],
        dtype=tf.float32)
    # bboxes = tf.constant([[1.0, 1.0, 10.0, 10.0]], dtype=tf.float32)
    # labels = tf.constant([1], dtype=tf.int32)
    _ = model((tf.expand_dims(img, axis=0), tf.expand_dims(img_meta, axis=0)),
              training=False)

    # print('BEFORE:', model.layers[0].layers[0].get_weights()[0][0,0,0,:])
    weights_path = cfg.model['backbone']['weights_path']
    logger.info('Loading weights from: {}'.format(weights_path))
    model.layers[0].layers[0].load_weights(weights_path, by_name=True, skip_mismatch=True)
    # print('AFTER:',model.layers[0].layers[0].get_weights()[0][0,0,0,:])

    print_model_info(model, logger)

    # only a single training workflow is supported; validation is enabled via --validate
    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) > 1:
        raise NotImplementedError

    train_detector(model,
                   datasets,
                   cfg,
                   num_gpus=num_gpus,
                   distributed=distributed,
                   mixed_precision=args.amp,
                   validate=args.validate,
                   timestamp=timestamp)
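
# A minimal sketch of the linear scaling rule applied above when --autoscale_lr is set
# (Goyal et al., https://arxiv.org/abs/1706.02677): the learning rate is scaled in
# proportion to the global batch size relative to a reference batch size of 8.
def linearly_scaled_lr(base_lr, num_gpus, imgs_per_gpu, reference_batch_size=8):
    total_batch_size = num_gpus * imgs_per_gpu
    return base_lr * total_batch_size / reference_batch_size

# e.g. base_lr=0.02 with 8 GPUs x 2 imgs_per_gpu -> 0.02 * 16 / 8 = 0.04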
Example #3
def main():
    args = parse_args()
    print(args)
    assert args.out or args.json_out, \
        ('Please specify at least one operation to save the results '
         'with the argument "--out" or "--json_out"')

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    if args.json_out is not None and args.json_out.endswith('.json'):
        args.json_out = args.json_out[:-5]

    cfg = Config.fromfile(args.config)

    cfg.model.pretrained = None

    distributed = False

    # build the dataloader
    # TODO: support multiple images per gpu (only minor changes are needed)
    dataset = build_dataset(cfg.data.test)

    # build the model and load checkpoint
    model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)

    # dummy data to init network
    img = tf.random.uniform(shape=[1333, 1333, 3], dtype=tf.float32)
    img_meta = tf.constant(
        [465., 640., 3., 800., 1101., 3., 1333., 1333., 3., 1.7204301, 0.],
        dtype=tf.float32)

    _ = model((tf.expand_dims(img, axis=0), tf.expand_dims(img_meta, axis=0)),
              training=False)

    load_checkpoint(model, args.checkpoint)
    model.CLASSES = dataset.CLASSES

    if not distributed:
        outputs = single_gpu_test(model, dataset)
    else:
        raise NotImplementedError

    rank, _, _, _ = get_dist_info()

    if args.out and rank == 0:
        print('\nwriting results to {}'.format(args.out))
        fileio.dump(outputs, args.out)
        eval_types = args.eval
        if eval_types:
            print('Starting to evaluate {}'.format(' and '.join(eval_types)))
            if eval_types == ['proposal_fast']:
                result_file = args.out
                coco_eval(result_file, eval_types, dataset.coco)
            else:
                if not isinstance(outputs[0], dict):
                    result_files = results2json(dataset, outputs, args.out)
                    coco_eval(result_files, eval_types, dataset.coco)
                else:
                    for name in outputs[0]:
                        print('\nEvaluating {}'.format(name))
                        outputs_ = [out[name] for out in outputs]
                        result_file = args.out + '.{}'.format(name)
                        result_files = results2json(dataset, outputs_,
                                                    result_file)
                        coco_eval(result_files, eval_types, dataset.coco)

    # Save predictions in the COCO json format
    if args.json_out and rank == 0:
        if not isinstance(outputs[0], dict):
            results2json(dataset, outputs, args.json_out)
        else:
            for name in outputs[0]:
                outputs_ = [out[name] for out in outputs]
                result_file = args.json_out + '.{}'.format(name)
                results2json(dataset, outputs_, result_file)
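
# A minimal sketch (assuming pycocotools is installed) of the evaluation step that a
# coco_eval-style helper performs on a results JSON such as the files written by
# results2json above; ann_file and res_file are placeholder paths.
def evaluate_bbox_results(ann_file, res_file):
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval
    coco_gt = COCO(ann_file)              # ground-truth annotations
    coco_dt = coco_gt.loadRes(res_file)   # detections in COCO result format
    coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()                 # prints the standard AP/AR table
    return coco_eval.stats                # stats[0] is mAP @ IoU=0.50:0.95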
Example #4
def main(cfg):
    decompress_data(cfg)
    ######################################################################################
    # Create Training Data
    ######################################################################################
    cfg.global_batch_size = cfg.batch_size_per_device * hvd.size()
    cfg.steps_per_epoch = cfg.coco_images // cfg.global_batch_size

    datasets = build_dataset(cfg.data.train)
    tf_datasets = [
        build_dataloader(datasets,
                         cfg.batch_size_per_device,
                         cfg.workers_per_gpu,
                         num_gpus=hvd.size(),
                         dist=True)
    ]
    ######################################################################################
    # Build Model
    ######################################################################################

    # update any hyperparameters that may have been passed in via CLI arguments
    if cfg.ls > 0.0:
        cfg.model['bbox_head']['label_smoothing'] = cfg.ls
    if cfg.use_rcnn_bn:
        cfg.model['bbox_head']['use_bn'] = cfg.use_rcnn_bn
    if cfg.use_conv:
        cfg.model['bbox_head']['use_conv'] = cfg.use_conv

    # NOTE: `args` is assumed to be parsed at module scope, since main() only receives cfg
    cfg.schedule = args.schedule
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    # Pass example through so tensor shapes are defined
    _ = model(next(iter(tf_datasets[0][0])))
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)

    ######################################################################################
    # Build optimizer and associate scheduler
    ######################################################################################

    # base learning rate is set for global batch size of 8, with linear scaling for larger batches
    base_learning_rate = cfg.base_learning_rate
    scaled_learning_rate = base_learning_rate * cfg.global_batch_size / 8
    steps_per_epoch = cfg.steps_per_epoch
    if cfg.schedule == '1x':
        scheduler = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            [steps_per_epoch * 8, steps_per_epoch * 10], [
                scaled_learning_rate, scaled_learning_rate * 0.1,
                scaled_learning_rate * 0.01
            ])
    elif cfg.schedule == 'cosine':
        scheduler = tf.keras.experimental.CosineDecayRestarts(
            initial_learning_rate=scaled_learning_rate,
            first_decay_steps=12 * steps_per_epoch,
            t_mul=1,
            m_mul=1)  #0-1-13
    else:
        raise NotImplementedError
    warmup_init_lr = 1.0 / cfg.warmup_init_lr_scale * scaled_learning_rate
    scheduler = WarmupScheduler(scheduler, warmup_init_lr, cfg.warmup_steps)
    # FIXME: currently hardcoded to SGD
    optimizer = tf.keras.optimizers.SGD(scheduler,
                                        momentum=0.9,
                                        nesterov=False)
    if cfg.fp16:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    ######################################################################################
    # Create Model Runner
    ######################################################################################
    runner = sagemaker_runner.Runner(model,
                                     batch_processor,
                                     name=cfg.model_name,
                                     optimizer=optimizer,
                                     work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level),
                                     amp_enabled=cfg.fp16,
                                     loss_weights=cfg.loss_weights)
    runner.timestamp = int(time())
    ######################################################################################
    # Setup Training Hooks
    ######################################################################################
    runner.register_hook(
        checkpoint.CheckpointHook(interval=cfg.checkpoint_interval,
                                  out_dir=cfg.outputs_path,
                                  s3_dir=cfg.s3_checkpoints,
                                  h5=True))
    runner.register_hook(
        CocoDistEvalmAPHook(cfg.data.val, interval=cfg.evaluation_interval))
    runner.register_hook(iter_timer.IterTimerHook())
    runner.register_hook(text.TextLoggerHook())
    runner.register_hook(
        visualizer.Visualizer(cfg.data.val, interval=100, top_k=10))
    runner.register_hook(
        tensorboard.TensorboardLoggerHook(log_dir=cfg.outputs_path,
                                          image_interval=100,
                                          s3_dir=cfg.s3_tensorboard))
    ######################################################################################
    # Run Model
    ######################################################################################
    runner.run(tf_datasets, cfg.workflow, cfg.training_epochs)
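
# The WarmupScheduler wrapped around the decay schedule above is not defined in this
# example; the class below is a minimal sketch of one plausible implementation: a linear
# ramp from `init_lr` to the wrapped schedule's value over `warmup_steps`, after which it
# simply defers to the wrapped schedule.
class LinearWarmupSketch(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, schedule, init_lr, warmup_steps):
        super().__init__()
        self.schedule = schedule
        self.init_lr = init_lr
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)
        target_lr = self.schedule(warmup_steps)  # value reached at the end of warmup
        warmup_lr = self.init_lr + (target_lr - self.init_lr) * (step / warmup_steps)
        return tf.cond(step < warmup_steps,
                       lambda: warmup_lr,
                       lambda: self.schedule(step))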
Example #5
def main_sagemaker(args, cfg):
    """
    Main training entry point for jobs launched via SageMaker
    """
    instance_name = cfg.sagemaker_job['job_name']
    s3_path = cfg.sagemaker_job['s3_path']

    decompress_data()  # setup data dirs based on SM CHANNELS

    num_gpus = len(gpus)
    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from

    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        total_bs = get_dist_info()[2] * cfg.data.imgs_per_gpu
        cfg.optimizer['learning_rate'] = \
            cfg.optimizer['learning_rate'] * total_bs / 8

    # init distributed env first, since logger depends on the dist info.
    init_dist()

    if not gpus:
        distributed = False  # single node single gpu
    else:
        distributed = True

    # create work_dir
    mkdir_or_exist(osp.abspath(cfg.work_dir))
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, '{}.log'.format(timestamp))
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    # log some basic info
    logger.info('Distributed training: {}'.format(distributed))
    logger.info('TF MMDetection Version: {}'.format(__version__))
    logger.info('Config:\n{}'.format(cfg.text))
    logger.info('Tensorflow version: {}'.format(tf.version.VERSION))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}, deterministic: {}'.format(
            args.seed, args.deterministic))
        set_random_seed(args.seed + get_dist_info()[0],
                        deterministic=args.deterministic)

    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)

    # dummy data to init network
    padded_img_side = max(cfg.data.train['scale'])
    img = tf.random.uniform(shape=[padded_img_side, padded_img_side, 3],
                            dtype=tf.float32)
    img_meta = tf.constant([
        465., 640., 3., 800., 1101., 3.,
        float(padded_img_side),
        float(padded_img_side), 3., 1.7204301, 0.
    ],
                           dtype=tf.float32)
    # bboxes = tf.constant([[1.0, 1.0, 10.0, 10.0]], dtype=tf.float32)
    # labels = tf.constant([1], dtype=tf.int32)
    _ = model((tf.expand_dims(img, axis=0), tf.expand_dims(img_meta, axis=0)),
              training=False)
    # print('BEFORE:', model.layers[0].layers[0].get_weights()[0][0,0,0,:])

    # sagemaker specific path resolution
    import os, pathlib
    data_root = pathlib.Path(
        os.getenv('SM_CHANNEL_COCO')).joinpath('coco').as_posix()
    cfg.data.train['dataset_dir'] = data_root
    cfg.data.val['dataset_dir'] = data_root
    weights_file = cfg.model['backbone']['weights_path']
    weights_path = pathlib.Path(
        os.getenv('SM_CHANNEL_WEIGHTS')).joinpath(weights_file).as_posix()
    logger.info('Loading weights from: {}'.format(weights_path))
    # older Keras .h5 format from the Keras model zoo
    if osp.splitext(weights_file)[1] == '.h5':
        model.layers[0].layers[0].load_weights(weights_path,
                                               by_name=True,
                                               skip_mismatch=True)
    else:  # SavedModel format assumed - extract weights
        backbone_model = tf.keras.models.load_model(weights_path)
        # load weights if layers match
        for layer_idx, layer in enumerate(backbone_model.layers):
            if layer_idx < len(model.layers[0].layers[0].layers):
                model.layers[0].layers[0].layers[layer_idx].set_weights(
                    layer.get_weights())
                print('Loaded weights for:', layer.name)
        del backbone_model
    # print('AFTER:',model.layers[0].layers[0].get_weights()[0][0,0,0,:])

    print_model_info(model, logger)

    # only a single training workflow is supported; validation is enabled via --validate
    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) > 1:
        raise NotImplementedError

    train_detector(model,
                   datasets,
                   cfg,
                   num_gpus=num_gpus,
                   distributed=distributed,
                   mixed_precision=args.amp,
                   validate=args.validate,
                   timestamp=timestamp)
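
# A minimal sketch of how the SM_CHANNEL_* lookups above resolve on a SageMaker training
# instance: each channel passed to the estimator's fit() call is mounted under
# /opt/ml/input/data/<channel> and exposed via an SM_CHANNEL_<CHANNEL> environment
# variable; the channel names 'coco' and 'weights' are the ones this example assumes.
def resolve_channel(channel_name, *parts):
    import os
    import pathlib
    root = os.getenv('SM_CHANNEL_{}'.format(channel_name.upper()),
                     '/opt/ml/input/data/{}'.format(channel_name))
    return pathlib.Path(root).joinpath(*parts).as_posix()

# e.g. resolve_channel('coco', 'coco') -> '/opt/ml/input/data/coco/coco'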
Example #6
def main_ec2(args, cfg):
    """
    Main training entry point for jobs launched directly on EC2 instances
    """
    # start logger
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, '{}.log'.format(timestamp))
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    num_gpus = len(gpus)
    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.resume_dir is not None:
        if os.path.exists(args.resume_dir):
            logger.info("RESUMING TRAINING")
            # get the latest checkpoint
            all_chkpt = [
                os.path.join(args.resume_dir, d)
                for d in os.listdir(args.resume_dir)
                if os.path.isdir(os.path.join(args.resume_dir, d))
            ]
            if not all_chkpt:
                cfg.resume_from = None
            else:
                latest_chkpt = max(all_chkpt, key=os.path.getmtime)
                # set the latest checkpoint to resume_from
                cfg.resume_from = latest_chkpt
        else:
            logger.info("CHECKPOINT NOT FOUND, RESTARTING TRAINING")
            cfg.resume_from = None

    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        total_bs = get_dist_info()[2] * cfg.data.imgs_per_gpu
        cfg.optimizer['learning_rate'] = \
            cfg.optimizer['learning_rate'] * total_bs / 8

    # init distributed env first, since logger depends on the dist info.
    # init_dist()

    if not gpus:
        distributed = False  # single node single gpu
    else:
        distributed = True

    # create work_dir
    mkdir_or_exist(osp.abspath(cfg.work_dir))
    # log some basic info
    logger.info('Distributed training: {}'.format(distributed))
    logger.info('TF MMDetection Version: {}'.format(__version__))
    logger.info('Config:\n{}'.format(cfg.text))
    logger.info('Tensorflow version: {}'.format(tf.version.VERSION))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}, deterministic: {}'.format(
            args.seed, args.deterministic))
        set_random_seed(args.seed + get_dist_info()[0],
                        deterministic=args.deterministic)

    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)

    # dummy data to init network
    padded_img_side = max(cfg.data.train['scale'])
    img = tf.random.uniform(shape=[padded_img_side, padded_img_side, 3],
                            dtype=tf.float32)
    img_meta = tf.constant([
        465., 640., 3., 800., 1101., 3.,
        float(padded_img_side),
        float(padded_img_side), 3., 1.7204301, 0.
    ],
                           dtype=tf.float32)
    # bboxes = tf.constant([[1.0, 1.0, 10.0, 10.0]], dtype=tf.float32)
    # labels = tf.constant([1], dtype=tf.int32)
    _ = model((tf.expand_dims(img, axis=0), tf.expand_dims(img_meta, axis=0)),
              training=False)
    #model.save('my_model')
    # print('BEFORE:', model.layers[0].layers[0].get_weights()[0][0,0,0,:])
    weights_path = cfg.model['backbone']['weights_path']
    logger.info('Loading weights from: {}'.format(weights_path))
    # older Keras .h5 format from the Keras model zoo
    if osp.splitext(weights_path)[1] == '.h5':
        model.layers[0].layers[0].load_weights(weights_path,
                                               by_name=True,
                                               skip_mismatch=True)
    else:  # SavedModel format assumed - extract weights
        backbone_model = tf.keras.models.load_model(weights_path)
        # load weights if layers match
        for layer_idx, layer in enumerate(backbone_model.layers):
            if layer_idx < len(model.layers[0].layers[0].layers):
                model.layers[0].layers[0].layers[layer_idx].set_weights(
                    layer.get_weights())
                print('Loaded weights for:', layer.name)
        del backbone_model
    # print('AFTER:',model.layers[0].layers[0].get_weights()[0][0,0,0,:])

    print_model_info(model, logger)

    # only a single training workflow is supported; validation is enabled via --validate
    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) > 1:
        raise NotImplementedError

    train_detector(model,
                   datasets,
                   cfg,
                   num_gpus=num_gpus,
                   distributed=distributed,
                   mixed_precision=args.amp,
                   validate=args.validate,
                   timestamp=timestamp)
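
# A minimal sketch of what a set_random_seed helper typically does in TensorFlow: seed
# Python, NumPy and TF, optionally requesting deterministic ops. The per-rank offset used
# above (args.seed + get_dist_info()[0]) keeps each worker's augmentation stream distinct.
def seed_everything(seed, deterministic=False):
    import os
    import random
    import numpy as np
    import tensorflow as tf
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    if deterministic:
        os.environ['TF_DETERMINISTIC_OPS'] = '1'  # best-effort GPU determinism (TF >= 2.1)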