Beispiel #1
0
def main():
    """Main function for distributed inference.

    Parses CLI arguments and the config file, validates the checkpoint path,
    configures CUDNN, initializes distributed execution, builds a logger
    (a real logger on rank 0 only), loads the checkpoint into a runner, and
    optionally runs image synthesis and/or FID evaluation.
    """
    # Parse arguments.
    args = parse_args()

    # Parse configurations and merge in command-line overrides.
    config = parse_config(args.config)
    config = update_config(config, args.options)
    config.work_dir = args.work_dir
    config.checkpoint = args.checkpoint
    config.launcher = args.launcher
    config.backend = args.backend
    # Fail fast, before any distributed setup, if the checkpoint is absent.
    if not os.path.isfile(config.checkpoint):
        raise FileNotFoundError(f'Checkpoint file `{config.checkpoint}` is '
                                f'missing!')

    # Set CUDNN.
    config.cudnn_benchmark = config.get('cudnn_benchmark', True)
    config.cudnn_deterministic = config.get('cudnn_deterministic', False)
    torch.backends.cudnn.benchmark = config.cudnn_benchmark
    torch.backends.cudnn.deterministic = config.cudnn_deterministic

    # Setting for launcher.
    config.is_distributed = True
    init_dist(config.launcher, backend=config.backend)
    config.num_gpus = dist.get_world_size()

    # Setup logger. Only rank 0 writes real logs (and archives the config and
    # the current commit id); other ranks get a no-op ('dumb') logger to avoid
    # duplicated output.
    if dist.get_rank() == 0:
        logger_type = config.get('logger_type', 'normal')
        logger = build_logger(logger_type, work_dir=config.work_dir)
        shutil.copy(args.config, os.path.join(config.work_dir, 'config.py'))
        commit_id = os.popen('git rev-parse HEAD').readline()
        logger.info(f'Commit ID: {commit_id}')
    else:
        logger = build_logger('dumb', work_dir=config.work_dir)

    # Start inference: load weights only (no training state restored).
    runner = getattr(runners, config.runner_type)(config, logger)
    runner.load(filepath=config.checkpoint,
                running_metadata=False,
                learning_rate=False,
                optimizer=False,
                running_stats=False)

    if args.synthesis_num > 0:
        num = args.synthesis_num
        logger.print()
        # Plain string literals: the originals were f-strings without
        # placeholders (ruff F541).
        logger.info('Synthesizing images ...')
        runner.synthesize(num, html_name=f'synthesis_{num}.html')
        logger.info(f'Finish synthesizing {num} images.')

    if args.fid_num > 0:
        num = args.fid_num
        logger.print()
        logger.info('Testing FID ...')
        fid_value = runner.fid(num, align_tf=not args.use_torchvision)
        logger.info(f'Finish testing FID on {num} samples. '
                    f'The result is {fid_value:.6f}.')
Beispiel #2
0
def main():
    """Parse the config, prepare a timestamped work dir and logger, run SeFa."""
    # Command-line arguments and configuration.
    args = parse_args()
    config = parse_config(args.config)
    os.environ['CUDA_VISIBLE_DEVICES'] = config.gpus

    # Timestamped sub-directory name (year/month/day unpadded, time fields
    # padded), nested under the checkpoint's experiment name.
    now = datetime.datetime.now()
    version = '%d-%d-%d-%02.0d-%02.0d-%02.0d' % (
        now.year, now.month, now.day, now.hour, now.minute, now.second)
    experiment_name = config.checkpoint_path.split('/')[-3]
    config.work_dir = os.path.join(config.work_dir, experiment_name, version)

    # Logger, plus a copy of the config and the current git commit id for
    # reproducibility.
    logger = build_logger(config.get('logger_type', 'normal'),
                          work_dir=config.work_dir)
    shutil.copy(args.config, os.path.join(config.work_dir, 'config.py'))
    commit_id = os.popen('git rev-parse HEAD').readline()
    logger.info(f'Commit ID: {commit_id}')

    # Run the SeFa analysis.
    runner = SefaRunner(config, logger)
    runner.run()
Beispiel #3
0
def main():
    """Main function."""
    # Command-line arguments.
    args = parse_args()

    # Configuration: file contents merged with command-line overrides, plus a
    # timestamped work directory.
    config = parse_config(args.config)
    config = update_config(config, args.options)
    os.environ['CUDA_VISIBLE_DEVICES'] = config.gpus
    now = datetime.datetime.now()
    version = '%d-%d-%d-%02.0d-%02.0d-%02.0d' % (
        now.year, now.month, now.day, now.hour, now.minute, now.second)
    config.work_dir = os.path.join(args.work_dir, version)
    config.resume_path = args.resume_path
    config.weight_path = args.weight_path
    config.seed = args.seed
    config.launcher = args.launcher
    config.backend = args.backend

    # CUDNN flags (benchmark on, deterministic off unless configured).
    config.cudnn_benchmark = config.get('cudnn_benchmark', True)
    config.cudnn_deterministic = config.get('cudnn_deterministic', False)
    torch.backends.cudnn.benchmark = config.cudnn_benchmark
    torch.backends.cudnn.deterministic = config.cudnn_deterministic

    # Seed all RNGs if requested; this also forces CUDNN determinism.
    if config.seed is not None:
        random.seed(config.seed)
        np.random.seed(config.seed)
        torch.manual_seed(config.seed)
        config.cudnn_deterministic = True
        torch.backends.cudnn.deterministic = True
        warnings.warn('Random seed is set for training! '
                      'This will turn on the CUDNN deterministic setting, '
                      'which may slow down the training considerably! '
                      'Unexpected behavior can be observed when resuming from '
                      'checkpoints.')

    # Distributed launcher.
    config.is_distributed = True
    init_dist(config.launcher, backend=config.backend)
    config.num_gpus = dist.get_world_size()

    # Logger: full logger on rank 0 (which also archives the config and the
    # current commit id), silent ('dumb') logger elsewhere.
    if dist.get_rank() == 0:
        logger = build_logger(config.get('logger_type', 'normal'),
                              work_dir=config.work_dir)
        shutil.copy(args.config, os.path.join(config.work_dir, 'config.py'))
        commit_id = os.popen('git rev-parse HEAD').readline()
        logger.info(f'Commit ID: {commit_id}')
    else:
        logger = build_logger('dumb', work_dir=config.work_dir)

    # Build the runner, optionally restore state, then train.
    runner = getattr(runners, config.runner_type)(config, logger)
    if config.resume_path:
        # Full resume: metadata, LR schedule and optimizer state included.
        runner.load(filepath=config.resume_path,
                    running_metadata=True,
                    learning_rate=True,
                    optimizer=True,
                    running_stats=False)
    if config.weight_path:
        # Weights only.
        runner.load(filepath=config.weight_path,
                    running_metadata=False,
                    learning_rate=False,
                    optimizer=False,
                    running_stats=False)
    runner.train()
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline for an RPN/RCNN/mask detection network.

    Runs SGD with warmup and step LR decay over the five loss heads
    (RPN conf/box, RCNN class/box, mask), logs running metrics, validates
    every ``args.val_interval`` epochs and saves parameters via
    ``save_params``.

    Args:
        net: the detection network (gluon block with RPN/RCNN/mask heads).
        train_data: training data loader.
        val_data: validation data loader.
        eval_metric: metric object consumed by ``validate``.
        ctx: list of mxnet contexts (devices) to train on.
        args: parsed command-line hyper-parameters.

    Fix applied: removed stray ``self.logger`` / ``pixAcc`` / ``mIoU`` lines
    that referenced names undefined in this free function (residue pasted
    from a segmentation Trainer method) and raised NameError every epoch.
    """
    # Freeze all parameters, then re-enable gradients only for the trainable
    # subset (fixed batchnorm / first stage stay frozen).
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    trainer = gluon.Trainer(
        net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {
            'learning_rate': args.lr,
            'wd': args.wd,
            'momentum': args.momentum,
            'clip_gradient': 5
        })

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    # Loss functions for the five heads.
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss()  # == smoothl1
    rcnn_mask_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False)
    # Running averages of the raw loss values.
    metrics = [
        mx.metric.Loss('RPN_Conf'),
        mx.metric.Loss('RPN_SmoothL1'),
        mx.metric.Loss('RCNN_CrossEntropy'),
        mx.metric.Loss('RCNN_SmoothL1'),
        mx.metric.Loss('RCNN_Mask')
    ]

    # Accuracy / L1 metrics computed from (targets, predictions) pairs.
    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    rcnn_mask_metric = MaskAccMetric()
    rcnn_fgmask_metric = MaskFGAccMetric()
    metrics2 = [
        rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric,
        rcnn_mask_metric, rcnn_fgmask_metric
    ]

    # set up logger
    log_file_path = args.save_prefix + '_train.log'
    logger = build_logger(log_file_path)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]  # mutable so save_params can update the best mAP in place
    for epoch in range(args.start_epoch, args.epochs):
        # Apply every LR decay step whose epoch threshold has been reached.
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        if not args.disable_hybridization:
            net.hybridize(static_alloc=args.static_alloc)
        base_lr = trainer.learning_rate
        for i, batch in enumerate(train_data):
            # Linear LR warmup over the first `lr_warmup` iterations.
            if epoch == 0 and i <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup)
                if new_lr != trainer.learning_rate:
                    if i % args.log_interval == 0:
                        logger.info(
                            '[Epoch 0 Iteration {}] Set learning rate to {}'.
                            format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
            batch_size = len(batch[0])
            losses = []
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            with autograd.record():
                # One slice of the batch per device.
                for data, label, gt_mask, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(
                        *batch):
                    gt_label = label[:, :, 4:5]
                    gt_box = label[:, :, :4]
                    cls_pred, box_pred, mask_pred, roi, samples, matches, rpn_score, rpn_box, anchors = net(
                        data, gt_box)
                    # losses of rpn
                    rpn_score = rpn_score.squeeze(axis=-1)
                    num_rpn_pos = (rpn_cls_targets >= 0).sum()
                    rpn_loss1 = rpn_cls_loss(
                        rpn_score, rpn_cls_targets, rpn_cls_targets >=
                        0) * rpn_cls_targets.size / num_rpn_pos
                    rpn_loss2 = rpn_box_loss(
                        rpn_box, rpn_box_targets,
                        rpn_box_masks) * rpn_box.size / num_rpn_pos
                    # rpn overall loss, use sum rather than average
                    rpn_loss = rpn_loss1 + rpn_loss2
                    # generate targets for rcnn
                    cls_targets, box_targets, box_masks = net.target_generator(
                        roi, samples, matches, gt_label, gt_box)
                    # losses of rcnn
                    num_rcnn_pos = (cls_targets >= 0).sum()
                    rcnn_loss1 = rcnn_cls_loss(cls_pred, cls_targets,
                                               cls_targets >= 0) * cls_targets.size / \
                                 cls_targets.shape[0] / num_rcnn_pos
                    rcnn_loss2 = rcnn_box_loss(box_pred, box_targets, box_masks) * box_pred.size / \
                                 box_pred.shape[0] / num_rcnn_pos
                    rcnn_loss = rcnn_loss1 + rcnn_loss2
                    # generate targets for mask
                    mask_targets, mask_masks = net.mask_target(
                        roi, gt_mask, matches, cls_targets)
                    # loss of mask
                    mask_loss = rcnn_mask_loss(mask_pred, mask_targets, mask_masks) * \
                                mask_targets.size / mask_targets.shape[0] / mask_masks.sum()
                    # overall losses
                    losses.append(rpn_loss.sum() + rcnn_loss.sum() +
                                  mask_loss.sum())
                    metric_losses[0].append(rpn_loss1.sum())
                    metric_losses[1].append(rpn_loss2.sum())
                    metric_losses[2].append(rcnn_loss1.sum())
                    metric_losses[3].append(rcnn_loss2.sum())
                    metric_losses[4].append(mask_loss.sum())
                    add_losses[0].append(
                        [[rpn_cls_targets, rpn_cls_targets >= 0], [rpn_score]])
                    add_losses[1].append([[rpn_box_targets, rpn_box_masks],
                                          [rpn_box]])
                    add_losses[2].append([[cls_targets], [cls_pred]])
                    add_losses[3].append([[box_targets, box_masks],
                                          [box_pred]])
                    add_losses[4].append([[mask_targets, mask_masks],
                                          [mask_pred]])
                    add_losses[5].append([[mask_targets, mask_masks],
                                          [mask_pred]])
                autograd.backward(losses)
                for metric, record in zip(metrics, metric_losses):
                    metric.update(0, record)
                for metric, records in zip(metrics2, add_losses):
                    for pred in records:
                        metric.update(pred[0], pred[1])
            trainer.step(batch_size)
            # update metrics
            if args.log_interval and not (i + 1) % args.log_interval:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in metrics + metrics2
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                    format(
                        epoch, i,
                        args.log_interval * batch_size / (time.time() - btic),
                        msg))
                btic = time.time()

        msg = ','.join(
            ['{}={:.3f}'.format(*metric.get()) for metric in metrics])
        logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
            epoch, (time.time() - tic), msg))
        if not (epoch + 1) % args.val_interval:
            # consider reduce the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric, args)
            val_msg = '\n'.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, logger, best_map, current_map, epoch,
                    args.save_interval, args.save_prefix)


if __name__ == "__main__":
    from utils.argument import parse_args_for_segm as parse_args
    from utils.logger import build_logger
    from utils.custom_load import make_save_dir, save_checkpoint

    # Parse CLI options and prepare the output directory plus a file logger.
    cli_args = parse_args()
    out_dir = make_save_dir(cli_args)
    logger = build_logger(os.path.join(out_dir, 'train.log'), True)
    logger.info(cli_args)

    trainer = Trainer(cli_args, logger)
    if cli_args.eval:
        # Evaluation-only mode: validate the resumed checkpoint once.
        logger.info('Evaluating model: {}'.format(cli_args.resume))
        trainer.validation(cli_args.start_epoch)
    else:
        # Training mode: train every epoch, optionally validate, and always
        # checkpoint afterwards.
        logger.info('Starting Epoch:{}'.format(cli_args.start_epoch))
        logger.info('Total Epochs: {}'.format(cli_args.epochs))
        for ep in range(cli_args.start_epoch, cli_args.epochs):
            trainer.training(ep)
            if not trainer.args.no_val:
                trainer.validation(ep)
            # save every epoch
            save_checkpoint(trainer.net.module, out_dir, trainer.is_best, ep)
Beispiel #6
0
def main():
    """Train, evaluate and test the FormicID model described by the config.

    Loads the experiment config, sets up file logging and a TF session,
    restores pre-trained weights, trains, saves the final weights, then runs
    evaluation, prediction reports and a confusion-matrix plot.
    """
    # Arguments
    ###########################################################################
    try:
        args = get_args()
        config = process_config(args.config)
    except Exception:
        # Was a bare `except:` followed by exit(0): that also swallowed
        # SystemExit/KeyboardInterrupt and reported *success* on failure.
        # Catch only real errors and exit with a non-zero status.
        logging.error("Missing or invalid arguments.")
        raise SystemExit(1)

    # Logging
    ###########################################################################
    logging.basicConfig(
        filename=os.path.join("logs", config.exp_name + ".log"),
        format="[%(asctime)s] - [%(levelname)s]: %(message)s",
        filemode="a",
        level=logging.DEBUG,
    )
    logging.info("Logging started.")
    logging.info("Keras version: {}".format(keras_version))

    # Session
    ###########################################################################
    sess = tf.Session()
    K.set_session(sess)

    # create experiment related directories
    ###########################################################################
    create_dirs([config.summary_dir, config.checkpoint_dir])

    # Initialize the model
    ###########################################################################
    # NOTE(review): the weights path below is hard-coded to one experiment;
    # consider moving it into the config. Kept as-is to preserve behavior.
    model_formicID = load_model(config=config, num_species=97)
    model_formicID = compile_model(model=model_formicID, config=config)
    model_formicID = weights_load(
        model=model_formicID,
        weights=
        "experiments/T97_CaAll_QuM_ShSti_AugM_D05_LR0001_E200_I4_def_clean/checkpoint/weights_55-1.76.hdf5",
    )

    # Training in batches with iterator
    ###########################################################################
    history = trainer_dir(
        model=model_formicID,
        config=config,
        callbacks=build_logger(config=config, model=model_formicID),
    )
    save_model(model=model_formicID,
               filename="final_weights.hdf5",
               config=config)

    # Evaluation
    ###########################################################################
    plot_history(history=history, config=config, theme="ggplot", save=None)
    evaluator(model=model_formicID, config=config, test_dir=None)

    # Testing
    ###########################################################################
    Y_true, Y_pred, labels, species_dict = predictor(
        model=model_formicID,
        config=config,
        # species_json="data/species_dict.json",
        plot=True,
        n_img=10,
        n_cols=3,
    )
    predictor_reports(
        Y_true=Y_true,
        Y_pred=Y_pred,
        config=config,
        species_dict=species_dict,
        target_names=labels,
        digits=5,
    )
    plot_confusion_matrix(
        Y_pred=Y_pred,
        Y_true=Y_true,
        config=config,
        target_names=labels,
        species_dict=species_dict,
        title=None,
        cmap="viridis",
        normalize=True,
        scores=True,
        score_size=8,
        save="confusion_matrix.png",
    )
    # Footer
    ###########################################################################
    K.clear_session()
    logging.info("Logging ended.")
Beispiel #7
0
def main():
    """Main function for distributed training with loss/LR overrides.

    Parses arguments and the config, applies optional generator-loss and
    learning-rate overrides, derives a save name from those overrides, seeds
    all RNGs, initializes distributed execution and a rank-aware logger, then
    builds the runner and trains (optionally resuming or loading weights).
    """
    # Parse arguments.
    args = parse_args()

    # Parse configurations.
    config = parse_config(args.config)
    config = update_config(config, args.options)
    config.work_dir = args.work_dir
    config.resume_path = args.resume_path
    config.weight_path = args.weight_path
    config.seed = args.seed
    config.launcher = args.launcher
    config.backend = args.backend
    # Optional loss / optimizer overrides. (PEP 8: `is not None`, not the
    # original `!= None`.)
    if args.adv is not None:
        config.loss['g_loss_kwargs']['adv'] = float(args.adv)
    if args.lamb is not None:
        config.loss['g_loss_kwargs']['lamb'] = float(args.lamb)
    if args.metric is not None:
        config.loss['g_loss_kwargs']['metric'] = args.metric
    if args.baseLR is not None:
        # NOTE(review): the base LR is halved here — presumably a scaling
        # convention for this setup; confirm against the launch scripts.
        config.modules['generator']['opt']['base_lr'] = float(args.baseLR) / 2
    if args.nethz is not None:
        config.nethz = args.nethz

    # The save name needs all four override values. The original code built it
    # unconditionally and crashed with an opaque TypeError when any was None
    # (despite the guards above); fail with a clear error instead.
    required = {'adv': args.adv, 'lamb': args.lamb,
                'metric': args.metric, 'baseLR': args.baseLR}
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise ValueError('Arguments required for savename are missing: '
                         + ', '.join(missing))
    config.savename = (args.adv + '_' + args.lamb.replace('.', 'dot') + '_' +
                       args.metric.replace('.', 'dot') + '_' +
                       args.baseLR.replace('.', 'dot'))

    # Dataset roots live on the cluster scratch space of the given user.
    config.data['train'][
        'root_dir'] = '/cluster/scratch/' + config.nethz + '/data'
    config.data['val'][
        'root_dir'] = '/cluster/scratch/' + config.nethz + '/data'

    # Set CUDNN.
    config.cudnn_benchmark = config.get('cudnn_benchmark', True)
    config.cudnn_deterministic = config.get('cudnn_deterministic', False)
    torch.backends.cudnn.benchmark = config.cudnn_benchmark
    torch.backends.cudnn.deterministic = config.cudnn_deterministic

    # Set random seed.
    # NOTE(review): this unconditionally overrides the seed passed via
    # --seed / the config — confirm the fixed seed 26 is intentional.
    config.seed = 26
    if config.seed is not None:
        random.seed(config.seed)
        np.random.seed(config.seed)
        torch.manual_seed(config.seed)
        config.cudnn_deterministic = True
        torch.backends.cudnn.deterministic = True
        warnings.warn('Random seed is set for training! '
                      'This will turn on the CUDNN deterministic setting, '
                      'which may slow down the training considerably! '
                      'Unexpected behavior can be observed when resuming from '
                      'checkpoints.')

    # Set launcher.
    config.is_distributed = True
    init_dist(config.launcher, backend=config.backend)
    config.num_gpus = dist.get_world_size()

    # Setup logger: a real logger on rank 0 (which also archives the config
    # and commit id), a no-op ('dumb') logger on other ranks.
    if dist.get_rank() == 0:
        logger_type = config.get('logger_type', 'normal')
        logger = build_logger(logger_type, work_dir=config.work_dir)
        shutil.copy(args.config, os.path.join(config.work_dir, 'config.py'))
        commit_id = os.popen('git rev-parse HEAD').readline()
        logger.info(f'Commit ID: {commit_id}')
    else:
        logger = build_logger('dumb', work_dir=config.work_dir)

    # Start training.
    runner = getattr(runners, config.runner_type)(config, logger)
    if config.resume_path:
        # Full resume: metadata, LR schedule and optimizer state included.
        runner.load(filepath=config.resume_path,
                    running_metadata=True,
                    learning_rate=True,
                    optimizer=True,
                    running_stats=False)
    if config.weight_path:
        # Weights only.
        runner.load(filepath=config.weight_path,
                    running_metadata=False,
                    learning_rate=False,
                    optimizer=False,
                    running_stats=False)
    runner.train()