Example #1
def train(cfg,
          local_rank,
          distributed,
          logger=None,
          tblogger=None,
          transfer_weight=False,
          change_lr=False):
    device = torch.device('cuda')

    # create model
    logger.info('Creating model "{}"'.format(cfg.MODEL.ARCHITECTURE))
    model = build_model(cfg).to(device)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=255).to(device)
    optimizer = make_optimizer(cfg, model)
    # model, optimizer = apex.amp.initialize(model, optimizer, opt_level='O2')
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        # model = apex.parallel.DistributedDataParallel(model)
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            broadcast_buffers=True,
        )

    save_to_disk = get_rank() == 0  # only the main (rank-0) process writes checkpoints

    # checkpoint
    arguments = {}
    arguments['iteration'] = 0
    arguments['best_iou'] = 0
    checkpointer = Checkpointer(model, optimizer, scheduler, cfg.LOGS.DIR,
                                save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load(
        f=cfg.MODEL.WEIGHT,
        model_weight_only=transfer_weight,
        change_scheduler=change_lr)
    arguments.update(extra_checkpoint_data)

    # data_loader
    logger.info('Loading dataset "{}"'.format(cfg.DATASETS.TRAIN))
    data_loader = make_data_loader(cfg, 'train', distributed)
    data_loader_val = make_data_loader(cfg, 'val', distributed)

    do_train(cfg,
             model=model,
             data_loader=data_loader,
             optimizer=optimizer,
             scheduler=scheduler,
             criterion=criterion,
             checkpointer=checkpointer,
             device=device,
             arguments=arguments,
             tblogger=tblogger,
             data_loader_val=data_loader_val,
             distributed=distributed)
Example #2
def test(cfg, local_rank, distributed, logger=None):
    device = torch.device('cuda')
    cpu_device = torch.device('cpu')

    # create model
    logger.info("Creating model \"{}\"".format(cfg.MODEL.ARCHITECTURE))
    model = build_model(cfg).to(device)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=255).to(device)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            broadcast_buffers=True,
        )

    # checkpoint
    checkpointer = Checkpointer(model, save_dir=cfg.LOGS.DIR, logger=logger)
    _ = checkpointer.load(f=cfg.MODEL.WEIGHT)

    # data_loader
    logger.info('Loading dataset "{}"'.format(cfg.DATASETS.TEST))
    stage = cfg.DATASETS.TEST.split('_')[-1]  # e.g. 'foo_val' -> 'val'
    data_loader = make_data_loader(cfg, stage, distributed)
    dataset_name = cfg.DATASETS.TEST

    metrics = inference(model, criterion, data_loader, dataset_name, True)

    if is_main_process():
        logger.info("Metrics:")
        for k, v in metrics.items():
            logger.info("{}: {}".format(k, v))
Example #3
    def __init__(self, args):
        kwargs = {'num_workers': 4, 'pin_memory': True}
        self.source_loader, self.target_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)
        self.tbar = tqdm(self.test_loader, desc='\r')
        self.trainer = wgan_trainer(args, 2)
        self.evaluator = Evaluator(2)
        self.best_IoU = {'disc': 0.77, 'cup': 0.65}
        self.attempt = 9.5
        self.validation(args, self.trainer.target_model, self.tbar)
        self.trainer_wgan(args)
Example #4
def train(model: Model, X: torch.Tensor, X_val: torch.Tensor,
          YMin: torch.Tensor, YMin_val: torch.Tensor,
          YMax: torch.Tensor, YMax_val: torch.Tensor,
          N: int = 1024, M: int = 1000, num_epoch: int = 10, lr: float = 1e-3,
          epoch_size: int = 1000, batch_size: int = 32, device: str = 'cuda:0') -> Model:
    """
    Trains model
    :param model: model to train
    :param X: Training data
    :param X_val: validation data
    :param YMin: Training labels for minimums
    :param YMin_val: validation labels for minimums
    :param YMax: Training labels for maximums
    :param YMax_val: validation labels for maximums
    :param N: length of subsequences in data
    :param M: amount of subsequences in data
    :param num_epoch: amount of epochs to train model
    :param lr: learning rate
    :param epoch_size: amount of batches to feed every epoch
    :param batch_size: batch size
    :param device: device
    :return: trained model
    """
    assert X.shape[0] == N*M
    model.to(device)
    opt = optim.Adam(model.parameters(), lr=lr)
    sh = optim.lr_scheduler.StepLR(opt, 1, 0.5)  # halve the LR after every epoch
    for epoch in range(num_epoch):
        with tqdm(total=epoch_size, desc=f'epoch {epoch} of {num_epoch}') as tq:
            model.train()
            train_loader = make_data_loader(X, YMin, YMax, N=N, batch_size=batch_size, num_batches=epoch_size)
            for x in train_loader:  # progress is tracked by the outer tq bar
                loss, pred_, true_ = _train_step(model, opt, [x_.to(device) for x_ in x])
                tq.set_postfix(loss=loss.item(), lr=sh.get_last_lr())
                tq.update()
            sh.step()
            logging.info(f"Training for epoch {epoch} finished")

        model.eval()
        loss, pred, true = _eval(model, [x_.to(device) for x_ in [X_val, YMin_val, YMax_val]])
        true = true.cpu().numpy()
        pred = pred.cpu().numpy()
        print(classification_report(true, pred, labels=[1, 2], target_names=['Min', 'Max']))
        scores = classification_report(true, pred, labels=[1, 2], target_names=['Min', 'Max'], output_dict=True)['micro avg']
        del scores['support']
        scores['loss'] = loss.item()
        logging.info(f"Validation for epoch {epoch} ended, scores are {scores}")
    return model
Example #5
def profile(cfg, logger=None):
    device = torch.device('cuda')

    # create model
    logger.info("Creating model \"{}\"".format(cfg.MODEL.ARCHITECTURE))
    model = build_model(cfg).to(device)
    model.eval()

    # data_loader
    logger.info("Loading dataset \"{}\"".format(cfg.DATASETS.TRAIN))
    data_loader = make_data_loader(cfg, 'train', False)

    # profile
    locs, feats, targets, metadata = next(iter(data_loader))
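    # locs are the batched integer voxel coordinates and feats the matching
    # point features; MinkowskiEngine pairs them in a SparseTensor so the
    # model can run sparse convolutions over them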
    inputs = ME.SparseTensor(feats, coords=locs).to(device)
    targets = targets.to(device, non_blocking=True).long()
    return profiler(model, inputs={'x': inputs, 'y': targets})
Example #6
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument("--local_rank", type=int, default=0)

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://"
        )
        synchronize()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("train", output_dir, get_rank(),
                          filename='train_log.txt')
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    model = Network()
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # LR scaling policy; only handles the case batch_size < SOLVER.IMS_PER_BATCH
    lr_steps, scale_factor = cfg.SOLVER.STEPS, 1.0
    batch_size = num_gpus * cfg.SOLVER.IMS_PER_GPU
    if batch_size < cfg.SOLVER.IMS_PER_BATCH:
        assert cfg.SOLVER.IMS_PER_BATCH % batch_size == 0
        scale_factor = cfg.SOLVER.IMS_PER_BATCH // batch_size
        lr_steps = [step * scale_factor for step in lr_steps]
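    # e.g. IMS_PER_BATCH=16 with an actual batch of 8 gives scale_factor=2:
    # the milestones above are delayed 2x, and make_optimizer below receives
    # 1/scale_factor, presumably to scale the base LR down to match the
    # smaller effective batch (the concrete numbers are illustrative only)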
    optimizer = make_optimizer(cfg, model, 1.0 / scale_factor)
    scheduler = WarmupMultiStepLR(
        optimizer, lr_steps, cfg.SOLVER.GAMMA,
        warmup_factor=cfg.SOLVER.WARMUP_FACTOR,
        warmup_iters=cfg.SOLVER.WARMUP_ITERS,
        warmup_method=cfg.SOLVER.WARMUP_METHOD,
    )

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    checkpoint_dir = os.path.join(cfg.OUTPUT_DIR, 'checkpoints')
    mkdir(checkpoint_dir)

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, checkpoint_dir, save_to_disk, logger
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)
    start_iter = arguments["iteration"]

    data_loader = make_data_loader(
        num_gpus, is_train=True, is_distributed=args.distributed,
        start_iter=start_iter)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)

    model.train()
    start_training_time = time.time()
    end = time.time()

    rcnn_iou_now = cfg.MODEL.DYNAMIC_RCNN.WARMUP_IOU
    rcnn_beta_now = cfg.MODEL.DYNAMIC_RCNN.WARMUP_BETA
    iteration_count = cfg.MODEL.DYNAMIC_RCNN.ITERATION_COUNT
    S_I, S_E = [], []
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):

        if any(len(target) < 1 for target in targets):
            logger.error(
                "Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}")
            continue
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict, rcnn_iou_new, rcnn_error_new = model(
            images, targets, rcnn_iou=rcnn_iou_now, rcnn_beta=rcnn_beta_now)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        def reduce_loss_dict(loss_dict):
            """
            Reduce the loss dictionary from all processes so that process with rank
            0 has the averaged results. Returns a dict with the same fields as
            loss_dict, after reduction.
            """
            world_size = get_world_size()
            if world_size < 2:
                return loss_dict
            with torch.no_grad():
                loss_names = []
                all_losses = []
                for k in sorted(loss_dict.keys()):
                    loss_names.append(k)
                    all_losses.append(loss_dict[k])
                all_losses = torch.stack(all_losses, dim=0)
                dist.reduce(all_losses, dst=0)
                if dist.get_rank() == 0:
                    # dist.reduce sums onto rank 0 only, so only rank 0
                    # divides by world_size
                    all_losses /= world_size
                reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
            return reduced_losses

        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        S_I.append(rcnn_iou_new)
        S_E.append(rcnn_error_new)
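        # Dynamic R-CNN style adaptation: every iteration_count steps the IoU
        # threshold becomes the mean of the collected IoU statistics (floored
        # at the warm-up value) and beta becomes the median of the collected
        # regression errors (capped at the warm-up value)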
        if iteration % iteration_count == 0:
            rcnn_iou_now = max(sum(S_I) / iteration_count,
                               cfg.MODEL.DYNAMIC_RCNN.WARMUP_IOU)
            rcnn_beta_now = min(sorted(S_E)[iteration_count // 2],
                                cfg.MODEL.DYNAMIC_RCNN.WARMUP_BETA)
            S_I, S_E = [], []

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        if iteration % checkpoint_period == 0 or iteration == max_iter:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / max_iter
        )
    )
Example #7
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    # Merge config file.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # Print experimental infos.
    save_dir = ""
    logger = setup_logger("AlphAction", save_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + get_pretty_env_info())

    # Build the model.
    model = build_detection_model(cfg)
    model.to("cuda")

    # load weight.
    output_dir = cfg.OUTPUT_DIR
    checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir)
    checkpointer.load(cfg.MODEL.WEIGHT)

    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    mem_active = has_memory(cfg.IA_STRUCTURE)
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference",
                                         dataset_name)
            os.makedirs(output_folder, exist_ok=True)
            output_folders[idx] = output_folder

    # Do inference.
    data_loaders_test = make_data_loader(cfg,
                                         is_train=False,
                                         is_distributed=distributed)
    for output_folder, dataset_name, data_loader_test in zip(
            output_folders, dataset_names, data_loaders_test):
        inference(
            model,
            data_loader_test,
            dataset_name,
            mem_active=mem_active,
            output_folder=output_folder,
        )
        synchronize()
Example #8
import numpy as np
from dataset import make_data_loader
from helpers import parse_args, timer
from generate_data import generate_x, find_min_max


def sample(loader):
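    # drain the loader once so that timer() below measures pure sampling time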
    for x in loader:
        pass


if __name__ == '__main__':
    args = parse_args()
    filename = 'task_3.log'
    x_msg = f"X generation with N = {args.N} and M = {args.M}"
    X = timer(generate_x, filename, x_msg)(args.M, args.N)
    y_msg = "Finding optimums for X"
    YMin, YMax = timer(find_min_max, filename, y_msg)(X, args.T, args.k)

    loader = make_data_loader(X,
                              YMin,
                              YMax,
                              N=args.N,
                              batch_size=args.batch_size,
                              num_batches=args.num_batches)
    timer(
        sample, filename,
        f"{args.num_batches} batches sampling with batch size = {args.batch_size}"
    )(loader)
Example #9
    def __init__(self, args):

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch']
        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.source_loader, self.target_loader, _, self.nclass = make_data_loader(
            args, **kwargs)

        # Define Target Model
        self.target_model = DeepLab(num_classes=self.nclass,
                                    backbone=args.backbone,
                                    output_stride=args.out_stride,
                                    sync_bn=args.sync_bn,
                                    freeze_bn=args.freeze_bn)

        # Using cuda
        self.best_pred = {'disc': 0.0, 'cup': 0.0}

        self.target_model = torch.nn.DataParallel(self.target_model)
        patch_replication_callback(self.target_model)
        self.target_model = self.target_model.cuda()
        model_dict = self.target_model.module.state_dict()
        pretrained_dict = {
            k: v
            for k, v in checkpoint['state_dict'].items()
            if 'last_conv' not in k
        }
        model_dict.update(pretrained_dict)
        self.target_model.module.load_state_dict(model_dict)
        self.target_model.train()
        self.set_requires_grad('target', True)

        # Define learning rate and optimizer params
        target_params = [{
            'params': self.target_model.module.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': self.target_model.module.get_10x_lr_params(),
            'lr': args.lr * 10
        }]

        target_optim = torch.optim.SGD(target_params,
                                       momentum=args.momentum,
                                       weight_decay=args.weight_decay,
                                       nesterov=args.nesterov)
        target_optim.zero_grad()

        self.target_criterion = torch.nn.BCEWithLogitsLoss()
        self.target_optim = target_optim

        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.target_loader))
        self.evaluator = Evaluator(3)
Example #10

if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description='Training GlamPoints detector')
    parser.add_argument('--path_ymlfile',
                        type=str,
                        default='configs/glampoints_training.yml',
                        help='Path to yaml file.')

    opt = parser.parse_args()

    with open(opt.path_ymlfile, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)  # PyYAML >= 5.1 requires an explicit Loader

    _device = settings.initialize_cuda_and_logging(cfg)

    train_loader, val_loader = make_data_loader(cfg)

    model = build_model(cfg)
    model.to(_device)

    optimizer = build_optimizer(cfg, model)

    loss_func = build_loss(cfg)

    logger, tb_logger = build_logger(cfg)

    do_train(cfg, model, train_loader, val_loader, optimizer, loss_func,
             logger, tb_logger, _device)
Example #11
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Testing")
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument("--iter",
                        "-i",
                        type=int,
                        default=-1,
                        help="The iteration number, default -1 which will "
                        "test the latest model")
    parser.add_argument('--show_res', '-s', default=False, action='store_true')

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if args.show_res and num_gpus > 1:
        print('\033[93m You can\'t enable result visualization (-s/--show_res) '
              'together with multiple devices (%s GPUs) \033[0m' % num_gpus)
        exit(-1)

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("test.inference",
                          output_dir,
                          get_rank(),
                          filename='test_log.txt')
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)

    model = Network()
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    checkpoint_dir = os.path.join(cfg.OUTPUT_DIR, 'checkpoints')
    mkdir(checkpoint_dir)

    checkpointer = DetectronCheckpointer(cfg,
                                         model,
                                         save_dir=checkpoint_dir,
                                         logger=logger)

    iou_types = ("bbox", )
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm", )
    if cfg.MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints", )
    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    for idx, dataset_name in enumerate(dataset_names):
        output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
        mkdir(output_folder)
        output_folders[idx] = output_folder

    data_loaders = make_data_loader(num_gpus,
                                    is_train=False,
                                    is_distributed=distributed,
                                    return_raw=args.show_res)

    def test_model(model):
        for output_folder, dataset_name, data_loader_val in zip(
                output_folders, dataset_names, data_loaders):
            inference(
                model,
                data_loader_val,
                dataset_name=dataset_name,
                iou_types=iou_types,
                device=cfg.MODEL.DEVICE,
                expected_results=cfg.TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=output_folder,
                box_only=False,
                bbox_aug=cfg.TEST.BBOX_AUG.ENABLED,
                show_res=args.show_res,
                logger=logger)
            synchronize()

    test_iter = args.iter
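    # with the default -1, resolve the 'last_checkpoint' symlink and parse the
    # iteration out of the checkpoint file name (e.g. model_0010000.pth -> 10000)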
    if args.iter == -1:
        model_file = os.readlink(
            os.path.join(checkpoint_dir, 'last_checkpoint'))
        test_iter = int(model_file.split('/')[-1].split('_')[-1][:-4])
    else:
        model_file = os.path.join(checkpoint_dir,
                                  "model_{:07d}.pth".format(args.iter))

    if os.path.exists(model_file):
        logger.info("\n\nstart to evaluate iteration of {}".format(test_iter))
        _ = checkpointer.load(model_file)
        test_model(model)