def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=14,
                        metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr',
                        type=float,
                        default=1.0,
                        metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.7,
                        metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    parser.add_argument('--verbose',
                        action='store_true',
                        default=False,
                        help='For displaying SM Data Parallel-specific logs')
    parser.add_argument('--data-path',
                        type=str,
                        default='/tmp/data',
                        help='Path for downloading '
                        'the MNIST dataset')

    args = parser.parse_args()
    args.world_size = dist.get_world_size()
    args.rank = rank = dist.get_rank()
    args.local_rank = local_rank = dist.get_local_rank()
    args.lr = 1.0
    # scale the per-GPU batch size down by the number of nodes (assumes 8 GPUs per node)
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)
    data_path = args.data_path

    if args.verbose:
        print('Hello from rank', rank, 'of local_rank', local_rank,
              'in world size of', args.world_size)

    if not torch.cuda.is_available():
        raise Exception(
            "Must run SM Distributed DataParallel MNIST example on CUDA-capable devices."
        )

    torch.manual_seed(args.seed)

    device = torch.device("cuda")

    # let the first local rank on each node download the dataset
    if local_rank == 0:
        train_dataset = datasets.MNIST(data_path,
                                       train=True,
                                       download=True,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307, ),
                                                                (0.3081, ))
                                       ]))
    else:
        # other local ranks wait for the download to finish before reading it
        time.sleep(8)
        train_dataset = datasets.MNIST(data_path,
                                       train=True,
                                       download=False,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307, ),
                                                                (0.3081, ))
                                       ]))

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)
    if rank == 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(data_path,
                           train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307, ), (0.3081, ))
                           ])),
            batch_size=args.test_batch_size,
            shuffle=True)

    model = DDP(Net().to(device))
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        if rank == 0:
            test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
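
Both MNIST main() examples in this listing call dist.get_world_size(), dist.get_rank(), and dist.get_local_rank() and wrap the model with DDP without showing their imports or process-group setup. The following is a minimal sketch of what they assume, based on the smdistributed.dataparallel v1.x PyTorch API; treat the exact module paths as an assumption on newer releases.
import time
import argparse

import torch
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms

# SMDataParallel: distributed shim and DDP wrapper used throughout this listing
import smdistributed.dataparallel.torch.distributed as dist
from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP

# Initialize the process group before any dist.get_*() call in main()
dist.init_process_group()
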
Example #2
def train_model(args):
    from torchvision import datasets, models
    from tqdm import tqdm

    imgs, trn_df, vld_df = _get_images(args.train_dir,
                                       args.num_folds,
                                       args.vld_fold_idx,
                                       data_type='train')
    trn_loader, vld_loader = _get_data_loader(imgs, trn_df, vld_df)

    logger.info("=== Getting Pre-trained model ===")
    model = models.resnet18(pretrained=True)
    last_hidden_units = model.fc.in_features
    model.fc = torch.nn.Linear(last_hidden_units, 186)  # 168 grapheme + 11 vowel + 7 consonant classes
    #     len_buffer =  len(list(module.buffers()))

    #     logger.info("=== Buffer ===")
    #     print(f"len_buffer={len_buffer}")
    #     print(list(model.buffers()))

    # SDP: Pin each GPU to a single process
    # Use SMDataParallel PyTorch DDP for efficient distributed training
    model = DDP(model.to(args.device), broadcast_buffers=False)

    # SDP: Pin each GPU to a single SDP process.
    torch.cuda.set_device(args.local_rank)
    model.cuda(args.local_rank)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fn = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='max',
                                                           verbose=True,
                                                           patience=5,
                                                           factor=0.5)

    best_score = -1
    training_stats = []
    logger.info("=== Start Training ===")

    for epoch_id in range(args.num_epochs):

        ################################################################################
        # ==> Training phase
        ################################################################################
        trn_loss = []
        model.train()

        # Measure how long the training epoch takes.
        t0 = time.time()
        running_loss = 0.0

        for batch_id, (inputs, targets) in enumerate(tqdm(trn_loader)):
            inputs = inputs.cuda()
            targets = targets.cuda()
            targets_gra = targets[:, 0]
            targets_vow = targets[:, 1]
            targets_con = targets[:, 2]

            # use the original batch as-is with 50% probability; otherwise apply
            # CutMix (see the _rand_bbox sketch after this example)
            if np.random.rand() < 0.5:
                logits = model(inputs)
                grapheme = logits[:, :168]
                vowel = logits[:, 168:179]
                cons = logits[:, 179:]

                loss1 = loss_fn(grapheme, targets_gra)
                loss2 = loss_fn(vowel, targets_vow)
                loss3 = loss_fn(cons, targets_con)

            else:

                lam = np.random.beta(1.0, 1.0)
                rand_index = torch.randperm(inputs.size()[0])
                shuffled_targets_gra = targets_gra[rand_index]
                shuffled_targets_vow = targets_vow[rand_index]
                shuffled_targets_con = targets_con[rand_index]

                bbx1, bby1, bbx2, bby2 = _rand_bbox(inputs.size(), lam)
                inputs[:, :, bbx1:bbx2,
                       bby1:bby2] = inputs[rand_index, :, bbx1:bbx2, bby1:bby2]
                # adjust lambda so it exactly matches the pixel ratio of the mixed region
                lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) /
                           (inputs.size()[-1] * inputs.size()[-2]))

                logits = model(inputs)
                grapheme = logits[:, :168]
                vowel = logits[:, 168:179]
                cons = logits[:, 179:]

                loss1 = loss_fn(grapheme, targets_gra) * lam + loss_fn(
                    grapheme, shuffled_targets_gra) * (1. - lam)
                loss2 = loss_fn(vowel, targets_vow) * lam + loss_fn(
                    vowel, shuffled_targets_vow) * (1. - lam)
                loss3 = loss_fn(cons, targets_con) * lam + loss_fn(
                    cons, shuffled_targets_con) * (1. - lam)

            loss = 0.5 * loss1 + 0.25 * loss2 + 0.25 * loss3
            trn_loss.append(loss.item())
            running_loss += loss.item()
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            # Printing vital information
            if (batch_id + 1) % (args.log_interval) == 0:
                s = f'[Epoch {epoch_id} Batch {batch_id+1}/{len(trn_loader)}] ' \
                    f'loss: {running_loss / args.log_interval:.4f}'
                print(s)
                running_loss = 0

        # Measure how long this epoch took.
        trn_time = _format_time(time.time() - t0)

        if args.rank == 0:
            ################################################################################
            # ==> Validation phase
            ################################################################################
            val_loss = []
            val_true = []
            val_pred = []
            model.eval()

            # === Validation phase ===
            logger.info('=== Start Validation ===')

            with torch.no_grad():
                for inputs, targets in vld_loader:
                    inputs = inputs.cuda()
                    targets = targets.cuda()
                    logits = model(inputs)
                    grapheme = logits[:, :168]
                    vowel = logits[:, 168:179]
                    cons = logits[:, 179:]

                    loss = 0.5 * loss_fn(grapheme, targets[:, 0]) + \
                           0.25 * loss_fn(vowel, targets[:, 1]) + \
                           0.25 * loss_fn(cons, targets[:, 2])
                    val_loss.append(loss.item())

                    grapheme = grapheme.cpu().argmax(dim=1).data.numpy()
                    vowel = vowel.cpu().argmax(dim=1).data.numpy()
                    cons = cons.cpu().argmax(dim=1).data.numpy()

                    val_true.append(targets.cpu().numpy())
                    val_pred.append(np.stack([grapheme, vowel, cons], axis=1))

            val_true = np.concatenate(val_true)
            val_pred = np.concatenate(val_pred)
            val_loss = np.mean(val_loss)
            trn_loss = np.mean(trn_loss)

            score_g = recall_score(val_true[:, 0],
                                   val_pred[:, 0],
                                   average='macro')
            score_v = recall_score(val_true[:, 1],
                                   val_pred[:, 1],
                                   average='macro')
            score_c = recall_score(val_true[:, 2],
                                   val_pred[:, 2],
                                   average='macro')
            final_score = np.average([score_g, score_v, score_c],
                                     weights=[2, 1, 1])

            # Printing vital information
            s = f'[Epoch {epoch_id}] ' \
                f'trn_loss: {trn_loss:.4f}, vld_loss: {val_loss:.4f}, score: {final_score:.4f}, ' \
                f'score_each: [{score_g:.4f}, {score_v:.4f}, {score_c:.4f}]'
            print(s)

            ################################################################################
            # ==> Save checkpoint and training stats
            ################################################################################
            if final_score > best_score:
                best_score = final_score
                state_dict = model.cpu().state_dict()
                model = model.cuda()
                torch.save(state_dict, os.path.join(args.model_dir,
                                                    'model.pth'))

            # Record all statistics from this epoch
            training_stats.append({
                'epoch': epoch_id + 1,
                'trn_loss': trn_loss,
                'trn_time': trn_time,
                'val_loss': val_loss,
                'score': final_score,
                'score_g': score_g,
                'score_v': score_v,
                'score_c': score_c
            })

            # === Save Model Parameters ===
            logger.info("Model successfully saved at: {}".format(
                args.model_dir))
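
The CutMix branch in train_model above calls a _rand_bbox helper that is not shown in this snippet. Below is a minimal sketch that follows the reference CutMix implementation; the original helper may differ in details.
import numpy as np

def _rand_bbox(size, lam):
    # Sample a box covering roughly (1 - lam) of the image area, as in the CutMix paper code.
    W = size[2]
    H = size[3]
    cut_rat = np.sqrt(1.0 - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    # uniformly sample the box center, then clip the box to the image
    cx = np.random.randint(W)
    cy = np.random.randint(H)
    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)
    return bbx1, bby1, bbx2, bby2
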
Example #3
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=64,
        metavar="N",
        help="input batch size for training (default: 64)",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1000,
        metavar="N",
        help="input batch size for testing (default: 1000)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=14,
        metavar="N",
        help="number of epochs to train (default: 14)",
    )
    parser.add_argument("--lr",
                        type=float,
                        default=1.0,
                        metavar="LR",
                        help="learning rate (default: 1.0)")
    parser.add_argument(
        "--gamma",
        type=float,
        default=0.7,
        metavar="M",
        help="Learning rate step gamma (default: 0.7)",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument("--save-model",
                        action="store_true",
                        default=False,
                        help="For Saving the current Model")
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="For displaying smdistributed.dataparallel-specific logs",
    )
    parser.add_argument(
        "--data-path",
        type=str,
        default="/tmp/data",
        help="Path for downloading "
        "the MNIST dataset",
    )

    args = parser.parse_args()
    args.world_size = dist.get_world_size()
    args.rank = rank = dist.get_rank()
    args.local_rank = local_rank = dist.get_local_rank()
    args.lr = 1.0
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)
    data_path = args.data_path

    if args.verbose:
        print(
            "Hello from rank",
            rank,
            "of local_rank",
            local_rank,
            "in world size of",
            args.world_size,
        )

    if not torch.cuda.is_available():
        raise CUDANotFoundException(
            "Must run smdistributed.dataparallel MNIST example on CUDA-capable devices."
        )

    torch.manual_seed(args.seed)

    device = torch.device("cuda")

    # select a single rank per node to download data
    is_first_local_rank = local_rank == 0
    if is_first_local_rank:
        train_dataset = datasets.MNIST(
            data_path,
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        )
    dist.barrier()  # prevent other ranks from accessing the data early
    if not is_first_local_rank:
        train_dataset = datasets.MNIST(
            data_path,
            train=True,
            download=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        )

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
        sampler=train_sampler,
    )
    if rank == 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(
                data_path,
                train=False,
                transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307, ), (0.3081, ))
                ]),
            ),
            batch_size=args.test_batch_size,
            shuffle=True,
        )

    model = DDP(Net().to(device))
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        if rank == 0:
            test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
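
Example #3 also references CUDANotFoundException, Net, train, and test, which are defined elsewhere in the script. The sketch below approximates Net, train, and the exception class in the spirit of the stock PyTorch MNIST example (not the original definitions); test follows the same pattern and is omitted.
import torch
import torch.nn as nn
import torch.nn.functional as F

class CUDANotFoundException(Exception):
    pass

class Net(nn.Module):
    # Small CNN in the spirit of the stock PyTorch MNIST example.
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        return F.log_softmax(self.fc2(x), dim=1)

def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()
        if args.rank == 0 and batch_idx % args.log_interval == 0:
            print(f"Train Epoch: {epoch} "
                  f"[{batch_idx * len(data)}/{len(train_loader.dataset)}] "
                  f"Loss: {loss.item():.6f}")
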
Example #4
def sdp_init(model, optimizer, args):
    model = DDP(model.to(args.device), broadcast_buffers=False)
    #     model = DDP(model, device_ids=[args.rank], broadcast_buffers=False)
    model.cuda(args.local_rank)
    return model, optimizer, args
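
For context, a hedged sketch of how sdp_init might be invoked once the SMDataParallel process group is up; the args fields shown are assumptions consistent with the other examples in this listing.
import torch
import smdistributed.dataparallel.torch.distributed as dist

# model, optimizer, and args are created earlier in the training script (not shown)
dist.init_process_group()
args.rank = dist.get_rank()
args.local_rank = dist.get_local_rank()
args.device = torch.device("cuda", args.local_rank)
torch.cuda.set_device(args.local_rank)

model, optimizer, args = sdp_init(model, optimizer, args)
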
Example #5
def train(cfg, args):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if use_amp:
        # Initialize mixed-precision training
        use_mixed_precision = cfg.DTYPE == "float16"

        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
        model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if args.distributed:
        # if use_apex_ddp:
        #     model = DDP(model, delay_allreduce=True)
        # else:
        # SMDataParallel: Wrap the PyTorch model with SMDataParallel’s DDP
        model = DDP(model, device_ids=[dist.get_local_rank()], broadcast_buffers=False)
        #model = DDP(model)
    print("model parameter size: ", sum(p.numel() for p in model.parameters() if p.requires_grad))
    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    # SMDataParallel: Save model on master node.
    save_to_disk = dist.get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=args.distributed,
        start_iter=arguments["iteration"],
        data_dir=args.data_dir
    )
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=args.distributed,
            min_bbox_map=cfg.MIN_BBOX_MAP,
            min_segm_map=cfg.MIN_MASK_MAP)
    else:
        per_iter_callback_fn = None
    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        use_amp,
        cfg,
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    return model
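
Example #5 additionally depends on a module-level use_amp flag (NVIDIA Apex mixed precision) and on args.distributed being set before train(cfg, args) runs. A hedged sketch of that surrounding setup follows; setup_distributed is a hypothetical helper, and the original script may structure this differently.
import torch
import smdistributed.dataparallel.torch.distributed as dist

try:
    from apex import amp  # Example #5 calls amp.initialize() when cfg.DTYPE == "float16"
    use_amp = True
except ImportError:
    use_amp = False

def setup_distributed(args):
    # Hypothetical helper: bring up SMDataParallel and record whether this is a distributed run.
    dist.init_process_group()
    args.distributed = dist.get_world_size() > 1
    torch.cuda.set_device(dist.get_local_rank())
    return args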