Example 1
def main():
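    # Set up single-node distributed training (GPU picked by rank), then train with progressively
    # larger images: the 160px and 320px resized datasets followed by the full-size data.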
    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)

    if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    model = models.__dict__[args.arch]().cuda()
    if args.distributed: model = DDP(model)

    data, train_sampler = torch_loader(f'{args.data}-sz/160', 128, 256)
    learner = Learner.from_model_data(model, data)
    learner.crit = F.cross_entropy
    learner.metrics = [accuracy, top5]
    if args.fp16: learner.half()
    wd=2e-5
    update_model_dir(learner, args.save_dir)
    fit(learner, '1', 0.03, 1, train_sampler, wd)

    data, train_sampler = torch_loader(f'{args.data}-sz/320', 128, 256)
    learner.set_data(data)
    fit(learner, '3', 1e-1, 1, train_sampler, wd)

    data, train_sampler = torch_loader(args.data, 128, 256)
    learner.set_data(data)
    fit(learner, '3', 1e-1, 1, train_sampler, wd)
    print('Finished!')
Example 2
    def _check_and_init_distributed_model(self):
        if not self.options.use_data_parallel_distributed:
            return

        if not dist.is_initialized():
            world_size = self.options.dist_world_size
            url = self.options.dist_url
            rank = self.options.dist_rank
            # This is for SLURM's special use case
            if rank == -1:
                rank = int(os.environ.get("SLURM_NODEID"))

            print("=> Distributed training: world size: {}, rank: {}, URL: {}".
                  format(world_size, rank, url))

            dist.init_process_group(backend="nccl",
                                    init_method=url,
                                    rank=rank,
                                    world_size=world_size)

        # Initialize the distributed data parallel model
        master_gpu = self.options.gpu
        if master_gpu is None or master_gpu < 0:
            raise RuntimeError("Distributed training requires "
                               "to put the model on the GPU, but the GPU is "
                               "not given in the argument")
        # The distributed wrapper requires the model to already be on the GPU when it
        # is constructed. Later code moves the same model to the GPU again via
        # self.options.gpu, so doing it here as well is harmless.
        self.resnet.cuda(master_gpu)
        self.resnet = nn.parallel.DistributedDataParallel(
            self.resnet,
            output_device=master_gpu)
Example 3
def init_platform():
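    # Merge the YAML configs, restrict visible GPUs, initialize the process group when
    # world_size > 1, and seed the RNGs if a seed is configured.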
    config_file = cfg_from_file('config.yml')
    default_file = cfg_from_file('default.yml')
    logger.info(pprint.pformat(default_file))
    logger.info(pprint.pformat(config_file))

    merge_a_into_b(config_file, config)
    merge_a_into_b(default_file, default)
    default.best_model_path = ''

    if default.gpu == '':
        default.gpu = None
    if default.gpu is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = default.gpu

    default.distributed = default.world_size > 1
    if default.distributed:
        dist.init_process_group(backend=default.dist_backend, init_method=default.dist_url,
                                world_size=default.world_size)

    default.lr_epoch = [int(ep) for ep in default.lr_step.split(',')]

    if default.seed is not None:
        seed = default.seed
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)
        cudnn.deterministic = True
Example 4
def main():
    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed: args.gpu = args.rank % torch.cuda.device_count()

    if args.distributed:
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)

    if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    if args.pretrained: model = models.__dict__[args.arch](pretrained=True)
    else:               model = models.__dict__[args.arch]()

    model = model.cuda()
    if args.distributed: model = DDP(model)

    data, train_sampler = torch_loader(f'{args.data}-160', 128, 256)
    learner = Learner.from_model_data(model, data)
    learner.crit = F.cross_entropy
    learner.metrics = [accuracy, top5]
    if args.fp16: learner.half()
    update_model_dir(learner, args.save_dir)

    wd=1e-4
    lr=0.1
    data, train_sampler = torch_loader(args.data, 224, 192)
    learner.set_data(data)
    fit(learner, '1', lr/4, 1, train_sampler, wd)
    fit(learner, '1', lr/2, 1, train_sampler, wd)
    fit(learner, '2', lr, 28, train_sampler, wd)

    #data, train_sampler = torch_loader(args.data, 224, 192)
    #learner.set_data(data)
    #fit(learner, '3', lr, 5, train_sampler, wd)
    fit(learner, '4', lr/10, 25, train_sampler, wd)
    fit(learner, '5', lr/100, 25, train_sampler, wd)

    data, train_sampler = torch_loader(args.data, 288, 128, min_scale=0.5)
    learner.set_data(data)
    fit(learner, '6', lr/500, 10, train_sampler, wd)
    #save_sched(learner.sched, args.save_dir)
    #fit(learner, '7', 1e-4, 10, train_sampler, wd/4)

    # TTA works ~50% of the time. Hoping top5 works better
    print('\n TTA \n')
    log_preds,y = learner.TTA()
    preds = np.mean(np.exp(log_preds),0)
    acc = accuracy(torch.FloatTensor(preds),torch.LongTensor(y))
    t5 = top5(torch.FloatTensor(preds),torch.LongTensor(y))
    print('TTA acc:', acc)
    print('TTA top5:', t5[0])
    with open(f'{args.save_dir}/tta_accuracy.txt', "a", 1) as f:
        f.write(time.strftime("%Y-%m-%dT%H:%M:%S")+f"\tTTA accuracy: {acc}\tTop5: {t5}")

    print('Finished!')
Example 5
 def __init__(self, party, master_addr='127.0.0.1', master_port='29500'):
     self.party = party
     if party:
         self.other = 0
     else:
         self.other = 1
     os.environ['MASTER_ADDR'] = master_addr
     os.environ['MASTER_PORT'] = master_port
     # currently only supports sending between 2 parties over tcp
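     # (note: the legacy 'tcp' backend only exists in old torch.distributed releases; newer versions use 'gloo' or 'nccl')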
     dist.init_process_group('tcp', rank=party, world_size=2)
Example 6
def main():
    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed: args.gpu = args.rank % torch.cuda.device_count()

    if args.distributed:
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    if args.fp16: assert torch.backends.cudnn.enabled, "missing cudnn"

    model = cifar10models.__dict__[args.arch] if args.arch in cifar10_names else models.__dict__[args.arch]
    if args.pretrained: model = model(pretrained=True)
    else: model = model()

    model = model.cuda()
    if args.distributed: model = DDP(model)
    if args.data_parallel: model = nn.DataParallel(model, [0,1,2,3])

    data, train_sampler = torch_loader(args.data, args.sz)

    learner = Learner.from_model_data(model, data)
    #print (learner.summary()); exit()
    learner.crit = F.cross_entropy
    learner.metrics = [accuracy]
    if args.fp16: learner.half()

    if args.prof: args.epochs,args.cycle_len = 1,0.01
    if args.use_clr: args.use_clr = tuple(map(float, args.use_clr.split(',')))

    # Full size
    update_model_dir(learner, args.save_dir)
    sargs = save_args('first_run', args.save_dir)

    if args.warmup:
        learner.fit(args.lr/10, 1, cycle_len=1, sampler=train_sampler, wds=args.weight_decay,
                use_clr_beta=(100,1,0.9,0.8), loss_scale=args.loss_scale, **sargs)

    learner.fit(args.lr,args.epochs, cycle_len=args.cycle_len,
                sampler=train_sampler, wds=args.weight_decay,
                use_clr_beta=args.use_clr, loss_scale=args.loss_scale,
                **sargs)
    save_sched(learner.sched, args.save_dir)

    print('Finished!')

    if args.use_tta:
        log_preds,y = learner.TTA()
        preds = np.mean(np.exp(log_preds),0)
        acc = accuracy(torch.FloatTensor(preds),torch.LongTensor(y))
        print('TTA acc:', acc)

        with open(f'{args.save_dir}/tta_accuracy.txt', "a", 1) as f:
            f.write(time.strftime("%Y-%m-%dT%H:%M:%S")+f"\tTTA accuracy: {acc}\n")
Example 7
def test_mpi():
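    # Sanity check for the MPI backend: each rank marks its own slot, so after all_reduce(SUM)
    # every rank should hold an all-ones vector.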
    dist.init_process_group('mpi')
    world_size = dist.get_world_size()
    rank = dist.get_rank()

    vector = [0] * world_size
    vector[rank] = 1
    vector = torch.DoubleTensor(vector)

    dist.all_reduce(vector, op=dist.reduce_op.SUM)
    print("Host {} : Rank {} : {}".format(get_hostname(), rank, vector))
Example 8
 def _run(self, rank):
     self.rank = rank
     try:
         dist.init_process_group(backend=BACKEND)
     except RuntimeError as e:
         if 'recompile' in e.args[0]:
             sys.exit(0)
     # self.id() == e.g. '__main__.TestDistributed.test_get_rank'
     # We're retrieving a corresponding test and executing it.
     getattr(self, self.id().split(".")[2])()
     sys.exit(0)
Example 9
def initialize(options):
    """Initialize environment and add additional information."""
    if not (hasattr(dist, '_initialized') and dist._initialized):
        dist.init_process_group(options.comm_backend)

    config_logging(options)

    config_pytorch(options)

    config_path(options)

    return options
Example 10
 def _run(self, rank):
     self.rank = rank
     try:
         dist.init_process_group(init_method=INIT_METHOD,
                                 backend=BACKEND,
                                 world_size=int(WORLD_SIZE))
     except RuntimeError as e:
         if 'recompile' in e.args[0]:
             sys.exit(0)
         raise
     # self.id() == e.g. '__main__.TestDistributed.test_get_rank'
     # We're retrieving a corresponding test and executing it.
     getattr(self, self.id().split(".")[2])()
     sys.exit(0)
Example 11
def _init_dist_pytorch(backend, **kwargs):
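    # Pin this process to a GPU derived from its global RANK, then join the default process group.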
    # TODO: use local_rank instead of rank % num_gpus
    rank = int(os.environ["RANK"])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)
Example 12
def test_synchronize_sgd():
    torch.manual_seed(42)
    dist.init_process_group('mpi')
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    device = torch.device('cpu')
    # device = torch.device('cuda') # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random Tensors to hold input and outputs
    x = torch.randn(N, D_in, device=device)
    y = torch.randn(N, D_out, device=device)

    x = x[rank::world_size]
    y = y[rank::world_size]
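    # Each rank trains on a strided shard of the batch; the weights are averaged across ranks
    # at the end of every step via the all_reduce calls below.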

    # Create random Tensors for weights; setting requires_grad=True means that we
    # want to compute gradients for these Tensors during the backward pass.
    w1 = torch.randn(D_in, H, device=device, requires_grad=True)
    w2 = torch.randn(H, D_out, device=device, requires_grad=True)

    learning_rate = 1e-6
    for t in range(500):
        # Forward pass: compute predicted y using operations on Tensors. Since w1 and
        # w2 have requires_grad=True, operations involving these Tensors will cause
        # PyTorch to build a computational graph, allowing automatic computation of
        # gradients. Since we are no longer implementing the backward pass by hand we
        # don't need to keep references to intermediate values.
        y_pred = x.mm(w1).clamp(min=0).mm(w2)

        # Compute and print loss. Loss is a Tensor of shape (), and loss.item()
        # is a Python number giving its value.
        loss = (y_pred - y).pow(2).sum()

        if rank == 0:
            print("Iter {} : {:10.3e}".format(t, loss.item()))

        # Use autograd to compute the backward pass. This call will compute the
        # gradient of loss with respect to all Tensors with requires_grad=True.
        # After this call w1.grad and w2.grad will be Tensors holding the gradient
        # of the loss with respect to w1 and w2 respectively.
        loss.backward()

        # Update weights using gradient descent. For this step we just want to mutate
        # the values of w1 and w2 in-place; we don't want to build up a computational
        # graph for the update steps, so we use the torch.no_grad() context manager
        # to prevent PyTorch from building a computational graph for the updates
        with torch.no_grad():
            w1 -= learning_rate * w1.grad
            w2 -= learning_rate * w2.grad

            # Manually zero the gradients after running the backward pass
            w1.grad.zero_()
            w2.grad.zero_()

            # Synchronize weights
            dist.all_reduce(w1, op=dist.reduce_op.SUM)
            dist.all_reduce(w2, op=dist.reduce_op.SUM)
            w1 /= world_size
            w2 /= world_size
Example 13
    def update(self, val, n=1):
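        # Running-average helper: track the latest value and a weighted sum/count so .avg stays current.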
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


if __name__ == '__main__':
    args = parser.parse_args()
    args.distributed = args.world_size > 1
    main_proc = True
    if args.distributed:
        if args.gpu_rank:
            torch.cuda.set_device(int(args.gpu_rank))
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
        main_proc = args.rank == 0  # Only the first proc should save models
    save_folder = args.save_folder

    loss_results, cer_results, wer_results = torch.Tensor(args.epochs), torch.Tensor(args.epochs), torch.Tensor(
        args.epochs)
    best_wer = None
    if args.visdom and main_proc:
        from visdom import Visdom

        viz = Visdom()
        opts = dict(title=args.id, ylabel='', xlabel='Epoch', legend=['Loss', 'WER', 'CER'])
        viz_window = None
        epochs = torch.arange(1, args.epochs + 1)
    if args.tensorboard and main_proc:
        os.makedirs(args.log_dir, exist_ok=True)
Example 14
def main():
    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed: args.gpu = args.rank % torch.cuda.device_count()

    if args.distributed:
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)

    if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
    if args.cycle_len > 1: args.cycle_len = int(args.cycle_len)

    # create model
    if args.pretrained: model = models.__dict__[args.arch](pretrained=True)
    else:               model = models.__dict__[args.arch]()

    model = model.cuda()
    if args.distributed: model = DDP(model)

    if args.train_128: data, train_sampler = torch_loader(f'{args.data}-160', 128)
    else:              data, train_sampler = torch_loader(args.data, args.sz)

    learner = Learner.from_model_data(model, data)
    learner.crit = F.cross_entropy
    learner.metrics = [accuracy, top5]
    if args.fp16: learner.half()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.resume.endswith('.h5'): args.resume = args.resume[:-len('.h5')]
            learner.load(args.resume)
        else: print("=> no checkpoint found at '{}'".format(args.resume))    

    if args.prof:
        args.epochs = 1
        args.cycle_len=1
    if args.use_clr: args.use_clr = tuple(map(float, args.use_clr.split(',')))

    # 128x128
    if args.train_128:
        save_dir = f'{args.save_dir}/128'
        update_model_dir(learner, save_dir)
        sargs = save_args('first_run_128', save_dir)
        learner.fit(args.lr,args.epochs, cycle_len=args.cycle_len,
                    sampler=train_sampler, wds=args.weight_decay, use_clr_beta=args.use_clr,
                    loss_scale=args.loss_scale, **sargs)
        save_sched(learner.sched, save_dir)
        data, train_sampler = torch_loader(args.data, args.sz)
        learner.set_data(data)


    # Full size
    update_model_dir(learner, args.save_dir)
    sargs = save_args('first_run', args.save_dir)
    learner.fit(args.lr,args.epochs, cycle_len=args.cycle_len,
                sampler=train_sampler, wds=args.weight_decay, use_clr_beta=args.use_clr,
                loss_scale=args.loss_scale, **sargs)
    save_sched(learner.sched, args.save_dir)

    # TTA works ~50% of the time. Hoping top5 works better
    if args.use_tta:
        log_preds,y = learner.TTA()
        preds = np.mean(np.exp(log_preds),0)
        acc = accuracy(torch.FloatTensor(preds),torch.LongTensor(y))
        t5 = top5(torch.FloatTensor(preds),torch.LongTensor(y))
        print('TTA acc:', acc)
        print('TTA top5:', t5[0])

        with open(f'{args.save_dir}/tta_accuracy.txt', "a", 1) as f:
            f.write(time.strftime("%Y-%m-%dT%H:%M:%S")+f"\tTTA accuracy: {acc}\tTop5: {t5}")

    print('Finished!')
Example 15
def main():
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    if args.scratch:
        checkpoint = torch.load(args.scratch)
        model = resnet34(cfg=checkpoint['cfg'])

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    history_score = np.zeros((args.epochs + 1, 1))
    np.savetxt(os.path.join(args.save, 'record.txt'),
               history_score,
               fmt='%10.5f',
               delimiter=',')
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)
        history_score[epoch] = prec1
        np.savetxt(os.path.join(args.save, 'record.txt'),
                   history_score,
                   fmt='%10.5f',
                   delimiter=',')

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, is_best, args.save)

    history_score[-1] = best_prec1
    np.savetxt(os.path.join(args.save, 'record.txt'),
               history_score,
               fmt='%10.5f',
               delimiter=',')
Example 16
def main():
    global args, best_prec1

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)
    # print(args.resume)

    # model = torch.load(args.resume)

    model = ResNetDCT_Upscaled_Static(channels=int(args.subset),
                                      input_gate=True,
                                      pretrained=False)
    model.fc1 = nn.Linear(model.fc1.in_features, 49)
    model = nn.DataParallel(model)
    cudnn.benchmark = True
    print(args.resume)
    model.load_state_dict(torch.load(args.resume))
    # # define loss function (criterion) and optimizer
    # criterion = nn.CrossEntropyLoss().cuda()
    # optimizer = torch.optim.SGD(model.parameters(),
    #                             lr=0.001,
    #                             momentum=0.9,
    #                             weight_decay=5e-4)

    # # Resume
    # print("1")
    # title = 'ImageNet-' + args.arch
    # if not os.path.isdir(args.checkpoint):
    #     mkdir_p(args.checkpoint)

    # if args.resume:
    #     # Load checkpoint.
    #     print('==> Resuming from checkpoint..')
    #     checkpoint = torch.load(args.resume)
    #     best_prec1 = checkpoint['best_prec1']
    #     model_dict = model.state_dict()
    #     pretrained_dict = {
    #         k: v
    #         for k, v in checkpoint['state_dict'].items() if k in model_dict
    #     }
    #     model_dict.update(pretrained_dict)
    #     model.load_state_dict(model_dict, strict=False)
    #     #model.load_state_dict(checkpoint['state_dict'], strict=False)
    #     args.checkpoint = os.path.dirname(args.resume)

    # if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
    #     model.features = torch.nn.DataParallel(model.features)
    #     model = model.cuda()
    # else:
    #     model = torch.nn.DataParallel(model).cuda()

    # cudnn.benchmark = True
    # print('Total params: %.2fM' %
    #       (sum(p.numel() for p in model.parameters()) / 1000000.0))

    # Data loading code

    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
    #                                             step_size=10,
    #                                             gamma=0.1)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [160, 180],
    #                                                  0.1)
    test_loader, test_val = testloader_upscaled_static(args, model='resnet')
    print('Evaluation only')
    test_model(
        model,
        test_loader,
    )
    # test1(model, val_loader)
    # test(model)
    return
Example 17
def main():
    global best_prec1, args

    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    model = model.cuda()
    n_dev = torch.cuda.device_count()
    if args.fp16: model = network_to_half(model)
    if args.distributed:
        model = DDP(model)
        #args.lr *= n_dev
    elif args.dp:
        model = nn.DataParallel(model)
        args.batch_size *= n_dev
        #args.lr *= n_dev

    global param_copy
    if args.fp16:
        param_copy = [param.clone().type(torch.cuda.FloatTensor).detach() for param in model.parameters()]
        for param in param_copy: param.requires_grad = True
    else: param_copy = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(param_copy, args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else: print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(args.sz),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_sampler = (torch.utils.data.distributed.DistributedSampler(train_dataset)
                     if args.distributed else None)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(int(args.sz*1.14)),
            transforms.CenterCrop(args.sz),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed: train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        if args.prof: break
        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
Example 18
# global parameters
num_warmup = 1
num_benchmark = 5

# get env variables
comm_addr = os.getenv("SLURM_SRUN_COMM_HOST")
comm_size = int(os.getenv("SLURM_NTASKS"))
comm_rank = int(os.getenv("PMI_RANK"))
comm_local_rank = comm_rank % torch.cuda.device_count()
comm_port = "29500"
os.environ["MASTER_ADDR"] = comm_addr
os.environ["MASTER_PORT"] = comm_port

# init process group
dist.init_process_group(backend="nccl", rank=comm_rank, world_size=comm_size)

#load parameters
params = YParams("config/UNet_transpose.yaml", "default")
device = torch.device("cuda:{}".format(comm_local_rank))

# get data loader
dist.barrier()
tstart = time.time()
train_data_loader = get_data_loader_distributed(params, comm_rank,
                                                comm_local_rank)
dist.barrier()
tend = time.time()
if comm_rank == 0:
    print("Setup: took {}s".format(tend - tstart))
Example 19
        assert len(opt.cfg) or len(
            opt.weights), 'either --cfg or --weights must be specified'
        opt.img_size.extend(
            [opt.img_size[-1]] *
            (2 - len(opt.img_size)))  # extend to 2 sizes (train, test)
        log_dir = increment_dir(Path(opt.logdir) / 'exp',
                                opt.name)  # runs/exp1

    device = select_device(opt.device, batch_size=opt.batch_size)

    # DDP mode
    if opt.local_rank != -1:
        assert torch.cuda.device_count() > opt.local_rank
        torch.cuda.set_device(opt.local_rank)
        device = torch.device('cuda', opt.local_rank)
        dist.init_process_group(backend='nccl',
                                init_method='env://')  # distributed backend
        assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
        opt.batch_size = opt.total_batch_size // opt.world_size

    logger.info(opt)
    with open(opt.hyp) as f:
        hyp = yaml.load(f, Loader=yaml.FullLoader)  # load hyps

    # Train
    if not opt.evolve:
        tb_writer = None
        if opt.global_rank in [-1, 0]:
            logger.info(
                'Start Tensorboard with "tensorboard --logdir %s", view at http://localhost:6006/'
                % opt.logdir)
            tb_writer = SummaryWriter(log_dir=log_dir)  # runs/exp0
Example 20
def train300_mlperf_coco(args):
    global torch
    from coco import COCO
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if use_cuda:
        try:
            from apex.parallel import DistributedDataParallel as DDP
            if 'WORLD_SIZE' in os.environ:
                args.distributed = int(os.environ['WORLD_SIZE']) > 1
        except:
            raise ImportError("Please install APEX from https://github.com/nvidia/apex")

    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist
        # ssd_print(key=mlperf_log.RUN_SET_RANDOM_SEED)
        if args.no_cuda:
            device = torch.device('cpu')
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl',
                                    init_method='env://')
            # set seeds properly
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32
            print(dist.get_rank(), "Using seed = {}".format(local_seed))
            torch.manual_seed(local_seed)
            np.random.seed(seed=local_seed)


    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data, "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data, "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    #print("Number of labels: {}".format(train_coco.labelnum))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_coco)
    else:
        train_sampler = None
    train_dataloader = DataLoader(train_coco, batch_size=args.batch_size, shuffle=True, num_workers=4)
    # set shuffle=True in DataLoader
    ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    ssd_print(key=mlperf_log.INPUT_ORDER)
    ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)


    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    # parallelize
    if args.distributed:
        ssd300 = DDP(ssd300)

    global_batch_size = N_gpu * args.batch_size
    current_lr = args.lr * (global_batch_size / 32)
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY,
                         value=current_weight_decay)
    eval_points = np.array(args.evaluation) * 32 / global_batch_size
    eval_points = list(map(int, list(eval_points)))
    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v:k for k,v in val_coco.label_map.items()}
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()


    for epoch in range(args.epochs):
        ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)
        for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader):

            if iter_num == (160000 * 32) // global_batch_size:
                current_lr *= 0.1
                print("")
                print("lr decay step #1")
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                ssd_print(key=mlperf_log.OPT_LR,
                                     value=current_lr)
            if iter_num == (200000 * 32) // global_batch_size:
                current_lr *= 0.1
                print("")
                print("lr decay step #2")
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                ssd_print(key=mlperf_log.OPT_LR,
                                     value=current_lr)
            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)
            trans_bbox = bbox.transpose(1,2).contiguous()
            if use_cuda:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            if not np.isinf(loss.item()): avg_loss = 0.999*avg_loss + 0.001*loss.item()

            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                        .format(iter_num, loss.item(), avg_loss), end="\r")
            optim.zero_grad()
            loss.backward()
            if args.warmup is not None:
                lr_warmup(optim, args.warmup, iter_num, current_lr, args)
            optim.step()

            if iter_num in eval_points:
                rank = dist.get_rank() if args.distributed else args.local_rank
                if rank == 0:
                    if not args.no_save:
                        print("")
                        print("saving model...")
                        torch.save({"model" : ssd300.state_dict(), "label_map": train_coco.label_info},
                                   "./models/iter_{}.pt".format(iter_num))

                    if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                                args.threshold, epoch,iter_num):
                        success = torch.ones(1)
                        if use_cuda:
                            success = success.cuda()
                if args.distributed:
                    dist.all_reduce(success, op=dist.reduce_op.MAX)
                if success[0]:
                    return True
            iter_num += 1
    return False
Example 21
def main(cfg, local_rank):
    torch.manual_seed(cfg.SEED)
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
    Dataset = get_dataset(cfg.SAMPLE_METHOD, cfg.TASK)


    print('Creating model...')
    model = create_model(cfg.MODEL.NAME, cfg.MODEL.HEAD_CONV, cfg)
    
    num_gpus = torch.cuda.device_count()

    if cfg.TRAIN.DISTRIBUTE:
        device = torch.device('cuda:%d'%local_rank)
        torch.cuda.set_device(local_rank)
        dist.init_process_group(backend='nccl', init_method='env://',
                                world_size=num_gpus, rank=local_rank)
    else:
        device = torch.device('cuda')
             
    logger = Logger(cfg)

        
    if cfg.TRAIN.OPTIMIZER == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), cfg.TRAIN.LR)
    elif cfg.TRAIN.OPTIMIZER == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=cfg.TRAIN.LR, momentum=0.9)
    else:
        raise NotImplementedError
        
    start_epoch = 0
    #if cfg.MODEL.INIT_WEIGHTS:
    #    model, optimizer, start_epoch = load_model(
    #      model, cfg.MODEL.PRETRAINED, optimizer, cfg.TRAIN.RESUME, cfg.TRAIN.LR, cfg.TRAIN.LR_STEP)

    Trainer = train_factory[cfg.TASK]
    trainer = Trainer(cfg, local_rank, model, optimizer)

    cfg.TRAIN.MASTER_BATCH_SIZE

    if cfg.TRAIN.MASTER_BATCH_SIZE == -1:
        master_batch_size = cfg.TRAIN.BATCH_SIZE // len(cfg.GPUS)
    else:
        master_batch_size = cfg.TRAIN.MASTER_BATCH_SIZE
    rest_batch_size = (cfg.TRAIN.BATCH_SIZE - master_batch_size)
    chunk_sizes = [master_batch_size]
    for i in range(len(cfg.GPUS) - 1):
        slave_chunk_size = rest_batch_size // (len(cfg.GPUS) - 1)
        if i < rest_batch_size % (len(cfg.GPUS) - 1):
            slave_chunk_size += 1
        chunk_sizes.append(slave_chunk_size)
    trainer.set_device(cfg.GPUS, chunk_sizes, device)

    print('Setting up data...')
    val_dataset = Dataset(cfg, 'val')
    val_loader = torch.utils.data.DataLoader(
      val_dataset, 
      batch_size=1, 
      shuffle=False,
      num_workers=1,
      pin_memory=True
    )
    
    train_dataset = Dataset(cfg, 'train')
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset,
                                                                  num_replicas=num_gpus,
                                                                  rank=local_rank)
    train_loader = torch.utils.data.DataLoader(
      train_dataset, 
      batch_size=cfg.TRAIN.BATCH_SIZE//num_gpus if cfg.TRAIN.DISTRIBUTE else cfg.TRAIN.BATCH_SIZE, 
      shuffle=not cfg.TRAIN.DISTRIBUTE,
      num_workers=cfg.WORKERS,
      pin_memory=True,
      drop_last=True,
      sampler = train_sampler if cfg.TRAIN.DISTRIBUTE else None
    )

    print('Starting training...')
    best = 0.
    for epoch in range(start_epoch + 1, cfg.TRAIN.EPOCHS + 1):
        mark = epoch if cfg.TRAIN.SAVE_ALL_MODEL else 'last'
        train_sampler.set_epoch(epoch)
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if cfg.TRAIN.VAL_INTERVALS > 0 and epoch % cfg.TRAIN.VAL_INTERVALS == 0:
            save_model(os.path.join(cfg.OUTPUT_DIR, 'model_{}.pth'.format(mark)), 
                 epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
                mAP = val_dataset.run_eval(preds, cfg.OUTPUT_DIR)
                print('mAP is: ', mAP)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if mAP > best:
                best = mAP
                save_model(os.path.join(cfg.OUTPUT_DIR, 'model_best.pth'), 
                           epoch, model)
        else:
            save_model(os.path.join(cfg.OUTPUT_DIR, 'model_last.pth'), 
                     epoch, model, optimizer)
        logger.write('\n')
        if epoch in cfg.TRAIN.LR_STEP:
            save_model(os.path.join(cfg.OUTPUT_DIR, 'model_{}.pth'.format(epoch)), 
                     epoch, model, optimizer)
            lr = cfg.TRAIN.LR * (0.1 ** (cfg.TRAIN.LR_STEP.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
              param_group['lr'] = lr
    logger.close()
Example 22
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    else:
        print("We should be distributed")
        return
    # create model
    model = None
    if args.arch == "alexnet":
        model = AlexNet()
    if args.arch == "vgg16":
        model = VGG16()
    if args.arch == "vgg19":
        model = VGG19()
    if args.arch == "vgg13":
        model = VGG13()
    if args.arch == "lenet":
        model = LeNet()
    if args.arch == "resnet50":
        model = resnet50()
    if args.arch == "goolenet":
        model = GoogLeNet()

    torch.cuda.set_device(args.gpu)
    model.cuda(args.gpu)
    # When using a single GPU per process and per
    # DistributedDataParallel, we need to divide the batch size
    # ourselves based on the total number of GPUs we have
    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.workers = int(args.workers / ngpus_per_node)
    # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
    model = distributed_layer_sparse_whole_randperm_threshold_mem.DistributedDataParallel(
        model,
        device_id=args.gpu,
        sparse_ratio=args.sparse_ratio,
        sparse_threshold=args.sparse_threshold,
        mem_decay=args.memory_decay)

    args.file_name = args.method + '_' + args.arch + "batch-size_" + str(args.batch_size) + "_sparse-ratio_" + str(args.sparse_ratio) + \
        "_sparse-threshold_" + str(args.sparse_threshold) + "_memory-decay_" + str(args.memory_decay)

    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    #traindir = os.path.join(args.data, 'train')
    #valdir = os.path.join(args.data, 'val')
    #normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
    #                                 std=[0.229, 0.224, 0.225])

    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    train_dataset = torchvision.datasets.CIFAR10(root=args.data,
                                                 train=True,
                                                 download=True,
                                                 transform=transform_train)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               sampler=train_sampler)

    testset = torchvision.datasets.CIFAR10(root=args.data,
                                           train=False,
                                           download=True,
                                           transform=transform_test)
    val_loader = torch.utils.data.DataLoader(testset,
                                             batch_size=100,
                                             shuffle=False,
                                             num_workers=2)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return
    epoch_start = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        adjust_mem_decay(model, epoch, args)

        # train for one epoch
        losses = train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
            f = open(args.data + "/result_" + args.file_name + ".txt", "a+")
            f.write(
                str(epoch + 1) + '\t' + str(time.time() - epoch_start) + '\t' +
                str(losses.avg) + '\t' + str(acc1.item()) + '\n')
            f.close()
Example 23
    def __init__(
        self,
        name: str,
        rank: int = -1,
        world_size: int = -1,
        init_dist: bool = True,
        init_rpc: bool = True,
        dist_backend: str = "gloo",
        dist_init_method: str = "tcp://localhost:9100",
        rpc_init_method: str = "tcp://localhost:9101",
        dist_timeout: float = 60,
        rpc_timeout: float = 60,
    ):
        """
        Args:
            name: A unique name to identify current process.
            rank: A unique rank of the current process. You do not need to specify
                it if you are using `torch.distributed.launch` or `torchelastic`
            world_size:   Size of the distributed world. You do not need to specify
                it if you are using `torch.distributed.launch` or `torchelastic`
            dist_timeout: Distributed package timeout in seconds.
            rpc_timeout:  Global rpc call timeout in seconds.
        """
        self.world_size = world_size
        self.rank = rank
        self.name = name
        self.groups = {}
        self.group_create_signals = {}

        if init_dist:
            dist.init_process_group(
                backend=dist_backend,
                init_method=dist_init_method,
                timeout=timedelta(seconds=dist_timeout),
                rank=rank,
                world_size=world_size,
            )
        if init_rpc:
            rpc.init_rpc(
                self.name,
                rank=rank,
                world_size=world_size,
                backend=rpc.BackendType.TENSORPIPE,
                rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
                    init_method=rpc_init_method, rpc_timeout=rpc_timeout),
            )

        # get rank-name mapping
        self.rank_name_map = {}
        for wi in rpc._get_current_rpc_agent().get_worker_infos():
            self.rank_name_map[wi.id] = wi.name

        # Start role dispatching.
        self.started = True
        self.rpc_timeout = rpc_timeout

        # map for paired values and registered services
        self.value_lut = {}
        self.service_lut = {}
        self.lut_lock = Lock()
        self.lut_manager = self.rank_name_map[0]
Example 24
                    help='set the inclusive lower limit for tensor size; ' +
                    'default: 19 (2**19 = 512 KB)')
parser.add_argument('--min-num-tensors', dest='min_num_tensors', action='store',
                    default=2, type=int,
                    help='set the inclusive lower limit for the number of ' +
                    'tensors to be sent during one test run; ' +
                    'default: 2 (10**2 = 100)')

args = parser.parse_args()

MIN_NUM_TENSORS = args.min_num_tensors
MIN_BYTES = args.min_bytes
MAX_NUM_TENSORS = args.max_num_tensors + 1
MAX_BYTES = args.max_bytes + 1

dist.init_process_group(backend=os.environ['BACKEND'])

rank = dist.get_rank()
dist.barrier()

if rank == 0:
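    # Time `num_tensors` consecutive broadcasts from rank 0 for each payload size in the configured range.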
    print_header("broadcast")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.broadcast(tensor, 0)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
    print()
Example 25
 def __enter__(self):
     dist.init_process_group(*self.args, **self.kwargs)
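     # (a matching __exit__ would typically call dist.destroy_process_group(); it is not shown in this snippet)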
Example 26
File: misc.py Project: Bakqui/MAAC
def init_processes(rank, size, fn, backend='gloo'):
    """ Initialize the distributed environment. """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
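    # MASTER_ADDR/MASTER_PORT back the default env:// rendezvous that init_process_group uses
    # when only the backend, rank, and world_size are given.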
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)
Example 27
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
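
The worker above folds the local GPU index into a global rank and splits the batch size and data-loading workers across the processes on a node. The bookkeeping, spelled out with illustrative numbers (2 nodes x 4 GPUs, 256 total batch, 8 workers):

ngpus_per_node = 4
node_rank = 1                  # value of args.rank before the adjustment
gpu = 2                        # local GPU index on this node

global_rank = node_rank * ngpus_per_node + gpu                        # -> 6
per_process_batch = int(256 / ngpus_per_node)                         # -> 64
per_process_workers = int((8 + ngpus_per_node - 1) / ngpus_per_node)  # ceil(8 / 4) -> 2

print(global_rank, per_process_batch, per_process_workers)            # 6 64 2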
Esempio n. 28
0
def train(
        cfg,
        data_cfg,
        img_size=1024,
        resume=False,
        epochs=273,  # 500200 batches at bs 64, dataset length 117263
        batch_size=16,
        accumulate=1,
        multi_scale=False,
        freeze_backbone=False,
        transfer=True  # Transfer learning (train only YOLO layers)
):
    init_seeds()
    weights = 'weights' + os.sep
    latest = weights + 'latest.pt'
    best = weights + 'best.pt'
    device = torch_utils.select_device()

    if multi_scale:
        img_size = 1024  # initiate with maximum multi_scale size
        opt.num_workers = 0  # bug https://github.com/ultralytics/yolov3/issues/174
    else:
        torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Configure run
    train_path = parse_data_cfg(data_cfg)['train']

    # Initialize model
    model = Darknet(cfg, img_size).to(device)

    # Optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=hyp['lr0'],
                          momentum=hyp['momentum'],
                          weight_decay=hyp['weight_decay'])

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    best_loss = float('inf')
    nf = int(model.module_defs[model.yolo_layers[0] -
                               1]['filters'])  # yolo layer size (i.e. 255)
    if resume:  # Load previously saved model
        if transfer:  # Transfer learning
            chkpt = torch.load(weights + 'yolov3-dota.pt', map_location=device)
            model.load_state_dict(
                {
                    k: v
                    for k, v in chkpt['model'].items()
                    if v.numel() > 1 and v.shape[0] != 255
                },
                strict=False)
            for p in model.parameters():
                p.requires_grad = True if p.shape[0] == nf else False

        else:  # resume from latest.pt
            chkpt = torch.load(latest, map_location=device)  # load checkpoint
            model.load_state_dict(chkpt['model'])

        start_epoch = chkpt['epoch'] + 1
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_loss = chkpt['best_loss']
        del chkpt

    else:  # Initialize model with backbone (optional)
        if '-tiny.cfg' in cfg:
            cutoff = load_darknet_weights(model,
                                          weights + 'yolov3-tiny.conv.15')
        else:
            cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74')

    # Scheduler (reduce lr at epochs 218, 245, i.e. batches 400k, 450k)
    # lf = lambda x: 1 - x / epochs  # linear ramp to zero
    # lf = lambda x: 10 ** (-2 * x / epochs)  # exp ramp to lr0 * 1e-2
    # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs))  # inv exp ramp to lr0 * 1e-2
    # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf, last_epoch=start_epoch - 1)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=[218, 245],
                                               gamma=0.1,
                                               last_epoch=start_epoch - 1)

    # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y)

    # Dataset
    dataset = LoadImagesAndLabels(train_path, img_size=img_size, augment=True)

    # Initialize distributed training
    if torch.cuda.device_count() > 1:
        dist.init_process_group(backend=opt.backend,
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.rank)
        model = torch.nn.parallel.DistributedDataParallel(model)
        sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    else:
        sampler = None

    # Dataloader
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            num_workers=opt.num_workers,
                            shuffle=False,
                            pin_memory=True,
                            collate_fn=dataset.collate_fn,
                            sampler=sampler)

    # Mixed precision training https://github.com/NVIDIA/apex
    # install help: https://github.com/NVIDIA/apex/issues/259
    mixed_precision = False
    if mixed_precision:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Start training
    t = time.time()
    model.hyp = hyp  # attach hyperparameters to model
    model_info(model)
    nb = len(dataloader)
    results = (0, 0, 0, 0, 0)  # P, R, mAP, F1, test_loss
    n_burnin = min(round(nb / 5 + 1), 1000)  # burn-in batches
    if os.path.exists('train_batch0.jpg'):
        os.remove('train_batch0.jpg')
    if os.path.exists('test_batch0.jpg'):
        os.remove('test_batch0.jpg')
    for epoch in range(start_epoch, epochs):
        model.train()
        print(
            ('\n%8s%12s' + '%10s' * 7) % ('Epoch', 'Batch', 'xy', 'wh', 'conf',
                                          'cls', 'total', 'nTargets', 'time'))

        # Update scheduler
        scheduler.step()

        # Freeze backbone at epoch 0, unfreeze at epoch 1
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        mloss = torch.zeros(5).to(device)  # mean losses
        for i, (imgs, targets, _, _) in enumerate(dataloader):
            imgs = imgs.to(device)
            targets = targets.to(device)
            nt = len(targets)
            # if nt == 0:  # if no targets continue
            #     continue

            # Plot images with bounding boxes
            if epoch == 0 and i == 0:
                plot_images(imgs=imgs,
                            targets=targets,
                            fname='train_batch0.jpg')

            # SGD burn-in
            if epoch == 0 and i <= n_burnin:
                lr = hyp['lr0'] * (i / n_burnin)**4
                for x in optimizer.param_groups:
                    x['lr'] = lr

            # Run model
            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model)
            if torch.isnan(loss):
                print('WARNING: nan loss detected, ending training')
                return results

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Accumulate gradient for x batches before optimizing
            if (i + 1) % accumulate == 0 or (i + 1) == nb:
                optimizer.step()
                optimizer.zero_grad()

            # Update running mean of tracked metrics
            mloss = (mloss * i + loss_items) / (i + 1)

            # Print batch results
            s = ('%8s%12s' +
                 '%10.3g' * 7) % ('%g/%g' % (epoch, epochs - 1), '%g/%g' %
                                  (i, nb - 1), *mloss, nt, time.time() - t)
            t = time.time()
            print(s)

            # Multi-Scale training (320 - 608 pixels) every 10 batches
            if multi_scale and (i + 1) % 10 == 0:
                dataset.img_size = random.choice(range(10, 20)) * 32
                print('multi_scale img_size = %g' % dataset.img_size)

        # Calculate mAP (always test final epoch, skip first 5 if opt.nosave)
        if not (opt.notest or
                (opt.nosave and epoch < 5)) or epoch == epochs - 1:
            with torch.no_grad():
                results = test.test(cfg,
                                    data_cfg,
                                    batch_size=batch_size,
                                    img_size=img_size,
                                    model=model,
                                    conf_thres=0.1)

        # Write epoch results
        with open('results.txt', 'a') as file:
            file.write(s + '%11.3g' * 5 % results +
                       '\n')  # P, R, mAP, F1, test_loss

        # Update best loss
        test_loss = results[4]
        if test_loss < best_loss:
            best_loss = test_loss

        # Save training results
        save = True and not opt.nosave
        if save:
            # Create checkpoint
            chkpt = {
                'epoch': epoch,
                'best_loss': best_loss,
                'model': model.module.state_dict()
                if type(model) is nn.parallel.DistributedDataParallel
                else model.state_dict(),
                'optimizer': optimizer.state_dict()
            }

            # Save latest checkpoint
            torch.save(chkpt, latest)

            # Save best checkpoint
            if best_loss == test_loss:
                torch.save(chkpt, best)

            # Save backup every 10 epochs (optional)
            if epoch > 0 and epoch % 10 == 0:
                torch.save(chkpt, weights + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt

    return results
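
When the model is wrapped in DistributedDataParallel, the checkpoint above stores model.module.state_dict() so the saved keys carry no 'module.' prefix. A small helper for that unwrap-before-save pattern (the helper name is mine; isinstance also covers plain DataParallel):

import torch
import torch.nn as nn


def unwrapped_state_dict(model):
    # strip the DataParallel / DistributedDataParallel wrapper before saving
    if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
        return model.module.state_dict()
    return model.state_dict()


# usage sketch:
# torch.save({'epoch': epoch, 'model': unwrapped_state_dict(model)}, 'latest.pt')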
Esempio n. 29
0
def train_and_eval(tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None, metric='last', save_path=None, only_eval=False, local_rank=-1, evaluation_interval=5):
    total_batch = C.get()["batch"] 
    if local_rank >= 0:
        dist.init_process_group(backend='nccl', init_method='env://', world_size=int(os.environ['WORLD_SIZE']))
        device = torch.device('cuda', local_rank)
        torch.cuda.set_device(device)

        C.get()['lr'] *= dist.get_world_size()
        logger.info(f'local batch={C.get()["batch"]} world_size={dist.get_world_size()} ----> total batch={C.get()["batch"] * dist.get_world_size()}')
        total_batch = C.get()["batch"] * dist.get_world_size()

    is_master = local_rank < 0 or dist.get_rank() == 0  # True for rank 0 or non-distributed runs
    if is_master:
        add_filehandler(logger, 'lol.log')  # a new log file
        #add_filehandler(logger, args.save + '.log')  # invalid path
 
    if not reporter:
        reporter = lambda **kwargs: 0

    max_epoch = C.get()['epoch']
    trainsampler, trainloader, validloader, testloader_ = get_dataloaders(C.get()['dataset'], C.get()['batch'], dataroot, test_ratio, split_idx=cv_fold, multinode=(local_rank >= 0))

    # create a model & an optimizer
    model = get_model(C.get()['model'], num_class(C.get()['dataset']), local_rank=local_rank)
    model_ema = get_model(C.get()['model'], num_class(C.get()['dataset']), local_rank=-1)  # model_ema holds the EMA (exponential moving average) copy of the weights
    model_ema.eval()

    criterion_ce = criterion = CrossEntropyLabelSmooth(num_class(C.get()['dataset']), C.get().conf.get('lb_smooth', 0))
    if C.get().conf.get('mixup', 0.0) > 0.0:
        criterion = CrossEntropyMixUpLabelSmooth(num_class(C.get()['dataset']), C.get().conf.get('lb_smooth', 0))
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(
            model.parameters(),
            lr=C.get()['lr'],
            momentum=C.get()['optimizer'].get('momentum', 0.9),
            weight_decay=0.0,
            nesterov=C.get()['optimizer'].get('nesterov', True)
        )
    elif C.get()['optimizer']['type'] == 'rmsprop':
        optimizer = RMSpropTF(
            model.parameters(),
            lr=C.get()['lr'],
            weight_decay=0.0,
            alpha=0.9, momentum=0.9,
            eps=0.001
        )
    else:
        raise ValueError('invalid optimizer type=%s' % C.get()['optimizer']['type'])

    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=C.get()['epoch'], eta_min=0.)
    elif lr_scheduler_type == 'resnet':
        scheduler = adjust_learning_rate_resnet(optimizer)
    elif lr_scheduler_type == 'efficientnet':
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 0.97 ** int((x + C.get()['lr_schedule']['warmup']['epoch']) / 2.4))
    else:
        raise ValueError('invalid lr_schduler=%s' % lr_scheduler_type)

    if C.get()['lr_schedule'].get('warmup', None) and C.get()['lr_schedule']['warmup']['epoch'] > 0:
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler
        )
    print('tag is ', tag)
    if not tag or not is_master:
        from FastAutoAugment.metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [SummaryWriter(log_dir='./logs/%s/%s' % (tag, x)) for x in ['train', 'valid', 'test']]

    if C.get()['optimizer']['ema'] > 0.0 and is_master:
        # https://discuss.pytorch.org/t/how-to-apply-exponential-moving-average-decay-for-variables/10856/4?u=ildoonet
        ema = EMA(C.get()['optimizer']['ema'])
    else:
        ema = None  # falls through to None here

    result = OrderedDict()
    epoch_start = 1
    if save_path != 'test.pth':     # and is_master: --> all ranks should load the data (it cannot simply be broadcast)
        if save_path and os.path.exists(save_path):
            logger.info('%s file found. loading...' % save_path)
            data = torch.load(save_path)
            key = 'model' if 'model' in data else 'state_dict'

            if 'epoch' not in data:
                model.load_state_dict(data)
            else:
                logger.info('checkpoint epoch@%d' % data['epoch'])
                if not isinstance(model, (DataParallel, DistributedDataParallel)):
                    print('get in not parallel')
                    model.load_state_dict({k.replace('module.', ''): v for k, v in data[key].items()})
                else:
                    print('get in parallel')
                    model.load_state_dict({k if 'module.' in k else 'module.'+k: v for k, v in data[key].items()})
                logger.info('optimizer.load_state_dict+')
                optimizer.load_state_dict(data['optimizer'])  # the optimizer also restores its previous state
                if data['epoch'] < C.get()['epoch']:
                    epoch_start = data['epoch']
                else:
                    print('only eval true')
                    only_eval = True
                if ema is not None:
                    ema.shadow = data.get('ema', {}) if isinstance(data.get('ema', {}), dict) else data['ema'].state_dict()
            del data
        else:
            logger.info('"%s" file not found. skip to pretrain weights...' % save_path)
            if only_eval:
                logger.warning('model checkpoint not found. only-evaluation mode is off.')
            only_eval = False

    if local_rank >= 0:
        for name, x in model.state_dict().items():
            dist.broadcast(x, 0)
        logger.info(f'multinode init. local_rank={dist.get_rank()} is_master={is_master}')
        torch.cuda.synchronize()

    tqdm_disabled = bool(os.environ.get('TASK_NAME', '')) and local_rank != 0  # KakaoBrain Environment

    # even in eval-only mode, one pass over the data is run to obtain the loss and related metrics
    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, None, desc_default='train', epoch=0, writer=writers[0], is_master=is_master)

        with torch.no_grad():
            rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid', epoch=0, writer=writers[1], is_master=is_master)
            rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test', epoch=0, writer=writers[2], is_master=is_master)
            if ema is not None and len(ema) > 0:
                model_ema.load_state_dict({k.replace('module.', ''): v for k, v in ema.state_dict().items()})
                rs['valid'] = run_epoch(model_ema, validloader, criterion_ce, None, desc_default='valid(EMA)', epoch=0, writer=writers[1], verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs['test'] = run_epoch(model_ema, testloader_, criterion_ce, None, desc_default='*test(EMA)', epoch=0, writer=writers[2], verbose=is_master, tqdm_disabled=tqdm_disabled)
        for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
            if setname not in rs:
                continue
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result
    logger.debug('get into train stage')
    # train loop
    best_top1 = 0
    for epoch in range(epoch_start, max_epoch + 1):
        if local_rank >= 0:
            trainsampler.set_epoch(epoch)

        model.train()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, optimizer, desc_default='train', epoch=epoch, writer=writers[0], verbose=(is_master and local_rank <= 0), scheduler=scheduler, ema=ema, wd=C.get()['optimizer']['decay'], tqdm_disabled=tqdm_disabled)
        model.eval()

        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        if ema is not None and C.get()['optimizer']['ema_interval'] > 0 and epoch % C.get()['optimizer']['ema_interval'] == 0:
            logger.info(f'ema synced+ rank={dist.get_rank()}')
            if ema is not None:
                model.load_state_dict(ema.state_dict())
            for name, x in model.state_dict().items():
                # print(name)
                dist.broadcast(x, 0)
            torch.cuda.synchronize()
            logger.info(f'ema synced- rank={dist.get_rank()}')

        if is_master and (epoch % evaluation_interval == 0 or epoch == max_epoch):
            with torch.no_grad():
                rs['valid'] = run_epoch(model, validloader, criterion_ce, None, desc_default='valid', epoch=epoch, writer=writers[1], verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs['test'] = run_epoch(model, testloader_, criterion_ce, None, desc_default='*test', epoch=epoch, writer=writers[2], verbose=is_master, tqdm_disabled=tqdm_disabled)

                if ema is not None:
                    model_ema.load_state_dict({k.replace('module.', ''): v for k, v in ema.state_dict().items()})
                    rs['valid'] = run_epoch(model_ema, validloader, criterion_ce, None, desc_default='valid(EMA)', epoch=epoch, writer=writers[1], verbose=is_master, tqdm_disabled=tqdm_disabled)
                    rs['test'] = run_epoch(model_ema, testloader_, criterion_ce, None, desc_default='*test(EMA)', epoch=epoch, writer=writers[2], verbose=is_master, tqdm_disabled=tqdm_disabled)

            logger.info(
                f'epoch={epoch} '
                f'[train] loss={rs["train"]["loss"]:.4f} top1={rs["train"]["top1"]:.4f} '
                f'[valid] loss={rs["valid"]["loss"]:.4f} top1={rs["valid"]["top1"]:.4f} '
                f'[test] loss={rs["test"]["loss"]:.4f} top1={rs["test"]["top1"]:.4f} '
            )

            if metric == 'last' or rs[metric]['top1'] > best_top1:
                if metric != 'last':
                    best_top1 = rs[metric]['top1']
                for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('valid_top1/best', rs['valid']['top1'], epoch)
                writers[2].add_scalar('test_top1/best', rs['test']['top1'], epoch)

                reporter(
                    loss_valid=rs['valid']['loss'], top1_valid=rs['valid']['top1'],
                    loss_test=rs['test']['loss'], top1_test=rs['test']['top1']
                )

                # save checkpoint
                if is_master and save_path:
                    logger.info('save model@%d to %s, err=%.4f' % (epoch, save_path, 1 - best_top1))
                    torch.save({
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'valid': rs['valid'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict(),
                        'ema': ema.state_dict() if ema is not None else None,
                    }, save_path)

    del model

    result['top1_test'] = best_top1
    return result
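
Two distributed details in the example above are worth isolating: the learning rate is scaled linearly by the world size, and the model state is broadcast from rank 0 so every worker starts from identical weights. A minimal sketch of both, assuming the process group is already initialized and the model already sits on the correct device:

import torch
import torch.distributed as dist


def sync_from_rank0(model, base_lr):
    # linear scaling rule: the effective batch grows with the number of workers
    scaled_lr = base_lr * dist.get_world_size()

    # broadcast every tensor in the state dict in place from rank 0
    for _, tensor in model.state_dict().items():
        dist.broadcast(tensor, src=0)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return scaled_lr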
Esempio n. 30
0
def main():
    global args, best_err1, best_err5
    args = parser.parse_args()
    if args.tensorboard: configure("runs/%s" % (args.expname))

    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    if args.dataset.startswith('cifar'):
        normalize = transforms.Normalize(
            mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
            std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
        if args.augment:
            transform_train = transforms.Compose([
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ])
        else:
            transform_train = transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ])
        transform_test = transforms.Compose([transforms.ToTensor(), normalize])

        if args.dataset == 'cifar100':
            train_loader = torch.utils.data.DataLoader(
                datasets.CIFAR100('../data',
                                  train=True,
                                  download=True,
                                  transform=transform_train),
                batch_size=args.batch_size,
                shuffle=True,
                num_workers=args.workers,
                pin_memory=True)
            val_loader = torch.utils.data.DataLoader(
                datasets.CIFAR100('../data',
                                  train=False,
                                  transform=transform_test),
                batch_size=args.batch_size,
                shuffle=True,
                num_workers=args.workers,
                pin_memory=True)
            numberofclass = 100
        elif args.dataset == 'cifar10':
            train_loader = torch.utils.data.DataLoader(
                datasets.CIFAR10('../data',
                                 train=True,
                                 download=True,
                                 transform=transform_train),
                batch_size=args.batch_size,
                shuffle=True,
                num_workers=args.workers,
                pin_memory=True)
            val_loader = torch.utils.data.DataLoader(
                datasets.CIFAR10('../data',
                                 train=False,
                                 transform=transform_test),
                batch_size=args.batch_size,
                shuffle=True,
                num_workers=args.workers,
                pin_memory=True)
            numberofclass = 10
        else:
            raise Exception('unknown dataset: {}'.format(args.dataset))

    elif args.dataset == 'imagenet':
        traindir = os.path.join(args.data, 'train')
        valdir = os.path.join(args.data, 'val')
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

        train_dataset = datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))

        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_dataset)
        else:
            train_sampler = None

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=(train_sampler is None),
            num_workers=args.workers,
            pin_memory=True,
            sampler=train_sampler)

        val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 num_workers=args.workers,
                                                 pin_memory=True)
        numberofclass = 1000

    else:
        raise Exception('unknown dataset: {}'.format(args.dataset))

    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.net_type))
        try:
            model = models.__dict__[str(args.net_type)](pretrained=True)
        except (KeyError, TypeError):
            print('unknown model')
            print('torchvision provides the following pretrained models:',
                  model_names)
            return
    else:
        print("=> creating model '{}'".format(args.net_type))
        if args.net_type == 'resnet':
            model = RN.ResNet(args.dataset, args.depth, numberofclass,
                              args.bottleneck)  # for ResNet
        elif args.net_type == 'preresnet':
            model = PRN.PreResNet(args.dataset, args.depth, numberofclass,
                                  args.bottleneck)  # for Pre-activation ResNet
        elif args.net_type == 'pyramidnet':
            model = PYRM.PyramidNet(args.dataset, args.depth, args.alpha,
                                    numberofclass,
                                    args.bottleneck)  # for ResNet
        else:
            raise Exception('unknown network architecture: {}'.format(
                args.net_type))

    if not args.distributed:
        if args.net_type.startswith('alexnet') or args.net_type.startswith(
                'vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    print(model)
    print('the number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=True)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_err1 = checkpoint['best_err1']
            best_err5 = checkpoint['best_err5']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs + 1):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        err1, err5 = validate(val_loader, model, criterion, epoch)

        # remember best prec@1 and save checkpoint
        is_best = err1 <= best_err1
        best_err1 = min(err1, best_err1)
        if is_best:
            best_err5 = err5
        print('Current best accuracy (top-1 and 5 error):', best_err1,
              best_err5)
        save_checkpoint(
            {
                'epoch': epoch,
                'arch': args.net_type,
                'state_dict': model.state_dict(),
                'best_err1': best_err1,
                'best_err5': best_err5,
                'optimizer': optimizer.state_dict(),
            }, is_best)
    print('Best accuracy (top-1 and 5 error):', best_err1, best_err5)
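
The training loops above call train_sampler.set_epoch(epoch) before each epoch so that the DistributedSampler reshuffles with a different seed while keeping the shards consistent across ranks. The pattern in isolation, with a placeholder dataset (assumes the process group is already initialized):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.randn(1024, 3, 32, 32), torch.randint(0, 10, (1024,)))
sampler = DistributedSampler(dataset)            # shards the indices across ranks
loader = DataLoader(dataset, batch_size=64, shuffle=False,  # the sampler shuffles
                    sampler=sampler, num_workers=2, pin_memory=True)

for epoch in range(3):
    sampler.set_epoch(epoch)   # new permutation each epoch, identical on every rank
    for images, labels in loader:
        pass                   # forward / backward would go here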
Esempio n. 31
0
File: main.py Project: gongzg/DALI
def main():
    global best_prec1, args

    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()


    if args.distributed:
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True, num_classes=args.num_classes)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch](num_classes=args.num_classes)

    model = model.cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        model = DDP(model)

    global model_params, master_params
    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        master_params = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(master_params, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    pipe = HybridPipe(batch_size=args.batch_size, num_threads=args.workers, device_id = args.rank, data_dir = traindir)
    pipe.build()
    test_run = pipe.run()
    from nvidia.dali.plugin.pytorch import DALIClassificationIterator
    train_loader = DALIClassificationIterator(pipe, size = int(1281167 / args.world_size) )


    pipe = HybridPipe(batch_size=args.batch_size, num_threads=args.workers, device_id = args.rank, data_dir = valdir)
    pipe.build()
    test_run = pipe.run()
    from nvidia.dali.plugin.pytorch import DALIClassificationIterator
    val_loader = DALIClassificationIterator(pipe, size = int(50000 / args.world_size) )

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        if args.prof:
            break
        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
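
The DALI example shards the iterator size by world size and writes checkpoints only from rank 0. Those two pieces in isolation (the dataset sizes match the ImageNet numbers used above; the checkpoint payload is a placeholder):

import torch
import torch.distributed as dist

world_size = dist.get_world_size() if dist.is_initialized() else 1
rank = dist.get_rank() if dist.is_initialized() else 0

train_shard = int(1281167 / world_size)   # ImageNet train images per rank
val_shard = int(50000 / world_size)       # ImageNet val images per rank

if rank == 0:                             # only one process touches the filesystem
    torch.save({'note': 'checkpoint payload goes here'}, 'checkpoint.pth.tar')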
Esempio n. 32
0
def train(rank, a, h, resume_run_id=None):
    if h.num_gpus > 1:
        init_process_group(
            backend=h.dist_config["dist_backend"],
            init_method=h.dist_config["dist_url"],
            world_size=h.dist_config["world_size"] * h.num_gpus,
            rank=rank,
        )

    torch.cuda.manual_seed(h.seed)
    device = torch.device("cuda:{:d}".format(rank))

    generator = Generator(h).to(device)
    mpd = MultiPeriodDiscriminator().to(device)
    msd = MultiScaleDiscriminator().to(device)

    if rank == 0:
        print(generator)
        os.makedirs(a.checkpoint_path, exist_ok=True)
        print("checkpoints directory : ", a.checkpoint_path)

    cp_g = None
    cp_do = None
    if resume_run_id:
        restored_g = wandb.restore("g_latest")
        cp_g = restored_g.name
        restored_do = wandb.restore("do_latest")
        cp_do = restored_do.name

    steps = 0
    if cp_g is None or cp_do is None:
        state_dict_do = None
        last_epoch = -1
    else:
        state_dict_g = load_checkpoint(cp_g, device)
        state_dict_do = load_checkpoint(cp_do, device)
        generator.load_state_dict(state_dict_g["generator"])
        mpd.load_state_dict(state_dict_do["mpd"])
        msd.load_state_dict(state_dict_do["msd"])
        steps = state_dict_do["steps"] + 1
        last_epoch = state_dict_do["epoch"]

    if h.num_gpus > 1:
        generator = DistributedDataParallel(generator,
                                            device_ids=[rank]).to(device)
        mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device)
        msd = DistributedDataParallel(msd, device_ids=[rank]).to(device)

    optim_g = torch.optim.AdamW(generator.parameters(),
                                h.learning_rate,
                                betas=[h.adam_b1, h.adam_b2])
    optim_d = torch.optim.AdamW(
        itertools.chain(msd.parameters(), mpd.parameters()),
        h.learning_rate,
        betas=[h.adam_b1, h.adam_b2],
    )

    if state_dict_do is not None:
        optim_g.load_state_dict(state_dict_do["optim_g"])
        optim_d.load_state_dict(state_dict_do["optim_d"])

    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g,
                                                         gamma=h.lr_decay,
                                                         last_epoch=last_epoch)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d,
                                                         gamma=h.lr_decay,
                                                         last_epoch=last_epoch)

    training_filelist, validation_filelist = get_dataset_filelist(a)

    trainset = MelDataset(
        training_filelist,
        h.segment_size,
        h.n_fft,
        h.num_mels,
        h.hop_size,
        h.win_size,
        h.sampling_rate,
        h.fmin,
        h.fmax,
        n_cache_reuse=0,
        shuffle=False if h.num_gpus > 1 else True,
        fmax_loss=h.fmax_for_loss,
        device=device,
        fine_tuning=a.fine_tuning,
        base_mels_path=a.input_mels_dir,
    )
    print(f"train dataset size:{len(trainset)}")
    train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None

    train_loader = DataLoader(
        trainset,
        num_workers=h.num_workers,
        shuffle=False,
        sampler=train_sampler,
        batch_size=h.batch_size,
        pin_memory=True,
        drop_last=True,
    )

    if rank == 0:
        validset = MelDataset(
            validation_filelist,
            h.segment_size,
            h.n_fft,
            h.num_mels,
            h.hop_size,
            h.win_size,
            h.sampling_rate,
            h.fmin,
            h.fmax,
            False,
            False,
            n_cache_reuse=0,
            fmax_loss=h.fmax_for_loss,
            device=device,
            fine_tuning=a.fine_tuning,
            base_mels_path=a.input_mels_dir,
        )
        print(f"valid dataset size:{len(validset)}")
        validation_loader = DataLoader(
            validset,
            num_workers=1,
            shuffle=False,
            sampler=None,
            batch_size=1,
            pin_memory=True,
            drop_last=True,
        )

        sw = SummaryWriter(os.path.join(a.checkpoint_path, "logs"))

    generator.train()
    mpd.train()
    msd.train()
    for epoch in range(max(0, last_epoch), a.training_epochs):
        if rank == 0:
            start = time.time()
            print("Epoch: {}".format(epoch + 1))

        if h.num_gpus > 1:
            train_sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            if rank == 0:
                start_b = time.time()
            x, y, _, y_mel = batch

            x = torch.autograd.Variable(x.to(device, non_blocking=True))
            y = torch.autograd.Variable(y.to(device, non_blocking=True))
            y_mel = torch.autograd.Variable(y_mel.to(device,
                                                     non_blocking=True))
            y = y.unsqueeze(1)

            y_g_hat = generator(x)
            y_g_hat_mel = mel_spectrogram(
                y_g_hat.squeeze(1),
                h.n_fft,
                h.num_mels,
                h.sampling_rate,
                h.hop_size,
                h.win_size,
                h.fmin,
                h.fmax_for_loss,
            )

            optim_d.zero_grad()

            # MPD
            y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach())
            loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss(
                y_df_hat_r, y_df_hat_g)

            # MSD
            y_ds_hat_r, y_ds_hat_g, _, _ = msd(y, y_g_hat.detach())
            loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(
                y_ds_hat_r, y_ds_hat_g)

            loss_disc_all = loss_disc_s + loss_disc_f

            loss_disc_all.backward()
            optim_d.step()

            # Generator
            optim_g.zero_grad()

            # L1 Mel-Spectrogram Loss
            loss_mel = F.l1_loss(y_mel, y_g_hat_mel) * 45

            y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = mpd(y, y_g_hat)
            y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = msd(y, y_g_hat)
            loss_fm_f = feature_loss(fmap_f_r, fmap_f_g)
            loss_fm_s = feature_loss(fmap_s_r, fmap_s_g)
            loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g)
            loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g)
            loss_gen_all = loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel

            loss_gen_all.backward()
            optim_g.step()

            if rank == 0:
                # STDOUT logging
                if steps % a.stdout_interval == 0:
                    with torch.no_grad():
                        mel_error = F.l1_loss(y_mel, y_g_hat_mel).item()

                    print(
                        "Steps : {:d}, Gen Loss Total : {:4.3f}, Mel-Spec. Error : {:4.3f}, s/b : {:4.3f}"
                        .format(steps, loss_gen_all, mel_error,
                                time.time() - start_b))
                    wandb.log(
                        {
                            "loss/Gen Loss Total": loss_gen_all,
                            "loss/Mel-Spec. Error": mel_error,
                        },
                        step=steps,
                    )

                # checkpointing
                if steps % a.checkpoint_interval == 0 and steps != 0:
                    # generator
                    checkpoint_path = "{}/g_{:08d}".format(
                        a.checkpoint_path, steps)
                    save_checkpoint(
                        checkpoint_path,
                        {
                            "generator": (generator.module if h.num_gpus > 1
                                          else generator).state_dict()
                        },
                    )
                    checkpoint_name = "g_{:08d}".format(steps)
                    wandb.save(checkpoint_name)
                    # also save as latest
                    checkpoint_path = "{}/g_latest".format(a.checkpoint_path)
                    save_checkpoint(
                        checkpoint_path,
                        {
                            "generator": (generator.module if h.num_gpus > 1
                                          else generator).state_dict()
                        },
                    )
                    wandb.save("g_latest")
                    # discriminator
                    checkpoint_path = "{}/do_{:08d}".format(
                        a.checkpoint_path, steps)
                    save_checkpoint(
                        checkpoint_path,
                        {
                            "mpd": (mpd.module
                                    if h.num_gpus > 1 else mpd).state_dict(),
                            "msd": (msd.module
                                    if h.num_gpus > 1 else msd).state_dict(),
                            "optim_g":
                            optim_g.state_dict(),
                            "optim_d":
                            optim_d.state_dict(),
                            "steps":
                            steps,
                            "epoch":
                            epoch,
                        },
                    )
                    checkpoint_name = "do_{:08d}".format(steps)
                    wandb.save(checkpoint_name)
                    # also save as latest
                    checkpoint_path = "{}/do_latest".format(a.checkpoint_path)
                    save_checkpoint(
                        checkpoint_path,
                        {
                            "mpd": (mpd.module
                                    if h.num_gpus > 1 else mpd).state_dict(),
                            "msd": (msd.module
                                    if h.num_gpus > 1 else msd).state_dict(),
                            "optim_g":
                            optim_g.state_dict(),
                            "optim_d":
                            optim_d.state_dict(),
                            "steps":
                            steps,
                            "epoch":
                            epoch,
                        },
                    )
                    wandb.save("do_latest")

                # Tensorboard summary logging
                if steps % a.summary_interval == 0:
                    sw.add_scalar("training/gen_loss_total", loss_gen_all,
                                  steps)
                    sw.add_scalar("training/mel_spec_error", mel_error, steps)

                # Validation
                if steps % a.validation_interval == 0:  # and steps != 0:
                    generator.eval()
                    torch.cuda.empty_cache()
                    val_err_tot = 0
                    with torch.no_grad():
                        samples_orig = []
                        samples_pred = []
                        for j, batch in enumerate(validation_loader):
                            x, y, _, y_mel = batch
                            y_g_hat = generator(x.to(device))
                            y_mel = torch.autograd.Variable(
                                y_mel.to(device, non_blocking=True))
                            y_g_hat_mel = mel_spectrogram(
                                y_g_hat.squeeze(1),
                                h.n_fft,
                                h.num_mels,
                                h.sampling_rate,
                                h.hop_size,
                                h.win_size,
                                h.fmin,
                                h.fmax_for_loss,
                            )
                            val_err_tot += F.l1_loss(y_mel, y_g_hat_mel).item()

                            if j <= 4:
                                if steps == 0:
                                    sw.add_audio(
                                        "gt/y_{}".format(j),
                                        y[0],
                                        steps,
                                        h.sampling_rate,
                                    )
                                    sw.add_figure(
                                        "gt/y_spec_{}".format(j),
                                        plot_spectrogram(x[0]),
                                        steps,
                                    )

                                    # log orig audio to wandb
                                    orig_audio = y.squeeze().cpu()
                                    samples_orig.append(
                                        wandb.Audio(
                                            orig_audio,
                                            caption=f"sample {i}",
                                            sample_rate=h.sampling_rate,
                                        ))

                                sw.add_audio(
                                    "generated/y_hat_{}".format(j),
                                    y_g_hat[0],
                                    steps,
                                    h.sampling_rate,
                                )
                                y_hat_spec = mel_spectrogram(
                                    y_g_hat.squeeze(1),
                                    h.n_fft,
                                    h.num_mels,
                                    h.sampling_rate,
                                    h.hop_size,
                                    h.win_size,
                                    h.fmin,
                                    h.fmax,
                                )
                                sw.add_figure(
                                    "generated/y_hat_spec_{}".format(j),
                                    plot_spectrogram(
                                        y_hat_spec.squeeze(0).cpu().numpy()),
                                    steps,
                                )

                                # log pred audio to wandb
                                pred_audio = y_g_hat.squeeze().cpu()
                                samples_pred.append(
                                    wandb.Audio(
                                        pred_audio,
                                        caption=f"sample {i}",
                                        sample_rate=h.sampling_rate,
                                    ))

                        val_err = val_err_tot / (j + 1)
                        sw.add_scalar("validation/mel_spec_error", val_err,
                                      steps)

                        # log audios to wandb
                        wandb.log(
                            {
                                "audio/generated": samples_pred,
                            },
                            step=steps,
                        )
                        if steps == 0:
                            wandb.log(
                                {
                                    "audio/original": samples_orig,
                                },
                                step=steps,
                            )

                    generator.train()

            steps += 1

        scheduler_g.step()
        scheduler_d.step()

        if rank == 0:
            print("Time taken for epoch {} is {} sec\n".format(
                epoch + 1, int(time.time() - start)))
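
The example above runs one process per GPU: each rank pins itself to cuda:rank and wraps its modules in DistributedDataParallel with device_ids=[rank]. The placement pattern on a toy module, as a sketch (assumes one visible GPU per rank and an already-initialized NCCL process group):

import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel


def wrap_for_rank(rank):
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)                  # one GPU per process
    model = nn.Sequential(nn.Linear(80, 256), nn.ReLU(), nn.Linear(256, 1)).to(device)
    return DistributedDataParallel(model, device_ids=[rank]), device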
Esempio n. 33
0
def train(args):
    is_distributed = len(args.hosts) > 1 and args.backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))
    use_cuda = args.num_gpus > 0
    logger.debug("Number of gpus available - {}".format(args.num_gpus))
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size)
        logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
            args.backend, dist.get_world_size()) + 'Current host rank is {}. Number of gpus: {}'.format(
            dist.get_rank(), args.num_gpus))

    # set the seed for generating random numbers
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed, **kwargs)
    test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs)

    logger.debug("Processes {}/{} ({:.0f}%) of train data".format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)
    ))

    logger.debug("Processes {}/{} ({:.0f}%) of test data".format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)
    ))

    model = Net().to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # single-machine multi-gpu case or single-machine or multi-machine cpu case
        model = torch.nn.DataParallel(model)

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                logger.info('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.sampler),
                    100. * batch_idx / len(train_loader), loss.item()))
        test(model, test_loader, device)
    save_model(model, args.model_dir)
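
_average_gradients is referenced above for the multi-machine CPU case but not shown. One common way to write such a helper is sketched below; this is an assumption about its shape, not the original implementation:

import torch.distributed as dist


def _average_gradients(model):
    # sum each gradient across all processes, then divide by the world size
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size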
Esempio n. 34
0
def init_dist(args):
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank)
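
init_dist above reads args.local_rank and relies on the env:// rendezvous, which matches the contract of the torch.distributed.launch helper: the launcher starts one copy of the script per GPU, passes --local_rank to each and exports MASTER_ADDR/MASTER_PORT (torchrun sets LOCAL_RANK in the environment instead). A minimal way to drive it on a single node (the script name is a placeholder):

#   python -m torch.distributed.launch --nproc_per_node=4 train.py

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
# init_dist(args)   # the helper defined in the example above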
Esempio n. 35
0
def main_worker(process_id, ngpus_per_node, args):
    args.gpu = process_id
    model = Net()
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.1)

    if args.multiprocessing_distributed:
        args.rank = args.rank_start + process_id
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)
    print("Process {} in node {} has been started.".format(
        args.rank, args.node))

    torch.cuda.set_device(args.gpu)
    model.cuda(args.gpu)
    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[args.gpu],
                                                      output_device=args.gpu)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        get_mnist(istrain=True), num_replicas=None, rank=None)
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        get_mnist(istrain=False), num_replicas=None, rank=None)

    # different form: the DataLoader itself does not shuffle; shuffling is handled by the DistributedSampler
    train_loader = torch.utils.data.DataLoader(get_mnist(istrain=True),
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    test_loader = torch.utils.data.DataLoader(get_mnist(istrain=False),
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True,
                                              sampler=test_sampler)
    acc_train = AverageMeter()
    acc_test = AverageMeter()
    best_acc = 0
    for epoch in range(5):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        #train epoch
        for i, (input, target) in enumerate(train_loader):
            input = input.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)
            output = model(input)
            loss = criterion(output, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            acc = accuracy(output, target)
            acc_train.update(acc[0], n=input.size(0))

        #test epoch
        for i, (input, target) in enumerate(test_loader):
            input = input.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)
            with torch.no_grad():
                output = model(input)
            acc = accuracy(output, target)
            acc_test.update(acc[0], n=input.size(0))

        # the reported accuracies differ across ranks because each rank sees a different shard of the data
        print("rank {} process after epoch {}: train_acc {:.2f}, val_acc {:.2f}".
              format(args.rank, epoch, acc_train.avg.item(),
                     acc_test.avg.item()))
        is_best = acc_test.avg.item() > best_acc
        best_acc = max(acc_test.avg.item(), best_acc)
        acc_train.reset()
        acc_test.reset()

        # save once per node
        if args.rank % args.gpu_use == 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
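
# main_worker above is written to be spawned once per GPU. A minimal launcher sketch,
# assuming args already carries dist_url, dist_backend, rank_start, node, world_size,
# distributed, gpu_use, batch_size and workers (this main() is hypothetical, not part
# of the original example):
import torch.multiprocessing as mp

def main(args):
    ngpus_per_node = torch.cuda.device_count()
    # One process per local GPU; each process derives its global rank inside main_worker.
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
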
Esempio n. 36
0
def main():
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.arch=='alexnet':
        model = model_list.alexnet(pretrained=args.pretrained)
        input_size = 227
    else:
        raise Exception('Model not supported yet')

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    if not os.path.exists(args.data+'/imagenet_mean.binaryproto'):
        print("==> Data directory"+args.data+"does not exits")
        print("==> Please specify the correct data path by")
        print("==>     --data <DATA_PATH>")
        return

    normalize = transforms.Normalize(
            meanfile=args.data+'/imagenet_mean.binaryproto')

    train_dataset = datasets.ImageFolder(
        args.data,
        transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
            transforms.RandomSizedCrop(input_size),
        ]),
        Train=True)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(args.data, transforms.Compose([
            transforms.ToTensor(),
            normalize,
            transforms.CenterCrop(input_size),
        ]),
        Train=False),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    print(model)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer' : optimizer.state_dict(),
        }, is_best)
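
# save_checkpoint is referenced above but not shown. A minimal sketch of the usual
# pattern, assuming a serializable state dict and an is_best flag (the file names are
# placeholders, not taken from the original example):
import shutil

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        # Keep a separate copy of the best-performing checkpoint.
        shutil.copyfile(filename, 'model_best.pth.tar')
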
Esempio n. 37
0
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    if args.network_name == 'vgg19':
        model = vgg19_bn(pretrained=False)
    elif args.network_name == 'vgg16':
        model = vgg16_bn(pretrained=False)
    elif 'resnet' in args.network_name:
        model = models.__dict__[args.arch](pretrained=False)
    else:
        raise NotImplementedError

    # Initialize network
    for layer in model.modules():
        if isinstance(layer, nn.Conv2d) or isinstance(layer, nn.Linear):
            if args.init == 'normal_kaiming':
                nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')
            elif args.init == 'normal_kaiming_fout':
                nn.init.kaiming_normal_(layer.weight,
                                        nonlinearity='relu',
                                        mode='fan_out')
            elif args.init == 'normal_xavier':
                nn.init.xavier_normal_(layer.weight)
            elif args.init == 'orthogonal':
                nn.init.orthogonal_(layer.weight)
            else:
                raise ValueError(
                    f"Unrecognised initialisation parameter {args.init}")

    model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    #############################
    ####    Pruning code     ####
    #############################

    pruning_factor = args.pruning_factor
    keep_masks = []
    filename = ''
    if pruning_factor != 1:
        print(f'Pruning network iteratively for {args.num_steps} steps')
        keep_masks = iterative_pruning(model,
                                       train_loader,
                                       device,
                                       pruning_factor,
                                       prune_method=args.prune_method,
                                       num_steps=args.num_steps,
                                       mode=args.mode,
                                       num_batches=args.num_batches)

        apply_prune_mask(model, keep_masks)

    # File where to save training history
    run_name = (args.network_name + '_IMAGENET' + '_spars' +
                str(1 - pruning_factor) + '_variant' + str(args.prune_method) +
                '_train-frac' + str(args.frac_data_for_train) +
                f'_steps{args.num_steps}_{args.mode}' + f'_{args.init}' +
                f'_batch{args.num_batches}' + f'_rseed_{seed}')
    writer_name = 'runs/' + run_name
    writer = SummaryWriter(writer_name)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    iterations = 0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # Train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # Evaluate on validation set
        iterations = epoch * len(train_loader)
        acc1 = validate(val_loader, model, criterion, args, writer, iterations)

        # Save checkpoint
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            # Save every 5 epochs and at the final epoch.
            if (epoch + 1) % 5 == 0 or (epoch + 1) == args.epochs:
                if not os.path.exists('saved_models/'):
                    os.makedirs('saved_models/')
                save_name = 'saved_models/' + run_name + '_cross_entropy_' + str(
                    epoch + 1) + '.model'
                torch.save(model.state_dict(), save_name)
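
# adjust_learning_rate is referenced above but not defined here. A minimal sketch of
# a common step-decay schedule, assuming args.lr holds the initial learning rate
# (the 0.1 decay factor and 30-epoch interval are assumptions, not taken from this example):
def adjust_learning_rate(optimizer, epoch, args):
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
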
Esempio n. 38
0
                raise unittest.SkipTest("Compiled without the " + BACKEND + " backend")

            if skip_ok:
                # do this first so we don't give an error message about
                # mismatched exit codes if the first isn't valid
                assert first_process.exitcode == 0 \
                    or first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE \
                    or first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE \
                    or first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE

                if first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE:
                    raise unittest.SkipTest("cuda is not available")
                if first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE:
                    raise unittest.SkipTest("One unique gpu per process is not available")
                if first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE:
                    raise unittest.SkipTest("worldsize is too small to run group tests")

            self.assertEqual(first_process.exitcode, 0)

elif BACKEND == 'mpi':
    WORLD_SIZE = os.environ['WORLD_SIZE']
    dist.init_process_group(init_method=INIT_METHOD, backend='mpi')

    class TestMPI(TestCase, _DistTestBase):
        pass

if __name__ == '__main__':
    assert not torch.cuda._initialized, "test_distributed must not have initialized CUDA context on main process"

    unittest.main()
Esempio n. 39
0
def train(args):
    world_size = len(args.hosts)
    is_distributed = world_size > 1
    logger.debug('Number of hosts {}. Distributed training - {}'.format(world_size, is_distributed))
    use_cuda = args.num_gpus > 0
    logger.debug('Number of gpus available - {}'.format(args.num_gpus))
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device('cuda' if use_cuda else 'cpu')

    if is_distributed:
        # Initialize the distributed environment.
        backend = 'gloo'
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        dist.init_process_group(backend=backend, rank=host_rank, world_size=world_size)
        logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
            backend, dist.get_world_size()) + 'Current host rank is {}. Is cuda available: {}. Number of gpus: {}'.format(
            dist.get_rank(), torch.cuda.is_available(), args.num_gpus))

    # set the seed for generating random numbers
    seed = 1
    torch.manual_seed(seed)
    if use_cuda:
        torch.cuda.manual_seed(seed)

    train_sampler, train_loader = _get_train_data_loader(args.data_dir, is_distributed, args.batch_size, **kwargs)
    test_loader = _get_test_data_loader(args.data_dir, **kwargs)

    logger.debug('Processes {}/{} ({:.0f}%) of train data'.format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)
    ))

    logger.debug('Processes {}/{} ({:.0f}%) of test data'.format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)
    ))

    model = Net().to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        logger.debug('Multi-machine multi-gpu: using DistributedDataParallel.')
        model = torch.nn.parallel.DistributedDataParallel(model)
    elif use_cuda:
        # single-machine multi-gpu case
        logger.debug('Single-machine multi-gpu: using DataParallel.')
        model = torch.nn.DataParallel(model)
    else:
        # single-machine or multi-machine cpu case
        logger.debug('Single-machine/multi-machine cpu: using DataParallel.')
        model = torch.nn.DataParallel(model)

    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.5)

    log_interval = 100
    for epoch in range(1, args.epochs + 1):
        if is_distributed:
            train_sampler.set_epoch(epoch)
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % log_interval == 0:
                logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.sampler),
                    100. * batch_idx / len(train_loader), loss.item()))
        accuracy = test(model, test_loader, device)
    save_model(model, args.model_dir)

    logger.debug('Overall test accuracy: {}'.format(accuracy))
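
# _get_train_data_loader above returns both the sampler and the loader. A minimal
# sketch, assuming an MNIST-style dataset stored under data_dir (the dataset choice
# and normalization statistics are assumptions, not taken from the original example):
from torchvision import datasets, transforms

def _get_train_data_loader(data_dir, is_distributed, batch_size, **kwargs):
    dataset = datasets.MNIST(data_dir, train=True, download=False,
                             transform=transforms.Compose([
                                 transforms.ToTensor(),
                                 transforms.Normalize((0.1307,), (0.3081,)),
                             ]))
    sampler = (torch.utils.data.distributed.DistributedSampler(dataset)
               if is_distributed else None)
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         shuffle=(sampler is None),
                                         sampler=sampler,
                                         **kwargs)
    return sampler, loader
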
Esempio n. 40
0
def init_process_group(world_size, rank):
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:12345',
                            world_size=world_size,
                            rank=rank)
Esempio n. 41
0
def process(rank, world_size, train_pairs, test_pairs, resume):

    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    dist.init_process_group("nccl", rank=rank, world_size=world_size)

    device = rank

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_pairs, num_replicas=world_size, rank=rank, shuffle=False)

    # dataset_train = DataLoader(train_pairs, batch_size=BATCH_SIZE,
    #                            shuffle=True, num_workers=NUM_WORKERS,
    #                            collate_fn=collate_function,
    #                            pin_memory=True)

    dataset_train = DataLoader(train_pairs,
                               batch_size=BATCH_SIZE,
                               shuffle=False,
                               num_workers=NUM_WORKERS,
                               collate_fn=collate_function,
                               pin_memory=True,
                               sampler=train_sampler)

    dataset_test = DataLoader(test_pairs,
                              batch_size=BATCH_SIZE,
                              shuffle=False,
                              num_workers=NUM_WORKERS,
                              collate_fn=collate_function,
                              drop_last=True,
                              pin_memory=True)

    model = TransformerSTT(**model_parameters)
    # model = nn.DataParallel(model)
    model = model.to(device)
    model = DDP(model, find_unused_parameters=True, device_ids=[rank])
    # print(str(model))
    learning_rate = LR
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_criterion = nn.CTCLoss(zero_infinity=True)
    train_step = 0

    model, optimizer, train_step, writer = resume_training(
        resume, model, optimizer, train_step, rank)

    scaler = GradScaler()

    loss_list = list()
    wer_list = list()

    for epoch in range(NUM_EPOCH):
        model.train()
        for data in tqdm(dataset_train):
            mel_tensor, jamo_code_tensor, mel_lengths, jamo_lengths, mel_transformer_mask, speakers = data

            # speaker_code = speaker_table.speaker_name_to_code(speakers)

            with autocast():
                output_tensor = model((
                    mel_tensor.to(device),
                    mel_transformer_mask.to(device),
                ))

                output_tensor = output_tensor.permute(
                    1, 0, 2)  # (N, S, E) => (T, N, C)

                loss = loss_criterion(output_tensor,
                                      jamo_code_tensor.to(device),
                                      (mel_lengths // 8).to(device),
                                      jamo_lengths.to(device))

            optimizer.zero_grad()
            # loss.backward()
            # optimizer.step()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            train_step += 1

            if rank == 0:

                decoded_input_text = KOREAN_TABLE.decode_jamo_code_tensor(
                    jamo_code_tensor)
                decoded_input_text = KOREAN_TABLE.decode_ctc_prediction(
                    decoded_input_text)
                decoded_output_text = KOREAN_TABLE.decode_jamo_prediction_tensor(
                    output_tensor)
                decoded_output_str = KOREAN_TABLE.decode_ctc_prediction(
                    decoded_output_text)

                wer = KOREAN_TABLE.caculate_wer(decoded_input_text,
                                                decoded_output_str)
                wer_list.append(wer)
                loss_list.append(loss.item())

                if len(loss_list) >= LOGGING_STEPS:
                    writer.add_scalar('ctc_loss/train', np.mean(loss_list),
                                      train_step)
                    decoded_pairs = [
                        f'** {in_text} \n\n -> {out_text} \n\n => {final_output} \n\n'
                        for (in_text, out_text, final_output) in zip(
                            decoded_input_text, decoded_output_text, decoded_output_str)]
                    writer.add_text('text_result/train',
                                    '\n\n'.join(decoded_pairs), train_step)
                    writer.add_scalar('WER/train', np.mean(wer_list),
                                      train_step)
                    logging_image = mel_tensor_to_plt_image(
                        mel_tensor, decoded_input_text, train_step)
                    writer.add_image('input_spectrogram/train', logging_image,
                                     train_step)
                    print(f'Train Step {train_step}')
                    loss_list = list()
                    wer_list = list()

                if train_step % CHECKPOINT_STEPS == 0:
                    save_checkpoint(model, optimizer, train_step,
                                    writer.logdir, KEEP_LAST_ONLY)

            # break

        if rank == 0:

            loss_test_list = list()
            wer_test_list = list()

            model.eval()
            for data in tqdm(dataset_test):
                mel_tensor, jamo_code_tensor, mel_lengths, jamo_lengths, mel_transformer_mask, speakers = data

                with autocast():
                    output_tensor = model((
                        mel_tensor.to(device),
                        mel_transformer_mask.to(device),
                    ))

                    output_tensor = output_tensor.permute(
                        1, 0, 2)  # (N, S, E) => (T, N, C)

                    loss = loss_criterion(output_tensor,
                                          jamo_code_tensor.to(device),
                                          (mel_lengths // 8).to(device),
                                          jamo_lengths.to(device))

                loss_test_list.append(loss.item())

                decoded_input_text = KOREAN_TABLE.decode_jamo_code_tensor(
                    jamo_code_tensor)
                decoded_input_text = KOREAN_TABLE.decode_ctc_prediction(
                    decoded_input_text)
                decoded_output_text = KOREAN_TABLE.decode_jamo_prediction_tensor(
                    output_tensor)
                decoded_output_str = KOREAN_TABLE.decode_ctc_prediction(
                    decoded_output_text)
                wer = KOREAN_TABLE.caculate_wer(decoded_input_text,
                                                decoded_output_str)
                wer_test_list.append(wer)

            decoded_pairs = [
                f'** {in_text} \n\n -> {out_text} \n\n => {final_output} \n\n'
                for (in_text, out_text, final_output) in zip(
                    decoded_input_text, decoded_output_text, decoded_output_str)]
            writer.add_scalar('ctc_loss/test', np.mean(loss_test_list),
                              train_step)
            writer.add_scalar('WER/test', np.mean(wer_test_list), train_step)
            writer.add_text('text_result/test', '\n\n'.join(decoded_pairs),
                            train_step)
            logging_image = mel_tensor_to_plt_image(mel_tensor,
                                                    decoded_input_text,
                                                    train_step)
            writer.add_image('input_spectrogram/test', logging_image,
                             train_step)
def _train(args):
    is_distributed = len(args.hosts) > 1 and args.dist_backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        dist.init_process_group(backend=args.dist_backend, rank=host_rank, world_size=world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
                args.dist_backend,
                dist.get_world_size()) + 'Current host rank is {}. Using cuda: {}. Number of gpus: {}'.format(
                dist.get_rank(), torch.cuda.is_available(), args.num_gpus))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info("Device Type: {}".format(device))

    logger.info("Loading Cifar10 dataset")
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    trainset = torchvision.datasets.CIFAR10(root=args.data_dir, train=True,
                                            download=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers)

    testset = torchvision.datasets.CIFAR10(root=args.data_dir, train=False,
                                           download=False, transform=transform)
    test_loader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size,
                                              shuffle=False, num_workers=args.workers)

    logger.info("Model loaded")
    model = Net()

    if torch.cuda.device_count() > 1:
        logger.info("Gpu count: {}".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)

    model = model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(0, args.epochs):
        running_loss = 0.0
        for i, data in enumerate(train_loader):
            # get the inputs
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
    print('Finished Training')
    return _save_model(model, args.model_dir)
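
# _save_model is referenced above but not shown. A minimal sketch, assuming the model
# should be written as a plain state dict under model_dir (the file name 'model.pth'
# is an assumption):
import os

def _save_model(model, model_dir):
    path = os.path.join(model_dir, 'model.pth')
    # Unwrap DataParallel so the checkpoint can be loaded without it later.
    state_dict = model.module.state_dict() if hasattr(model, 'module') else model.state_dict()
    torch.save(state_dict, path)
    return path
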
Esempio n. 43
0
def main():
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()
    print(model)
######################
    if args.arch.startswith('vgg'):
        mod = list(model.classifier.children())
        mod.pop()
        mod.append(torch.nn.Linear(4096, 30))
        new_classifier = torch.nn.Sequential(*mod)
        model.classifier = new_classifier
        
        
    elif args.arch.startswith('alexnet'):
        mod = list(model.classifier.children())
        mod.pop()
        mod.append(torch.nn.Linear(4096, 30))
        new_classifier = torch.nn.Sequential(*mod)
        model.classifier = new_classifier
    else:
        model.fc = torch.nn.Linear(2048, 30)
    
    print(model)
    ########################


    # mod = list(model.children())
    # mod.pop()
    # new_classifier = torch.nn.Sequential(*mod)
    # model = new_classifier
    # model.cuda()
    class MyResNet(nn.Module):

        def __init__(self, model):
            super(MyResNet, self).__init__()
            self.conv1 = model.conv1
            self.bn1 = model.bn1
            self.relu = model.relu
            self.maxpool = model.maxpool
            self.layer1 = model.layer1
            self.layer2 = model.layer2
            self.layer3 = model.layer3
            self.layer4 = model.layer4
            self.avgpool = model.avgpool
            #self.Net_classifier=Net_classifier


        def forward(self, x):
            x = self.conv1(x)
            x = self.bn1(x)
            x = self.relu(x)
            x = self.maxpool(x)

            x = self.layer1(x)
            x = self.layer2(x)
            x = self.layer3(x)
            x = self.layer4(x)

            x = self.avgpool(x)
            #x = x.view(x.size(0), -1)
            #x = self.Net_classifier(x)
            return x

    my_model=MyResNet(model)


    class SiameseNetwork(nn.Module):
        def __init__(self):
            super(SiameseNetwork, self).__init__()
            self.cnn1 = my_model

        def forward_once(self, x):
            output = self.cnn1(x)
            #output = output.view(output.size()[0], -1)
            #output = self.fc1(output)
            return output

        def forward(self, input1):
            output1 = self.forward_once(input1)
            #output2 = self.forward_once(input2)
            #return output1, output2
            return output1
#############################################
    model = SiameseNetwork().cuda()
    print(model)
    # define loss function (criterion) and optimizer
    # criterion = nn.CrossEntropyLoss().cuda()

    # optimizer = torch.optim.SGD(model.parameters(), args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            #args.start_epoch = checkpoint['epoch']
            #best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint)
            #optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint succss")
            #      .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
######################
    # logpath_val='./log/test/'
    # txtname='test.csv'
    # if not os.path.exists(logpath_val):
    #     os.makedirs(logpath_val)
    # if os.path.exists(logpath_val+txtname):
    #     os.remove(logpath_val+txtname)
    # f_val=file(logpath_val+txtname,'a+')
#################
    testdir='/home/zq610/WYZ/JD_contest/test/test_B/'
    cudnn.benchmark = True

        
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([
        transforms.Scale([224, 224]),
        transforms.ToTensor(),
        normalize,
    ])
####################################################
    #class_index=[1,10,11,12,13,14,15,16,17,18,19,2,20,21,22,23,24,25,26,27,28,29,3,30,4,5,6,7,8,9]
    model.eval()
    test_list = get_files(testdir)
    num=len(test_list)
    featuresall=np.zeros((num,2048))
    images = np.array([])
    for i, item in enumerate(test_list):
        print('Processing %i of %i (%s)' % (i+1, len(test_list), item))
        test_image_name=item.split('/')[-1].split('.')[0]
        test_image = transform(Image.open(item))
        input_var = torch.autograd.Variable(test_image, volatile=True).unsqueeze(0).cuda()##[1,3,224,224]
        output=model(input_var)
        features=output.data[0][:,:,0].t().cpu().numpy()###  feature vector: [1*2048]
        featuresall[i,:]=features
    np.save(args.data+'../test_B_resnet50.npy',featuresall)

    for classes in range(30):
        valdir = os.path.join(args.data,str(classes+1))
        test_list = get_files(valdir)
        num=len(test_list)
        featuresall=np.zeros((num,2048))
        images = np.array([])
        for i, item in enumerate(test_list):
            print('Processing %i of %i (%s)' % (i+1, len(test_list), item))
            test_image_name=item.split('/')[-1].split('.')[0]
            test_image = transform(Image.open(item))
            input_var = torch.autograd.Variable(test_image, volatile=True).unsqueeze(0).cuda()##[1,3,224,224]
            output=model(input_var)
            features=output.data[0][:,:,0].t().cpu().numpy()###  feature vector: [1*2048]
            featuresall[i,:]=features
        np.save(args.data+str(classes+1)+'_resnet50.npy',featuresall)
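
# get_files is referenced above but not defined here. A minimal sketch, assuming it
# simply collects image paths inside a directory (the extension list is an assumption):
import glob
import os

def get_files(directory):
    files = []
    for pattern in ('*.jpg', '*.jpeg', '*.png'):
        files.extend(glob.glob(os.path.join(directory, pattern)))
    return sorted(files)
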
Esempio n. 44
0
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
        sys.stdout.flush()

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        sys.stdout.flush()

        if args.arch == 'fc_resnet50':
            model = fc_resnet50(num_classes=20, pretrained=False)

        else:
            model = models.__dict__[args.arch](pretrained=True)
            if args.arch in ['resnet18']:
                model.fc = torch.nn.Linear(512, len(CLASSES))
            elif 'squeezenet' in args.arch:
                model.classifier[1] = nn.Conv2d(512, len(CLASSES), (1, 1),
                                                (1, 1))
                model.num_classes = len(CLASSES)
            elif 'resnet' in args.arch:
                model.fc = torch.nn.Linear(2048, len(CLASSES))
    else:
        print("=> creating model '{}'".format(args.arch))
        sys.stdout.flush()
        model = models.__dict__[args.arch](num_classes=len(CLASSES))
    #ipdb.set_trace()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    #criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    criterion = nn.BCEWithLogitsLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    sys.stdout.flush()
    cudnn.benchmark = True

    # Data loading code
    #traindir = os.path.join(args.data, 'train')
    #valdir = os.path.join(args.data, 'test')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    #train_dataset = datasets.ImageFolder(
    #    traindir,
    #    transforms.Compose([
    #        transforms.RandomResizedCrop(224),
    #        #transforms.Resize((224,224)),
    #        transforms.RandomHorizontalFlip(),
    #        transforms.ToTensor(),
    #        normalize,
    #    ]))

    img_transform = transforms.Compose([
        #transforms.RandomResizedCrop(224),
        transforms.Resize((448, 448)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    train_dataset = Voc2007Classification(args.dir_datasets,
                                          'train',
                                          transform=img_transform)
    val_dataset = Voc2007Classification(args.dir_datasets,
                                        'val',
                                        transform=img_transform)
    test_dataset = Voc2007Classification(args.dir_datasets,
                                         'test',
                                         transform=img_transform)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True)

    #val_loader = torch.utils.data.DataLoader(
    #    datasets.ImageFolder(valdir, transforms.Compose([
    #        transforms.Resize(256),
    #        transforms.CenterCrop(224),
    #        transforms.ToTensor(),
    #        normalize,
    #    ])),
    #    batch_size=256, shuffle=False,
    #    num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        #validate(val_loader, model, criterion, args, writer=writer)
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        #train(train_loader, model, criterion, optimizer, epoch, args, writer, logdir)
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        #acc1 = validate(val_loader, model, criterion, args, writer, logdir)
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                name=args.name)
Esempio n. 45
0
def run_ddp_parity_two_optim(rank, world_size, backend, temp_file_name):
    url = "file://" + temp_file_name
    dist.init_process_group(init_method=url,
                            backend=backend,
                            rank=rank,
                            world_size=world_size)
    device = torch.device("cuda")
    torch.cuda.set_device(rank)
    torch.manual_seed(rank)
    np.random.seed(rank)  # Any model works. Add one different buffer per rank

    model = Sequential(Linear(2, 3), Linear(3, 3), Linear(3, 3), Linear(3, 3),
                       Linear(3, 3), Linear(3, 3))
    model.register_buffer("test_buffer", torch.ones((1)) * rank)
    model.to(device)
    n_half_params = len(list(model.parameters())) // 2

    sharded_optimizer = OSS(params=list(model.parameters())[:n_half_params],
                            optim=torch.optim.SGD,
                            lr=1e-3,
                            momentum=0.99)
    sharded_optimizer_2 = OSS(params=list(model.parameters())[n_half_params:],
                              optim=torch.optim.SGD,
                              lr=1e-3,
                              momentum=0.99)

    sharded_ddp_model = ShardedDataParallel(
        module=model,
        sharded_optimizer=sharded_optimizer,
        broadcast_buffers=True)

    ddp_model_single = copy.deepcopy(model)
    ddp_optimizer = torch.optim.SGD(list(
        ddp_model_single.parameters())[:n_half_params],
                                    lr=1e-3,
                                    momentum=0.99)
    ddp_optimizer_2 = torch.optim.SGD(list(
        ddp_model_single.parameters())[n_half_params:],
                                      lr=1e-3,
                                      momentum=0.99)
    ddp_model = DDP(ddp_model_single,
                    device_ids=[rank],
                    broadcast_buffers=True)

    def check_same_model_params():
        for pg, ddp_pg in zip(sharded_optimizer.param_groups,
                              ddp_optimizer.param_groups):
            for p, ddp_p in zip(pg["params"], ddp_pg["params"]):
                assert torch.allclose(
                    p, ddp_p, atol=1e-3
                ), f"Model parameters differ in between DDP and ShardedDDP {p} {ddp_p}"
        for b, ddp_b in zip(sharded_ddp_model.buffers(), ddp_model.buffers()):
            assert torch.allclose(
                b, ddp_b, atol=1e-3
            ), "Model buffers differ in between DDP and ShardedDDP"

    # The models should stay the same in between the ranks
    check_same_model_params()

    for i in range(20):
        input_tensor = torch.rand((64, 2)).to(device)

        # Run DDP
        ddp_optimizer.zero_grad()
        ddp_optimizer_2.zero_grad()
        ddp_loss = ddp_model(input_tensor).abs().sum()
        ddp_loss.backward()
        ddp_optimizer.step()
        ddp_optimizer_2.step()

        # Run Sharded
        sharded_optimizer.zero_grad()
        sharded_optimizer_2.zero_grad()
        sharded_loss = sharded_ddp_model(input_tensor).abs().sum()
        sharded_loss.backward()
        sharded_optimizer.step()
        sharded_optimizer_2.step()
        check_same_model_params()

    dist.destroy_process_group()
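
# A minimal sketch of how run_ddp_parity_two_optim might be driven: one process per
# rank, all sharing a temporary file for the file:// rendezvous (the world size and
# backend below are assumptions, not taken from the original test):
import tempfile
import torch.multiprocessing as mp

def launch_ddp_parity_two_optim(world_size=2, backend="nccl"):
    temp_file_name = tempfile.mkstemp()[1]
    mp.spawn(run_ddp_parity_two_optim,
             args=(world_size, backend, temp_file_name),
             nprocs=world_size,
             join=True)
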
Esempio n. 46
0
def run(args):

    world_size, rank = None, None
    if args.dist_backend in ['mpi', 'ddl']:
        lrank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
        dist.init_process_group(backend=args.dist_backend,
                                init_method='env://')
        world_size = dist.get_world_size()
        rank = dist.get_rank()
        args.distributed = True

    # Note: lrank is only set in the mpi/ddl branch above, so this assumes a distributed launch.
    device = torch.device("cuda:{}".format(lrank))
    # cudnn.benchmark = True
    # torch.cuda.set_device(args.local_rank)

    # # will read env master_addr master_port world_size
    # torch.distributed.init_process_group(backend='nccl', init_method="env://")
    # args.world_size = dist.get_world_size()
    # args.rank = dist.get_rank()
    # # args.local_rank = int(os.environ.get('LOCALRANK', args.local_rank))
    # args.total_batch_size = (args.batch_size) * dist.get_world_size()

    global resume_iter
    """model_log"""
    input_size_r = list(args.input_size)
    delta_r = list(args.delta)
    """model_construction"""
    model = FFN(in_channels=4,
                out_channels=1,
                input_size=args.input_size,
                delta=args.delta,
                depth=args.depth).to(device)
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = torch.nn.parallel.DistributedDataParallel(model)

    torch.cuda.set_enabled_lms(True)
    """data_load"""
    if args.resume is not None:
        model.load_state_dict(torch.load(args.resume))

    abs_path_training_data = args.train_data_dir
    entries_train_data = Path(abs_path_training_data)
    files_train_data = []

    for entry in entries_train_data.iterdir():
        files_train_data.append(entry.name)

    sorted_files_train_data = natsort.natsorted(files_train_data,
                                                reverse=False)

    files_total = len(sorted_files_train_data)

    input_h5data_dict = {}
    train_dataset_dict = {}
    train_loader_dict = {}
    batch_it_dict = {}
    train_sampler_dict = {}

    for index in range(files_total):
        input_h5data_dict[index] = [
            (abs_path_training_data + sorted_files_train_data[index])
        ]
        print(input_h5data_dict[index])
        train_dataset_dict[index] = BatchCreator(input_h5data_dict[index],
                                                 args.input_size,
                                                 delta=args.delta,
                                                 train=True)
        train_sampler_dict[
            index] = torch.utils.data.distributed.DistributedSampler(
                train_dataset_dict[index],
                num_replicas=world_size,
                rank=rank,
                shuffle=True)
        train_loader_dict[index] = DataLoader(
            train_dataset_dict[index],
            num_workers=0,
            sampler=train_sampler_dict[index],
            pin_memory=True)
        batch_it_dict[index] = get_batch(
            train_loader_dict[index], args.batch_size, args.input_size,
            partial(fixed_offsets, fov_moves=train_dataset_dict[index].shifts))

    best_loss = np.inf
    """optimizer"""
    t_last = time.time()
    cnt = 0
    tp = fp = tn = fn = 0

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    #optimizer = optim.SGD(model.parameters(), lr=1e-3) #momentum=0.9
    #optimizer = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1)
    #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.step, gamma=args.gamma, last_epoch=-1)
    """train_loop"""
    while cnt < args.iter:
        cnt += 1

        Num_of_train_data = len(input_h5data_dict)
        index_rand = random.randrange(0, Num_of_train_data, 1)

        seeds, images, labels, offsets = next(batch_it_dict[index_rand])
        #print(sorted_files_train_data[index_rand])
        #seeds = seeds.cuda(non_blocking=True)
        images = images.to(device)
        labels = labels.to(device)
        #offsets = offsets.cuda(non_blocking=True)

        t_curr = time.time()

        #labels = labels.cuda()

        torch_seed = torch.from_numpy(seeds).to(device)
        input_data = torch.cat([images, torch_seed], dim=1)
        #input_data = Variable(input_data.to(device))

        logits = model(input_data)
        updated = torch_seed + logits

        optimizer.zero_grad()
        loss = F.binary_cross_entropy_with_logits(updated, labels)
        loss.backward()

        #torch.nn.utils.clip_grad_value_(model.parameters(), args.clip_grad_thr)
        optimizer.step()

        seeds[...] = updated.detach().cpu().numpy()

        pred_mask = (updated >= logit(0.9)).detach().cpu().numpy()
        true_mask = (labels > 0.5).cpu().numpy()
        true_bg = np.logical_not(true_mask)
        pred_bg = np.logical_not(pred_mask)
        tp += (true_mask & pred_mask).sum()
        fp += (true_bg & pred_mask).sum()
        fn += (true_mask & pred_bg).sum()
        tn += (true_bg & pred_bg).sum()
        precision = 1.0 * tp / max(tp + fp, 1)
        recall = 1.0 * tp / max(tp + fn, 1)
        accuracy = 1.0 * (tp + tn) / (tp + tn + fp + fn)
        if rank == 0:
            print(
                '[Iter_{}:, loss: {:.4}, Precision: {:.2f}%, Recall: {:.2f}%, Accuracy: {:.2f}%]\r'
                .format(cnt, loss.item(), precision * 100, recall * 100,
                        accuracy * 100))

        #scheduler.step()
        """model_saving_(best_loss)"""
        """
        if best_loss > loss.item() or t_curr - t_last > args.interval:
            tp = fp = tn = fn = 0
            t_last = t_curr
            best_loss = loss.item()
            input_size_r = list(args.input_size)
            delta_r = list(args.delta)
            torch.save(model.state_dict(), os.path.join(args.save_path,
                                                        'ffn_model_fov:{}_delta:{}_depth:{}.pth'.format(input_size_r[0],
                                                                                                        delta_r[0],
                                                                                                        args.depth)))
            print('Precision: {:.2f}%, Recall: {:.2f}%, Accuracy: {:.2f}%, Model saved!'.format(
                precision * 100, recall * 100, accuracy * 100))
        """
        """model_saving_(iter)"""

        if (cnt % args.save_interval) == 0 and rank == 0:
            tp = fp = tn = fn = 0
            #t_last = t_curr
            #best_loss = loss.item()
            input_size_r = list(args.input_size)
            delta_r = list(args.delta)
            torch.save(
                model.state_dict(),
                os.path.join(
                    args.save_path,
                    (str(args.stream) +
                     'ffn_model_fov:{}_delta:{}_depth:{}_recall{}.pth'.format(
                         input_size_r[0], delta_r[0], args.depth,
                         recall * 100))))
            print(
                'Precision: {:.2f}%, Recall: {:.2f}%, Accuracy: {:.2f}%, Model saved!'
                .format(precision * 100, recall * 100, accuracy * 100))
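
# logit is used above as the detection threshold but is not shown here. A minimal
# sketch, assuming it is the usual inverse sigmoid, so comparing raw network outputs
# against logit(0.9) is equivalent to thresholding predicted probabilities at 0.9:
import numpy as np

def logit(p):
    return np.log(p / (1.0 - p))
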
Esempio n. 47
0
def run_one_step(rank, world_size, backend, device, temp_file_name):
    url = "file://" + temp_file_name
    dist.init_process_group(init_method=url,
                            backend=backend,
                            rank=rank,
                            world_size=world_size)
    if device == torch.device("cuda"):
        torch.cuda.set_device(rank)

    torch.manual_seed(rank)
    np.random.seed(rank)

    def check(broadcast_buffers: bool,
              grad_accumulation: bool = False) -> None:
        # Any model works. Add one different buffer per rank
        model = Sequential(Linear(2, 3), Linear(3, 3), Linear(3, 3),
                           Linear(3, 3), Linear(3, 3), Linear(3, 3))
        model.register_buffer("test_buffer", torch.ones((1)) * rank)
        model.to(device)

        optimizer = OSS(params=model.parameters(),
                        optim=torch.optim.SGD,
                        lr=0.01,
                        momentum=0.99)
        ddp_model = ShardedDataParallel(model,
                                        optimizer,
                                        broadcast_buffers=broadcast_buffers)

        def check_same_model_params(same_params: bool):
            # Check that all the params are the same on all ranks
            # This should be true with and without broadcast_buffers, we don't have any real buffer here
            receptacle: List[torch.Tensor] = []

            if dist.get_backend() != "nccl":
                for pg in optimizer.param_groups:
                    for p in pg["params"]:
                        # Check the params
                        receptacle = [p.clone() for _ in range(world_size)
                                      ] if rank == 0 else []
                        dist.gather(p, receptacle, dst=0)
                        if rank == 0:
                            for sync_p in receptacle[1:]:
                                if same_params:
                                    assert torch.all(
                                        torch.eq(receptacle[0], sync_p)
                                    ), "Models differ in between ranks"
                                else:
                                    assert not torch.all(
                                        torch.eq(receptacle[0], sync_p)
                                    ), "Gradients should not have been synced"

                # Check that all the buffers are in sync (authoritative rank is 0, its buffer is 0)
                if broadcast_buffers:
                    for b in ddp_model.buffers():
                        receptacle = [b.clone() for _ in range(world_size)] if rank == 0 else []
                        dist.gather(b, receptacle, dst=0)
                        if rank == 0:
                            for sync_b in receptacle[1:]:
                                if same_params:
                                    assert torch.all(
                                        torch.eq(receptacle[0], sync_b)
                                    ), "Buffers differ in between ranks"
                                else:
                                    assert not torch.all(
                                        torch.eq(receptacle[0], sync_b)
                                    ), "Buffers should not have been synced"

                        assert b.cpu().item() == 0.0

        # The model should be synchronized in between the ranks at ShardedDataParallel construction time, check that
        check_same_model_params(same_params=True)

        # Optim loop
        def closure():
            optimizer.zero_grad()

            with ddp_model.no_sync() if grad_accumulation else suppress():
                input_tensor = torch.rand((64, 2)).to(device)
                loss = ddp_model(input_tensor).abs().sum()
                loss.backward()
            return loss

        # The models should stay the same in between the ranks
        for i in range(5):
            _ = optimizer.step(closure=closure)
            # when running on cpu/gloo the "nodes" are not really different
            same_params = device == torch.device("cpu") or grad_accumulation
            check_same_model_params(same_params=same_params)

    check(broadcast_buffers=False)
    check(broadcast_buffers=True)
    check(broadcast_buffers=False, grad_accumulation=True)
    check(broadcast_buffers=True, grad_accumulation=True)
    dist.destroy_process_group()
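
`run_one_step` is written to be called once per rank with a shared file for rendezvous; a minimal launcher sketch, assuming a machine with two GPUs (the world size, backend and device below are assumptions, not part of the original test):

import tempfile

import torch
import torch.multiprocessing as mp

if __name__ == "__main__":
    world_size = 2  # assumed: one process per GPU on a 2-GPU box
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        temp_file_name = tmp.name  # file:// rendezvous shared by all ranks
    mp.spawn(run_one_step,
             args=(world_size, "nccl", torch.device("cuda"), temp_file_name),
             nprocs=world_size,
             join=True)
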
                    help='Initial kl coefficient.')

parser.add_argument('--gamma',
                    type=float,
                    default=0.995,
                    help='Discount factor.')
parser.add_argument('--lam',
                    type=float,
                    default=1,
                    help='Bias-variance tradeoff parameter.')
args = parser.parse_args()

if not os.path.exists(args.log):
    os.mkdir(args.log)

dist.init_process_group(backend=args.backend)

if dist.get_rank() == 0:
    with open(os.path.join(args.log, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))

if not os.path.exists(os.path.join(args.log, 'models')):
    os.mkdir(os.path.join(args.log, 'models'))

np.random.seed(args.seed)
torch.manual_seed(args.seed)

args.cuda = args.local_rank

device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda)
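
With the process group initialized and `device_name` resolved, the typical next step is to move the model to that device and wrap it in DistributedDataParallel; a minimal sketch (the two-layer model is a placeholder, not the original script's network):

import torch
import torch.nn as nn

device = torch.device(device_name)
model = nn.Sequential(nn.Linear(8, 64), nn.Linear(64, 2)).to(device)  # placeholder model
if args.cuda >= 0:
    # One GPU per process: pin DDP to the local device.
    model = nn.parallel.DistributedDataParallel(model, device_ids=[args.cuda])
else:
    model = nn.parallel.DistributedDataParallel(model)
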
Example n. 49
0
                                              args=(rank,))
            process.start()
            return process

        def _run(self, rank):
            self.rank = rank
            try:
                dist.init_process_group(backend=BACKEND)
            except RuntimeError as e:
                if 'recompile' in e.args[0]:
                    sys.exit(0)
            # self.id() == e.g. '__main__.TestDistributed.test_get_rank'
            # We're retrieving the corresponding test and executing it.
            getattr(self, self.id().split(".")[2])()
            sys.exit(0)

        def _join_and_reduce(self):
            for p in self.processes:
                p.join(self.JOIN_TIMEOUT)
                self.assertEqual(p.exitcode, 0)

elif BACKEND == 'mpi':
    dist.init_process_group(backend='mpi')

    class TestMPI(TestCase, _DistTestBase):
        pass


if __name__ == '__main__':
    unittest.main()
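
The helper that launches each test process is cut off at the top of this snippet; a plausible sketch of such a per-rank launcher, consistent with the visible tail (`args=(rank,)`, `process.start()`, `return process`). The class and method names here are assumptions:

import multiprocessing


class _ProcessSpawnerSketch:
    def _run(self, rank):
        raise NotImplementedError  # provided by the real test base class

    def _spawn_process(self, rank):
        # One child process per rank, each executing self._run(rank).
        process = multiprocessing.Process(target=self._run, args=(rank,))
        process.start()
        return process
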
        world_size = int(os.getenv('SLURM_NTASKS'))

    if 'OMPI_COMM_WORLD_RANK' in os.environ:
        global_rank = int(os.getenv('OMPI_COMM_WORLD_RANK'))
    else:
        global_rank = int(os.getenv('SLURM_PROCID'))
    return global_rank, world_size

if __name__ == "__main__":
    # Don't change the following:
    global_rank, world_size = get_dist_env()
    
    hostname = socket.gethostname()

    # You have to run dist.init_process_group to initialize the distributed environment.
    # Always use NCCL as the backend; Gloo performance is pretty bad and MPI is currently
    # unsupported (for a number of reasons).
    dist.init_process_group(backend='nccl', rank=global_rank, world_size=world_size)

    # now run your distributed training code
    run(global_rank, world_size, hostname)
    
    # The following is just for demo purposes (and a sanity check);
    # you don't need it:
    if global_rank > 0:
        print(f'printing NCCL variables on {global_rank} on {hostname}')
        for key in os.environ:
            if key.find('NCCL')>-1:
                print(f'{hostname} {key} : {os.getenv(key)}')
    
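
`get_dist_env()` is only visible from its tail above; a sketch of how such a helper typically reads the OpenMPI/SLURM environment (`OMPI_COMM_WORLD_SIZE` is an assumption; the other variables appear in the snippet), plus a reminder that the env:// default of `init_process_group` also needs the rendezvous variables set by the job script:

import os

def get_dist_env():
    # World size: prefer OpenMPI's variable, fall back to SLURM.
    if 'OMPI_COMM_WORLD_SIZE' in os.environ:
        world_size = int(os.getenv('OMPI_COMM_WORLD_SIZE'))
    else:
        world_size = int(os.getenv('SLURM_NTASKS'))

    # Global rank: same preference order.
    if 'OMPI_COMM_WORLD_RANK' in os.environ:
        global_rank = int(os.getenv('OMPI_COMM_WORLD_RANK'))
    else:
        global_rank = int(os.getenv('SLURM_PROCID'))
    return global_rank, world_size

# Because init_process_group is called without init_method, it falls back to env://,
# so MASTER_ADDR / MASTER_PORT must be exported before launch, e.g. in the job script:
#   export MASTER_ADDR=<hostname of rank 0>
#   export MASTER_PORT=29500
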
Example n. 51
0
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
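
`main_worker` receives `(gpu, ngpus_per_node, args)`, which matches the usual pattern of spawning one worker per local GPU from `main()`; a minimal launcher sketch (`parser` comes from the surrounding script, and the exact flag handling may differ from the original):

import torch
import torch.multiprocessing as mp

def main():
    args = parser.parse_args()
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # world_size initially counts nodes; expand it to the total number of processes.
        args.world_size = ngpus_per_node * args.world_size
        # One worker per GPU; each call receives its local GPU index as `gpu`.
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        main_worker(args.gpu, ngpus_per_node, args)
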
Example n. 52
0
def main():
    global args, best_prec1
    args = parser.parse_args()
    print(args)

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch](low_dim=args.low_dim)

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolderInstance(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)
    # train_loader = None
    val_loader = torch.utils.data.DataLoader(datasets.ImageFolderInstance(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # define lemniscate and loss function (criterion)
    ndata = len(train_dataset)
    if args.nce_k > 0:
        lemniscate = NCEAverage(args.low_dim, ndata, args.nce_k, args.nce_t,
                                args.nce_m).cuda()
        criterion = NCECriterion(ndata).cuda()
    else:
        lemniscate = LinearAverage(args.low_dim, ndata, args.nce_t,
                                   args.nce_m).cuda()
        criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            # pdb.set_trace()
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint.get('best_prec1', 0)
            model.load_state_dict(checkpoint['state_dict'])
            lemniscate = checkpoint['lemniscate']
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    if args.evaluate:
        kNN(0, model, lemniscate, train_loader, val_loader, 200, args.nce_t,
            1)  #recompute memory
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, lemniscate, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = NN(epoch, model, lemniscate, train_loader, val_loader)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'lemniscate': lemniscate,
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
    # evaluate KNN after last epoch
    kNN(0, model, lemniscate, train_loader, val_loader, 200, args.nce_t)
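
`adjust_learning_rate(optimizer, epoch)` is called above but not shown; a common step-decay sketch (the 10x-every-30-epochs schedule is an assumption, not necessarily the original):

def adjust_learning_rate(optimizer, epoch):
    # Assumed schedule: decay the base learning rate by 10x every 30 epochs.
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
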
Example n. 53
0
def main():
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(
            backend=args.dist_backend,
            init_method=args.dist_url,
            world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            ## model.cuda()
        else:
            model = torch.nn.DataParallel(model) ##.cuda()
    else:
        ## model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss() ##.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(traindir,
                                         transforms.Compose([
                                             transforms.RandomResizedCrop(224),
                                             transforms.RandomHorizontalFlip(),
                                             transforms.ToTensor(),
                                             normalize,
                                         ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers,
        pin_memory=True,
        sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir,
                             transforms.Compose([
                                 transforms.Resize(256),
                                 transforms.CenterCrop(224),
                                 transforms.ToTensor(),
                                 normalize,
                             ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)
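
`save_checkpoint(state, is_best)` appears in the last few examples without a definition; the conventional helper looks like the following sketch (the file names are assumptions):

import shutil
import torch

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    # Always keep the latest checkpoint; copy it aside when it is the best so far.
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')
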
Example n. 54
0
                            group=node_handle[node_idx])

        for param in model.parameters():
            dist.broadcast(param.grad.data,
                           local_root,
                           group=node_handle[node_idx])

            optim.step()
            total_loss += loss.data.item()
            n_samples += (output.size(0) * data.m)

    #print('Here ? - 4')
    return reduce_loss(total_loss, n_samples)


dist.init_process_group('mpi')

rank = dist.get_rank()
wsize = dist.get_world_size()
world_size = wsize

parser = argparse.ArgumentParser(description='PyTorch Time series forecasting')
parser.add_argument('--data',
                    type=str,
                    required=True,
                    help='location of the data file')
parser.add_argument('--model', type=str, default='LSTNet', help='')
parser.add_argument('--hidCNN',
                    type=int,
                    default=100,
                    help='number of CNN hidden units')