def dist_main(size, genotype, epochs=50, save_path='weights.pt'):
    """Train a small NetworkCIFAR on CIFAR-10 using DataParallel over several GPUs.

    Args:
        size: Number of GPUs to use; the model is replicated on CUDA
            devices ``0 .. size - 1``.
        genotype: JSON-encoded genotype string. It is decoded with
            ``json.loads`` and handed to ``NetworkCIFAR``.
        epochs: Number of training epochs. Also used as the cosine
            annealing period and as the denominator of the drop-path ramp.
        save_path: File the best-scoring model is written to via
            ``utils.save``.

    Returns:
        The highest validation accuracy returned by ``infer`` over all
        epochs (``0`` if no epoch ever exceeds 0).
    """
    genotype = json.loads(genotype)
    cudnn.enabled = True

    model = NetworkCIFAR(
        C=8,
        num_classes=10,
        layers=8,
        auxiliary=False,
        genotype=genotype,
    )
    model = torch.nn.parallel.DataParallel(
        model, device_ids=list(range(size))).cuda()

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=0.01,
        momentum=0.9,
        weight_decay=3e-4,
    )

    train_transform, valid_transform = utils._data_transforms_cifar10(
        cutout=False, cutout_length=16)
    train_data = dset.CIFAR10(
        root="data/", train=True, download=True, transform=train_transform)
    valid_data = dset.CIFAR10(
        root="data/", train=False, download=True, transform=valid_transform)

    # No custom sampler is used here, so the training set must be shuffled
    # explicitly; shuffle=False was a leftover from a DistributedSampler setup.
    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=64, shuffle=True, pin_memory=True,
        num_workers=2)
    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=64, shuffle=False, pin_memory=True,
        num_workers=2)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)

    max_acc = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch))
        # Drop-path probability ramps linearly from 0 towards 0.2.
        drop_path_prob = 0.2 * epoch / epochs

        train_acc, train_obj = train(
            train_queue, model, criterion, optimizer, drop_path_prob)
        print('train_acc {}'.format(train_acc))

        valid_acc, valid_obj = infer(
            valid_queue, model, criterion, drop_path_prob)
        print('valid_acc {}'.format(valid_acc))

        # Checkpoint only on improvement so the saved weights correspond to
        # the accuracy that is returned.
        if valid_acc > max_acc:
            max_acc = valid_acc
            utils.save(model, save_path)

        # Advance the schedule after the epoch's optimizer steps; stepping
        # first would skip the initial learning rate on PyTorch >= 1.1.
        scheduler.step()

    return max_acc
def main(genotype, index, epochs=50, save_path='weights.pt'):
    """Train a NetworkCIFAR on CIFAR-10 on a single GPU.

    Args:
        genotype: JSON-encoded genotype string. It is decoded with
            ``json.loads`` and handed to ``NetworkCIFAR``.
        index: CUDA device index selected with ``torch.cuda.set_device``.
        epochs: Number of training epochs. Also used as the cosine
            annealing period and as the denominator of the drop-path ramp.
        save_path: File the best-scoring model is written to via
            ``utils.save``.

    Returns:
        The highest validation accuracy returned by ``infer`` over all
        epochs (``0`` if no epoch ever exceeds 0).
    """
    genotype = json.loads(genotype)
    torch.cuda.set_device(index)
    cudnn.enabled = True

    model = NetworkCIFAR(
        C=36,
        num_classes=10,
        layers=20,
        auxiliary=False,
        genotype=genotype,
    ).cuda()

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=0.025,
        momentum=0.9,
        weight_decay=3e-4,
    )

    train_transform, valid_transform = utils._data_transforms_cifar10(
        cutout=False, cutout_length=16)
    train_data = dset.CIFAR10(
        root="data/", train=True, download=True, transform=train_transform)
    valid_data = dset.CIFAR10(
        root="data/", train=False, download=True, transform=valid_transform)

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=32, shuffle=True, pin_memory=True,
        num_workers=2)
    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=32, shuffle=False, pin_memory=True,
        num_workers=2)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)

    max_acc = 0
    for epoch in range(epochs):
        # Read the learning rate straight from the optimizer: it is the value
        # actually used this epoch, and avoids calling scheduler.get_lr()
        # outside of step().
        current_lr = optimizer.param_groups[0]['lr']
        print('epoch {} lr {}'.format(epoch, current_lr))
        print('epoch {}'.format(epoch))

        # Drop-path probability ramps linearly from 0 towards 0.2.
        drop_path_prob = 0.2 * epoch / epochs

        train_acc, train_obj = train(
            train_queue, model, criterion, optimizer, drop_path_prob)
        print('train_acc {}'.format(train_acc))

        valid_acc, valid_obj = infer(
            valid_queue, model, criterion, drop_path_prob)
        print('valid_acc {}'.format(valid_acc))

        # Checkpoint only on improvement so the saved weights correspond to
        # the accuracy that is returned.
        if valid_acc > max_acc:
            max_acc = valid_acc
            utils.save(model, save_path)

        # Advance the schedule after the epoch's optimizer steps; stepping
        # first would skip the initial learning rate on PyTorch >= 1.1.
        scheduler.step()

    return max_acc