def create_resnet():
    # Build network
    import keras_resnet_single as networks
    resnet = networks.ResNet.build(
        len(channels), resblocks, [16, 32],
        (125 * granularity, 125 * granularity, len(channels)),
        granularity)

    # Load saved weights, if indicated
    if args.load_epoch != 0:
        directory = args.save_dir
        if args.save_dir == '':
            directory = expt_name
        model_name = glob.glob('../MODELS/%s/epoch%02d-*.hdf5' % (directory, args.load_epoch))[0]
        # assert len(model_name) == 2
        # model_name = model_name[0].split('.hdf5')[0] + '.hdf5'
        print('Loading weights from file:', model_name)
        resnet.load_weights(model_name)

    # opt = keras.optimizers.Adam(lr=lr_init, epsilon=1.e-5)  # changed eps to match pytorch value
    # opt = keras.optimizers.SGD(lr=lr_init * hvd.size())
    opt = NovoGrad(learning_rate=lr_init * hvd.size())

    # Wrap the optimizer in a Horovod distributed optimizer, so gradients are
    # computed and averaged via hvd.DistributedOptimizer().
    opt = hvd.DistributedOptimizer(opt)

    # For Horovod: we specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients.
    # resnet.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'],
    #                experimental_run_tf_function=False)
    # resnet.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

    resnet.summary()
    return resnet
def create_model(resume_from_epoch):
    if resume_from_epoch > 0:
        # Restore from a previous checkpoint, if initial_epoch is specified.
        model = keras.models.load_model(args.checkpoint_format.format(epoch=resume_from_epoch))
    else:
        # Set up standard WideResNet-16-10 model.
        model = WideResidualNetwork(depth=16, width=10,
                                    input_shape=input_shape, classes=num_classes,
                                    dropout_rate=0.01)

        # The WideResNet model that is included with Keras is optimized for inference.
        # Add L2 weight decay & adjust BN settings.
        model_config = model.get_config()
        for layer, layer_config in zip(model.layers, model_config['layers']):
            if hasattr(layer, 'kernel_regularizer'):
                regularizer = keras.regularizers.l2(args.wd)
                layer_config['config']['kernel_regularizer'] = \
                    {'class_name': regularizer.__class__.__name__,
                     'config': regularizer.get_config()}
            if type(layer) == keras.layers.BatchNormalization:
                layer_config['config']['momentum'] = 0.9
                layer_config['config']['epsilon'] = 1e-5
        model = keras.models.Model.from_config(model_config)

    # Step 8: Scale the learning rate by the number of workers.
    # opt = keras.optimizers.SGD(lr=args.base_lr * hvd.size(), momentum=args.momentum)
    # Step 10: Use the NovoGrad optimizer instead of SGD.
    opt = NovoGrad(learning_rate=args.base_lr * hvd.size())

    # Step 3: Wrap the optimizer in a Horovod distributed optimizer.
    opt = hvd.DistributedOptimizer(opt)

    # For Horovod: we specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients.
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt,
                  metrics=['accuracy'],
                  experimental_run_tf_function=False)

    return model
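# A minimal sketch (not from the original script) of the Horovod boilerplate that
# create_model() above assumes: hvd.init(), one GPU pinned per worker, and callbacks
# that broadcast the initial state from rank 0 and average metrics. `train_data` and
# `args.epochs` are placeholders; `args.checkpoint_format` and `resume_from_epoch`
# come from the code above.
import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

# Pin each worker process to a single GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

callbacks = [
    # Start all workers from identical weights.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    # Average validation metrics across workers at the end of each epoch.
    hvd.callbacks.MetricAverageCallback(),
]
# Only rank 0 writes checkpoints, so workers do not overwrite each other's files.
if hvd.rank() == 0:
    callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint_format))

model = create_model(resume_from_epoch)
model.fit(train_data,
          initial_epoch=resume_from_epoch,
          epochs=args.epochs,
          callbacks=callbacks,
          verbose=1 if hvd.rank() == 0 else 0)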
def create_model():
    # Set up standard WideResNet-16-10 model.
    model = WideResidualNetwork(depth=16, width=10, weights=None,
                                input_shape=input_shape, classes=num_classes,
                                dropout_rate=0.01)

    # The WideResNet model that is included with Keras is optimized for inference.
    # Add L2 weight decay & adjust BN settings.
    model_config = model.get_config()
    for layer, layer_config in zip(model.layers, model_config['layers']):
        if hasattr(layer, 'kernel_regularizer'):
            regularizer = keras.regularizers.l2(args.wd)
            layer_config['config']['kernel_regularizer'] = \
                {'class_name': regularizer.__class__.__name__,
                 'config': regularizer.get_config()}
        if type(layer) == keras.layers.BatchNormalization:
            layer_config['config']['momentum'] = 0.9
            layer_config['config']['epsilon'] = 1e-5
    model = keras.models.Model.from_config(model_config)

    if args.novo_grad:
        opt = NovoGrad(lr=args.base_lr)
    else:
        opt = keras.optimizers.SGD(lr=args.base_lr, momentum=args.momentum)

    # Wrap the optimizer in a Horovod distributed optimizer.
    opt = hvd.DistributedOptimizer(opt)

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt,
                  metrics=['accuracy'])

    return model
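# The create_model() variants above read their hyper-parameters from a module-level
# `args` object. A minimal argparse sketch covering the flags they reference; the
# flag names are inferred from the code above and the defaults are placeholders.
import argparse

parser = argparse.ArgumentParser(description='WideResNet-16-10 training (Horovod + NovoGrad)')
parser.add_argument('--base-lr', type=float, default=0.01, help='learning rate per worker')
parser.add_argument('--momentum', type=float, default=0.9, help='SGD momentum')
parser.add_argument('--wd', type=float, default=5e-4, help='L2 weight decay for conv/dense kernels')
parser.add_argument('--novo-grad', action='store_true', help='use NovoGrad instead of SGD')
parser.add_argument('--checkpoint-format', default='checkpoint-{epoch}.h5',
                    help='checkpoint path template used when resuming')
args = parser.parse_args()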
elif args.optimizer.lower() == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'sgdwm':
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'radam':
    optimizer = RAdam(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'lars':  # no tensorboardX
    optimizer = LARS(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'lamb':
    optimizer = Lamb(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=args.lr, weight_decay=0.0001)
else:
    optimizer = optim.SGD(model.parameters(), lr=0.01)

optname = args.optimizer if len(sys.argv) >= 2 else 'sgd'
# log = open(optname + 'log.txt', 'w+')
log = None

criterion = nn.CrossEntropyLoss()
model, optimizer, _ = training_loop(model, criterion, optimizer, train_loader,
                                    valid_loader, N_EPOCHS, DEVICE, log)

with open('lbloss/' + optname + str(args.lr) + '_loss.txt', 'w+') as myfile:
    optimizer = optim.RMSprop(net.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':  # no tensorboardX
    from lars import LARS
    optimizer = LARS(net.parameters(), lr=args.lr, momentum=args.momentum,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer = Lamb(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'dyna':
    from dyna import Dyna
    optimizer = Dyna(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
else:
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)

# lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
# lr_scheduler = LambdaLR(optimizer, lrs)
# lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)

# Each optimizer step accumulates gradients from args.batch_size // 256 mini-batches,
# and OneCycleLR is stepped once per accumulated (effective) batch.
batch_acumulate = args.batch_size // 256
batch_per_step = len(trainloader) // batch_acumulate + int(len(trainloader) % batch_acumulate > 0)
lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, args.max_lr, steps_per_epoch=batch_per_step, epochs=args.num_epoch,
    div_factor=args.div_factor, final_div_factor=args.final_div, pct_start=args.pct_start)
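# A hypothetical training loop matching the accumulation variables above: gradients
# from `batch_acumulate` mini-batches of 256 are accumulated into one optimizer step,
# and OneCycleLR is stepped once per optimizer step (batch_per_step times per epoch).
# `net`, `criterion`, `trainloader`, and `device` are assumed to be defined as in the
# surrounding script; the loop itself is not part of the original snippet.
for epoch in range(args.num_epoch):
    net.train()
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        loss = criterion(net(inputs), targets)
        # Scale so the accumulated gradient is the mean over the effective batch.
        (loss / batch_acumulate).backward()
        if (i + 1) % batch_acumulate == 0 or (i + 1) == len(trainloader):
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()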
def main(lr=0.1):
    global best_acc
    args.lr = lr
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10', train=True,
                                            download=True, transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                              shuffle=True, num_workers=2)

    testset = torchvision.datasets.CIFAR10(root='/tmp/cifar10', train=False,
                                           download=True, transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset, batch_size=100,
                                             shuffle=False, num_workers=2)

    classes = ('plane', 'car', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck')

    # Model
    print('==> Building model..')
    # Alternatives: VGG('VGG19'), ResNet18(), PreActResNet18(), GoogLeNet(), DenseNet121(),
    # ResNeXt29_2x64d(), MobileNet(), MobileNetV2(), DPN92(), ShuffleNetG2(), SENet18(),
    # ShuffleNetV2(1), EfficientNetB0(), RegNetX_200MF()
    net = ResNet50()
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    ckpt = './checkpoint/' + args.optimizer + str(lr) + '_ckpt.pth'

    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(ckpt)
        net.load_state_dict(checkpoint['net'])
        best_acc = checkpoint['acc']
        start_epoch = checkpoint['epoch']

    criterion = nn.CrossEntropyLoss()

    if args.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'sgdwm':
        optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adam':
        optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(), lr=args.lr, momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adagrad':
        optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'radam':
        from radam import RAdam
        optimizer = RAdam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lars':  # no tensorboardX
        from lars import LARS
        optimizer = LARS(net.parameters(), lr=args.lr, momentum=args.momentum,
                         weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lamb':
        from lamb import Lamb
        optimizer = Lamb(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'novograd':
        from novograd import NovoGrad
        optimizer = NovoGrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    else:
        optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)

    # lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
    # lr_scheduler = LambdaLR(optimizer, lrs)
    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)

    train_acc = []
    valid_acc = []

    # Training
    def train(epoch):
        print('\nEpoch: %d' % epoch)
        net.train()
        train_loss = 0
        correct = 0
        total = 0
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            print(batch_idx)
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        print(100. * correct / total)
        train_acc.append(correct / total)

    def test(epoch):
        global best_acc
        net.eval()
        test_loss = 0
        correct = 0
        total = 0
        print('test')
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print(batch_idx)
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = net(inputs)
                loss = criterion(outputs, targets)
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

        # Save checkpoint.
        acc = 100. * correct / total
        print(acc)
        valid_acc.append(correct / total)
        if acc > best_acc:
            print('Saving..')
            state = {
                'net': net.state_dict(),
                'acc': acc,
                'epoch': epoch,
            }
            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(state, ckpt)
            best_acc = acc

    for epoch in range(200):
        if epoch in args.lr_decay:
            # Reload the best checkpoint so far, then decay the learning rate by 10x
            # and rebuild the optimizer at the new rate.
            checkpoint = torch.load(ckpt)
            net.load_state_dict(checkpoint['net'])
            best_acc = checkpoint['acc']
            args.lr *= 0.1
            if args.optimizer.lower() == 'sgd':
                optimizer = optim.SGD(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'sgdwm':
                optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                                      weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'adam':
                optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'rmsprop':
                optimizer = optim.RMSprop(net.parameters(), lr=args.lr, momentum=args.momentum,
                                          weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'adagrad':
                optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'radam':
                from radam import RAdam
                optimizer = RAdam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'lars':  # no tensorboardX
                optimizer = LARS(net.parameters(), lr=args.lr, momentum=args.momentum,
                                 weight_decay=args.weight_decay, dampening=args.damping)
            elif args.optimizer.lower() == 'lamb':
                optimizer = Lamb(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'novograd':
                optimizer = NovoGrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
            else:
                optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                                      weight_decay=args.weight_decay)
        train(epoch)
        test(epoch)

    with open(args.optimizer + str(lr) + 'log.json', 'w+') as f:
        json.dump([train_acc, valid_acc], f)
    return best_acc
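# The commented-out MultiStepLR line above points to a simpler alternative to the
# decay logic in main(): instead of reloading the best checkpoint and rebuilding the
# optimizer at every decay epoch, a scheduler can shrink the learning rate in place.
# A sketch of that variant (it drops the "restore best weights at each decay point"
# behaviour of the original loop):
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_decay, gamma=0.1)
for epoch in range(200):
    train(epoch)
    test(epoch)
    lr_scheduler.step()  # multiplies every param-group lr by 0.1 at each milestone epoch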
                          weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':  # no tensorboardX
    from lars import LARS
    optimizer = LARS(model.parameters(), lr=args.base_lr,
                     momentum=args.momentum, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer = Lamb(model.parameters(), lr=args.base_lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(model.parameters(), lr=args.base_lr, weight_decay=args.weight_decay)
    lr_scheduler = [
        optim.lr_scheduler.CosineAnnealingLR(optimizer, 3 * len(train_loader), 1e-4)
    ]
else:
    optimizer = optim.SGD(model.parameters(), lr=args.base_lr,
                          momentum=args.momentum, weight_decay=args.weight_decay)

if use_kfac:
    preconditioner = kfac.KFAC(
        model,
        lr=args.base_lr,
# test_data = test_dataset(dataset.skip(train_sz + valid_sz).take(test_sz),
#                          start=train_sz + valid_sz, end=train_sz + valid_sz + test_sz)
'''

print("\n Timestamp: " + str(tf.cast(tf.timestamp(), tf.float64)))

with tf.device('/gpu:0'):
    daliop = dali_tf.DALIIterator()

    shapes = [(BATCH_SZ, 125, 125, 8), (BATCH_SZ, 2)]
    dtypes = [tf.float32, tf.int32]

    # Create TF dataset
    out_dataset = dali_tf.DALIDataset(pipeline=pipe, batch_size=BATCH_SZ,
                                      shapes=shapes, dtypes=dtypes, device_id=0)

    opt = NovoGrad(learning_rate=lr_init * hvd.size())

    # Wrap the optimizer in a Horovod distributed optimizer, so gradients are
    # computed and averaged via hvd.DistributedOptimizer().
    opt = hvd.DistributedOptimizer(opt)

    resnet.compile(optimizer=opt, loss='binary_crossentropy',
                   metrics=['accuracy'], experimental_run_tf_function=False)

    # Train using the DALI dataset
    history = resnet.fit(
        out_dataset,
        steps_per_epoch=train_sz // (BATCH_SZ * hvd.size()),
        epochs=epochs,
        callbacks=callbacks_list,
        verbose=verbose,
        x = self.pool(F.relu(self.conv2(x)))
        x = self.dropout1(x)
        x = x.view(-1, 12 * 12 * 64)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        x = F.log_softmax(x, dim=1)
        return x


model = Net()
criterion = nn.NLLLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)
# optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.001)
optimizer = NovoGrad(model.parameters(), lr=0.01, weight_decay=0.001)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 3 * len(trainloader), 1e-4)

epochs = 3
for epoch in range(epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader, 0):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        x = self.dropout1(x)
        x = x.view(batch_size, -1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        x = F.log_softmax(x, dim=1)
        return x


model = Net()
# The network returns log-probabilities (log_softmax), so pair it with NLLLoss;
# CrossEntropyLoss would apply log_softmax a second time.
criterion = nn.NLLLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)
# optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.001)
optimizer = NovoGrad(model.parameters(), lr=0.01, grad_averaging=True, weight_decay=0.001)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 3 * len(trainloader), 1e-4)

epochs = 3
for epoch in range(epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader, 0):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
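# For reference, an illustrative paraphrase of the NovoGrad update rule (layer-wise
# second moments computed from gradient norms, decoupled weight decay, optional
# gradient averaging as in the grad_averaging=True flag above). This is a sketch of
# the published algorithm, not the `NovoGrad` implementation imported by these
# scripts; hyper-parameter values are left to the caller.
import torch


@torch.no_grad()
def novograd_step(params, states, lr, beta1, beta2, eps=1e-8,
                  weight_decay=0.0, grad_averaging=False):
    """Apply one NovoGrad-style update to a list of parameter tensors.

    `states` is a parallel list of dicts holding 'm' (momentum tensor) and
    'v' (scalar second moment) for each parameter tensor ("layer").
    Initialise with `states = [{} for _ in params]` and call once per batch
    after loss.backward().
    """
    for p, state in zip(params, states):
        if p.grad is None:
            continue
        g = p.grad
        g_norm_sq = torch.sum(g * g)
        if state.get('v') is None:
            # First step: initialise the second moment with the raw gradient norm.
            state['v'] = g_norm_sq
        else:
            state['v'] = beta2 * state['v'] + (1.0 - beta2) * g_norm_sq
        # Normalise the layer gradient by its second moment, then add decoupled weight decay.
        update = g / (state['v'].sqrt() + eps) + weight_decay * p
        if grad_averaging:
            update = (1.0 - beta1) * update
        if state.get('m') is None:
            state['m'] = update.clone()
        else:
            state['m'] = beta1 * state['m'] + update
        p.add_(state['m'], alpha=-lr)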