def run():
    print('Inside run!')
    global args
    args = parser.parse_args()
    print(args)
    # TODO: add checkpoint resume option

    # Load datasets.
    train_dataset = SketchData(root=path, train=True,
                               transform=None, target_transform=None)
    val_dataset = SketchData(root=path, train=False,
                             transform=None, target_transform=None)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.b, shuffle=True)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=args.b, shuffle=False)

    model = AlexNet()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum, weight_decay=args.wd)
    criterion = nn.CrossEntropyLoss()

    best_prec = 0
    for epoch in range(args.epochs):
        print('Epoch: ' + str(epoch))
        adjust_learning_rate(optimizer, epoch)
        print('Adjusted learning rate')
        train(train_loader, model, criterion, optimizer, epoch)
        print('Trained!')
        precision = validate(val_loader, model, criterion)
        print('Got precision!')
        # Compute is_best *before* updating best_prec; the original compared
        # against the already-updated best, so is_best could never be True.
        prec = precision.item()  # .data[0] is deprecated
        is_best = prec > best_prec
        best_prec = max(prec, best_prec)
        print('Updated best precision!')
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'best_prec1': best_prec,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, is_best)
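# The loop above assumes two helpers that aren't shown in this excerpt. These
# are minimal sketches in the style of the pytorch/examples ImageNet script;
# the real definitions may differ. save_checkpoint additionally needs
# `import shutil`.

def adjust_learning_rate(optimizer, epoch):
    """Set the LR to the initial LR decayed by 10 every 30 epochs (assumed schedule)."""
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Persist the latest state; copy it aside when it is the best so far."""
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')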
def __init__(self, context: DeepSpeedTrialContext) -> None:
    self.context = context
    self.args = AttrDict(self.context.get_hparams())

    model = AlexNet(10)
    model = PipelineModule(
        layers=join_layers(model),
        loss_fn=torch.nn.CrossEntropyLoss(),
        num_stages=self.args.pipe_parallel_size,
        partition_method=self.args.part,
        activation_checkpoint_interval=0,
    )
    ds_config = overwrite_deepspeed_config(
        self.args.deepspeed_config,
        self.args.get("overwrite_deepspeed_args", {}))
    model_engine, optimizer, _, _ = deepspeed.initialize(
        args=self.args,
        model=model,
        model_parameters=[p for p in model.parameters() if p.requires_grad],
        config=ds_config,
    )
    self.model_engine = self.context.wrap_model_engine(model_engine)
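# PipelineModule needs the network as a flat sequence of layers rather than a
# nested nn.Module. join_layers isn't shown above; this sketch follows the
# AlexNet pipeline-parallelism example in DeepSpeedExamples, but the actual
# helper for this AlexNet may differ.

def join_layers(vision_model):
    return [
        *vision_model.features,
        vision_model.avgpool,
        lambda x: torch.flatten(x, 1),  # callables are valid pipeline stages
        *vision_model.classifier,
    ]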
class TrainNetwork:
    def __init__(self, dataset, batch_size, epochs, lr, lr_decay_epoch, momentum):
        assert dataset in ('letters', 'mnist')
        self.dataset = dataset
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.lr_decay_epoch = lr_decay_epoch
        self.momentum = momentum

        # 'letters' contains 27 classes, digits contain 10 classes.
        num_classes = 27 if dataset == 'letters' else 10

        # Load a pre-trained AlexNet, replacing the final layer's weights to
        # match the new number of output classes.
        state_dict = torch.load('./trained_models/alexnet.pth')
        state_dict['classifier.6.weight'] = torch.zeros(num_classes, 4096)
        state_dict['classifier.6.bias'] = torch.zeros(num_classes)
        self.model = AlexNet(num_classes)
        self.model.load_state_dict(state_dict)

        # Use CUDA if available.
        if torch.cuda.is_available():
            self.model.cuda()

        # Load the training dataset.
        kwargs = ({'num_workers': 1, 'pin_memory': True}
                  if torch.cuda.is_available() else {})
        self.train_loader = torch.utils.data.DataLoader(
            EMNIST('./data', dataset, download=True,
                   transform=transforms.Compose([
                       transforms.Lambda(correct_rotation),
                       transforms.Lambda(random_transform),
                       transforms.Resize((224, 224)),
                       transforms.RandomResizedCrop(224, (0.9, 1.1),
                                                    ratio=(0.9, 1.1)),
                       transforms.Grayscale(3),
                       transforms.ToTensor(),
                   ])),
            batch_size=batch_size, shuffle=True, **kwargs)

        # Optimizer and loss function.
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr,
                                   momentum=self.momentum)
        self.loss_fn = nn.CrossEntropyLoss()

    def reduce_learning_rate(self, epoch):
        """
        Reduce the learning rate by a factor of 0.1 every lr_decay_epoch epochs.

        :param epoch: Current epoch
        :return: None
        """
        lr = self.lr * (0.1 ** (epoch // self.lr_decay_epoch))
        if epoch % self.lr_decay_epoch == 0:
            print('LR is set to {}'.format(lr))
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def train(self, epoch):
        """
        Train the model for one epoch and save the result as a .pth file.

        :param epoch: Current epoch
        :return: None
        """
        self.model.train()
        train_loss = 0
        train_correct = 0
        progress = None
        for batch_idx, (data, target) in enumerate(self.train_loader):
            # Get data and label.
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()

            # Optimize using backpropagation.
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.loss_fn(output, target)
            train_loss += loss.item()  # .data[0] is deprecated
            pred = output.data.max(1, keepdim=True)[1]
            train_correct += pred.eq(target.data.view_as(pred)).sum().item()
            loss.backward()
            self.optimizer.step()

            # Print information about the current step.
            current_progress = int(100 * (batch_idx + 1) * self.batch_size
                                   / len(self.train_loader.dataset))
            # 'is not' compares identity, not value; use != for integers.
            if current_progress != progress and current_progress % 5 == 0:
                progress = current_progress
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, (batch_idx + 1) * len(data),
                    len(self.train_loader.dataset),
                    current_progress, loss.item()))

        train_loss /= (len(self.train_loader.dataset) / self.batch_size)
        train_correct = 100.0 * train_correct / len(self.train_loader.dataset)

        # Print information about the current epoch.
        print('Train Epoch: {} \tCorrect: {:3.2f}%\tAverage loss: {:.6f}'.format(
            epoch, train_correct, train_loss))

        # Save a snapshot.
        torch.save(
            {
                'model': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict()
            },
            './trained_models/{}_{}.pth'.format(self.dataset, epoch))

    def start(self):
        """
        Start training the network.

        :return: None
        """
        for epoch in range(1, self.epochs + 1):
            self.reduce_learning_rate(epoch)
            self.train(epoch)
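# The transform pipeline above references correct_rotation and random_transform,
# which aren't shown. EMNIST images are stored rotated and mirrored relative to
# MNIST, so a plausible sketch (operating on PIL images, which is what
# transforms.Lambda receives here; needs `from PIL import Image` and
# `import random`) is:

def correct_rotation(image):
    # Undo the EMNIST storage orientation: a transpose combines the
    # 90-degree rotation and horizontal flip in one step.
    return image.transpose(Image.TRANSPOSE)

def random_transform(image):
    # Light augmentation; a small random rotation is an assumed behaviour.
    return image.rotate(random.uniform(-10, 10))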
# (The condition guarding this block is not shown in the excerpt; it is
# presumably a resume flag such as `if args.resume:`.)
if args.resume:
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    checkpoint = torch.load('./checkpoint/ckpt.pth')
    net.load_state_dict(checkpoint['model'])
    best_acc = checkpoint['accuracy']
    start_epoch = checkpoint['epoch']
else:
    best_acc = 0
    start_epoch = 0

# Hyperparameters
batch_size = args.batch_size
epoch_num = args.epochs

optimizer = optim.Adam(model.parameters(), lr=args.lr,
                       weight_decay=args.weight_decay)
train_loader = DataLoader(data_train, shuffle=True, batch_size=batch_size,
                          num_workers=8)
test_loader = DataLoader(data_test, shuffle=True, batch_size=batch_size,
                         num_workers=8)

def train(epoch):
    model.train()  # set the model to training mode (matters for dropout)
    correct = 0
    loss = 0
    for batch_id, (data, label) in enumerate(train_loader):
        data = data.to(device)
        target = label.to(device)
        optimizer.zero_grad()
import torch
import torch.nn as nn
import torch.optim as optim  # needed for optim.Adam below
from torch.utils.data import DataLoader
from torchvision import transforms, utils

from data_process import MyDataset
# AlexNet is defined elsewhere in this project.

train_data = MyDataset(txt='./data/train.txt', transform=transforms.ToTensor())
train_loader = DataLoader(train_data, batch_size=50, shuffle=True)  # returns an iterable over batches
test_data = MyDataset(txt='./data/val.txt', transform=transforms.ToTensor())
test_loader = DataLoader(test_data, batch_size=50)

model = AlexNet().cuda()  # use the GPU: move the model into video memory
print(model)
# print(list(model.parameters()))
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

# Start training
for epoch in range(30):
    print('epoch {}'.format(epoch + 1))
    # Training ------------------------
    train_loss = 0
    train_accu = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.cuda(), batch_y.cuda()  # move the batch to GPU memory
        out = model(batch_x)
        loss = loss_func(out, batch_y)
        train_loss += loss.item()
def train(train_loader, eval_loader, opt):
    print('==> Start training...')
    summary_writer = SummaryWriter('./runs/' + str(int(time.time())))

    is_cuda = torch.cuda.is_available()
    model = AlexNet()
    if is_cuda:
        model = model.cuda()

    optimizer = optim.SGD(
        params=model.parameters(),
        lr=opt.base_lr,
        momentum=0.9,
    )
    criterion = nn.CrossEntropyLoss()

    best_eval_acc = -0.1
    losses = AverageMeter()
    accuracies = AverageMeter()
    global_step = 0
    for epoch in range(1, opt.epochs + 1):
        # Train. Reset the meters here, otherwise the train averages would
        # accumulate the previous epoch's eval statistics.
        model.train()
        losses.reset()
        accuracies.reset()
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            global_step += 1
            if is_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            outputs = model(inputs)
            loss = criterion(outputs, targets)
            losses.update(loss.item(), outputs.shape[0])
            summary_writer.add_scalar('train/loss', loss.item(), global_step)

            _, preds = torch.max(outputs, dim=1)
            acc = preds.eq(targets).sum().item() / len(targets)
            accuracies.update(acc)
            summary_writer.add_scalar('train/acc', acc, global_step)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'],
                                      global_step)
        print('==> Epoch: %d; Average Train Loss: %.4f; Average Train Acc: %.4f'
              % (epoch, losses.avg, accuracies.avg))

        # Eval
        model.eval()
        losses.reset()
        accuracies.reset()
        with torch.no_grad():  # no gradients needed during evaluation
            for batch_idx, (inputs, targets) in enumerate(eval_loader):
                if is_cuda:
                    inputs = inputs.cuda()
                    targets = targets.cuda()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                losses.update(loss.item(), outputs.shape[0])
                _, preds = torch.max(outputs, dim=1)
                acc = preds.eq(targets).sum().item() / len(targets)
                accuracies.update(acc)
        summary_writer.add_scalar('eval/loss', losses.avg, global_step)
        summary_writer.add_scalar('eval/acc', accuracies.avg, global_step)
        if accuracies.avg > best_eval_acc:
            best_eval_acc = accuracies.avg
            torch.save(model, './weights/best.pt')
        print('==> Epoch: %d; Average Eval Loss: %.4f; Average/Best Eval Acc: %.4f / %.4f'
              % (epoch, losses.avg, accuracies.avg, best_eval_acc))
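# AverageMeter isn't defined in this excerpt. A minimal sketch, matching the
# widely-copied helper from pytorch/examples (the snippet further below uses a
# richer variant that also takes a name and a format string):

class AverageMeter(object):
    """Computes and stores the average and current value."""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count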
def main():
    # gpus = [4, 5, 6, 7]
    gpus = [0]
    print("GPUs :", gpus)

    print("prepare data")
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_tfs = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    val_tfs = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])
    train_ds = datasets.ImageFolder('/home/gw/data/imagenet_10/train', train_tfs)
    val_ds = datasets.ImageFolder('/home/gw/data/imagenet_10/val', val_tfs)
    train_ld = torch.utils.data.DataLoader(train_ds, batch_size=256,
                                           shuffle=True, num_workers=4,
                                           pin_memory=True)
    val_ld = torch.utils.data.DataLoader(val_ds, batch_size=64,
                                         shuffle=False, num_workers=4,
                                         pin_memory=True)

    print("construct model")
    # model = ResNet50()
    # model = torchvision.models.AlexNet()
    model = AlexNet()
    # model = torch.nn.DataParallel(model, device_ids=gpus).cuda(gpus[0])
    model.cuda()

    criterion = nn.CrossEntropyLoss().cuda(gpus[0])
    optimizer = torch.optim.SGD(model.parameters(), 0.01,
                                momentum=0.875,
                                weight_decay=3.0517578125e-05)
    model.train()

    print("begin training")
    for epoch in range(0, 50):
        batch_time = AverageMeter('Time', ':6.3f')
        data_time = AverageMeter('Data', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')
        top1 = AverageMeter('Acc@1', ':6.2f')
        top5 = AverageMeter('Acc@5', ':6.2f')
        progress = ProgressMeter(len(train_ld),
                                 [batch_time, data_time, losses, top1, top5],
                                 prefix="Epoch: [{}]".format(epoch))
        end = time.time()
        for i, (images, labels) in enumerate(train_ld):
            data_time.update(time.time() - end)
            print('image shape: ', images.shape)
            print('labels shape: ', labels.shape)
            images = images.cuda(gpus[0], non_blocking=True)
            labels = labels.cuda(gpus[0], non_blocking=True)
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Measure accuracy.
            acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()
            if i % 10 == 0:
                progress.display(i)
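# accuracy() and ProgressMeter come from the pytorch/examples ImageNet script
# and aren't shown here. For reference, a sketch of accuracy() as it appears
# there:

def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for each given k."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res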
    # (The enclosing function definition is not shown in this excerpt.)
    dataset = torchvision.datasets.ImageFolder(path, transform=transformation)
    loader = torch.utils.data.DataLoader(dataset, batch_size=64,
                                         num_workers=0, shuffle=shuffle)
    return loader

# https://github.com/fastai/imagenette
# https://s3.amazonaws.com/fast-ai-imageclas/imagenette2.tgz
train_loader = train_dataset(r'data/imagenette2-320/train', shuffle=True)
val_loader = val_dataset(r'data/imagenette2-320/val', shuffle=False)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.1, patience=2, verbose=True,
    threshold=0.1, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08)

def train(epoch):
    net.train()
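# With mode='max', ReduceLROnPlateau expects a metric that should increase
# (e.g. validation accuracy) and cuts the LR by `factor` after `patience`
# epochs without improvement. A typical driving loop, assuming a validate()
# that returns accuracy (not shown in this excerpt):
#
#     for epoch in range(epochs):
#         train(epoch)
#         val_acc = validate()
#         scheduler.step(val_acc)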
    # (Tail of a DataLoader call whose beginning is not shown.)
    num_workers=2
)

####################################################################
#
# Load the model
#
####################################################################

model = AlexNet().to(device)
criterion = nn.CrossEntropyLoss()
# Stochastic gradient descent
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    weight_decay=0.0005,
    momentum=0.9,
)

def test_model(model, epoch):
    correct = 0
    total = 0
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            images, labels, _ = data
            images, labels = images.to(device), labels.to(device)
        name = k[7:]  # remove `module.`
        # name = k[9:]  # remove `module.1.`
        new_check_point[name] = v
    net.load_state_dict(new_check_point)

net = net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr,
                      momentum=0.9, weight_decay=5e-4)

# Training
# def main():
#     AV = [[] for _ in range(5)]
#     MAV = [[] for _ in range(5)]
#     with torch.no_grad():
#         for batch_idx, (inputs, targets) in enumerate(trainloader):
#             print("Processing {} batches in {}".format(batch_idx, 25000 // args.bs))
#             inputs, targets = inputs.to(device), targets.to(device)
#             outputs = net(inputs)
#             for score, t in zip(outputs, targets):
#                 if np.argmax(score) == t:
#                     AV[t].append(score.unsqueeze(dim=0))
#             for i in range(5):
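# For context: the `module.` prefix appears when a model saved under
# nn.DataParallel is reloaded without it. The loop this fragment belongs to
# usually looks like the following sketch (the checkpoint key name is an
# assumption):
#
#     new_check_point = OrderedDict()
#     for k, v in checkpoint['net'].items():
#         name = k[7:] if k.startswith('module.') else k
#         new_check_point[name] = v
#     net.load_state_dict(new_check_point)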
classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

# Model
print('==> Building model..')
net = AlexNet(5)
net = net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr,
                      momentum=0.9, weight_decay=5e-4)

# Training
def train(epoch):
    adjust_learning_rate(optimizer, epoch, args.lr)
    print('\nEpoch: %d Learning rate: %f'
          % (epoch, optimizer.param_groups[0]['lr']))
    net.train()
    train_loss = 0
    correct = 0
    total = 0
image_datasets = {
    'train': ImageFolder(train_dir, transform=data_transforms['train']),
    'val': ImageFolder(val_dir, transform=data_transforms['val']),
}
data_loader = {
    x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size,
                                   shuffle=True, num_workers=num_workers)
    for x in ['train', 'val']
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                       weight_decay=weight_decay)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True)

model = model.to(device)
model = train(model, data_loader, criterion, optimizer, scheduler,
              num_epochs=num_epoches)
n_datapoints = len(model.train_epoch_loss)
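# train() isn't shown here. Given the 'min'-mode ReduceLROnPlateau and the
# model.train_epoch_loss attribute read afterwards, its epoch loop plausibly
# looks like this sketch (all names below are assumptions):
#
#     model.train_epoch_loss = []
#     for epoch in range(num_epochs):
#         epoch_loss = ...  # one pass over data_loader['train']
#         model.train_epoch_loss.append(epoch_loss)
#         val_loss = ...    # one pass over data_loader['val']
#         scheduler.step(val_loss)  # 'min' mode: step on a loss to minimise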
def main():
    progress = default_progress()
    experiment_dir = 'experiment/miniplaces'
    # Here's our data.
    train_loader = torch.utils.data.DataLoader(
        CachedImageFolder('dataset/miniplaces/simple/train',
                          transform=transforms.Compose([
                              transforms.Resize(128),
                              transforms.RandomCrop(119),
                              transforms.RandomHorizontalFlip(),
                              transforms.ToTensor(),
                              transforms.Normalize(IMAGE_MEAN, IMAGE_STDEV)
                          ])),
        batch_size=64, shuffle=True, num_workers=6, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        CachedImageFolder('dataset/miniplaces/simple/val',
                          transform=transforms.Compose([
                              transforms.Resize(128),
                              transforms.CenterCrop(119),
                              transforms.ToTensor(),
                              transforms.Normalize(IMAGE_MEAN, IMAGE_STDEV)
                          ])),
        batch_size=512, shuffle=False, num_workers=6, pin_memory=True)

    # Create a simplified AlexNet with half resolution.
    model = AlexNet(first_layer='conv1',
                    last_layer='fc8',
                    layer_sizes=dict(fc6=2048, fc7=2048),
                    output_channels=100,
                    half_resolution=True,
                    include_lrn=False,
                    split_groups=False).cuda()

    # Use Kaiming initialization for the weights.
    for name, val in model.named_parameters():
        if 'weight' in name:
            init.kaiming_uniform_(val)
        else:
            # Init positive bias in many layers to avoid dead neurons.
            assert 'bias' in name
            init.constant_(val, 0 if any(
                name.startswith(layer)
                for layer in ['conv1', 'conv3', 'fc8']) else 1)

    # An abbreviated training schedule: 100000 batches.
    # TODO: tune these hyperparameters.
    init_lr = 0.002
    # max_iter = 40000  - 34.5% @1
    # max_iter = 50000  - 37%   @1
    # max_iter = 80000  - 39.7% @1
    # max_iter = 100000 - 40.1% @1
    max_iter = 100000
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=init_lr,
                                momentum=0.9,
                                weight_decay=0.001)
    iter_num = 0
    best = dict(val_accuracy=0.0)
    model.train()

    # Resume training if we already have a saved model.
    checkpoint_filename = 'miniplaces.pth.tar'
    best_filename = 'best_%s' % checkpoint_filename
    best_checkpoint = os.path.join(experiment_dir, best_filename)
    try_to_resume_training = False
    if try_to_resume_training and os.path.exists(best_checkpoint):
        checkpoint = torch.load(os.path.join(experiment_dir, best_filename))
        iter_num = checkpoint['iter']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best['val_accuracy'] = checkpoint['accuracy']

    def save_checkpoint(state, is_best):
        filename = os.path.join(experiment_dir, checkpoint_filename)
        ensure_dir_for(filename)
        torch.save(state, filename)
        if is_best:
            shutil.copyfile(filename,
                            os.path.join(experiment_dir, best_filename))

    def validate_and_checkpoint():
        model.eval()
        val_loss, val_acc = AverageMeter(), AverageMeter()
        for input, target in progress(val_loader):
            # Load data.
            input_var, target_var = [d.cuda(non_blocking=True)
                                     for d in [input, target]]
            # Evaluate model.
            with torch.no_grad():
                output = model(input_var)
                loss = criterion(output, target_var)
                _, pred = output.max(1)
                accuracy = (target_var.eq(pred)
                            ).data.float().sum().item() / input.size(0)
            val_loss.update(loss.data.item(), input.size(0))
            val_acc.update(accuracy, input.size(0))
            # Check accuracy.
            post_progress(l=val_loss.avg, a=val_acc.avg)
        # Save checkpoint.
        save_checkpoint(
            {
                'iter': iter_num,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'accuracy': val_acc.avg,
                'loss': val_loss.avg,
            }, val_acc.avg > best['val_accuracy'])
        best['val_accuracy'] = max(val_acc.avg, best['val_accuracy'])
        post_progress(v=val_acc.avg)

    # Here is our training loop.
    while iter_num < max_iter:
        for input, target in progress(train_loader):
            # Track the training loss/accuracy for this batch.
            train_loss, train_acc = AverageMeter(), AverageMeter()
            # Load data.
            input_var, target_var = [d.cuda(non_blocking=True)
                                     for d in [input, target]]
            # Evaluate model.
            output = model(input_var)
            loss = criterion(output, target_var)
            train_loss.update(loss.data.item(), input.size(0))
            # Perform one step of SGD.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Also check training set accuracy.
            _, pred = output.max(1)
            accuracy = (target_var.eq(pred)
                        ).data.float().sum().item() / input.size(0)
            train_acc.update(accuracy)
            remaining = 1 - iter_num / float(max_iter)
            post_progress(l=train_loss.avg, a=train_acc.avg,
                          v=best['val_accuracy'])
            # Advance.
            iter_num += 1
            if iter_num >= max_iter:
                break
            # Linear learning rate decay.
            lr = init_lr * remaining
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            # Occasionally check validation set accuracy and checkpoint.
            if iter_num % 1000 == 0:
                validate_and_checkpoint()
                model.train()
class Solver(object):
    def __init__(self, config):
        self.model = None
        self.name = config.name
        self.lr = config.lr
        self.momentum = config.momentum
        self.beta = config.beta
        self.max_alpha = config.max_alpha
        self.epochs = config.epochs
        self.patience = config.patience
        self.N = config.N
        self.batch_size = config.batch_size
        self.random_labels = config.random_labels
        self.use_bn = config.batchnorm
        self.criterion = None
        self.optimizer = None
        self.scheduler = None
        self.device = None
        self.cuda = config.cuda
        self.train_loader = None
        self.test_loader = None

    def load_data(self):
        # ToTensor scales pixel values from [0, 255] to [0, 1].
        mean_var = ((125.3 / 255, 123.0 / 255, 113.9 / 255),
                    (63.0 / 255, 62.1 / 255, 66.7 / 255))
        transform = transforms.Compose([
            transforms.CenterCrop(28),
            transforms.ToTensor(),
            transforms.Normalize(*mean_var, inplace=True)
        ])
        train_set = torchvision.datasets.CIFAR10(root='./data', train=True,
                                                 download=DOWNLOAD,
                                                 transform=transform)
        test_set = torchvision.datasets.CIFAR10(root='./data', train=False,
                                                download=DOWNLOAD,
                                                transform=transform)
        if self.random_labels:
            np.random.shuffle(train_set.targets)
            np.random.shuffle(test_set.targets)
        assert self.N <= 50000
        if self.N < 50000:
            train_set.data = train_set.data[:self.N]
            # Downsize the test set to improve speed for small N.
            test_set.data = test_set.data[:self.N]
        self.train_loader = torch.utils.data.DataLoader(
            dataset=train_set, batch_size=self.batch_size,
            shuffle=True, drop_last=True)
        self.test_loader = torch.utils.data.DataLoader(
            dataset=test_set, batch_size=self.batch_size,
            shuffle=False, drop_last=True)

    def load_model(self):
        if self.cuda:
            self.device = torch.device('cuda')
            cudnn.benchmark = True
        else:
            self.device = torch.device('cpu')
        self.model = AlexNet(device=self.device, B=self.batch_size,
                             max_alpha=self.max_alpha,
                             use_bn=self.use_bn).to(self.device)
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr,
                                   momentum=self.momentum)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                   step_size=140)
        self.criterion = nn.NLLLoss().to(self.device)

    def getIw(self):
        # Iw should be normalized with respect to N.
        # Via reparameterization, we optimize alpha with only 1920 dimensions,
        # but Iw should scale with the dimension of the weights.
        return 7 * 7 * 64 * 384 / 1920 * self.model.getIw() / self.batch_size

    def do_batch(self, train, epoch):
        loader = self.train_loader if train else self.test_loader
        total_ce, total_Iw, total_loss = 0, 0, 0
        total_correct = 0
        total = 0
        pbar = tqdm(loader)
        num_batches = len(loader)
        for batch_num, (data, target) in enumerate(pbar):
            data, target = data.to(self.device), target.to(self.device)
            if train:
                self.optimizer.zero_grad()
            output = self.model(data)
            # NLLLoss is averaged across observations for each minibatch.
            ce = self.criterion(torch.log(output + EPS), target)
            Iw = self.getIw()
            loss = ce + 0.5 * self.beta * Iw
            if train:
                loss.backward()
                self.optimizer.step()
            total_ce += ce.item()
            total_Iw += Iw.item()
            total_loss += loss.item()
            # The second argument "1" is the dimension to reduce over.
            prediction = torch.max(output, 1)
            total_correct += np.sum(
                prediction[1].cpu().numpy() == target.cpu().numpy())
            total += target.size(0)
            a = self.model.get_a()
            pbar.set_description('Train' if train else 'Test')
            pbar.set_postfix(N=self.N, b=self.beta, ep=epoch,
                             acc=100. * total_correct / total,
                             loss=total_loss / num_batches,
                             ce=total_ce / num_batches,
                             Iw=total_Iw / num_batches,
                             a=a)
        return (total_correct / total, total_loss / num_batches,
                total_ce / num_batches, total_Iw / num_batches, a)

    def train(self, epoch):
        self.model.train()
        return self.do_batch(train=True, epoch=epoch)

    def test(self, epoch):
        self.model.eval()
        with torch.no_grad():
            return self.do_batch(train=False, epoch=epoch)

    def save(self, name=None):
        model_out_path = (name or self.name) + ".pth"
        # torch.save(self.model, model_out_path)
        # print("Checkpoint saved to {}".format(model_out_path))

    def run(self):
        self.load_data()
        self.load_model()
        results = []
        best_acc, best_ep = -1, -1
        for epoch in range(1, self.epochs + 1):
            # print("\n===> epoch: %d/200" % epoch)
            train_acc, train_loss, train_ce, train_Iw, train_a = \
                self.train(epoch)
            self.scheduler.step(epoch)
            test_acc, test_loss, test_ce, test_Iw, test_a = self.test(epoch)
            results.append([
                self.N, self.beta, train_acc, test_acc, train_loss, test_loss,
                train_ce, test_ce, train_Iw, test_Iw, train_a, test_a
            ])
            if test_acc > best_acc:
                best_acc, best_ep = test_acc, epoch
            if self.patience >= 0:
                # Early stopping.
                if best_ep < epoch - self.patience:
                    break
        with open(self.name + '.csv', 'a') as f:
            w = csv.writer(f)
            w.writerows(results)
        self.save()
        return train_acc, test_acc
def run_experiment(args):
    torch.manual_seed(args.seed)
    if not args.no_cuda:
        torch.cuda.manual_seed(args.seed)

    # Dataset
    if args.dataset == 'mnist':
        train_loader, test_loader, _, val_data = prepare_mnist(args)
    else:
        create_val_img_folder(args)
        train_loader, test_loader, _, val_data = prepare_imagenet(args)
    idx_to_class = {i: c for c, i in val_data.class_to_idx.items()}

    # Model & criterion
    if args.model == 'AlexNet':
        if args.pretrained:
            model = models.__dict__['alexnet'](pretrained=True)
            # Replace the last layer.
            in_f = model.classifier[-1].in_features
            model.classifier[-1] = nn.Linear(in_f, args.classes)
        else:
            model = AlexNet(args.classes)
        # size_average=False is deprecated; reduction='sum' is equivalent.
        criterion = nn.CrossEntropyLoss(reduction='sum')
    else:
        model = SVM(args.features, args.classes)
        criterion = MultiClassHingeLoss(margin=args.margin, size_average=False)

    if not args.no_cuda:
        model.cuda()

    # Load a saved model and test on it.
    if args.load:
        model.load_state_dict(torch.load(args.model_path))
        val_acc = test(model, criterion, test_loader, 0, [], [],
                       idx_to_class, args)

    # Optimizer
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters())
    else:
        optimizer = optim.SGD(model.parameters(), lr=args.lr,
                              momentum=args.momentum)

    total_minibatch_count = 0
    val_acc = 0
    train_losses, train_accs = [], []
    val_losses, val_accs = [], []

    # Train and test.
    for epoch in range(1, args.epochs + 1):
        total_minibatch_count = train(model, criterion, optimizer,
                                      train_loader, epoch,
                                      total_minibatch_count,
                                      train_losses, train_accs, args)
        val_acc = test(model, criterion, test_loader, epoch,
                       val_losses, val_accs, idx_to_class, args)

    # Save model.
    if args.save:
        if not os.path.exists(args.models_dir):
            os.makedirs(args.models_dir)
        filename = '_'.join([args.prefix, args.dataset, args.model, 'model.pt'])
        torch.save(model.state_dict(),
                   os.path.join(args.models_dir, filename))

    # Plot graphs.
    fig, axes = plt.subplots(1, 4, figsize=(13, 4))
    axes[0].plot(train_losses)
    axes[0].set_title('Loss')
    axes[1].plot(train_accs)
    axes[1].set_title('Acc')
    axes[1].set_ylim([0, 1])
    axes[2].plot(val_losses)
    axes[2].set_title('Val loss')
    axes[3].plot(val_accs)
    axes[3].set_title('Val Acc')
    axes[3].set_ylim([0, 1])
    # Images don't show on Ubuntu.
    # plt.tight_layout()

    # Save results.
    if not os.path.exists(args.results_dir):
        os.makedirs(args.results_dir)
    filename = '_'.join([args.prefix, args.dataset, args.model, 'plot.png'])
    fig.suptitle(filename)
    fig.savefig(os.path.join(args.results_dir, filename))
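# MultiClassHingeLoss is a custom module not shown in this excerpt. A minimal
# sketch of the Weston-Watkins multiclass hinge (the actual variant used may
# differ), matching the margin/size_average constructor seen above:

class MultiClassHingeLoss(nn.Module):
    def __init__(self, margin=1.0, size_average=False):
        super().__init__()
        self.margin = margin
        self.size_average = size_average

    def forward(self, output, target):
        idx = torch.arange(output.size(0), device=output.device)
        correct = output[idx, target].unsqueeze(1)               # (B, 1)
        margins = (self.margin - correct + output).clamp(min=0)  # (B, C)
        margins = margins.scatter(1, target.unsqueeze(1), 0.0)   # zero true class
        loss = margins.sum(dim=1)                                # sum over classes
        return loss.mean() if self.size_average else loss.sum()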
logger = Logger('./largecnnlogs')
lossfunction = nn.MSELoss()
dataset = Rand_num()
sampler = RandomSampler(dataset)
loader = DataLoader(dataset, batch_size=20, sampler=sampler, shuffle=False,
                    num_workers=1, drop_last=True)
net = AlexNet(3)
# net.load_state_dict(torch.load(SAVE_PATH))
net.cuda()
optimizer = optim.Adam(net.parameters(), lr=0.001)
for epoch in range(10000):
    for i, data in enumerate(loader, 0):
        net.zero_grad()
        video, labels = data
        video = video.view(-1, 3, 227, 227)
        labels = labels.view(-1, 3)
        labels = torch.squeeze(labels.float().cuda())
        video = torch.squeeze((video.float() / 256).cuda())
        net.train()
        outputs = net(video)  # call the module directly instead of .forward()
        loss = lossfunction(outputs, labels)
        loss.backward()
        optimizer.step()
        if i == 0:
            torch.save(net.state_dict(), SAVE_PATH)
def run():
    global args
    args = parser.parse_args()

    # Load datasets, split train and test.
    '''
    train_dataset = SketchData(root=path, train=True,
                               transform=transforms.ToTensor(),
                               target_transform=transforms.ToTensor())
    test_dataset = SketchData(root=path, train=False,
                              transform=transforms.ToTensor(),
                              target_transform=transforms.ToTensor())
    '''
    train_dataset = SketchData(
        root=path,
        train=True,
        transform=None,
        target_transform=None,
    )
    val_dataset = SketchData(
        root=path,
        train=False,
        transform=None,
        target_transform=None,
    )
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.b, shuffle=True)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=args.b, shuffle=False)

    # Create model, set parameters, optimizer, loss.
    model = AlexNet()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum, weight_decay=args.wd)
    criterion = nn.CrossEntropyLoss()

    # Resume must happen after model and optimizer exist; the original ran
    # this block first and would have hit a NameError on load_state_dict.
    if args.resume:
        print('Resuming from checkpoint!')
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    best_prec = 0
    for epoch in range(args.epochs):
        adjust_learning_rate(optimizer, epoch)
        train(train_loader, model, criterion, optimizer, epoch)
        precision = validate(val_loader, model, criterion)
        # Compute is_best before updating best_prec; otherwise it is
        # always False.
        prec = precision.item()  # .data[0] is deprecated
        is_best = prec > best_prec
        best_prec = max(prec, best_prec)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'best_prec1': best_prec,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, is_best)
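# train() and validate() are defined elsewhere in this script. For reference,
# a minimal validate() consistent with the caller above (it must return a
# 0-dim precision tensor, since the caller uses .item(); device handling is
# omitted and assumed to match the model):

def validate(val_loader, model, criterion):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            _, pred = outputs.max(1)
            correct += pred.eq(targets).sum()
            total += targets.size(0)
    return 100.0 * correct.float() / total  # 0-dim tensor of top-1 precision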
# (Head of this if/elif chain is not shown in the excerpt.)
                          pretrained=False)
elif args.model == 'resnext':
    model = resnext50(img_channel=3, num_classes=19)
elif args.model == 'alexnet':
    model = AlexNet(num_classes=19)
else:
    raise ValueError("This utility can't train that kind of model.")

logging.info("Setting dataset")
torch.cuda.empty_cache()

train_dataset = TrainImageDataset(
    args.data,
    224,
    224,
)

optimizer = torch.optim.Adam(model.parameters(), args.lr)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
criterion = nn.CrossEntropyLoss()

config = wandb.config  # wandb config specification
config.learning_rate = args.lr
config.batch_size = args.batch_size
config.model = args.model

logging.info("Training...")
train_loss, train_acc, test_acc, test_loss = train_for_classification(
    net=model,
    dataset=train_dataset,
    batch_size=args.batch_size,
    optimizer=optimizer,
    # (Tail of the testset constructor; its beginning is not shown.)
    train=False,
    download=True,
    transform=transform)

# Define the test batch loader.
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Define the loss function and optimization method (SGD).
net = AlexNet().to(device)
# Cross-entropy loss, commonly used for multi-class classification.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)

# Train and save model parameters.
def train():
    for epoch in range(EPOCH):
        sum_loss = 0.0
        # Read data.
        for i, data in enumerate(trainloader):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            # Clear gradients.
            optimizer.zero_grad()
def train_generic_model(model_name="alexnet",
                        dataset="custom",
                        num_classes=-1,
                        batch_size=8,
                        is_transform=1,
                        num_workers=2,
                        lr_decay=1,
                        l2_reg=0,
                        hdf5_path="dataset-bosch-224x224.hdf5",
                        trainset_dir="./TRAIN_data_224_v8",
                        testset_dir="./TEST_data_224_v8",
                        convert_grey=False):
    CHKPT_PATH = "./checkpoint_{}.PTH".format(model_name)
    print("CUDA:")
    print(torch.cuda.is_available())

    if is_transform:
        trans_ls = []
        if convert_grey:
            trans_ls.append(transforms.Grayscale(num_output_channels=1))
        trans_ls.extend([
            transforms.Resize((224, 224)),
            # transforms.RandomCrop((224, 224)),
            # transforms.Grayscale(num_output_channels=1),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        transform = transforms.Compose(trans_ls)
    else:
        transform = None

    print("DATASET FORMAT: {}".format(dataset))
    print("TRAINSET PATH: {}".format(trainset_dir))
    print("TESTSET PATH: {}".format(testset_dir))
    print("HDF5 PATH: {}".format(hdf5_path))

    if dataset == "custom":
        trainset = torchvision.datasets.ImageFolder(root=trainset_dir,
                                                    transform=transform)
        train_size = len(trainset)
        testset = torchvision.datasets.ImageFolder(root=testset_dir,
                                                   transform=transform)
        test_size = len(testset)
    elif dataset == "cifar":
        trainset = torchvision.datasets.CIFAR10(root="CIFAR_TRAIN_data",
                                                train=True, download=True,
                                                transform=transform)
        train_size = len(trainset)
        testset = torchvision.datasets.CIFAR10(root="CIFAR_TEST_data",
                                               train=False, download=True,
                                               transform=transform)
        test_size = len(testset)
    elif dataset == "hdf5":
        # Hdf5DatasetMPI supports multi-worker loading; Hdf5Dataset does not.
        dataset_cls = Hdf5Dataset if num_workers == 1 else Hdf5DatasetMPI
        trainset = dataset_cls(hdf5_path, transform=transform, is_test=False)
        train_size = len(trainset)
        testset = dataset_cls(hdf5_path, transform=transform, is_test=True)
        test_size = len(testset)

    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=num_workers)
    test_loader = torch.utils.data.DataLoader(testset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=num_workers)

    # Map model names to constructors; a lookup table replaces the original
    # if/elif chain with identical behaviour.
    model_classes = {
        "alexnet": AlexNet,
        "lenet5": LeNet5,
        "stn-alexnet": STNAlexNet,
        "stn-lenet5": LeNet5STN,
        "capsnet": CapsuleNet,
        "convneta": ConvNetA, "convnetb": ConvNetB, "convnetc": ConvNetC,
        "convnetd": ConvNetD, "convnete": ConvNetE, "convnetf": ConvNetF,
        "convnetg": ConvNetG, "convneth": ConvNetH, "convneti": ConvNetI,
        "convnetj": ConvNetJ, "convnetk": ConvNetK, "convnetl": ConvNetL,
        "convnetm": ConvNetM, "convnetn": ConvNetN,
    }
    if model_name == "resnet18":
        net = models.resnet18(pretrained=False, num_classes=num_classes)
    else:
        net = model_classes[model_name](num_classes=num_classes)
    print(net)
    if torch.cuda.is_available():
        net = net.cuda()

    if model_name == "capsnet":
        criterion = CapsuleLoss()
    else:
        criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=LEARNING_RATE,
                          momentum=0.9, weight_decay=l2_reg)
    if lr_decay:
        scheduler = ReduceLROnPlateau(optimizer, 'min')

    best_acc = 0
    from_epoch = 0
    if os.path.exists(CHKPT_PATH):
        print("Checkpoint Found: {}".format(CHKPT_PATH))
        state = torch.load(CHKPT_PATH)
        net.load_state_dict(state['state_dict'])
        optimizer.load_state_dict(state['optimizer'])
        best_acc = state['best_accuracy']
        from_epoch = state['epoch']

    for epoch in range(from_epoch, NUM_EPOCHS):
        # print("Epoch: {}/{}".format(epoch + 1, NUM_EPOCHS))
        epoch_loss = 0
        correct = 0
        for i, data in enumerate(train_loader, 0):
            # print("Train \t Epoch: {}/{} \t Batch: {}/{}".format(epoch + 1, NUM_EPOCHS, i + 1, ceil(train_size / BATCH_SIZE)))
            inputs, labels = data
            inputs = inputs.float()  # Variable(...) wrappers are deprecated
            labels = labels.long()
            if model_name == "capsnet":
                inputs = augmentation(inputs)
                ground_truth = torch.eye(num_classes).index_select(
                    dim=0, index=labels)
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()
            optimizer.zero_grad()
            if model_name == "capsnet":
                classes, reconstructions = net(inputs, ground_truth)
                loss = criterion(inputs, ground_truth, classes,
                                 reconstructions)
            else:
                outputs = net(inputs)
                loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()  # .data[0] is deprecated
            if model_name != "capsnet":
                log_outputs = F.softmax(outputs, dim=1)
            else:
                log_outputs = classes
            pred = log_outputs.data.max(1, keepdim=True)[1]
            correct += pred.eq(labels.data.view_as(pred)).sum().item()
        print("Epoch: {} \t Training Loss: {:.4f} \t Training Accuracy: {:.2f} \t {}/{}"
              .format(epoch + 1, epoch_loss / train_size,
                      100 * correct / train_size, correct, train_size))

        correct = 0
        test_loss = 0
        for i, data in enumerate(test_loader, 0):
            # print("Test \t Epoch: {}/{} \t Batch: {}/{}".format(epoch + 1, NUM_EPOCHS, i + 1, ceil(test_size / BATCH_SIZE)))
            inputs, labels = data
            inputs = inputs.float()
            labels = labels.long()
            if model_name == "capsnet":
                inputs = augmentation(inputs)
                ground_truth = torch.eye(num_classes).index_select(
                    dim=0, index=labels)
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()
            if model_name == "capsnet":
                classes, reconstructions = net(inputs)
                loss = criterion(inputs, ground_truth, classes,
                                 reconstructions)
            else:
                outputs = net(inputs)
                loss = criterion(outputs, labels)
            test_loss += loss.item()
            if model_name != "capsnet":
                log_outputs = F.softmax(outputs, dim=1)
            else:
                log_outputs = classes
            pred = log_outputs.data.max(1, keepdim=True)[1]
            correct += pred.eq(labels.data.view_as(pred)).sum().item()
        print("Epoch: {} \t Testing Loss: {:.4f} \t Testing Accuracy: {:.2f} \t {}/{}"
              .format(epoch + 1, test_loss / test_size,
                      100 * correct / test_size, correct, test_size))

        if correct >= best_acc:
            if not os.path.exists("./models"):
                os.mkdir("./models")
            torch.save(
                net.state_dict(),
                "./models/model-{}-{}-{}-{}-val-acc-{:.2f}-train-{}-test-{}-epoch-{}.pb"
                .format(model_name, dataset, hdf5_path, str(datetime.now()),
                        100 * correct / test_size,
                        trainset_dir.replace(" ", "_").replace("/", "_"),
                        testset_dir.replace(" ", "_").replace("/", "_"),
                        epoch + 1))
            best_acc = max(best_acc, correct)

        # Save the checkpoint.
        state = {
            'epoch': epoch,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_accuracy': best_acc,
        }
        torch.save(state, CHKPT_PATH)

        if lr_decay:
            # Note: scheduler.step should be called after validation.
            scheduler.step(test_loss)

    print('Finished Training')
    print("")
    print("")