def handler(context):
    dataset_alias = context.datasets
    trainval_2007_dataset_id = dataset_alias['trainval2007']
    trainval_2012_dataset_id = dataset_alias['trainval2012']
    test_2007_dataset_id = dataset_alias['test2007']

    trainval_2007_dataset = list(
        load_dataset_from_api(trainval_2007_dataset_id))
    trainval_2012_dataset = list(
        load_dataset_from_api(trainval_2012_dataset_id))
    test_2007_dataset = list(load_dataset_from_api(test_2007_dataset_id))

    if network_model == 'ssd300':
        model = SSD300(n_fg_class=len(voc_bbox_label_names),
                       pretrained_model='imagenet')
    elif network_model == 'ssd512':
        model = SSD512(n_fg_class=len(voc_bbox_label_names),
                       pretrained_model='imagenet')

    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)
    if USE_GPU >= 0:
        chainer.cuda.get_device_from_id(USE_GPU).use()
        model.to_gpu()

    trainval_2007 = DetectionDatasetFromAPI(trainval_2007_dataset)
    trainval_2012 = DetectionDatasetFromAPI(trainval_2012_dataset)
    test_2007 = DetectionDatasetFromAPI(test_2007_dataset,
                                        use_difficult=True,
                                        return_difficult=True)

    train = TransformDataset(
        ConcatenatedDataset(trainval_2007, trainval_2012),
        Transform(model.coder, model.insize, model.mean))
    train_iter = chainer.iterators.SerialIterator(train, BATCHSIZE)

    test_iter = chainer.iterators.SerialIterator(test_2007,
                                                 BATCHSIZE,
                                                 repeat=False,
                                                 shuffle=False)

    # initial lr is set to 1e-3 by ExponentialShift
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(train_chain)
    for param in train_chain.params():
        if param.name == 'b':
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=USE_GPU)
    trainer = training.Trainer(updater, (nb_iterations, 'iteration'),
                               out=ABEJA_TRAINING_RESULT_DIR)
    trainer.extend(extensions.ExponentialShift('lr', 0.1, init=1e-3),
                   trigger=triggers.ManualScheduleTrigger([80000, 100000],
                                                          'iteration'))

    trainer.extend(DetectionVOCEvaluator(test_iter,
                                         model,
                                         use_07_metric=True,
                                         label_names=voc_bbox_label_names),
                   trigger=(10000, 'iteration'))

    log_interval = 100, 'iteration'
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)

    print_entries = [
        'iteration', 'main/loss', 'main/loss/loc', 'main/loss/conf',
        'validation/main/map'
    ]
    report_entries = [
        'epoch', 'iteration', 'lr', 'main/loss', 'main/loss/loc',
        'main/loss/conf', 'validation/main/map'
    ]

    trainer.extend(Statistics(report_entries,
                              nb_iterations,
                              obs_key='iteration'),
                   trigger=log_interval)
    trainer.extend(Tensorboard(report_entries, out_dir=log_path))
    trainer.extend(extensions.PrintReport(print_entries), trigger=log_interval)

    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
                   trigger=(nb_iterations, 'iteration'))

    trainer.run()
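The handler expects the platform runtime to pass a context whose `datasets` attribute maps dataset aliases to dataset IDs. As a rough local smoke test, a stub context along the following lines could be used; the class name and the placeholder IDs are hypothetical and not part of the original script.

# Hypothetical stub for local smoke-testing only; on the platform the real
# context object is injected by the training runtime, and globals such as
# network_model, BATCHSIZE and nb_iterations still have to be configured.
class StubContext:
    datasets = {
        'trainval2007': '<dataset-id>',
        'trainval2012': '<dataset-id>',
        'test2007': '<dataset-id>',
    }


if __name__ == '__main__':
    handler(StubContext())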
def handler(context):
    dataset_alias = context.datasets
    trainval_dataset_id = dataset_alias['trainval']
    test_dataset_id = dataset_alias['test']

    trainval_dataset = list(load_dataset_from_api(trainval_dataset_id))
    test_dataset = list(load_dataset_from_api(test_dataset_id))

    trainval = DetectionDatasetFromAPI(trainval_dataset,
                                       transform=SSDAugmentation(
                                           min_dim, MEANS))
    test = DetectionDatasetFromAPI(test_dataset,
                                   transform=SSDAugmentation(min_dim, MEANS))

    train_dataset = trainval
    test_dataset = test

    priorbox = PriorBox(min_dim, PARAMS)
    with torch.no_grad():
        priors = priorbox.forward().to(device)

    ssd_net = build_ssd('train', priors, min_dim, num_classes)
    ssd_net = ssd_net.to(device)

    url = 'https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth'
    weight_file = os.path.join(ABEJA_TRAINING_RESULT_DIR,
                               'vgg16_reducedfc.pth')
    download(url, weight_file)
    vgg_weights = torch.load(weight_file)
    print('Loading base network...')
    ssd_net.vgg.load_state_dict(vgg_weights)

    optimizer = optim.SGD(ssd_net.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    criterion = MultiBoxLoss(num_classes, 0.5, True, 0, True, 3, 0.5, False,
                             PARAMS['variance'], device)

    # loss counters
    step_index = 0

    trainloader = data.DataLoader(train_dataset,
                                  batch_size,
                                  num_workers=0,
                                  shuffle=True,
                                  collate_fn=tools.detection_collate,
                                  pin_memory=True)
    testloader = data.DataLoader(test_dataset,
                                 batch_size,
                                 num_workers=0,
                                 shuffle=False,
                                 collate_fn=tools.detection_collate,
                                 pin_memory=True)

    # create batch iterator
    iteration = 1
    while iteration <= max_iter:
        ssd_net.train()
        for images, targets in trainloader:
            if iteration > max_iter:
                break

            if iteration in lr_steps:
                step_index += 1
                adjust_learning_rate(optimizer, 0.1, step_index)

            # load train data
            images = images.to(device)
            targets = [ann.to(device) for ann in targets]

            # forward
            out = ssd_net(images)

            # backprop
            optimizer.zero_grad()
            loss_l, loss_c = criterion(out, targets)
            loss = loss_l + loss_c
            loss.backward()
            optimizer.step()

            if iteration % 100 == 0:
                print('[Train] iter {}, loss: {:.4f}'.format(
                    iteration, loss.item()))
                statistics(iteration, loss.item(), None, None, None)
                writer.add_scalar('main/loss', loss.item(), iteration)
                writer.add_scalar('main/loc_loss', loss_l.item(), iteration)
                writer.add_scalar('main/conf_loss', loss_c.item(), iteration)

            if iteration % 10000 == 0:
                eval(testloader, ssd_net, criterion, iteration)
                ssd_net.train()

            iteration += 1

    torch.save(ssd_net.state_dict(),
               os.path.join(ABEJA_TRAINING_RESULT_DIR, 'model.pth'))
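The training loop above calls `adjust_learning_rate` when the iteration count reaches one of `lr_steps`, but that helper is not shown here. A minimal sketch, assuming the same step-decay scheme as the original ssd.pytorch training script and that the global `lr` is the base learning rate used when the optimizer was created:

def adjust_learning_rate(optimizer, gamma, step):
    # Sketch: decay the base learning rate by gamma**step and apply it to
    # every parameter group of the optimizer.
    new_lr = lr * (gamma ** step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr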
def handler(context):
    class_labels = 10

    dataset_alias = context.datasets
    train_dataset_id = dataset_alias['train']
    test_dataset_id = dataset_alias['test']

    train_data = list(load_dataset_from_api(train_dataset_id))
    test_data = list(load_dataset_from_api(test_dataset_id))

    train = ImageDatasetFromAPI(train_data, train=True)
    test = ImageDatasetFromAPI(test_data)

    net = utils.VGG.VGG(class_labels)
    model = L.Classifier(net)

    if USE_GPU >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(USE_GPU).use()
        model.to_gpu()  # Copy the model to the GPU

    optimizer = chainer.optimizers.MomentumSGD(learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    stop_trigger = (epochs, 'epoch')
    # Early stopping option
    if early_stopping:
        stop_trigger = triggers.EarlyStoppingTrigger(monitor=early_stopping,
                                                     verbose=True,
                                                     max_trigger=(epochs,
                                                                  'epoch'))

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=USE_GPU)
    trainer = training.Trainer(updater,
                               stop_trigger,
                               out=ABEJA_TRAINING_RESULT_DIR)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=USE_GPU))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5),
                   trigger=(25, 'epoch'))

    # Take a snapshot of the network at the end of training
    trainer.extend(extensions.snapshot_object(net, 'net.model'),
                   trigger=(epochs, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Print selected entries of the log to stdout.
    # Here "main" refers to the target link of the "main" optimizer, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    report_entries = [
        'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
        'validation/main/accuracy'
    ]

    trainer.extend(Statistics(report_entries, epochs), trigger=(1, 'epoch'))
    trainer.extend(Tensorboard(report_entries, out_dir=log_path))
    trainer.extend(extensions.PrintReport(report_entries))

    trainer.run()
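For reference, `early_stopping` is passed straight to Chainer's `EarlyStoppingTrigger` as the monitored report key, so it is typically a string naming one of the reported entries. The concrete value below is illustrative, not taken from the original configuration:

# e.g. stop training when the validation loss stops improving
early_stopping = 'validation/main/loss'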
def handler(context):
    print(
        f'start training with parameters : {Parameters.as_dict()}, context : {context}'
    )

    try:
        dataset_alias = context.datasets
    # for older version
    except AttributeError:
        dataset_alias = context['datasets']

    train_dataset_id, val_dataset_id = get_dataset_ids(dataset_alias)

    id2index, _ = set_categories(list(dataset_alias.values()))
    num_classes = len(id2index)
    num_classes += 1  # add for background class
    print(f'number of classes : {num_classes}')

    print("Start downloading datasets.")
    dataset_items = list(
        load_dataset_from_api(train_dataset_id, max_num=Parameters.MAX_ITEMS))
    print("Finish downloading datasets.")

    random.shuffle(dataset_items)

    if val_dataset_id is not None:
        val_dataset_items = list(
            load_dataset_from_api(val_dataset_id,
                                  max_num=Parameters.MAX_ITEMS))
        random.shuffle(val_dataset_items)
        train_dataset_items = dataset_items
    else:
        test_size = int(len(dataset_items) * Parameters.TEST_SIZE)
        train_dataset_items, val_dataset_items = dataset_items[
            test_size:], dataset_items[:test_size]

    train_dataset = ABEJAPlatformDataset(train_dataset_items,
                                         phase="train",
                                         transform=DataTransform(
                                             Parameters.IMG_SIZE,
                                             Parameters.MEANS))
    val_dataset = ABEJAPlatformDataset(val_dataset_items,
                                       phase="val",
                                       transform=DataTransform(
                                           Parameters.IMG_SIZE,
                                           Parameters.MEANS))
    print(f'train dataset : {len(train_dataset)}')
    print(f'val dataset : {len(val_dataset)}')

    train_dataloader = data.DataLoader(train_dataset,
                                       batch_size=Parameters.BATCH_SIZE,
                                       shuffle=Parameters.SHUFFLE,
                                       collate_fn=od_collate_fn)
    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=Parameters.BATCH_SIZE,
                                     shuffle=False,
                                     collate_fn=od_collate_fn)

    dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}
    print(f'data loaders : {dataloaders_dict}')

    ssd_cfg = {
        'num_classes': num_classes,  # number of classes including background class
        'input_size': Parameters.IMG_SIZE,
        'bbox_aspect_num': Parameters.BBOX_ASPECT_NUM,
        'feature_maps': Parameters.FEATURE_MAPS,
        'steps': Parameters.STEPS,
        'min_sizes': Parameters.MIN_SIZES,
        'max_sizes': Parameters.MAX_SIZES,
        'aspect_ratios': Parameters.ASPECT_RATIOS,
        'conf_thresh': Parameters.CONF_THRESHOLD,
        'top_k': Parameters.TOP_K,
        'nms_thresh': Parameters.NMS_THRESHOLD
    }
    net = SSD(phase="train", cfg=ssd_cfg)

    # TODO: better to host this file by ourselves
    # https://github.com/amdegroot/ssd.pytorch#training-ssd
    url = 'https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth'
    weight_file = os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR,
                               'vgg16_reducedfc.pth')
    download(url, weight_file)
    vgg_weights = torch.load(weight_file)
    print('finish loading base network...')
    net.vgg.load_state_dict(vgg_weights)

    def weights_init(m):
        if isinstance(m, nn.Conv2d):
            init.kaiming_normal_(m.weight.data)
            if m.bias is not None:  # in case of bias
                nn.init.constant_(m.bias, 0.0)

    # apply initial values of He
    net.extras.apply(weights_init)
    net.loc.apply(weights_init)
    net.conf.apply(weights_init)

    # configure loss function
    criterion = MultiBoxLoss(jaccard_thresh=Parameters.OVERLAP_THRESHOLD,
                             neg_pos=Parameters.NEG_POS,
                             device=device)

    # configure optimizer
    optimizer = optim.SGD(net.parameters(),
                          lr=Parameters.LR,
                          momentum=Parameters.MOMENTUM,
                          dampening=Parameters.DAMPENING,
                          weight_decay=Parameters.WEIGHT_DECAY,
                          nesterov=Parameters.NESTEROV)

    # move network to device
    net.to(device)

    # NOTE: This flag allows to enable the inbuilt cudnn auto-tuner
    # to find the best algorithm to use for your hardware.
    # cf. https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/2
    torch.backends.cudnn.benchmark = True

    iteration = 1
    epoch_train_loss = 0.0
    epoch_val_loss = 0.0
    latest_epoch_train_loss = epoch_train_loss
    latest_epoch_val_loss = epoch_val_loss

    for epoch in range(Parameters.EPOCHS):
        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch + 1, Parameters.EPOCHS))
        print('-------------')

        # loop of train and validation for each epoch
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()
                print('(train)')
            else:
                if (epoch + 1) % 10 == 0:
                    net.eval()
                    print('-------------')
                    print('(val)')
                else:
                    # perform validation once every ten epochs
                    continue

            # loop over each mini-batch from the data loader
            for images, targets in dataloaders_dict[phase]:
                images = images.to(device)
                targets = [ann.to(device) for ann in targets]

                # initialize optimizer
                optimizer.zero_grad()

                # calculate forward
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(images)

                    # calculate loss
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c

                    if phase == 'train':
                        # back propagate when training
                        loss.backward()  # calculate gradient
                        nn.utils.clip_grad_value_(
                            net.parameters(),
                            clip_value=Parameters.CLIP_VALUE)
                        optimizer.step()  # update parameters

                        if iteration % 10 == 0:
                            # display loss once every ten iterations
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print(
                                'iter {} || Loss: {:.4f} || 10iter: {:.4f} sec.'
                                .format(iteration, loss.item(), duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item()
                        iteration += 1
                    else:
                        epoch_val_loss += loss.item()

        # loss and accuracy rate of each phase of epoch
        t_epoch_finish = time.time()

        # keep latest epoch loss
        if epoch_train_loss != 0.0:
            num_total = len(dataloaders_dict['train'])
            latest_epoch_train_loss = epoch_train_loss / num_total
        if epoch_val_loss != 0.0:
            num_total = len(dataloaders_dict['val'])
            latest_epoch_val_loss = epoch_val_loss / num_total

        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} || Epoch_VAL_Loss:{:.4f}'.
              format(epoch + 1, latest_epoch_train_loss,
                     latest_epoch_val_loss))
        print('timer: {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))
        t_epoch_start = time.time()

        statistics(epoch + 1, latest_epoch_train_loss, None,
                   latest_epoch_val_loss, None)
        writer.add_scalar('main/loss', latest_epoch_train_loss, epoch + 1)
        if (epoch + 1) % 10 == 0:
            writer.add_scalar('test/loss', latest_epoch_val_loss, epoch + 1)
            model_path = os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR,
                                      f'ssd300_{str(epoch + 1)}.pth')
            torch.save(net.state_dict(), model_path)
        writer.flush()

        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

    torch.save(
        net.state_dict(),
        os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR, 'model.pth'))
    writer.close()
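Both data loaders rely on `od_collate_fn`, which is not shown here. Since each dataset item yields an image tensor plus a variable number of ground-truth boxes, a plausible sketch, assuming items are `(image, annotation)` pairs with annotation rows of the form `[xmin, ymin, xmax, ymax, label]`, is the following:

def od_collate_fn(batch):
    # Stack the images into one batch tensor, but keep the per-image box
    # arrays as a list of tensors because their lengths differ.
    images, targets = [], []
    for image, annotation in batch:
        images.append(image)
        targets.append(torch.FloatTensor(annotation))
    return torch.stack(images, dim=0), targets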
def handler(context):
    dataset_alias = context.datasets
    data = list(load_dataset_from_api(dataset_alias['train']))

    np.random.seed(0)
    data = np.random.permutation(data)
    nb_data = len(data)
    nb_train = int(7 * nb_data // 10)
    train_data_raw = data[:nb_train]
    test_data_raw = data[nb_train:]

    premodel = SSD300(n_fg_class=20, pretrained_model='voc0712')
    model = SSD300(n_fg_class=1)
    copy_ssd(model, premodel)

    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)
    if USE_GPU >= 0:
        chainer.cuda.get_device_from_id(USE_GPU).use()
        model.to_gpu()

    # initial lr is set to 1e-3 by ExponentialShift
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(train_chain)
    for param in train_chain.params():
        if param.name == 'b':
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    fix_ssd(train_chain)

    train_data = DetectionDatasetFromAPI(train_data_raw)
    test_data = DetectionDatasetFromAPI(test_data_raw,
                                        use_difficult=True,
                                        return_difficult=True)

    train_data = TransformDataset(
        train_data, Transform(model.coder, model.insize, model.mean))
    train_iter = chainer.iterators.SerialIterator(train_data, BATCHSIZE)
    test_iter = chainer.iterators.SerialIterator(test_data,
                                                 BATCHSIZE,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=USE_GPU)
    trainer = training.Trainer(updater, (nb_epochs, 'epoch'),
                               out=ABEJA_TRAINING_RESULT_DIR)
    trainer.extend(extensions.ExponentialShift('lr', 0.1, init=1e-3),
                   trigger=triggers.ManualScheduleTrigger([1200, 1600],
                                                          'epoch'))

    trainer.extend(DetectionVOCEvaluator(test_iter,
                                         model,
                                         use_07_metric=True,
                                         label_names=['cup']),
                   trigger=(1, 'epoch'))

    log_interval = 1, 'epoch'
    trainer.extend(extensions.LogReport(trigger=log_interval))

    print_entries = [
        'epoch', 'main/loss', 'main/loss/loc', 'main/loss/conf',
        'validation/main/map'
    ]
    report_entries = [
        'epoch', 'lr', 'main/loss', 'main/loss/loc', 'main/loss/conf',
        'validation/main/map'
    ]

    trainer.extend(Statistics(report_entries, nb_epochs),
                   trigger=log_interval)
    trainer.extend(Tensorboard(report_entries, out_dir=log_path))
    trainer.extend(extensions.PrintReport(print_entries), trigger=log_interval)

    trainer.extend(extensions.snapshot_object(
        model, 'model_epoch_{.updater.epoch}'),
                   trigger=(nb_epochs, 'epoch'))

    trainer.run()
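`copy_ssd` and `fix_ssd` are defined elsewhere in the project. A rough sketch of what they might do, under the assumption that `copy_ssd` transfers every pretrained parameter whose name and shape match (so the class-dependent confidence layers keep their fresh initialization) and that `fix_ssd` freezes the shared feature extractor:

def copy_ssd(model, premodel):
    # Copy pretrained VOC weights wherever the parameter name and shape match.
    pretrained = dict(premodel.namedparams())
    for name, param in model.namedparams():
        src = pretrained.get(name)
        if src is not None and src.shape == param.shape:
            param.array[...] = src.array


def fix_ssd(train_chain):
    # Freeze the VGG feature extractor so only the multibox heads are trained.
    train_chain.model.extractor.disable_update()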
def handler(context):
    dataset_alias = context.datasets
    train_dataset_id = dataset_alias['train']
    test_dataset_id = dataset_alias['test']

    train_data = list(load_dataset_from_api(train_dataset_id))
    test_data = list(load_dataset_from_api(test_dataset_id))

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    # trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
    trainset = ImageDatasetFromAPI(train_data, transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=128,
                                              shuffle=True,
                                              num_workers=2)

    # testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
    testset = ImageDatasetFromAPI(test_data, transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=100,
                                             shuffle=False,
                                             num_workers=2)

    # Model
    print('==> Building model..')
    if model == 'VGG19':
        net = VGG('VGG19')
    elif model == 'ResNet18':
        net = ResNet18()
    elif model == 'PreActResNet18':
        net = PreActResNet18()
    elif model == 'GoogLeNet':
        net = GoogLeNet()
    elif model == 'DenseNet121':
        net = DenseNet121()
    elif model == 'ResNeXt29_2x64d':
        net = ResNeXt29_2x64d()
    elif model == 'MobileNet':
        net = MobileNet()
    elif model == 'MobileNetV2':
        net = MobileNetV2()
    elif model == 'DPN92':
        net = DPN92()
    elif model == 'ShuffleNetG2':
        net = ShuffleNetG2()
    elif model == 'SENet18':
        net = SENet18()
    elif model == 'ShuffleNetV2':
        net = ShuffleNetV2(1)
    else:
        raise ValueError('unknown model name: {}'.format(model))

    net = net.to(device)
    # if device == 'cuda':
    #     net = torch.nn.DataParallel(net)
    #     cudnn.benchmark = True

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[150, 250],
                                         gamma=0.1)

    statistics = Statistics(epochs)

    for epoch in range(epochs):
        train_loss, train_acc = train(net, optimizer, trainloader, criterion,
                                      epoch)
        test_loss, test_acc = test(net, testloader, criterion, epoch)
        # Step the LR schedule after the epoch's optimizer updates
        # (the recommended order since PyTorch 1.1).
        scheduler.step()

        # Reporting
        print(
            '[{:d}] main/loss: {:.3f} main/acc: {:.3f}, '
            'main/validation/loss: {:.3f}, main/validation/acc: {:.3f}'.format(
                epoch + 1, train_loss, train_acc, test_loss, test_acc))

        statistics(epoch + 1, train_loss, train_acc, test_loss, test_acc)

        writer.add_scalar('main/loss', train_loss, epoch + 1)
        writer.add_scalar('main/acc', train_acc, epoch + 1)
        writer.add_scalar('main/validation/loss', test_loss, epoch + 1)
        writer.add_scalar('main/validation/acc', test_acc, epoch + 1)

    torch.save(net.state_dict(),
               os.path.join(ABEJA_TRAINING_RESULT_DIR, 'model.pth'))
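The per-epoch `train` and `test` helpers invoked above are defined elsewhere. A minimal sketch with matching signatures, returning the average loss and accuracy for the epoch (the exact reporting in the original may differ; the global `device` is the same one used by the handler):

def train(net, optimizer, trainloader, criterion, epoch):
    # One training epoch: forward, loss, backward, optimizer step per batch.
    net.train()
    total_loss, correct, total = 0.0, 0, 0
    for inputs, labels in trainloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * labels.size(0)
        correct += outputs.argmax(dim=1).eq(labels).sum().item()
        total += labels.size(0)
    return total_loss / total, correct / total


def test(net, testloader, criterion, epoch):
    # One evaluation pass over the test loader without gradient tracking.
    net.eval()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = net(inputs)
            total_loss += criterion(outputs, labels).item() * labels.size(0)
            correct += outputs.argmax(dim=1).eq(labels).sum().item()
            total += labels.size(0)
    return total_loss / total, correct / total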