def handler(context):
    """ABEJA Platform training entry point for an SSD object detector.

    Downloads the train/val datasets referenced by *context*, builds the SSD
    network (VGG16 backbone weights fetched from the ssd.pytorch release),
    then runs the training loop for ``Parameters.EPOCHS`` epochs, validating
    and checkpointing once every 10 epochs. The final weights are written to
    ``Parameters.ABEJA_TRAINING_RESULT_DIR/model.pth``.

    :param context: training-job context; either an object with a
        ``datasets`` attribute or (older platform versions) a mapping with a
        ``'datasets'`` key. Values are dataset aliases.
    """
    print(
        f'start training with parameters : {Parameters.as_dict()}, context : {context}'
    )
    try:
        dataset_alias = context.datasets
    # for older version: context is a dict, not an object
    except AttributeError:
        dataset_alias = context['datasets']
    train_dataset_id, val_dataset_id = get_dataset_ids(dataset_alias)

    # Build the category-id -> index mapping from all referenced datasets.
    id2index, _ = set_categories(list(dataset_alias.values()))
    num_classes = len(id2index)
    num_classes += 1  # add for background class
    print(f'number of classes : {num_classes}')

    print("Start downloading datasets.")
    dataset_items = list(
        load_dataset_from_api(train_dataset_id, max_num=Parameters.MAX_ITEMS))
    print("Finish downloading datasets.")
    random.shuffle(dataset_items)

    if val_dataset_id is not None:
        # A dedicated validation dataset was provided.
        val_dataset_items = list(
            load_dataset_from_api(val_dataset_id, max_num=Parameters.MAX_ITEMS))
        random.shuffle(val_dataset_items)
        train_dataset_items = dataset_items
    else:
        # No validation dataset: carve a TEST_SIZE fraction off the (already
        # shuffled) training items to use for validation.
        test_size = int(len(dataset_items) * Parameters.TEST_SIZE)
        train_dataset_items, val_dataset_items = dataset_items[
            test_size:], dataset_items[:test_size]

    train_dataset = ABEJAPlatformDataset(train_dataset_items,
                                         phase="train",
                                         transform=DataTransform(
                                             Parameters.IMG_SIZE,
                                             Parameters.MEANS))
    val_dataset = ABEJAPlatformDataset(val_dataset_items,
                                       phase="val",
                                       transform=DataTransform(
                                           Parameters.IMG_SIZE,
                                           Parameters.MEANS))
    print(f'train dataset : {len(train_dataset)}')
    print(f'val dataset : {len(val_dataset)}')

    # od_collate_fn handles the variable number of ground-truth boxes per image.
    train_dataloader = data.DataLoader(train_dataset,
                                       batch_size=Parameters.BATCH_SIZE,
                                       shuffle=Parameters.SHUFFLE,
                                       collate_fn=od_collate_fn)
    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=Parameters.BATCH_SIZE,
                                     shuffle=False,
                                     collate_fn=od_collate_fn)
    dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}
    print(f'data loaders : {dataloaders_dict}')

    # SSD model configuration, driven entirely by Parameters.
    ssd_cfg = {
        'num_classes': num_classes,  # number of classes including background class
        'input_size': Parameters.IMG_SIZE,
        'bbox_aspect_num': Parameters.BBOX_ASPECT_NUM,
        'feature_maps': Parameters.FEATURE_MAPS,
        'steps': Parameters.STEPS,
        'min_sizes': Parameters.MIN_SIZES,
        'max_sizes': Parameters.MAX_SIZES,
        'aspect_ratios': Parameters.ASPECT_RATIOS,
        'conf_thresh': Parameters.CONF_THRESHOLD,
        'top_k': Parameters.TOP_K,
        'nms_thresh': Parameters.NMS_THRESHOLD
    }
    net = SSD(phase="train", cfg=ssd_cfg)

    # Pretrained VGG16 backbone weights.
    # TODO: better to host this file by ourselves
    # https://github.com/amdegroot/ssd.pytorch#training-ssd
    url = 'https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth'
    weight_file = os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR,
                               'vgg16_reducedfc.pth')
    download(url, weight_file)
    vgg_weights = torch.load(weight_file)
    print('finish loading base network...')
    net.vgg.load_state_dict(vgg_weights)

    def weights_init(m):
        # He (Kaiming) initialization for conv layers; zero bias.
        if isinstance(m, nn.Conv2d):
            init.kaiming_normal_(m.weight.data)
            if m.bias is not None:  # in case of bias
                nn.init.constant_(m.bias, 0.0)

    # apply initial values of He to the non-pretrained heads
    net.extras.apply(weights_init)
    net.loc.apply(weights_init)
    net.conf.apply(weights_init)

    # configure loss function
    criterion = MultiBoxLoss(jaccard_thresh=Parameters.OVERLAP_THRESHOLD,
                             neg_pos=Parameters.NEG_POS,
                             device=device)

    # configure optimizer
    optimizer = optim.SGD(net.parameters(),
                          lr=Parameters.LR,
                          momentum=Parameters.MOMENTUM,
                          dampening=Parameters.DAMPENING,
                          weight_decay=Parameters.WEIGHT_DECAY,
                          nesterov=Parameters.NESTEROV)

    # move network to device
    net.to(device)

    # NOTE: This flag allows to enable the inbuilt cudnn auto-tuner
    # to find the best algorithm to use for your hardware.
    # cf. https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/2
    torch.backends.cudnn.benchmark = True

    iteration = 1
    epoch_train_loss = 0.0  # accumulated over the current epoch, reset at its end
    epoch_val_loss = 0.0
    # "latest" copies survive the per-epoch reset so skipped-validation epochs
    # still report the most recent known losses.
    latest_epoch_train_loss = epoch_train_loss
    latest_epoch_val_loss = epoch_val_loss
    for epoch in range(Parameters.EPOCHS):
        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch + 1, Parameters.EPOCHS))
        print('-------------')

        # loop of train and validation for each epoch
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()
                print('(train)')
            else:
                if (epoch + 1) % 10 == 0:
                    net.eval()
                    print('-------------')
                    print('(val)')
                else:
                    # perform validation once every ten times
                    continue

            # loop each mini-batch from data loader
            for images, targets in dataloaders_dict[phase]:
                images = images.to(device)
                targets = [ann.to(device) for ann in targets]

                # initialize optimizer
                optimizer.zero_grad()

                # calculate forward; gradients only tracked during training
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(images)

                    # calculate loss (localization + confidence)
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c

                    if phase == 'train':
                        # back propagate when training
                        loss.backward()  # calculate gradient
                        # clip gradients to keep training stable
                        nn.utils.clip_grad_value_(
                            net.parameters(),
                            clip_value=Parameters.CLIP_VALUE)
                        optimizer.step()  # update parameters

                        if iteration % 10 == 0:
                            # display loss once every ten iterations
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print(
                                'iter {} || Loss: {:.4f} || 10iter: {:.4f} sec.'
                                .format(iteration, loss.item(), duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item()
                        iteration += 1
                    else:
                        epoch_val_loss += loss.item()

        # loss and accuracy rate of each phase of epoch
        t_epoch_finish = time.time()

        # keep latest epoch loss (average per batch; zero means the phase
        # did not run this epoch, so keep the previous value)
        if epoch_train_loss != 0.0:
            num_total = len(dataloaders_dict['train'])
            latest_epoch_train_loss = epoch_train_loss / num_total
        if epoch_val_loss != 0.0:
            num_total = len(dataloaders_dict['val'])
            latest_epoch_val_loss = epoch_val_loss / num_total

        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} || Epoch_VAL_Loss:{:.4f}'.
              format(epoch + 1, latest_epoch_train_loss,
                     latest_epoch_val_loss))
        print('timer: {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))
        t_epoch_start = time.time()

        # Report to the platform and to TensorBoard.
        statistics(epoch + 1, latest_epoch_train_loss, None,
                   latest_epoch_val_loss, None)
        writer.add_scalar('main/loss', latest_epoch_train_loss, epoch + 1)
        if (epoch + 1) % 10 == 0:
            # Validation ran this epoch: log its loss and checkpoint.
            writer.add_scalar('test/loss', latest_epoch_val_loss, epoch + 1)
            model_path = os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR,
                                      f'ssd300_{str(epoch + 1)}.pth')
            torch.save(net.state_dict(), model_path)
        writer.flush()

        # reset accumulators for the next epoch
        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

    # Save the final model weights.
    torch.save(net.state_dict(),
               os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR, 'model.pth'))
    writer.close()
def train(train_file, test_file, num_epoch):
    """Train an SSD detector on the weapon dataset and save its state dict.

    Builds train/test ``ListDataset`` loaders from the given annotation list
    files, trains with Adam + ``MultiBoxLoss_2`` for *num_epoch* epochs,
    evaluates on the test split each epoch (also stepping a
    ``ReduceLROnPlateau`` scheduler on the accumulated validation loss), and
    finally writes the model's ``state_dict`` to a fixed Google-Drive path.

    :param train_file: annotation list file for the training split
    :param test_file: annotation list file for the test/validation split
    :param num_epoch: number of epochs to train
    """
    use_gpu = torch.cuda.is_available()
    Loss = MultiBoxLoss_2()  ## loss
    learning_rate = 0.01
    num_epochs = num_epoch
    # NOTE(review): batch_size is never used — both DataLoaders below
    # hard-code batch_size=16. Confirm which value is intended.
    batch_size = 4
    model = SSD(depth=50, width=1)
    #optimizer = torch.optim.SGD([{"params":model.parameters()}], lr=learning_rate, momentum=0.9, weight_decay=5e-4)
    optimizer = torch.optim.Adam([{
        "params": model.parameters()
    }],
                                 lr=learning_rate)
    # Reduce LR when the validation loss plateaus (stepped once per epoch).
    scheduler = ReduceLROnPlateau(optimizer)
    if use_gpu:
        model.cuda()
    model.train()
    train_dataset = ListDataset(root='GUN/WeaponS/',
                                list_file=train_file,
                                train=True,
                                transform=transforms.ToTensor())
    train_loader = DataLoader(train_dataset,
                              batch_size=16,
                              shuffle=True,
                              num_workers=2)
    # NOTE(review): the test dataset is built with train=True and its loader
    # with shuffle=True — presumably these should be train=False /
    # shuffle=False; verify against ListDataset's semantics.
    test_dataset = ListDataset(root='GUN/WeaponS/',
                               list_file=test_file,
                               train=True,
                               transform=transforms.ToTensor())
    test_loader = DataLoader(test_dataset,
                             batch_size=16,
                             shuffle=True,
                             num_workers=2)
    for epoch in range(num_epochs):
        t1 = time.time()
        model.train()
        total_loss, valid_loss = 0, 0
        # Adjust learninig rate
        ## train model
        print("Train {} epoch: ".format(epoch + 1))
        for i, (imgs, loc, conf) in enumerate(train_loader):
            # NOTE(review): Variable is a deprecated no-op wrapper on modern
            # PyTorch; tensors could be used directly.
            imgs, loc, conf = Variable(imgs), Variable(loc), Variable(conf)
            if use_gpu:
                imgs = imgs.cuda()
                loc = loc.cuda()
                conf = conf.cuda()
            # Forward: predicted box offsets and class confidences.
            loc_pred, con_pred = model(imgs)
            loss = Loss(loc_pred, loc, con_pred, conf)
            total_loss += loss.item()
            #loss = conf_loss + loc_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #print('Training progress %.1f %%' %(100*(i+1)/len(train_loader)), end='')
        #print('loc loss: ', loc_loss_total/len(train_loader))
        #print('conf loss: ', conf_loss_total/len(train_loader))
        print('\rEpoch [%d/%d], Training loss: %.4f' %
              (epoch + 1, num_epochs, total_loss / len(train_loader)),
              end='\n')
        ## test model
        model.eval()
        with torch.no_grad():
            for i, (imgs, loc, conf) in enumerate(test_loader):
                imgs, loc, conf = Variable(imgs), Variable(loc), Variable(conf)
                if use_gpu:
                    imgs = imgs.cuda()
                    loc = loc.cuda()
                    conf = conf.cuda()
                loc_pred, con_pred = model(imgs)
                loss = Loss(loc_pred, loc, con_pred, conf)
                valid_loss += loss.item()
                #print('Validing progress %.1f %%' %(100*(i+1)/len(test_loader)), end='')
        print('\rEpoch [%d/%d], Validing loss: %.4f' %
              (epoch + 1, num_epochs, valid_loss / len(test_loader)),
              end='\n')
        print('\n')
        # NOTE(review): scheduler is stepped with the SUM of batch losses, not
        # the mean — consistent across epochs only if the loader length is
        # constant; confirm this is intended.
        scheduler.step(valid_loss)
        t2 = time.time()
        #print('epoch escape time %f secs' %t2-t1)
    # Save model
    #PATH_1 = 'drive/My Drive/BootCamp4/SSD/ssd_2.pki'
    #torch.save(model, PATH_1)
    PATH = 'drive/My Drive/BootCamp4/SSD/ssd_state_dict.pki'
    torch.save(model.state_dict(), PATH)