assert (valid_num == len(valid_loader.sampler)) valid_loss = valid_loss / valid_num print(predicts[0].shape) print(truths[0].shape) #-------------------------------------------------------- predicts = np.concatenate(predicts).squeeze() truths = np.concatenate(truths).squeeze() precision, result, threshold = do_kaggle_metric(predicts, truths, threshold) valid_loss[2] = precision.mean() return valid_loss batch_size = 25 net = Net().cuda() public_losses = [] private_losses = [] i = -1 results = {} m = 0 for fold in FOLD: for step in RANGE[m]: seeds = {} SIZE = 202 bsize = 22 #ResNet34_OHEM00079500_model.pth initial_checkpoint = '/data/liao_checkpoints/ocnet_256/fold1_stage2/' + step + '.pth' model = "OCnet256_" + 'fold' + fold + step from tqdm import tqdm if initial_checkpoint is not None:
image_path = os.path.join(image_folder, file_id + ".png") mask_folder = os.path.join(self.root_path, "masks") mask_path = os.path.join(mask_folder, file_id + ".png") image = load_image(image_path) if self.is_test: return (image, ) else: mask = load_image(mask_path, mask=True) return image, mask device = "cuda" model = Net() #initial_checkpoint = '/home/liaop20/Kaggle-TGS/kaggle_tgs/20180826/code/checkpoint/fold8/00050000_model_f.pth' #initial_checkpoint = CHECKPOINTS+'/list_train_3600/0000000_model.pth' #if initial_checkpoint is not None: # print('\tinitial_checkpoint = %s\n' % initial_checkpoint) # model.load_state_dict(torch.load(initial_checkpoint, map_location=lambda storage, loc: storage)) test_path = os.path.join(DATA, 'test') test_file_list = glob.glob(os.path.join(test_path, 'images', '*.png')) test_file_list = [f.split('/')[-1].split('.')[0] for f in test_file_list] test_file_list[:3], test_path train_path = os.path.join(DATA, 'train') train_file_list = glob.glob(os.path.join(train_path, 'images', '*.png')) train_file_list = [f.split('/')[-1].split('.')[0] for f in train_file_list]
mask_folder = os.path.join(self.root_path, "masks") mask_path = os.path.join(mask_folder, file_id + ".png") image = load_image(image_path) #image, _ = do_resize2(image,image, SIZE, SIZE) #image, _ = do_center_pad_to_factor2(image,image, factor=256) if self.is_test: return (image, ) else: mask = load_image(mask_path, mask=True) return image, mask device = "cuda" model = Net() #initial_checkpoint = '/home/liaop20/Kaggle-TGS/kaggle_tgs/20180826/code/checkpoint/fold8/00050000_model_f.pth' #initial_checkpoint = CHECKPOINTS+'/list_train_3600/0000000_model.pth' #if initial_checkpoint is not None: # print('\tinitial_checkpoint = %s\n' % initial_checkpoint) # model.load_state_dict(torch.load(initial_checkpoint, map_location=lambda storage, loc: storage)) test_path = os.path.join(DATA, 'test') test_file_list = glob.glob(os.path.join(test_path, 'images', '*.png')) test_file_list = [f.split('/')[-1].split('.')[0] for f in test_file_list] test_file_list[:3], test_path train_path = os.path.join(DATA, 'train') train_file_list = glob.glob(os.path.join(train_path, 'images', '*.png')) train_file_list = [f.split('/')[-1].split('.')[0] for f in train_file_list]
def train(initial_checkpoint):
    """Run the training loop for ``Net`` on the configured TGS fold.

    The function builds the train/validation data loaders, optionally
    restores model and optimizer state, then iterates over the training
    loader until ``num_iters`` optimisation steps have been done.  Every
    ``iter_valid`` steps it calls ``validation(net, valid_loader)`` and logs
    a summary line; on every step listed in ``iter_save`` it writes a model
    checkpoint and an optimizer checkpoint.

    Relies on module-level names defined elsewhere in the project
    (``CHECKPOINTS``, ``IDENTIFIER``, ``SEED``, ``CODE``, ``FOLD``, ``ne``,
    ``MODEL``, ``OHEM``, ``PI``, ``Logger``, ``TGSDataset``, ``Net``,
    ``validation``, ``null_collate``, ...).

    Args:
        initial_checkpoint: Path to a ``*_model.pth`` state dict to resume
            from, or ``None`` to start from freshly initialised weights.
            When given, the sibling ``*_optimizer.pth`` file (same path with
            ``_model.pth`` replaced) is loaded as well to restore the
            optimizer state, the iteration counter and the epoch counter.

    Returns:
        None.  Results are written to the log and to checkpoint files under
        ``CHECKPOINTS``.
    """
    ## setup -----------------
    os.makedirs(CHECKPOINTS + '/checkpoint', exist_ok=True)
    os.makedirs(CHECKPOINTS + '/train', exist_ok=True)
    os.makedirs(CHECKPOINTS + '/backup', exist_ok=True)

    log = Logger()
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('\tSEED = %u\n' % SEED)
    log.write('\tPROJECT_PATH = %s\n' % CODE)
    log.write('\t__file__ = %s\n' % __file__)
    log.write('\tRESULT = %s\n' % CHECKPOINTS)
    log.write('\n')
    log.write('\t<additional comments>\n')
    log.write('\t ... \n')
    log.write('\n')

    ## dataset ----------------------------------------
    log.write('Configuring dataset...\n')
    batch_size = 16

    train_split = 'list_train' + str(FOLD) + '_3600' + ne + "_balanced"
    train_dataset = TGSDataset(train_split, train_augment, 'train')
    # Checkpoints are later saved under CHECKPOINTS/<train_dataset.split>/;
    # presumably that equals train_split -- verify against TGSDataset.
    os.makedirs(CHECKPOINTS + '/' + train_split, exist_ok=True)
    train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        #sampler = ConstantSampler(train_dataset,[31]*batch_size*100),
        batch_size=batch_size,
        drop_last=True,
        num_workers=8,
        pin_memory=True,
        collate_fn=null_collate)

    valid_dataset = TGSDataset('list_valid' + str(FOLD) + '_400' + ne,
                               valid_augment, 'train')
    valid_loader = DataLoader(
        valid_dataset,
        sampler=RandomSampler(valid_dataset),
        batch_size=batch_size,
        drop_last=False,
        num_workers=8,
        pin_memory=True,
        collate_fn=null_collate)

    # With drop_last=True a dataset smaller than one batch yields no batches
    # at all, and the training loop below would never make progress.
    assert (len(train_dataset) >= batch_size)
    log.write('batch_size = %d\n' % (batch_size))
    log.write('train_dataset.split = %s\n' % (train_dataset.split))
    log.write('valid_dataset.split = %s\n' % (valid_dataset.split))
    log.write('\n')

    #debug
    if 0:  # flip to 1 to inspect training samples and mask overlays
        for debug_images, truth, index, cache in train_loader:
            images = debug_images.cpu().data.numpy().squeeze()
            masks = truth.cpu().data.numpy().squeeze()
            for b in range(len(index)):
                image = images[b] * 255
                image = np.dstack([image, image, image])
                mask = masks[b]

                image_show('image', image, resize=2)
                image_show_norm('mask', mask, max=1, resize=2)

                overlay0 = draw_mask_overlay(mask, image, color=[0, 0, 255])
                overlay0 = draw_mask_to_contour_overlay(
                    mask, overlay0, 2, color=[0, 0, 255])
                image_show('overlay0', overlay0, resize=2)
                cv2.waitKey(0)

    ## net ----------------------------------------
    log.write('Configuring neural network...\n')
    net = Net().cuda()

    if initial_checkpoint is not None:
        log.write('\tinitial_checkpoint = %s\n' % initial_checkpoint)
        # map_location keeps the tensors on the storage they were
        # deserialised to; load_state_dict then copies them into the net.
        net.load_state_dict(
            torch.load(initial_checkpoint,
                       map_location=lambda storage, loc: storage))

    log.write("The net is an instance of {}.".format(type(net)))
    log.write('\n')

    ## optimiser ----------------------------------
    num_iters = 300 * 1000
    iter_smooth = 20
    iter_valid = 100
    # A set gives O(1) membership tests; this is checked on every iteration.
    iter_save = set(range(0, num_iters, 500)) | {0, num_iters - 1}  #1*1000

    FREEZE = False
    #------------------------------------------------------
    if FREEZE:  ##freeze
        for p in net.feature_net.parameters():
            p.requires_grad = False
    #------------------------------------------------------

    # Cosine schedule with restarts: 30 cycles of 300*1000/30 iterations,
    # each going from 0.01 down towards 0.
    base_lr = 0.01
    cycle_len = 300 * 1000 / 30

    def scheduler(x):
        return (base_lr / 2) * (
            np.cos(PI * (np.mod(x - 1, cycle_len) / cycle_len)) + 1)

    #log.write(scheduler(1))
    #log.write(scheduler(5000))
    #log.write(scheduler(10001))

    optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),
                          lr=0.01,
                          momentum=0.9,
                          weight_decay=0.0001)

    start_iter = 0
    start_epoch = 0
    if initial_checkpoint is not None:
        checkpoint = torch.load(
            initial_checkpoint.replace('_model.pth', '_optimizer.pth'))
        start_iter = checkpoint['iter']
        start_epoch = checkpoint['epoch']

        # Load everything except the learning rate: remember the current
        # rate, restore the saved state, then put the rate back.
        rate = get_learning_rate(optimizer)
        optimizer.load_state_dict(checkpoint['optimizer'])
        adjust_learning_rate(optimizer, rate)

    ## start training here! ##############################################
    log.write('Start training...\n')
    log.write(
        ' rate iter epoch | valid_loss | train_loss | batch_loss | time \n'
    )
    log.write(
        '-------------------------------------------------------------------------------------------------------------------------------\n'
    )

    train_loss = np.zeros(6, np.float32)
    valid_loss = np.zeros(6, np.float32)
    batch_loss = np.zeros(6, np.float32)
    rate = 0
    len_train_dataset = len(train_dataset)

    # Global step counter; starts at the restored step when resuming.
    iteration = start_iter
    # Set when the scheduler asks to stop, so the outer loop ends too
    # instead of restarting the epoch forever.
    stop_training = False

    start = timer()
    while iteration < num_iters and not stop_training:  # loop over the dataset multiple times
        sum_train_loss = np.zeros(6, np.float32)
        num_summed = 0

        optimizer.zero_grad()
        for images, truth, index, cache in train_loader:
            # Honour num_iters inside the epoch as well, not only between
            # epochs.
            if iteration >= num_iters:
                break

            if 0:  # flip to 1 to display every image/mask of the batch
                image = images.cpu().data.numpy().squeeze()
                mask = truth.cpu().data.numpy().squeeze()
                for b in range(len(index)):
                    image_show_norm('image', image[b], max=1, resize=2)
                    image_show_norm('mask', mask[b], max=1, resize=2)
                    cv2.waitKey(0)

            batch_size = len(index)
            epoch = ((iteration - start_iter) * batch_size / len_train_dataset
                     + start_epoch)

            if iteration % iter_valid == 0:
                net.set_mode('valid')
                valid_loss = validation(net, valid_loader)
                net.set_mode('train')

                log.write2('\r')
                log.write('%0.4f %5.1f %6.1f | %0.3f %0.3f (%0.3f) | %0.3f %0.3f | %0.3f %0.3f | %s \n' % (\
                    rate, iteration/1000, epoch,
                    valid_loss[0], valid_loss[1], valid_loss[2],
                    train_loss[0], train_loss[1],
                    batch_loss[0], batch_loss[1],
                    time_to_str((timer() - start),'min')))
                time.sleep(0.01)

            if iteration in iter_save:
                checkpoint_prefix = (CHECKPOINTS + "/" + train_dataset.split
                                     + '/' + MODEL + OHEM)
                torch.save(net.state_dict(),
                           checkpoint_prefix + '%08d_model.pth' % (iteration))
                torch.save(
                    {
                        'optimizer': optimizer.state_dict(),
                        'iter': iteration,
                        'epoch': epoch,
                    },
                    checkpoint_prefix + '%08d_optimizer.pth' % (iteration))

            # learning rate scheduler -------------
            if scheduler is not None:
                #scheduler.batch_step()
                lr = scheduler(iteration)
                if lr < 0:
                    # A negative rate is the scheduler's stop signal.
                    stop_training = True
                    break
                adjust_learning_rate(optimizer, lr)
            rate = get_learning_rate(optimizer)
            #rate = 0.01

            # one iteration update -------------
            #net.set_mode('train',is_freeze_bn=True)
            net.set_mode('train')
            images = images.cuda()
            truth = truth.cuda()

            logit = data_parallel(net, images)  #net(input)
            if OHEM == "OHEM":
                loss = (net.focal_loss(logit, truth, 1.0, 0.5, 0.25)
                        + net.criterion(logit, truth))
            else:
                loss = net.criterion(logit, truth)
            dice = net.metric(logit, truth)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            #torch.nn.utils.clip_grad_norm(net.parameters(), 1)

            # print statistics ------------
            batch_loss = np.array((
                loss.item(),
                dice.item(),
                0,
                0,
                0,
                0,
            ))
            sum_train_loss += batch_loss
            num_summed += 1
            if iteration % iter_smooth == 0:
                # Mean over the steps accumulated since the last reset.
                train_loss = sum_train_loss / num_summed
                sum_train_loss = np.zeros(6, np.float32)
                num_summed = 0

            log.write2('\r%0.4f %5.1f %6.1f | %0.3f %0.3f (%0.3f) | %0.3f %0.3f | %0.3f %0.3f | %s ' % (\
                rate, iteration/1000, epoch,
                valid_loss[0], valid_loss[1], valid_loss[2],
                train_loss[0], train_loss[1],
                batch_loss[0], batch_loss[1],
                time_to_str((timer() - start), 'min')))
            iteration += 1
        #-- end of one data loader --
    #-- end of all iterations --

    log.write('\n')