def main():
    """Build the network selected by ``model_choice`` and pass it to ``train_model``.

    The process exits with an explanatory message when ``model_choice`` is
    neither ``'unet'`` nor ``'unet_baseline'``.
    """
    num_classes = 3

    # Guard clause: reject unsupported choices before constructing anything.
    if model_choice not in ('unet', 'unet_baseline'):
        sys.exit(
            'Invalid model_choice {}, choose unet_baseline or unet'.format(
                model_choice))

    # 'unet' is the larger model; 'unet_baseline' is the year 2 best solution
    # (XD_XD's model), used as the baseline model.
    model_cls = Unet if model_choice == 'unet' else UnetBaseline
    model = model_cls(
        feature_scale=feature_scale,
        n_classes=num_classes,
        is_deconv=True,
        in_channels=3,
        is_batchnorm=True,
    )

    # Adam is used here; optim.SGD with Nesterov momentum is an alternative.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_model(
        model,
        optimizer,
        epochs=total_epochs,
        print_every=print_every,
        checkpoint_path=starting_checkpoint_path,
    )
def main():
    """Train (or only evaluate) the selected U-Net variant on a single device.

    Steps, all driven by module-level configuration names:

    1. Create the checkpoint directory and the train/val loggers, and log the
       ground-truth sample images.
    2. Build the model chosen by ``model_choice`` (``'unet'`` or
       ``'unet_baseline'``); exit the process for any other value.
    3. Resume model/optimizer state from ``starting_checkpoint_path`` when that
       file exists, otherwise initialise the weights with ``weights_init``.
    4. If ``evaluate_only`` is set, evaluate once on the val loader and return.
    5. Otherwise train for the remaining epochs, evaluating and saving a
       checkpoint at the end of every epoch.
    """
    num_classes = 3

    # create checkpoint dir
    checkpoint_dir = 'checkpoints/{}'.format(experiment_name)
    os.makedirs(checkpoint_dir, exist_ok=True)

    logger_train = Logger('logs/{}/train'.format(experiment_name))
    logger_val = Logger('logs/{}/val'.format(experiment_name))
    log_sample_img_gt(sample_images_train, sample_images_val,
                      logger_train, logger_val)
    logging.info('Logged ground truth image samples')

    if model_choice == 'unet':
        # larger model
        model = Unet(feature_scale=feature_scale, n_classes=num_classes,
                     is_deconv=True, in_channels=3, is_batchnorm=True)
    elif model_choice == 'unet_baseline':
        # year 2 best solution XD_XD's model, as the baseline model
        model = UnetBaseline(feature_scale=feature_scale,
                             n_classes=num_classes, is_deconv=True,
                             in_channels=3, is_batchnorm=True)
    else:
        sys.exit(
            'Invalid model_choice {}, choose unet_baseline or unet'.format(
                model_choice))

    # move the model parameters to CPU/GPU
    model = model.to(device=device, dtype=dtype)

    criterion = nn.CrossEntropyLoss(weight=loss_weights).to(device=device,
                                                            dtype=dtype)

    # optim.SGD with Nesterov momentum (momentum=0.9, nesterov=True) is an
    # alternative to Adam here.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # resume from a checkpoint if provided
    starting_epoch = 0
    best_acc = 0.0

    if os.path.isfile(starting_checkpoint_path):
        logging.info(
            'Loading checkpoint from {0}'.format(starting_checkpoint_path))
        # map_location lets a checkpoint saved on a GPU be restored when the
        # configured device is different (e.g. a CPU-only machine).
        checkpoint = torch.load(starting_checkpoint_path,
                                map_location=device)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        starting_epoch = checkpoint['epoch']
        best_acc = checkpoint.get('best_acc', 0.0)
    else:
        logging.info(
            'No valid checkpoint is provided. Start to train from scratch...')
        # Only initialise weights when training from scratch, so restored
        # checkpoint weights are never overwritten.
        model.apply(weights_init)

    if evaluate_only:
        val_loss, val_acc = evaluate(loader_val, model, criterion)
        print('Evaluated on val set, loss is {}, accuracy is {}'.format(
            val_loss, val_acc))
        return

    # Starting value of the running step counter that `train` receives and
    # returns. NOTE(review): this assumes `train` advances the counter by
    # len(dset_train) per epoch - confirm against `train`.
    step = starting_epoch * len(dset_train)

    for epoch in range(starting_epoch, total_epochs):
        logging.info('Epoch {} of {}'.format(epoch, total_epochs))

        # train for one epoch
        step = train(loader_train, model, criterion, optimizer, epoch, step,
                     logger_train)

        # evaluate on val set
        logging.info(
            'Evaluating model on the val set at the end of epoch {}...'.format(
                epoch))
        val_loss, val_acc = evaluate(loader_val, model, criterion)
        # The original message had three placeholders for four arguments, so
        # `step` was reported as the loss and the accuracy was dropped.
        logging.info(
            '\nEpoch {}, step {}, val loss is {}, val accuracy is {}\n'.format(
                epoch, step, val_loss, val_acc))
        logger_val.scalar_summary('val_loss', val_loss, step + 1)
        logger_val.scalar_summary('val_acc', val_acc, step + 1)
        # log the val images too

        # record the best accuracy; save checkpoint for every epoch
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)

        checkpoint_path = os.path.join(
            checkpoint_dir, 'checkpoint_epoch{}_{}.pth.tar'.format(
                epoch, strftime("%Y-%m-%d-%H-%M-%S", localtime())))
        logging.info(
            'Saving to checkoutpoint file at {}. Is it the highest accuracy checkpoint so far: {}'
            .format(checkpoint_path, str(is_best)))
        save_checkpoint(
            {
                # saved checkpoints are numbered starting from 1
                'epoch': epoch + 1,
                'arch': model_choice,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_acc': best_acc
            }, is_best, checkpoint_path, checkpoint_dir)
def main(cp_path, input_image_dir, out_path, vis_dir=None, save_pred=save_pred_polys):
    """
    Applies the model at cp_path to input images and output the csv required
    for SpaceNet to compute the F-1 score and other metrics against the
    ground truth.

    Only files in input_image_dir whose name ends with '.jpg' are processed.
    If there are none, a csv containing only the header row is written.

    Args:
        cp_path: path to the model checkpoint to use
        input_image_dir: path to directory containing the images to extract
            building footprints from, usually the val or test dir
        out_path: path of the output csv
        vis_dir: optionally a directory to place the visualization of
            polygons on each image
        save_pred: whether to save visualizations to vis_dir

    Raises:
        ValueError: if the module-level model_choice is neither 'unetv2'
            nor 'unetbase'.
    """
    if vis_dir:
        os.makedirs(vis_dir, exist_ok=True)

    # map_location lets a checkpoint saved on a GPU be loaded when the
    # configured device is different (e.g. CPU-only inference).
    checkpoint = torch.load(cp_path, map_location=device)

    if model_choice == 'unetv2':
        model_cls = Unet
    elif model_choice == 'unetbase':
        model_cls = UnetBaseline
    else:
        raise ValueError('Unknown model_choice={0}'.format(model_choice))
    model = model_cls(feature_scale=feature_scale, n_classes=3, is_deconv=True,
                      in_channels=3, is_batchnorm=True)

    model.load_state_dict(checkpoint['state_dict'])
    model = model.to(device=device, dtype=dtype)
    model.eval()  # set model to evaluation mode
    logging.info('Model loaded from checkpoint.')

    image_files = [
        image_file for image_file in os.listdir(input_image_dir)
        if image_file.endswith('.jpg')
    ]

    result_dfs = []
    for image_name in tqdm(image_files):
        image_name_no_file_type = image_name.split('.jpg')[0]
        # image_id is of format _-115.3064538_36.1756826998
        image_id = image_name_no_file_type.split('RGB-PanSharpen_')[1]

        image_path = os.path.join(input_image_dir, image_name)
        original_image = io.imread(image_path)

        # Move the last axis first and add a leading batch axis of size 1.
        # NOTE(review): presumably HWC -> 1xCxHxW; confirm the image layout.
        image = original_image.transpose((2, 0, 1))
        image = torch.from_numpy(np.expand_dims(image, 0)).type(
            torch.float32).to(device=device, dtype=dtype)

        with torch.no_grad():
            scores = model(image)
            # index of the highest-scoring class along dim 1
            _, prediction = scores.max(1)

        # The model output is reshaped to a fixed 256 x 256 mask.
        prediction = prediction.reshape((256, 256)).cpu().data.numpy()

        result_df, polygons = mask_to_poly(prediction, image_id)
        result_dfs.append(result_df)

        # save prediction polygons visualization to output
        if save_pred and vis_dir:
            visualize_poly(polygons, prediction,
                           os.path.join(vis_dir, 'poly_' + image_name))

    # pd.concat raises ValueError on an empty list, so handle the case of a
    # directory without any .jpg image explicitly.
    if result_dfs:
        all_df = pd.concat(result_dfs)
        num_rows = len(all_df)
    else:
        all_df = None
        num_rows = 0
        logging.warning(
            'No .jpg images found in {}; writing a header-only csv.'.format(
                input_image_dir))

    logging.info('Writing result to csv, length of all_df is {}'.format(
        num_rows))

    with open(out_path, 'w') as f:
        f.write('ImageId,BuildingId,PolygonWKT_Pix,Confidence\n')
        if all_df is not None:
            for _, row in tqdm(all_df.iterrows(), total=num_rows):
                f.write("{},{},\"{}\",{:.6f}\n".format(
                    row.image_id, int(row.bid), row.wkt, row.area_ratio))
def main():
    """Train (or only evaluate) the selected U-Net variant, optionally distributed.

    Command-line arguments are parsed into the module-level ``args``;
    distributed mode is enabled when ``args.world_size >= 2``. The function:

    1. Initialises the process group when distributed.
    2. Builds the train/val datasets and loaders and draws one batch of sample
       images from each (stored in module-level globals).
    3. Rebuilds the loaders around a ``DistributedSampler`` when distributed.
    4. Builds the model chosen by ``model_choice`` (``'unet'`` or
       ``'unet_baseline'``); exits the process for any other value.
    5. Resumes from ``starting_checkpoint_path`` when that file exists,
       otherwise initialises the weights with ``weights_init``.
    6. If ``evaluate_only`` is set, evaluates once on the val loader and
       returns; otherwise trains, evaluating and checkpointing every epoch.
    """
    global args, sample_images_train_tensors, sample_images_val_tensors

    args = parser.parse_args()
    print('args.world_size: ', args.world_size)
    print('args.dist_backend: ', args.dist_backend)
    print('args.rank: ', args.rank)

    # more info on distributed PyTorch see https://pytorch.org/tutorials/intermediate/dist_tuto.html
    args.distributed = args.world_size >= 2
    # The original format string had no placeholder, so the flag was never shown.
    print('is distributed: {}'.format(args.distributed))

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
        print('dist.init_process_group() finished.')

    # data sets and loaders
    dset_train = SpaceNetDataset(data_path_train, split_tags,
                                 transform=T.Compose([ToTensor()]))
    dset_val = SpaceNetDataset(data_path_val, split_tags,
                               transform=T.Compose([ToTensor()]))
    logging.info('Training set size: {}, validation set size: {}'.format(
        len(dset_train), len(dset_val)))

    # need to instantiate these data loaders to produce the sample images
    # because they need to be shuffled!
    # shuffle True to reshuffle at every epoch
    loader_train = DataLoader(dset_train, batch_size=train_batch_size,
                              shuffle=True, num_workers=num_workers)
    loader_val = DataLoader(dset_val, batch_size=val_batch_size,
                            shuffle=True, num_workers=num_workers)

    # get one batch of sample images that are used to visualize the training
    # progress throughout this run
    sample_images_train, sample_images_train_tensors = get_sample_images(
        loader_train, which_set='train')
    sample_images_val, sample_images_val_tensors = get_sample_images(
        loader_val, which_set='val')

    # Kept so that set_epoch() can be called at the start of every epoch.
    dist_sampler = None
    if args.distributed:
        # re-instantiate the training data loader to make distributed
        # training possible
        train_batch_size_dist = train_batch_size * args.world_size
        logging.info(
            'Using train_batch_size_dist {}.'.format(train_batch_size_dist))
        # TODO https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler
        # check if need num_replicas and rank
        dist_sampler = torch.utils.data.distributed.DistributedSampler(
            dset_train)
        train_sampler = torch.utils.data.BatchSampler(
            dist_sampler, batch_size=train_batch_size_dist, drop_last=False)
        print('train_sampler created successfully.')

        # The keyword is `batch_sampler`; the original `batch_sample` made the
        # DataLoader constructor raise TypeError.
        loader_train = DataLoader(dset_train, num_workers=num_workers,
                                  pin_memory=True,
                                  batch_sampler=train_sampler)

        # NOTE(review): the flattened source does not show whether the val
        # loader rebuild belongs inside this branch - confirm.
        loader_val = DataLoader(dset_val, batch_size=val_batch_size,
                                shuffle=False, num_workers=num_workers,
                                pin_memory=True)
        print('both data loaders created successfully.')

    # checkpoint dir
    checkpoint_dir = out_checkpoint_dir

    logger_train = Logger('{}/train'.format(tensorboard_path))
    logger_val = Logger('{}/val'.format(tensorboard_path))
    log_sample_img_gt(sample_images_train, sample_images_val,
                      logger_train, logger_val)
    logging.info('Logged ground truth image samples')

    num_classes = 3

    if model_choice == 'unet':
        # larger model
        model = Unet(feature_scale=feature_scale, n_classes=num_classes,
                     is_deconv=True, in_channels=3, is_batchnorm=True)
    elif model_choice == 'unet_baseline':
        # year 2 best solution XD_XD's model, as the baseline model
        model = UnetBaseline(feature_scale=feature_scale,
                             n_classes=num_classes, is_deconv=True,
                             in_channels=3, is_batchnorm=True)
    else:
        sys.exit('Invalid model_choice {}, choose unet_baseline or unet'.format(
            model_choice))
    print('model instantiated.')

    if not args.distributed:
        # move the model parameters to target device
        model = model.to(device=device, dtype=dtype)
        # model = torch.nn.DataParallel(model).cuda()  # Batch AI example
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)
        print('torch.nn.parallel.DistributedDataParallel() ran.')

    criterion = nn.CrossEntropyLoss(weight=loss_weights).to(device=device,
                                                            dtype=dtype)

    # optim.SGD with Nesterov momentum (momentum=0.9, nesterov=True) is an
    # alternative to Adam here.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # resume from a checkpoint if provided
    starting_epoch = 0
    best_acc = 0.0
    if os.path.isfile(starting_checkpoint_path):
        logging.info(
            'Loading checkpoint from {0}'.format(starting_checkpoint_path))
        # map_location lets a checkpoint saved on one device be restored when
        # the configured device is different.
        checkpoint = torch.load(starting_checkpoint_path,
                                map_location=device)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        starting_epoch = checkpoint['epoch']
        best_acc = checkpoint.get('best_acc', 0.0)
    else:
        logging.info(
            'No valid checkpoint is provided. Start to train from scratch...')
        # Only initialise weights when training from scratch, so restored
        # checkpoint weights are never overwritten.
        model.apply(weights_init)

    # run training or evaluation
    if evaluate_only:
        val_loss, val_acc = evaluate(loader_val, model, criterion)
        print('Evaluated on val set, loss is {}, accuracy is {}'.format(
            val_loss, val_acc))
        return

    # Starting value of the running step counter that `train` receives and
    # returns. NOTE(review): this assumes `train` advances the counter by
    # len(dset_train) per epoch - confirm against `train`.
    step = starting_epoch * len(dset_train)

    for epoch in range(starting_epoch, total_epochs):
        logging.info('Epoch {} of {}'.format(epoch, total_epochs))

        if dist_sampler is not None:
            # Without set_epoch the DistributedSampler yields the same
            # ordering in every epoch.
            dist_sampler.set_epoch(epoch)

        # train for one epoch
        step = train(loader_train, model, criterion, optimizer, epoch, step,
                     logger_train)

        # evaluate on val set
        logging.info(
            'Evaluating model on the val set at the end of epoch {}...'.format(
                epoch))
        val_loss, val_acc = evaluate(loader_val, model, criterion)
        # The original message had three placeholders for four arguments, so
        # `step` was reported as the loss and the accuracy was dropped.
        logging.info(
            '\nEpoch {}, step {}, val loss is {}, val accuracy is {}\n'.format(
                epoch, step, val_loss, val_acc))
        logger_val.scalar_summary('val_loss', val_loss, step + 1)
        logger_val.scalar_summary('val_acc', val_acc, step + 1)
        # log the val images too

        # record the best accuracy; save checkpoint for every epoch
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)

        checkpoint_path = os.path.join(
            checkpoint_dir, 'checkpoint_epoch{}_{}.pth.tar'.format(
                epoch, strftime("%Y-%m-%d-%H-%M-%S", localtime())))
        logging.info(
            'Saving to checkoutpoint file at {}. Is it the highest accuracy checkpoint so far: {}'.format(
                checkpoint_path, str(is_best)))
        save_checkpoint({
            # saved checkpoints are numbered starting from 1
            'epoch': epoch + 1,
            'arch': model_choice,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_acc': best_acc
        }, is_best, checkpoint_path, checkpoint_dir)