def trainingNetwork(images_folder_train, labels_folder_train, images_folder_val, labels_folder_val,
                    dictionary, target_classes, num_classes, save_network_as, classifier_name,
                    epochs, batch_sz, batch_mult, learning_rate, L2_penalty, validation_frequency,
                    flagShuffle, experiment_name, progress):

    ##### DATA #####

    # setup the training dataset
    datasetTrain = CoralsDataset(images_folder_train, labels_folder_train, dictionary, target_classes, num_classes)

    print("Dataset setup..", end='')
    datasetTrain.computeAverage()
    datasetTrain.computeWeights()
    target_classes = datasetTrain.dict_target
    print("done.")

    datasetTrain.enableAugumentation()

    datasetVal = CoralsDataset(images_folder_val, labels_folder_val, dictionary, target_classes, num_classes)
    datasetVal.dataset_average = datasetTrain.dataset_average
    datasetVal.weights = datasetTrain.weights

    # AUGMENTATION IS NOT APPLIED ON THE VALIDATION SET
    datasetVal.disableAugumentation()

    # setup the data loaders
    dataloaderTrain = DataLoader(datasetTrain, batch_size=batch_sz, shuffle=flagShuffle, num_workers=0,
                                 drop_last=True, pin_memory=True)

    validation_batch_size = 4
    dataloaderVal = DataLoader(datasetVal, batch_size=validation_batch_size, shuffle=False, num_workers=0,
                               drop_last=True, pin_memory=True)

    training_images_number = len(datasetTrain.images_names)
    validation_images_number = len(datasetVal.images_names)

    ###### SETUP THE NETWORK #####
    net = DeepLab(backbone='resnet', output_stride=16, num_classes=datasetTrain.num_classes)

    models_dir = "models/"
    network_name = os.path.join(models_dir, "deeplab-resnet.pth.tar")
    state = torch.load(network_name)

    # RE-INITIALIZE THE CLASSIFICATION LAYER WITH THE RIGHT NUMBER OF CLASSES,
    # DON'T LOAD THE WEIGHTS OF THE CLASSIFICATION LAYER
    new_dictionary = state['state_dict']
    del new_dictionary['decoder.last_conv.8.weight']
    del new_dictionary['decoder.last_conv.8.bias']
    net.load_state_dict(state['state_dict'], strict=False)

    print("NETWORK USED: DEEPLAB V3+")

    # LOSS
    weights = datasetTrain.weights
    class_weights = torch.FloatTensor(weights).cuda()
    lossfn = nn.CrossEntropyLoss(weight=class_weights, ignore_index=-1)

    # OPTIMIZER
    # optimizer = optim.SGD(net.parameters(), lr=learning_rate, weight_decay=0.0002, momentum=0.9)
    optimizer = optim.Adam(net.parameters(), lr=learning_rate, weight_decay=L2_penalty)

    USE_CUDA = torch.cuda.is_available()
    if USE_CUDA:
        device = torch.device("cuda")
        net.to(device)

    ##### TRAINING LOOP #####

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, verbose=True)

    best_accuracy = 0.0
    best_jaccard_score = 0.0

    print("Training Network")
    for epoch in range(epochs):  # loop over the dataset multiple times

        txt = "Epoch " + str(epoch + 1) + "/" + str(epochs)
        progress.setMessage(txt)
        progress.setProgress((100.0 * epoch) / epochs)
        QApplication.processEvents()

        net.train()
        optimizer.zero_grad()

        running_loss = 0.0
        for i, minibatch in enumerate(dataloaderTrain):
            # get the inputs
            images_batch = minibatch['image']
            labels_batch = minibatch['labels']

            if USE_CUDA:
                images_batch = images_batch.to(device)
                labels_batch = labels_batch.to(device)

            # forward + loss + backward
            outputs = net(images_batch)
            loss = lossfn(outputs, labels_batch)
            loss.backward()

            # TO AVOID MEMORY TROUBLE, UPDATE THE WEIGHTS EVERY BATCH SIZE x BATCH MULT SAMPLES
            if (i + 1) % batch_mult == 0:
                optimizer.step()
                optimizer.zero_grad()

            print(epoch, i, loss.item())
            running_loss += loss.item()

        print("Epoch: %d , Running loss = %f" % (epoch, running_loss))

        ### VALIDATION ###
        if epoch > 0 and (epoch + 1) % validation_frequency == 0:
            print("RUNNING VALIDATION.. ", end='')

            # datasetVal.weights are the same as datasetTrain's
            metrics_val, mean_loss_val = evaluateNetwork(dataloaderVal, datasetVal.weights,
                                                         datasetVal.num_classes, net,
                                                         flagTrainingDataset=False)
            accuracy = metrics_val['Accuracy']
            jaccard_score = metrics_val['JaccardScore']

            scheduler.step(mean_loss_val)

            metrics_train, mean_loss_train = evaluateNetwork(dataloaderTrain, datasetTrain.weights,
                                                             datasetTrain.num_classes, net,
                                                             flagTrainingDataset=True)
            accuracy_training = metrics_train['Accuracy']
            jaccard_training = metrics_train['JaccardScore']

            if jaccard_score > best_jaccard_score:
                best_accuracy = accuracy
                best_jaccard_score = jaccard_score
                torch.save(net.state_dict(), save_network_as)

                # performance of the best network on the validation and training datasets
                metrics_filename = save_network_as[:len(save_network_as) - 4] + "-val-metrics.txt"
                saveMetrics(metrics_val, metrics_filename)
                metrics_filename = save_network_as[:len(save_network_as) - 4] + "-train-metrics.txt"
                saveMetrics(metrics_train, metrics_filename)

            print("-> CURRENT BEST ACCURACY ", best_accuracy)

    print("***** TRAINING FINISHED *****")

    return datasetTrain
def train_net(args):
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_loss = float('inf')
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        model = DeepLab(backbone='mobilenet', output_stride=16, num_classes=num_classes)
        model = nn.DataParallel(model)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.99),
                                     weight_decay=args.weight_decay)
    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    logger = get_logger()

    # Move to GPU, if available
    model = model.to(device)

    # Custom dataloaders
    train_dataset = MICDataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                               shuffle=True, num_workers=num_workers)
    valid_dataset = MICDataset('val')
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=args.batch_size,
                                               shuffle=False, num_workers=num_workers)

    # Epochs
    for epoch in range(start_epoch, args.end_epoch):
        # One epoch's training
        train_loss, train_acc = train(train_loader=train_loader,
                                      model=model,
                                      optimizer=optimizer,
                                      epoch=epoch,
                                      logger=logger)

        lr = get_learning_rate(optimizer)
        print('Current effective learning rate: {}\n'.format(lr))

        writer.add_scalar('model/train_loss', train_loss, epoch)
        writer.add_scalar('model/train_acc', train_acc, epoch)

        # One epoch's validation
        valid_loss, valid_acc = valid(valid_loader=valid_loader,
                                      model=model,
                                      logger=logger)

        writer.add_scalar('model/valid_loss', valid_loss, epoch)
        writer.add_scalar('model/valid_acc', valid_acc, epoch)

        # Check if there was an improvement
        is_best = valid_loss < best_loss
        best_loss = min(valid_loss, best_loss)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, optimizer, best_loss, is_best)
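# --- Aside: save_checkpoint() is defined elsewhere. A minimal sketch consistent with how the
# checkpoint is consumed above (checkpoint['epoch'], checkpoint['model'], checkpoint['optimizer']);
# the filenames are assumptions, not necessarily this repo's actual paths:
import torch

def save_checkpoint_sketch(epoch, epochs_since_improvement, model, optimizer, best_loss, is_best):
    state = {'epoch': epoch,
             'epochs_since_improvement': epochs_since_improvement,
             'best_loss': best_loss,
             'model': model,          # the whole module is pickled, matching checkpoint['model'] above
             'optimizer': optimizer}  # matching checkpoint['optimizer'] above
    torch.save(state, 'checkpoint.tar')
    if is_best:
        # keep a separate copy so later, worse epochs cannot overwrite the best weights
        torch.save(state, 'BEST_checkpoint.tar')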
def train_net(args):
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_loss = float('inf')
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        model = DeepLab(backbone='mobilenet', output_stride=16, num_classes=1)
        model = nn.DataParallel(model)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model'].module
        model = nn.DataParallel(model)
        optimizer = checkpoint['optimizer']

    logger = get_logger()

    # Move to GPU, if available
    model = model.to(device)

    # Custom dataloaders
    train_dataset = DIMDataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                               shuffle=True, num_workers=num_workers)
    # valid_dataset = DIMDataset('valid')
    # valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False,
    #                                            num_workers=num_workers)

    # scheduler = MultiStepLR(optimizer, milestones=[10, 20], gamma=0.1)

    # Epochs
    for epoch in range(start_epoch, args.end_epoch):
        # scheduler.step(epoch)

        # One epoch's training
        train_loss = train(train_loader=train_loader,
                           model=model,
                           optimizer=optimizer,
                           epoch=epoch,
                           logger=logger)

        effective_lr = get_learning_rate(optimizer)
        print('Current effective learning rate: {}\n'.format(effective_lr))

        writer.add_scalar('model/train_loss', train_loss, epoch)

        # One epoch's validation
        # valid_loss = valid(valid_loader=valid_loader,
        #                    model=model,
        #                    logger=logger)
        #
        # writer.add_scalar('Valid_Loss', valid_loss, epoch)

        # One epoch's test
        sad_loss, mse_loss = test(model)
        writer.add_scalar('model/sad_loss', sad_loss, epoch)
        writer.add_scalar('model/mse_loss', mse_loss, epoch)

        # Print status
        status = 'Test: SAD {:.4f} MSE {:.4f}\n'.format(sad_loss, mse_loss)
        logger.info(status)

        # Check if there was an improvement
        is_best = mse_loss < best_loss
        best_loss = min(mse_loss, best_loss)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, optimizer, best_loss, is_best)
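# --- Aside: epochs_since_improvement is tracked above but only printed and checkpointed. A
# common companion pattern (an assumption here, not something this script does) is to decay the
# learning rate once the validation metric has stalled for a few epochs:
def decay_lr_on_plateau(optimizer, epochs_since_improvement, patience=4, factor=0.5, min_lr=1e-6):
    """Scale down the learning rate of every param group after `patience` stalled epochs."""
    if epochs_since_improvement > 0 and epochs_since_improvement % patience == 0:
        for group in optimizer.param_groups:
            group['lr'] = max(group['lr'] * factor, min_lr)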
        pretrained_backbone=True).to(device)
else:
    raise NotImplementedError()

if (args.debug):
    print("model_G :\n", model_G)

# load the model checkpoint
if not args.load_checkpoints_path_G == '' and os.path.exists(args.load_checkpoints_path_G):
    load_checkpoint(model_G, device, args.load_checkpoints_path_G)

#================================
# set up optimizer_G
#================================
optimizer_G = optim.Adam(params=model_G.parameters(), lr=args.lr, betas=(args.beta1, args.beta2))

#================================
# set up the loss_G function
#================================
loss_entropy_fn = CrossEntropy2DLoss(device)

#================================
# train the model
#================================
if (args.train_mode == "train"):
    print("Starting Training Loop...")
    n_print = 1
    step = 0
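# --- Aside: CrossEntropy2DLoss is defined elsewhere in this codebase. A rough sketch of what
# such a wrapper typically does (an assumption, not this repo's implementation): per-pixel
# cross-entropy over [N, C, H, W] logits against [N, H, W] integer label maps, skipping void pixels.
import torch.nn as nn

class CrossEntropy2DLossSketch(nn.Module):
    def __init__(self, device, ignore_index=255):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

    def forward(self, logits, targets):
        # logits: [N, C, H, W] raw scores; targets: [N, H, W] integer class ids
        return self.loss_fn(logits, targets.long())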
def trainingNetwork(images_folder_train, labels_folder_train, images_folder_val, labels_folder_val,
                    dictionary, target_classes, output_classes, save_network_as, classifier_name,
                    epochs, batch_sz, batch_mult, learning_rate, L2_penalty, validation_frequency,
                    loss_to_use, epochs_switch, epochs_transition, tversky_alpha, tversky_gamma,
                    optimiz, flag_shuffle, flag_training_accuracy, progress):

    ##### DATA #####

    # setup the training dataset
    datasetTrain = CoralsDataset(images_folder_train, labels_folder_train, dictionary, target_classes)

    print("Dataset setup..", end='')
    datasetTrain.computeAverage()
    datasetTrain.computeWeights()
    print(datasetTrain.dict_target)
    print(datasetTrain.weights)
    freq = 1.0 / datasetTrain.weights
    print(freq)
    print("done.")

    save_classifier_as = save_network_as.replace(".net", ".json")

    datasetTrain.enableAugumentation()

    datasetVal = CoralsDataset(images_folder_val, labels_folder_val, dictionary, target_classes)
    datasetVal.dataset_average = datasetTrain.dataset_average
    datasetVal.weights = datasetTrain.weights

    # AUGMENTATION IS NOT APPLIED ON THE VALIDATION SET
    datasetVal.disableAugumentation()

    # setup the data loaders
    dataloaderTrain = DataLoader(datasetTrain, batch_size=batch_sz, shuffle=flag_shuffle, num_workers=0,
                                 drop_last=True, pin_memory=True)

    validation_batch_size = 4
    dataloaderVal = DataLoader(datasetVal, batch_size=validation_batch_size, shuffle=False, num_workers=0,
                               drop_last=True, pin_memory=True)

    training_images_number = len(datasetTrain.images_names)
    validation_images_number = len(datasetVal.images_names)

    print("NETWORK USED: DEEPLAB V3+")

    if os.path.exists(save_network_as):
        net = DeepLab(backbone='resnet', output_stride=16, num_classes=output_classes)
        net.load_state_dict(torch.load(save_network_as))
        print("Checkpoint loaded.")
    else:
        ###### SETUP THE NETWORK #####
        net = DeepLab(backbone='resnet', output_stride=16, num_classes=output_classes)
        state = torch.load("models/deeplab-resnet.pth.tar")

        # RE-INITIALIZE THE CLASSIFICATION LAYER WITH THE RIGHT NUMBER OF CLASSES,
        # DON'T LOAD THE WEIGHTS OF THE CLASSIFICATION LAYER
        new_dictionary = state['state_dict']
        del new_dictionary['decoder.last_conv.8.weight']
        del new_dictionary['decoder.last_conv.8.bias']
        net.load_state_dict(state['state_dict'], strict=False)

    # OPTIMIZER
    if optimiz == "SGD":
        optimizer = optim.SGD(net.parameters(), lr=learning_rate, weight_decay=L2_penalty, momentum=0.9)
    elif optimiz == "ADAM":
        optimizer = optim.Adam(net.parameters(), lr=learning_rate, weight_decay=L2_penalty)

    USE_CUDA = torch.cuda.is_available()
    if USE_CUDA:
        device = torch.device("cuda")
        net.to(device)

    ##### TRAINING LOOP #####

    reduce_lr_patience = 2
    if loss_to_use == "DICE+BOUNDARY":
        reduce_lr_patience = 200
        print("patience increased !")

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=reduce_lr_patience, verbose=True)

    best_accuracy = 0.0
    best_jaccard_score = 0.0

    # cross-entropy loss
    weights = datasetTrain.weights
    class_weights = torch.FloatTensor(weights).cuda()
    CEloss = nn.CrossEntropyLoss(weight=class_weights, ignore_index=-1)

    # weights for the GENERALIZED DICE LOSS (GDL)
    freq = 1.0 / datasetTrain.weights[1:]
    w = 1.0 / (freq * freq)
    w = w / w.sum() + 0.00001
    w_for_GDL = torch.from_numpy(w)
    w_for_GDL = w_for_GDL.to(device)

    # Focal Tversky loss
    focal_tversky_gamma = torch.tensor(tversky_gamma)
    focal_tversky_gamma = focal_tversky_gamma.to(device)

    tversky_loss_alpha = torch.tensor(tversky_alpha)
    tversky_loss_beta = torch.tensor(1.0 - tversky_alpha)
    tversky_loss_alpha = tversky_loss_alpha.to(device)
    tversky_loss_beta = tversky_loss_beta.to(device)

    print("Training Network")

    num_iter = 0
    total_iter = epochs * int(len(datasetTrain) / dataloaderTrain.batch_size)

    for epoch in range(epochs):

        net.train()
        optimizer.zero_grad()

        loss_values = []
        for i, minibatch in enumerate(dataloaderTrain):

            txt = "Training - Iterations " + str(num_iter + 1) + "/" + str(total_iter)
            progress.setMessage(txt)
            progress.setProgress((100.0 * num_iter) / total_iter)
            QApplication.processEvents()
            num_iter += 1

            # get the inputs
            images_batch = minibatch['image']
            labels_batch = minibatch['labels']

            if USE_CUDA:
                images_batch = images_batch.to(device)
                labels_batch = labels_batch.to(device)

            # forward + loss + backward
            outputs = net(images_batch)
            loss = computeLoss(loss_to_use, CEloss, w_for_GDL, tversky_loss_alpha, tversky_loss_beta,
                               focal_tversky_gamma, epoch, epochs_switch, epochs_transition,
                               labels_batch, outputs)
            loss.backward()

            # TO AVOID MEMORY TROUBLE, UPDATE THE WEIGHTS EVERY BATCH SIZE x BATCH MULT SAMPLES
            if (i + 1) % batch_mult == 0:
                optimizer.step()
                optimizer.zero_grad()

            print(epoch, i, loss.item())
            loss_values.append(loss.item())

        mean_loss_train = sum(loss_values) / len(loss_values)
        print("Epoch: %d , Mean loss = %f" % (epoch, mean_loss_train))

        ### VALIDATION ###
        if epoch > 0 and (epoch + 1) % validation_frequency == 0:
            print("RUNNING VALIDATION.. ", end='')

            metrics_val, mean_loss_val = evaluateNetwork(datasetVal, dataloaderVal, loss_to_use, CEloss,
                                                         w_for_GDL, tversky_loss_alpha, tversky_loss_beta,
                                                         focal_tversky_gamma, epoch, epochs_switch,
                                                         epochs_transition, output_classes, net,
                                                         flag_compute_mIoU=False)
            accuracy = metrics_val['Accuracy']
            jaccard_score = metrics_val['JaccardScore']

            scheduler.step(mean_loss_val)

            accuracy_training = 0.0
            jaccard_training = 0.0
            if flag_training_accuracy is True:
                metrics_train, mean_loss_train = evaluateNetwork(datasetTrain, dataloaderTrain, loss_to_use,
                                                                 CEloss, w_for_GDL, tversky_loss_alpha,
                                                                 tversky_loss_beta, focal_tversky_gamma,
                                                                 epoch, epochs_switch, epochs_transition,
                                                                 output_classes, net, flag_compute_mIoU=False)
                accuracy_training = metrics_train['Accuracy']
                jaccard_training = metrics_train['JaccardScore']

            # if jaccard_score > best_jaccard_score:
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_jaccard_score = jaccard_score
                torch.save(net.state_dict(), save_network_as)

                # performance of the best network on the validation dataset
                metrics_filename = save_network_as[:len(save_network_as) - 4] + "-val-metrics.txt"
                saveMetrics(metrics_val, metrics_filename)

            print("-> CURRENT BEST ACCURACY ", best_accuracy)

    # main loop ended
    torch.cuda.empty_cache()
    del net
    net = None

    print("***** TRAINING FINISHED *****")
    print("BEST ACCURACY REACHED ON THE VALIDATION SET: %.3f " % best_accuracy)

    return datasetTrain
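# --- Aside: computeLoss() and the alpha/beta/gamma tensors above come from this project's loss
# module. For reference, a minimal sketch of a focal Tversky loss in the spirit of Abraham &
# Khan (2018); the exact convention inside computeLoss may differ:
import torch
import torch.nn.functional as F

def focal_tversky_loss_sketch(logits, targets, alpha=0.7, gamma=0.75, eps=1e-7):
    """logits: [N, C, H, W] raw scores; targets: [N, H, W] integer labels.
    Void pixels (label -1) are clamped to class 0 here for simplicity;
    the real computeLoss may handle them differently."""
    num_classes = logits.shape[1]
    probs = F.softmax(logits, dim=1)
    onehot = F.one_hot(targets.clamp(min=0), num_classes).permute(0, 3, 1, 2).float()
    dims = (0, 2, 3)
    tp = (probs * onehot).sum(dims)          # soft true positives per class
    fn = ((1.0 - probs) * onehot).sum(dims)  # soft false negatives
    fp = (probs * (1.0 - onehot)).sum(dims)  # soft false positives
    tversky = tp / (tp + alpha * fn + (1.0 - alpha) * fp + eps)  # beta = 1 - alpha, as above
    return torch.pow(1.0 - tversky, gamma).mean()                # focal exponent gamma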
class CustomModel():
    def __init__(self, cfg, writer, logger):
        # super(CustomModel, self).__init__()
        self.cfg = cfg
        self.writer = writer
        self.class_numbers = 19
        self.logger = logger
        cfg_model = cfg['model']
        self.cfg_model = cfg_model
        self.best_iou = -100
        self.iter = 0
        self.nets = []
        self.split_gpu = 0
        self.default_gpu = cfg['model']['default_gpu']
        self.PredNet_Dir = None
        self.valid_classes = cfg['training']['valid_classes']
        self.G_train = True
        self.objective_vectors = np.zeros([19, 256])
        self.objective_vectors_num = np.zeros([19])
        self.objective_vectors_dis = np.zeros([19, 19])
        self.class_threshold = np.zeros(self.class_numbers)
        self.class_threshold = np.full([19], 0.95)
        self.metrics = CustomMetrics(self.class_numbers)
        self.cls_feature_weight = cfg['training']['cls_feature_weight']

        bn = cfg_model['bn']
        if bn == 'sync_bn':
            BatchNorm = SynchronizedBatchNorm2d
        # elif bn == 'sync_abn':
        #     BatchNorm = InPlaceABNSync
        elif bn == 'bn':
            BatchNorm = nn.BatchNorm2d
        # elif bn == 'abn':
        #     BatchNorm = InPlaceABN
        elif bn == 'gn':
            BatchNorm = nn.GroupNorm
        else:
            raise NotImplementedError('batch norm choice {} is not implemented'.format(bn))

        self.PredNet = DeepLab(
            num_classes=19,
            backbone=cfg_model['basenet']['version'],
            output_stride=16,
            bn=cfg_model['bn'],
            freeze_bn=True,
        ).cuda()
        self.load_PredNet(cfg, writer, logger, dir=None, net=self.PredNet)
        self.PredNet_DP = self.init_device(self.PredNet, gpu_id=self.default_gpu, whether_DP=True)
        self.PredNet.eval()
        self.PredNet_num = 0

        self.BaseNet = DeepLab(
            num_classes=19,
            backbone=cfg_model['basenet']['version'],
            output_stride=16,
            bn=cfg_model['bn'],
            freeze_bn=False,
        )
        logger.info('the backbone is {}'.format(cfg_model['basenet']['version']))

        self.BaseNet_DP = self.init_device(self.BaseNet, gpu_id=self.default_gpu, whether_DP=True)
        self.nets.extend([self.BaseNet])
        self.nets_DP = [self.BaseNet_DP]

        self.optimizers = []
        self.schedulers = []
        # optimizer_cls = get_optimizer(cfg)
        optimizer_cls = torch.optim.SGD
        optimizer_params = {k: v for k, v in cfg['training']['optimizer'].items() if k != 'name'}
        # optimizer_cls_D = torch.optim.SGD
        # optimizer_params_D = {k: v for k, v in cfg['training']['optimizer_D'].items() if k != 'name'}
        self.BaseOpti = optimizer_cls(self.BaseNet.parameters(), **optimizer_params)
        self.optimizers.extend([self.BaseOpti])

        self.BaseSchedule = get_scheduler(self.BaseOpti, cfg['training']['lr_schedule'])
        self.schedulers.extend([self.BaseSchedule])
        self.setup(cfg, writer, logger)

        self.adv_source_label = 0
        self.adv_target_label = 1
        self.bceloss = nn.BCEWithLogitsLoss(size_average=True)
        self.loss_fn = get_loss_function(cfg)
        self.mseloss = nn.MSELoss()
        self.l1loss = nn.L1Loss()
        self.smoothloss = nn.SmoothL1Loss()
        self.triplet_loss = nn.TripletMarginLoss()

    def create_PredNet(self):
        ss = DeepLab(
            num_classes=19,
            backbone=self.cfg_model['basenet']['version'],
            output_stride=16,
            bn=self.cfg_model['bn'],
            freeze_bn=True,
        ).cuda()
        ss.eval()
        return ss

    def setup(self, cfg, writer, logger):
        '''
        set optimizer and load pretrained model
        '''
        for net in self.nets:
            # name = net.__class__.__name__
            self.init_weights(cfg['model']['init'], logger, net)
            print("Initialization completed")
            if hasattr(net, '_load_pretrained_model') and cfg['model']['pretrained']:
                print("loading pretrained model for {}".format(net.__class__.__name__))
                net._load_pretrained_model()
        '''load pretrained model'''
        if cfg['training']['resume_flag']:
            self.load_nets(cfg, writer, logger)

    def forward(self, input):
        feat, feat_low, feat_cls, output = self.BaseNet_DP(input)
        return feat, feat_low, feat_cls, output

    def forward_Up(self, input):
        feat, feat_low, feat_cls, output = self.forward(input)
        output = F.interpolate(output, size=input.size()[2:], mode='bilinear', align_corners=True)
        return feat, feat_low, feat_cls, output

    def PredNet_Forward(self, input):
        with torch.no_grad():
            _, _, feat_cls, output_result = self.PredNet_DP(input)
        return _, _, feat_cls, output_result

    def calculate_mean_vector(self, feat_cls, outputs, labels):
        outputs_softmax = F.softmax(outputs, dim=1)
        outputs_argmax = outputs_softmax.argmax(dim=1, keepdim=True)
        outputs_argmax = self.process_label(outputs_argmax.float())
        labels_expanded = self.process_label(labels)
        outputs_pred = labels_expanded * outputs_argmax
        scale_factor = F.adaptive_avg_pool2d(outputs_pred, 1)
        vectors = []
        ids = []
        for n in range(feat_cls.size()[0]):
            for t in range(self.class_numbers):
                if scale_factor[n][t].item() == 0:
                    continue
                if (outputs_pred[n][t] > 0).sum() < 10:
                    continue
                s = feat_cls[n] * outputs_pred[n][t]
                scale = torch.sum(outputs_pred[n][t]) / labels.shape[2] / labels.shape[3] * 2
                s = normalisation_pooling()(s, scale)
                s = F.adaptive_avg_pool2d(s, 1) / scale_factor[n][t]
                vectors.append(s)
                ids.append(t)
        return vectors, ids

    def step(self, source_x, source_label, target_x, target_label):
        _, _, source_feat_cls, source_output = self.forward(input=source_x)
        source_outputUp = F.interpolate(source_output, size=source_x.size()[2:], mode='bilinear',
                                        align_corners=True)
        loss_GTA = self.loss_fn(input=source_outputUp, target=source_label)

        self.PredNet.eval()
        with torch.no_grad():
            _, _, feat_cls, output = self.PredNet_Forward(target_x)
            # calculate pseudo-labels
            threshold_arg, cluster_arg = self.metrics.update(feat_cls, output, target_label, self)

        loss_L2_source_cls = torch.Tensor([0]).cuda(self.split_gpu)
        loss_L2_target_cls = torch.Tensor([0]).cuda(self.split_gpu)
        _, _, target_feat_cls, target_output = self.forward(target_x)

        if self.cfg['training']['loss_L2_cls']:
            # distance loss
            _batch, _w, _h = source_label.shape
            source_label_downsampled = source_label.reshape([_batch, 1, _w, _h]).float()
            source_label_downsampled = F.interpolate(source_label_downsampled.float(),
                                                     size=source_feat_cls.size()[2:],
                                                     mode='nearest')  # or F.softmax(input=source_output, dim=1)
            source_vectors, source_ids = self.calculate_mean_vector(source_feat_cls, source_output,
                                                                    source_label_downsampled)
            target_vectors, target_ids = self.calculate_mean_vector(target_feat_cls, target_output,
                                                                    cluster_arg.float())
            loss_L2_source_cls = self.class_vectors_alignment(source_ids, source_vectors)
            loss_L2_target_cls = self.class_vectors_alignment(target_ids, target_vectors)
            # target_vectors, target_ids = self.calculate_mean_vector(target_feat_cls, target_output, threshold_arg.float())
            # loss_L2_target_cls += self.class_vectors_alignment(target_ids, target_vectors)
        loss_L2_cls = self.cls_feature_weight * (loss_L2_source_cls + loss_L2_target_cls)

        loss = torch.Tensor([0]).cuda()
        batch, _, w, h = cluster_arg.shape
        # cluster_arg[cluster_arg != threshold_arg] = 250

        # CAG-based and probability-based PLA
        loss_CTS = (self.loss_fn(input=target_output, target=cluster_arg.reshape([batch, w, h]))
                    + self.loss_fn(input=target_output, target=threshold_arg.reshape([batch, w, h]))) / 2
        # loss_CTS = self.loss_fn(input=target_output, target=cluster_arg.reshape([batch, w, h]))    # CAG-based PLA
        # loss_CTS = self.loss_fn(input=target_output, target=threshold_arg.reshape([batch, w, h]))  # probability-based PLA

        if self.G_train and self.cfg['training']['loss_pseudo_label']:
            loss = loss + loss_CTS
        if self.G_train and self.cfg['training']['loss_source_seg']:
            loss = loss + loss_GTA
        if self.cfg['training']['loss_L2_cls']:
            loss = loss + torch.sum(loss_L2_cls)

        if loss.item() != 0:
            loss.backward()
        self.BaseOpti.step()
        self.BaseOpti.zero_grad()

        return loss, loss_L2_cls.item(), loss_CTS.item()

    def process_label(self, label):
        batch, channel, w, h = label.size()
        pred1 = torch.zeros(batch, 20, w, h).cuda()
        id = torch.where(label < 19, label, torch.Tensor([19]).cuda())
        pred1 = pred1.scatter_(1, id.long(), 1)
        return pred1

    def class_vectors_alignment(self, ids, vectors):
        loss = torch.Tensor([0]).cuda(self.default_gpu)
        for i in range(len(ids)):
            if ids[i] not in self.valid_classes:
                continue
            new_loss = self.smoothloss(vectors[i].squeeze().cuda(self.default_gpu),
                                       torch.Tensor(self.objective_vectors[ids[i]]).cuda(self.default_gpu))
            while new_loss.item() > 5:
                new_loss = new_loss / 10
            loss = loss + new_loss
        loss = loss / len(ids) * 10
        return loss

    def freeze_bn_apply(self):
        for net in self.nets:
            net.apply(freeze_bn)
        for net in self.nets_DP:
            net.apply(freeze_bn)

    def scheduler_step(self):
        for scheduler in self.schedulers:
            scheduler.step()

    def optimizer_zerograd(self):
        for optimizer in self.optimizers:
            optimizer.zero_grad()

    def optimizer_step(self):
        for opt in self.optimizers:
            opt.step()

    def init_device(self, net, gpu_id=None, whether_DP=False):
        gpu_id = gpu_id or self.default_gpu
        device = torch.device("cuda:{}".format(gpu_id) if torch.cuda.is_available() else 'cpu')
        net = net.to(device)
        if whether_DP:
            net = DataParallelWithCallback(net, device_ids=range(torch.cuda.device_count()))
        return net

    def eval(self, net=None, logger=None):
        """Set specific models to eval mode during test time."""
        if net is None:
            for net in self.nets:
                net.eval()
            for net in self.nets_DP:
                net.eval()
            if logger is not None:
                logger.info("Successfully set the model eval mode")
        else:
            net.eval()
            if logger is not None:
                logger.info("Successfully set {} eval mode".format(net.__class__.__name__))
        return

    def train(self, net=None, logger=None):
        if net is None:
            for net in self.nets:
                net.train()
            for net in self.nets_DP:
                net.train()
            # if logger is not None:
            #     logger.info("Successfully set the model train mode")
        else:
            net.train()
            # if logger is not None:
            #     logger.info("Successfully set {} train mode".format(net.__class__.__name__))
        return

    def set_requires_grad(self, logger, net, requires_grad=False):
        """Set requires_grad=False for all the networks to avoid unnecessary computations.
        Parameters:
            net (BaseModel)      -- the network which will be operated on
            requires_grad (bool) -- whether the network requires gradients or not
        """
        for parameter in net.parameters():
            parameter.requires_grad = requires_grad

    def set_requires_grad_layer(self, logger, net, layer_type='batchnorm', requires_grad=False):
        '''
        set whether a specific type of layer needs gradients
        '''
        for net in self.nets:
            for _i in net.modules():
                if _i.__class__.__name__.lower().find(layer_type.lower()) != -1:
                    _i.weight.requires_grad = requires_grad
        return

    def init_weights(self, cfg, logger, net, init_type='normal', init_gain=0.02):
        """Initialize network weights.
        Parameters:
            net (network)     -- network to be initialized
            init_type (str)   -- the name of an initialization method: normal | xavier | kaiming | orthogonal
            init_gain (float) -- scaling factor for normal, xavier and orthogonal.
        We use 'normal' in the original pix2pix and CycleGAN paper. But xavier and kaiming might
        work better for some applications. Feel free to try yourself.
        """
        init_type = cfg.get('init_type', init_type)
        init_gain = cfg.get('init_gain', init_gain)

        def init_func(m):  # define the initialization function
            classname = m.__class__.__name__
            if hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1):
                if init_type == 'normal':
                    nn.init.normal_(m.weight.data, 0.0, init_gain)
                elif init_type == 'xavier':
                    nn.init.xavier_normal_(m.weight.data, gain=init_gain)
                elif init_type == 'kaiming':
                    nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
                elif init_type == 'orthogonal':
                    nn.init.orthogonal_(m.weight.data, gain=init_gain)
                else:
                    raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
                if hasattr(m, 'bias') and m.bias is not None:
                    nn.init.constant_(m.bias.data, 0.0)
            elif isinstance(m, SynchronizedBatchNorm2d) or classname.find('BatchNorm2d') != -1 \
                    or isinstance(m, nn.GroupNorm):
                # or isinstance(m, InPlaceABN) or isinstance(m, InPlaceABNSync):
                # BatchNorm layers' weight is not a matrix; only normal distribution applies.
                m.weight.data.fill_(1)
                m.bias.data.zero_()

        print('initialize {} with {}'.format(init_type, net.__class__.__name__))
        logger.info('initialize {} with {}'.format(init_type, net.__class__.__name__))
        net.apply(init_func)  # apply the initialization function <init_func>

    def adaptive_load_nets(self, net, model_weight):
        model_dict = net.state_dict()
        pretrained_dict = {k: v for k, v in model_weight.items() if k in model_dict}
        model_dict.update(pretrained_dict)
        net.load_state_dict(model_dict)

    def load_nets(self, cfg, writer, logger):  # load pretrained weights on the net
        if os.path.isfile(cfg['training']['resume']):
            logger.info("Loading model and optimizer from checkpoint '{}'".format(cfg['training']['resume']))
            checkpoint = torch.load(cfg['training']['resume'])
            _k = -1
            for net in self.nets:
                name = net.__class__.__name__
                _k += 1
                if checkpoint.get(name) is None:
                    continue
                if name.find('FCDiscriminator') != -1 and cfg['training']['gan_resume'] == False:
                    continue
                self.adaptive_load_nets(net, checkpoint[name]["model_state"])
                if cfg['training']['optimizer_resume']:
                    self.adaptive_load_nets(self.optimizers[_k], checkpoint[name]["optimizer_state"])
                    self.adaptive_load_nets(self.schedulers[_k], checkpoint[name]["scheduler_state"])
            self.iter = checkpoint["iter"]
            self.best_iou = checkpoint['best_iou']
            logger.info("Loaded checkpoint '{}' (iter {})".format(cfg['training']['resume'],
                                                                  checkpoint["iter"]))
        else:
            raise Exception("No checkpoint found at '{}'".format(cfg['training']['resume']))

    def load_PredNet(self, cfg, writer, logger, dir=None, net=None):  # load pretrained weights on the net
        dir = dir or cfg['training']['Pred_resume']
        best_iou = 0
        if os.path.isfile(dir):
            logger.info("Loading model and optimizer from checkpoint '{}'".format(dir))
            checkpoint = torch.load(dir)
            name = net.__class__.__name__
            if checkpoint.get(name) is None:
                return
            if name.find('FCDiscriminator') != -1 and cfg['training']['gan_resume'] == False:
                return
            self.adaptive_load_nets(net, checkpoint[name]["model_state"])
            iter = checkpoint["iter"]
            best_iou = checkpoint['best_iou']
            logger.info("Loaded checkpoint '{}' (iter {}) (best iou {}) for PredNet".format(
                dir, checkpoint["iter"], best_iou))
        else:
            raise Exception("No checkpoint found at '{}'".format(dir))
        if hasattr(net, 'best_iou'):
            net.best_iou = best_iou
        return best_iou

    def set_optimizer(self, optimizer):  # set optimizer to all nets
        pass

    def reset_objective_SingleVector(self):
        self.objective_vectors = np.zeros([19, 256])
        self.objective_vectors_num = np.zeros([19])
        self.objective_vectors_dis = np.zeros([19, 19])

    def update_objective_SingleVector(self, id, vector, name='moving_average'):
        if isinstance(vector, torch.Tensor):
            vector = vector.squeeze().detach().cpu().numpy()
        if np.sum(vector) == 0:
            return
        if self.objective_vectors_num[id] < 100:
            name = 'mean'
        if name == 'moving_average':
            self.objective_vectors[id] = self.objective_vectors[id] * 0.9999 + 0.0001 * vector.squeeze()
            self.objective_vectors_num[id] += 1
            self.objective_vectors_num[id] = min(self.objective_vectors_num[id], 3000)
        elif name == 'mean':
            self.objective_vectors[id] = self.objective_vectors[id] * self.objective_vectors_num[id] + vector.squeeze()
            self.objective_vectors_num[id] += 1
            self.objective_vectors[id] = self.objective_vectors[id] / self.objective_vectors_num[id]
            self.objective_vectors_num[id] = min(self.objective_vectors_num[id], 3000)
        else:
            raise NotImplementedError('no such updating way of objective vectors {}'.format(name))
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
# valid_loader = torch.utils.data.DataLoader(validate_dataset, batch_size=1, shuffle=True, num_workers=0)
print("trainset length :: ", len(train_dataset))

m = nn.Upsample(scale_factor=0.0625)

# loss
criterion = nn.CrossEntropyLoss()
# criterion2 = FocalLoss(a, b, gamma=0, alpha=None)
seg_criterion = nn.NLLLoss2d(weight=None)
cls_criterion = nn.BCEWithLogitsLoss(weight=None)

# optimizer setting
optimizer = optim.RMSprop(model.parameters(), lr=startlr, weight_decay=5e-4, momentum=0.9)
scheduler = optim.lr_scheduler.CyclicLR(optimizer, base_lr=startlr, max_lr=startlr * 3,
                                        step_size_up=2000, mode='triangular2', gamma=0.9994,
                                        cycle_momentum=False)
opt = SWA(optimizer, swa_start=10, swa_freq=5, swa_lr=0.05)

global_iter = 0
for epoch in range(num_epochs):
    losses = list()
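# --- Aside: assuming SWA above is torchcontrib.optim.SWA, the averaged weights are not applied
# automatically; after the epoch loop one typically swaps them in and refreshes BatchNorm
# statistics. A sketch of the usual epilogue (not shown in this snippet):
def finalize_swa(opt, model, train_loader):
    """Apply SWA weights at the end of training (torchcontrib API)."""
    opt.swap_swa_sgd()                  # copy the SWA running average into the model parameters
    opt.bn_update(train_loader, model)  # recompute BatchNorm running stats under averaged weights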
def main():
    # define and parse arguments
    parser = argparse.ArgumentParser()

    # general
    parser.add_argument('--experiment_name', type=str, default="experiment",
                        help="experiment name. will be used in the path names for log- and savefiles")
    parser.add_argument('--seed', type=int, default=None,
                        help='fixes random seed and sets model to the potentially faster cuDNN '
                             'deterministic mode (default: non-deterministic mode)')
    parser.add_argument('--val_freq', type=int, default=1000,
                        help='validation will be run every val_freq batches/optimization steps '
                             'during training')
    parser.add_argument('--save_freq', type=int, default=1000,
                        help='training state will be saved every save_freq batches/optimization '
                             'steps during training')
    parser.add_argument('--log_freq', type=int, default=100,
                        help='tensorboard logs will be written every log_freq number of '
                             'batches/optimization steps')

    # input/output
    parser.add_argument('--use_s2hr', action='store_true', default=False,
                        help='use sentinel-2 high-resolution (10 m) bands')
    parser.add_argument('--use_s2mr', action='store_true', default=False,
                        help='use sentinel-2 medium-resolution (20 m) bands')
    parser.add_argument('--use_s2lr', action='store_true', default=False,
                        help='use sentinel-2 low-resolution (60 m) bands')
    parser.add_argument('--use_s1', action='store_true', default=False,
                        help='use sentinel-1 data')
    parser.add_argument('--no_savanna', action='store_true', default=False,
                        help='ignore class savanna')

    # training hyperparameters
    parser.add_argument('--lr', type=float, default=0.01,
                        help='learning rate (default: 1e-2)')
    parser.add_argument('--momentum', type=float, default=0.9,
                        help='momentum (default: 0.9), only used for deeplab')
    parser.add_argument('--weight_decay', type=float, default=5e-4,
                        help='weight-decay (default: 5e-4)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size for training and validation (default: 32)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers for dataloading (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=100,
                        help='number of training epochs (default: 100)')

    # network
    parser.add_argument('--model', type=str, choices=['deeplab', 'unet'], default='deeplab',
                        help="network architecture (default: deeplab)")

    # deeplab-specific
    parser.add_argument('--pretrained_backbone', action='store_true', default=False,
                        help='initialize ResNet-101 backbone with ImageNet pre-trained weights')
    parser.add_argument('--out_stride', type=int, choices=[8, 16], default=16,
                        help='network output stride (default: 16)')

    # data
    parser.add_argument('--data_dir_train', type=str, default=None,
                        help='path to training dataset')
    parser.add_argument('--dataset_val', type=str, default="sen12ms_holdout",
                        choices=['sen12ms_holdout', 'dfc2020_val', 'dfc2020_test'],
                        help='dataset to use for validation (default: sen12ms_holdout)')
    parser.add_argument('--data_dir_val', type=str, default=None,
                        help='path to validation dataset')
    parser.add_argument('--log_dir', type=str, default=None,
                        help='path to dir for tensorboard logs (default runs/CURRENT_DATETIME_HOSTNAME)')

    args = parser.parse_args()

    print("=" * 20, "CONFIG", "=" * 20)
    for arg in vars(args):
        print('{0:20} {1}'.format(arg, getattr(args, arg)))
    print()

    # fix seeds and set pytorch to deterministic mode
    if args.seed is not None:
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # set flags for GPU processing if available
    if torch.cuda.is_available():
        args.use_gpu = True
        if torch.cuda.device_count() > 1:
            raise NotImplementedError("multi-gpu training not implemented! "
                                      + "try to run script as: "
                                      + "CUDA_VISIBLE_DEVICES=0 train.py")
    else:
        args.use_gpu = False

    # load datasets
    train_set = SEN12MS(args.data_dir_train, subset="train", no_savanna=args.no_savanna,
                        use_s2hr=args.use_s2hr, use_s2mr=args.use_s2mr, use_s2lr=args.use_s2lr,
                        use_s1=args.use_s1)
    n_classes = train_set.n_classes
    n_inputs = train_set.n_inputs
    if args.dataset_val == "sen12ms_holdout":
        val_set = SEN12MS(args.data_dir_train, subset="holdout", no_savanna=args.no_savanna,
                          use_s2hr=args.use_s2hr, use_s2mr=args.use_s2mr, use_s2lr=args.use_s2lr,
                          use_s1=args.use_s1)
    else:
        dfc2020_subset = args.dataset_val.split("_")[-1]
        val_set = DFC2020(args.data_dir_val, subset=dfc2020_subset, no_savanna=args.no_savanna,
                          use_s2hr=args.use_s2hr, use_s2mr=args.use_s2mr, use_s2lr=args.use_s2lr,
                          use_s1=args.use_s1)

    # set up dataloaders
    train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True,
                              num_workers=args.workers, pin_memory=True, drop_last=False)
    val_loader = DataLoader(val_set, batch_size=args.batch_size, shuffle=False,
                            num_workers=args.workers, pin_memory=True, drop_last=False)

    # set up network
    if args.model == "deeplab":
        model = DeepLab(num_classes=n_classes,
                        backbone='resnet',
                        pretrained_backbone=args.pretrained_backbone,
                        output_stride=args.out_stride,
                        sync_bn=False,
                        freeze_bn=False,
                        n_in=n_inputs)
    else:
        model = UNet(n_classes=n_classes, n_channels=n_inputs)
    if args.use_gpu:
        model = model.cuda()

    # define loss function
    loss_fn = nn.CrossEntropyLoss(ignore_index=255, reduction='mean')

    # set up optimizer
    if args.model == "deeplab":
        train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
                        {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]
        optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.RMSprop(model.parameters(), lr=args.lr,
                                        weight_decay=args.weight_decay)

    # set up tensorboard logging
    if args.log_dir is None:
        args.log_dir = "logs"
    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.experiment_name))

    # create checkpoint dir
    args.checkpoint_dir = os.path.join(args.log_dir, args.experiment_name, "checkpoints")
    os.makedirs(args.checkpoint_dir, exist_ok=True)

    # save config
    pkl.dump(args, open(os.path.join(args.checkpoint_dir, "args.pkl"), "wb"))

    # train network
    step = 0
    trainer = ModelTrainer(args)
    for epoch in range(args.max_epochs):
        print("=" * 20, "EPOCH", epoch + 1, "/", str(args.max_epochs), "=" * 20)

        # run training for one epoch
        model, step = trainer.train(model, train_loader, val_loader, loss_fn,
                                    optimizer, writer, step=step)

    # export final set of weights
    trainer.export_model(model, args.checkpoint_dir, name="final")
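# --- Aside: the two optimizer parameter groups above give DeepLab's pretrained backbone the base
# learning rate and the freshly initialized ASPP/decoder head 10x that rate. A minimal sketch of
# the same pattern on a generic model (hypothetical names, for illustration only):
import torch
import torch.nn as nn

def two_speed_optimizer_sketch(base_lr=0.01):
    model_sketch = nn.ModuleDict({'backbone': nn.Linear(8, 8), 'head': nn.Linear(8, 2)})
    optimizer_sketch = torch.optim.SGD(
        [{'params': model_sketch['backbone'].parameters(), 'lr': base_lr},    # pretrained: small steps
         {'params': model_sketch['head'].parameters(), 'lr': base_lr * 10}],  # fresh head: larger steps
        momentum=0.9, weight_decay=5e-4)
    for i, group in enumerate(optimizer_sketch.param_groups):
        print('param group {}: lr = {}'.format(i, group['lr']))
    return optimizer_sketch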