def train(net, lr, trainloader, epoch):
    """
    Train SSD

    @args
        net: (nn.Module) network
        lr: (float) learning rate
        trainloader: (DataLoader) dataloader
        epoch: (int) training epoch
    """
    net.train()
    # use the lr passed in as an argument (it was previously shadowed by args.lr)
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)
    criterion = MultiBoxLoss(num_classes=config[args.dataset]['num_classes'] + 1)
    progress_bar = ProgressBar(total=len(trainloader))
    train_loss = 0
    torch.set_printoptions(threshold=10000)
    for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader):
        images = Variable(images.cuda())
        loc_targets = Variable(loc_targets.cuda())
        conf_targets = Variable(conf_targets.cuda())

        optimizer.zero_grad()
        loc_preds, conf_preds = net(images)
        loc_loss, conf_loss, loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets)
        loss.backward()
        optimizer.step()

        step = batch_idx + epoch * len(trainloader)
        writer.add_scalar('train/loss_loc', loc_loss, step)
        writer.add_scalar('train/loss_conf', conf_loss, step)
        writer.add_scalar('train/loss_total', loss, step)

        train_loss += loss.item()
        progress_bar.move(leftmsg="training epoch " + str(epoch),
                          rightmsg="loss: %.6f" % (train_loss / (batch_idx + 1)))
def __init__(self, net, checkpoint, cfg):
    super().__init__("TrackerDefault")
    self.cfg = cfg
    self.net = net
    if checkpoint is not None:
        utils.load_checkpoint(checkpoint, self.net)
    self.net.eval()
    self.anchors = utils.generate_anchors(cfg)
    if torch.cuda.is_available():
        self.net.cuda()
        self.anchors = self.anchors.cuda()

    self.z_transform = Compose([
        ToAbsoluteCoords(),
        Crop(context_amount=cfg.TRAIN.CROP_CONTEXT_AMOUNT_Z, make_square=False),
        ToPercentCoords(),
        Resize(cfg.MODEL.Z_SIZE),
    ])
    self.x_crop = Crop(context_amount=cfg.TRAIN.CROP_CONTEXT_AMOUNT_X,
                       return_rect=True, make_square=True)
    self.x_resize = Resize(size=cfg.MODEL.X_SIZE)
    self.z_crop = Crop(context_amount=cfg.TRAIN.CROP_CONTEXT_AMOUNT_Z,
                       return_rect=True, make_square=False)
    self.z_resize = Resize(size=cfg.MODEL.Z_SIZE)

    self.criterion = MultiBoxLoss(self.anchors, self.cfg)
def train():
    print('start training ...........')
    batch_size = 32
    num_epochs = 600
    lr = 0.001
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = SSD_VGG(num_classes=21, device=device, freeze=False)
    # model.load_state_dict(torch.load('output/weight.pth', map_location=device))

    train_loader, val_loader = get_loader(batch_size=batch_size)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9,
                          nesterov=True, weight_decay=0.0005)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=0.1, patience=50)
    criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy, device=device)

    train_losses, val_losses = [], []
    for epoch in range(num_epochs):
        train_epoch_loss = fit(epoch, model, optimizer, criterion, device,
                               train_loader, phase='training')
        val_epoch_loss = fit(epoch, model, optimizer, criterion, device,
                             val_loader, phase='validation')
        print('-----------------------------------------')

        if epoch == 0 or val_epoch_loss <= np.min(val_losses):
            torch.save(model.state_dict(), 'output/weight.pth')
        # if epoch == 0 or train_epoch_loss <= np.min(train_losses):
        #     torch.save(model.state_dict(), 'output/weight.pth')

        train_losses.append(train_epoch_loss)
        val_losses.append(val_epoch_loss)
        write_figure('output', train_losses, val_losses)
        write_log('output', epoch, train_epoch_loss, val_epoch_loss)
        scheduler.step(val_epoch_loss)
def test(net, testloader, epoch):
    net.eval()
    criterion = MultiBoxLoss(num_classes=config[args.dataset]['num_classes'] + 1)
    progress_bar = ProgressBar(total=len(testloader))
    test_loss = 0
    for batch_idx, (images, loc_targets, conf_targets) in enumerate(testloader):
        images = Variable(images.cuda())
        loc_targets = Variable(loc_targets.cuda())
        conf_targets = Variable(conf_targets.cuda())

        with torch.no_grad():
            loc_preds, conf_preds = net(images)
            _, _, loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets)

        test_loss += loss.item()
        progress_bar.move(leftmsg="test epoch " + str(epoch),
                          rightmsg="loss: %.6f" % (test_loss / (batch_idx + 1)))

    # note: this logs the loss of the final batch for this epoch
    writer.add_scalar('test/loss', loss, epoch)
    return test_loss / len(testloader)  # average
def __init__(self, net_path=None, **kargs):
    super(TrackerSiamRPN, self).__init__(name='SiamRPN', is_deterministic=True)

    '''setup GPU device if available'''
    self.cuda = torch.cuda.is_available()
    self.device = torch.device('cuda:0' if self.cuda else 'cpu')

    '''setup model'''
    self.net = SiamRPN()
    if self.cuda:
        self.net = self.net.cuda()
    if net_path is not None:
        self.net.load_state_dict(
            torch.load(net_path, map_location=lambda storage, loc: storage))
    # self.net = self.net.to(self.device)

    '''setup optimizer'''
    self.criterion = MultiBoxLoss()
    self.optimizer = torch.optim.SGD(self.net.parameters(),
                                     lr=config.lr,
                                     momentum=config.momentum,
                                     weight_decay=config.weight_decay)
def main(args):
    # Model parameters
    # Not too many here since the SSD300 has a very specific structure
    with open(args.config_file_path, "r") as fp:
        config = json.load(fp)

    n_classes = len(label_map)  # number of different types of objects
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Mobilenetv2
    # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
    #                                  std=[0.229, 0.224, 0.225])

    # Learning parameters
    checkpoint = None  # path to model checkpoint, None if none
    batch_size = config['batch_size']  # batch size
    start_epoch = 0  # start at this epoch
    epochs = config['n_epochs']  # number of epochs to run without early-stopping
    epochs_since_improvement = 0  # number of epochs since there was an improvement in the validation metric
    best_loss = 100.  # assume a high loss at first
    workers = 2  # number of workers for loading data in the DataLoader
    lr = config['lr']  # learning rate
    momentum = 0.9  # momentum
    weight_decay = config['weight_decay']  # weight decay
    grad_clip = None  # clip if gradients are exploding
    backbone_network = config['backbone_network']

    model = SSD(num_classes=n_classes, backbone_network=backbone_network)

    # Initialize the optimizer, with twice the default learning rate for biases, as in the original Caffe repo
    biases = list()
    not_biases = list()
    param_names_biases = list()
    param_names_not_biases = list()
    for param_name, param in model.named_parameters():
        if param.requires_grad:
            if param_name.endswith('.bias'):
                biases.append(param)
                param_names_biases.append(param_name)
            else:
                not_biases.append(param)
                param_names_not_biases.append(param_name)
    optimizer = torch.optim.SGD(params=[{'params': biases, 'lr': 2 * lr},
                                        {'params': not_biases}],
                                lr=lr, momentum=momentum, weight_decay=weight_decay)

    model = model.to(device)
    criterion = MultiBoxLoss(priors_cxcy=model.priors).to(device)

    # voc07_path = 'VOCdevkit/VOC2007'
    voc07_path = config['voc07_path']
    # voc12_path = 'VOCdevkit/VOC2012'
    # voc12_path = config['voc12_path']

    # from utils import create_data_lists
    create_data_lists(voc07_path, output_folder=config['data_folder'])

    # data_folder = 'VOC/VOCdevkit/'
    data_folder = config['data_folder']
    train_dataset = PascalVOCDataset(data_folder, split='train',
                                     keep_difficult=keep_difficult)
    val_dataset = PascalVOCDataset(data_folder, split='test',
                                   keep_difficult=keep_difficult)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        collate_fn=train_dataset.collate_fn, num_workers=workers,
        pin_memory=True)  # note that we're passing the collate function here
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=True,
        collate_fn=val_dataset.collate_fn, num_workers=workers,
        pin_memory=True)

    print(start_epoch)
    for epoch in range(start_epoch, epochs):
        # Paper describes decaying the learning rate at the 80000th, 100000th, 120000th 'iteration', i.e. model update or batch
        # The paper uses a batch size of 32, which means there were about 517 iterations in an epoch
        # Therefore, to find the epochs to decay at, you could do,
        # if epoch in {80000 // 517, 100000 // 517, 120000 // 517}:
        #     adjust_learning_rate(optimizer, 0.1)
        # In practice, I just decayed the learning rate when loss stopped improving for long periods,
        # and I would resume from the last best checkpoint with the new learning rate,
        # since there's no point in resuming at the most recent and significantly worse checkpoint.
        # So, when you're ready to decay the learning rate, just set checkpoint = 'BEST_checkpoint_ssd300.pth.tar' above
        # and have adjust_learning_rate(optimizer, 0.1) BEFORE this 'for' loop

        # One epoch's training
        train(train_loader=train_loader, model=model, criterion=criterion,
              optimizer=optimizer, epoch=epoch, grad_clip=grad_clip)

        # One epoch's validation
        val_loss = validate(val_loader=val_loader, model=model, criterion=criterion)

        # Did validation loss improve?
        is_best = val_loss < best_loss
        best_loss = min(val_loss, best_loss)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, optimizer,
                        val_loss, best_loss, is_best)
def train(): """ Introduction ------------ 训练Retinanet模型 """ train_transform = Augmentation(size=config.image_size) # train_dataset = COCODataset(config.coco_train_dir, config.coco_train_annaFile, config.coco_label_file, training = True, transform = train_transform) from VOCDataset import build_vocDataset train_dataset = build_vocDataset(config.voc_root) train_dataloader = DataLoader(train_dataset, batch_size=config.train_batch, shuffle=True, num_workers=2, collate_fn=train_dataset.collate_fn) print("training on {} samples".format(train_dataset.__len__())) net = RetinaNet(config.num_classes, pre_train_path=config.resnet50_path) net.cuda() optimizer = optim.SGD(net.parameters(), lr=config.learning_rate, momentum=0.9, weight_decay=1e-4) criterion = MultiBoxLoss(alpha=config.focal_alpha, gamma=config.focal_gamma, num_classes=config.num_classes) anchors = Anchor(config.anchor_areas, config.aspect_ratio, config.scale_ratios) anchor_boxes = anchors(input_size=config.image_size) for epoch in range(config.Epochs): batch_time, loc_losses, conf_losses = AverageTracker(), AverageTracker( ), AverageTracker() net.train() net.freeze_bn() end = time.time() for index, (image, gt_boxes, labels) in enumerate(train_dataloader): loc_targets, cls_targets = [], [] image = image.cuda() loc_preds, cls_preds = net(image) batch_num = image.shape[0] for idx in range(batch_num): gt_box = gt_boxes[index] label = labels[index] loc_target, cls_target = encode(anchor_boxes, gt_box, label) loc_targets.append(loc_target) cls_targets.append(cls_target) loc_targets = torch.stack(loc_targets).cuda() cls_targets = torch.stack(cls_targets).cuda() loc_loss, cls_loss = criterion(loc_preds, loc_targets, cls_preds, cls_targets) loss = loc_loss + cls_loss optimizer.zero_grad() loss.backward() optimizer.step() loc_losses.update(loc_loss.item(), image.size(0)) conf_losses.update(cls_loss.item(), image.size(0)) batch_time.update(time.time() - end) end = time.time() if idx % config.print_freq == 0: print( 'Epoch: {}/{} Batch: {}/{} loc Loss: {:.4f} {:.4f} conf loss: {:.4f} {:.4f} Time: {:.4f} {:.4f}' .format(epoch, config.Epochs, idx, len(train_dataloader), loc_losses.val, loc_losses.avg, conf_losses.val, conf_losses.avg, batch_time.val, batch_time.avg)) if epoch % config.save_freq == 0: print('save model') torch.save( net.state_dict(), config.model_dir + 'train_model_epoch{}.pth'.format(epoch + 1))
def train(args):
    create_time = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))
    save_folder_path = os.path.join(args.save_folder, create_time)
    # n_classes = [20, 80][args.dataset == 'COCO']
    # n_classes = 91

    if not ((args.train_image_folder and args.val_image_folder) or args.annotation):
        print("train/val image folder and annotation should not be None")
        return

    train_dataset = COCODetection(
        root=args.root,
        image_set=args.train_image_folder,
        annotation_json=args.annotation,
        transform=SSDAugmentation(img_size=args.image_size),
        # transform=BaseTransform(img_size=args.image_size),
        target_transform=COCOAnnotationTransform())
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  collate_fn=detection_collate)
    val_dataset = COCODetection(
        root=args.root,
        image_set=args.val_image_folder,
        annotation_json=args.annotation,
        transform=BaseTransform(img_size=args.image_size),
        target_transform=COCOAnnotationTransform())

    n_classes = train_dataset.get_class_size() + 1
    if args.class_map_path:
        train_dataset.get_class_map(args.class_map_path)

    if args.model == "mobilenetv2":
        model = MobileNetv2(
            n_classes=n_classes,
            width_mult=args.width_mult,
            round_nearest=8,
            dropout_ratio=args.dropout_ratio,
            use_batch_norm=True,
        )
        ssd = create_mobilenetv2_ssd_lite(model, n_classes,
                                          width_mult=args.width_mult,
                                          use_batch_norm=True)
    elif args.model == "mobilenetv3":
        model = MobileNetv3(model_mode=args.model_mode,
                            n_classes=n_classes,
                            width_mult=args.width_mult,
                            dropout_ratio=args.dropout_ratio)
        ssd = create_mobilenetv3_ssd_lite(model, n_classes,
                                          model_mode=args.model_mode)
    else:
        print("model structure only accepts mobilenetv2 or mobilenetv3")
        return
    print("built ssd module")

    if GPU:
        import torch.backends.cudnn as cudnn
        model.cuda()
        ssd.cuda()
        cudnn.benchmark = True

    if args.pretrain_model:
        ssd.load_state_dict(
            torch.load(args.pretrain_model, map_location=torch.device('cpu')))
    elif args.pretrain_tfmodel and args.pretrain_tfmodel_weight_list:
        ssd_state_dict = ssd.state_dict()
        tf_weights_dict = load_tf_weights(args, ssd_state_dict)
        ssd.load_state_dict(tf_weights_dict)

    optimizer = optim.Adam(ssd.parameters(), lr=args.learning_rate,
                           weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(n_classes,
                             overlap_thresh=args.overlap_threshold,
                             prior_for_matching=True,
                             bkg_label=0,
                             neg_mining=True,
                             neg_pos=args.neg_pos_ratio,
                             neg_overlap=0.5,
                             encode_target=False)

    with torch.no_grad():
        if args.model == "mobilenetv2":
            prior_box = PriorBox(MOBILEV2_300)
        elif args.model == "mobilenetv3":
            prior_box = PriorBox(MOBILEV3_300)
        priors = Variable(prior_box.forward())
    print("created default bbox")

    n_train = min(train_dataset.__len__(), 5000)
    n_val = min(val_dataset.__len__(), 1000)
    global_step = 0
    val_global_step = 0
    writer = SummaryWriter(log_dir=args.summary_path)

    for epoch in range(args.epochs):
        mean_loss_conf = 0
        mean_loss_loc = 0
        inference_count = 0
        ssd.train()
        with tqdm(total=n_train, desc=f"{epoch + 1} / {args.epochs}", unit='img') as pbar:
            for img, target in train_dataloader:
                if GPU:
                    img = Variable(img.cuda())
                    target = [Variable(anno.cuda()) for anno in target]
                else:
                    img = Variable(img)
                    target = [Variable(anno) for anno in target]

                optimizer.zero_grad()
                inference = ssd(img)
                loss_loc, loss_conf = criterion(inference, priors, target)
                writer.add_scalar('Train/location_loss', float(loss_loc), global_step)
                writer.add_scalar('Train/confidence_loss', float(loss_conf), global_step)
                pbar.set_postfix(**{
                    "location loss": float(loss_loc),
                    "confidence loss": float(loss_conf)
                })
                mean_loss_loc += float(loss_loc)
                mean_loss_conf += float(loss_conf)

                total_loss = loss_loc + loss_conf
                total_loss.backward()
                # # clip gradient
                # # clip_grad_norm_(net.parameters(), 0.1)
                optimizer.step()

                pbar.update(img.shape[0])
                global_step += 1
                inference_count += img.shape[0]
                if inference_count > n_train:
                    break
            pbar.set_postfix(**{
                "location loss": float(mean_loss_loc / n_train),
                "confidence loss": float(mean_loss_conf / n_train)
            })

        ssd.eval()
        val_mean_loss_loc = 0
        val_mean_loss_conf = 0
        with tqdm(total=n_val, desc="Validation", unit="img") as vpbar:
            for i in range(n_val):
                img = val_dataset.get_image(i)
                img = cv2.resize(img, (args.image_size, args.image_size))
                height, width, _ = img.shape
                target = val_dataset.get_annotation(i, width, height)
                if GPU:
                    img = torch.from_numpy(
                        np.expand_dims(img.transpose(2, 0, 1), 0)).to(dtype=torch.float32).cuda()
                    target = torch.FloatTensor(target).unsqueeze(0).cuda()
                else:
                    img = torch.from_numpy(
                        np.expand_dims(img.transpose(2, 0, 1), 0)).to(dtype=torch.float32)
                    target = torch.FloatTensor(target).unsqueeze(0)

                inference = ssd(img)
                loss_loc, loss_conf = criterion(inference, priors, target)
                val_mean_loss_loc += float(loss_loc)
                val_mean_loss_conf += float(loss_conf)
                vpbar.set_postfix(**{
                    'location loss': float(loss_loc),
                    'confidence loss': float(loss_conf)
                })
                vpbar.update(1)
            vpbar.set_postfix(**{
                'location loss': float(val_mean_loss_loc / n_val),
                'confidence loss': float(val_mean_loss_conf / n_val)
            })
        writer.add_scalar('Test/location_loss', float(val_mean_loss_loc / n_val), val_global_step)
        writer.add_scalar('Test/confidence_loss', float(val_mean_loss_conf / n_val), val_global_step)
        val_global_step += 1

        if epoch % 10 == 0 or epoch == args.epochs - 1:
            save_model(save_folder_path, ssd, epoch)
    writer.close()
train_set = loader.VOC_loader(root_dir, transform=transform)

# 4. define data loader
train_loader = torch.utils.data.DataLoader(train_set,
                                           batch_size=32,
                                           collate_fn=train_set.collate_fn,
                                           shuffle=True,
                                           num_workers=0)

# 5. define model
net = SSD().to(device)
net.train()

# 6. define loss
criterion = MultiBoxLoss().to(device)

# 7. define optimizer
optimizer = optim.Adam(net.parameters(), lr=0.001)

total_step = len(train_loader)

# 8. train
for epoch in range(30):
    epoch_time = time.time()
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = [l.to(device) for l in labels]
        # labels = labels.to(device)
            # Filter top k objects that have largest confidence score
            if boxes[i].size(0) > top_k:
                scores[i], sort_ind = scores[i].sort(dim=0, descending=True)
                scores[i] = scores[i][:top_k]  # (top_k)
                boxes[i] = boxes[i][sort_ind[:top_k]]  # (top_k, 4)
                labels[i] = labels[i][sort_ind[:top_k]]  # (top_k)

        return boxes, labels, scores

    def inference(self, images, score_threshold, iou_threshold, top_k):
        '''
        images: tensor size (N, 3, 300, 300), normalized
        '''
        predicted_offsets, predicted_scores = self.forward(images)
        return self.post_process_top_k(predicted_offsets, predicted_scores,
                                       score_threshold, iou_threshold, top_k)


if __name__ == "__main__":
    from loss import MultiBoxLoss

    torch.set_grad_enabled(False)
    MySSD300 = SSD300(n_classes=21, vgg16_dir='models/')
    loss_func = MultiBoxLoss(priors_cxcy=MySSD300.priors_cxcy, threshold=0.5,
                             neg_pos_ratio=3, alpha=1.)
    # loss = loss_func.forward(predicted_offsets, predicted_scores, boxes, labels)
    # print(loss.item())

    # test detect objects
    # boxes, labels, scores = MySSD300.detect_objects(predicted_offsets, predicted_scores,
    #                                                 score_threshold=0.6, iou_threshold=0.5)
    # breakpoint()
def main(opt): """ Training and validation. """ global epochs_since_improvement, start_epoch, label_map, best_loss, epoch, checkpoint, lr_scheduler epochs_since_improvement = opt['epochs_since_improvement'] start_epoch = opt['start_epoch'] best_loss = opt['best_loss'] checkpoint = opt['checkpoint'] lr_scheduler = opt['lr_scheduler'] batch_size = opt['batch_size'] epochs = opt['epochs'] lr = opt['lr'] momentum = opt['momentum'] weight_decay = opt['weight_decay'] grad_clip = opt['grad_clip'] workers = opt['workers'] print_freq = opt['print_freq'] root = opt['root'] # Initialize model or load checkpoint if checkpoint is None: model = DSOD(n_classes=n_classes) # Initialize the optimizer, with twice the default learning rate for biases, as in the original Caffe repo biases = list() not_biases = list() for param_name, param in model.named_parameters(): if param.requires_grad: if param_name.endswith('.bias'): biases.append(param) else: not_biases.append(param) optimizer = torch.optim.SGD(params=[{ 'params': biases, 'lr': 2 * lr }, { 'params': not_biases }], lr=lr, momentum=momentum, weight_decay=weight_decay) else: checkpoint = torch.load(checkpoint) start_epoch = checkpoint['epoch'] + 1 epochs_since_improvement = checkpoint['epochs_since_improvement'] best_loss = checkpoint['best_loss'] print( '\nLoaded checkpoint from epoch %d. Best loss so far is %.3f.\n' % (start_epoch, best_loss)) model = checkpoint['model'] # optimizer = checkpoint['optimizer'] # or # Initialize the optimizer, with twice the default learning rate for biases, as in the original Caffe repo optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay) print('Learning Rate: ', optimizer.param_groups[-1]['lr']) lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=20, verbose=True) # Move to default device model = model.to(device) criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy, use_focalloss=use_focalloss).to(device) # Custom dataloaders train_dataset = mydateset(root='../data', transform=True) val_dataset = mydateset(root='../data', mode='test') train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn, num_workers=workers, pin_memory=True) # note that we're passing the collate function here val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, collate_fn=val_dataset.collate_fn, num_workers=workers, pin_memory=True) # Epochs for epoch in range(start_epoch, epochs): # One epoch's training train(train_loader=train_loader, model=model, criterion=criterion, optimizer=optimizer, epoch=epoch) # One epoch's validation val_loss = validate(val_loader=val_loader, model=model, criterion=criterion) # Did validation loss improve? is_best = val_loss < best_loss best_loss = min(val_loss, best_loss) if lr_scheduler is not None: lr_scheduler.step(best_loss) if not is_best: epochs_since_improvement += 1 print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement, )) else: epochs_since_improvement = 0 # Save checkpoint save_checkpoint(epoch, epochs_since_improvement, model, optimizer, val_loss, best_loss, is_best)
def main():
    parser = get_parse()
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(parser.local_rank)

    # Create the data loaders
    dataset = detection_dataset(parser.data_file, parser.classes_file)
    sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    dataloader = DataLoader(dataset,
                            batch_size=parser.batch_size,
                            num_workers=parser.num_workers,
                            collate_fn=collate_fn,
                            pin_memory=True,
                            sampler=sampler)
    print(f"num_classes:{dataset.num_classes()}, num_data: {len(dataset)}")

    # Create the model
    ssd = SSD(dataset.num_classes() + 1)
    if parser.model is not None and os.path.isfile(parser.model):
        print("Loading model.")
        # ssd.load_state_dict(torch.load(parser.model))
        # strip the "module." prefix from checkpoint keys before loading
        d = collections.OrderedDict()
        checkpoint = torch.load(parser.model)
        for key, value in checkpoint.items():
            tmp = key[7:]
            d[tmp] = value
        ssd.load_state_dict(d)
    else:
        print(f"{parser.model} does not exist")

    config["cuda"] = config["cuda"] and torch.cuda.is_available()
    if config["cuda"]:
        ssd = torch.nn.parallel.DistributedDataParallel(ssd.cuda(),
                                                        device_ids=[parser.local_rank])

    mbox_loss = MultiBoxLoss(config)
    optimizer = optim.Adam(ssd.parameters(), lr=parser.lr,
                           weight_decay=parser.weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1,
                                                     verbose=True, factor=0.5,
                                                     threshold=1e-3)
    ssd.train()
    for epoch_num in range(1, parser.epochs + 1):
        epoch_loss = []
        t = time.time()
        for iter_num, data in enumerate(dataloader, 1):
            optimizer.zero_grad()
            img_tensor, boxes_tensor = data["img"], data["boxes"]
            if config["cuda"]:
                img_tensor = img_tensor.cuda(non_blocking=True)
                boxes_tensor = boxes_tensor.cuda(non_blocking=True)

            predictions = ssd(img_tensor)
            loc_loss, conf_loss = mbox_loss(predictions, boxes_tensor)
            loss = loc_loss * parser.loss_lamda + conf_loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(ssd.parameters(), 0.1)
            optimizer.step()

            reduce_conf_loss = reduce_tensor(conf_loss.data)
            reduce_loc_loss = reduce_tensor(loc_loss.data)
            reduce_loss = reduce_conf_loss + reduce_loc_loss
            if parser.local_rank == 0:
                pre_t = t
                t = time.time()
                text = f"[Epoch: {epoch_num}|{parser.epochs} Iteration: {iter_num}]" \
                       + f" conf_loss: {reduce_conf_loss.item():1.4f} loc_loss: {reduce_loc_loss.item():1.4f}" \
                       + f" loss: {reduce_loss.item():1.4f}" \
                       + f" time:{(t - pre_t) * 1000:.1f}ms"
                print(text)
                if iter_num % parser.batch_num_log == 0:
                    with open(parser.log_path, "a", encoding="utf-8") as f:
                        f.write(text + "\n")
                if iter_num % parser.batch_num_save == 0:
                    save_model(ssd, "{}/batch.pt".format(parser.save_path))

            epoch_loss.append(float(reduce_loss.data))
            if iter_num % 200 == 0:
                torch.cuda.empty_cache()

        if parser.local_rank == 0:
            save_model(ssd, f"{parser.save_path}/epoch.pt")
        scheduler.step(np.mean(epoch_loss))
        print(f"epoch_mean_loss:{np.mean(epoch_loss):.4f}")

    ssd.eval()
    if parser.local_rank == 0:
        save_model(ssd, f"{parser.save_path}/model_final.pt")
def main(
    data_folder,
    keep_difficult,
    lp,  # learning_parameters
    device,
):
    """
    Training.
    """
    label_map, rev_label_map, label_color_map = load_maps(
        os.path.join(data_folder, 'label_map.json'))
    checkpoint_path = os.path.join(
        data_folder, "checkpoint_ssd300.pkl")  # path to model checkpoint, None if none
    n_classes = len(label_map)  # number of different types of objects

    # Initialize model or load checkpoint
    if checkpoint_path is None or not os.path.exists(checkpoint_path):
        start_epoch = 0
        model = SSD300(n_classes=n_classes)
        # Initialize the optimizer, with twice the default learning rate for biases, as in the original Caffe repo
        biases = list()
        not_biases = list()
        for param_name, param in model.named_parameters():
            if param.requires_grad:
                if param_name.endswith('.bias'):
                    biases.append(param)
                else:
                    not_biases.append(param)
        optimizer = torch.optim.SGD(params=[{'params': biases, 'lr': 2 * lp['lr']},
                                            {'params': not_biases}],
                                    lr=lp['lr'],
                                    momentum=lp['momentum'],
                                    weight_decay=lp['weight_decay'])
    else:
        checkpoint = torch.load(checkpoint_path)
        start_epoch = checkpoint['epoch'] + 1
        print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to default device
    model = model.to(device)
    criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy, device=device)

    # Custom dataloaders
    train_dataset = PascalVOCDataset(data_folder, split='train',
                                     keep_difficult=keep_difficult)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=lp['batch_size'], shuffle=True,
        collate_fn=train_dataset.collate_fn, num_workers=lp['workers'],
        pin_memory=True)  # note that we're passing the collate function here

    # Calculate total number of epochs to train and the epochs to decay learning rate at (i.e. convert iterations to epochs)
    # To convert iterations to epochs, divide iterations by the number of iterations per epoch
    # The paper trains for 120,000 iterations with a batch size of 32, decays after 80,000 and 100,000 iterations
    epochs = lp['iterations'] // (len(train_dataset) // 32)
    decay_lr_at = [it // (len(train_dataset) // 32) for it in lp['decay_lr_at']]

    # Epochs
    for epoch in range(start_epoch, epochs):
        # Decay learning rate at particular epochs
        if epoch in decay_lr_at:
            adjust_learning_rate(optimizer, lp['decay_lr_to'])

        # One epoch's training
        train(lp, train_loader=train_loader, model=model, criterion=criterion,
              optimizer=optimizer, epoch=epoch)

        # Save checkpoint
        save_checkpoint(epoch, model, optimizer, checkpoint_path)
torch.manual_seed(opt.seed)
if opt.cuda:
    assert torch.cuda.is_available(), 'No GPU found, please run without --cuda'
    torch.cuda.manual_seed_all(opt.seed)

# model
model = SSD300(VOC.N_CLASSES)
cfg = model.config
if opt.checkpoint:
    model.load_state_dict(torch.load(opt.checkpoint))
else:
    model.init_parameters(opt.backbone)
encoder = MultiBox(cfg)
criterion = MultiBoxLoss()

# cuda
if opt.cuda:
    model.cuda()
    criterion.cuda()
    cudnn.benchmark = True

# optimizer
optimizer = optim.SGD(model.parameters(), lr=opt.lr, momentum=0.9, weight_decay=5e-4)

# learning rate / iterations
init_lr = cfg.get('init_lr', 1e-3)
def main():
    n_classes = len(label_map)
    logger.debug(n_classes)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    checkpoint = configs['checkpoint']
    batch_size = configs['batch_size']
    start_epoch = configs['start_epoch']  # epoch to start from
    epochs = configs['epochs']  # number of epochs for this run
    epochs_since_improvement = configs['epochs_since_improvement']
    best_loss = configs['best_loss']
    num_workers = configs['num_workers']
    lr = configs['lr']
    momentum = configs['momentum']
    weight_decay = configs['weight_decay']
    grad_clip = configs['grad_clip']
    backbone = configs['backbone']
    best_save = configs['best_model']
    save_model = configs['save_model']

    model = SSD(class_num=n_classes, backbone=backbone, device=device)
    # model = SSDLite(class_num=n_classes, backbone=backbone, device=device)
    if checkpoint is not None:
        model = load_pretrained(model, checkpoint)  # load the pretrained model

    data_folder = configs['data_folder']
    val_dataset = Dataset(data_folder, split='test', keep_difficult=keep_difficult)
    train_dataset = Dataset(data_folder, split='train', keep_difficult=keep_difficult)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               collate_fn=train_dataset.collate_fn,
                                               num_workers=num_workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             collate_fn=val_dataset.collate_fn,
                                             num_workers=num_workers,
                                             pin_memory=True)

    biases = list()
    not_biases = list()
    param_names_biases = list()
    param_names_not_biases = list()
    for param_name, param in model.named_parameters():
        if param.requires_grad:
            if param_name.endswith('.bias'):
                biases.append(param)
                param_names_biases.append(param_name)
            else:
                not_biases.append(param)
                param_names_not_biases.append(param_name)
    optimizer = torch.optim.SGD(params=[{'params': biases, 'lr': 2 * lr},
                                        {'params': not_biases}],
                                lr=lr, momentum=momentum, weight_decay=weight_decay)

    model = model.to(device)
    criterion = MultiBoxLoss(priors_cxcy=model.priors).to(device)

    print(start_epoch)
    logger.debug(start_epoch)
    logger.debug(backbone)

    for epoch in range(start_epoch, epochs):
        train(train_loader=train_loader, model=model, criterion=criterion,
              optimizer=optimizer, epoch=epoch, grad_clip=grad_clip)
        val_loss = validate(val_loader=val_loader, model=model, criterion=criterion)

        is_best = val_loss < best_loss
        best_loss = min(val_loss, best_loss)
        if not is_best:
            epochs_since_improvement += 1
            print('\nEpochs since last improvement: %d\n' % (epochs_since_improvement))
            logger.debug('\nEpochs since last improvement: %d\n' % (epochs_since_improvement))
        else:
            epochs_since_improvement = 0
            torch.save(model.state_dict(), best_save)
        torch.save(model.state_dict(), save_model)

    logger.debug("End of training.")
# When resuming training, load the parameters given by args['resume']
if args['resume']:
    print('Resuming training, loading {}...'.format(args['resume']))
    net.load_weights(args['save_folder'] + args['resume'])

sources = list()
loc = list()
conf = list()

if args['cuda']:
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

# Set up the loss function
criterion = MultiBoxLoss(voc_config['num_classes'], 0.5, True, 0, True, 3, 0.5,
                         False, args['cuda'])

# Set up the optimizer
optimizer = optim.SGD(net.parameters(), lr=args['lr'], momentum=args['momentum'],
                      weight_decay=args['weight_decay'])

# Training mode
net.train()

# loss counters
loc_loss = 0
conf_loss = 0
epoch = 0
print('Loading the dataset...')
def __init__(self, net, cfg):
    self.cfg = cfg
    self.net = net
    self.anchors = generate_anchors(cfg)
    if torch.cuda.is_available():
        self.net.cuda()
        self.anchors = self.anchors.cuda()

    # Dataset transform
    transform = [
        Transform(context_amount=cfg.TRAIN.CROP_CONTEXT_AMOUNT_Z,
                  size=cfg.MODEL.Z_SIZE),
        Transform(context_amount=cfg.TRAIN.CROP_CONTEXT_AMOUNT_X,
                  size=cfg.MODEL.X_SIZE,
                  random_translate=True,
                  random_resize=True,
                  motion_blur=True,
                  random_translate_range=cfg.TRAIN.DATA_AUG_TRANSLATE_RANGE,
                  random_resize_scale_min=cfg.TRAIN.DATA_AUG_RESIZE_SCALE_MIN,
                  random_resize_scale_max=cfg.TRAIN.DATA_AUG_RESIZE_SCALE_MAX)
    ]

    # Training dataset
    trackingnet = TrackingNet(cfg.PATH.TRACKINGNET, subset="train",
                              debug_seq=cfg.TRAIN.DEBUG_SEQ)
    imagenet = ImageNetVID(cfg.PATH.ILSVRC, subset="train")
    sampler = PairSampler([trackingnet, imagenet], cfg=cfg, transform=transform,
                          pairs_per_video=cfg.TRAIN.PAIRS_PER_VIDEO,
                          frame_range=cfg.TRAIN.FRAME_RANGE)

    # Distractor dataset
    coco = CocoDetection(cfg.PATH.COCO, cfg.PATH.COCO_ANN_FILE)
    # coco_distractor = COCODistractor(coco, 4000)
    coco_positive = COCOPositivePair(coco, 4000, cfg=cfg, transform=transform)
    coco_negative = COCONegativePair(coco, 12000, cfg=cfg, transform=transform)
    dataset = ConcatDataset([sampler, coco_positive, coco_negative])
    self.dataloader = DataLoader(dataset, batch_size=cfg.TRAIN.BATCH_SIZE,
                                 num_workers=4, shuffle=True, pin_memory=True,
                                 drop_last=True)

    # Validation dataset
    val_trackingnet = TrackingNet(cfg.PATH.TRACKINGNET, subset="val")
    val_imagenet = ImageNetVID(cfg.PATH.ILSVRC, subset="val")
    validation_sampler = PairSampler([val_trackingnet, val_imagenet], cfg=cfg,
                                     transform=transform, pairs_per_video=1,
                                     frame_range=cfg.TRAIN.FRAME_RANGE)
    val_coco_positive = COCOPositivePair(coco, 100, cfg=cfg, transform=transform)
    val_dataset = ConcatDataset([validation_sampler, val_coco_positive])
    if cfg.TRAIN.DEBUG_SEQ >= 0:
        # When debugging on a single sequence, the validation is performed on the same one
        val_dataset = PairSampler([trackingnet], cfg=cfg, transform=transform,
                                  pairs_per_video=200)
    self.validation_dataloader = DataLoader(val_dataset,
                                            batch_size=min(cfg.TRAIN.BATCH_SIZE, 20),
                                            num_workers=4, shuffle=True,
                                            pin_memory=True, drop_last=False)

    # Loss
    self.criterion = MultiBoxLoss(self.anchors, cfg)
    self.optimizer = optim.Adam(self.net.parameters(), lr=cfg.TRAIN.LR,
                                weight_decay=cfg.TRAIN.WEIGHT_DECAY)
    self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                               step_size=cfg.TRAIN.SCHEDULER_STEP_SIZE,
                                               gamma=cfg.TRAIN.SCHEDULER_GAMMA)

    # Summary Writer
    self.run_id = datetime.now().strftime('%b%d_%H-%M-%S')
    if not cfg.DEBUG:
        self.save_config()
        self.save_code()
    self.writer = SummaryWriter(log_dir=os.path.join(cfg.PATH.DATA_DIR, "runs", self.run_id))

    self.start_epoch = 0
    if cfg.TRAIN.RESUME_CHECKPOINT:
        self.start_epoch = utils.load_checkpoint(cfg.TRAIN.RESUME_CHECKPOINT,
                                                 self.net, self.optimizer)
    if torch.cuda.is_available():
        self.net = nn.DataParallel(self.net)

    self.best_IOU = 0.
def train():
    set_seed(seed=10)
    os.makedirs(args.save_root, exist_ok=True)

    # create model, optimizer and criterion
    model = SSD300(n_classes=len(label_map), device=device)
    biases = []
    not_biases = []
    for name, param in model.named_parameters():
        if param.requires_grad:
            if name.endswith('.bias'):
                biases.append(param)
            else:
                not_biases.append(param)
    model = model.to(device)
    optimizer = torch.optim.SGD(params=[{'params': biases, 'lr': 2 * args.lr},
                                        {'params': not_biases}],
                                lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.resume is None:
        start_epoch = 0
    else:
        checkpoint = torch.load(args.resume, map_location=device)
        start_epoch = checkpoint['epoch'] + 1
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
    print(f'Training will start at epoch {start_epoch}.')

    criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy, device=device, alpha=args.alpha)
    criterion = criterion.to(device)

    '''
    scheduler = StepLR(optimizer=optimizer, step_size=20, gamma=0.5,
                       last_epoch=start_epoch - 1, verbose=True)
    '''

    # load data
    transform = Transform(size=(300, 300), train=True)
    train_dataset = VOCDataset(root=args.data_root, image_set=args.image_set,
                               transform=transform, keep_difficult=True)
    train_loader = DataLoader(dataset=train_dataset, collate_fn=collate_fn,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=True, pin_memory=True)

    losses = AverageMeter()
    for epoch in range(start_epoch, args.num_epochs):
        # decay learning rate at particular epochs
        if epoch in [120, 140, 160]:
            adjust_learning_rate(optimizer, 0.1)

        # train model
        model.train()
        losses.reset()
        bar = tqdm(train_loader, desc='Train the model')
        for i, (images, bboxes, labels, _) in enumerate(bar):
            images = images.to(device)
            bboxes = [b.to(device) for b in bboxes]
            labels = [l.to(device) for l in labels]

            predicted_bboxes, predicted_scores = model(images)  # (N, 8732, 4), (N, 8732, num_classes)
            loss = criterion(predicted_bboxes, predicted_scores, bboxes, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            losses.update(loss.item(), images.size(0))
            if i % args.print_freq == args.print_freq - 1:
                bar.write(f'Average Loss: {losses.avg:.4f}')
        bar.write(f'Epoch: [{epoch + 1}|{args.num_epochs}] '
                  f'Average Loss: {losses.avg:.4f}')

        # adjust learning rate
        # scheduler.step()

        # save model
        state_dict = {
            'epoch': epoch,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        save_path = os.path.join(args.save_root, 'ssd300.pth')
        torch.save(state_dict, save_path)
        if epoch % args.save_freq == args.save_freq - 1:
            shutil.copyfile(save_path,
                            os.path.join(args.save_root, f'ssd300_epochs_{epoch + 1}.pth'))
        return boxes, labels, scores


checkpoint_path = gdrive_dir + '/pretrained/mine.pt'
# checkpoint = torch.load(checkpoint_path)  # None

if False:
    ssd_eff = checkpoint['model']
    optimizer = checkpoint['optimizer']
    # exp_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.96,
    #                                                           last_epoch=checkpoint['epoch'] - 1)
else:
    print('New model')
    ssd_eff = SSDEff(n_classes=15).to(device)
    optimizer = torch.optim.Adam(ssd_eff.parameters(), lr=2e-3, weight_decay=5e-4)
    # exp_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.96)

loss_func = MultiBoxLoss(priors_cxcy=ssd_eff.get_prior_boxes(), threshold=0.5,
                         alpha=1., neg_pos_ratio=3, focal_loss=False)
grad_clip = None


def train_epoch(model, trainset_loader, loss_func, optimizer, epoch_id):
    model.train()
    train_loss = 0
    for step, (imgs, boxes, labels) in enumerate(trainset_loader):
        # print(type(imgs), imgs.shape, imgs)
        # print(type(boxes), boxes[0].shape, boxes)
        # print(type(labels), labels[0].shape, labels)
        # break

        # move input data to GPU
        imgs = imgs.to(device)
        boxes = [b.to(device) for b in boxes]