def _evaluate_classifier(model, eval_iter, criterion):
    """Run one pass over ``eval_iter`` and return ``(loss, top1, top5)``.

    The loss is the sample-weighted mean of ``criterion`` over the samples the
    iterator actually yields, so it does not depend on a hard-coded
    validation-set size. Returns ``None`` when the iterator yields no batches.
    The caller is responsible for switching the model to eval mode.
    """
    total_loss = 0.0
    num_samples = 0
    predictions = []
    targets = []
    with torch.no_grad():
        for data in eval_iter:
            x = data[0]['data']
            label = data[0]['label'].squeeze(-1).long().cuda()
            output = model(x)
            batch_len = x.shape[0]
            # criterion returns a per-batch mean; re-weight by batch length so
            # a smaller final batch does not skew the epoch average.
            total_loss += criterion(output, label).item() * batch_len
            num_samples += batch_len
            predictions.append(output)
            targets.append(label)
    if num_samples == 0:
        return None
    predictions = torch.cat(predictions, dim=0)
    targets = torch.cat(targets, dim=0)
    return (total_loss / num_samples,
            top1accuracy(predictions, targets),
            top5accuracy(predictions, targets))


def train(params):
    """Train a ``Darknet`` classifier using DALI train/eval pipelines.

    Args:
        params: Raw configuration handed to ``Params``. The attributes read
            here are ``seed``, ``save_root``, ``project_name``, ``version``,
            ``num_gpus``, ``data_root``, ``train_set``, ``val_set``,
            ``load_weights`` (the string ``'None'`` disables resuming),
            ``optim``, ``learning_rate``, ``epoch``, ``save_interval`` and
            ``eval_interval``.

    Side effects:
        Creates a timestamped run directory under ``params.save_root``, writes
        a log file there, and saves a checkpoint whenever the validation loss
        improves or the run is interrupted with Ctrl-C.

    Raises:
        SystemExit: with status 1 if ``params.load_weights`` cannot be loaded
            or its file name does not end in ``<epoch>_<step>.<ext>``.
    """
    params = Params(params)
    set_random_seeds(params.seed)

    time_now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    run_name = f'{params.project_name}_{time_now}_{params.version}'
    params.save_root = params.save_root + f'/{run_name}'
    os.makedirs(params.save_root, exist_ok=True)

    # The level must be set explicitly: the root logger defaults to WARNING,
    # which would silently drop every logging.info() call below.
    logging.basicConfig(
        filename=f'{params.save_root}/{run_name}.log',
        filemode='a',
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s: %(message)s')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    logging.info(f'Available GPUs: {torch.cuda.device_count()}')

    # Train pipeline: the class label is the (1-based) name of the directory
    # that directly contains each image.
    files = glob.glob(
        os.path.join(params.data_root, params.project_name, params.train_set,
                     '*/*.JPEG'))
    labels = [int(os.path.basename(os.path.dirname(fp))) - 1 for fp in files]
    train_pipeline = TrainImageDecoderPipeline(params=params,
                                               device_id=0,
                                               files=files,
                                               labels=labels)
    train_pipeline.build()
    train_pii = pytorchIterator(train_pipeline,
                                last_batch_policy=LastBatchPolicy.DROP,
                                reader_name='Reader',
                                auto_reset=True)

    # Evaluation pipeline: files are ordered by the trailing id in their file
    # name. NOTE(review): this presumably matches the line order of the
    # ground-truth file -- confirm against loadlabel().
    files = glob.glob(
        os.path.join(params.data_root, params.project_name, params.val_set,
                     '*.JPEG'))
    files = sorted(
        files,
        key=lambda f: os.path.basename(f).split('_')[-1].split('.')[0])
    labels = loadlabel(
        os.path.join(
            params.data_root, params.project_name,
            'ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt'
        ))
    eval_pipeline = EvalImageDecoderPipeline(params=params,
                                             device_id=0,
                                             files=files,
                                             labels=labels)
    eval_pipeline.build()
    eval_pii = pytorchIterator(eval_pipeline,
                               last_batch_policy=LastBatchPolicy.PARTIAL,
                               reader_name='Reader',
                               auto_reset=True)

    model = Darknet()

    last_step = 0
    last_epoch = 0
    if params.load_weights != 'None':
        try:
            # Load onto the CPU first; the model is moved to the GPU below.
            state_dict = torch.load(params.load_weights, map_location='cpu')
            model.load_state_dict(state_dict)
            # Parse "<...>_<epoch>_<step>.<ext>" from the file name only: the
            # run directory itself contains underscores, which would corrupt
            # the epoch field if the whole path were split.
            name_parts = os.path.basename(params.load_weights).split('_')
            last_step = int(name_parts[-1].split('.')[0])
            last_epoch = int(name_parts[-2])
        except Exception:
            logging.exception('Fail to resuming from weight!')
            raise SystemExit(1)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = nn.DataParallel(model)

    if params.optim == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=params.learning_rate)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=params.learning_rate,
                                    momentum=0.9,
                                    nesterov=True)

    criterion = nn.CrossEntropyLoss()
    # NOTE: a ReduceLROnPlateau scheduler was sketched here previously but is
    # not enabled.

    epoch = 0
    begin_epoch = max(0, last_epoch)
    step = max(0, last_step)
    best_loss = float('inf')

    logging.info('Begin to train...')
    model.train()
    try:
        for epoch in range(begin_epoch, params.epoch):
            for batch_idx, data in enumerate(train_pii):
                x = data[0]['data']
                label = data[0]['label'].squeeze(-1).long().cuda()

                output = model(x)
                loss = criterion(output, label)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # save_interval doubles as the logging interval here.
                if batch_idx % params.save_interval == 0:
                    logging.info(
                        f'{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} '
                        f'Train Epoch: {epoch} iter: {batch_idx} loss: {loss.item()}'
                    )
                step += 1

            if epoch % params.eval_interval == 0:
                model.eval()
                metrics = _evaluate_classifier(model, eval_pii, criterion)
                if metrics is None:
                    logging.warning(
                        'Evaluation iterator yielded no batches; '
                        'skipping evaluation.')
                else:
                    loss, acc, acctop5 = metrics
                    logging.info(
                        f'{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} '
                        f'Eval Epoch: {epoch} loss: {loss} accuracy: {acc} Top5 acc: {acctop5}'
                    )
                    if loss < best_loss:
                        best_loss = loss
                        save_checkpoint(
                            model, f'{params.save_root}/{epoch}_{step}.pth')
                model.train()
    except KeyboardInterrupt:
        # Keep the work done so far; the file name encodes where to resume.
        save_checkpoint(model,
                        f'{params.save_root}/Interrupt_{epoch}_{step}.pth')
"x", "y", "w", "h", "conf", "cls", "cls_acc", "recall50", "recall75", "precision", "conf_obj", "conf_noobj", ] for epoch in range(opt.epochs): model.train() start_time = time.time() if epoch % opt.evaluation_interval == 0: print("\n---- Evaluating Model ----") # Evaluate the model on the validation set precision, recall, AP, f1, ap_class = evaluate( model, path=valid_path, iou_thres=0.5, conf_thres=0.5, nms_thres=0.5, img_size=opt.img_size, batch_size=8, ) evaluation_metrics = [ ("val_precision", precision.mean()),
def train(params):
    """Debug harness: compare one DALI batch against one torch DataLoader batch.

    Sets up logging, a torchvision-transform ``DataLoader`` and a DALI
    ``ExternalSourcePipeline`` over the same training set, builds the model and
    optimizer, then takes the first batch from each source, writes the first
    image of each to ``x_pii.jpg`` / ``x_torch.jpg`` and exits. The actual
    optimisation step is intentionally disabled (see the commented block in
    the loop).

    Args:
        params: Raw configuration handed to ``Params``. The attributes read
            here are ``seed``, ``save_root``, ``project_name``, ``version``,
            ``num_gpus``, ``mean``, ``std``, ``data_root``, ``train_set``,
            ``batch_size``, ``load_weights`` (the string ``'None'`` disables
            resuming), ``optim``, ``learning_rate`` and ``epoch``.

    Raises:
        SystemExit: with status 1 if ``params.load_weights`` cannot be loaded
            or parsed; with status 0 after the comparison images are written.
    """
    import matplotlib.pyplot as plt

    params = Params(params)
    set_random_seeds(params.seed)

    time_now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    run_name = f'{params.project_name}_{time_now}_{params.version}'
    params.save_root = params.save_root + f'/{run_name}'
    os.makedirs(params.save_root, exist_ok=True)

    # The level must be set explicitly: the root logger defaults to WARNING,
    # which would silently drop every logging.info() call below.
    logging.basicConfig(
        filename=f'{params.save_root}/{run_name}.log',
        filemode='a',
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s: %(message)s')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    logging.info(f'Available GPUs: {torch.cuda.device_count()}')

    # Reference path: plain torchvision transforms + DataLoader. This must be
    # built before params.mean / params.std are converted to tensors below.
    data_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=params.mean, std=params.std)
    ])
    train_set = TrainDataset(root=os.path.join(params.data_root,
                                               params.project_name,
                                               params.train_set),
                             transform=data_transform)
    # Shuffling is off so both sources can be compared batch for batch.
    # NOTE: the validation dataset/loader is not built in this harness.
    train_params = {
        'batch_size': params.batch_size,
        'shuffle': False,
        'num_workers': 4,
        'drop_last': True
    }
    train_loader = DataLoader(train_set, **train_params)

    # Path under test: DALI external-source pipeline.
    eli = ExternalInputIterator(params.batch_size)
    params.mean = torch.Tensor(params.mean).unsqueeze(0).unsqueeze(0)
    params.std = torch.Tensor(params.std).unsqueeze(0).unsqueeze(0)
    pipe = ExternalSourcePipeline(params=params,
                                  num_threads=4,
                                  device_id=0,
                                  external_date=eli,
                                  seed=params.seed)
    pii = pytorchIterator(pipe,
                          last_batch_padded=True,
                          last_batch_policy=LastBatchPolicy.DROP)

    model = Darknet()
    criterion = nn.CrossEntropyLoss()

    last_step = 0
    last_epoch = 0
    if params.load_weights != 'None':
        try:
            # Load onto the CPU first; the model is moved to the GPU below.
            state_dict = torch.load(params.load_weights, map_location='cpu')
            model.load_state_dict(state_dict)
            # Parse "<...>_<epoch>_<step>.<ext>" from the file name only: the
            # run directory itself contains underscores, which would corrupt
            # the epoch field if the whole path were split.
            name_parts = os.path.basename(params.load_weights).split('_')
            last_step = int(name_parts[-1].split('.')[0])
            last_epoch = int(name_parts[-2])
        except Exception:
            logging.exception('Fail to resuming from weight!')
            raise SystemExit(1)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = nn.DataParallel(model)

    if params.optim == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=params.learning_rate)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=params.learning_rate,
                                    momentum=0.9,
                                    nesterov=True)

    epoch = 0
    begin_epoch = max(0, last_epoch)
    step = max(0, last_step)

    logging.info('Begin to train...')
    model.train()
    try:
        for epoch in range(begin_epoch, params.epoch):
            for data_pii, data_torch in zip(pii, train_loader):
                # type(x) reports torch.Tensor, but the DALI output already
                # lives in GPU memory, hence the explicit .cpu() below.
                x_pii = data_pii[0]['data']
                x_torch = data_torch[0]

                # squeeze(0) only drops the batch axis when it has length 1.
                # NOTE(review): presumably this harness is run with
                # batch_size == 1 -- confirm.
                x_pii = x_pii.cpu().squeeze(0).numpy().transpose((1, 2, 0))
                x_torch = x_torch.cpu().squeeze(0).numpy().transpose((1, 2, 0))

                plt.imsave('x_pii.jpg', x_pii)
                plt.imsave('x_torch.jpg', x_torch)
                # Only the first batch is needed for the visual comparison.
                raise SystemExit(0)

                # Training step, disabled while the data paths are compared:
                # output = model(x)
                # loss = criterion(output, label)
                # optimizer.zero_grad()
                # loss.backward()
                # optimizer.step()
    except KeyboardInterrupt:
        save_checkpoint(model,
                        f'{params.save_root}/Interrupt_{epoch}_{step}.pth')
def train(payload):
    """Train the YOLO ``Darknet`` model on the currently labeled samples.

    Args:
        payload: Mapping with the keys
            ``"labeled"`` (passed to ``COCO`` as ``samples``),
            ``"resume_from"`` (checkpoint file name under ``./log`` to resume
            from, or ``None`` to start fresh) and
            ``"ckpt_file"`` (file name under ``./log`` to save this loop's
            checkpoint to).

    Side effects:
        Writes ``{"model": ..., "optimizer": ...}`` state dicts to
        ``./log/<ckpt_file>``, creating ``./log`` if necessary.
    """
    labeled = payload["labeled"]
    resume_from = payload["resume_from"]
    ckpt_file = payload["ckpt_file"]

    # hyperparameters
    batch_size = 16
    epochs = 2  # just for demo
    lr = 1e-2
    weight_decay = 1e-2

    coco = COCO("./data", Transforms(), samples=labeled, train=True)
    loader = DataLoader(coco,
                        shuffle=True,
                        batch_size=batch_size,
                        collate_fn=collate_fn)

    config_file = "yolov3.cfg"
    model = Darknet(config_file).to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=lr,
                           weight_decay=weight_decay)

    # Resume model and optimizer from the previous loop. map_location keeps a
    # checkpoint written on another device loadable on the current one.
    if resume_from is not None:
        ckpt = torch.load(os.path.join("./log", resume_from),
                          map_location=device)
        model.load_state_dict(ckpt["model"])
        optimizer.load_state_dict(ckpt["optimizer"])

    # loss function
    priors = anchors.normalize("xyxy")
    loss_fn = HardNegativeMultiBoxesLoss(priors, device=device)

    model.train()
    for _ in range(epochs):
        for img, boxes, labels in loader:
            img = img.to(device)

            # One prediction tensor per YOLO layer, each
            # (batch, 3, gx, gy, 85). Flatten the anchor/grid axes and
            # concatenate so every image has one row per prior:
            # (batch, n_priors, 85).
            output = model(img)
            batched_prediction = torch.cat(
                [p.view(p.shape[0], -1, 85) for p in output], dim=1)

            # Last axis layout: [..., :4] box coordinates, [..., 4]
            # objectness score, [..., 5:] pre-softmax class distribution.
            # bbox_transform converts these into the form
            # HardNegativeMultiBoxesLoss expects (boxes relative to the
            # normalized anchor priors); see that loss's documentation for its
            # input parameters.
            (predicted_boxes, predicted_objectness,
             predicted_class_dist) = bbox_transform(batched_prediction)

            loss = loss_fn(predicted_boxes, predicted_objectness,
                           predicted_class_dist, boxes, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # save ckpt for this loop
    ckpt = {"model": model.state_dict(), "optimizer": optimizer.state_dict()}
    os.makedirs("./log", exist_ok=True)
    torch.save(ckpt, os.path.join("./log", ckpt_file))
    return
def _str2bool(value):
    """Parse a command-line boolean such as ``True``, ``false``, ``1`` or ``no``."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value))


def _evaluate_and_log(model, year, split, batch_size, logger, epoch, title):
    """Evaluate ``model`` on one VOC split, log the metrics and print the AP table.

    ``split`` is the image set name (``'val'`` or ``'train'``) and is also used
    as the TensorBoard tag prefix; ``title`` is the word used in the final
    mAP line.
    """
    precision, recall, AP, f1, ap_class = evaluate(
        model, [year, split], [0.5, 0.5, 0.5], batch_size, True,
        diagnosis_code=1
    )
    evaluation_metrics = [
        ("{}_precision".format(split), precision.mean()),
        ("{}_recall".format(split), recall.mean()),
        ("{}_mAP".format(split), AP.mean()),
        ("{}_f1".format(split), f1.mean()),
    ]
    for tag, value in evaluation_metrics:
        logger.add_scalar("{}/{}".format(split, tag), value.item(), epoch)

    # Print class APs and mAP
    ap_table = [["Index", "Class name", "AP"]]
    for i, c in enumerate(ap_class):
        ap_table.append([c, val2labels(c), "%.5f" % AP[i]])
    print(AsciiTable(ap_table).table)
    print(f"---- {title} mAP {AP.mean()}")


def main():
    """Command-line entry point: train a Darknet detector on Pascal VOC.

    Parses the hyperparameters, builds (or restores) the model, trains it with
    Adam under a warmup + cosine learning-rate schedule, periodically saves
    checkpoints under ``./logs/model/`` and logs the loss and evaluation
    metrics to TensorBoard under ``./logs/tb/``.

    Raises:
        RuntimeError: if ``--use_gpu`` is true but CUDA is not available.
    """
    # Hyperparameters parser
    parser = argparse.ArgumentParser()
    parser.add_argument("--year", type=str, default='2012', help="used to select training set")
    parser.add_argument("--set", type=str, default='train', help="used to select training set")
    parser.add_argument("--epochs", type=int, default=201, help="number of epochs")
    parser.add_argument("--batch_size", type=int, default=8, help="size of each image batch")
    # Alternative definitions: config/net/dqnyolo_{large,mini,tiny}.cfg
    parser.add_argument("--model_def", type=str, default="config/net/resnet_dropout.cfg", help="path to model definition file")
    parser.add_argument("--img_size", type=int, default=416, help="size of each image dimension")
    parser.add_argument("--opt_lr", type=float, default=1e-5, help="learning rate for optimizer")
    # Booleans need an explicit converter: without ``type`` any value given on
    # the command line arrives as a non-empty string, so "False" is truthy and
    # "True" is not the ``True`` singleton.
    parser.add_argument("--use_gpu", type=_str2bool, default=True, help="use GPU to accelerate training")
    parser.add_argument("--shuffle_train", type=_str2bool, default=True, help="shuffle the training dataset")
    parser.add_argument("--checkpoint_interval", type=int, default=20, help="interval between saving model weights")
    parser.add_argument("--evaluation_interval", type=int, default=10, help="interval evaluations on validation set")
    # Examples: data/backbone/darknet53.conv.74 (darknet weights) or
    # logs/model/model_params_200.ckpt (state dict).
    parser.add_argument("--pretrained_weights", type=str, default=None, help="if specified starts from checkpoint model")
    opt = parser.parse_args()
    print(opt)

    if opt.use_gpu:
        if not torch.cuda.is_available():
            raise RuntimeError("Current Torch doesn't have GPU support.")
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    logger = SummaryWriter(exist_or_create_folder("./logs/tb/"))

    # Initiate model
    eval_model = Darknet(opt.model_def).to(device)
    if opt.pretrained_weights:
        print("Initialize model with pretrained_model")
        if opt.pretrained_weights.endswith(".ckpt"):
            # map_location lets a checkpoint saved on GPU be restored on CPU.
            eval_model.load_state_dict(
                torch.load(opt.pretrained_weights, map_location=device))
        else:
            eval_model.load_darknet_weights(opt.pretrained_weights)
    else:
        print("Initialize model randomly")
        eval_model.apply(weights_init_normal)
    print(eval_model)
    summary(eval_model, (3, opt.img_size, opt.img_size))

    learn_batch_counter = 0  # for logger update (total numbers)
    batch_size = opt.batch_size

    # Get dataloader
    print("Begin loading train dataset ......")
    t_load_data = time.time()
    dataset = torchvision.datasets.VOCDetection(root='data/VOC/', year=opt.year, image_set=opt.set,
                                                transforms=None, download=True)
    dataset_dict = trans_voc(dataset)
    dataset = ListDataset(dataset_dict)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=opt.batch_size,
        shuffle=opt.shuffle_train,
        pin_memory=True,
        collate_fn=dataset.collate_fn,
    )
    print("Complete loading train dataset in {} s".format(time.time() - t_load_data))

    optimizer = torch.optim.Adam(eval_model.parameters(), lr=opt.opt_lr)

    # Warmup and learning rate decay: 10 warmup epochs ramp the lr from opt_lr
    # to 10 * opt_lr, after which the cosine schedule takes over.
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, opt.epochs)
    scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=10,
                                              after_scheduler=scheduler_cosine)

    start_time = time.time()

    for i_epoch in range(opt.epochs):
        eval_model.train()

        for i_batch, (_, imgs, raw_targets, transform_params, tar_boxes) in enumerate(loader):
            print("\n++++++++++ i_epoch-i_batch {}-{} ++++++++++".format(i_epoch, i_batch))

            # Incomplete (final) batches are skipped rather than trained on.
            if len(imgs) != batch_size:
                print("Current batch size is smaller than opt.batch_size!")
                continue

            imgs = imgs.to(device)
            raw_targets = raw_targets.to(device)
            tar_boxes = tar_boxes.to(device)

            input_img = imgs
            if i_epoch == 0 and i_batch == 0:
                logger.add_graph(eval_model, input_img)

            # Keep channel 0 and channels 6+ of the last target axis.
            # NOTE(review): presumably these are the objectness flag and the
            # class scores -- confirm against ListDataset.
            cls_targets = torch.cat((raw_targets[:, :, :, 0].unsqueeze(3), raw_targets[:, :, :, 6:]), 3)

            loss, pred = eval_model(input_img, cls_targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            learn_batch_counter += 1

            print("Ep-bt: {}-{} | Loss: {}".format(i_epoch, i_batch, loss.item()))
            logger.add_scalar('loss/loss', loss.item(), learn_batch_counter)

        if (i_epoch + 1) % opt.checkpoint_interval == 0:
            print("Saving model in epoch {}".format(i_epoch))
            torch.save(eval_model.state_dict(),
                       exist_or_create_folder("./logs/model/model_params_{}.ckpt".format(i_epoch)))

        # Evaluate the model on the validation set, then on the training set
        if (i_epoch + 1) % opt.evaluation_interval == 0:
            _evaluate_and_log(eval_model, opt.year, 'val', batch_size, logger, i_epoch, "validation")
            _evaluate_and_log(eval_model, opt.year, 'train', batch_size, logger, i_epoch, "training")

        # Warmup and lr decay
        scheduler_warmup.step()

        # Free GPU memory
        if device.type == 'cuda':
            torch.cuda.empty_cache()

    total_train_time = time.time() - start_time
    print("Training complete in {} hours".format(total_train_time / 3600))
"x", "y", "w", "h", "conf", "cls", "cls_acc", "recall_50", "recall_75", "precision", "conf_obj", "conf_noobj", ] mAP = 0 for epoch in range(args.epochs): model.train() # 训练的时候需要这一步,如果是测试的时候那就改成model.eval() start_time = time.time() for batch_i, (_, imgs, targets) in enumerate(train_dataloader): batches_done = len(train_dataloader) * epoch + batch_i imgs = imgs.cuda() targets = targets.cuda() loss, outputs = model(imgs, targets) loss.backward() # if batches_done % args.gradient_accumulations: optimizer.step() optimizer.zero_grad() # ---------------- # 日志处理相关 # ---------------- # 获取每个yolo层的损失相关数据