def get_softmax_triplet_loss_fn(cfg, num_class):
    """Build both a softmax and a triplet loss function and return them as a list.

    NOTE(review): this temporarily rewrites ``cfg.SOLVER.LOSS_NAME`` and leaves it
    set to ``'triplet'`` afterwards — callers that read LOSS_NAME later should be
    aware of that side effect.
    """
    loss_fns = []
    for loss_name in ('softmax', 'triplet'):
        cfg.SOLVER.LOSS_NAME = loss_name
        loss_fns.append(make_loss(cfg, num_class))
    return loss_fns
def train():
    """Train a CDNet classifier on CIFAR.

    Builds the dataloaders, model, optimizer, scheduler and loss from the
    module-level ``cfg``, runs the epoch loop with periodic logging,
    evaluation and checkpointing, and appends per-checkpoint results to an
    ``*_result.xlsx`` workbook.

    Side effects: creates the checkpoint directory, writes ``checkpoint_*.pth``
    / ``best_ckpt.pth`` files and the result workbook, and logs via the
    "CDNet.train" logger.
    """
    # 1. make dataloader (train/val loaders plus the number of classes)
    train_loader, val_loader, num_class = cifar_make_data_loader(cfg)
    # 2. make model
    model = build_model(cfg, num_class)
    # 3. make optimizer
    optimizer = make_optimizer(cfg, model)
    # 4. make lr scheduler
    scheduler = make_lr_scheduler(cfg, optimizer)
    # 5. make loss: default uses softmax loss
    loss_fn = make_loss(cfg, num_class)

    # pull the hyper-parameters out of cfg once
    device = cfg.MODEL.DEVICE
    use_gpu = device == "cuda"
    pretrained = cfg.MODEL.PRETRAIN_PATH != ""
    parallel = cfg.MODEL.PARALLEL
    log_period = cfg.OUTPUT.LOG_PERIOD
    ckpt_period = cfg.OUTPUT.CKPT_PERIOD
    eval_period = cfg.OUTPUT.EVAL_PERIOD
    output_dir = cfg.OUTPUT.DIRS
    ckpt_save_path = output_dir + cfg.OUTPUT.CKPT_DIRS
    epochs = cfg.SOLVER.MAX_EPOCHS
    batch_size = cfg.SOLVER.BATCH_SIZE
    grad_clip = cfg.SOLVER.GRAD_CLIP
    batch_num = len(train_loader)
    # NOTE(review): assumes batch_num >= log_period, otherwise log_iters == 0
    # and the modulo below would raise ZeroDivisionError — confirm configs.
    log_iters = batch_num // log_period
    if not os.path.exists(ckpt_save_path):
        os.makedirs(ckpt_save_path)

    # create *_result.xlsx to persist the results for later analysis
    name = (cfg.OUTPUT.LOG_NAME).split(".")[0] + ".xlsx"
    result_path = cfg.OUTPUT.DIRS + name
    wb = xl.Workbook()
    sheet = wb.worksheets[0]
    titles = [
        'size/M', 'speed/ms', 'final_planes', 'acc', 'loss', 'acc', 'loss',
        'acc', 'loss'
    ]
    sheet.append(titles)
    # epochs at which a (acc, loss) pair is appended to the result row
    check_epochs = [40, 80, 120, 160, 200, 240, 280, 320, 360, epochs]
    values = []

    logger = logging.getLogger("CDNet.train")
    size = count_parameters(model)
    values.append(format(size, '.2f'))
    values.append(model.final_planes)
    # FIX: was "{:.2}" (2 significant digits, can print exponent form);
    # siblings use fixed-point "{:.2f}".
    logger.info("the model size is {:.2f} M".format(size))
    logger.info("Starting Training CDNetwork")

    best_acc = 0.
    is_best = False
    avg_loss, avg_acc = RunningAverageMeter(), RunningAverageMeter()
    avg_time, global_avg_time = AverageMeter(), AverageMeter()
    if parallel:
        model = nn.DataParallel(model)
    if use_gpu:
        model = model.to(device)

    for epoch in range(epochs):
        scheduler.step()
        lr = scheduler.get_lr()[0]
        # if checkpoint saved epoch k, resume by running epoch k+1 next
        if pretrained and epoch < model.start_epoch:
            continue

        # reset the per-epoch records
        model.train()
        avg_loss.reset()
        avg_acc.reset()
        avg_time.reset()
        for i, batch in enumerate(train_loader):
            t0 = time.time()
            imgs, labels = batch
            if use_gpu:
                imgs = imgs.to(device)
                labels = labels.to(device)
            res = model(imgs)
            loss, acc = compute_loss_acc(res, labels, loss_fn)
            loss.backward()
            if grad_clip != 0:
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            optimizer.zero_grad()
            t1 = time.time()
            avg_time.update((t1 - t0) / batch_size)
            # NOTE(review): updating the meter with the loss tensor keeps the
            # graph alive until reset; consider loss.item() — confirm meter API.
            avg_loss.update(loss)
            avg_acc.update(acc)
            # periodic progress log
            if (i + 1) % log_iters == 0:
                logger.info(
                    "epoch {}: {}/{} with loss is {:.5f} and acc is {:.3f}".
                    format(epoch + 1, i + 1, batch_num, avg_loss.avg,
                           avg_acc.avg))
        logger.info(
            "end epochs {}/{} with lr: {:.5f} and avg_time is: {:.3f} ms".
            format(epoch + 1, epochs, lr, avg_time.avg * 1000))
        global_avg_time.update(avg_time.avg)

        # evaluate the model
        if (epoch + 1) % eval_period == 0 or (epoch + 1) in check_epochs:
            model.eval()
            logger.info("begin eval the model")
            val_acc = AverageMeter()
            with torch.no_grad():
                for vi, batch in enumerate(val_loader):
                    imgs, labels = batch
                    if use_gpu:
                        imgs = imgs.to(device)
                        labels = labels.to(device)
                    res = model(imgs)
                    # FIX: was compute_loss_acc(res, labels) — missing the
                    # loss_fn argument used by the 3-arg call in the train loop.
                    _, acc = compute_loss_acc(res, labels, loss_fn)
                    val_acc.update(acc)
            logger.info("validation results at epoch:{}".format(epoch + 1))
            logger.info("acc:{:.2%}".format(val_acc.avg))

            # determine whether the current model is the best so far
            if val_acc.avg > best_acc:
                logger.info("get a new best acc")
                best_acc = val_acc.avg
                is_best = True

            # add the result to the sheet at the check epochs
            if (epoch + 1) in check_epochs:
                val = [
                    format(val_acc.avg * 100, '.2f'),
                    format(avg_loss.avg, '.3f')
                ]
                values.extend(val)

        # whether to save the model (unwrap DataParallel before saving)
        if (epoch + 1) % ckpt_period == 0 or is_best:
            if parallel:
                torch.save(
                    model.module.state_dict(),
                    ckpt_save_path + "checkpoint_{}.pth".format(epoch + 1))
            else:
                torch.save(
                    model.state_dict(),
                    ckpt_save_path + "checkpoint_{}.pth".format(epoch + 1))
            logger.info("checkpoint {} was saved".format(epoch + 1))
            if is_best:
                if parallel:
                    torch.save(model.module.state_dict(),
                               ckpt_save_path + "best_ckpt.pth")
                else:
                    torch.save(model.state_dict(),
                               ckpt_save_path + "best_ckpt.pth")
                logger.info("best_checkpoint was saved")
                is_best = False

    # finalize the result row: [size, speed, final_planes, per-epoch metrics...]
    values.insert(1, format(global_avg_time.avg * 1000, '.2f'))
    sheet.append(values)
    wb.save(result_path)
    logger.info("best_acc:{:.2%}".format(best_acc))
    logger.info("Ending training CDNetwork on cifar")
def train():
    """Train a MobileNetReID model for person re-identification.

    Builds dataloaders, model, optimizer, scheduler and loss from the
    module-level ``cfg`` (PCB neck gets a softmax+triplet loss pair), runs
    the epoch loop with periodic logging, R1/mAP evaluation and
    checkpointing, and appends per-checkpoint results to ``*_result.xlsx``.

    Side effects: creates the checkpoint directory, writes ``checkpoint_*.pth``
    / ``best_ckpt.pth`` and the result workbook, logs via "MobileNetReID.train".
    """
    # 1. make dataloader
    train_loader, val_loader, num_query, num_class = make_data_loader(cfg)
    # 2. make model
    model = build_model(cfg, num_class)
    # 3. make optimizer
    optimizer = make_optimizer(cfg, model)
    # 4. make lr scheduler
    scheduler = make_lr_scheduler(cfg, optimizer)
    # 5. make loss function (PCB neck needs both softmax and triplet losses)
    if cfg.MODEL.PCB_NECK:
        loss_func = get_softmax_triplet_loss_fn(cfg, num_class)
    else:
        loss_func = make_loss(cfg, num_class)

    # pull the hyper-parameters out of cfg once
    log_period = cfg.OUTPUT.LOG_PERIOD
    ckpt_period = cfg.OUTPUT.CHECKPOINT_PERIOD
    eval_period = cfg.OUTPUT.EVAL_PERIOD
    output_dir = cfg.OUTPUT.ROOT_DIR
    device = cfg.MODEL.DEVICE
    epochs = cfg.SOLVER.MAX_EPOCHS
    use_gpu = device == "cuda"
    use_neck = cfg.MODEL.NECK or cfg.MODEL.LEARN_REGION
    batch_size = cfg.SOLVER.IMGS_PER_BATCH
    batch_num = len(train_loader)
    # how many batches between two log lines
    log_iters = batch_num // log_period
    pretrained = cfg.MODEL.PRETRAIN_PATH != ''
    parallel = cfg.MODEL.PARALLEL
    grad_clip = cfg.DARTS.GRAD_CLIP
    feat_norm = cfg.TEST.FEAT_NORM

    ckpt_save_path = cfg.OUTPUT.ROOT_DIR + cfg.OUTPUT.CKPT_DIR
    if not os.path.exists(ckpt_save_path):
        os.makedirs(ckpt_save_path)

    # create *_result.xlsx to persist the results for later analysis
    name = (cfg.OUTPUT.LOG_NAME).split(".")[0] + ".xlsx"
    result_path = cfg.OUTPUT.ROOT_DIR + name
    wb = xl.Workbook()
    sheet = wb.worksheets[0]
    titles = [
        'size/M', 'speed/ms', 'final_planes', 'acc', 'mAP', 'r1', 'r5', 'r10',
        'loss', 'acc', 'mAP', 'r1', 'r5', 'r10', 'loss', 'acc', 'mAP', 'r1',
        'r5', 'r10', 'loss'
    ]
    sheet.append(titles)
    # epochs at which a (acc, mAP, r1, r5, r10, loss) group is recorded
    check_epochs = [40, 80, 120, 160, 200, 240, 280, 320, 360, epochs]
    values = []

    logger = logging.getLogger('MobileNetReID.train')
    # count parameters
    size = count_parameters(model)
    logger.info("the param number of the model is {:.2f} M".format(size))
    values.append(format(size, '.2f'))
    values.append(model.final_planes)
    logger.info("Start training")

    if pretrained:
        start_epoch = model.start_epoch
    if parallel:
        model = nn.DataParallel(model)
    if use_gpu:
        model.to(device)

    # track the best model seen so far
    best_mAP, best_r1 = 0., 0.
    is_best = False
    avg_loss, avg_acc = RunningAverageMeter(), RunningAverageMeter()
    avg_time, global_avg_time = AverageMeter(), AverageMeter()
    global_avg_time.reset()

    for epoch in range(epochs):
        scheduler.step()
        # resume: skip epochs already covered by the loaded checkpoint
        if pretrained and epoch < start_epoch - 1:
            continue

        model.train()
        avg_loss.reset()
        avg_acc.reset()
        avg_time.reset()
        for i, batch in enumerate(train_loader):
            t0 = time.time()
            imgs, labels = batch
            if use_gpu:
                imgs = imgs.to(device)
                labels = labels.to(device)
            res = model(imgs)
            loss, acc = compute_loss_acc(use_neck, res, labels, loss_func)
            loss.backward()
            if grad_clip != 0:
                # FIX: was the deprecated nn.utils.clip_grad_norm (no
                # trailing underscore), removed in modern PyTorch.
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            optimizer.zero_grad()
            t1 = time.time()
            avg_time.update((t1 - t0) / batch_size)
            avg_loss.update(loss)
            avg_acc.update(acc)
            # periodic progress log
            if (i + 1) % log_iters == 0:
                logger.info(
                    "epoch {}: {}/{} with loss is {:.5f} and acc is {:.3f}".
                    format(epoch + 1, i + 1, batch_num, avg_loss.avg,
                           avg_acc.avg))
        # FIX: lr was only assigned inside the periodic-log branch, raising
        # NameError here whenever that branch never fired; read it once per epoch.
        lr = optimizer.state_dict()['param_groups'][0]['lr']
        logger.info(
            "end epochs {}/{} with lr: {:.5f} and avg_time is {:.3f} ms".
            format(epoch + 1, epochs, lr, avg_time.avg * 1000))
        global_avg_time.update(avg_time.avg)

        # eval the model
        if (epoch + 1) % eval_period == 0 or (epoch + 1) == epochs:
            model.eval()
            metrics = R1_mAP(num_query, use_gpu=use_gpu, feat_norm=feat_norm)
            with torch.no_grad():
                for vi, batch in enumerate(val_loader):
                    imgs, labels, camids = batch
                    if use_gpu:
                        imgs = imgs.to(device)
                    feats = model(imgs)
                    metrics.update((feats, labels, camids))
            # compute cmc and mAP
            cmc, mAP = metrics.compute()
            logger.info("validation results at epoch:{}".format(epoch + 1))
            logger.info("mAP:{:.2%}".format(mAP))
            for r in [1, 5, 10]:
                logger.info("CMC curve, Rank-{:<3}:{:.2%}".format(
                    r, cmc[r - 1]))

            # determine whether the current model is the best so far
            if mAP > best_mAP:
                is_best = True
                best_mAP = mAP
                logger.info("Get a new best mAP")
            if cmc[0] > best_r1:
                is_best = True
                best_r1 = cmc[0]
                logger.info("Get a new best r1")

            # add the result to the sheet at the check epochs
            if (epoch + 1) in check_epochs:
                val = [avg_acc.avg, mAP, cmc[0], cmc[4], cmc[9]]
                change = [format(v * 100, '.2f') for v in val]
                change.append(format(avg_loss.avg, '.3f'))
                values.extend(change)

        # we hope that eval_period == k * ckpt_period for some integer k
        # whether to save the model (unwrap DataParallel before saving)
        if (epoch + 1) % ckpt_period == 0 or is_best:
            if parallel:
                torch.save(
                    model.module.state_dict(),
                    ckpt_save_path + "checkpoint_{}.pth".format(epoch + 1))
            else:
                torch.save(
                    model.state_dict(),
                    ckpt_save_path + "checkpoint_{}.pth".format(epoch + 1))
            logger.info("checkpoint {} saved !".format(epoch + 1))
            if is_best:
                if parallel:
                    torch.save(model.module.state_dict(),
                               ckpt_save_path + "best_ckpt.pth")
                else:
                    torch.save(model.state_dict(),
                               ckpt_save_path + "best_ckpt.pth")
                logger.info("best checkpoint was saved")
                is_best = False

    # finalize the result row: [size, speed, final_planes, per-epoch metrics...]
    values.insert(1, format(global_avg_time.avg * 1000, '.2f'))
    sheet.append(values)
    wb.save(result_path)
    logger.info("training is end, time for per imgs is {} ms".format(
        global_avg_time.avg * 1000))
def train():
    """Train a CDNet model for person re-identification.

    Builds dataloaders, model, optimizer, scheduler and loss from the
    module-level ``cfg``, logs parameter/FLOPs statistics, runs the epoch
    loop with periodic logging, R1/mAP evaluation and checkpointing, and
    appends per-checkpoint results to an ``*_result.xlsx`` workbook.

    Side effects: creates the checkpoint directory, writes ``checkpoint_*.pth``
    / ``best_ckpt.pth`` and the result workbook, logs via "CDNet.train".
    """
    # 1. make dataloader
    train_loader, val_loader, num_query, num_class = make_data_loader(cfg)
    # 2. make model
    model = build_model(cfg, num_class)
    # 3. make optimizer
    optimizer = make_optimizer(cfg, model)
    # 4. make lr scheduler
    scheduler = make_lr_scheduler(cfg, optimizer)
    # 5. make loss
    loss_fn = make_loss(cfg, num_class)

    # pull the hyper-parameters out of cfg once
    device = cfg.MODEL.DEVICE
    use_gpu = device == "cuda"
    pretrained = cfg.MODEL.PRETRAIN_PATH != ""
    parallel = cfg.MODEL.PARALLEL
    log_period = cfg.OUTPUT.LOG_PERIOD
    ckpt_period = cfg.OUTPUT.CKPT_PERIOD
    eval_period = cfg.OUTPUT.EVAL_PERIOD
    output_dir = cfg.OUTPUT.DIRS
    ckpt_save_path = output_dir + cfg.OUTPUT.CKPT_DIRS
    epochs = cfg.SOLVER.MAX_EPOCHS
    batch_size = cfg.SOLVER.BATCH_SIZE
    grad_clip = cfg.SOLVER.GRAD_CLIP
    batch_num = len(train_loader)
    log_iters = batch_num // log_period
    if not os.path.exists(ckpt_save_path):
        os.makedirs(ckpt_save_path)

    # create *_result.xlsx to persist the results for later analysis
    name = (cfg.OUTPUT.LOG_NAME).split(".")[0] + ".xlsx"
    result_path = cfg.OUTPUT.DIRS + name
    wb = xl.Workbook()
    sheet = wb.worksheets[0]
    titles = [
        'size/M', 'speed/ms', 'final_planes', 'acc', 'mAP', 'r1', 'r5', 'r10',
        'loss', 'acc', 'mAP', 'r1', 'r5', 'r10', 'loss', 'acc', 'mAP', 'r1',
        'r5', 'r10', 'loss'
    ]
    sheet.append(titles)
    # epochs at which a (acc, mAP, r1, r5, r10, loss) group is recorded
    check_epochs = [40, 80, 120, 160, 200, 240, 280, 320, 360, epochs]
    values = []

    logger = logging.getLogger("CDNet.train")
    size = count_parameters(model)
    values.append(format(size, '.2f'))
    values.append(model.final_planes)
    logger.info("the param number of the model is {:.2f} M".format(size))
    infer_size = infer_count_parameters(model)
    logger.info(
        "the infer param number of the model is {:.2f}M".format(infer_size))
    # FLOPs are measured on a single image of the configured size
    shape = [1, 3]
    shape.extend(cfg.DATA.IMAGE_SIZE)
    flops, _ = get_model_infos(model, shape)
    logger.info(
        "the total flops number of the model is {:.2f} M".format(flops))
    logger.info("Starting Training CDNetwork")

    best_mAP, best_r1 = 0., 0.
    is_best = False
    avg_loss, avg_acc = RunningAverageMeter(), RunningAverageMeter()
    avg_time, global_avg_time = AverageMeter(), AverageMeter()
    if parallel:
        model = nn.DataParallel(model)
    if use_gpu:
        model = model.to(device)

    for epoch in range(epochs):
        scheduler.step()
        lr = scheduler.get_lr()[0]
        # if checkpoint saved epoch k, resume by running epoch k+1 next
        if pretrained and epoch < model.start_epoch:
            continue

        # reset the per-epoch records
        model.train()
        avg_loss.reset()
        avg_acc.reset()
        avg_time.reset()
        for i, batch in enumerate(train_loader):
            t0 = time.time()
            imgs, labels = batch
            if use_gpu:
                imgs = imgs.to(device)
                labels = labels.to(device)
            res = model(imgs)
            loss, acc = compute_loss_acc(res, labels, loss_fn)
            loss.backward()
            if grad_clip != 0:
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            optimizer.zero_grad()
            t1 = time.time()
            avg_time.update((t1 - t0) / batch_size)
            avg_loss.update(loss)
            avg_acc.update(acc)
            # periodic progress log
            if (i + 1) % log_iters == 0:
                logger.info(
                    "epoch {}: {}/{} with loss is {:.5f} and acc is {:.3f}".
                    format(epoch + 1, i + 1, batch_num, avg_loss.avg,
                           avg_acc.avg))
        logger.info(
            "end epochs {}/{} with lr: {:.5f} and avg_time is: {:.3f} ms".
            format(epoch + 1, epochs, lr, avg_time.avg * 1000))
        global_avg_time.update(avg_time.avg)

        # test the model
        if (epoch + 1) % eval_period == 0 or (epoch + 1) in check_epochs:
            model.eval()
            metrics = R1_mAP(num_query, use_gpu=use_gpu)
            with torch.no_grad():
                for vi, batch in enumerate(val_loader):
                    imgs, labels, camids = batch
                    if use_gpu:
                        imgs = imgs.to(device)
                    feats = model(imgs)
                    metrics.update((feats, labels, camids))
            # compute cmc and mAP
            cmc, mAP = metrics.compute()
            logger.info("validation results at epoch {}".format(epoch + 1))
            # FIX: was "{:2%}" (field width 2, full float precision);
            # intended 2-decimal percent like the sibling trainer.
            logger.info("mAP:{:.2%}".format(mAP))
            for r in [1, 5, 10]:
                logger.info("CMC curve, Rank-{:<3}:{:.2%}".format(
                    r, cmc[r - 1]))

            # determine whether the current model is the best so far
            if mAP > best_mAP:
                is_best = True
                best_mAP = mAP
                logger.info("Get a new best mAP")
            if cmc[0] > best_r1:
                is_best = True
                best_r1 = cmc[0]
                logger.info("Get a new best r1")

            # add the result to the sheet at the check epochs
            if (epoch + 1) in check_epochs:
                val = [avg_acc.avg, mAP, cmc[0], cmc[4], cmc[9]]
                change = [format(v * 100, '.2f') for v in val]
                change.append(format(avg_loss.avg, '.3f'))
                values.extend(change)

        # whether to save the model
        # FIX: unwrap nn.DataParallel before saving (as the sibling trainers
        # do), otherwise the saved state_dict keys are prefixed with "module."
        if (epoch + 1) % ckpt_period == 0 or is_best:
            if parallel:
                torch.save(
                    model.module.state_dict(),
                    ckpt_save_path + "checkpoint_{}.pth".format(epoch + 1))
            else:
                torch.save(
                    model.state_dict(),
                    ckpt_save_path + "checkpoint_{}.pth".format(epoch + 1))
            logger.info("checkpoint {} was saved".format(epoch + 1))
            if is_best:
                if parallel:
                    torch.save(model.module.state_dict(),
                               ckpt_save_path + "best_ckpt.pth")
                else:
                    torch.save(model.state_dict(),
                               ckpt_save_path + "best_ckpt.pth")
                logger.info("best_checkpoint was saved")
                is_best = False

    # finalize the result row: [size, speed, final_planes, metrics..., infer_size]
    values.insert(1, format(global_avg_time.avg * 1000, '.2f'))
    values.append(format(infer_size, '.2f'))
    sheet.append(values)
    wb.save(result_path)
    logger.info("best_mAP:{:.2%}, best_r1:{:.2%}".format(best_mAP, best_r1))
    logger.info("Ending training CDNetwork")