def main(cfg):
    # build model, optimizer and scheduler
    model = make_model(cfg)
    model = model.to(cfg.DEVICE)
    if os.path.isfile(cfg.CKPT_DIR):
        model.load_state_dict(torch.load(cfg.CKPT_DIR))
        print(colored('Loaded checkpoint:{}'.format(cfg.CKPT_DIR), 'blue', 'on_green'))
    else:
        print(colored('cfg.CKPT_DIR is not a file: {}'.format(cfg.CKPT_DIR), 'green', 'on_red'))

    if cfg.USE_WANDB:
        logger = Logger("MPED_RNN", cfg, project=cfg.PROJECT, viz_backend="wandb")
    else:
        logger = logging.Logger("MPED_RNN")

    # get dataloaders
    test_dataloader = make_dataloader(cfg, 'test')

    if hasattr(logger, 'run_id'):
        run_id = logger.run_id
    else:
        run_id = 'no_wandb'

    _, _, inference = build_engine(cfg)
    inference(cfg, 0, model, test_dataloader, cfg.DEVICE,
              logger=logger, eval_kde_nll=True, test_mode=True)
def either_dataloader_dataset_to_both(data: Union[DataLoader, Dataset], *,
                                      batch_size=None, eval=False, **kwargs):
    if isinstance(data, DataLoader):
        dataloader = data
        dataset = data.dataset
    elif isinstance(data, Dataset):
        dataset = data
        dl_kwargs = {}
        if eval:
            dl_kwargs.update(dict(batch_size=1000, shuffle=False, drop_last=False))
        else:
            dl_kwargs.update(dict(batch_size=128, shuffle=True))
        if batch_size is not None:
            dl_kwargs["batch_size"] = batch_size
        dl_kwargs.update(kwargs)
        dataloader = datasets.make_dataloader(data, **dl_kwargs)
    else:
        raise NotImplementedError()
    return dataloader, dataset
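A minimal, hedged usage sketch for the helper above; `ToyDataset` is a hypothetical stand-in and the Dataset branch only runs inside the original project (it goes through `datasets.make_dataloader`), so it is left commented out.

import torch
from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    # Hypothetical stand-in dataset: eight scalar samples.
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        return torch.tensor([float(idx)])

# Assuming either_dataloader_dataset_to_both (defined above) is in scope:
# the DataLoader branch simply returns the loader and its underlying dataset.
dataset = ToyDataset()
loader = DataLoader(dataset, batch_size=4, shuffle=False)
same_loader, same_dataset = either_dataloader_dataset_to_both(loader)
assert same_loader is loader and same_dataset is dataset

# The Dataset branch builds a new loader via the project-specific
# datasets.make_dataloader, e.g.:
# eval_loader, _ = either_dataloader_dataset_to_both(dataset, eval=True)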
def train():
    logger.info('Initializing....')
    cudnn.enabled = True
    cudnn.benchmark = True
    # torch.manual_seed(1)
    # torch.cuda.manual_seed(1)
    write_config_into_log(cfg)

    logger.info('Building model......')
    if cfg.pretrained:
        model = make_model(cfg)
        model.load_param(cfg)
        logger.info('Loaded pretrained model from {0}'.format(cfg.pretrained))
    else:
        model = make_model(cfg)

    model.cuda()
    model = torch.nn.DataParallel(model)

    optimizer = torch.optim.Adam(
        [{'params': model.module.base.parameters(), 'lr': cfg.get_lr(0)[0]},
         {'params': model.module.classifiers.parameters(), 'lr': cfg.get_lr(0)[1]}],
        weight_decay=cfg.weight_decay)

    celoss = nn.CrossEntropyLoss().cuda()
    softloss = SoftLoss()
    sp_kd_loss = SP_KD_Loss()
    criterions = [celoss, softloss, sp_kd_loss]

    train_loader, val_loader = make_dataloader(cfg)

    logger.info('Begin training......')
    for epoch in range(cfg.start_epoch, cfg.max_epoch):
        train_one_epoch(train_loader, val_loader, model, criterions, optimizer, epoch, cfg)
        total_acc = test(cfg, val_loader, model, epoch)

        with open(cfg.test_log, 'a+') as f:
            f.write('Epoch {0}: Acc is {1:.4f}\n'.format(epoch, total_acc))
        torch.save(obj=model.state_dict(),
                   f=os.path.join(cfg.snapshot_dir,
                                  'ep{}_acc{:.4f}.pth'.format(epoch, total_acc)))
        logger.info('Model saved')
def __init__(self, cfg):
    self.cfg = cfg

    model = RRNet(cfg).cuda(cfg.Distributed.gpu_id)
    model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

    self.optimizer = optim.Adam(model.parameters(), lr=cfg.Train.lr)
    self.lr_sch = optim.lr_scheduler.MultiStepLR(
        self.optimizer, milestones=cfg.Train.lr_milestones, gamma=0.1)

    self.training_loader, self.validation_loader = make_dataloader(
        cfg, collate_fn='rrnet')

    super(RRNetOperator, self).__init__(cfg=self.cfg, model=model, lr_sch=self.lr_sch)

    # TODO: change it to our class
    self.hm_focal_loss = FocalLossHM()
    self.l1_loss = RegL1Loss()

    self.main_proc_flag = cfg.Distributed.gpu_id == 0
def __init__(self, cfg):
    self.cfg = cfg
    # print(self.cfg.Val.threshold)

    model = CenterNet(cfg).cuda(cfg.Distributed.gpu_id)

    self.optimizer = optim.Adam(model.parameters(), lr=cfg.Train.lr)
    self.lr_sch = optim.lr_scheduler.MultiStepLR(
        self.optimizer, milestones=cfg.Train.lr_milestones, gamma=0.1)

    self.training_loader, self.validation_loader = make_dataloader(
        cfg, collate_fn='ctnet')

    super(CenterNetOperator, self).__init__(cfg=self.cfg, model=model, lr_sch=self.lr_sch)

    # TODO: change it to our class
    self.focal_loss = FocalLossHM()
    self.l1_loss = RegL1Loss()

    self.main_proc_flag = cfg.Distributed.gpu_id == 0
def __init__(self, cfg):
    self.cfg = cfg

    model = RetinaNet(cfg).cuda(cfg.Distributed.gpu_id)

    self.optimizer = optim.Adam(model.parameters(), lr=cfg.Train.lr)
    self.lr_sch = optim.lr_scheduler.MultiStepLR(
        self.optimizer, milestones=cfg.Train.lr_milestones, gamma=0.1)

    self.training_loader, self.validation_loader = make_dataloader(cfg)

    super(RetinaNetOperator, self).__init__(cfg=self.cfg, model=model, lr_sch=self.lr_sch)

    self.anchor_maker = Anchors(sizes=(16, 64, 128))
    self.anchors, self.anchors_widths, self.anchors_heights, self.anchors_ctr_x, self.anchors_ctr_y = \
        self.make_anchor(cfg.Train.crop_size)

    self.focal_loss = FocalLoss()

    self.main_proc_flag = cfg.Distributed.gpu_id == 0
import os
import sys

from config.cfg import Cfg
from torch.backends import cudnn

sys.path.append('.')
from datasets import make_dataloader
from processor import do_inference
from model import make_model
from utils.logger import setup_logger

if __name__ == "__main__":
    # with open('cfg_test.json') as f:
    #     cfg = json.load(f)
    Cfg.freeze()
    log_dir = Cfg.DATALOADER.LOG_DIR
    logger = setup_logger('Extract Feats', log_dir)
    logger.info("Running with config:\n{}".format(Cfg))

    os.environ['CUDA_VISIBLE_DEVICES'] = Cfg.MODEL.DEVICE_ID
    cudnn.benchmark = True

    val_loader = make_dataloader(Cfg)
    model = make_model(Cfg, 255)
    model.load_param(Cfg.TEST.WEIGHT)

    do_inference(Cfg, model, val_loader)
if output_dir and not os.path.exists(output_dir):
    os.makedirs(output_dir)

logger = setup_logger("reid_baseline", output_dir, if_train=True)
logger.info("Saving model in the path :{}".format(cfg.OUTPUT_DIR))
logger.info(args)

if args.config_file != "":
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, 'r') as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
logger.info("Running with config:\n{}".format(cfg))

os.environ['CUDA_VISIBLE_DEVICES'] = cfg.MODEL.DEVICE_ID
train_loader, val_loader, num_query, num_classes = make_dataloader(cfg)

if cfg.MODEL.PRETRAIN_CHOICE == 'finetune':
    model = make_model(cfg, num_class=num_classes)
    model.load_param_finetune(cfg.MODEL.PRETRAIN_PATH)
    print('Loading pretrained model for finetuning......')
else:
    model = make_model(cfg, num_class=num_classes)

loss_func, center_criterion = make_loss(cfg, num_classes=num_classes)
optimizer, optimizer_center = make_optimizer(cfg, model, center_criterion)
scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS, cfg.SOLVER.GAMMA,
                              cfg.SOLVER.WARMUP_FACTOR, cfg.SOLVER.WARMUP_EPOCHS,
                              cfg.SOLVER.WARMUP_METHOD)
def main():
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument('--gpu', default='0', type=str)
    parser.add_argument(
        "--config_file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    # build model, optimizer and scheduler
    model = make_model(cfg)
    model = model.to(cfg.DEVICE)
    optimizer = build_optimizer(cfg, model)
    print('optimizer built!')
    # NOTE: add separate optimizers to train single object predictor and interaction predictor

    if cfg.USE_WANDB:
        logger = Logger("FOL", cfg, project=cfg.PROJECT, viz_backend="wandb")
    else:
        logger = logging.Logger("FOL")

    dataloader_params = {
        "batch_size": cfg.SOLVER.BATCH_SIZE,
        "shuffle": True,
        "num_workers": cfg.DATALOADER.NUM_WORKERS
    }

    # get dataloaders
    train_dataloader = make_dataloader(cfg, 'train')
    val_dataloader = make_dataloader(cfg, 'val')
    test_dataloader = make_dataloader(cfg, 'test')
    print('Dataloader built!')

    # get train_val_test engines
    do_train, do_val, inference = build_engine(cfg)
    print('Training engine built!')

    if hasattr(logger, 'run_id'):
        run_id = logger.run_id
    else:
        run_id = 'no_wandb'
    save_checkpoint_dir = os.path.join(cfg.CKPT_DIR, run_id)
    if not os.path.exists(save_checkpoint_dir):
        os.makedirs(save_checkpoint_dir)

    # NOTE: hyperparameter scheduler
    model.param_scheduler = ParamScheduler()
    model.param_scheduler.create_new_scheduler(name='kld_weight',
                                               annealer=sigmoid_anneal,
                                               annealer_kws={
                                                   'device': cfg.DEVICE,
                                                   'start': 0,
                                                   'finish': 100.0,
                                                   'center_step': 400.0,
                                                   'steps_lo_to_hi': 100.0,
                                               })
    model.param_scheduler.create_new_scheduler(name='z_logit_clip',
                                               annealer=sigmoid_anneal,
                                               annealer_kws={
                                                   'device': cfg.DEVICE,
                                                   'start': 0.05,
                                                   'finish': 5.0,
                                                   'center_step': 300.0,
                                                   'steps_lo_to_hi': 300.0 / 5.
                                               })

    if cfg.SOLVER.scheduler == 'exp':
        # exponential schedule
        lr_scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=cfg.SOLVER.GAMMA)
    elif cfg.SOLVER.scheduler == 'plateau':
        # Plateau scheduler
        lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=5,
                                                            min_lr=1e-07, verbose=1)
    else:
        lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[25, 40], gamma=0.2)
    print('Schedulers built!')

    for epoch in range(cfg.SOLVER.MAX_EPOCH):
        logger.info("Epoch:{}".format(epoch))
        do_train(cfg, epoch, model, optimizer, train_dataloader, cfg.DEVICE,
                 logger=logger, lr_scheduler=lr_scheduler)
        val_loss = do_val(cfg, epoch, model, val_dataloader, cfg.DEVICE, logger=logger)
        if (epoch + 1) % 1 == 0:
            inference(cfg, epoch, model, test_dataloader, cfg.DEVICE,
                      logger=logger, eval_kde_nll=False)

        torch.save(
            model.state_dict(),
            os.path.join(save_checkpoint_dir, 'Epoch_{}.pth'.format(str(epoch).zfill(3))))

        # update LR
        if cfg.SOLVER.scheduler != 'exp':
            lr_scheduler.step(val_loss)
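A hedged sketch of what a sigmoid annealer like the one configured above might compute: the parameter ramps from `start` to `finish`, crossing the midpoint at `center_step`, with `steps_lo_to_hi` controlling how sharp the transition is. The exact `sigmoid_anneal` used by the project may differ in detail; the function name and formula below are assumptions for illustration.

import torch

def sigmoid_anneal_sketch(step, start, finish, center_step, steps_lo_to_hi):
    # Hypothetical annealer: smooth interpolation from `start` to `finish`.
    ratio = torch.sigmoid(torch.tensor((step - center_step) / steps_lo_to_hi))
    return start + (finish - start) * float(ratio)

# e.g. the kld_weight schedule configured above (0 -> 100, centred at step 400):
for step in (0, 200, 400, 600, 800):
    print(step, round(sigmoid_anneal_sketch(step, 0.0, 100.0, 400.0, 100.0), 2))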
if output_dir and not os.path.exists(output_dir):
    os.makedirs(output_dir)

logger = setup_logger("reid_baseline", output_dir, if_train=True)
logger.info("Saving model in the path :{}".format(cfg.OUTPUT_DIR))
logger.info(args)

if args.config_file != "":
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, 'r') as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
logger.info("Running with config:\n{}".format(cfg))

os.environ['CUDA_VISIBLE_DEVICES'] = cfg.MODEL.DEVICE_ID
train_loader, val_loader_green, val_loader_normal, num_query_green, num_query_normal, num_classes = make_dataloader(
    cfg)

if cfg.MODEL.PRETRAIN_CHOICE == 'finetune':
    model = make_model(cfg, num_class=num_classes)
    model.load_param_finetune(cfg.MODEL.PRETRAIN_PATH)
    print('Loading pretrained model for finetuning......')
else:
    model = make_model(cfg, num_class=num_classes)

loss_func, center_criterion = make_loss(cfg, num_classes=num_classes)
optimizer, optimizer_center = make_optimizer(cfg, model, center_criterion)

if cfg.SOLVER.TYPE == 'warmup':
    scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS, cfg.SOLVER.GAMMA,
                                  cfg.SOLVER.WARMUP_FACTOR,
from loss import make_loss
from processor import do_train

if __name__ == '__main__':
    Cfg.freeze()
    log_dir = Cfg.DATALOADER.LOG_DIR
    logger = setup_logger('pose-transfer-gan.train', log_dir)
    logger.info("Running with config:\n{}".format(Cfg))

    os.environ['CUDA_VISIBLE_DEVICES'] = Cfg.MODEL.DEVICE_ID
    cudnn.benchmark = True  # enable the inbuilt cudnn auto-tuner to find the best algorithm for your hardware

    train_loader, val_loader = make_dataloader(Cfg)
    model_G, model_Dip, model_Dii, model_D_reid = make_model(Cfg)

    optimizerG = make_optimizer(Cfg, model_G)
    optimizerDip = make_optimizer(Cfg, model_Dip)
    optimizerDii = make_optimizer(Cfg, model_Dii)

    schedulerG = WarmupMultiStepLR(optimizerG, Cfg.SOLVER.STEPS, Cfg.SOLVER.GAMMA,
                                   Cfg.SOLVER.WARMUP_FACTOR, Cfg.SOLVER.WARMUP_EPOCHS,
                                   Cfg.SOLVER.WARMUP_METHOD)
    schedulerDip = WarmupMultiStepLR(optimizerDip, Cfg.SOLVER.STEPS, Cfg.SOLVER.GAMMA,
                                     Cfg.SOLVER.WARMUP_FACTOR, Cfg.SOLVER.WARMUP_EPOCHS,
                                     Cfg.SOLVER.WARMUP_METHOD)
def do_train(cfg, model, center_criterion, train_loader, val_loader,
             optimizer, optimizer_center, scheduler, loss_fn, num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD

    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')
    print("torch.cuda.device_count()", torch.cuda.device_count())

    if device:
        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            print("Multi-GPU training")
            # model = DDP(model, delay_allreduce=True)  # must come after amp.initialize
            # model = nn.DataParallel(model)
            # model, optimizer = amp.initialize(model, optimizer, opt_level="O1")  # the letter O, not the digit zero
            torch.distributed.init_process_group(
                'gloo', init_method='file:///tmp/somefile', rank=0, world_size=1)
            # model = convert_syncbn_model(model)
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
            # model = DistributedDataParallel(model, delay_allreduce=True)
            # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
            model = nn.DataParallel(model)
            # model = convert_syncbn_model(model)
        else:
            print("Single-GPU training")
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
            model.to(device=0)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    # evaluator = R1_mAP_eval(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)

    # model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(cfg.MODEL.FROZEN))
    # model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    for epoch in range(1, epochs + 1):
        if epoch == 5:
            print("Switching to the balanced training data")
            # cfg.DATASETS.ROOT_DIR = '/home/lab3/bi/0716/Veri/ai_city/tools/mix_train_balance_flip.pkl'
            cfg.DATASETS.ROOT_DIR = 'datasets/mix_train_balance.pkl'
            train_loader, val_loader, num_query, num_classes = make_dataloader(cfg)
            # model.base._freeze_stages()

        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        # evaluator.reset()
        scheduler.step()
        model.train()
        # print(scheduler.get_lr()[0])

        for n_iter, (img, vid) in enumerate(tqdm(train_loader)):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)
            # grid mask
            # img = grid(img)
            # score, feat, score_f1, score_f2, score_f3, f4, f4_score = model(img, target)
            # score, feat, score_f1, score_f2, feat1, score_layer2 = model(img, target)
            score, feat, score_f1, score_f2, feat1 = model(img, target)
            # print(feat.shape)
            loss = loss_fn(score, feat, target, score_f1, score_f2, feat1)
            # loss = loss_fn(score, feat, target, score_f1, score_f2, feat1, score_layer2)

            if cfg.SOLVER.FP16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                    # scaled_loss.backward(retain_graph=True)
            else:
                loss.backward()
            # loss.backward()
            optimizer.step()

            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            acc = (score.max(1)[1] == target).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)
            # print(loss_meter.val)

            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg, scheduler.get_lr()[0]))

        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(
                model.state_dict(),
                os.path.join(cfg.OUTPUT_DIR,
                             cfg.MODEL.NAME + '_epoch{}.pth'.format(epoch)))

        if epoch == 10:
            reduce_model_dict = model.half().state_dict()
            del_keys = []
            for key in reduce_model_dict.keys():
                if 'class' in key or 'sub1' in key or 'sub2' in key or 'base.fc' in key:
                    del_keys.append(key)
            for key in del_keys:
                del reduce_model_dict[key]
            torch.save(
                reduce_model_dict,
                os.path.join(
                    cfg.OUTPUT_DIR,
                    cfg.MODEL.NAME + str(cfg.INPUT.SIZE_TRAIN[0]) + 'half.pth'))
logger = setup_logger("reid_baseline", output_dir, if_train=False) logger.info(args) if args.config_file != "": logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, 'r') as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) os.environ['CUDA_VISIBLE_DEVICES'] = cfg.MODEL.DEVICE_ID if cfg.TEST.FLIP_FEATS != 'on': train_loader, val_loader, num_query_normal, num_classes,val_loader_center,val_loader_lb,val_loader_rb,val_loader_rt,val_loader_lt = make_dataloader(cfg) val_loader_normal = [val_loader,val_loader_center,val_loader_lt,val_loader_rt,val_loader_lb,val_loader_rb] model = make_model(cfg, num_class=num_classes) model.load_param(cfg.TEST.WEIGHT) do_inference_multi(cfg, model, val_loader_normal, num_query_normal) else: train_loader, val_loader_normal, num_query_normal, num_classes = make_dataloader(cfg) model = make_model(cfg, num_class=num_classes) model.load_param(cfg.TEST.WEIGHT) do_inference(cfg,
# model_name = 'MixNet'
# model_path = ' '
model = Baseline(model='train', model_name=model_name, model_path=model_path)
# model.load_param('models/model_1_180000.pth')
model = model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# kd_id = 0
# kd_num = 7
# batch_size = 48
# instance_num = 1
train_data, val_data, trains, vals = make_dataloader(kd_id, kd_num)
train_loader = DataLoader(dataset=train_data, batch_size=batch_size,
                          sampler=RandomSampler(trains, batch_size, instance_num),
                          shuffle=False, num_workers=2, collate_fn=train_collate)
# train_loader = DataLoader(dataset=train_data, batch_size=48, shuffle=False,
#                           num_workers=2, collate_fn=train_collate)
val_loader = DataLoader(dataset=val_data, batch_size=64, shuffle=False,
                        num_workers=2, collate_fn=train_collate)

train_length = len(train_loader)
val_length = len(val_loader)
from datasets import make_dataloader
from configs.kmeans_config import Config
from tqdm import tqdm
import torch
from ext.kmeans.kmeans import lloyd

train_loader, val_loader = make_dataloader(Config)

all_w = []
all_h = []
all_d = []

with torch.no_grad():
    for i, batch in enumerate(tqdm(train_loader)):
        annos = batch[1][0]
        # d = (annos[:, 2].pow(2) + annos[:, 3].pow(2)).sqrt()
        all_w.append(annos[:, 3].clone())
        all_h.append(annos[:, 2].clone())

all_w = torch.cat(all_w).unsqueeze(1)
all_h = torch.cat(all_h).unsqueeze(1)

h_results = lloyd(all_h, 3)
print(h_results)  # 20.3807, 73.2261, 182.68274
w_results = lloyd(all_w, 3)
print(w_results)  # 21.9839, 63.8345, 155.8799
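For comparison, the same width/height clustering can be sketched with scikit-learn's KMeans in place of the project's `lloyd` helper; the random box sizes below are illustrative stand-ins for the annotation statistics gathered above, not the project's data.

import torch
from sklearn.cluster import KMeans

# Hypothetical box heights/widths standing in for the gathered annotations (shape [N, 1]).
all_h = (torch.rand(1000) * 180 + 10).unsqueeze(1)
all_w = (torch.rand(1000) * 150 + 10).unsqueeze(1)

# Three clusters per dimension, mirroring lloyd(all_h, 3) / lloyd(all_w, 3);
# the cluster centres play the role of the printed anchor sizes.
h_centres = KMeans(n_clusters=3, n_init=10).fit(all_h.numpy()).cluster_centers_
w_centres = KMeans(n_clusters=3, n_init=10).fit(all_w.numpy()).cluster_centers_
print(sorted(float(c) for c in h_centres.ravel()))
print(sorted(float(c) for c in w_centres.ravel()))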
if output_dir and not os.path.exists(output_dir):
    os.makedirs(output_dir)

logger = setup_logger("reid_baseline", output_dir, if_train=True)
logger.info("Saving model in the path :{}".format(cfg.OUTPUT_DIR))
logger.info(args)

if args.config_file != "":
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, 'r') as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
logger.info("Running with config:\n{}".format(cfg))

os.environ['CUDA_VISIBLE_DEVICES'] = cfg.MODEL.DEVICE_ID

# train_loader, val_loader_green, val_loader_normal, num_query_green, num_query_normal, num_classes = make_dataloader(cfg)
train_loader, num_classes = make_dataloader(cfg)
val_loader_green = None
num_query_green = None

if cfg.MODEL.PRETRAIN_CHOICE == 'finetune':
    model = make_model(cfg, num_class=num_classes)
    model.load_param_finetune(cfg.MODEL.PRETRAIN_PATH)
    print('Loading pretrained model for finetuning......')
    # print(model.base.state_dict()['conv1.weight'])
    for param in model.parameters():  # make every parameter trainable
        param.requires_grad = True
    print("Parameters unfrozen")
else:
    model = make_model(cfg, num_class=num_classes)

loss_func, center_criterion = make_loss(cfg, num_classes=num_classes)
def train():
    if args.local_rank == 0:
        logger.info('Initializing....')
    cudnn.enabled = True
    cudnn.benchmark = True
    # torch.manual_seed(1)
    # torch.cuda.manual_seed(1)

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.gpu = 0
    args.world_size = 1
    if args.distributed:
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    if args.local_rank == 0:
        write_config_into_log(cfg)

    if args.local_rank == 0:
        logger.info('Building model......')
    if cfg.pretrained:
        model = make_model(cfg)
        model.load_param(cfg)
        if args.local_rank == 0:
            logger.info('Loaded pretrained model from {0}'.format(cfg.pretrained))
    else:
        model = make_model(cfg)

    if args.sync_bn:
        if args.local_rank == 0:
            logging.info("using apex synced BN")
        model = convert_syncbn_model(model)

    model.cuda()

    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication with
        # computation in the backward pass.
        # delay_allreduce delays all communication to the end of the backward pass.
        model = DistributedDataParallel(model, delay_allreduce=True)
    else:
        model = torch.nn.DataParallel(model)

    optimizer = torch.optim.Adam(
        [{'params': model.module.base.parameters(), 'lr': cfg.get_lr(0)[0]},
         {'params': model.module.classifiers.parameters(), 'lr': cfg.get_lr(0)[1]}],
        weight_decay=cfg.weight_decay)

    celoss = nn.CrossEntropyLoss().cuda()
    softloss = SoftLoss()
    sp_kd_loss = SP_KD_Loss()
    criterions = [celoss, softloss, sp_kd_loss]

    cfg.batch_size = cfg.batch_size // args.world_size
    cfg.num_workers = cfg.num_workers // args.world_size
    train_loader, val_loader = make_dataloader(cfg)

    if args.local_rank == 0:
        logger.info('Begin training......')
    for epoch in range(cfg.start_epoch, cfg.max_epoch):
        train_one_epoch(train_loader, val_loader, model, criterions, optimizer, epoch, cfg)
        total_acc = test(cfg, val_loader, model, epoch)

        if args.local_rank == 0:
            with open(cfg.test_log, 'a+') as f:
                f.write('Epoch {0}: Acc is {1:.4f}\n'.format(epoch, total_acc))
            torch.save(obj=model.state_dict(),
                       f=os.path.join(cfg.snapshot_dir,
                                      'ep{}_acc{:.4f}.pth'.format(epoch, total_acc)))
            logger.info('Model saved')
if output_dir and not os.path.exists(output_dir):
    os.makedirs(output_dir)

logger = setup_logger("transreid", output_dir, if_train=False)
logger.info(args)

if args.config_file != "":
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, 'r') as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
logger.info("Running with config:\n{}".format(cfg))

os.environ['CUDA_VISIBLE_DEVICES'] = cfg.MODEL.DEVICE_ID

train_loader, train_loader_normal, val_loader, num_query, num_classes, camera_num, view_num = make_dataloader(
    cfg)

model = make_model(cfg, num_class=num_classes, camera_num=camera_num, view_num=view_num)
model.load_param(cfg.TEST.WEIGHT)

if cfg.DATASETS.NAMES == 'VehicleID':
    for trial in range(10):
        train_loader, train_loader_normal, val_loader, num_query, num_classes, camera_num, view_num = make_dataloader(
            cfg)
        rank_1, rank5 = do_inference(cfg, model, val_loader, num_query)
        if trial == 0:
            all_rank_1 = rank_1
            all_rank_5 = rank5
import os
import sys

from config.config import Configuration
import torch
from torch.backends import cudnn

sys.path.append('.')
from datasets import make_dataloader
from processor import do_inference
from model import make_model
from utils.logger import setup_logger

if __name__ == "__main__":
    Cfg = Configuration()
    log_dir = Cfg.LOG_DIR
    logger = setup_logger('{}.test'.format(Cfg.PROJECT_NAME), log_dir)

    os.environ['CUDA_VISIBLE_DEVICES'] = Cfg.DEVICE_ID
    cudnn.benchmark = True  # enable the inbuilt cudnn auto-tuner to find the best algorithm for your hardware

    train_loader, test_loader = make_dataloader(Cfg)
    model = make_model(Cfg)
    model.load_state_dict(torch.load(Cfg.TEST_WEIGHT))

    do_inference(Cfg, model, test_loader)