def train(args):
    if args.config_file != "":
        cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    shutil.copy(args.config_file, cfg.OUTPUT_DIR)

    num_gpus = torch.cuda.device_count()
    logger = setup_logger('reid_baseline', output_dir, 0)
    logger.info('Using {} GPUS'.format(num_gpus))
    logger.info(args)
    logger.info('Running with config:\n{}'.format(cfg))

    train_dl, val_dl, num_query, num_classes = make_dataloader(cfg, num_gpus)
    model = build_model(cfg, num_classes)
    # print(model)
    loss_func = make_loss(cfg, num_classes)

    trainer = BaseTrainer(cfg, model, train_dl, val_dl, loss_func,
                          num_query, num_gpus)
    for epoch in range(trainer.epochs):
        for batch in trainer.train_dl:
            trainer.step(batch)
            trainer.handle_new_batch()
        trainer.handle_new_epoch()
def test(cfg, args):
    # train_dataset = dataset.HandGraph(cfg.DATASET.ROOT,
    #                                   cfg.DATASET.TRAIN_SET,
    #                                   'png')
    # train_dataset.visualize_data()
    train_dataset = make_dataloader(cfg, is_train=True).dataset
    train_dataset.visualize_data()
def dataset():
    vocab = Vocab('data/vocab', 50000)
    dataloader = make_dataloader('data/conv_dev.jsonl', 32, vocab, 200, False, False)
    for batch in dataloader:
        for key in batch:
            if key != 'id':
                print(key, batch[key].size())
        break
def run(args):
    gpuids = tuple(map(int, args.gpus.split(",")))

    nnet = TasNet()
    trainer = SiSnrTrainer(nnet,
                           gpuid=gpuids,
                           checkpoint=args.checkpoint,
                           resume=args.resume,
                           **trainer_conf)

    train_loader = make_dataloader(
        train=True,
        # data_kwargs=train_data,
        batch_size=args.batch_size,
        chunk_size=chunk_size,
        num_workers=args.num_workers,
    )  # online=True, cone=False)
    dev_loader = make_dataloader(
        train=False,
        # data_kwargs=dev_data,
        batch_size=args.batch_size,
        chunk_size=chunk_size,
        num_workers=args.num_workers,
    )  # online=True, cone=False)

    # dataset = ConeData(dev_data['data_path'], num_spks)
    dataset = OnlineSimulationDataset(vctk_audio, ms_snsd, 48,
                                      simulation_config_test, truncator,
                                      "./test_cache", 50)
    fusion_list = []
    mix_list = []
    ref_list = []
    for i in range(len(dataset)):
        input = dataset.__getitem__(i)
        fusion_list.append(Prep(input))
        mix_list.append(input[0])
        ref_list.append(input[3])

    trainer.run(train_loader,
                dev_loader,
                num_epochs=args.epochs,
                fusion_list=fusion_list,
                mix_list=mix_list,
                ref_list=ref_list)
def run(args):
    gpuids = tuple(map(int, args.gpus.split(",")))

    nnet = ConvTasNet(**nnet_conf)
    trainer = SiSnrTrainer(nnet,
                           gpuid=gpuids,
                           checkpoint=args.checkpoint,
                           resume=args.resume,
                           **trainer_conf)

    for conf, fname in zip([nnet_conf, trainer_conf],
                           ["mdl.json", "trainer.json"]):
        dump_json(conf, args.checkpoint, fname)

    train_loader = make_dataloader(shuffle=True,
                                   data_kwargs=train_data,
                                   batch_size=args.batch_size,
                                   chunk_size=chunk_size)
    dev_loader = make_dataloader(shuffle=False,
                                 data_kwargs=dev_data,
                                 batch_size=args.batch_size,
                                 chunk_size=chunk_size)

    trainer.run(train_loader, dev_loader, num_epochs=args.epochs)
def main():
    output_dir = cfg.OUTPUT_DIR
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    num_gpus = torch.cuda.device_count()
    logger = setup_logger('reid_baseline', output_dir, 0)
    logger.info('Using {} GPUS'.format(num_gpus))
    logger.info('Running with config:\n{}'.format(cfg))

    train_dl, val_dl, num_query, num_classes = make_dataloader(cfg, num_gpus)
    model = build_model(cfg, num_classes)
    loss = make_loss(cfg, num_classes)

    trainer = BaseTrainer(cfg, model, train_dl, val_dl, loss,
                          num_query, num_gpus)
    for epoch in range(trainer.epochs):
        for batch in trainer.train_dl:
            trainer.step(batch)
            trainer.handle_new_batch()
        trainer.handle_new_epoch()
def main():
    parser = argparse.ArgumentParser(description="Baseline Training")
    parser.add_argument("--config_file", default="", help="path to config file", type=str)
    parser.add_argument("opts", help="Modify config options using the command-line",
                        default=None, nargs=argparse.REMAINDER)
    args = parser.parse_args()

    if args.config_file != "":
        cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    num_gpus = 0
    device = torch.device("cpu")
    if cfg.MODEL.DEVICE == 'cuda' and torch.cuda.is_available():
        num_gpus = len(cfg.MODEL.DEVICE_IDS) - 1
        device_ids = cfg.MODEL.DEVICE_IDS.strip("d")
        print(device_ids)
        device = torch.device("cuda:{0}".format(device_ids))

    logger = setup_logger('baseline', output_dir, 0)
    logger.info('Using {} GPUS'.format(num_gpus))
    logger.info('Running with config:\n{}'.format(cfg))

    train_dl, val_dl = make_dataloader(cfg, num_gpus)
    model = build_model(cfg)
    loss = make_loss(cfg, device)

    trainer = BaseTrainer(cfg, model, train_dl, val_dl, loss, num_gpus, device)
    logger.info(type(model))
    logger.info(loss)
    logger.info(trainer)

    for epoch in range(trainer.epochs):
        for batch in trainer.train_dl:
            trainer.step(batch)
            trainer.handle_new_batch()
        trainer.handle_new_epoch()
def predict_proba(self, x):
    '''
    x (list(sample(dict)))
    sample (dict): keys are 's1', 's2', 'label'
    '''
    dataloader = make_dataloader(x, 64, self.vocab, self.max_len, False, self.use_cuda)
    preds = []
    for batch in dataloader:
        keys = ('s1', 's1_len', 's1_mask', 's2', 's2_len')
        outputs = self.model(*get_vars(batch, *keys, use_cuda=self.use_cuda))  # (B*3)
        if self.activate:
            outputs = F.softmax(outputs, 1)
        else:
            outputs = F.log_softmax(outputs, 1)
        preds.extend(outputs.cpu().data.tolist())
    return np.asarray(preds)
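# Hedged usage sketch (not from the original project): `clf` stands in for an
# instance of whatever classifier class owns predict_proba above, and the two
# sentence pairs are made-up examples. Per the "(B*3)" comment above, the
# returned array has one row per sample and three class scores per row.
samples = [
    {'s1': 'a man is playing a guitar', 's2': 'a person plays music', 'label': 1},
    {'s1': 'a dog runs in the park', 's2': 'the cat is sleeping', 'label': 0},
]
probs = clf.predict_proba(samples)
print(probs.shape)  # expected: (2, 3)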
def train():
    trainloader, testloader = make_dataloader()
    # build model
    model = BasicModel()
    # loss func
    loss_func = nn.CrossEntropyLoss()
    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    # configuration
    epochs = 10

    # training
    for epoch in range(epochs):
        model.train()
        pbar = tqdm(trainloader)
        for image, label in pbar:
            # forward
            output = model(image)
            # compute loss
            loss = loss_func(output, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # compute batch accuracy
            predicts = torch.argmax(output, dim=-1)
            accu = torch.sum(predicts == label).float() / image.size(0)
            pbar.set_description('Epoch:[{:02d}]-Loss:{:.3f}-Accu:{:.3f}'
                                 .format(epoch + 1, loss.item(), accu.item()))

        # testing
        model.eval()
        with torch.no_grad():
            corrects = 0
            total_nums = 0
            for image, label in tqdm(testloader):
                output = model(image)
                predicts = torch.argmax(output, dim=-1)
                corrects += (predicts == label).sum()
                total_nums += label.size(0)
            test_accu = corrects.float() / total_nums
            print('Epoch:[{:02d}]-Test_Accu:{:.3f}'.format(
                epoch + 1, test_accu.item()))
def build_loader(vocab, hps):
    mode = hps.mode.replace('hypo', '')
    if mode == 'train':
        single_pass = False
        bsize = {'train': hps.batch_size, 'val': hps.batch_size}
    elif mode == 'val':
        single_pass = True
        bsize = {'val': hps.batch_size}
    elif mode == 'test':
        single_pass = True
        bsize = {'test': hps.batch_size}
    else:
        raise ValueError('Unknown mode: %s' % hps.mode)

    loader = {}
    args = (vocab, hps.max_steps, single_pass, hps.use_cuda)
    for key in bsize:
        dpath = path.join(hps.data_path, getattr(hps, key + '_data'))
        loader[key] = make_dataloader(dpath, bsize[key], *args)
    return loader
def main():
    parser = argparse.ArgumentParser(description="ReID Baseline Training")
    parser.add_argument("--config_file", default="", help="path to config file", type=str)
    parser.add_argument("opts", help="Modify config options using the command-line",
                        default=None, nargs=argparse.REMAINDER)
    args = parser.parse_args()

    if args.config_file != "":
        cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    num_gpus = torch.cuda.device_count()
    logger = setup_logger('reid_baseline', output_dir, 0)
    logger.info('Using {} GPUS'.format(num_gpus))
    logger.info('Running with config:\n{}'.format(cfg))

    train_dl, val_dl, num_query, num_classes = make_dataloader(cfg, num_gpus)
    model = build_model(cfg, num_classes)
    loss = make_loss(cfg, num_classes)

    trainer = SGDTrainer(cfg, model, train_dl, val_dl, loss,
                         num_query, num_gpus)
    logger.info('train transform: \n{}'.format(train_dl.dataset.transform))
    logger.info('valid transform: \n{}'.format(val_dl.dataset.transform))
    logger.info(type(model))
    logger.info(loss)
    logger.info(trainer)

    for epoch in range(trainer.epochs):
        for batch in trainer.train_dl:
            trainer.step(batch)
            trainer.handle_new_batch()
        trainer.handle_new_epoch()
def main(conf):
    '''
    train_set = TACDataset(conf["data"]["train_json"], conf["data"]["segment"], train=True)
    val_set = TACDataset(conf["data"]["dev_json"], conf["data"]["segment"], train=False)
    train_loader = DataLoader(
        train_set,
        shuffle=True,
        batch_size=conf["training"]["batch_size"],
        num_workers=conf["training"]["num_workers"],
        drop_last=True,
    )
    val_loader = DataLoader(
        val_set,
        shuffle=False,
        batch_size=conf["training"]["batch_size"],
        num_workers=conf["training"]["num_workers"],
        drop_last=True,
    )
    '''
    train_loader = make_dataloader(train=True,
                                   batch_size=conf['training']["batch_size"],
                                   chunk_size=conf['data']['chunk'],
                                   num_workers=conf['training']['num_workers'])
    val_loader = make_dataloader(train=False,
                                 batch_size=conf['training']['batch_size'],
                                 chunk_size=conf['data']['chunk'],
                                 num_workers=conf['training']['num_workers'])
    # Prep(train_loader)
    # Prep(val_loader)
    # for data in train_loader:
    #     print(type(data[0]))

    model = TasNet()
    # model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    # params = sum([np.prod(p.size()) for p in model_parameters])
    # print(params)
    # exit()
    optimizer = make_optimizer(model.parameters(), **conf["optim"])

    # Define scheduler
    if conf["training"]["half_lr"]:
        scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5,
                                      patience=conf["training"]["patience"])
    else:
        scheduler = None

    # Just after instantiating, save the args. Easy loading in the future.
    exp_dir = conf["main_args"]["exp_dir"]
    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, "conf.yml")
    with open(conf_path, "w") as outfile:
        yaml.safe_dump(conf, outfile)

    # Define loss function.
    loss_func = MSELoss()

    system = AngleSystem(
        model=model,
        loss_func=loss_func,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        scheduler=scheduler,
        config=conf,
    )

    # Define callbacks
    callbacks = []
    checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    checkpoint = ModelCheckpoint(
        checkpoint_dir,
        monitor="val_loss",
        mode="min",
        save_top_k=conf["training"]["save_top_k"],
        verbose=True,
    )
    callbacks.append(checkpoint)
    if conf["training"]["early_stop"]:
        callbacks.append(
            EarlyStopping(monitor="val_loss", mode="min",
                          patience=conf["training"]["patience"], verbose=True))

    # Don't ask for GPUs if they are not available.
    gpus = [-1]
    trainer = pl.Trainer(
        max_epochs=conf["training"]["epochs"],
        callbacks=callbacks,
        default_root_dir=exp_dir,
        # gpus=gpus,
        distributed_backend="ddp",
        gradient_clip_val=conf["training"]["gradient_clipping"],
    )
    trainer.fit(system)

    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f:
        json.dump(best_k, f, indent=0)

    state_dict = torch.load(checkpoint.best_model_path)
    system.load_state_dict(state_dict=state_dict["state_dict"])
    system.cpu()
    # to_save = system.model.serialize()
    # to_save.update(train_set.get_infos())
    torch.save(system.model.state_dict(),
               os.path.join(exp_dir, "best_model.ckpt"))
def main_worker(gpus, ngpus_per_node, args, final_output_dir, tb_log_dir): # cudnn related setting cudnn.benchmark = cfg.CUDNN.BENCHMARK torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED #os.environ['CUDA_VISIBLE_DEVICES']=gpus # Parallel setting print("Use GPU: {} for training".format(gpus)) update_config(cfg, args) #test(cfg, args) # logger setting logger, _ = setup_logger(final_output_dir, args.rank, 'train') writer_dict = { 'writer': SummaryWriter(log_dir=tb_log_dir), 'train_global_steps': 0, 'valid_global_steps': 0, } # model initilization model = eval(cfg.MODEL.NAME + '.get_pose_net')(cfg, is_train=True) # load pretrained model before DDP initialization checkpoint_file = os.path.join(final_output_dir, 'model_best.pth.tar') if cfg.AUTO_RESUME: if os.path.exists(checkpoint_file): checkpoint = torch.load(checkpoint_file, map_location='cpu') state_dict = checkpoint['state_dict'] for key in list(state_dict.keys()): new_key = key.replace("module.", "") state_dict[new_key] = state_dict.pop(key) model.load_state_dict(state_dict) logger.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_file, checkpoint['epoch'])) elif cfg.MODEL.HRNET_PRETRAINED: logger.info("=> loading a pretrained model '{}'".format( cfg.MODEL.PRETRAINED)) checkpoint = torch.load(cfg.MODEL.HRNET_PRETRAINED, map_location='cpu') state_dict = checkpoint['state_dict'] for key in list(state_dict.keys()): new_key = key.replace("module.", "") state_dict[new_key] = state_dict.pop(key) model.load_state_dict(state_dict) # copy model file this_dir = os.path.dirname(__file__) shutil.copy2( os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'), final_output_dir) # copy configuration file config_dir = args.cfg shutil.copy2(os.path.join(args.cfg), final_output_dir) # calculate GFLOPS dump_input = torch.rand( (1, 3, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[0])) logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE)) #ops, params = get_model_complexity_info( # model, (3, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[0]), # as_strings=True, print_per_layer_stat=True, verbose=True) # FP16 SETTING if cfg.FP16.ENABLED: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." if cfg.FP16.STATIC_LOSS_SCALE != 1.0: if not cfg.FP16.ENABLED: print( "Warning: if --fp16 is not used, static_loss_scale will be ignored." ) if cfg.FP16.ENABLED: model = network_to_half(model) if cfg.MODEL.SYNC_BN and not cfg.cfg.DISTRIBUTED: print( 'Warning: Sync BatchNorm is only supported in distributed training.' ) # Distributed Computing master = True if cfg.DISTRIBUTED: # This block is not available args.local_rank += int(gpus[0]) print('This process is using GPU', args.local_rank) device = args.local_rank master = device == int(gpus[0]) dist.init_process_group(backend='nccl') if cfg.MODEL.SYNC_BN: model = nn.SyncBatchNorm.convert_sync_batchnorm(model) # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. 
if gpus is not None: torch.cuda.set_device(device) model.cuda(device) # When using a single GPU per process and per # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have # workers = int(workers / ngpus_per_node) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[device], output_device=device, find_unused_parameters=True) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all # available GPUs if device_ids are not set model = torch.nn.parallel.DistributedDataParallel(model) else: # implement this block gpu_ids = eval('[' + gpus + ']') device = gpu_ids[0] print('This process is using GPU', str(device)) model = torch.nn.DataParallel(model, gpu_ids).cuda(device) # Prepare loss functions criterion = {} if cfg.LOSS.WITH_HEATMAP_LOSS: criterion['heatmap_loss'] = HeatmapLoss().cuda() if cfg.LOSS.WITH_POSE2D_LOSS: criterion['pose2d_loss'] = JointsMSELoss().cuda() if cfg.LOSS.WITH_BONE_LOSS: criterion['bone_loss'] = BoneLengthLoss().cuda() if cfg.LOSS.WITH_JOINTANGLE_LOSS: criterion['jointangle_loss'] = JointAngleLoss().cuda() best_perf = 1e9 best_model = False last_epoch = -1 # optimizer must be initilized after model initilization optimizer = get_optimizer(cfg, model) if cfg.FP16.ENABLED: optimizer = FP16_Optimizer( optimizer, static_loss_scale=cfg.FP16.STATIC_LOSS_SCALE, dynamic_loss_scale=cfg.FP16.DYNAMIC_LOSS_SCALE, verbose=False) begin_epoch = cfg.TRAIN.BEGIN_EPOCH if not cfg.AUTO_RESUME and cfg.MODEL.HRNET_PRETRAINED: optimizer.load_state_dict(checkpoint['optimizer']) if cfg.AUTO_RESUME and os.path.exists(checkpoint_file): begin_epoch = checkpoint['epoch'] best_perf = checkpoint['loss'] optimizer.load_state_dict(checkpoint['optimizer']) if 'train_global_steps' in checkpoint.keys() and \ 'valid_global_steps' in checkpoint.keys(): writer_dict['train_global_steps'] = checkpoint[ 'train_global_steps'] writer_dict['valid_global_steps'] = checkpoint[ 'valid_global_steps'] if cfg.FP16.ENABLED: logger.info("=> Using FP16 mode") lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer.optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR, last_epoch=begin_epoch) elif cfg.TRAIN.LR_SCHEDULE == 'warmup': from utils.utils import get_linear_schedule_with_warmup lr_scheduler = get_linear_schedule_with_warmup( optimizer=optimizer, num_warmup_steps=cfg.TRAIN.WARMUP_EPOCHS, num_training_steps=cfg.TRAIN.END_EPOCH - cfg.TRAIN.BEGIN_EPOCH, last_epoch=begin_epoch) elif cfg.TRAIN.LR_SCHEDULE == 'multi_step': lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR, last_epoch=begin_epoch) else: print('Unknown learning rate schedule!') exit() # Data loading code train_loader_dict = make_dataloader(cfg, is_train=True, distributed=cfg.DISTRIBUTED) valid_loader_dict = make_dataloader(cfg, is_train=False, distributed=cfg.DISTRIBUTED) for i, (dataset_name, train_loader) in enumerate(train_loader_dict.items()): logger.info( 'Training Loader {}/{}:\n'.format(i + 1, len(train_loader_dict)) + str(train_loader.dataset)) for i, (dataset_name, valid_loader) in enumerate(valid_loader_dict.items()): logger.info('Validation Loader {}/{}:\n'.format( i + 1, len(valid_loader_dict)) + str(valid_loader.dataset)) #writer_dict['writer'].add_graph(model, (dump_input, )) """ Start training """ start_time = time.time() with torch.autograd.set_detect_anomaly(True): for epoch in range(begin_epoch + 1, cfg.TRAIN.END_EPOCH + 1): epoch_start_time = time.time() # shuffle datasets with the 
sample random seed if cfg.DISTRIBUTED: for data_loader in train_loader_dict.values(): data_loader.sampler.set_epoch(epoch) # train for one epoch # get_last_lr() returns a list logger.info('Start training [{}/{}] lr: {:.4e}'.format( epoch, cfg.TRAIN.END_EPOCH - cfg.TRAIN.BEGIN_EPOCH, lr_scheduler.get_last_lr()[0])) train(cfg, args, master, train_loader_dict, model, criterion, optimizer, epoch, final_output_dir, tb_log_dir, writer_dict, logger, fp16=cfg.FP16.ENABLED, device=device) # In PyTorch 1.1.0 and later, you should call `lr_scheduler.step()` after `optimizer.step()`. lr_scheduler.step() # evaluate on validation set if not cfg.WITHOUT_EVAL: logger.info('Start evaluating [{}/{}]'.format( epoch, cfg.TRAIN.END_EPOCH - 1)) with torch.no_grad(): recorder = validate(cfg, args, master, valid_loader_dict, model, criterion, final_output_dir, tb_log_dir, writer_dict, logger, device=device) val_total_loss = recorder.avg_total_loss best_model = False if val_total_loss < best_perf: logger.info( 'This epoch yielded a better model with total loss {:.4f} < {:.4f}.' .format(val_total_loss, best_perf)) best_perf = val_total_loss best_model = True else: val_total_loss = 0 best_model = True if master: logger.info( '=> saving checkpoint to {}'.format(final_output_dir)) save_checkpoint( { 'epoch': epoch, 'model': cfg.EXP_NAME + '.' + cfg.MODEL.NAME, 'state_dict': model.state_dict(), 'loss': val_total_loss, 'optimizer': optimizer.state_dict(), 'train_global_steps': writer_dict['train_global_steps'], 'valid_global_steps': writer_dict['valid_global_steps'] }, best_model, final_output_dir) print('\nEpoch {} spent {:.2f} hours\n'.format( epoch, (time.time() - epoch_start_time) / 3600)) #if epoch == 3:break if master: final_model_state_file = os.path.join( final_output_dir, 'final_state{}.pth.tar'.format(gpus)) logger.info( '=> saving final model state to {}'.format(final_model_state_file)) torch.save(model.state_dict(), final_model_state_file) writer_dict['writer'].close() print( '\n[Training Accomplished] {} epochs spent {:.2f} hours\n'.format( cfg.TRAIN.END_EPOCH - begin_epoch + 1, (time.time() - start_time) / 3600))
def main_worker(gpus, ngpus_per_node, args, final_output_dir, tb_log_dir): # cudnn related setting cudnn.benchmark = cfg.CUDNN.BENCHMARK torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED #os.environ['CUDA_VISIBLE_DEVICES']=gpus # if len(gpus) == 1: # gpus = int(gpus) update_config(cfg, args) #test(cfg, args) # logger setting logger, _ = setup_logger(final_output_dir, args.rank, 'train') writer_dict = { 'writer': SummaryWriter(log_dir=tb_log_dir), 'train_global_steps': 0, 'valid_global_steps': 0, } # model initilization model = { "ransac": RANSACTriangulationNet, "alg": AlgebraicTriangulationNet, "vol": VolumetricTriangulationNet, "vol_CPM": VolumetricTriangulationNet_CPM, "FTL": FTLMultiviewNet }[cfg.MODEL.NAME](cfg) discriminator = Discriminator(cfg) # load pretrained model before DDP initialization if cfg.AUTO_RESUME: checkpoint_file = os.path.join(final_output_dir, 'model_best.pth.tar') if os.path.exists(checkpoint_file): checkpoint = torch.load(checkpoint_file, map_location=torch.device('cpu')) state_dict = checkpoint['state_dict'] D_state_dict = checkpoint['D_state_dict'] for key in list(state_dict.keys()): new_key = key.replace("module.", "") state_dict[new_key] = state_dict.pop(key) for key in list(D_state_dict.keys()): new_key = key.replace("module.", "") D_state_dict[new_key] = D_state_dict.pop(key) model.load_state_dict(state_dict) discriminator.load_state_dict(D_state_dict) logger.info("=> Loading checkpoint '{}' (epoch {})".format( checkpoint_file, checkpoint['epoch'])) else: print('[Warning] Checkpoint file not found! Wrong path: {}'.format( checkpoint_file)) elif cfg.MODEL.HRNET_PRETRAINED: logger.info("=> loading a pretrained model '{}'".format( cfg.MODEL.PRETRAINED)) checkpoint = torch.load(cfg.MODEL.HRNET_PRETRAINED) state_dict = checkpoint['state_dict'] for key in list(state_dict.keys()): new_key = key.replace("module.", "") state_dict[new_key] = state_dict.pop(key) model.load_state_dict(state_dict) # initiliaze a optimizer # optimizer must be initilized after model initilization if cfg.MODEL.TRIANGULATION_MODEL_NAME == "vol": optimizer = torch.optim.Adam([{ 'params': model.backbone.parameters(), 'initial_lr': cfg.TRAIN.LR }, { 'params': model.process_features.parameters(), 'initial_lr': cfg.TRAIN.PROCESS_FEATURE_LR if hasattr(cfg.TRAIN, "PROCESS_FEATURE_LR") else cfg.TRAIN.LR }, { 'params': model.volume_net.parameters(), 'initial_lr': cfg.TRAIN.VOLUME_NET_LR if hasattr(cfg.TRAIN, "VOLUME_NET_LR") else cfg.TRAIN.LR }], lr=cfg.TRAIN.LR) else: optimizer = torch.optim.Adam( [{ 'params': filter(lambda p: p.requires_grad, model.parameters()), 'initial_lr': cfg.TRAIN.LR }], lr=cfg.TRAIN.LR) D_optimizer = torch.optim.RMSprop([{ 'params': filter(lambda p: p.requires_grad, discriminator.parameters()), 'initial_lr': cfg.TRAIN.LR }], lr=cfg.TRAIN.LR) # copy model file this_dir = os.path.dirname(__file__) shutil.copy2(os.path.join(this_dir, '../lib/models', 'triangulation.py'), final_output_dir) # copy configuration file config_dir = args.cfg shutil.copy2(os.path.join(args.cfg), final_output_dir) # calculate GFLOPS # dump_input = torch.rand( # (1, 4, 3, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[0]) # ) # logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE)) # FP16 SETTING if cfg.FP16.ENABLED: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." 
if cfg.FP16.STATIC_LOSS_SCALE != 1.0: if not cfg.FP16.ENABLED: print( "Warning: if --fp16 is not used, static_loss_scale will be ignored." ) if cfg.FP16.ENABLED: model = network_to_half(model) if cfg.MODEL.SYNC_BN and not cfg.DISTRIBUTED: print( 'Warning: Sync BatchNorm is only supported in distributed training.' ) if cfg.FP16.ENABLED: optimizer = FP16_Optimizer( optimizer, static_loss_scale=cfg.FP16.STATIC_LOSS_SCALE, dynamic_loss_scale=cfg.FP16.DYNAMIC_LOSS_SCALE, verbose=False) # Distributed Computing master = True if cfg.DISTRIBUTED: # This block is not available args.local_rank += int(gpus[0]) print('This process is using GPU', args.local_rank) device = args.local_rank master = device == int(gpus[0]) dist.init_process_group(backend='nccl') if cfg.MODEL.SYNC_BN: model = nn.SyncBatchNorm.convert_sync_batchnorm(model) # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. if gpus is not None: torch.cuda.set_device(device) model.cuda(device) discriminator.cuda(device) # When using a single GPU per process and per # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have # workers = int(workers / ngpus_per_node) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[device], output_device=device, find_unused_parameters=True) discriminator = torch.nn.parallel.DistributedDataParallel( discriminator, device_ids=[device], output_device=device, find_unused_parameters=True) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all # available GPUs if device_ids are not set model = torch.nn.parallel.DistributedDataParallel(model) else: # implement this block gpu_ids = eval('[' + gpus + ']') device = gpu_ids[0] print('This process is using GPU', str(device)) model = torch.nn.DataParallel(model, gpu_ids).cuda(device) discriminator = torch.nn.DataParallel(discriminator, gpu_ids).cuda(device) # Prepare loss functions criterion = {} if cfg.LOSS.WITH_HEATMAP_LOSS: criterion['heatmap_loss'] = HeatmapLoss().cuda(device) if cfg.LOSS.WITH_POSE2D_LOSS: criterion['pose2d_loss'] = JointsMSELoss().cuda(device) if cfg.LOSS.WITH_POSE3D_LOSS: criterion['pose3d_loss'] = Joints3DMSELoss().cuda(device) if cfg.LOSS.WITH_VOLUMETRIC_CE_LOSS: criterion['volumetric_ce_loss'] = VolumetricCELoss().cuda(device) if cfg.LOSS.WITH_BONE_LOSS: criterion['bone_loss'] = BoneLengthLoss().cuda(device) if cfg.LOSS.WITH_TIME_CONSISTENCY_LOSS: criterion['time_consistency_loss'] = Joints3DMSELoss().cuda(device) if cfg.LOSS.WITH_KCS_LOSS: criterion['KCS_loss'] = None if cfg.LOSS.WITH_JOINTANGLE_LOSS: criterion['jointangle_loss'] = JointAngleLoss().cuda(device) best_perf = 1e9 best_model = False last_epoch = -1 # load history begin_epoch = cfg.TRAIN.BEGIN_EPOCH if cfg.AUTO_RESUME and os.path.exists(checkpoint_file): begin_epoch = checkpoint['epoch'] + 1 best_perf = checkpoint['loss'] optimizer.load_state_dict(checkpoint['optimizer']) D_optimizer.load_state_dict(checkpoint['D_optimizer']) if 'train_global_steps' in checkpoint.keys() and \ 'valid_global_steps' in checkpoint.keys(): writer_dict['train_global_steps'] = checkpoint[ 'train_global_steps'] writer_dict['valid_global_steps'] = checkpoint[ 'valid_global_steps'] # Floating point 16 mode if cfg.FP16.ENABLED: logger.info("=> Using FP16 mode") lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer.optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR, 
last_epoch=begin_epoch) else: lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR, last_epoch=begin_epoch) # Data loading code train_loader_dict = make_dataloader(cfg, is_train=True, distributed=cfg.DISTRIBUTED) valid_loader_dict = make_dataloader(cfg, is_train=False, distributed=cfg.DISTRIBUTED) for i, (dataset_name, train_loader) in enumerate(train_loader_dict.items()): logger.info( 'Training Loader {}/{}:\n'.format(i + 1, len(train_loader_dict)) + str(train_loader.dataset)) for i, (dataset_name, valid_loader) in enumerate(valid_loader_dict.items()): logger.info('Validation Loader {}/{}:\n'.format( i + 1, len(valid_loader_dict)) + str(valid_loader.dataset)) #writer_dict['writer'].add_graph(model, (dump_input, )) """ Start training """ start_time = time.time() with torch.autograd.set_detect_anomaly(True): for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH): epoch_start_time = time.time() # shuffle datasets with the sample random seed if cfg.DISTRIBUTED: for data_loader in train_loader_dict.values(): data_loader.sampler.set_epoch(epoch) # train for one epoch logger.info('Start training [{}/{}]'.format( epoch, cfg.TRAIN.END_EPOCH - 1)) train(epoch, cfg, args, master, train_loader_dict, [model, discriminator], criterion, [optimizer, D_optimizer], final_output_dir, tb_log_dir, writer_dict, logger, device, fp16=cfg.FP16.ENABLED) # In PyTorch 1.1.0 and later, you should call `lr_scheduler.step()` after `optimizer.step()`. lr_scheduler.step() # evaluate on validation set if not cfg.WITHOUT_EVAL: logger.info('Start evaluating [{}/{}]'.format( epoch, cfg.TRAIN.END_EPOCH - 1)) with torch.no_grad(): recorder = validate(cfg, args, master, valid_loader_dict, [model, discriminator], criterion, final_output_dir, tb_log_dir, writer_dict, logger, device) val_total_loss = recorder.avg_total_loss if val_total_loss < best_perf: logger.info( 'This epoch yielded a better model with total loss {:.4f} < {:.4f}.' .format(val_total_loss, best_perf)) best_perf = val_total_loss best_model = True else: best_model = False else: val_total_loss = 0 best_model = True logger.info('=> saving checkpoint to {}'.format(final_output_dir)) save_checkpoint( { 'epoch': epoch, 'model': cfg.EXP_NAME + '.' + cfg.MODEL.NAME, 'state_dict': model.state_dict(), 'D_state_dict': discriminator.state_dict(), 'loss': val_total_loss, 'optimizer': optimizer.state_dict(), 'D_optimizer': D_optimizer.state_dict(), 'train_global_steps': writer_dict['train_global_steps'], 'valid_global_steps': writer_dict['valid_global_steps'] }, best_model, final_output_dir) print('\nEpoch {} spent {:.2f} hours\n'.format( epoch, (time.time() - epoch_start_time) / 3600)) #if epoch == 3:break if master: final_model_state_file = os.path.join( final_output_dir, 'final_state{}.pth.tar'.format(gpus)) logger.info( '=> saving final model state to {}'.format(final_model_state_file)) torch.save(model.state_dict(), final_model_state_file) writer_dict['writer'].close() print( '\n[Training Accomplished] {} epochs spent {:.2f} hours\n'.format( cfg.TRAIN.END_EPOCH - begin_epoch + 1, (time.time() - start_time) / 3600))
dev_data = read_infile(args.dev_file)
tokenizer = AutoTokenizer.from_pretrained(args.model_name, return_token_type_ids=True)
model = AutoModel.from_pretrained(args.model_name)

train_dataset = make_dataset(tokenizer, train_data, pos_label=args.pos_label,
                             answer_field=args.answer_field,
                             first_key=args.first_sentence,
                             second_key=args.second_sentence,
                             device="cuda:0")
dev_dataset = make_dataset(tokenizer, dev_data, pos_label=args.pos_label,
                           answer_field=args.answer_field,
                           first_key=args.first_sentence,
                           second_key=args.second_sentence,
                           device="cuda:0")
train_dataloader = make_dataloader(train_dataset, batch_size=args.train_batch_size)
dev_dataloader = make_dataloader(dev_dataset, batch_size=args.dev_batch_size, shuffle=False)

if args.batch_size is None:
    args.batch_size = args.train_batch_size
if args.batch_size % args.train_batch_size != 0:
    raise ValueError("GPU batch size should divide batch size per update.")
batches_per_update = args.batch_size // args.train_batch_size

bert_classifier = BertClassifier(model, state_key="pooler_output", lr=args.lr,
                                 accumulate_gradients=batches_per_update).to("cuda:0")
best_score, best_weights = 0.0, None
if args.load_file:
    bert_classifier.load_state_dict(torch.load(args.load_file))
if args.train:
def test(args):
    if args.config_file != "":
        cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    logger = setup_logger('reid_baseline.eval', cfg.OUTPUT_DIR, 0, train=False)
    logger.info('Running with config:\n{}'.format(cfg))

    _, val_dl, num_query, num_classes = make_dataloader(cfg)

    model = build_model(cfg, num_classes)
    if cfg.TEST.MULTI_GPU:
        model = nn.DataParallel(model)
        model = convert_model(model)
        logger.info('Use multi gpu to inference')
    para_dict = torch.load(cfg.TEST.WEIGHT)
    model.load_state_dict(para_dict)
    model.cuda()
    model.eval()

    feats, pids, camids, paths = [], [], [], []
    with torch.no_grad():
        for batch in tqdm(val_dl, total=len(val_dl), leave=False):
            data, pid, camid, path = batch
            paths.extend(list(path))
            data = data.cuda()
            feat = model(data).detach().cpu()
            feats.append(feat)
            pids.append(pid)
            camids.append(camid)
    feats = torch.cat(feats, dim=0)
    pids = torch.cat(pids, dim=0)
    camids = torch.cat(camids, dim=0)

    query_feat = feats[:num_query]
    query_pid = pids[:num_query]
    query_camid = camids[:num_query]
    query_path = np.array(paths[:num_query])

    gallery_feat = feats[num_query:]
    gallery_pid = pids[num_query:]
    gallery_camid = camids[num_query:]
    gallery_path = np.array(paths[num_query:])

    distmat = euclidean_dist(query_feat, gallery_feat)
    cmc, mAP, all_AP = eval_func(distmat.numpy(), query_pid.numpy(),
                                 gallery_pid.numpy(), query_camid.numpy(),
                                 gallery_camid.numpy(), use_cython=True)

    if cfg.TEST.VIS:
        worst_q = np.argsort(all_AP)[:cfg.TEST.VIS_Q_NUM]
        qid = query_pid[worst_q]
        q_im = query_path[worst_q]
        ind = np.argsort(distmat, axis=1)
        gid = gallery_pid[ind[worst_q]][..., :cfg.TEST.VIS_G_NUM]
        g_im = gallery_path[ind[worst_q]][..., :cfg.TEST.VIS_G_NUM]
        for idx in range(cfg.TEST.VIS_Q_NUM):
            sid = qid[idx] == gid[idx]
            im = rank_list_to_im(range(len(g_im[idx])), sid, q_im[idx], g_im[idx])
            im.save(osp.join(cfg.OUTPUT_DIR,
                             'worst_query_{}.jpg'.format(str(idx).zfill(2))))

    logger.info('Validation Result:')
    for r in cfg.TEST.CMC:
        logger.info('CMC Rank-{}: {:.2%}'.format(r, cmc[r - 1]))
    logger.info('mAP: {:.2%}'.format(mAP))
    logger.info('-' * 20)

    if not cfg.TEST.RERANK:
        return

    distmat = re_rank(query_feat, gallery_feat)
    cmc, mAP, all_AP = eval_func(distmat, query_pid.numpy(),
                                 gallery_pid.numpy(), query_camid.numpy(),
                                 gallery_camid.numpy(), use_cython=True)
    logger.info('ReRanking Result:')
    for r in cfg.TEST.CMC:
        logger.info('CMC Rank-{}: {:.2%}'.format(r, cmc[r - 1]))
    logger.info('mAP: {:.2%}'.format(mAP))
    logger.info('-' * 20)
def get_train_dataloader():
    update_config(cfg)
    train_loader, sampler = make_dataloader(cfg, is_train=True, distributed=True)
    return train_loader, sampler
def main():
    args = get_args()

    # create teacher
    model_path = './pose_higher_hrnet_w32_512_2.pth'
    pre_train_model = PoseHigherResolutionNet(cfg)
    dev = 'cuda' if torch.cuda.is_available() else 'cpu'
    # load pretrained weights
    pre_train_model.load_state_dict(torch.load(model_path, torch.device(dev)))
    # freeze teacher
    for param in pre_train_model.parameters():
        param.requires_grad = False

    # student = PoseHigherResolutionNet(new_cfg)
    student_cfg = get_student_cfg(cfg, args.student_file)
    student_cfg.LOG_DIR = args.log
    student = PoseHigherResolutionNet(student_cfg)
    student = torch.nn.DataParallel(student)

    # Set up logger
    logger, final_output_dir, tb_log_dir = create_logger(
        student_cfg, 'simple_model', 'train')
    final_output_dir = student_cfg.LOG_DIR

    if torch.cuda.is_available():
        # cudnn related setting
        cudnn.benchmark = student_cfg.CUDNN.BENCHMARK
        torch.backends.cudnn.deterministic = student_cfg.CUDNN.DETERMINISTIC
        torch.backends.cudnn.enabled = student_cfg.CUDNN.ENABLED

    train_loader = make_dataloader(student_cfg, True, False)
    # iteration = 1
    loss_factory = MultiLossFactory(student_cfg).cuda()

    logger.info(train_loader.dataset)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    best_perf = -1
    best_model = False
    last_epoch = -1
    optimizer = optim.Adam(student.parameters(), lr=student_cfg.TRAIN.LR)

    begin_epoch = student_cfg.TRAIN.BEGIN_EPOCH
    end_epoch = student_cfg.TRAIN.END_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth.tar')

    if student_cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        student.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, student_cfg.TRAIN.LR_STEP, student_cfg.TRAIN.LR_FACTOR,
        last_epoch=last_epoch)

    pre_train_model.to(dev)
    student.to(dev)

    for epoch in range(begin_epoch, end_epoch):
        start = time.time()
        do_train(student_cfg, student, train_loader, loss_factory, optimizer,
                 epoch, final_output_dir, writer_dict, pre_train_model, dev)
        print('epoch', epoch, ':', round((time.time() - start) / 60, 2), 'minutes')

        # In PyTorch 1.1.0 and later, you should call `lr_scheduler.step()` after `optimizer.step()`.
        lr_scheduler.step()

        perf_indicator = epoch
        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': student_cfg.MODEL.NAME,
                'state_dict': student.state_dict(),
                'best_state_dict': student.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(
        final_output_dir,
        'final_state{}.pth.tar'.format(torch.cuda.get_device_name()))
    logger.info('saving final model state to {}'.format(final_model_state_file))
    torch.save(student.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
from loss import make_loss
from processor import do_train
from solver import make_optimizer, WarmupMultiStepLR
from utils.logger import setup_logger

if __name__ == '__main__':
    Cfg = Configuration()
    log_dir = Cfg.DATALOADER.LOG_DIR
    logger = setup_logger('{}'.format(Cfg.PROJECT_NAME), log_dir)
    logger.info("Running with config:\n{}".format(Cfg.PROJECT_NAME))

    os.environ['CUDA_VISIBLE_DEVICES'] = Cfg.DEVICE_ID
    # This flag allows you to enable the inbuilt cudnn auto-tuner to find the
    # best algorithm to use for your hardware.
    cudnn.benchmark = True

    train_loader, val_loader = make_dataloader(Cfg)
    model = make_model(Cfg)
    optimizer = make_optimizer(Cfg, model)
    scheduler = WarmupMultiStepLR(Cfg, optimizer)
    loss_func = make_loss(Cfg)

    do_train(
        Cfg,
        model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,  # modify for using self trained model
        loss_func,
    )
def start(data_file_name,
          num_noise_words,
          vec_dim,
          num_epochs,
          batch_size,
          lr,
          model_ver='dm',
          context_size=0,
          vec_combine_method='sum',
          save_all=True,
          generate_plot=True,
          max_generated_batches=5,
          num_workers=1):
    """Trains a new model. The latest checkpoint and the best performing
    model are saved in the *models* directory.

    Parameters
    ----------
    data_file_name: str
        Name of a file in the *data* directory.

    model_ver: str, one of ('dm', 'dbow'), default='dbow'
        Version of the model as proposed by Q. V. Le et al., Distributed
        Representations of Sentences and Documents. 'dbow' stands for
        Distributed Bag Of Words, 'dm' stands for Distributed Memory.

    vec_combine_method: str, one of ('sum', 'concat'), default='sum'
        Method for combining paragraph and word vectors when model_ver='dm'.
        Currently only the 'sum' operation is implemented.

    context_size: int, default=0
        Half the size of a neighbourhood of target words when model_ver='dm'
        (i.e. how many words left and right are regarded as context). When
        model_ver='dm', context_size has to be greater than 0; when
        model_ver='dbow', context_size has to be 0.

    num_noise_words: int
        Number of noise words to sample from the noise distribution.

    vec_dim: int
        Dimensionality of vectors to be learned (for paragraphs and words).

    num_epochs: int
        Number of iterations to train the model (i.e. number of times every
        example is seen during training).

    batch_size: int
        Number of examples per single gradient update.

    lr: float
        Learning rate of the Adam optimizer.

    save_all: bool, default=False
        Indicates whether a checkpoint is saved after each epoch. If false,
        only the best performing model is saved.

    generate_plot: bool, default=True
        Indicates whether a diagnostic plot displaying loss value over
        epochs is generated after each epoch.

    max_generated_batches: int, default=5
        Maximum number of pre-generated batches.

    num_workers: int, default=1
        Number of batch generator jobs to run in parallel. If value is set
        to -1, the number of machine cores is used.
    """
    if model_ver not in ('dm', 'dmspline', 'dbow'):
        raise ValueError("Invalid version of the model")

    model_ver_is_dbow = model_ver == 'dbow'
    model_ver_is_dm = model_ver == 'dm'
    model_ver_is_dmspline = model_ver == 'dmspline'

    if model_ver_is_dbow and context_size != 0:
        raise ValueError("Context size has to be zero when using dbow")
    if not model_ver_is_dbow:
        if vec_combine_method not in ('sum', 'concat'):
            raise ValueError("Invalid method for combining paragraph and word "
                             "vectors when using dm")
        if context_size <= 0:
            raise ValueError("Context size must be positive when using dm")

    # dataset = load_dataset(data_file_name, model_ver)
    # nce_data = NCEData(
    #     dataset,
    #     batch_size,
    #     context_size,
    #     num_noise_words,
    #     max_generated_batches,
    #     num_workers,
    #     model_ver)
    # nce_data.start()
    print('Loading data and making data loader ...')
    doc_ids, context_ids, target_noise_ids, word_to_ind_dict = load_and_cache_data(
        data_file_root=data_file_name,
        num_context_words=context_size,
        num_noise_words=num_noise_words)
    dataloader = make_dataloader((doc_ids, context_ids, target_noise_ids), batch_size)

    all_doc_ids = set()
    for i in doc_ids.tolist():
        all_doc_ids.add(i)
    print('num unique doc ids: ', len(all_doc_ids))

    try:
        _run(dataloader, data_file_name, all_doc_ids, word_to_ind_dict,
             context_size, num_noise_words, vec_dim, num_epochs, batch_size,
             lr, model_ver, vec_combine_method, save_all, generate_plot,
             model_ver_is_dbow, model_ver_is_dm)
    except KeyboardInterrupt:
        # The NCEData pipeline above is commented out, so there is no
        # background batch generator to stop here.
        pass
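# Hedged invocation sketch for start() above; the file name and hyperparameter
# values are illustrative assumptions, not values taken from the original
# project. Note that with model_ver='dm', context_size must be positive.
start(data_file_name='example_corpus.csv',
      num_noise_words=5,
      vec_dim=100,
      num_epochs=10,
      batch_size=32,
      lr=1e-3,
      model_ver='dm',
      context_size=4,
      vec_combine_method='sum')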
def main():
    parser = argparse.ArgumentParser(description="ReID Baseline Training")
    parser.add_argument("--config_file", default="", help="path to config file", type=str)
    parser.add_argument("opts", help="Modify config options using the command-line",
                        default=None, nargs=argparse.REMAINDER)
    args = parser.parse_args()

    if args.config_file != "":
        cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    num_gpus = torch.cuda.device_count()
    logger = setup_logger('reid_baseline', output_dir, 0)
    logger.info('Using {} GPUS'.format(num_gpus))
    logger.info('Running with config:\n{}'.format(cfg))

    if cfg.INPUT.SEPNORM.USE:
        train_dl, val_dl, num_query, num_classes = make_sepnorm_dataloader(cfg, num_gpus)
    elif cfg.DATASETS.EXEMPLAR.USE:
        train_dl, val_dl, num_query, num_classes, exemplar_dl = make_dataloader(cfg, num_gpus)
    else:
        train_dl, val_dl, num_query, num_classes = make_dataloader(cfg, num_gpus)

    model = build_model(cfg, num_classes)
    loss = make_loss(cfg, num_classes)

    if cfg.SOLVER.CENTER_LOSS.USE == True:
        trainer = CenterTrainer(cfg, model, train_dl, val_dl, loss,
                                num_query, num_gpus)
    else:
        if cfg.SOLVER.MIXUP.USE:
            trainer = NegMixupTrainer(cfg, model, train_dl, val_dl, loss,
                                      num_query, num_gpus)
        elif cfg.DATASETS.EXEMPLAR.USE:
            if cfg.DATASETS.EXEMPLAR.MEMORY.USE:
                trainer = ExemplarMemoryTrainer(cfg, model, train_dl, val_dl,
                                                exemplar_dl, loss, num_query, num_gpus)
            else:
                trainer = UIRLTrainer(cfg, model, train_dl, val_dl,
                                      exemplar_dl, loss, num_query, num_gpus)
        elif cfg.DATASETS.HIST_LABEL.USE:
            trainer = HistLabelTrainer(cfg, model, train_dl, val_dl, loss,
                                       num_query, num_gpus)
        else:
            trainer = BaseTrainer(cfg, model, train_dl, val_dl, loss,
                                  num_query, num_gpus)

    if cfg.INPUT.SEPNORM.USE:
        logger.info('train transform0: \n{}'.format(train_dl.dataset.transform0))
        logger.info('train transform1: \n{}'.format(train_dl.dataset.transform1))
        logger.info('valid transform0: \n{}'.format(val_dl.dataset.transform0))
        logger.info('valid transform1: \n{}'.format(val_dl.dataset.transform1))
    else:
        logger.info('train transform: \n{}'.format(train_dl.dataset.transform))
        logger.info('valid transform: \n{}'.format(val_dl.dataset.transform))

    logger.info(type(model))
    logger.info(loss)
    logger.info(trainer)

    for epoch in range(trainer.epochs):
        for batch in trainer.train_dl:
            trainer.step(batch)
            trainer.handle_new_batch()
        trainer.handle_new_epoch()
def main_worker( gpu, ngpus_per_node, args, final_output_dir, tb_log_dir ): # cudnn related setting cudnn.benchmark = cfg.CUDNN.BENCHMARK torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED args.gpu = gpu if args.gpu is not None: print("Use GPU: {} for training".format(args.gpu)) if args.distributed: if args.dist_url == "env://" and args.rank == -1: args.rank = int(os.environ["RANK"]) if cfg.MULTIPROCESSING_DISTRIBUTED: # For multiprocessing distributed training, rank needs to be the # global rank among all the processes # 通过节点序号来计算进程在所有进程之中的序号 args.rank = args.rank * ngpus_per_node + gpu print('Init process group: dist_url: {}, world_size: {}, rank: {}'. format(args.dist_url, args.world_size, args.rank)) dist.init_process_group( backend=cfg.DIST_BACKEND, init_method=args.dist_url, world_size=args.world_size, rank=args.rank ) update_config(cfg, args) # setup logger logger, _ = setup_logger(final_output_dir, args.rank, 'train') model = eval('models.'+cfg.MODEL.NAME+'.get_pose_net')( cfg, is_train=True ) # copy model file if not cfg.MULTIPROCESSING_DISTRIBUTED or ( cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0 ): this_dir = os.path.dirname(__file__) shutil.copy2( os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'), final_output_dir ) # 利用tensorboard可视化结果 writer_dict = { 'writer': SummaryWriter(logdir=tb_log_dir), 'train_global_steps': 0, 'valid_global_steps': 0, } if args.distributed: # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. if args.gpu is not None: torch.cuda.set_device(args.gpu) model.cuda(args.gpu) # When using a single GPU per process and per # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have # args.workers = int(args.workers / ngpus_per_node) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu] ) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all # available GPUs if device_ids are not set model = torch.nn.parallel.DistributedDataParallel(model) elif args.gpu is not None: torch.cuda.set_device(args.gpu) model = model.cuda(args.gpu) else: model = torch.nn.DataParallel(model).cuda() if not cfg.MULTIPROCESSING_DISTRIBUTED or ( cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0 ): dump_input = torch.rand( (1, 3, cfg.DATASET.INPUT_SIZE, cfg.DATASET.INPUT_SIZE) ).cuda() #writer_dict['writer'].add_graph(model, (dump_input, )) logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE)) # define loss function (criterion) and optimizer loss_factory = MultiLossFactory(cfg).cuda() # Data loading code train_loader = make_dataloader( cfg, is_train=True, distributed=args.distributed )
def main_worker( gpu, ngpus_per_node, args, final_output_dir, tb_log_dir ): # cudnn related setting cudnn.benchmark = cfg.CUDNN.BENCHMARK torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED if cfg.FP16.ENABLED: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." if cfg.FP16.STATIC_LOSS_SCALE != 1.0: if not cfg.FP16.ENABLED: print("Warning: if --fp16 is not used, static_loss_scale will be ignored.") args.gpu = gpu if args.gpu is not None: print("Use GPU: {} for training".format(args.gpu)) if args.distributed: if args.dist_url == "env://" and args.rank == -1: args.rank = int(os.environ["RANK"]) if cfg.MULTIPROCESSING_DISTRIBUTED: # For multiprocessing distributed training, rank needs to be the # global rank among all the processes args.rank = args.rank * ngpus_per_node + gpu print('Init process group: dist_url: {}, world_size: {}, rank: {}'. format(args.dist_url, args.world_size, args.rank)) dist.init_process_group( backend=cfg.DIST_BACKEND, init_method=args.dist_url, world_size=args.world_size, rank=args.rank ) update_config(cfg, args) # setup logger logger, _ = setup_logger(final_output_dir, args.rank, 'train') model = eval('models.'+cfg.MODEL.NAME+'.get_pose_net')( cfg, is_train=True ) # copy model file if not cfg.MULTIPROCESSING_DISTRIBUTED or ( cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0 ): this_dir = os.path.dirname(__file__) shutil.copy2( os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'), final_output_dir ) writer_dict = { 'writer': SummaryWriter(log_dir=tb_log_dir), 'train_global_steps': 0, 'valid_global_steps': 0, } if not cfg.MULTIPROCESSING_DISTRIBUTED or ( cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0 ): dump_input = torch.rand( (1, 3, cfg.DATASET.INPUT_SIZE, cfg.DATASET.INPUT_SIZE) ) writer_dict['writer'].add_graph(model, (dump_input, )) # logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE)) if cfg.FP16.ENABLED: model = network_to_half(model) if args.distributed: # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. 
if args.gpu is not None: torch.cuda.set_device(args.gpu) model.cuda(args.gpu) # When using a single GPU per process and per # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have # args.workers = int(args.workers / ngpus_per_node) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu] ) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all # available GPUs if device_ids are not set model = torch.nn.parallel.DistributedDataParallel(model) elif args.gpu is not None: torch.cuda.set_device(args.gpu) model = model.cuda(args.gpu) else: model = torch.nn.DataParallel(model).cuda() # define loss function (criterion) and optimizer loss_factory = MultiLossFactory(cfg).cuda() # Data loading code train_loader = make_dataloader( cfg, is_train=True, distributed=args.distributed ) logger.info(train_loader.dataset) best_perf = -1 best_model = False last_epoch = -1 optimizer = get_optimizer(cfg, model) if cfg.FP16.ENABLED: optimizer = FP16_Optimizer( optimizer, static_loss_scale=cfg.FP16.STATIC_LOSS_SCALE, dynamic_loss_scale=cfg.FP16.DYNAMIC_LOSS_SCALE ) begin_epoch = cfg.TRAIN.BEGIN_EPOCH checkpoint_file = os.path.join( final_output_dir, 'checkpoint.pth.tar') if cfg.AUTO_RESUME and os.path.exists(checkpoint_file): logger.info("=> loading checkpoint '{}'".format(checkpoint_file)) checkpoint = torch.load(checkpoint_file) begin_epoch = checkpoint['epoch'] best_perf = checkpoint['perf'] last_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_file, checkpoint['epoch'])) if cfg.FP16.ENABLED: lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer.optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR, last_epoch=last_epoch ) else: lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR, last_epoch=last_epoch ) for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH): lr_scheduler.step() # train one epoch do_train(cfg, model, train_loader, loss_factory, optimizer, epoch, final_output_dir, tb_log_dir, writer_dict, fp16=cfg.FP16.ENABLED) perf_indicator = epoch if perf_indicator >= best_perf: best_perf = perf_indicator best_model = True else: best_model = False if not cfg.MULTIPROCESSING_DISTRIBUTED or ( cfg.MULTIPROCESSING_DISTRIBUTED and args.rank == 0 ): logger.info('=> saving checkpoint to {}'.format(final_output_dir)) save_checkpoint({ 'epoch': epoch + 1, 'model': cfg.MODEL.NAME, 'state_dict': model.state_dict(), 'best_state_dict': model.module.state_dict(), 'perf': perf_indicator, 'optimizer': optimizer.state_dict(), }, best_model, final_output_dir) final_model_state_file = os.path.join( final_output_dir, 'final_state{}.pth.tar'.format(gpu) ) logger.info('saving final model state to {}'.format( final_model_state_file)) torch.save(model.module.state_dict(), final_model_state_file) writer_dict['writer'].close()
if args.config_file != "": cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() # print(cfg) dict_args = {} dict_args.update(vars(args)) print(pprint.pformat(dict_args)) output_dir = cfg.OUTPUT_DIR if output_dir and not os.path.exists(output_dir): os.makedirs(output_dir) num_gpus = torch.cuda.device_count() train_dl, val_dl, num_query, num_classes = make_dataloader(cfg, num_gpus) print("==> build model..") model = build_model(cfg, num_classes) print(model) print("==> load params..") param_dict = torch.load(cfg.TEST.WEIGHT) model = torch.nn.DataParallel(model) if cfg.SOLVER.SYNCBN: print("convert_model to syncbn") model = convert_model(model) # param_dict = {k.replace('module.', ''): v for k, v in param_dict.items()} print('unloaded_param:') print([ k for k, v in model.state_dict().items()