def __init__(self, model, ema, device, cfg, train_loader, val_loader, logger):
    self.config = cfg
    self.epoch = 0
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.base_dir = f'{self.config.OUTPUT_DIR}'
    if not os.path.exists(self.base_dir):
        os.makedirs(self.base_dir)
    self.logger = logger
    self.best_final_loss = 9999.0
    self.model = model
    self.device = device
    self.model.to(self.device)
    self.ema = ema.to(self.device)
    # `reduce`/`size_average` are deprecated; reduction='mean' is the equivalent
    self.loss = torch.nn.MSELoss(reduction='mean')
    self.optimizer = make_optimizer(cfg, model)
    self.scheduler = make_scheduler(cfg, self.optimizer, train_loader)
    self.logger.info(f'Fitter prepared. Device is {self.device}')
    self.early_stop_epochs = 0
    self.early_stop_patience = self.config.SOLVER.EARLY_STOP_PATIENCE
    self.do_scheduler = True
    self.logger.info("Start training")
def __init__(self, model, device, cfg, train_loader, val_loader, logger):
    self.config = cfg
    self.epoch = 0
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.base_dir = f'{self.config.OUTPUT_DIR}'
    if not os.path.exists(self.base_dir):
        os.makedirs(self.base_dir)
    self.logger = logger
    self.best_final_score = 0.0
    self.best_score_threshold = 0.5
    self.model = model
    self.device = device
    self.model.to(self.device)
    self.optimizer = make_optimizer(cfg, model)
    self.scheduler = make_scheduler(cfg, self.optimizer, train_loader)
    self.logger.info(f'Fitter prepared. Device is {self.device}')
    self.all_predictions = []
    self.early_stop_epochs = 0
    self.early_stop_patience = self.config.SOLVER.EARLY_STOP_PATIENCE
    self.do_scheduler = True
    self.logger.info("Start training")
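The `make_optimizer(cfg, model)` factory used throughout these snippets is not shown. A minimal sketch of the common cfg-driven pattern is below; the `cfg.SOLVER.*` keys and the choice of SGD are assumptions, not the actual helper.

# Hypothetical sketch of a cfg-driven optimizer factory: per-parameter lr and
# weight decay read from cfg.SOLVER, with a separate lr factor and decay for biases.
import torch

def make_optimizer(cfg, model):
    params = []
    for key, value in model.named_parameters():
        if not value.requires_grad:
            continue
        lr = cfg.SOLVER.BASE_LR
        weight_decay = cfg.SOLVER.WEIGHT_DECAY
        if "bias" in key:
            lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
            weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
        params.append({"params": [value], "lr": lr, "weight_decay": weight_decay})
    return torch.optim.SGD(params, lr=cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM)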
def test_something(self):
    net = nn.Linear(10, 10)
    optimizer = make_optimizer(cfg, net)
    lr_scheduler = WarmupMultiStepLR(optimizer, [20, 40], warmup_iters=10)
    # NOTE: PyTorch >= 1.1 expects optimizer.step() before lr_scheduler.step();
    # the original order is kept here since the test only prints the schedule.
    for i in range(50):
        lr_scheduler.step()
        for j in range(3):
            print(i, lr_scheduler.get_lr()[0])
            optimizer.step()
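For reference, this is roughly what a WarmupMultiStepLR-style scheduler looks like: multi-step decay at the given milestones with a linear warmup ramp over the first `warmup_iters` steps. The sketch below is an assumption about the class exercised by the test, not its exact implementation.

# Minimal sketch of a warmup + milestone-decay scheduler (assumed behavior).
from bisect import bisect_right
import torch

class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
    def __init__(self, optimizer, milestones, gamma=0.1, warmup_factor=1.0 / 3,
                 warmup_iters=500, last_epoch=-1):
        self.milestones = sorted(milestones)
        self.gamma = gamma
        self.warmup_factor = warmup_factor
        self.warmup_iters = warmup_iters
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        warmup_factor = 1.0
        if self.last_epoch < self.warmup_iters:
            # linear ramp from warmup_factor to 1.0
            alpha = self.last_epoch / self.warmup_iters
            warmup_factor = self.warmup_factor * (1 - alpha) + alpha
        return [base_lr * warmup_factor *
                self.gamma ** bisect_right(self.milestones, self.last_epoch)
                for base_lr in self.base_lrs]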
def __init__(self, cfg, logger, writer):
    self.cfg, self.logger, self.writer = cfg, logger, writer
    # Define dataloader
    self.tng_dataloader, self.val_dataloader, self.num_classes, self.num_query = get_dataloader(cfg)
    # networks
    self.model = build_model(cfg, self.num_classes)
    # loss function
    self.ce_loss = nn.CrossEntropyLoss()
    self.triplet = TripletLoss(cfg.SOLVER.MARGIN)
    # optimizer and scheduler
    self.opt = make_optimizer(self.cfg, self.model)
    self.lr_sched = make_lr_scheduler(self.cfg, self.opt)
    self._construct()
def train(cfg):
    model = build_model(cfg)
    data_rows_num = get_data_rows_num(cfg)
    k_fold = KFold(n_splits=10, shuffle=True, random_state=1)
    n_fold = 1
    for train_idx, val_idx in k_fold.split([i for i in range(1, data_rows_num)]):
        # NOTE: the model is built once outside the loop, so weights carry over
        # between folds; only the optimizer is re-created per fold.
        optimizer = make_optimizer(cfg, model)
        train_loader = make_data_loader(cfg, train_idx, is_train=True)
        val_loader = make_data_loader(cfg, val_idx, is_train=True)
        loss_functions = [bce_with_logits_loss, bce_with_logits_loss]
        do_train(cfg, model, train_loader, val_loader, optimizer, loss_functions, n_fold)
        n_fold += 1
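The fold-aware `make_data_loader(cfg, indices, is_train=...)` above is not shown. A plausible sketch is to wrap the full dataset in a `torch.utils.data.Subset` restricted to the fold's indices; `build_dataset` and the cfg keys below are assumptions.

# Hypothetical sketch of a fold-aware loader factory.
from torch.utils.data import DataLoader, Subset

def make_data_loader(cfg, indices, is_train=True):
    dataset = build_dataset(cfg, is_train=is_train)   # assumed dataset builder
    subset = Subset(dataset, list(indices))           # restrict to this fold's rows
    return DataLoader(subset,
                      batch_size=cfg.SOLVER.IMS_PER_BATCH,
                      shuffle=is_train,
                      num_workers=cfg.DATALOADER.NUM_WORKERS)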
def __init__(self, cfg, logger, writer):
    self.cfg, self.logger, self.writer = cfg, logger, writer
    # Define dataloader
    self.tng_dataloader, self.val_dataloader_collection, self.num_classes, self.num_query_len_collection = get_dataloader(cfg)
    if 'InsDis' in list(self.cfg.SOLVER.LOSSTYPE):
        self.tng_dataloader, self.val_dataloader, self.num_classes, self.num_query = get_ins_dataloader(cfg)
    # networks
    self.model = build_model(cfg, self.num_classes, use_mask=False)
    # loss function
    self.ce_loss = nn.CrossEntropyLoss()
    self.triplet = TripletLoss(cfg.SOLVER.MARGIN)
    self.NCEContrast = NCEAverage(128, len(self.tng_dataloader) * cfg.SOLVER.IMS_PER_BATCH,
                                  cfg.NCE.K, cfg.NCE.T, cfg.NCE.M)
    self.NCELoss = NCECriterion(len(self.tng_dataloader))
    # optimizer and scheduler
    self.opt = make_optimizer(self.cfg, self.model)
    self.lr_sched = make_lr_scheduler(self.cfg, self.opt)
    self._construct()
def __init__(self, cfg):
    self.cfg = cfg.clone()
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    self.logger = setup_logger("deformconv RCNN", 'workspace/logger', 0)
    self.logger.info("Using {} GPUs".format(num_gpus))
    self.logger.info("Collecting env info (might take some time)")
    self.logger.info("\n" + collect_env_info())
    self.logger.info("Running with config:\n{}".format(cfg))
    self.device = torch.device(cfg.MODEL.DEVICE)
    self.model = DeformConvRCNN(cfg).to(self.device)
    # load pretrained weights for the backbone stem and first stage
    [*self.model.backbone.modules()][1].stem.load_state_dict(
        torch.load(cfg.MODEL.BACKBONE.PRETRAINED_STEM_WEIGHTS))
    [*self.model.backbone.modules()][1].layer1.load_state_dict(
        torch.load(cfg.MODEL.BACKBONE.PRETRAINED_LAYER1_WEIGHTS))
    self.train_loader = make_data_loader(cfg, is_train=True)
    self.val_loader = make_data_loader(cfg, is_train=False)[0]
    remove_empty_target(self.val_loader.dataset)
    self.optimizer = make_optimizer(cfg, self.model)
    self.writer = SummaryWriter(cfg.WRITER_DIR)
    self.predictor = Predictor(cfg, self.model,
                               confidence_threshold=cfg.SOLVER.CONF_THRES,
                               min_image_size=cfg.TEST.MIN_IMG_SIZE)
    self.predictor.model.roi_heads.box.post_processor.detections_per_img = 20
    self.step = 0
    self.milestones = cfg.SOLVER.STEPS
    self.workspace = Path(cfg.WORKSPACE)
    # iterations per epoch, divided by how many times per epoch each action should run
    iters_per_epoch = len(self.train_loader.dataset) // cfg.SOLVER.IMS_PER_BATCH
    self.board_loss_every = iters_per_epoch // cfg.SOLVER.BOARD_LOSS_INTERVAL
    self.evaluate_every = iters_per_epoch // cfg.SOLVER.EVALUATE_INTERVAL
    self.save_every = iters_per_epoch // cfg.SOLVER.SAVE_INTERVAL
    self.board_pred_image_every = iters_per_epoch // cfg.SOLVER.BOARD_IMAGE_INTERVAL
    self.inference_every = iters_per_epoch // cfg.SOLVER.INFERENCE_INTERVAL
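The training loop that consumes these `*_every` counters is not shown. A hedged sketch of how such counters are typically used is below; `evaluate` and `save_checkpoint` are assumed helpers, not methods from this trainer.

# Hypothetical use of the interval counters computed above.
def _on_batch_end(self, loss):
    if self.step % self.board_loss_every == 0:
        self.writer.add_scalar('train/loss', loss.item(), self.step)
    if self.step % self.evaluate_every == 0:
        self.evaluate()          # assumed helper
    if self.step % self.save_every == 0:
        self.save_checkpoint()   # assumed helper
    self.step += 1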
def __init__(self, cfg, logger, writer):
    self.cfg, self.logger, self.writer = cfg, logger, writer
    # Define dataloader
    self.tng_dataloader, self.val_dataloader, self.num_classes, self.num_query = get_dataloader(cfg)
    # networks
    self.model = build_model(cfg, self.num_classes)
    self.base_type = self.model.base_type
    # loss function
    if cfg.SOLVER.LABEL_SMOOTH:
        self.ce_loss = CrossEntropyLabelSmooth(self.num_classes)
    else:
        self.ce_loss = nn.CrossEntropyLoss()
    self.triplet = TripletLoss(cfg.SOLVER.MARGIN)
    self.aligned_triplet = TripletLossAlignedReID(margin=cfg.SOLVER.MARGIN)
    self.of_penalty = OFPenalty(beta=1e-6, penalty_position=['intermediate'])
    # optimizer and scheduler
    self.opt = make_optimizer(self.cfg, self.model)
    self.lr_sched = make_lr_scheduler(self.cfg, self.opt)
    self._construct()
def __init__(self, cfg, logger, writer):
    self.cfg, self.logger, self.writer = cfg, logger, writer
    # Define dataloader
    self.tng_dataloader, self.val_dataloader, self.num_classes, self.num_query = get_dataloader(cfg)
    logger.info('num_classes ' + str(self.num_classes))
    # networks
    self.model = build_model(cfg, self.num_classes)
    # self.base_type = self.model.base_type
    # loss function: label smoothing is always used here
    # if cfg.SOLVER.LABEL_SMOOTH:
    self.ce_loss = CrossEntropyLabelSmooth(self.num_classes)
    # else:
    #     self.ce_loss = nn.CrossEntropyLoss()
    self.triplet = TripletLoss(cfg.SOLVER.MARGIN)
    # optimizer and scheduler
    self.opt = make_optimizer(self.cfg, self.model)
    self.lr_sched = make_lr_scheduler(self.cfg, self.opt)
    self._construct()
def __init__(self, cfg, logger, writer):
    self.cfg, self.logger, self.writer = cfg, logger, writer
    # Define dataloader
    self.tng_dataloader, self.val_dataloader_collection, self.num_classes, self.num_query_len_collection = get_dataloader_mask(cfg)
    # networks
    self.use_part_erasing = False
    self.num_parts = cfg.MODEL.NUM_PARTS
    self.model = build_model_selfgcn(cfg, self.num_classes)
    self.adj = torch.from_numpy(coarse_adj_npy).float()
    # loss function
    self.ce_loss = nn.CrossEntropyLoss()
    self.triplet = TripletLoss(cfg.SOLVER.MARGIN)
    self.mse_loss = nn.MSELoss()
    # optimizer and scheduler
    self.opt = make_optimizer(self.cfg, self.model)
    self.lr_sched = make_lr_scheduler(self.cfg, self.opt)
    self.loss_weight = [1.0, 1.0, 0.5, 0.5, 0.5, 0.5, 0.4]
    self.logger.info(
        f"Loss weights: {self.loss_weight}, use_pe: {self.use_part_erasing}, use_bnfeat: {True}")
    self._construct()
def train(cfg, local_rank, distributed):
    logger = logging.getLogger(cfg.NAME)
    # build model
    model = build_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    # build solver
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)
    if distributed:
        model = DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )
    arguments = {"iteration": 0}
    save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME)
    save_to_disk = get_rank() == 0
    checkpointer = Checkpointer(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        save_dir=save_dir,
        save_to_disk=save_to_disk,
        logger=logger,
    )
    extra_checkpoint_data = checkpointer.load(cfg.CHECKPOINTER.LOAD_NAME)
    arguments.update(extra_checkpoint_data)
    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    evaluate = cfg.SOLVER.EVALUATE
    if evaluate:
        synchronize()
        data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed,
                                           is_for_period=True)
        synchronize()
    else:
        data_loader_val = None
    if cfg.SUMMARY_WRITER and save_to_disk:
        summary_writer = make_summary_writer(cfg.SUMMARY_WRITER, save_dir,
                                             model_name=cfg.MODEL.NAME)
    else:
        summary_writer = None
    do_train(
        cfg,
        model,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        arguments,
        summary_writer,
    )
    return model
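The `Checkpointer` used above (and in the iteration-based loop further down) is not shown. A minimal sketch consistent with how it is called here, bundling model, optimizer, and scheduler state and returning any extra saved data from `load`, follows; the file layout is an assumption.

# Minimal sketch of a Checkpointer with the interface used above.
import os
import torch

class Checkpointer:
    def __init__(self, model, optimizer=None, scheduler=None, save_dir="",
                 save_to_disk=True, logger=None):
        self.model, self.optimizer, self.scheduler = model, optimizer, scheduler
        self.save_dir, self.save_to_disk, self.logger = save_dir, save_to_disk, logger

    def save(self, name, **extra):
        if not self.save_to_disk:
            return
        data = {"model": self.model.state_dict(), **extra}
        if self.optimizer is not None:
            data["optimizer"] = self.optimizer.state_dict()
        if self.scheduler is not None:
            data["scheduler"] = self.scheduler.state_dict()
        torch.save(data, os.path.join(self.save_dir, f"{name}.pth"))

    def load(self, name):
        path = os.path.join(self.save_dir, f"{name}.pth")
        if not os.path.exists(path):
            return {}   # nothing to resume from
        data = torch.load(path, map_location="cpu")
        self.model.load_state_dict(data.pop("model"))
        if self.optimizer is not None and "optimizer" in data:
            self.optimizer.load_state_dict(data.pop("optimizer"))
        if self.scheduler is not None and "scheduler" in data:
            self.scheduler.load_state_dict(data.pop("scheduler"))
        return data     # e.g. {"iteration": ...}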
def test_optimizer(self):
    model = build_model(cfg)
    optimizer, lr_schedule = make_optimizer(cfg, model)
    from IPython import embed
    embed()
def train(cfg, args):
    train_set = DatasetCatalog.get(cfg.DATASETS.TRAIN, args)
    val_set = DatasetCatalog.get(cfg.DATASETS.VAL, args)
    train_loader = DataLoader(train_set, cfg.SOLVER.IMS_PER_BATCH,
                              num_workers=cfg.DATALOADER.NUM_WORKERS, shuffle=True)
    val_loader = DataLoader(val_set, cfg.SOLVER.IMS_PER_BATCH,
                            num_workers=cfg.DATALOADER.NUM_WORKERS, shuffle=True)
    gpu_ids = [_ for _ in range(torch.cuda.device_count())]
    model = build_model(cfg)
    model.to("cuda")
    model = torch.nn.parallel.DataParallel(model, gpu_ids) if not args.debug else model
    logger = logging.getLogger("train_logger")
    logger.info("Start training")
    train_metrics = MetricLogger(delimiter=" ")
    max_iter = cfg.SOLVER.MAX_ITER
    output_dir = cfg.OUTPUT_DIR
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)
    checkpointer = Checkpointer(model, optimizer, scheduler, output_dir, logger)
    start_iteration = checkpointer.load() if not args.debug else 0
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    validation_period = cfg.SOLVER.VALIDATION_PERIOD
    summary_writer = SummaryWriter(log_dir=os.path.join(output_dir, "summary"))
    visualizer = train_set.visualizer(cfg.VISUALIZATION)(summary_writer)
    model.train()
    start_training_time = time.time()
    last_batch_time = time.time()
    for iteration, inputs in enumerate(cycle(train_loader), start_iteration):
        data_time = time.time() - last_batch_time
        iteration = iteration + 1
        scheduler.step()
        inputs = to_cuda(inputs)
        outputs = model(inputs)
        loss_dict = gather_loss_dict(outputs)
        loss = loss_dict["loss"]
        train_metrics.update(**loss_dict)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_time = time.time() - last_batch_time
        last_batch_time = time.time()
        train_metrics.update(time=batch_time, data=data_time)
        eta_seconds = train_metrics.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                train_metrics.delimiter.join([
                    "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}",
                    "max mem: {memory:.0f}"
                ]).format(eta=eta_string,
                          iter=iteration,
                          meters=str(train_metrics),
                          lr=optimizer.param_groups[0]["lr"],
                          memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0))
            summary_writer.add_scalars("train", train_metrics.mean, iteration)
        if iteration % 100 == 0:
            visualizer.visualize(inputs, outputs, iteration)
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration))
        if iteration % validation_period == 0:
            with torch.no_grad():
                val_metrics = MetricLogger(delimiter=" ")
                for i, inputs in enumerate(val_loader):
                    data_time = time.time() - last_batch_time
                    inputs = to_cuda(inputs)
                    outputs = model(inputs)
                    loss_dict = gather_loss_dict(outputs)
                    val_metrics.update(**loss_dict)
                    batch_time = time.time() - last_batch_time
                    last_batch_time = time.time()
                    val_metrics.update(time=batch_time, data=data_time)
                    if i % 20 == 0 or i == cfg.SOLVER.VALIDATION_LIMIT:
                        logger.info(
                            val_metrics.delimiter.join([
                                "VALIDATION", "eta: {eta}", "iter: {iter}", "{meters}"
                            ]).format(eta=eta_string, iter=iteration, meters=str(val_metrics)))
                    if i == cfg.SOLVER.VALIDATION_LIMIT:
                        summary_writer.add_scalars("val", val_metrics.mean, iteration)
                        break
        if iteration == max_iter:
            break
    checkpointer.save("model_{:07d}".format(max_iter))
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
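The `cycle(train_loader)` above needs to re-iterate the DataLoader each epoch; `itertools.cycle` would cache and replay the first epoch's batches, so a small generator like the following is the usual choice (a sketch of the assumed helper, not necessarily the project's own):

# Restart the DataLoader whenever it is exhausted so shuffling yields fresh batches.
def cycle(loader):
    while True:
        for batch in loader:
            yield batch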
def configure_optimizers(self):
    opt_fns = make_optimizer(self.cfg, self.model)
    lr_sched = make_lr_scheduler(self.cfg, opt_fns)
    return [opt_fns], [lr_sched]
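This `configure_optimizers` follows the PyTorch Lightning convention of returning `([optimizers], [schedulers])`. A hedged sketch of a minimal module carrying this hook is below; the class name, the loss convention in `training_step`, and the reuse of the cfg-driven helpers are assumptions.

# Hypothetical minimal LightningModule around the hook above.
import pytorch_lightning as pl

class ReIDModule(pl.LightningModule):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.model = build_model(cfg)

    def training_step(self, batch, batch_idx):
        images, targets = batch
        loss = self.model(images, targets)   # assumes the model returns a loss in train mode
        return loss

    def configure_optimizers(self):
        opt = make_optimizer(self.cfg, self.model)
        sched = make_lr_scheduler(self.cfg, opt)
        return [opt], [sched]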
def test_optimizer(self):
    model = build_model(cfg)
    optimizer = make_optimizer(cfg, model)
    from IPython import embed
    embed()