def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets, _ in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets_ = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets_)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return loss_value
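# Nearly every snippet in this collection leans on utils.reduce_dict to average
# the loss dict across distributed workers before logging. A minimal sketch of
# that helper, following the torchvision detection reference implementation --
# an illustration of what these snippets assume, not necessarily the exact
# utils module each repo imports:
import torch
import torch.distributed as dist

def reduce_dict(input_dict, average=True):
    """All-reduce the values of a dict of 0-dim tensors across processes."""
    world_size = dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = sorted(input_dict.keys())  # identical ordering on every rank
        values = torch.stack([input_dict[k] for k in names], dim=0)
        dist.all_reduce(values)            # sums over all ranks
        if average:
            values /= world_size
        return {k: v for k, v in zip(names, values)}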
def evaluate(model, data_loader, device, epoch, print_freq):
    # test overfitting: run the loss-returning forward pass (model left in
    # train mode) under no_grad and accumulate the summed loss
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Validation: [{}]'.format(epoch)
    sum_loss = []
    with torch.no_grad():
        for images, targets in metric_logger.log_every(data_loader, print_freq, header):
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            # loss in original paper
            # losses_reduced = loss_dict_reduced['loss_classifier'] + loss_dict_reduced['loss_box_reg']
            # losses = loss_dict['loss_classifier'] + loss_dict['loss_box_reg']

            if math.isfinite(losses.item()):
                sum_loss.append(losses.item())
            loss_value = losses_reduced.item()
            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)

            if device == 'cuda':
                torch.cuda.empty_cache()
            del images, targets, losses_reduced, losses, loss_dict, loss_dict_reduced

    return np.sum(sum_loss)
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        # loss in original paper
        # losses = loss_dict['loss_classifier'] + loss_dict['loss_box_reg']
        # losses_reduced = loss_dict_reduced['loss_classifier'] + loss_dict_reduced['loss_box_reg']

        loss_value = losses_reduced.item()

        optimizer.zero_grad()
        losses.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
        optimizer.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if device == 'cuda':
            torch.cuda.empty_cache()
        del images, targets, losses_reduced, losses, loss_dict, loss_dict_reduced
def trainer(train, model, optimizer):
    print("---------- Start Training ----------")
    trainloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True,
                                              num_workers=4, collate_fn=utils.collate_fn)
    try:
        with tqdm(trainloader, ncols=100) as pbar:
            train_loss = 0.0
            for images, targets in pbar:
                images = list(image.to(device) for image in images)
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

                loss_dict = model(images, targets)
                losses = sum(loss for loss in loss_dict.values())

                # reduce losses over all GPUs for logging purposes
                loss_dict_reduced = utils.reduce_dict(loss_dict)
                losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                loss_value = losses_reduced.item()

                if not math.isfinite(loss_value):
                    print("Loss is {}, stopping training".format(loss_value))
                    print(loss_dict_reduced)
                    sys.exit(1)

                optimizer.zero_grad()
                losses.backward()
                optimizer.step()

                train_loss += loss_value
            return train_loss
    except ValueError:
        pass
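# Detection loaders in these snippets pass utils.collate_fn (or an equivalent
# lambda, as in the snippet after this one) because images in a batch have
# different shapes and cannot be stacked into one tensor. The torchvision
# reference helper these snippets assume is just:

def collate_fn(batch):
    # turn [(img1, tgt1), (img2, tgt2), ...] into ((img1, img2, ...), (tgt1, tgt2, ...))
    return tuple(zip(*batch))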
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in data_loader:
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

    return losses
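# The warm-up scheduler built whenever epoch == 0 ramps the learning rate
# linearly from lr * warmup_factor up to lr over warmup_iters steps. A sketch
# following the torchvision detection reference -- again an illustration of the
# assumed utils helper, not necessarily the exact module imported here:
import torch

def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
    def f(x):
        # x is the number of scheduler.step() calls made so far
        if x >= warmup_iters:
            return 1
        alpha = float(x) / warmup_iters
        return warmup_factor * (1 - alpha) + alpha
    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)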
# setup dataloader
dataset = VOCDetection2007(root=voc_base_dir, image_set='train')
dataloader = DataLoader(dataset, 3, shuffle=True, num_workers=0,
                        collate_fn=lambda x: tuple(zip(*x)))

# model.train() --> model(imgs, targets) --> loss breakdown
images, targets = next(iter(dataloader))
print('images shape: {} \n\n'.format(images[0].shape))
images = list(image.to(device) for image in images)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

loss_dict = model(images, targets)
loss_dict_reduced = utils.reduce_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())  # gross sum of loss
loss_value = losses_reduced.item()

print('loss_dict.keys: {}'.format(loss_dict.keys()))
print('loss_dict: {}'.format(loss_dict))
print('loss classifier: {}'.format(loss_dict['loss_classifier'].cpu().tolist()))
print('losses_reduced: {}'.format(losses_reduced))
print('loss_value: {}\n\n'.format(loss_value))

# model.eval() --> model(imgs) --> model post-processed bbox predictions
model.eval()
preds = model(images)
print('preds[0].keys(): {}'.format(preds[0].keys()))
def evaluate(model, criterion, postprocessor, data_loader, base_ds, device, eval_bbox, eval_masks):
    model.eval()
    criterion.eval()

    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter("class_error", utils.SmoothedValue(window_size=1, fmt="{value:.2f}"))
    header = "Test:"

    iou_types = []
    if eval_masks:
        iou_types += ["segm"]
    if eval_bbox:
        iou_types += ["bbox"]
    iou_types = tuple(iou_types)

    if isinstance(base_ds, LVIS):
        coco_evaluator = LvisEvaluator(base_ds, iou_types) if eval_bbox or eval_masks else None
    else:
        coco_evaluator = CocoEvaluator(base_ds, iou_types) if eval_bbox or eval_masks else None
    # coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75]

    for samples, targets in metric_logger.log_every(data_loader, 10, header):
        samples = samples.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        outputs = model(samples)
        loss_dict = criterion(outputs, targets)
        weight_dict = criterion.weight_dict

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        loss_dict_reduced_scaled = {
            k: v * weight_dict[k]
            for k, v in loss_dict_reduced.items() if k in weight_dict
        }
        loss_dict_reduced_unscaled = {f"{k}_unscaled": v for k, v in loss_dict_reduced.items()}
        metric_logger.update(
            loss=sum(loss_dict_reduced_scaled.values()),
            **loss_dict_reduced_scaled,
            **loss_dict_reduced_unscaled,
        )
        metric_logger.update(class_error=loss_dict_reduced["class_error"])

        results = postprocessor(outputs, targets)
        res = {target["image_id"].item(): output for target, output in zip(targets, results)}
        if coco_evaluator is not None:
            coco_evaluator.update(res)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    if coco_evaluator is not None:
        coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    if coco_evaluator is not None:
        coco_evaluator.accumulate()
        coco_evaluator.summarize()

    stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
    if coco_evaluator is not None:
        if eval_bbox:
            stats["coco_eval_bbox"] = coco_evaluator.coco_eval["bbox"].stats.tolist()
        if eval_masks:
            stats["coco_eval_masks"] = coco_evaluator.coco_eval["segm"].stats.tolist()
    return stats, coco_evaluator
def main(args):
    torch.cuda.set_device(args.local_rank)
    utils.init_distributed_mode(args)
    hook = smd.Hook.create_from_json_file()
    device = torch.device('cuda')

    # Data loading code
    print("Loading data")
    dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))
    indices = torch.randperm(len(dataset)).tolist()
    dataset_test = torch.utils.data.Subset(dataset, indices[-50:])
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    num_classes = 2

    print("Creating data loaders")
    if args.world_size > 1:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, args.batch_size, drop_last=True)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=train_batch_sampler,
        # num_workers=args.workers,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=args.batch_size,
        sampler=test_sampler,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)

    print("Creating model")
    model = torchvision.models.detection.__dict__[args.model](
        num_classes=num_classes,
        pretrained=False,
        rpn_nms_thresh=1,
        rpn_pre_nms_top_n_train=5000)
    model.to('cuda')
    # hook.register_module(model)

    model_without_ddp = model
    if args.world_size > 1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.epochs):
        if args.world_size > 1:
            train_sampler.set_epoch(epoch)
        hook.set_mode(modes.TRAIN)
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        # per-iteration warm-up for the first epoch only; kept under a separate
        # name so it does not clobber the epoch-level MultiStepLR scheduler
        warmup_scheduler = None
        if epoch == 0:
            warmup_factor = 1. / 1000
            warmup_iters = min(1000, len(data_loader) - 1)
            warmup_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

        for iteration, (images, targets) in enumerate(data_loader):
            images = list(image.to('cuda') for image in images)
            targets = [{k: v.to('cuda') for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            if warmup_scheduler is not None:
                warmup_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if iteration % args.checkpoint_freq == 0:
                utils.save_on_master(
                    {'model': model.state_dict(),
                     'optimizer': optimizer.state_dict()},
                    'model_{}.pth'.format(iteration))

        lr_scheduler.step()
        hook.set_mode(modes.EVAL)
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
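# utils.save_on_master, used for the checkpoints above, simply guards
# torch.save so that only rank 0 writes to disk. A sketch of the torchvision
# reference helper (is_main_process is its companion that checks the
# distributed rank; both are assumptions about the utils module, shown for
# illustration):

def save_on_master(*args, **kwargs):
    if is_main_process():  # true on rank 0, or whenever not running distributed
        torch.save(*args, **kwargs)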
def train(self):
    LOSSES_NAME = self.args.LOSSES_NAME

    task_dict = {
        'Mask_LM': 'word_mask',
        'Matched': 'matched',
        'Mask_Obj': 'vis_mask',
        'Mask_Attr': 'vis_mask',
        'Mask_Feat': 'vis_mask',
        'QA': 'qa',
    }

    if self.args.dry:
        results = self.evaluate_epoch(epoch=0)

    self.optim.zero_grad()

    if self.verbose:
        loss_meters = [LossMeter() for _ in range(len(LOSSES_NAME))]
        best_eval_loss = 9595.

        from torch.utils.tensorboard import SummaryWriter
        self.writer = SummaryWriter(log_dir=self.args.log_dir)
        print('logging at', str(self.args.log_dir))
        self.logger.info('logging at' + str(self.args.log_dir))

        hparam_dict = {}
        for k, v in self.args.__dict__.items():
            if type(v) in [int, float, str, bool, torch.Tensor]:
                hparam_dict[k] = v
        metric_dict = {}
        self.writer.add_hparams(hparam_dict, metric_dict)

    dist.barrier()

    n_update = 0
    global_step = 0
    for epoch in range(self.args.epochs):
        if self.start_epoch is not None:
            epoch += self.start_epoch
        if self.args.distributed:
            self.train_loader.sampler.set_epoch(epoch)

        # Train
        self.model.train()

        loss_counts = [0 for _ in range(len(LOSSES_NAME))]
        if self.verbose:
            pbar = tqdm(total=len(self.train_loader), ncols=240)

        epoch_results = {
            'lm_loss': 0,
            'vis_loss': 0,
            'matched_loss': 0,
            'qa_loss': 0,
            'obj_loss': 0,
            'feat_loss': 0,
            'attr_loss': 0,
        }
        for k in list(epoch_results.keys()):
            if k[-4:] == 'loss':
                epoch_results[f'{k}_count'] = 0

        if self.args.task_qa:
            uid2ans = {}

        for step_i, batch in enumerate(self.train_loader):
            # task = random.choice(self.args.MASK_MODALITY)
            task_i = step_i % len(self.args.MASK_MODALITY)
            task = self.args.MASK_MODALITY[task_i]

            # with torch.autograd.set_detect_anomaly(True):
            results = self.forward(batch, task)
            if self.args.fp16 and _use_native_amp:
                with autocast():
                    results = self.model(batch, task)
            else:
                results = self.model(batch, task)

            if task == 'vis_mask':
                if 'Mask_Obj' in LOSSES_NAME:
                    epoch_results['obj_loss_count'] += 1
                if 'Mask_Feat' in LOSSES_NAME:
                    epoch_results['feat_loss_count'] += 1
                if 'Mask_Attr' in LOSSES_NAME:
                    epoch_results['attr_loss_count'] += 1
                epoch_results['vis_loss_count'] += 1
            elif task == 'word_mask':
                epoch_results['lm_loss_count'] += 1
            elif task == 'matched':
                epoch_results['matched_loss_count'] += 1

            if self.args.task_qa:
                epoch_results['qa_loss_count'] += 1
                qa_pred = results['qa_pred']
                for uid, ans_id in zip(batch['uid'], qa_pred.cpu().numpy()):
                    ans = self.train_loader.dataset.answer_table.id2ans(ans_id)
                    uid2ans[uid] = ans

            loss = results['total_loss']

            # ===== Update =====#
            if self.args.fp16 and _use_native_amp:
                self.scaler.scale(loss).backward()
            elif self.args.fp16 and _use_apex:
                with amp.scale_loss(loss, self.optim) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            loss = loss.detach()

            # Update Parameters
            if self.args.clip_grad_norm > 0:
                if self.args.fp16 and _use_native_amp:
                    self.scaler.unscale_(self.optim)
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.clip_grad_norm)
                elif self.args.fp16 and _use_apex:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(self.optim), self.args.clip_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.clip_grad_norm)

            if self.args.fp16 and _use_native_amp:
                self.scaler.step(self.optim)
                self.scaler.update()
            else:
                self.optim.step()

            if self.lr_scheduler:
                self.lr_scheduler.step()
            for param in self.model.parameters():
                param.grad = None

            global_step += 1
            # ====================#

            try:
                lr = self.scheduler.get_last_lr()[0]
            except AttributeError:
                lr = self.args.lr

            if self.verbose:
                desc_str = f'Epoch {epoch} | LR {lr:.6f} | '
                if self.args.word_mask_predict:
                    desc_str += f'Word Mask: Uniform (MP) | '
                elif self.args.word_mask_rate > 0:
                    desc_str += f'Word Mask: {self.args.word_mask_rate:.2f} | '

                if self.args.vis_mask_predict:
                    desc_str += f'Vis Mask: Uniform (MP) |'
                else:
                    desc_str += f'Vis Mask: {self.args.obj_mask_rate:.2f} |'

                if self.args.task_qa:
                    loss_meter = loss_meters[-1]
                    loss_meter.update(results['qa_loss'].item())
                    loss_counts[-1] += 1

                for i, (loss_name, loss_meter) in enumerate(zip(LOSSES_NAME, loss_meters)):
                    if task_dict[loss_name] == task:
                        if task == 'vis_mask':
                            if loss_name == 'Mask_Obj':
                                loss_meter.update(results['obj_loss'].item())
                            elif loss_name == 'Mask_Attr':
                                loss_meter.update(results['attr_loss'].item())
                            elif loss_name == 'Mask_Feat':
                                loss_meter.update(results['feat_loss'].item())
                        elif task == 'word_mask':
                            loss_meter.update(results['lm_loss'].item())
                        elif task == 'matched':
                            loss_meter.update(results['matched_loss'].item())
                        # elif task == 'qa':
                        #     loss_meter.update(results['qa_loss'].item())
                        loss_counts[i] += 1

                    if len(loss_meter) > 0:
                        loss_count = loss_counts[i]
                        if loss_name in ['Mask_LM', 'Matched', 'Mask_Obj', 'Mask_Attr', 'Mask_Feat', 'QA']:
                            desc_str += f' {loss_name} ({loss_count}) {loss_meter.val:.3f}'
                        else:
                            desc_str += f' {loss_name} {loss_meter.val:.3f}'

                        if step_i % 10 == 0:
                            self.writer.add_scalar(f'Train_steps/{loss_name}', loss_meter.val, global_step)

                # if update:
                n_update += 1
                desc_str += f' | Total Update: {n_update}'
                pbar.set_description(desc_str)
                pbar.update(1)

        if self.verbose:
            pbar.close()

        dist.barrier()

        results = reduce_dict(epoch_results, self.args.gpu)
        if self.args.gpu == 0:
            total_loss = results['lm_loss'] + results['vis_loss'] + \
                results['matched_loss'] + results['qa_loss']
            total_count = results['lm_loss_count'] + results['vis_loss_count'] + \
                results['matched_loss_count']  # + results['qa_loss_count']
            avg_train_loss = total_loss / total_count
            losses_str = f"Train Loss: {avg_train_loss:.4f}\n"

            for name, loss in results.items():
                if name[-4:] == 'loss':
                    loss_count = int(results[name + '_count'])
                    if loss_count > 0:
                        avg_loss = loss / loss_count
                        if name == 'lm_loss':
                            name = 'Mask_LM'
                        elif name == 'matched_loss':
                            name = 'Matched'
                        elif name == 'obj_loss':
                            name = 'Mask_Obj'
                        elif name == 'attr_loss':
                            name = 'Mask_Attr'
                        elif name == 'feat_loss':
                            name = 'Mask_Feat'
                        elif name == 'qa_loss':
                            name = 'QA'
                        losses_str += f"{name} ({loss_count}): {avg_loss:.4f} "
                        self.writer.add_scalar(f'Train Loss/{name}', avg_loss, epoch)
            losses_str += '\n'
            print(losses_str)
            self.logger.info(losses_str)

        if self.args.task_qa:
            dset2score, dset2cnt, score, cnt = \
                self.train_loader.dataset.evaluator.evaluate(uid2ans)
            dset2score = reduce_dict(dset2score, self.args.gpu)
            dset2cnt = reduce_dict(dset2cnt, self.args.gpu)
            score_cnt_dict = reduce_dict({'score': score, 'cnt': cnt}, self.args.gpu)

            if self.args.gpu == 0:
                score = score_cnt_dict['score']
                cnt = score_cnt_dict['cnt']
                accu = score / cnt
                dset2accu = {}
                for dset in dset2cnt:
                    dset2accu[dset] = dset2score[dset] / dset2cnt[dset]

                accu_str = "Overall Accu %0.4f, " % (accu)
                sorted_keys = sorted(dset2accu.keys())
                for key in sorted_keys:
                    accu_str += "%s Accu %0.4f, " % (key, dset2accu[key])
                print(accu_str)
                self.logger.info(accu_str)

        dist.barrier()

        # Validation
        valid_results, valid_uid2ans = self.evaluate_epoch(epoch=epoch)
        valid_results = reduce_dict(valid_results, self.args.gpu)
        if self.args.gpu == 0:
            valid_total_loss = valid_results['lm_loss'] + valid_results['vis_loss'] + \
                valid_results['matched_loss'] + valid_results['qa_loss']
            valid_total_count = valid_results['lm_loss_count'] + \
                valid_results['vis_loss_count'] + \
                valid_results['matched_loss_count']  # + valid_results['qa_loss_count']
            avg_valid_loss = valid_total_loss / valid_total_count
            losses_str = f"Valid Loss: {avg_valid_loss:.4f}\n"

            for name, loss in valid_results.items():
                if name[-4:] == 'loss':
                    loss_count = int(valid_results[name + '_count'])
                    if loss_count > 0:
                        avg_loss = loss / loss_count
                        if name == 'lm_loss':
                            name = 'Mask_LM'
                        elif name == 'matched_loss':
                            name = 'Matched'
                        elif name == 'obj_loss':
                            name = 'Mask_Obj'
                        elif name == 'attr_loss':
                            name = 'Mask_Attr'
                        elif name == 'feat_loss':
                            name = 'Mask_Feat'
                        elif name == 'qa_loss':
                            name = 'QA'
                        losses_str += f"{name} ({loss_count}): {avg_loss:.4f} "
                        self.writer.add_scalar(f'Valid Loss/{name}', avg_loss, epoch)
            losses_str += '\n'
            print(losses_str)
            self.logger.info(losses_str)

        if self.args.task_qa:
            dset2score, dset2cnt, score, cnt = \
                self.val_loader.dataset.evaluator.evaluate(valid_uid2ans)
            dset2score = reduce_dict(dset2score, self.args.gpu)
            dset2cnt = reduce_dict(dset2cnt, self.args.gpu)
            score_cnt_dict = reduce_dict({'score': score, 'cnt': cnt}, self.args.gpu)

            if self.args.gpu == 0:
                score = score_cnt_dict['score']
                cnt = score_cnt_dict['cnt']
                accu = score / cnt
                dset2accu = {}
                for dset in dset2cnt:
                    dset2accu[dset] = dset2score[dset] / dset2cnt[dset]

                accu_str = "Overall Accu %0.4f, " % (accu)
                sorted_keys = sorted(dset2accu.keys())
                for key in sorted_keys:
                    accu_str += "%s Accu %0.4f, " % (key, dset2accu[key])
                print(accu_str)
                self.logger.info(accu_str)

        dist.barrier()

        if self.verbose:
            # Save
            if avg_valid_loss < best_eval_loss:
                best_eval_loss = avg_valid_loss
                # self.save("BEST_EVAL_LOSS")
            self.save("Epoch%02d" % (epoch + 1))

        dist.barrier()
def train_one_epoch(model: nn.Module,
                    optimizer: torch.optim.Optimizer,
                    data_loader: torch.utils.data.DataLoader,
                    master_progress_bar: master_bar,
                    device: str = "cpu"):
    """Train the model for one epoch.

    Parameters
    ----------
    model: torch.nn.Module
        model to train
    optimizer: torch.optim.Optimizer
        optimizer to use
    data_loader: torch.utils.data.DataLoader
        dataset loader yielding batches
    master_progress_bar: fastprogress.master_bar
        progress bar to update with training information
    device: str (default: "cpu")
        "cpu" or "cuda", device to train on

    Returns
    -------
    float
        mean loss of the current training epoch
    dict
        per-component losses averaged over the epoch
    """
    # Switch model to training mode
    model.train()

    training_loss = 0  # Storing total loss
    loss_dict = {}     # running sum of each loss component

    # For each batch
    train_progress_bar = progress_bar(data_loader, parent=master_progress_bar)
    for batch, (images, targets) in enumerate(train_progress_bar):
        # Move images and targets to device
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # keep the per-batch dict separate so it does not clobber the accumulator
        loss_dict_batch = model(images, targets)
        losses = sum(loss for loss in loss_dict_batch.values())

        # Back propagation
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        # Log loss
        loss_dict_reduced = reduce_dict(loss_dict_batch)
        loss_dict = {k: v + loss_dict.get(k, 0) for k, v in loss_dict_reduced.items()}
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        training_loss += losses_reduced.item()

        mean_loss = training_loss / (batch + 1)
        log = "Loss: %.2f" % (mean_loss)
        master_progress_bar.child.comment = log

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Return training loss averaged over batches
    return training_loss / len(data_loader), {
        k: v / len(data_loader) for k, v in loss_dict.items()
    }
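# A hedged usage sketch for the fastprogress variant above; model, optimizer,
# data_loader, and num_epochs are assumed to exist already, and the helper
# called is the function just defined:
from fastprogress.fastprogress import master_bar

mbar = master_bar(range(num_epochs))
for epoch in mbar:
    epoch_loss, epoch_loss_parts = train_one_epoch(
        model, optimizer, data_loader, mbar, device="cuda")
    mbar.write(f"epoch {epoch}: loss {epoch_loss:.4f}")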
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, tb_writer):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = "Epoch: [{}]".format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        # drop samples without ground-truth boxes, keeping images and targets aligned
        keep = [t["boxes"].shape[0] > 0 for t in targets]
        images = [image.to(device) for image, keep_it in zip(images, keep) if keep_it]
        targets = [{k: v.to(device) for k, v in t.items()}
                   for t, keep_it in zip(targets, keep) if keep_it]
        if len(images) == 0:
            continue

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced,
                             epoch=tb_writer["step"],
                             tb_writer=tb_writer["writer"],
                             **loss_dict_reduced)
        metric_logger.update(
            lr=optimizer.param_groups[0]["lr"],
            epoch=tb_writer["step"],
            tb_writer=tb_writer["writer"],
        )
        tb_writer["step"] += 1
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, writer, ckpt_path):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for batch_idx, (images, targets) in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        # TensorBoard logging, once per batch on a global-step axis
        global_step = epoch * len(data_loader) + batch_idx
        writer.add_scalar('Training Loss', loss_value, global_step)
        writer.add_scalar('loss_classifier', loss_dict_reduced['loss_classifier'].item(), global_step)
        writer.add_scalar('loss_box_reg', loss_dict_reduced['loss_box_reg'].item(), global_step)
        writer.add_scalar('loss_objectness', loss_dict_reduced['loss_objectness'].item(), global_step)
        writer.add_scalar('loss_rpn_box_reg', loss_dict_reduced['loss_rpn_box_reg'].item(), global_step)

        for name, param in model.named_parameters():
            if param.grad is not None:
                param_norm = param.grad.data.norm(2).cpu().item()
                writer.add_histogram(name + '_grad', param_norm, epoch)
            # else:
            #     print("{} has no grad".format(name))

        optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()

    # Save model
    print("Saving model at training epoch: {}".format(epoch + 1))
    ckpt_dict = {
        'epoch': epoch + 1,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(
        ckpt_dict,
        os.path.join(ckpt_path,
                     'ckpt_epoch-' + str(epoch + 1) + 'loss' + str(loss_value) + '.pth'))
def train_one_epoch(
    model,
    optimizer,
    data_loader,
    device,
    epoch,
    metric_logger,
    print_freq,
    mq_logger=None,
):
    model.train()
    # metric_logger = utils.MetricLogger(delimiter="  ")
    # metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = "Epoch: [{}]".format(epoch)
    metric_logger.clear()

    losses_summed = 0.0
    cnt = 0

    warm_up_lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        warm_up_lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        # drop the box-regression terms: we do not train the RPN anyway,
        # and different models use different loss names
        loss_dict.pop("loss_rpn_box_reg")
        loss_dict.pop("loss_box_reg")

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        losses_summed += losses_reduced.detach().cpu().numpy()
        cnt += 1

        if warm_up_lr_scheduler is not None:
            # only for epoch 0: warm up
            warm_up_lr_scheduler.step()

        if mq_logger is not None:
            # the loss can run off to NaN, so keep a debug trail
            mq_logger.debug(f"losses summed is {losses_summed}, cnt is {cnt}")
        print(f"losses summed is {losses_summed}, cnt is {cnt}, loss_dict_reduced is {loss_dict_reduced}")

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return losses_summed / cnt
def train_one_epoch(self, lr_schedule='cyclic'):
    self.model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(self.epoch)

    lr_scheduler = None
    if self.epoch == 0:
        if lr_schedule == 'warmup':
            warmup_factor = 1. / 1000
            warmup_iters = min(1000, len(self.data_loader) - 1)
            lr_scheduler = utils.warmup_lr_scheduler(self.optimizer, warmup_iters, warmup_factor)
        elif lr_schedule == 'cyclic':
            lr_scheduler = torch.optim.lr_scheduler.CyclicLR(self.optimizer, 1e-6, 1e-2)

    for iteration, (images, targets) in enumerate(
            metric_logger.log_every(self.data_loader, self.print_freq, header)):
        with torch.autograd.detect_anomaly():
            images = list(image.to(self.device) for image in images)
            targets = [{k: v.to(self.device) for k, v in t.items()} for t in targets]

            loss_dict = self.model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            if self.emergency:
                if not math.isfinite(loss_value):
                    print()
                    print("Loss is {}, stopping training".format(loss_value))
                    print(loss_dict_reduced)
                    sys.exit(1)

            self.optimizer.zero_grad()
            losses.backward()
            # grad_clip_norm_value is assumed to be defined at module scope
            grad_norm = clip_grad_norm_(self.model.parameters(), grad_clip_norm_value)
            self.optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=self.optimizer.param_groups[0]["lr"])

            if self.logger is not None:
                if iteration % 50 == 0:
                    # 1. Log scalar values (scalar summary)
                    info = {'loss': losses_reduced, **loss_dict_reduced}
                    for tag, value in info.items():
                        self.logger.scalar_summary(tag, value, iteration + 1)
                    # 2. Log values and gradients of the parameters (histogram summary)
                    for tag, value in self.model.named_parameters():
                        tag = tag.replace('.', '/')
                        self.logger.histo_summary(tag, value.data.cpu().numpy(), iteration + 1)

    self.epoch += 1
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)  # .to(device) for both
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]  # .to(device) for both

        loss_dict = model(images, targets)
        '''
        During training, the model expects both the input tensors and targets
        (a list of dictionaries), containing:
            - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]``
              format, with ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
            - labels (Int64Tensor[N]): the class label for each ground-truth box
            - masks (UInt8Tensor[N, H, W]): the segmentation binary masks for each instance
        The model returns a Dict[Tensor] during training, containing the classification
        and regression losses for both the RPN and the R-CNN, and the mask loss.

        During inference, the model requires only the input tensors, and returns the
        post-processed predictions as a List[Dict[Tensor]], one for each input image.
        The fields of the Dict are as follows:
            - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]``
              format, with ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
            - labels (Int64Tensor[N]): the predicted labels for each image
            - scores (Tensor[N]): the scores for each prediction
            - masks (UInt8Tensor[N, 1, H, W]): the predicted masks for each instance, in
              the 0-1 range. To obtain the final segmentation masks, the soft masks can
              be thresholded, generally with a value of 0.5 (mask >= 0.5)
        '''
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
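# A minimal driver for the epoch functions above, mirroring the torchvision
# fine-tuning tutorial; model, data_loader, device, and num_epochs are assumed
# to be set up already, so treat this as a hedged usage sketch rather than any
# particular repo's entry point:
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

for epoch in range(num_epochs):
    # per-iteration warm-up happens inside train_one_epoch on epoch 0
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    lr_scheduler.step()  # epoch-level decay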
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        ts = copy.deepcopy(targets)
        # print(f"targets before model: {targets[0]['boxes']}")
        # print(f"n images: {len(images)}\nn boxes: {targets[0]['boxes'].shape}\n"
        #       f"n labels: {targets[0]['labels'].shape}\nn masks: {targets[0]['masks'].shape}\n")

        loss_dict = model(images, targets)
        print(loss_dict)
        # print(f"targets after model: {targets[0]['boxes']}")

        losses = sum(loss for loss in loss_dict.values())
        # disabled debug visualization: for high-loss batches, show the first image
        # with its ground-truth boxes drawn as matplotlib rectangles
        # if losses.item() > 1:
        #     single_image = np.transpose(images[0].cpu().detach().numpy(), (1, 2, 0)).squeeze()
        #     fig = plt.figure()
        #     ax = fig.add_subplot(111, aspect='equal')
        #     ax.imshow(single_image)
        #     for box in targets[0]['boxes']:
        #         x1, y1, x2, y2 = (box[i].item() for i in range(4))
        #         rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='r')
        #         ax.add_patch(rect)
        #     plt.show()

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            # visualize_bboxes(images, targets)
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
def train_one_epoch_FastRCNN(model, optimizer, data_loader, device, epoch, print_freq,
                             mode="sew6", encoder=None, train_encoder=False):
    # this data loader is the given loader
    # mode can be "sew6", "panorm", or "autoencode"
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    # if epoch == 0:
    #     warmup_factor = 1. / 1000
    #     warmup_iters = min(1000, len(data_loader) - 1)
    #     lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    # defined unconditionally: the "sew6" branch below also uses it
    tt = transforms.Compose([transforms.Resize((800, 800)), transforms.ToTensor(), normalize])

    # this is for the 6-image combo
    for sample, old_targets, road_image, extra in metric_logger.log_every(data_loader, print_freq, header):
        # images = sample[0]
        targets = trans_target(old_targets)
        # print("images len {}, targets len {}".format(len(images), len(targets)))
        # print("len(sample) {}, sample[0] shape {}".format(len(sample), sample[0].shape))  # [6, 3, 256, 306]
        # images = list(image.to(device) for image in images)

        if mode == "panorm":
            images = [tt(s).to(device) for s in sew_images_panorm(sample, to_img=True)]
        elif mode == "autoencode":
            encoder.cuda()
            samp_pan = sew_images_panorm(sample)  # convert to panoramic tensor
            samp_pan = [normalize(i) for i in samp_pan]
            samp_pan_t = torch.stack(samp_pan, dim=0)  # stack
            if train_encoder:
                # see if it will take it or whether it needs to take a list
                images = encoder.return_image_tensor(samp_pan_t.to(device), train_encoder)
            else:
                images = encoder.return_image_tensor(samp_pan_t.cuda(), train_encoder).to(device)
        else:  # mode is "sew6"
            # list of [3, 800, 800], should be 1 per patch
            images = [tt(sew_images(s)).to(device) for s in sample]

        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        # print(loss_dict)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, transfer_learning):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    loss_epoch = {
        'loss_classifier': 0,
        'loss_box_reg': 0,
        'loss_objectness': 0,
        'loss_rpn_box_reg': 0,
        'loss_total': 0,
    }
    counter = 0

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        counter += 1
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        _, loss_dict = model(images, targets)
        # if transfer_learning:
        #     losses = sum(loss_dict[key] if key == 'loss_box_reg' or key == 'loss_classifier'
        #                  else torch.zeros_like(loss_dict[key]) for key in loss_dict.keys())
        # else:
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        # save the epoch loss
        for key in loss_dict_reduced.keys():
            loss_epoch[key] += loss_dict_reduced[key].item()
        loss_epoch['loss_total'] += loss_value

    print('Epoch: [{}]'.format(epoch))
    for key in loss_epoch.keys():
        loss_epoch[key] = loss_epoch[key] / counter
        print('{}: {}.'.format(key, loss_epoch[key]))

    return metric_logger
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for i in metric_logger.log_every(data_loader, print_freq, header):
        try:
            images, targets = i
            '''This part will be changed'''
            targets["boxes"] = targets["boxes"].to(device)
            targets["labels"] = targets["labels"].to(device)
            targets["boxes"].squeeze_()
            targets["labels"].squeeze_()
            targets1 = [{k: v for k, v in targets.items()}]
            images = images.to(device)
            targets = targets1

            # zero the parameter gradients
            # forward
            # track history only if in train
            # images = list(image.to(device) for image in images)
            # targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()
            # print(targets[0]["boxes"])

            if not math.isfinite(loss_value):
                print(images.size())
                print(targets[0]["boxes"])
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        except ValueError:
            continue

    return metric_logger
def train_one_epoch(model, optimizer, data_loader, device, epoch,
                    gradient_accumulation_steps, print_freq, box_threshold):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    optimizer.zero_grad()  # gradient_accumulation
    steps = 0  # gradient_accumulation
    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        # print("target: {}".format(targets))
        steps += 1  # gradient_accumulation

        # images = list(image.to(device) for image in images)
        # images = torch.stack(images).to(device)
        # images = images.to(device)
        # targets = [{k: v.to(device) if torch.is_tensor(v) else v for k, v in t.items()} for t in targets]
        # targets = {k: v.to(device) if torch.is_tensor(v) else v for k, v in targets.items()}

        # vis = visualize.Visualize('.', targets['img_size'][0][0])
        # num_of_detections = len(torch.where(targets['cls'][0] > -1)[0])
        # vis.show_image_data(images[0], targets['cls'][0, :num_of_detections].int(), None,
        #                     targets['bbox'][0, :num_of_detections, [1, 0, 3, 2]])

        if box_threshold is None:
            loss_dict = model(images, targets)
        else:
            # loss_dict = model(images, box_threshold, targets)
            loss_dict = model(images, targets)

        # losses = sum(loss / gradient_accumulation_steps for loss in loss_dict.values())  # gradient_accumulation
        losses = loss_dict['loss'] / gradient_accumulation_steps

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        # optimizer.zero_grad()
        losses.backward()
        # ofekp: we add grad clipping here to avoid instabilities in training
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0)

        # gradient_accumulation: only step and reset the gradients every N batches
        if steps % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(total_loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
def train(batch_size, checkpoint_freq, num_epochs):
    num_classes = 2
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(
        pretrained=True, rpn_nms_thresh=1, rpn_pre_nms_top_n_train=5000)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, hidden_layer, num_classes)
    model = torch.nn.DataParallel(model)
    model.to('cuda')

    dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                              num_workers=4, collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size, shuffle=False,
                                                   num_workers=4, collate_fn=utils.collate_fn)

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    hook = smd.Hook.create_from_json_file()
    for epoch in range(num_epochs):
        hook.set_mode(modes.TRAIN)
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        # keep the warm-up scheduler separate so it does not replace the epoch-level StepLR
        warmup_scheduler = None
        if epoch == 0:
            warmup_factor = 1. / 1000
            warmup_iters = min(1000, len(data_loader) - 1)
            warmup_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

        for iteration, (images, targets) in enumerate(data_loader):
            images = list(image.to('cuda') for image in images)
            targets = [{k: v.to('cuda') for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            if warmup_scheduler is not None:
                warmup_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if iteration % checkpoint_freq == 0:
                utils.save_on_master(
                    {'model': model.state_dict(),
                     'optimizer': optimizer.state_dict()},
                    'model_{}.pth'.format(iteration))

        lr_scheduler.step()
        hook.set_mode(modes.EVAL)
        evaluate(model, data_loader_test, device='cuda')
coarse_lr_scheduler = utils.warmup_lr_scheduler(coarse_optim, warmup_iters, warmup_factor)

for i, (fine_train, coarse_train) in enumerate(zip(fine_train_loader, coarse_train_loader)):
    # train
    fine_model.train()
    coarse_model.train()

    #### fine train ####
    # label matching
    fine_imgs, fine_labels = label_matching(fine_train, device)
    fine_imgs = fine_imgs.to(device) / 255.  # train: image normalization --> if not, zero-division error

    fine_loss_dict = fine_model(fine_imgs, copy.deepcopy(fine_labels))
    fine_losses = sum(loss for loss in fine_loss_dict.values())
    fine_loss_dict_reduced = reduce_dict(fine_loss_dict)
    fine_loss_reduced = sum(loss for loss in fine_loss_dict_reduced.values())
    fine_loss_val = fine_loss_reduced.item()

    # optimizer
    fine_optim.zero_grad()
    fine_losses.backward()
    fine_optim.step()

    if fine_lr_scheduler is not None:
        fine_lr_scheduler.step()

    fine_metric_logger.update(loss=fine_loss_reduced, **fine_loss_dict_reduced)
    fine_metric_logger.update(lr=fine_optim.param_groups[0]["lr"])
hists[cat]["data_obs"] = OrderedDict()
hists[cat]["data_obs"]["shapes_prefit"] = nd_tot[category_mapper[cat]]["data_obs"]["nominal"]

for samp in hists[cat].keys():
    for syst in hists[cat][samp].keys():
        old_d = hists[cat][samp][syst]
        new_d = OrderedDict()
        for ibin in range(1, len(hists[cat]["data_obs"]["shapes_prefit"]) + 1):
            label = "bin_{0}".format(ibin)
            new_d[label] = old_d[label]
        hists[cat][samp][syst] = new_d

bins = bins_to_category(hists)
bins_sob = reduce_dict(bins, dcard_repr.calculate_signal_over_background)
bins_sorted = sorted(bins_sob.keys(), key=lambda x: bins_sob[x], reverse=False)
sob_data = [bins_sob[b] for b in bins_sorted]

print("Best bins by SoB are")
for bs in bins_sorted[-10:]:
    print("  {0} {1:.4f}".format(bs, bins_sob[bs]))

nd3 = OrderedDict()
for samp in ["total_signal", "total_background", "data_obs"]:
    nd3[samp] = OrderedDict()
    for syst in ["shapes_prefit", "shapes_fit_s", "shapes_fit_b"]:
        nd3[samp][syst] = OrderedDict()
        if syst not in hists[cat][samp]:  # dict.has_key() is gone in Python 3
            continue
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        # disabled debug dump: draw each ground-truth box on a copy of the first
        # image with cv2.rectangle and write it to checker/<box index>.jpg
        # for boxNum, box in enumerate(targets[0]['boxes']):
        #     image = copy.deepcopy(images[0].cpu().numpy()).transpose(1, 2, 0) * 255
        #     image = np.ascontiguousarray(image, dtype=np.uint16)
        #     image = cv2.rectangle(image, (int(box[0].item()), int(box[1].item())),
        #                           (int(box[2].item()), int(box[3].item())), (0, 0, 255), 2)
        #     cv2.imwrite('checker/' + str(boxNum) + '.jpg', image)

        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        # disabled sanity check: drop into pdb when a box has non-positive width or height
        # for target in targets:
        #     for boxNum, box in enumerate(target['boxes']):
        #         if ((box[2] - box[0]) <= 0) or ((box[3] - box[1]) <= 0):
        #             import pdb; pdb.set_trace()
        # print(images[0].size(), targets[0]['boxes'][0], targets[0]['names'][0])

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    loss_plt = []
    for images, ann in metric_logger.log_every(data_loader, print_freq, header):
        targets = []
        for data1 in ann:  # this for loop could be dropped
            boxes = []
            target = {}
            labels = []
            for d in data1:
                box = d['bbox']
                box = [box[0], box[1], box[0] + box[2], box[1] + box[3]]  # xywh -> xyxy
                boxes.append(box)
                labels.append(d['category_id'])

            # convert everything into a torch.Tensor
            boxes = torch.as_tensor(boxes, dtype=torch.float32)
            # there is only one class
            labels = torch.as_tensor(labels, dtype=torch.int64)
            image_id = torch.tensor([data1[0]['image_id']])
            area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
            # print(area)
            # suppose all instances are not crowd
            iscrowd = torch.zeros((len(data1),), dtype=torch.int64)

            target["boxes"] = boxes
            target["labels"] = labels
            target["image_id"] = image_id
            target["area"] = area
            target["iscrowd"] = iscrowd
            targets.append(target)

        images = list(image.to(device) for image in images)
        # the targets built above also have to be moved onto the device
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        loss_plt.append(losses)

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        # break

    return metric_logger, loss_plt
def train_one_epoch(
    model, arch, optimizer, lr_scheduler, data_loader,
    device, epoch, print_freq, ngpus_per_node, model_without_ddp, args
):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    # header = "Epoch: [{}]".format(epoch)

    # this MetricLogger variant takes the running iteration count
    for images, targets in metric_logger.log_every(
        iterable=data_loader,
        print_freq=print_freq,
        # header=header,
        iter_num=args.iter_num
    ):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        # a target may be empty, e.g.:
        # [{"boxes": tensor([], device="cuda:0"),
        #   "labels": tensor([], device="cuda:0", dtype=torch.int64),
        #   "masks": tensor([], device="cuda:0", dtype=torch.uint8),
        #   "iscrowd": tensor([], device="cuda:0", dtype=torch.int64)}]

        try:
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                logger.fatal("Loss is {}, stopping training".format(loss_value))
                logger.fatal(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        except Exception as e:
            # log and skip batches that crash the forward/backward pass
            logger.warning(e, exc_info=True)

        args.iter_num += 1

        # save a checkpoint every 1000 iterations, on the master process only;
        # checkpoint_dir is assumed to be a module-level path
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            if args.iter_num % 1000 == 0:
                utils.save_on_master({
                    "model": model_without_ddp.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "lr_scheduler": lr_scheduler.state_dict(),
                    "epoch": epoch,
                    "iter_num": args.iter_num,
                    "args": args,
                }, "{}/{}_{}.pth".format(checkpoint_dir, arch.__name__, args.iter_num))

                os.makedirs("{}/debug_image/".format(checkpoint_dir), exist_ok=True)
                if args.iter_num < 5000:
                    continue

                # render predictions on a fixed set of test images for visual debugging
                model.eval()
                from barez import overlay_ann

                debug_image = None
                debug_image_list = []
                cnt = 0
                for image_path in glob.glob("./table_test/*"):
                    cnt += 1
                    image = cv2.imread(image_path)
                    rat = 1300 / image.shape[0]
                    image = cv2.resize(image, None, fx=rat, fy=rat)

                    transform = transforms.Compose([transforms.ToTensor()])
                    image = transform(image)

                    # run the model in evaluation mode
                    with torch.no_grad():
                        tensor = [image.to(device)]
                        prediction = model(tensor)

                    image = torch.squeeze(image, 0).permute(1, 2, 0).mul(255).numpy().astype(np.uint8)
                    for pred in prediction:
                        for idx, mask in enumerate(pred['masks']):
                            # keep only confident detections
                            if pred['scores'][idx].item() < 0.5:
                                continue
                            m = mask[0].mul(255).byte().cpu().numpy()
                            box = list(map(int, pred["boxes"][idx].tolist()))
                            score = pred["scores"][idx].item()
                            image = overlay_ann(image, m, box, "", score)

                    # tile ten annotated images per horizontal strip
                    if debug_image is None:
                        debug_image = image
                    else:
                        debug_image = np.concatenate((debug_image, image), axis=1)

                    if cnt == 10:
                        cnt = 0
                        debug_image_list.append(debug_image)
                        debug_image = None

                # rescale each strip to the average width, then stack vertically
                avg_length = np.mean([i.shape[1] for i in debug_image_list])
                di = None
                for debug_image in debug_image_list:
                    rat = avg_length / debug_image.shape[1]
                    debug_image = cv2.resize(debug_image, None, fx=rat, fy=rat)
                    if di is None:
                        di = debug_image
                    else:
                        di = np.concatenate((di, debug_image), axis=0)

                di = cv2.resize(di, None, fx=0.4, fy=0.4)
                cv2.imwrite("{}/debug_image/{}.jpg".format(checkpoint_dir, args.iter_num), di)
                model.train()

        # hard stop
        if args.iter_num == 50000:
            logger.info("ITER NUM == 50k, training successfully!")
            raise SystemExit
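The checkpoint dict above can be restored with the matching keys. A minimal resume sketch; `resume_from_checkpoint` and `path` are hypothetical names, not part of this codebase:

import torch

def resume_from_checkpoint(path, model_without_ddp, optimizer, lr_scheduler, args):
    # Hypothetical helper: the key names mirror the dict passed to
    # utils.save_on_master above. map_location="cpu" avoids pinning
    # tensors to the GPU they were saved from.
    checkpoint = torch.load(path, map_location="cpu")
    model_without_ddp.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
    args.iter_num = checkpoint["iter_num"]
    return checkpoint["epoch"]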
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, log_writer):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    # this variant replaces the usual first-epoch warmup with a single
    # 0.8x learning-rate decay halfway through each epoch
    milestones = [len(data_loader) // 2]
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.8)

    count = 0
    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        count += 1
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            # dump the offending batch before aborting
            print("count {}".format(count))
            print(">>>>>>>>>>>>>>>>>> bboxes")
            print(targets[0]["boxes"])
            print(">>>>>>>>>>>>>>>>>> labels")
            print(targets[0]["labels"])
            print(">>>>>>>>>>>>>>>>>> image_id")
            print(targets[0]["image_id"])
            print(">>>>>>>>>>>>>>>>>> area")
            print(targets[0]["area"])
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        # ================================================================== #
        #                        Tensorboard Logging                         #
        # ================================================================== #
        if count % 100 == 0:
            # use a global step so the curves stay monotonic across epochs,
            # and log the reduced losses as plain floats
            n_iter = epoch * len(data_loader) + count
            log_writer.add_scalar('Loss/total', loss_value, n_iter)
            log_writer.add_scalar('Loss/class', loss_dict_reduced['loss_classifier'].item(), n_iter)
            log_writer.add_scalar('Loss/bbox', loss_dict_reduced['loss_box_reg'].item(), n_iter)
            log_writer.add_scalar('Loss/mask', loss_dict_reduced['loss_mask'].item(), n_iter)
            log_writer.add_scalar('Loss/objectness', loss_dict_reduced['loss_objectness'].item(), n_iter)
            log_writer.add_scalar('Loss/rpn_box', loss_dict_reduced['loss_rpn_box_reg'].item(), n_iter)
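The `log_writer` argument is only used through `add_scalar`, so a standard `torch.utils.tensorboard.SummaryWriter` fits. A hypothetical call site; `model`, `optimizer`, `data_loader`, `device`, and `num_epochs` are assumed to be set up elsewhere:

from torch.utils.tensorboard import SummaryWriter

# Hypothetical driver showing only the writer wiring; the model,
# optimizer, data loader, and num_epochs are assumed to exist already.
log_writer = SummaryWriter(log_dir="runs/detection")
for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, data_loader, device, epoch,
                    print_freq=100, log_writer=log_writer)
log_writer.close()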
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, writer=None):
    count = 0
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        count += 1
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # skip batches whose first sample contains no instance of class 1
        if 1 not in targets[0]['labels']:
            continue

        # skip batches in which any sample has no boxes at all
        if any(len(t['boxes']) == 0 for t in targets):
            continue

        loss_dict = model(images, targets)

        # weighted sum instead of the plain sum of the loss dict:
        # down-weight the keypoint loss relative to the other heads
        losses = 0
        for k in loss_dict:
            if k == 'loss_keypoint':
                losses += loss_dict[k] * 0.5
            else:
                losses += loss_dict[k]

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            # skip the bad batch rather than aborting the run
            print("Loss is {}, skipping this batch".format(loss_value))
            print(loss_dict_reduced)
            continue

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if writer and count % 100 == 0:
            step = epoch * len(data_loader) + count
            writer.add_scalar('loss_box_reg', loss_dict_reduced['loss_box_reg'], step)
            writer.add_scalar('loss_classifier', loss_dict_reduced['loss_classifier'], step)
            if 'loss_mask' in loss_dict:
                writer.add_scalar('loss_mask', loss_dict_reduced['loss_mask'], step)
            if 'loss_keypoint' in loss_dict:
                writer.add_scalar('loss_keypoint', loss_dict_reduced['loss_keypoint'], step)
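Every variant above funnels its loss dict through `utils.reduce_dict` before logging. A sketch of the torchvision detection-reference implementation it presumably mirrors; the local `utils` version may differ in detail:

import torch
import torch.distributed as dist

def reduce_dict(input_dict, average=True):
    # Average (or sum) a dict of scalar tensors across all processes.
    # Sketch of the torchvision detection-reference helper; assumed,
    # not taken from this codebase's utils module.
    world_size = dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = sorted(input_dict.keys())
        values = torch.stack([input_dict[k] for k in names], dim=0)
        dist.all_reduce(values)  # in-place sum across processes
        if average:
            values /= world_size
        return {k: v for k, v in zip(names, values)}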
def evaluate(model, data_loader, device, epoch, writer=None):
    global best_mAp
    n_threads = torch.get_num_threads()
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Test:'

    coco = get_coco_api_from_dataset(data_loader.dataset)
    print("get coco dataset completed!")
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(coco, iou_types)

    running_loss = 0
    running_num = 0
    for image, targets in metric_logger.log_every(data_loader, 100, header):
        image = list(img.to(device) for img in image)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        torch.cuda.synchronize()
        model_time = time.time()
        # assumes a model whose eval-mode forward returns both the
        # detections and the loss dict
        outputs, loss_dict = model(image, targets)

        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        # accumulate a plain float rather than a tensor
        running_loss += losses_reduced.item()
        running_num += len(image)

        outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
        model_time = time.time() - model_time

        res = {target["image_id"].item(): output
               for target, output in zip(targets, outputs)}
        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images; this assumes a CocoEvaluator
    # whose summarize() returns the stats (the stock torchvision version
    # returns None)
    coco_evaluator.accumulate()
    stats_dic = coco_evaluator.summarize()
    bbox_mAp = stats_dic['bbox'][0]  # box AP @ IoU=0.50:0.95
    torch.set_num_threads(n_threads)

    if writer is not None:
        writer.add_scalar('running_loss', running_loss / running_num, epoch)
        writer.add_scalar('test_mAP', bbox_mAp, epoch)

    return coco_evaluator, bbox_mAp
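`evaluate` returns `(coco_evaluator, bbox_mAp)`, which pairs naturally with the `best_mAp` global for model selection. A hypothetical outer loop; `num_epochs`, `data_loader_test`, and the checkpoint path are placeholders:

import torch

best_mAp = 0.0

# Hypothetical driver: model, optimizer, data loaders, device, writer,
# and num_epochs are assumed to be constructed elsewhere.
for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, data_loader, device, epoch,
                    print_freq=100, writer=writer)
    _, bbox_mAp = evaluate(model, data_loader_test, device, epoch, writer=writer)
    # keep only the weights with the best box AP so far
    if bbox_mAp > best_mAp:
        best_mAp = bbox_mAp
        torch.save(model.state_dict(), "best_model.pth")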