def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq,
                    my_logger=None, name=None, env_name=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    warmup_lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        warmup_lr_scheduler = utils.warmup_lr_scheduler(
            optimizer, warmup_iters, warmup_factor)

    for images, targets, name in metric_logger.log_every(
            data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            for img, t in zip(images, targets):
                print(img.shape)
                print(t)
            print(name)
            sys.exit(1)

        if my_logger:
            my_logger.scalar(loss_value, env=env_name, win="Loss",
                             trace=name, xlabel="Iteration")

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if warmup_lr_scheduler:
            warmup_lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, log_writer):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    # lr_scheduler = None
    milestones = [len(data_loader) // 2]
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.8)
    # if epoch == 0:
    #     warmup_factor = 1. / 1000
    #     warmup_iters = min(1000, len(data_loader) - 1)
    #     lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    count = 0
    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        count += 1
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("count {}".format(count))
            print(">>>>>>>>>>>>>>>>>> bboxes")
            print(targets[0]["boxes"])
            print(">>>>>>>>>>>>>>>>>> labels")
            print(targets[0]["labels"])
            print(">>>>>>>>>>>>>>>>>> image_id")
            print(targets[0]["image_id"])
            print(">>>>>>>>>>>>>>>>>> area")
            print(targets[0]["area"])
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        # ================================================================== #
        #                        Tensorboard Logging                         #
        # ================================================================== #
        if count % 100 == 0:
            n_iter = count + epoch * len(data_loader) / len(images)
            log_writer.add_scalar('Loss/total', loss_value, n_iter / 100)
            log_writer.add_scalar('Loss/class', loss_dict['loss_classifier'], n_iter / 100)
            log_writer.add_scalar('Loss/bbox', loss_dict['loss_box_reg'], n_iter / 100)
            log_writer.add_scalar('Loss/mask', loss_dict['loss_mask'], n_iter / 100)
            log_writer.add_scalar('Loss/objectness', loss_dict['loss_objectness'], n_iter / 100)
            log_writer.add_scalar('Loss/rpn_box', loss_dict['loss_rpn_box_reg'], n_iter / 100)
def train_one_epoch(model, optimizer, lr_scheduler, data_loader, device, epoch,
                    print_freq, vis=None, checkpoint_fn=None, prob=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}'))
    metric_logger.add_meter(
        'clips/s', utils.SmoothedValue(window_size=10, fmt='{value:.3f}'))
    header = f'Epoch: [{epoch}]'

    # Initialise wandb
    if vis is not None:
        vis.wandb_init(model)

    for step, ((video, orig, orig_unnorm), sp_mask) in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        start_time = time.time()

        grid = np.random.choice([True, False], p=[prob, 1 - prob])
        if grid:
            video = video.to(device)
            output, loss, diagnostics = model(
                video, None, None, orig_unnorm=None) if not args.teacher_student else model(video)
        else:
            sp_mask = sp_mask.to(device)
            orig = orig.to(device)
            max_sp_num = len(torch.unique(sp_mask))
            output, loss, diagnostics = model(orig, sp_mask, max_sp_num, orig_unnorm=orig_unnorm)

        loss = loss.mean()

        # if vis is not None and np.random.random() < 0.01:
        if vis is not None:
            vis.log(dict(loss=loss.mean().item()))
            vis.log({k: v.mean().item() for k, v in diagnostics.items()})

        # NOTE Stochastic checkpointing has been retained
        if checkpoint_fn is not None and np.random.random() < 0.005:
            checkpoint_fn()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])
        metric_logger.meters['clips/s'].update(video.shape[0] / (time.time() - start_time))

        lr_scheduler.step()

        # Change Compactness During The Epoch
        # if step > len(data_loader)//2 and epoch < 15:
        #     compactness = data_loader.dataset.get_compactness()
        #     data_loader.dataset.set_compactness(compactness - 10)

    checkpoint_fn()
def train(batch_size, checkpoint_freq, num_epochs):
    num_classes = 2
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(
        pretrained=True, rpn_nms_thresh=1, rpn_pre_nms_top_n_train=5000)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, hidden_layer, num_classes)
    model = torch.nn.DataParallel(model)
    model.to('cuda')

    dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                              num_workers=4, collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size, shuffle=False,
                                                   num_workers=4, collate_fn=utils.collate_fn)

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    hook = smd.Hook.create_from_json_file()

    for epoch in range(num_epochs):
        hook.set_mode(modes.TRAIN)
        model.train()
        metric_logger = utils.MetricLogger(delimiter=" ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        # Keep the warmup scheduler separate so it does not overwrite the
        # per-epoch StepLR schedule after the first epoch.
        warmup_scheduler = None
        if epoch == 0:
            warmup_factor = 1. / 1000
            warmup_iters = min(1000, len(data_loader) - 1)
            warmup_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

        for iteration, (images, targets) in enumerate(data_loader):
            images = list(image.to('cuda') for image in images)
            targets = [{k: v.to('cuda') for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            if warmup_scheduler is not None:
                warmup_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if iteration % checkpoint_freq == 0:
                utils.save_on_master(
                    {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }, 'model_{}.pth'.format(epoch))

        lr_scheduler.step()
        hook.set_mode(modes.EVAL)
        evaluate(model, data_loader_test, device='cuda')
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, writer=None):
    count = 0
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        count += 1
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        # print(len(targets))

        # Skip batches that contain an image without any ground-truth boxes.
        flag = 0
        for i in range(len(targets)):
            if len(targets[i]['boxes']) == 0:
                flag = 1
                break
        if flag == 1:
            continue

        loss_dict = model(images, targets)
        # losses = sum(loss for loss in loss_dict.values())
        losses = 0
        for i in loss_dict:
            if i == 'loss_keypoint':
                losses += loss_dict[i] * 0.5
            else:
                losses += loss_dict[i]

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            # sys.exit(1)
            continue

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if writer and count % 100 == 0:
            writer.add_scalar('loss_box_reg', loss_dict_reduced['loss_box_reg'],
                              epoch * len(data_loader) + count)
            writer.add_scalar('loss_classifier', loss_dict_reduced['loss_classifier'],
                              epoch * len(data_loader) + count)
            writer.add_scalar('loss_mask', loss_dict_reduced['loss_mask'],
                              epoch * len(data_loader) + count)
            writer.add_scalar('loss_keypoint', loss_dict_reduced['loss_keypoint'],
                              epoch * len(data_loader) + count)
def train_one_epoch(model, optimizer, data_loader, device, epoch,
                    gradient_accumulation_steps, print_freq, box_threshold):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    optimizer.zero_grad()  # gradient_accumulation
    steps = 0  # gradient_accumulation
    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        # print("target: {}".format(targets))
        steps += 1  # gradient_accumulation
        images = list(image.to(device) for image in images)
        targets = [{
            k: v.to(device) if torch.is_tensor(v) else v
            for k, v in t.items()
        } for t in targets]

        if box_threshold is None:
            loss_dict = model(images, targets)
        else:
            loss_dict = model(images, box_threshold, targets)
        # print(loss_dict)
        losses = sum(loss / gradient_accumulation_steps
                     for loss in loss_dict.values())  # gradient_accumulation

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        # optimizer.zero_grad()
        losses.backward()
        # ofekp: we add grad clipping here to avoid instabilities in training
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0)

        # gradient_accumulation
        if steps % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
def train_one_epoch_FastRCNN(model, optimizer, data_loader, device, epoch, print_freq,
                             mode="sew6", encoder=None, train_encoder=False):
    # this data loader is the given loader
    # mode can be "sew6", "panorm", "autoencode"
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    # if epoch == 0:
    #     warmup_factor = 1. / 1000
    #     warmup_iters = min(1000, len(data_loader) - 1)
    #     lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    if mode == 'panorm':
        tt = transforms.Compose(
            [transforms.Resize((800, 800)),
             transforms.ToTensor(), normalize])

    # this is for 6 images combo
    for sample, old_targets, road_image, extra in metric_logger.log_every(
            data_loader, print_freq, header):
        # images = sample[0]
        targets = trans_target(old_targets)
        # print("images len {}, targets len {}".format(len(images), len(targets)))
        # print("len(sample) {}, sample[0] shape {}".format(len(sample), sample[0].shape))  # [6, 3, 256, 306]
        # images = list(image.to(device) for image in images)

        if mode == "panorm":
            images = [
                tt(s).to(device) for s in sew_images_panorm(sample, to_img=True)
            ]
        elif mode == "autoencode":
            encoder.cuda()
            samp_pan = sew_images_panorm(sample)  # convert to panoramic tensor
            samp_pan = [normalize(i) for i in samp_pan]
            samp_pan_t = torch.stack(samp_pan, dim=0)  # stack
            if train_encoder:
                images = encoder.return_image_tensor(
                    samp_pan_t.to(device), train_encoder
                )  # see if it will take it or it needs to take a list
            else:
                images = encoder.return_image_tensor(samp_pan_t.cuda(), train_encoder).to(device)
        else:  # mode is sew6
            images = [tt(sew_images(s)).to(device)
                      for s in sample]  # list of [3, 800, 800], should be 1 per patch

        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        # print(loss_dict)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
def train_one_epoch(model: torch.nn.Module, criterion: DistillationLoss,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, loss_scaler,
                    max_norm: float = 0, model_ema: Optional[ModelEma] = None,
                    mixup_fn: Optional[Mixup] = None, teacher=None,
                    set_training_mode=True):
    # TODO fix this for finetuning
    # model.train(set_training_mode)
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 100

    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
        samples = samples.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        if mixup_fn is not None:
            samples, targets = mixup_fn(samples, targets)
        samples, targets, mix_rate, aux_targets = two_mix(
            samples, targets, num_patch=samples.shape[-1] // 16)

        with torch.cuda.amp.autocast():
            # outputs, r_loss = model(samples)
            outputs, r_loss, s_loss, proj = model(samples, aux_targets)
            loss = torch.sum(-targets * (1e-8 + outputs.softmax(dim=-1)).log(), dim=-1).mean()

        loss_value = loss.item()
        loss += 1. * (r_loss + 1. * s_loss)

        if not math.isfinite(loss.item()):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        optimizer.zero_grad()

        # this attribute is added by timm on one optimizer (adahessian)
        is_second_order = hasattr(
            optimizer, 'is_second_order') and optimizer.is_second_order
        loss_scaler(loss, optimizer, clip_grad=max_norm,
                    parameters=model.parameters(), create_graph=is_second_order)

        torch.cuda.synchronize()
        if model_ema is not None:
            model_ema.update(model)

        metric_logger.update(loss=loss_value)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        metric_logger.meters['r'].update(r_loss.item(), n=targets.shape[0])
        # metric_logger.meters['p'].update(proj.item(), n=targets.shape[0])
        metric_logger.meters['s'].update(s_loss.item(), n=targets.shape[0])
        # metric_logger.meters['cos'].update(cos.item(), n=targets.shape[0])

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, writer, ckpt_path):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for batch_idx, (images, targets) in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        # Tensorboard logging for the current batch (the data loader is
        # iterated only once per epoch).
        global_step = epoch * len(data_loader) + batch_idx
        writer.add_scalar('Training Loss', loss_value, global_step)
        writer.add_scalar('loss_classifier',
                          loss_dict_reduced['loss_classifier'].item(), global_step)
        writer.add_scalar('loss_box_reg',
                          loss_dict_reduced['loss_box_reg'].item(), global_step)
        writer.add_scalar('loss_objectness',
                          loss_dict_reduced['loss_objectness'].item(), global_step)
        writer.add_scalar('loss_rpn_box_reg',
                          loss_dict_reduced['loss_rpn_box_reg'].item(), global_step)

        for name, param in model.named_parameters():
            if param.grad is not None:
                param_norm = param.grad.data.norm(2).cpu().item()
                writer.add_histogram(name + '_grad', param_norm, epoch)
            # else:
            #     print("{} has no grad".format(name))

        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

    # Save model
    print("Saving model at training epoch: {}".format(epoch + 1))
    ckpt_dict = {
        'epoch': epoch + 1,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(
        ckpt_dict,
        os.path.join(
            ckpt_path,
            'ckpt_epoch-' + str(epoch + 1) + 'loss' + str(loss_value) + '.pth'))
def train_one_epoch(model: torch.nn.Module, d_vae: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, loss_scaler,
                    max_norm: float = 0, log_writer=None, lr_scheduler=None,
                    start_steps=None, lr_schedule_values=None,
                    wd_schedule_values=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    metric_logger.add_meter(
        'min_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 10

    for step, (batch, _) in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        # assign learning rate & weight decay for each step
        it = start_steps + step  # global training iteration
        if lr_schedule_values is not None or wd_schedule_values is not None:
            for i, param_group in enumerate(optimizer.param_groups):
                if lr_schedule_values is not None:
                    param_group["lr"] = lr_schedule_values[it] * param_group["lr_scale"]
                if wd_schedule_values is not None and param_group["weight_decay"] > 0:
                    param_group["weight_decay"] = wd_schedule_values[it]

        samples, images, bool_masked_pos = batch
        images = images.to(device, non_blocking=True)
        samples = samples.to(device, non_blocking=True)
        bool_masked_pos = bool_masked_pos.to(device, non_blocking=True)

        with torch.no_grad():
            input_ids = d_vae.get_codebook_indices(images).flatten(1)
            bool_masked_pos = bool_masked_pos.flatten(1).to(torch.bool)
            labels = input_ids[bool_masked_pos]

        with torch.cuda.amp.autocast():
            outputs = model(samples, bool_masked_pos=bool_masked_pos, return_all_tokens=False)
            loss = nn.CrossEntropyLoss()(input=outputs, target=labels)

        loss_value = loss.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        optimizer.zero_grad()
        # this attribute is added by timm on one optimizer (adahessian)
        is_second_order = hasattr(
            optimizer, 'is_second_order') and optimizer.is_second_order
        grad_norm = loss_scaler(loss, optimizer, clip_grad=max_norm,
                                parameters=model.parameters(),
                                create_graph=is_second_order)
        loss_scale_value = loss_scaler.state_dict()["scale"]

        torch.cuda.synchronize()

        mlm_acc = (outputs.max(-1)[1] == labels).float().mean().item()
        metric_logger.update(mlm_acc=mlm_acc)
        if log_writer is not None:
            log_writer.update(mlm_acc=mlm_acc, head="loss")

        metric_logger.update(loss=loss_value)
        metric_logger.update(loss_scale=loss_scale_value)
        min_lr = 10.
        max_lr = 0.
        for group in optimizer.param_groups:
            min_lr = min(min_lr, group["lr"])
            max_lr = max(max_lr, group["lr"])

        metric_logger.update(lr=max_lr)
        metric_logger.update(min_lr=min_lr)
        weight_decay_value = None
        for group in optimizer.param_groups:
            if group["weight_decay"] > 0:
                weight_decay_value = group["weight_decay"]
        metric_logger.update(weight_decay=weight_decay_value)
        metric_logger.update(grad_norm=grad_norm)

        if log_writer is not None:
            log_writer.update(loss=loss_value, head="loss")
            log_writer.update(loss_scale=loss_scale_value, head="opt")
            log_writer.update(lr=max_lr, head="opt")
            log_writer.update(min_lr=min_lr, head="opt")
            log_writer.update(weight_decay=weight_decay_value, head="opt")
            log_writer.update(grad_norm=grad_norm, head="opt")
            log_writer.set_step()

        if lr_scheduler is not None:
            lr_scheduler.step_update(start_steps + step)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, loss_scaler,
                    max_norm: float = 0, model_ema: Optional[ModelEma] = None,
                    mixup_fn: Optional[Mixup] = None, amp: bool = True,
                    teacher_model: torch.nn.Module = None,
                    teach_loss: torch.nn.Module = None,
                    distill_token: bool = False, choices=None, mode='super',
                    retrain_config=None):
    model.train()
    criterion.train()

    # set random seed
    random.seed(epoch)

    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 10

    if mode == 'retrain':
        config = retrain_config
        model_module = unwrap_model(model)
        print(config)
        model_module.set_sample_config(config=config)
        print(model_module.get_sampled_params_numel(config))

    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
        samples = samples.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        # sample random config
        if mode == 'super':
            config = sample_configs(choices=choices)
            model_module = unwrap_model(model)
            model_module.set_sample_config(config=config)
        elif mode == 'retrain':
            config = retrain_config
            model_module = unwrap_model(model)
            model_module.set_sample_config(config=config)

        if mixup_fn is not None:
            samples, targets = mixup_fn(samples, targets)

        if amp:
            with torch.cuda.amp.autocast():
                if teacher_model:
                    with torch.no_grad():
                        teach_output = teacher_model(samples)
                    _, teacher_label = teach_output.topk(1, 1, True, True)
                    if distill_token:
                        output_cls, output_dis = model(samples)
                        loss = 1 / 2 * criterion(output_cls, targets) + \
                            1 / 2 * teach_loss(output_dis, teacher_label.squeeze())
                    else:
                        outputs = model(samples)
                        loss = 1 / 2 * criterion(outputs, targets) + \
                            1 / 2 * teach_loss(outputs, teacher_label.squeeze())
                else:
                    outputs = model(samples)
                    loss = criterion(outputs, targets)
        else:
            outputs = model(samples)
            if teacher_model:
                with torch.no_grad():
                    teach_output = teacher_model(samples)
                _, teacher_label = teach_output.topk(1, 1, True, True)
                loss = 1 / 2 * criterion(outputs, targets) + \
                    1 / 2 * teach_loss(outputs, teacher_label.squeeze())
            else:
                loss = criterion(outputs, targets)

        loss_value = loss.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        optimizer.zero_grad()

        # this attribute is added by timm on one optimizer (adahessian)
        if amp:
            is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order
            loss_scaler(loss, optimizer, clip_grad=max_norm,
                        parameters=model.parameters(), create_graph=is_second_order)
        else:
            loss.backward()
            optimizer.step()

        torch.cuda.synchronize()
        if model_ema is not None:
            model_ema.update(model)

        metric_logger.update(loss=loss_value)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
def train_one_epoch(self, lr_schedule='cyclic'):
    self.model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(self.epoch)

    lr_scheduler = None
    if self.epoch == 0:
        if lr_schedule == 'warmup':
            warmup_factor = 1. / 1000
            warmup_iters = min(1000, len(self.data_loader) - 1)
            lr_scheduler = utils.warmup_lr_scheduler(self.optimizer, warmup_iters, warmup_factor)
        elif lr_schedule == 'cyclic':
            lr_scheduler = torch.optim.lr_scheduler.CyclicLR(self.optimizer, 1e-6, 1e-2)

    for iteration, (images, targets) in enumerate(
            metric_logger.log_every(self.data_loader, self.print_freq, header)):
        with torch.autograd.detect_anomaly():
            images = list(image.to(self.device) for image in images)
            targets = [{k: v.to(self.device) for k, v in t.items()} for t in targets]

            loss_dict = self.model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            if self.emergency:
                if not math.isfinite(loss_value):
                    print()
                    print("Loss is {}, stopping training".format(loss_value))
                    print(loss_dict_reduced)
                    sys.exit(1)

            self.optimizer.zero_grad()
            losses.backward()
            grad_norm = clip_grad_norm_(self.model.parameters(), grad_clip_norm_value)
            self.optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=self.optimizer.param_groups[0]["lr"])

        if self.logger is not None:
            if iteration % 50 == 0:
                # 1. Log scalar values (scalar summary)
                info = {'loss': losses_reduced, **loss_dict_reduced}
                for tag, value in info.items():
                    self.logger.scalar_summary(tag, value, iteration + 1)

                # 2. Log values and gradients of the parameters (histogram summary)
                for tag, value in self.model.named_parameters():
                    tag = tag.replace('.', '/')
                    self.logger.histo_summary(tag, value.data.cpu().numpy(), iteration + 1)

    self.epoch += 1
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq): model.train() metric_logger = utils.MetricLogger(delimiter=" ") metric_logger.add_meter( 'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) header = 'Epoch: [{}]'.format(epoch + 1) lr_scheduler = None if epoch == 0: warmup_factor = 1. / 1000 warmup_iters = min(1000, len(data_loader) - 1) lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor) info_dict = { "lr": [], "loss_comb": [], "loss_classifier": [], "loss_box_reg": [], "loss_objectness": [], "loss_rpn_box_reg": [] } for images, targets in metric_logger.log_every(data_loader, print_freq, header): images = list(image.to(device) for image in images) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = utils.reduce_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) loss_value = losses_reduced.item() if not math.isfinite(loss_value): print("Loss is {}, stopping training".format(loss_value)) print(loss_dict_reduced) sys.exit(1) optimizer.zero_grad() losses.backward() optimizer.step() if lr_scheduler is not None: lr_scheduler.step() metric_logger.update(loss=losses_reduced, **loss_dict_reduced) metric_logger.update(lr=optimizer.param_groups[0]["lr"]) info_dict["loss_comb"].append(loss_value) for k in loss_dict_reduced.keys(): info_dict[k].append(loss_dict_reduced[k].item()) info_dict["lr"].append(optimizer.param_groups[0]["lr"]) info_dict["total_loss"] = sum(info_dict["loss_comb"]) / len( info_dict["loss_comb"]) return info_dict
def train_SSL(model: torch.nn.Module, criterion, data_loader: Iterable,
              optimizer: torch.optim.Optimizer, device: torch.device,
              epoch: int, loss_scaler, max_norm: float = 0,
              model_ema: Optional[ModelEma] = None,
              mixup_fn: Optional[Mixup] = None):
    model.train(True)
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 50

    i = 0
    for imgs1, rots1, imgs2, rots2 in metric_logger.log_every(
            data_loader, print_freq, header):
        imgs1 = imgs1.to(device, non_blocking=True)
        imgs1_aug = distortImages(imgs1)  # Apply distortion
        rots1 = rots1.to(device, non_blocking=True)

        imgs2 = imgs2.to(device, non_blocking=True)
        imgs2_aug = distortImages(imgs2)
        rots2 = rots2.to(device, non_blocking=True)

        with torch.cuda.amp.autocast():
            rot1_p, contrastive1_p, imgs1_recon, r_w, cn_w, rec_w = model(imgs1_aug)
            rot2_p, contrastive2_p, imgs2_recon, _, _, _ = model(imgs2_aug)

            rot_p = torch.cat([rot1_p, rot2_p], dim=0)
            rots = torch.cat([rots1, rots2], dim=0)
            imgs_recon = torch.cat([imgs1_recon, imgs2_recon], dim=0)
            imgs = torch.cat([imgs1, imgs2], dim=0)

            loss, (loss1, loss2, loss3) = criterion(rot_p, rots, contrastive1_p,
                                                    contrastive2_p, imgs_recon,
                                                    imgs, r_w, cn_w, rec_w)

        loss_value = loss.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        optimizer.zero_grad()

        # this attribute is added by timm on one optimizer (adahessian)
        is_second_order = hasattr(
            optimizer, 'is_second_order') and optimizer.is_second_order
        loss_scaler(loss, optimizer, clip_grad=max_norm,
                    parameters=model.parameters(), create_graph=is_second_order)

        torch.cuda.synchronize()
        if model_ema is not None:
            model_ema.update(model)

        metric_logger.update(loss=loss_value)
        metric_logger.update(RotationLoss=loss1.data.item())
        metric_logger.update(RotationScalar=r_w.data.item())
        metric_logger.update(ContrastiveLoss=loss2.data.item())
        metric_logger.update(ContrastiveScalar=cn_w.data.item())
        metric_logger.update(ReconstructionLoss=loss3.data.item())
        metric_logger.update(ReconstructionScalar=rec_w.data.item())
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        i = i + 1

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch,
                    print_freq, apex=False):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ", device=device)
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}'))
    metric_logger.add_meter('img/s', utils.SmoothedValue(window_size=10, fmt='{value}'))

    header = 'Epoch: [{}]'.format(epoch)
    step_count = 0
    last_print_time = time.time()

    for image, target in metric_logger.log_every(data_loader, print_freq, header):
        image, target = image.to(device, non_blocking=True), target.to(device, non_blocking=True)
        dl_ex_start_time = time.time()

        if args.channels_last:
            image = image.contiguous(memory_format=torch.channels_last)

        if args.run_lazy_mode:
            # This mark_step is added so that the lazy kernel can
            # create and evaluate the graph to infer the resulting tensor
            # as channels_last
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        output = model(image)
        loss = criterion(output, target)
        optimizer.zero_grad()

        # We see the performance gain of mobilenet by adding this mark_step.
        if (args.run_lazy_mode and 'mobilenet_v2' in args.model):
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        if apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if args.run_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        optimizer.step()

        if args.run_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        if step_count % print_freq == 0:
            output_cpu = output.detach().to('cpu')
            acc1, acc5 = utils.accuracy(output_cpu, target, topk=(1, 5))
            batch_size = image.shape[0]
            metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])
            metric_logger.meters['acc1'].update(acc1.item(), n=batch_size * print_freq)
            metric_logger.meters['acc5'].update(acc5.item(), n=batch_size * print_freq)
            current_time = time.time()
            last_print_time = dl_ex_start_time if args.dl_time_exclude else last_print_time
            metric_logger.meters['img/s'].update(
                batch_size * print_freq / (current_time - last_print_time))
            last_print_time = time.time()

        step_count = step_count + 1
        if step_count >= args.num_train_steps:
            break
def train_one_epoch(model, arch, optimizer, lr_scheduler, data_loader, device,
                    epoch, print_freq, ngpus_per_node, model_without_ddp, args):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    # header = "Epoch: [{}]".format(epoch)

    for images, targets in metric_logger.log_every(
            iterable=data_loader,
            print_freq=print_freq,
            # header=header,
            iter_num=args.iter_num):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        """
        [{"boxes": tensor([], device="cuda:0"),
          "labels": tensor([], device="cuda:0", dtype=torch.int64),
          "masks": tensor([], device="cuda:0", dtype=torch.uint8),
          "iscrowd": tensor([], device="cuda:0", dtype=torch.int64)}]
        """
        try:
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                logger.fatal("Loss is {}, stopping training".format(loss_value))
                logger.fatal(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        except Exception as e:
            logger.warning(e, exc_info=True)
            # logger.info("print target for debug")
            # print(targets)

        args.iter_num += 1

        # save checkpoint here
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            if args.iter_num % 1000 == 0:
                utils.save_on_master({
                    "model": model_without_ddp.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "lr_scheduler": lr_scheduler.state_dict(),
                    "epoch": epoch,
                    "iter_num": args.iter_num,
                    "args": args,
                }, "{}/{}_{}.pth".format(checkpoint_dir, arch.__name__, args.iter_num))

                os.makedirs("{}/debug_image/".format(checkpoint_dir), exist_ok=True)
                if args.iter_num < 5000:
                    continue

                model.eval()
                # from barez import overlay_ann
                debug_image = None
                debug_image_list = []
                cnt = 0
                for image_path in glob.glob("./table_test/*"):
                    cnt += 1
                    image_name = os.path.basename(image_path)
                    # print(image_name)
                    image = cv2.imread(image_path)
                    rat = 1300 / image.shape[0]
                    image = cv2.resize(image, None, fx=rat, fy=rat)

                    transform = transforms.Compose([transforms.ToTensor()])
                    image = transform(image)

                    # put the model in evaluation mode
                    with torch.no_grad():
                        tensor = [image.to(device)]
                        prediction = model(tensor)

                    image = torch.squeeze(image, 0).permute(1, 2, 0).mul(255).numpy().astype(np.uint8)

                    for pred in prediction:
                        for idx, mask in enumerate(pred['masks']):
                            if pred['scores'][idx].item() < 0.5:
                                continue
                            m = mask[0].mul(255).byte().cpu().numpy()
                            box = list(map(int, pred["boxes"][idx].tolist()))
                            score = pred["scores"][idx].item()
                            # image = overlay_ann(image, m, box, "", score)

                    # if debug_image is None:
                    #     debug_image = image
                    # else:
                    #     debug_image = np.concatenate((debug_image, image), axis=1)
                    # if cnt == 10:
                    #     cnt = 0
                    #     debug_image_list.append(debug_image)
                    #     debug_image = None

                avg_length = np.mean([i.shape[1] for i in debug_image_list])
                di = None
                for debug_image in debug_image_list:
                    rat = avg_length / debug_image.shape[1]
                    debug_image = cv2.resize(debug_image, None, fx=rat, fy=rat)
                    if di is None:
                        di = debug_image
                    else:
                        di = np.concatenate((di, debug_image), axis=0)

                di = cv2.resize(di, None, fx=0.4, fy=0.4)
                cv2.imwrite("{}/debug_image/{}.jpg".format(checkpoint_dir, args.iter_num), di)

                model.train()

        # hard stop
        if args.iter_num == 50000:
            logger.info("ITER NUM == 50k, training successfully!")
            raise SystemExit
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, tb_writer):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        "lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = "Epoch: [{}]".format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        try:
            targets = [{k: v.to(device) for k, v in t.items()}
                       for t in targets if t["boxes"].shape[0] > 0]
            images = list(
                image.to(device)
                for image, t in zip(images, targets) if t["boxes"].shape[0] > 0)
        except:
            print("neeeee")
            # breakpoint()

        try:
            loss_dict = model(images, targets)
        except:
            print("daaaaa")
            # breakpoint()

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced,
                             epoch=tb_writer["step"],
                             tb_writer=tb_writer["writer"],
                             **loss_dict_reduced)
        metric_logger.update(
            lr=optimizer.param_groups[0]["lr"],
            epoch=tb_writer["step"],
            tb_writer=tb_writer["writer"],
        )
        tb_writer["step"] += 1
fine_train_dataset = load_dataset(train_imgs, fine_tr, bs)
coarse_train_dataset = load_dataset(train_imgs, coarse_tr, bs)

for e in range(epochs):
    print('Starting training epoch %g...' % e)

    fine_train_loader = load_dataloader(bs, fine_train_dataset)
    coarse_train_loader = load_dataloader(bs, coarse_train_dataset)

    fine_train_nb = len(fine_train_loader)
    coarse_train_nb = len(coarse_train_loader)
    assert fine_train_nb == coarse_train_nb, 'fine & coarse train batch number is not matched'
    nb = fine_train_nb

    # Logger
    fine_metric_logger = utils.MetricLogger(delimiter=" ")
    fine_metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    coarse_metric_logger = utils.MetricLogger(delimiter=" ")
    coarse_metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    fine_header = 'Fine Epoch: [{}]'.format(e)
    coarse_header = 'Coarse Epoch: [{}]'.format(e)

    # warmup
    fine_lr_scheduler = None
    coarse_lr_scheduler = None
    if e == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, fine_train_nb - 1)
        fine_lr_scheduler = utils.warmup_lr_scheduler(fine_optim, warmup_iters, warmup_factor)
        coarse_lr_scheduler = utils.warmup_lr_scheduler(coarse_optim, warmup_iters, warmup_factor)

    for i, (fine_train, coarse_train) in enumerate(zip(fine_train_loader, coarse_train_loader)):
def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled_lr):
    """Train and evaluate the model

    Args:
        model (dlrm):
        loss_fn (torch.nn.Module): Loss function
        optimizer (torch.nn.optim):
        data_loader_train (torch.utils.data.DataLoader):
        data_loader_test (torch.utils.data.DataLoader):
        scaled_lr (float)
    """
    # Print per 16384 * 2000 samples by default
    default_print_freq = 16384 * 2000 // FLAGS.batch_size
    print_freq = default_print_freq if FLAGS.print_freq is None else FLAGS.print_freq

    steps_per_epoch = len(data_loader_train)
    # MLPerf requires 20 tests per epoch
    test_freq = FLAGS.test_freq if FLAGS.test_freq is not None else steps_per_epoch // 20

    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'loss', utils.SmoothedValue(window_size=print_freq, fmt='{avg:.4f}'))
    metric_logger.add_meter(
        'step_time', utils.SmoothedValue(window_size=1, fmt='{avg:.4f}'))
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.4f}'))

    lr_scheduler = utils.LearningRateScheduler(
        optimizers=[optimizer],
        base_lrs=[scaled_lr],
        warmup_steps=FLAGS.warmup_steps,
        warmup_factor=FLAGS.warmup_factor,
        decay_start_step=FLAGS.decay_start_step,
        decay_steps=FLAGS.decay_steps,
        decay_power=FLAGS.decay_power,
        end_lr_factor=FLAGS.decay_end_lr / FLAGS.lr)

    step = 0
    start_time = time()
    stop_time = time()

    for epoch in range(FLAGS.epochs):
        epoch_start_time = time()
        for numerical_features, categorical_features, click in data_loader_train:
            global_step = steps_per_epoch * epoch + step
            lr_scheduler.step()

            output = model(numerical_features, categorical_features).squeeze()
            loss = loss_fn(output, click)

            optimizer.zero_grad()
            if FLAGS.fp16:
                loss *= FLAGS.loss_scale
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            # Cancel loss scale for logging if fp16 is used
            metric_logger.update(
                loss=loss.item() / (FLAGS.loss_scale if FLAGS.fp16 else 1),
                lr=optimizer.param_groups[0]["lr"] * (FLAGS.loss_scale if FLAGS.fp16 else 1))

            if step % print_freq == 0:
                # Averaging across a print_freq period to reduce the error.
                # An accurate timing needs synchronize which would slow things down.
                metric_logger.update(step_time=(time() - stop_time) / print_freq)
                stop_time = time()
                eta_str = datetime.timedelta(
                    seconds=int(metric_logger.step_time.global_avg * (steps_per_epoch - step)))
                metric_logger.print(
                    header=F"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}] eta: {eta_str}")

            if global_step % test_freq == 0 and global_step > 0 and global_step / steps_per_epoch >= FLAGS.test_after:
                loss, auc = evaluate(model, loss_fn, data_loader_test)
                print(F"Epoch {epoch} step {step}. Test loss {loss:.4f}, auc {auc:.6f}")
                stop_time = time()

                if auc >= FLAGS.auc_threshold:
                    run_time_s = int(stop_time - start_time)
                    print(
                        F"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
                        F"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
                        F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
                    return

            step += 1

        epoch_stop_time = time()
        epoch_time_s = epoch_stop_time - epoch_start_time
        print(
            F"Finished epoch {epoch} in {datetime.timedelta(seconds=int(epoch_time_s))}. "
            F"Average speed {steps_per_epoch * FLAGS.batch_size / epoch_time_s:.1f} records/s.")
def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, device,
                    epoch, args, print_freq, logger, iterations, bert_model, baseline_model):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}'))
    header = 'Epoch: [{}]'.format(epoch)
    train_loss = 0
    total_its = 0
    train_emb_loss = 0
    train_seg_loss = 0

    for data in metric_logger.log_every(data_loader, print_freq, header):
        total_its += 1
        image, target, sentences, attentions = data
        image, target, sentences, attentions = image.to(device), target.to(device), \
            sentences.to(device), attentions.to(device)

        sentences = sentences.squeeze(1)
        attentions = attentions.squeeze(1)

        if args.baseline_bilstm:
            num_tokens = torch.sum(attentions, dim=-1)
            unbinded_sequences = list(torch.unbind(sentences, dim=0))
            processed_seqs = [
                seq[:num_tokens[i], :] for i, seq in enumerate(unbinded_sequences)
            ]
            packed_sentences = torch.nn.utils.rnn.pack_sequence(processed_seqs, enforce_sorted=False)
            hidden_states, cell_states = baseline_model[0](packed_sentences)
            hidden_states = torch.nn.utils.rnn.pad_packed_sequence(
                hidden_states, batch_first=True, total_length=20)
            hidden_states = hidden_states[0]
            unbinded_hidden_states = list(torch.unbind(hidden_states, dim=0))
            processed_hidden_states = [
                seq[:num_tokens[i], :] for i, seq in enumerate(unbinded_hidden_states)
            ]
            mean_hidden_states = [
                torch.mean(seq, dim=0).unsqueeze(0) for seq in processed_hidden_states
            ]
            last_hidden_states = torch.cat(mean_hidden_states, dim=0)
            last_hidden_states = baseline_model[1](last_hidden_states)
            last_hidden_states = last_hidden_states.unsqueeze(1)
        else:
            last_hidden_states = bert_model(sentences, attention_mask=attentions)[0]

        embedding = last_hidden_states[:, 0, :]
        output, vis_emb, lan_emb = model(image, embedding.squeeze(1))

        loss = criterion(output, target, args)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if args.linear_lr:
            adjust_learning_rate(optimizer, epoch, args)
        else:
            lr_scheduler.step()

        train_loss += loss.item()
        iterations += 1
        metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])

        del image, target, sentences, attentions, loss, embedding, output, \
            vis_emb, lan_emb, last_hidden_states, data

        gc.collect()
        torch.cuda.empty_cache()

    train_loss = train_loss / total_its
    logger.scalar_summary('loss', train_loss, epoch)
    logger.scalar_summary('lr', optimizer.param_groups[0]["lr"], epoch)
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq): model.train() metric_logger = utils.MetricLogger(delimiter=" ") metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) header = 'Epoch: [{}]'.format(epoch) lr_scheduler = None if epoch == 0: warmup_factor = 1. / 1000 warmup_iters = min(1000, len(data_loader) - 1) lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor) for i in metric_logger.log_every(data_loader, print_freq, header): try: images, targets = i '''Burası değiştirilecek''' targets["boxes"] = targets["boxes"].to(device) targets["labels"] = targets["labels"].to(device) targets["boxes"].squeeze_() targets["labels"].squeeze_() targets1 = [{k: v for k, v in targets.items()}] images = images.to(device) targets = targets1 # zero the parameter gradients # forward # track history if only in train #images = list(image.to(device) for image in images) #targets = [{k: v.to(device) for k, v in t.items()} for t in targets] loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = utils.reduce_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) loss_value = losses_reduced.item() #print(targets[0]["boxes"]) if not math.isfinite(loss_value): print(images.size()) print(targets[0]["boxes"]) print("Loss is {}, stopping training".format(loss_value)) print(loss_dict_reduced) sys.exit(1) optimizer.zero_grad() losses.backward() optimizer.step() if lr_scheduler is not None: lr_scheduler.step() metric_logger.update(loss=losses_reduced, **loss_dict_reduced) metric_logger.update(lr=optimizer.param_groups[0]["lr"]) except ValueError: continue return metric_logger
def my_train_one_epoch(model: torch.nn.Module, criterion: DistillationLoss,
                       data_loader: Iterable, optimizer: torch.optim.Optimizer,
                       device: torch.device, epoch: int, loss_scaler,
                       max_norm: float = 0, model_ema: Optional[ModelEma] = None,
                       mixup_fn: Optional[Mixup] = None, set_training_mode=True,
                       fp32=False):
    model.train(set_training_mode)
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 10

    # prefetcher = data_prefetcher(data_loader, device, prefetch=True)
    prefetcher = DataPrefetcher(data_loader)
    samples, targets = prefetcher.next()

    for _ in metric_logger.log_every(range(len(data_loader)), print_freq, header):
        samples = samples.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        if mixup_fn is not None:
            samples, targets = mixup_fn(samples, targets)

        # with torch.cuda.amp.autocast():
        #     outputs = model(samples)
        #     loss = criterion(samples, outputs, targets)
        with torch.cuda.amp.autocast(enabled=not fp32):
            outputs = model(samples)
            loss = criterion(samples, outputs, targets)

        loss_value = loss.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        optimizer.zero_grad()

        # this attribute is added by timm on one optimizer (adahessian)
        is_second_order = hasattr(
            optimizer, 'is_second_order') and optimizer.is_second_order
        loss_scaler(loss, optimizer, clip_grad=max_norm,
                    parameters=model.parameters(), create_graph=is_second_order)

        torch.cuda.synchronize()
        if model_ema is not None:
            model_ema.update(model)

        metric_logger.update(loss=loss_value)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        samples, targets = prefetcher.next()

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, loss_scaler,
                    max_norm: float = 0, model_ema: Optional[ModelEma] = None,
                    mixup_fn: Optional[Mixup] = None):
    # TODO fix this for finetuning
    model.train()
    criterion.train()
    end = time.time()

    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 50

    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
        samples = samples.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        metric_logger.update(data_time=time.time() - end)

        if mixup_fn is not None:
            samples, targets = mixup_fn(samples, targets)

        end = time.time()
        with torch.cuda.amp.autocast():
            outputs = model(samples)
            loss = criterion(outputs, targets)

        loss_value = loss.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        optimizer.zero_grad()

        # this attribute is added by timm on one optimizer (adahessian)
        is_second_order = hasattr(
            optimizer, 'is_second_order') and optimizer.is_second_order
        loss_scaler(loss, optimizer, clip_grad=max_norm,
                    parameters=model.parameters(), create_graph=is_second_order)

        batch_time = time.time() - end
        torch.cuda.synchronize()
        if model_ema is not None:
            model_ema.update(model)

        metric_logger.update(batch_time=batch_time)
        metric_logger.update(throughput=samples.size(0) / batch_time)
        metric_logger.update(loss=loss_value)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, mode='normal'):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    if mode == 'postFusion':
        for images, motion, targets in metric_logger.log_every(data_loader, print_freq, header):
            images = list(image.to(device) for image in images)
            motion = list(m.to(device) for m in motion)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model([images, motion], targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])
    else:
        for images, targets in metric_logger.log_every(data_loader, print_freq, header):
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq): model.train() metric_logger = utils.MetricLogger(delimiter=" ") metric_logger.add_meter( 'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) header = 'Epoch: [{}]'.format(epoch) lr_scheduler = None if epoch == 0: warmup_factor = 1. / 1000 warmup_iters = min(1000, len(data_loader) - 1) lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor) loss_plt = [] for images, ann in metric_logger.log_every(data_loader, print_freq, header): targets = [] for data1 in ann: #这个for循环可以舍去 boxes = [] target = {} labels = [] for d in data1: box = d['bbox'] box = [box[0], box[1], box[0] + box[2], box[1] + box[3]] boxes.append(box) labels.append(d['category_id']) # convert everything into a torch.Tensor boxes = torch.as_tensor(boxes, dtype=torch.float32) # there is only one class labels = torch.as_tensor(labels, dtype=torch.int64) image_id = torch.tensor([data1[0]['image_id']]) area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) #print(area) #return iscrowd = torch.zeros((len(data1), ), dtype=torch.int64) # suppose all instances are not crowd target["boxes"] = boxes target["labels"] = labels target["image_id"] = image_id target["area"] = area target["iscrowd"] = iscrowd targets.append(target) images = list(image.to(device) for image in images) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] #假设标签没有放大相应device上?? loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) loss_plt.append(losses) # reduce losses over all GPUs for logging purposes loss_dict_reduced = utils.reduce_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) loss_value = losses_reduced.item() if not math.isfinite(loss_value): print("Loss is {}, stopping training".format(loss_value)) print(loss_dict_reduced) sys.exit(1) optimizer.zero_grad() losses.backward() optimizer.step() if lr_scheduler is not None: lr_scheduler.step() metric_logger.update(loss=losses_reduced, **loss_dict_reduced) metric_logger.update(lr=optimizer.param_groups[0]["lr"]) #break return metric_logger, loss_plt
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, compression_scheduler=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    steps_per_epoch = len(data_loader)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for train_step, (images, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step, steps_per_epoch, optimizer)

        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        if compression_scheduler:
            losses = compression_scheduler.before_backward_pass(
                epoch, train_step, steps_per_epoch, losses, optimizer=optimizer)

        optimizer.zero_grad()
        losses.backward()

        if compression_scheduler:
            compression_scheduler.before_parameter_optimization(
                epoch, train_step, steps_per_epoch, optimizer)

        optimizer.step()

        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step, steps_per_epoch, optimizer)

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
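# --- Usage sketch (assumption: Distiller-style scheduler) -------------------------
# The per-minibatch hooks above appear to follow the interface of Distiller's
# CompressionScheduler. A plausible outer loop wrapping this variant, offered as
# an illustration only (function name and hook usage are assumptions):
def run_compressed_training(model, optimizer, data_loader, device, num_epochs,
                            compression_scheduler=None, print_freq=20):
    for epoch in range(num_epochs):
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq,
                        compression_scheduler=compression_scheduler)
        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)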
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        # .to(device) for both the images and the target tensors
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        '''
        During training, the model expects both the input tensors and targets (a list
        of dictionaries), containing:
            - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in
              ``[x1, y1, x2, y2]`` format, with ``0 <= x1 < x2 <= W`` and
              ``0 <= y1 < y2 <= H``
            - labels (``Int64Tensor[N]``): the class label for each ground-truth box
            - masks (``UInt8Tensor[N, H, W]``): the segmentation binary masks for each instance

        The model returns a ``Dict[Tensor]`` during training, containing the
        classification and regression losses for both the RPN and the R-CNN, and
        the mask loss.

        During inference, the model requires only the input tensors and returns the
        post-processed predictions as a ``List[Dict[Tensor]]``, one for each input
        image. The fields of the dict are as follows:
            - boxes (``FloatTensor[N, 4]``): the predicted boxes in
              ``[x1, y1, x2, y2]`` format, with ``0 <= x1 < x2 <= W`` and
              ``0 <= y1 < y2 <= H``
            - labels (``Int64Tensor[N]``): the predicted labels for each image
            - scores (``Tensor[N]``): the scores of each prediction
            - masks (``UInt8Tensor[N, 1, H, W]``): the predicted masks for each
              instance, in the 0-1 range. To obtain the final segmentation masks,
              the soft masks can be thresholded, generally with a value of 0.5
              (mask >= 0.5)
        '''
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
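# --- Inference sketch (illustrative) ----------------------------------------------
# Mirrors the docstring above: in eval mode the model takes only the images and
# returns one dict per image; binary instance masks are obtained by thresholding
# the soft masks, commonly at 0.5. The function name and threshold are assumptions.
import torch

@torch.no_grad()
def predict(model, images, device, mask_thresh=0.5):
    model.eval()
    outputs = model([img.to(device) for img in images])
    results = []
    for out in outputs:
        results.append({
            'boxes': out['boxes'].cpu(),
            'labels': out['labels'].cpu(),
            'scores': out['scores'].cpu(),
            'masks': (out['masks'] >= mask_thresh).squeeze(1).cpu(),
        })
    return results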
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        # keep an untouched copy of the targets for debugging/visualization
        ts = copy.deepcopy(targets)

        loss_dict = model(images, targets)
        print(loss_dict)  # debug: per-component losses for every iteration
        losses = sum(loss for loss in loss_dict.values())

        # (Commented-out debugging code originally lived here: when the total loss
        # exceeded 1, the batch image and its ground-truth boxes were plotted with
        # matplotlib/cv2 to inspect suspicious samples. See the standalone sketch
        # after this function.)

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            # visualize_bboxes(images, targets)
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
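# --- Debug-visualization sketch (illustrative) -------------------------------------
# A cleaned-up version of the commented-out plotting idea in the function above
# (and roughly what the visualize_bboxes call presumably does): draw one batch
# item's ground-truth boxes with matplotlib. Helper name and behaviour are assumptions.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def show_boxes(image, target):
    img = np.transpose(image.detach().cpu().numpy(), (1, 2, 0)).squeeze()
    _, ax = plt.subplots()
    ax.imshow(img)
    for box in target['boxes']:
        x1, y1, x2, y2 = (v.item() for v in box)
        ax.add_patch(patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                       fill=False, edgecolor='r'))
    plt.show()

# e.g. inside the loop above:
# if losses.item() > 1: show_boxes(images[0], ts[0])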