def modifier(args, epoch, model):
    if epoch == 0:
        # Phase 1: train the weights only; the subnetwork mask stays frozen.
        set_model_prune_rate(model, prune_rate=0.0)
        freeze_model_subnet(model)
        unfreeze_model_weights(model)
    elif epoch == 6:
        # Phase 2: freeze the weights and train the subnetwork mask instead,
        # checkpointing the weights-only model at the phase boundary.
        set_model_prune_rate(model, prune_rate=args.prune_rate)
        unfreeze_model_subnet(model)
        freeze_model_weights(model)
        save_checkpoint(
            {
                "epoch": epoch,
                "arch": args.arch,
                "state_dict": model.state_dict(),
                "best_acc1": 0.0,
                "best_acc5": 0.0,
                "best_train_acc1": 0.0,
                "best_train_acc5": 0.0,
                "curr_acc1": "Not evaluated",
            },
            False,
            filename=args.ckpt_base_dir / "epoch_2.state",  # hardcoded phase-2 name
            save=True,
        )
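# A hedged sketch of the freeze/unfreeze helpers used by modifier() above.
# Assumption: as in edge-popup-style subnetwork code, each pruned layer holds a
# learned mask tensor named "scores" alongside its ordinary weights; the real
# helpers in the repo may differ in detail.
def _set_requires_grad(model, substring, wanted, requires_grad):
    for name, p in model.named_parameters():
        # select mask params (wanted=True) or weight params (wanted=False)
        if (substring in name) == wanted:
            p.requires_grad = requires_grad

def freeze_model_weights(model):
    _set_requires_grad(model, "scores", wanted=False, requires_grad=False)

def unfreeze_model_weights(model):
    _set_requires_grad(model, "scores", wanted=False, requires_grad=True)

def freeze_model_subnet(model):
    _set_requires_grad(model, "scores", wanted=True, requires_grad=False)

def unfreeze_model_subnet(model):
    _set_requires_grad(model, "scores", wanted=True, requires_grad=True)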
def trn(cfg, model):
    cfg.logger.info(cfg)

    if cfg.seed is not None:
        random.seed(cfg.seed)
        torch.manual_seed(cfg.seed)
        torch.cuda.manual_seed(cfg.seed)
        torch.cuda.manual_seed_all(cfg.seed)

    train, validate = get_trainer(cfg)

    if cfg.gpu is not None:
        cfg.logger.info("Use GPU: {} for training".format(cfg.gpu))

    # Only the linear classifier head is optimized; the backbone stays fixed.
    linear_classifier_layer = model.module[1]
    optimizer = get_optimizer(cfg, linear_classifier_layer)

    cfg.logger.info(f"=> Getting {cfg.set} dataset")
    dataset = getattr(data, cfg.set)(cfg)

    lr_policy = get_policy(cfg.lr_policy)(optimizer, cfg)

    softmax_criterion = nn.CrossEntropyLoss().cuda()
    criterion = lambda output, target: softmax_criterion(output, target)

    # optionally resume from a checkpoint
    best_val_acc1 = 0.0
    best_val_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if cfg.resume:
        best_val_acc1 = resume(cfg, model, optimizer)

    run_base_dir, ckpt_base_dir, log_base_dir = path_utils.get_directories(cfg, cfg.gpu)
    cfg.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(
        1, [epoch_time, validation_time, train_time], cfg, prefix="Overall Timing")

    end_epoch = time.time()
    cfg.start_epoch = cfg.start_epoch or 0
    last_val_acc1 = None

    start_time = time.time()
    gpu_info = gpu_utils.GPU_Utils(gpu_index=cfg.gpu)

    # Start training
    for epoch in range(cfg.start_epoch, cfg.epochs):
        # Make sure the backbone is not updated: this weight sum should stay constant.
        cfg.logger.info('Model conv 1 {} at epoch {}'.format(
            torch.sum(model.module[0].conv1.weight), epoch))

        if cfg.world_size > 1:
            dataset.sampler.set_epoch(epoch)

        lr_policy(epoch, iteration=None)
        cur_lr = net_utils.get_lr(optimizer)

        start_train = time.time()
        train_acc1, train_acc5 = train(dataset.trn_loader, model, criterion,
                                       optimizer, epoch, cfg, writer=writer)
        train_time.update((time.time() - start_train) / 60)

        if (epoch + 1) % cfg.test_interval == 0:
            if cfg.gpu == cfg.base_gpu:
                # evaluate on validation set
                start_validation = time.time()
                last_val_acc1, last_val_acc5 = validate(
                    dataset.val_loader, model.module, criterion, cfg, writer, epoch)
                validation_time.update((time.time() - start_validation) / 60)

                # remember best acc@1 and save checkpoint
                is_best = last_val_acc1 > best_val_acc1
                best_val_acc1 = max(last_val_acc1, best_val_acc1)
                best_val_acc5 = max(last_val_acc5, best_val_acc5)
                best_train_acc1 = max(train_acc1, best_train_acc1)
                best_train_acc5 = max(train_acc5, best_train_acc5)

                save = (((epoch + 1) % cfg.save_every) == 0) and cfg.save_every > 0
                if save or epoch == cfg.epochs - 1:
                    if is_best:
                        cfg.logger.info(
                            f"==> best {last_val_acc1:.02f} saving at {ckpt_base_dir / 'model_best.pth'}"
                        )
                    net_utils.save_checkpoint(
                        {
                            "epoch": epoch + 1,
                            "arch": cfg.arch,
                            "state_dict": model.state_dict(),
                            "best_acc1": best_val_acc1,
                            "best_acc5": best_val_acc5,
                            "best_train_acc1": best_train_acc1,
                            "best_train_acc5": best_train_acc5,
                            "optimizer": optimizer.state_dict(),
                            "curr_acc1": last_val_acc1,
                            "curr_acc5": last_val_acc5,
                        },
                        is_best,
                        filename=ckpt_base_dir / f"epoch_{epoch}.state",
                        save=save or epoch == cfg.epochs - 1,
                    )

                elapsed_time = time.time() - start_time
                seconds_todo = (cfg.epochs - epoch) * (elapsed_time / cfg.test_interval)
                estimated_time_complete = timedelta(seconds=int(seconds_todo))
                start_time = time.time()
                cfg.logger.info(
                    f"==> ETA: {estimated_time_complete}\tGPU-M: {gpu_info.gpu_mem_usage()}\tGPU-U: {gpu_info.gpu_utilization()}"
                )

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer, prefix="diagnostics",
                                              global_step=epoch)
        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

        if cfg.world_size > 1:
            dist.barrier()
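# Minimal sketch of net_utils.get_lr as used above (assumption: it reads the
# learning rate off the optimizer's first param group).
def get_lr(optimizer):
    return optimizer.param_groups[0]["lr"]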
def main_worker(args):
    train, validate, modifier = get_trainer(args)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)
    model = set_gpu(args, model)
    wandb.watch(model)

    if args.pretrained:
        pretrained(args, model)

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        best_acc1 = resume(args, model, optimizer)

    # Data loading code
    if args.evaluate:
        acc1, acc5 = validate(data.val_loader, model, criterion, args,
                              writer=None, epoch=args.start_epoch)
        return

    # Set up directories
    run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)
    args.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(
        1, [epoch_time, validation_time, train_time], prefix="Overall Timing")

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    save_checkpoint(
        {
            "epoch": 0,
            "arch": args.arch,
            "state_dict": model.state_dict(),
            "best_acc1": best_acc1,
            "best_acc5": best_acc5,
            "best_train_acc1": best_train_acc1,
            "best_train_acc5": best_train_acc5,
            "optimizer": optimizer.state_dict(),
            "curr_acc1": acc1 if acc1 else "Not evaluated",
        },
        False,
        filename=ckpt_base_dir / "initial.state",
        save=False,
    )

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        lr_policy(epoch, iteration=None)
        modifier(args, epoch, model)
        cur_lr = get_lr(optimizer)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(data.train_loader, model, criterion,
                                       optimizer, epoch, args, writer=writer)
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()
        acc1, acc5 = validate(data.val_loader, model, criterion, args, writer, epoch)
        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = ((epoch % args.save_every) == 0) and args.save_every > 0
        if is_best or save or epoch == args.epochs - 1:
            if is_best:
                print(f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}")
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "best_acc5": best_acc5,
                    "best_train_acc1": best_train_acc1,
                    "best_train_acc5": best_train_acc5,
                    "optimizer": optimizer.state_dict(),
                    "curr_acc1": acc1,
                    "curr_acc5": acc5,
                },
                is_best,
                filename=ckpt_base_dir / f"epoch_{epoch}.state",
                save=save,
            )

        wandb.log({
            "curr_acc1": acc1,
            "curr_acc5": acc5,
        })

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer, prefix="diagnostics",
                                              global_step=epoch)

        if args.conv_type == "SampleSubnetConv":
            count = 0
            sum_pr = 0.0
            for n, m in model.named_modules():
                if isinstance(m, SampleSubnetConv):
                    # average prune rate across 10 sampled masks
                    pr = 0.0
                    for _ in range(10):
                        pr += ((torch.rand_like(m.clamped_scores) >= m.clamped_scores)
                               .float().mean().item())
                    pr /= 10.0
                    writer.add_scalar("pr/{}".format(n), pr, epoch)
                    sum_pr += pr
                    count += 1
            args.prune_rate = sum_pr / count
            writer.add_scalar("pr/average", args.prune_rate, epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

    write_result_to_csv(
        best_acc1=best_acc1,
        best_acc5=best_acc5,
        best_train_acc1=best_train_acc1,
        best_train_acc5=best_train_acc5,
        prune_rate=args.prune_rate,
        curr_acc1=acc1,
        curr_acc5=acc5,
        base_config=args.config,
        name=args.name,
    )
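# Hedged sketch of the LabelSmoothing criterion selected above when
# args.label_smoothing is set: the standard smoothed cross-entropy, mixing the
# one-hot NLL with a uniform label prior. The repo's exact implementation may differ.
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothing(nn.Module):
    def __init__(self, smoothing=0.0):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, x, target):
        logprobs = F.log_softmax(x, dim=-1)
        nll = -logprobs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
        uniform = -logprobs.mean(dim=-1)  # expected NLL under a uniform prior
        return ((1.0 - self.smoothing) * nll + self.smoothing * uniform).mean()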
def trn(cfg, model):
    cfg.logger.info(cfg)

    if cfg.seed is not None:
        random.seed(cfg.seed)
        torch.manual_seed(cfg.seed)
        torch.cuda.manual_seed(cfg.seed)
        torch.cuda.manual_seed_all(cfg.seed)

    train, validate_knn = get_trainer(cfg)

    if cfg.gpu is not None:
        cfg.logger.info("Use GPU: {} for training".format(cfg.gpu))

    # if cfg.pretrained:
    #     net_utils.load_pretrained(cfg.pretrained, cfg.multigpu[0], model)

    optimizer = get_optimizer(cfg, model)
    cfg.logger.info(f"=> Getting {cfg.set} dataset")
    dataset = getattr(data, cfg.set)(cfg)
    lr_policy = get_policy(cfg.lr_policy)(optimizer, cfg)

    if cfg.arch == 'SimSiam':
        # L = D(p1, z2) / 2 + D(p2, z1) / 2
        base_criterion = lambda bb1_z1_p1_emb, bb2_z2_p2_emb: \
            simsiam.SimSaimLoss(bb1_z1_p1_emb[2], bb2_z2_p2_emb[1]) / 2 + \
            simsiam.SimSaimLoss(bb2_z2_p2_emb[2], bb1_z1_p1_emb[1]) / 2
    elif cfg.arch == 'SimCLR':
        base_criterion = lambda z1, z2: simclr.NT_XentLoss(z1, z2)
    else:
        raise NotImplementedError

    run_base_dir, ckpt_base_dir, log_base_dir = path_utils.get_directories(cfg, cfg.gpu)
    _, zero_gpu_ckpt_base_dir, _ = path_utils.get_directories(cfg, 0)

    # Resume from the most recent checkpoint saved by rank 0, if any.
    saved_epochs = sorted(glob.glob(str(zero_gpu_ckpt_base_dir) + '/epoch_*.state'),
                          key=os.path.getmtime)
    # assert len(epochs) < 2, 'Should be only one saved epoch -- the last one'
    if len(saved_epochs) > 0:
        cfg.resume = saved_epochs[-1]
        resume(cfg, model, optimizer)

    cfg.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(
        1, [epoch_time, validation_time, train_time], cfg, prefix="Overall Timing")

    end_epoch = time.time()
    cfg.start_epoch = cfg.start_epoch or 0
    start_time = time.time()
    gpu_info = gpu_utils.GPU_Utils(gpu_index=cfg.gpu)

    cfg.logger.info('Start Training: Model conv 1 initialization {}'.format(
        torch.sum(model.module.backbone.conv1.weight)))

    for n, m in model.module.named_modules():
        if hasattr(m, "weight") and m.weight is not None:
            cfg.logger.info('{} ({}): {}'.format(n, type(m).__name__, m.weight.shape))

    criterion = base_criterion
    cfg.logger.info('Using Vanilla Criterion')

    # Start training
    for epoch in range(cfg.start_epoch, cfg.epochs):
        if cfg.world_size > 1:
            dataset.sampler.set_epoch(epoch)

        lr_policy(epoch, iteration=None)
        cur_lr = net_utils.get_lr(optimizer)

        start_train = time.time()
        train(dataset.trn_loader, model, criterion, optimizer, epoch, cfg, writer=writer)
        train_time.update((time.time() - start_train) / 60)

        if (epoch + 1) % cfg.test_interval == 0:
            if cfg.gpu == cfg.base_gpu:
                # evaluate on the validation set with a k-NN probe
                start_validation = time.time()
                acc = validate_knn(dataset.trn_loader, dataset.val_loader,
                                   model.module, cfg, writer, epoch)
                validation_time.update((time.time() - start_validation) / 60)
                csv_utils.write_generic_result_to_csv(
                    path=cfg.exp_dir,
                    name=os.path.basename(cfg.exp_dir[:-1]),
                    epoch=epoch, knn_acc=acc)

                save = (((epoch + 1) % cfg.save_every) == 0) and cfg.save_every > 0
                if save or epoch == cfg.epochs - 1:
                    # if is_best:
                    #     print(f"==> best {last_val_acc1:.02f} saving at {ckpt_base_dir / 'model_best.pth'}")
                    net_utils.save_checkpoint(
                        {
                            "epoch": epoch + 1,
                            "arch": cfg.arch,
                            "state_dict": model.state_dict(),
                            "ACC": acc,
                            "optimizer": optimizer.state_dict(),
                        },
                        is_best=False,
                        filename=ckpt_base_dir / f"epoch_{epoch:04d}.state",
                        save=save or epoch == cfg.epochs - 1,
                    )

                elapsed_time = time.time() - start_time
                seconds_todo = (cfg.epochs - epoch) * (elapsed_time / cfg.test_interval)
                estimated_time_complete = timedelta(seconds=int(seconds_todo))
                start_time = time.time()
                cfg.logger.info(
                    f"==> ETA: {estimated_time_complete}\tGPU-M: {gpu_info.gpu_mem_usage()}\tGPU-U: {gpu_info.gpu_utilization()}")

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer, prefix="diagnostics",
                                              global_step=epoch)
        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

        if cfg.world_size > 1:
            # cfg.logger.info('GPU {} going into the barrier'.format(cfg.gpu))
            dist.barrier()
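# Hedged sketch of simsiam.SimSaimLoss as consumed by the criterion above:
# SimSiam's D(p, z) is the negative cosine similarity between a prediction p
# and a projection z, with a stop-gradient on z. The exact signature in the
# repo is an assumption.
import torch.nn.functional as F

def SimSaimLoss(p, z):
    # z.detach() implements the stop-gradient that SimSiam relies on
    return -F.cosine_similarity(p, z.detach(), dim=-1).mean()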
# Fragment: tail of a faster-rcnn training script; the enclosing epoch/step
# loops and the disp_interval block are not part of this excerpt.
            logger.add_scalars("logs_s_{}/losses".format(args.session), info,
                               (epoch - 1) * iters_per_epoch + step)
        loss_temp = 0
        start = time.time()

    save_name = os.path.join(
        output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step))
    save_checkpoint(
        {
            'session': args.session,
            'epoch': epoch + 1,
            'model': fasterRCNN.module.state_dict() if args.mGPUs else fasterRCNN.state_dict(),
            'optimizer': optimizer.state_dict(),
            'pooling_mode': cfg.POOLING_MODE,
            'class_agnostic': args.class_agnostic,
        }, save_name)
    print('save model: {}'.format(save_name))

if args.use_tfboard:
    logger.close()
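# Minimal sketch of the two-argument save_checkpoint used in this script
# (assumption: a thin wrapper over torch.save, as in common faster-rcnn forks).
import torch

def save_checkpoint(state, filename):
    torch.save(state, filename)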
def train():
    args = parse_args()
    print('Called with args:')
    print(args)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    output_dir = args.save_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if args.target_only:
        source_train_dataset = TDETDataset(['voc07_trainval'], args.data_dir,
                                           args.prop_method, num_classes=20,
                                           prop_min_scale=args.prop_min_scale,
                                           prop_topk=args.num_prop)
    else:
        source_train_dataset = TDETDataset(['coco60_train2014', 'coco60_val2014'],
                                           args.data_dir, args.prop_method,
                                           num_classes=60,
                                           prop_min_scale=args.prop_min_scale,
                                           prop_topk=args.num_prop)
    target_val_dataset = TDETDataset(['voc07_test'], args.data_dir, args.prop_method,
                                     num_classes=20,
                                     prop_min_scale=args.prop_min_scale,
                                     prop_topk=args.num_prop)

    lr = args.lr

    if args.net == 'DC_VGG16_DET':
        base_model = DC_VGG16_CLS(None, 20 if args.target_only else 80, 3, 4)
        checkpoint = torch.load(args.pretrained_base_path)
        base_model.load_state_dict(checkpoint['model'])
        del checkpoint
        model = DC_VGG16_DET(base_model, args.pooling_method)
    optimizer = model.get_optimizer(args.lr)

    log_file_name = os.path.join(output_dir,
                                 'log_{}_{}.txt'.format(args.net, args.session))
    log_file = open(log_file_name, 'w')
    log_file.write(str(args))
    log_file.write('\n')

    model.to(device)
    model.train()

    source_loss_sum = 0
    source_pos_prop_sum = 0
    source_neg_prop_sum = 0
    start = time.time()
    optimizer.zero_grad()

    # Permutation is (re)drawn lazily so a nonstandard start_iter cannot index
    # an undefined permutation (same guard as the sibling scripts in this repo).
    source_rand_perm = None
    for step in range(args.start_iter, args.max_iter + 1):
        if source_rand_perm is None or step % len(source_train_dataset) == 1:
            source_rand_perm = np.random.permutation(len(source_train_dataset))
        source_index = source_rand_perm[step % len(source_train_dataset)]

        source_batch = source_train_dataset.get_data(
            source_index,
            h_flip=np.random.rand() > 0.5,
            target_im_size=np.random.choice([480, 576, 688, 864, 1200]))

        source_im_data = source_batch['im_data'].unsqueeze(0).to(device)
        source_proposals = source_batch['proposals']
        source_gt_boxes = source_batch['gt_boxes']
        if args.target_only:
            source_gt_labels = source_batch['gt_labels']
        else:
            source_gt_labels = source_batch['gt_labels'] + 20
        source_pos_cls = [i for i in range(80) if i in source_gt_labels]

        source_loss = 0
        for cls in np.random.choice(source_pos_cls, 2):
            indices = np.where(source_gt_labels.numpy() == cls)[0]
            here_gt_boxes = source_gt_boxes[indices]
            here_proposals, here_labels, _, pos_cnt, neg_cnt = sample_proposals(
                here_gt_boxes, source_proposals, args.bs // 2, args.pos_ratio)
            # plt.imshow(source_batch['raw_img'])
            # draw_box(here_proposals[:pos_cnt] / source_batch['im_scale'], 'black')
            # draw_box(here_proposals[pos_cnt:] / source_batch['im_scale'], 'yellow')
            # plt.show()
            here_proposals = here_proposals.to(device)
            here_labels = here_labels.to(device)

            here_loss = model(source_im_data, cls, here_proposals, here_labels)
            source_loss = source_loss + here_loss
            source_pos_prop_sum += pos_cnt
            source_neg_prop_sum += neg_cnt

        source_loss = source_loss / 2
        source_loss_sum += source_loss.item()
        source_loss.backward()

        clip_gradient(model, 10.0)
        optimizer.step()
        optimizer.zero_grad()

        if step % args.disp_interval == 0:
            end = time.time()
            source_loss_sum /= args.disp_interval
            source_pos_prop_sum /= args.disp_interval
            source_neg_prop_sum /= args.disp_interval
            log_message = "[%s][session %d][iter %4d] loss: %.4f, pos_prop: %.1f, neg_prop: %.1f, lr: %.2e, time: %.1f" % \
                (args.net, args.session, step, source_loss_sum,
                 source_pos_prop_sum, source_neg_prop_sum, lr, end - start)
            print(log_message)
            log_file.write(log_message + '\n')
            log_file.flush()
            source_loss_sum = 0
            source_pos_prop_sum = 0
            source_neg_prop_sum = 0
            start = time.time()

        if step in (args.max_iter * 4 // 7, args.max_iter * 6 // 7):
            adjust_learning_rate(optimizer, 0.1)
            lr *= 0.1

        if step % args.save_interval == 0 or step == args.max_iter:
            validate(model, target_val_dataset, args, device)
            save_name = os.path.join(output_dir,
                                     '{}_{}_{}.pth'.format(args.net, args.session, step))
            checkpoint = dict()
            checkpoint['net'] = args.net
            checkpoint['session'] = args.session
            checkpoint['pooling_method'] = args.pooling_method
            checkpoint['iterations'] = step
            checkpoint['model'] = model.state_dict()
            save_checkpoint(checkpoint, save_name)
            print('save model: {}'.format(save_name))

    log_file.close()
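# Hedged sketch of adjust_learning_rate as called above; the local bookkeeping
# `lr *= 0.1` suggests it scales every param group by the decay factor.
def adjust_learning_rate(optimizer, decay):
    for param_group in optimizer.param_groups:
        param_group['lr'] = decay * param_group['lr']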
def train():
    args = parse_args()
    print('Called with args:')
    print(args)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    output_dir = args.save_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    source_train_dataset = TDETDataset(['coco60_train2014', 'coco60_val2014'],
                                       args.data_dir, args.prop_method,
                                       num_classes=60,
                                       prop_min_scale=args.prop_min_scale,
                                       prop_topk=args.num_prop)
    target_train_dataset = TDETDataset(['voc07_trainval'], args.data_dir,
                                       args.prop_method, num_classes=20,
                                       prop_min_scale=args.prop_min_scale,
                                       prop_topk=args.num_prop)

    lr = args.lr

    if args.net == 'NEW_TDET':
        model = NEW_TDET(os.path.join(args.data_dir, 'pretrained_model/vgg16_caffe.pth'),
                         20, pooling_method=args.pooling_method,
                         share_level=args.share_level, mil_topk=args.mil_topk)
    else:
        raise Exception('network is not defined')
    optimizer = model.get_optimizer(args.lr)

    if args.resume:
        load_name = os.path.join(output_dir, '{}_{}_{}.pth'.format(
            args.net, args.checksession, args.checkiter))
        print("loading checkpoint %s" % load_name)
        checkpoint = torch.load(load_name)
        assert args.net == checkpoint['net']
        args.start_iter = checkpoint['iterations'] + 1
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr = optimizer.param_groups[0]['lr']
        print("loaded checkpoint %s" % load_name)

    log_file_name = os.path.join(output_dir,
                                 'log_{}_{}.txt'.format(args.net, args.session))
    if args.resume:
        log_file = open(log_file_name, 'a')
    else:
        log_file = open(log_file_name, 'w')
    log_file.write(str(args))
    log_file.write('\n')

    model.to(device)
    model.train()

    source_loss_sum = 0
    target_loss_sum = 0
    source_pos_prop_sum = 0
    source_neg_prop_sum = 0
    target_prop_sum = 0
    start = time.time()

    # Permutations are (re)drawn lazily so a resumed start_iter cannot index an
    # undefined permutation (same guard as the sibling scripts in this repo).
    source_rand_perm = None
    target_rand_perm = None
    for step in range(args.start_iter, args.max_iter + 1):
        if source_rand_perm is None or step % len(source_train_dataset) == 1:
            source_rand_perm = np.random.permutation(len(source_train_dataset))
        if target_rand_perm is None or step % len(target_train_dataset) == 1:
            target_rand_perm = np.random.permutation(len(target_train_dataset))

        source_index = source_rand_perm[step % len(source_train_dataset)]
        target_index = target_rand_perm[step % len(target_train_dataset)]

        source_batch = source_train_dataset.get_data(
            source_index,
            h_flip=np.random.rand() > 0.5,
            target_im_size=np.random.choice([480, 576, 688, 864, 1200]))
        target_batch = target_train_dataset.get_data(
            target_index,
            h_flip=np.random.rand() > 0.5,
            target_im_size=np.random.choice([480, 576, 688, 864, 1200]))

        source_im_data = source_batch['im_data'].unsqueeze(0).to(device)
        source_proposals = source_batch['proposals']
        source_gt_boxes = source_batch['gt_boxes']
        source_proposals, source_labels, _, pos_cnt, neg_cnt = sample_proposals(
            source_gt_boxes, source_proposals, args.bs, args.pos_ratio)
        source_proposals = source_proposals.to(device)
        source_gt_boxes = source_gt_boxes.to(device)
        source_labels = source_labels.to(device)

        target_im_data = target_batch['im_data'].unsqueeze(0).to(device)
        target_proposals = target_batch['proposals'].to(device)
        target_image_level_label = target_batch['image_level_label'].to(device)

        optimizer.zero_grad()

        # source forward & backward
        _, source_loss = model.forward_det(source_im_data, source_proposals,
                                           source_labels)
        source_loss_sum += source_loss.item()
        source_loss = source_loss * (1 - args.alpha)
        source_loss.backward()

        # target forward & backward
        if args.cam_like:
            _, target_loss = model.forward_cls_camlike(target_im_data, target_proposals,
                                                       target_image_level_label)
        else:
            _, target_loss = model.forward_cls(target_im_data, target_proposals,
                                               target_image_level_label)
        target_loss_sum += target_loss.item()
        target_loss = target_loss * args.alpha
        target_loss.backward()

        clip_gradient(model, 10.0)
        optimizer.step()

        source_pos_prop_sum += pos_cnt
        source_neg_prop_sum += neg_cnt
        target_prop_sum += target_proposals.size(0)

        if step % args.disp_interval == 0:
            end = time.time()
            loss_sum = source_loss_sum * (1 - args.alpha) + target_loss_sum * args.alpha
            loss_sum /= args.disp_interval
            source_loss_sum /= args.disp_interval
            target_loss_sum /= args.disp_interval
            source_pos_prop_sum /= args.disp_interval
            source_neg_prop_sum /= args.disp_interval
            target_prop_sum /= args.disp_interval
            log_message = "[%s][session %d][iter %4d] loss: %.4f, src_loss: %.4f, tar_loss: %.4f, pos_prop: %.1f, neg_prop: %.1f, tar_prop: %.1f, lr: %.2e, time: %.1f" % \
                (args.net, args.session, step, loss_sum, source_loss_sum,
                 target_loss_sum, source_pos_prop_sum, source_neg_prop_sum,
                 target_prop_sum, lr, end - start)
            print(log_message)
            log_file.write(log_message + '\n')
            log_file.flush()
            source_loss_sum = 0
            target_loss_sum = 0
            source_pos_prop_sum = 0
            source_neg_prop_sum = 0
            target_prop_sum = 0
            start = time.time()

        if step in (args.max_iter * 4 // 7, args.max_iter * 6 // 7):
            adjust_learning_rate(optimizer, 0.1)
            lr *= 0.1

        if step % args.save_interval == 0 or step == args.max_iter:
            save_name = os.path.join(output_dir,
                                     '{}_{}_{}.pth'.format(args.net, args.session, step))
            checkpoint = dict()
            checkpoint['net'] = args.net
            checkpoint['session'] = args.session
            checkpoint['pooling_method'] = args.pooling_method
            checkpoint['share_level'] = args.share_level
            checkpoint['iterations'] = step
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            save_checkpoint(checkpoint, save_name)
            print('save model: {}'.format(save_name))

    log_file.close()
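# Hedged sketch of clip_gradient(model, 10.0) as used above (assumption: it
# clips the total gradient norm over all trainable parameters).
import torch

def clip_gradient(model, clip_norm):
    params = [p for p in model.parameters() if p.requires_grad and p.grad is not None]
    torch.nn.utils.clip_grad_norm_(params, clip_norm)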
def train():
    args = parse_args()
    print('Called with args:')
    print(args)

    # Hardcoded seeds
    np.random.seed(3)
    torch.manual_seed(4)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(5)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    output_dir = args.save_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    train_dataset = WSDDNDataset(dataset_names=['voc07_trainval'],
                                 data_dir=args.data_dir,
                                 prop_method=args.prop_method,
                                 num_classes=20,
                                 min_prop_scale=args.min_prop)

    lr = args.lr

    if args.net == 'WSDDN_VGG16':
        model = WSDDN_VGG16(
            os.path.join(args.data_dir, 'pretrained_model/vgg16_caffe.pth'), 20)
    else:
        raise Exception('network is not defined')

    # Caffe-style per-parameter options: biases get twice the lr and no decay.
    params = []
    for key, value in dict(model.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}]
            else:
                params += [{'params': [value], 'lr': lr, 'weight_decay': 0.0005}]
    optimizer = torch.optim.SGD(params, momentum=0.9)

    if args.resume:
        load_name = os.path.join(output_dir, '{}_{}_{}.pth'.format(
            args.net, args.checksession, args.checkepoch))
        print("loading checkpoint %s" % load_name)
        checkpoint = torch.load(load_name)
        assert args.net == checkpoint['net']
        args.start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr = optimizer.param_groups[0]['lr']
        print("loaded checkpoint %s" % load_name)

    log_file_name = os.path.join(output_dir,
                                 'log_{}_{}.txt'.format(args.net, args.session))
    if args.resume:
        log_file = open(log_file_name, 'a')
    else:
        log_file = open(log_file_name, 'w')
    log_file.write(str(args))
    log_file.write('\n')

    model.to(device)

    for epoch in range(args.start_epoch, args.max_epochs + 1):
        model.train()
        loss_sum = 0
        reg_sum = 0
        iter_sum = 0
        num_prop = 0
        start = time.time()
        optimizer.zero_grad()
        rand_perm = np.random.permutation(len(train_dataset))

        for step in range(1, len(train_dataset) + 1):
            index = rand_perm[step - 1]
            apply_h_flip = np.random.rand() > 0.5
            target_im_size = np.random.choice([480, 576, 688, 864, 1200])
            im_data, gt_boxes, box_labels, proposals, prop_scores, image_level_label, im_scale, raw_img, im_id = \
                train_dataset.get_data(index, apply_h_flip, target_im_size)
            # plt.imshow(raw_img)
            # draw_box(proposals / im_scale)
            # draw_box(gt_boxes / im_scale, 'black')
            # plt.show()

            im_data = im_data.unsqueeze(0).to(device)
            rois = proposals.to(device)
            image_level_label = image_level_label.to(device)
            if args.use_prop_score:
                prop_scores = prop_scores.to(device)
            else:
                prop_scores = None

            scores, loss, reg = model(im_data, rois, prop_scores, image_level_label)
            reg = reg * args.alpha
            num_prop += proposals.size(0)
            loss_sum += loss.item()
            reg_sum += reg.item()
            loss = loss + reg
            if args.bavg:
                loss = loss / args.bs
            loss.backward()

            # Gradient accumulation: step the optimizer every args.bs images.
            if step % args.bs == 0:
                optimizer.step()
                optimizer.zero_grad()
                iter_sum += 1

            if step % args.disp_interval == 0:
                end = time.time()
                print("[net %s][session %d][epoch %2d][iter %4d] loss: %.4f, reg: %.4f, num_prop: %.1f, lr: %.2e, time: %.1f"
                      % (args.net, args.session, epoch, step, loss_sum / iter_sum,
                         reg_sum / iter_sum, num_prop / iter_sum, lr, end - start))
                log_file.write("[net %s][session %d][epoch %2d][iter %4d] loss: %.4f, reg: %.4f, num_prop: %.1f, lr: %.2e, time: %.1f\n"
                               % (args.net, args.session, epoch, step, loss_sum / iter_sum,
                                  reg_sum / iter_sum, num_prop / iter_sum, lr, end - start))
                loss_sum = 0
                reg_sum = 0
                num_prop = 0
                iter_sum = 0
                start = time.time()
                log_file.flush()

        if epoch == 10:
            adjust_learning_rate(optimizer, 0.1)
            lr *= 0.1

        if epoch % args.save_interval == 0:
            save_name = os.path.join(output_dir,
                                     '{}_{}_{}.pth'.format(args.net, args.session, epoch))
            checkpoint = dict()
            checkpoint['net'] = args.net
            checkpoint['session'] = args.session
            checkpoint['epoch'] = epoch + 1
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            save_checkpoint(checkpoint, save_name)
            print('save model: {}'.format(save_name))

    log_file.close()
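# The per-parameter options above follow the Caffe convention (bias lr doubled,
# no weight decay on biases). A reusable sketch of that grouping, not part of
# the original script:
def caffe_style_param_groups(model, lr, weight_decay=0.0005):
    groups = []
    for key, value in dict(model.named_parameters()).items():
        if not value.requires_grad:
            continue
        if 'bias' in key:
            groups.append({'params': [value], 'lr': lr * 2, 'weight_decay': 0})
        else:
            groups.append({'params': [value], 'lr': lr, 'weight_decay': weight_decay})
    return groups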
def main_worker(args):
    # NEW: equivalent to MPI init.
    print("world size ", os.environ['OMPI_COMM_WORLD_SIZE'])
    print("rank ", os.environ['OMPI_COMM_WORLD_RANK'])
    torch.distributed.init_process_group(
        backend="nccl",
        init_method="env://",
        world_size=int(os.environ['OMPI_COMM_WORLD_SIZE']),
        rank=int(os.environ['OMPI_COMM_WORLD_RANK']))

    # NEW: look up the number of ranks in the job, and our rank
    args.world_size = torch.distributed.get_world_size()
    print("world size ", args.world_size)
    args.rank = torch.distributed.get_rank()
    print("rank ", args.rank)
    ngpus_per_node = torch.cuda.device_count()
    print("ngpus_per_node ", ngpus_per_node)
    local_rank = args.rank % ngpus_per_node
    print("local_rank ", local_rank)

    # NEW: globalize variables
    global best_acc1
    global best_acc5
    global best_train_acc1
    global best_train_acc5

    # args.gpu = None
    # NEW: specify gpu
    args.gpu = local_rank

    train, validate, modifier = get_trainer(args)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)

    # NEW: distributed data
    # if args.distributed:
    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)

    # model = set_gpu(args, model)
    # NEW: modified function for loading gpus on multinode setups
    model = lassen_set_gpu(args, model)

    if args.pretrained:
        pretrained(args, model)

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        # criterion = nn.CrossEntropyLoss().cuda()
        # NEW: specify gpu
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        best_acc1 = resume(args, model, optimizer)

    # Data loading code
    if args.evaluate:
        acc1, acc5 = validate(data.val_loader, model, criterion, args,
                              writer=None, epoch=args.start_epoch)
        return

    # Set up directories
    # NEW: only on the main process (global rank 0)
    if args.rank == 0:
        run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)
        args.ckpt_base_dir = ckpt_base_dir

    # NEW: only on the main process (global rank 0)
    if args.rank == 0:
        writer = SummaryWriter(log_dir=log_base_dir)
    else:
        writer = None

    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)

    # NEW: only on the main process (global rank 0)
    if args.rank == 0:
        progress_overall = ProgressMeter(
            1, [epoch_time, validation_time, train_time], prefix="Overall Timing")

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    # NEW: only on the main process (global rank 0)
    if args.rank == 0:
        save_checkpoint(
            {
                "epoch": 0,
                "arch": args.arch,
                "state_dict": model.state_dict(),
                "best_acc1": best_acc1,
                "best_acc5": best_acc5,
                "best_train_acc1": best_train_acc1,
                "best_train_acc5": best_train_acc5,
                "optimizer": optimizer.state_dict(),
                "curr_acc1": acc1 if acc1 else "Not evaluated",
            },
            False,
            filename=ckpt_base_dir / "initial.state",
            save=False,
        )

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        # NEW: distributed data
        # if args.distributed:
        data.train_sampler.set_epoch(epoch)
        data.val_sampler.set_epoch(epoch)

        lr_policy(epoch, iteration=None)
        # modifier(args, epoch, model)
        cur_lr = get_lr(optimizer)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(data.train_loader, model, criterion,
                                       optimizer, epoch, args, writer=writer)
        # train_acc1, train_acc5 = train(
        #     data.train_loader, model, criterion, optimizer, epoch, args, writer=None
        # )
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()
        # NEW: only write values to tensorboard on the main process (global rank 0)
        if args.rank == 0:
            acc1, acc5 = validate(data.val_loader, model, criterion, args, writer, epoch)
        else:
            acc1, acc5 = validate(data.val_loader, model, criterion, args, None, epoch)
        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)
        save = ((epoch % args.save_every) == 0) and args.save_every > 0

        # NEW: only on the main process (global rank 0)
        if args.rank == 0:
            if is_best or save or epoch == args.epochs - 1:
                if is_best:
                    print(f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}")
                save_checkpoint(
                    {
                        "epoch": epoch + 1,
                        "arch": args.arch,
                        "state_dict": model.state_dict(),
                        "best_acc1": best_acc1,
                        "best_acc5": best_acc5,
                        "best_train_acc1": best_train_acc1,
                        "best_train_acc5": best_train_acc5,
                        "optimizer": optimizer.state_dict(),
                        "curr_acc1": acc1,
                        "curr_acc5": acc5,
                    },
                    is_best,
                    # filename=ckpt_base_dir / f"epoch_{epoch}.state",
                    filename=ckpt_base_dir / "epoch_most_recent.state",
                    save=save,
                )

        epoch_time.update((time.time() - end_epoch) / 60)

        # NEW: only on the main process (global rank 0)
        if args.rank == 0:
            progress_overall.display(epoch)
            progress_overall.write_to_tensorboard(writer, prefix="diagnostics",
                                                  global_step=epoch)

            if args.conv_type == "SampleSubnetConv":
                count = 0
                sum_pr = 0.0
                for n, m in model.named_modules():
                    if isinstance(m, SampleSubnetConv):
                        # average prune rate across 10 sampled masks
                        pr = 0.0
                        for _ in range(10):
                            pr += ((torch.rand_like(m.clamped_scores) >= m.clamped_scores)
                                   .float().mean().item())
                        pr /= 10.0
                        writer.add_scalar("pr/{}".format(n), pr, epoch)
                        sum_pr += pr
                        count += 1
                args.prune_rate = sum_pr / count
                writer.add_scalar("pr/average", args.prune_rate, epoch)

        # NEW: only on the main process (global rank 0)
        if args.rank == 0:
            writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

    # NEW: only on the main process (global rank 0)
    if args.rank == 0:
        write_result_to_csv(
            best_acc1=best_acc1,
            best_acc5=best_acc5,
            best_train_acc1=best_train_acc1,
            best_train_acc5=best_train_acc5,
            prune_rate=args.prune_rate,
            curr_acc1=acc1,
            curr_acc5=acc5,
            base_config=args.config,
            name=args.name,
        )
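# Hedged sketch of the loader wiring assumed by data.train_sampler.set_epoch(epoch)
# above: a DistributedSampler shards the dataset across ranks, and set_epoch
# reseeds its shuffle each epoch. The helper name is illustrative.
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def make_distributed_loader(dataset, batch_size, workers):
    sampler = DistributedSampler(dataset)  # uses the initialized process group
    loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler,
                        num_workers=workers, pin_memory=True)
    return loader, sampler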
def main():
    ## CALL ARGUMENTS
    args = arguments()
    print("Called Arguments")

    ## LOG PATH
    date = datetime.datetime.now()
    day = date.strftime('%m%d_%H%M')
    log_path = "./logs/{}".format(day)
    if not os.path.exists(log_path):
        os.mkdir(log_path)

    ## INITIALIZE TENSORBOARD
    if args.tfboard:
        from utils.logger import Logger
        logger = Logger(log_path)

    ## CONFIG SAVE AS TEXT
    configs = ("Dataset: {}\nLSTM Size: {}\nNumber of Proposals: {}\n"
               "Start Learning Rate: {}\nLearning Rate Decay: {}\nOptimizer: {}\n"
               "Score Threshold: {}\nEncoding: {}").format(
        args.dataset, args.lstm_size, args.num_prop_after, args.learning_rate,
        args.learning_rate_decay, args.optimizer, args.score_thresh, args.encoding)
    txt_file = "{}/configs.txt".format(log_path)
    f = open(txt_file, 'w')
    f.write(configs)
    f.close()

    ## DATASET CONFIGURATION
    if args.dataset == 'cvpr19':
        args.anchor_scale = [[400, 300, 200, 100]]

    ## CUDA CHECK
    if not torch.cuda.is_available():
        print("WARNING: Why don't you use CUDA?")

    ## DATALOADER (ITERATOR)
    data_type = ['train', 'val', 'test']
    # data_type = ['val']
    loader = {}
    for split in data_type:
        EEGDetectionData = loaddata.EEGDetectionDataset(args, split)
        # Shape of data: (seq_len, num_ch) or (1, seq_len, num_ch)?
        loader[split] = DataLoader(dataset=EEGDetectionData,
                                   batch_size=args.batch_size,
                                   shuffle=False,
                                   num_workers=args.num_workers)  # already shuffled
    seq_len, num_ch = EEGDetectionData[0][0].shape[0], EEGDetectionData[0][0].shape[1]
    args.seq_len, args.num_ch = seq_len, num_ch

    ## ENCODING OR NOT
    if args.encoding:
        args.encoding_size = int(args.num_ch / args.encoding_scale)

    ## CALL MODEL
    model = rlstm(args)
    model.create_architecture()

    ## OPTIMIZER
    # params = []
    # for key, value in dict(model.named_parameters()).items():
    #     if value.requires_grad:
    #         params += [{'params': [value], 'lr': lr, 'weight_decay': args.weight_decay}]
    lr = args.learning_rate
    optimizer = getattr(torch.optim, args.optimizer)(model.parameters(),
                                                     lr=args.learning_rate)

    # if args.cuda:
    model.cuda()  # CUDA

    ## RESUME
    if args.resume:
        checkpoint = torch.load('./logs/0524_2245/save_model/thecho7_25.pth')
        args.start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("Resume the training")

    ## TRAINING
    for epoch in range(args.start_epoch, args.epochs):
        acc = {"train": 0, "val": 0, "test": 0}
        counts = {"train": 0, "val": 0, "test": 0}

        # TRAIN MODE
        model.train()
        loss_check = 0
        start = time.time()

        # Learning Rate Adjustment
        # (note: the decay factor passed here is args.weight_decay, while the
        # local lr tracker is scaled by args.learning_rate_decay)
        if epoch % args.checkpoint_interval == 0:
            adjust_learning_rate(optimizer, args.weight_decay)
            lr *= args.learning_rate_decay

        # Session mode - Train or Test
        for split in data_type:
            if split == 'train':
                args.training = True
                model.train()
            else:
                args.training = False
                model.eval()

            for i, data in enumerate(loader[split], 0):
                print(split)
                # READ BATCH DATA (IN OUR CASE, BATCH SIZE IS 1)
                inputs, labels = data
                inputs, proposals, labels = proposal_gen(inputs, labels, args)
                inputs = inputs.cuda()
                # labels = labels.cuda(async=True)
                labels = labels.cuda()
                inputs = Variable(inputs, volatile=(split != "train"))
                labels = Variable(labels, volatile=(split != "train"))

                # FORWARD
                cls_loss, bbox_loss, acc = model(inputs, labels, proposals, split, acc)
                '''
                if split == 'train':
                    cls_loss = F.cross_entropy(cls_feat, labels.long())  # F.cross_entropy converts indices automatically
                else:
                    cls_loss = Variable(torch.zeros(1).cuda())  # Garbage Value

                # Penalized Loss (Division Method)
                loss_div = 1
                for j in range(args.num_prop_after):
                    if int(cls_feat.data.max(1)[1][j]) == int(labels[j]):
                        loss_div += 1
                        acc[split] += 1
                cls_loss = cls_loss.div(loss_div)

                # Result Print
                _, cls_idx = cls_feat.data.max(1)
                result_print = []
                for j in range(args.num_prop_after):
                    result_print.append(int(cls_idx[j]))
                print(" Result labels: {}".format(result_print))
                '''
                # BACKWARD
                loss = cls_loss.mean() + bbox_loss.mean()
                loss_check += loss.data[0]
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                print("CLASS LOSS: {} BBOX LOSS: {}".format(float(loss.data[0]),
                                                            float(bbox_loss.data[0])))
                counts[split] += args.num_prop_after

                # LOSS DISPLAY
                if i % args.checkpoint_interval == 1:
                    end = time.time()
                    if i > 1:
                        loss_check /= args.checkpoint_interval
                    print("[epoch: {} - loss_check: {}]".format(epoch, loss_check))
                    print("[Iter Accuracy: {}]".format(acc["train"] / counts["train"]))
                    loss_cls = cls_loss.data[0]
                    if args.tfboard:
                        info = {'loss': loss_check, 'loss_cls': loss_cls}
                        for tag, value in info.items():
                            logger.scalar_summary(tag, value, i)
                    loss_check = 0
                    start = time.time()

        # Print info at the end of the epoch
        print("Epoch {}: TrA={:.4f}, VA={:.4f}, TeA={:.4f}".format(
            epoch, acc["train"] / counts["train"], acc["val"] / counts["val"],
            acc["test"] / counts["test"]))

        ## SAVE MODEL (TBI)
        model_path = "{}/{}".format(log_path, "save_model")
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        save_name = os.path.join('{}'.format(model_path),
                                 'thecho7_{}.pth'.format(epoch))
        save_checkpoint({
            'epoch': epoch,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, save_name)
        print('Saving Model: {}......'.format(save_name))
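# NOTE: Variable(..., volatile=...) above is PyTorch <= 0.3 idiom; this script
# targets that older API. On modern PyTorch the same no-autograd evaluation
# would be expressed as:
#
#     with torch.no_grad():
#         cls_loss, bbox_loss, acc = model(inputs, labels, proposals, split, acc)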
def train():
    args = parse_args()
    print('Called with args:')
    print(args)

    assert args.bs % 2 == 0

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print(device)

    output_dir = args.save_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    target_only = args.target_only
    source_train_dataset = TDETDataset(['coco60_train2014', 'coco60_val2014'],
                                       args.data_dir, 'eb', num_classes=60)
    target_train_dataset = TDETDataset(['voc07_trainval'], args.data_dir, 'eb',
                                       num_classes=20)

    lr = args.lr

    if args.net == 'CAM_DET':
        model = CamDet(
            os.path.join(args.data_dir, 'pretrained_model/vgg16_caffe.pth')
            if not args.resume else None,
            20 if target_only else 80, args.hidden_dim)
    else:
        raise Exception('network is not defined')
    optimizer = model.get_optimizer(args.lr)

    if args.resume:
        load_name = os.path.join(output_dir, '{}_{}_{}.pth'.format(
            args.net, args.checksession, args.checkiter))
        print("loading checkpoint %s" % load_name)
        checkpoint = torch.load(load_name)
        assert args.net == checkpoint['net']
        args.start_iter = checkpoint['iterations'] + 1
        model.load_state_dict(checkpoint['model'])
        print("loaded checkpoint %s" % load_name)
        del checkpoint

    log_file_name = os.path.join(output_dir,
                                 'log_{}_{}.txt'.format(args.net, args.session))
    if args.resume:
        log_file = open(log_file_name, 'a')
    else:
        log_file = open(log_file_name, 'w')
    log_file.write(str(args))
    log_file.write('\n')

    model.to(device)
    model.train()

    source_loss_sum = 0
    target_loss_sum = 0
    total_loss_sum = 0
    start = time.time()

    source_rand_perm = None
    target_rand_perm = None
    for step in range(args.start_iter, args.max_iter + 1):
        if source_rand_perm is None or step % len(source_train_dataset) == 1:
            source_rand_perm = np.random.permutation(len(source_train_dataset))
        if target_rand_perm is None or step % len(target_train_dataset) == 1:
            target_rand_perm = np.random.permutation(len(target_train_dataset))

        source_index = source_rand_perm[step % len(source_train_dataset)]
        target_index = target_rand_perm[step % len(target_train_dataset)]

        optimizer.zero_grad()

        if not target_only:
            source_batch = source_train_dataset.get_data(
                source_index,
                h_flip=np.random.rand() > 0.5,
                target_im_size=np.random.choice([480, 576, 688, 864, 1200]))
            source_im_data = source_batch['im_data'].unsqueeze(0).to(device)
            source_gt_labels = source_batch['gt_labels'] + 20
            source_pos_cls = [i for i in range(80) if i in source_gt_labels]
            source_pos_cls = torch.tensor(
                np.random.choice(source_pos_cls,
                                 min(args.bs, len(source_pos_cls)),
                                 replace=False),
                dtype=torch.long, device=device)
            source_loss, _, _ = model(source_im_data, source_pos_cls)
            source_loss_sum += source_loss.item()

        target_batch = target_train_dataset.get_data(
            target_index,
            h_flip=np.random.rand() > 0.5,
            target_im_size=np.random.choice([480, 576, 688, 864, 1200]))
        target_im_data = target_batch['im_data'].unsqueeze(0).to(device)
        target_gt_labels = target_batch['gt_labels']
        target_pos_cls = [i for i in range(80) if i in target_gt_labels]
        target_pos_cls = torch.tensor(
            np.random.choice(target_pos_cls,
                             min(args.bs, len(target_pos_cls)),
                             replace=False),
            dtype=torch.long, device=device)
        target_loss, _, _, _ = model(target_im_data, target_pos_cls)
        target_loss_sum += target_loss.item()

        if args.target_only:
            total_loss = target_loss
        else:
            total_loss = (source_loss + target_loss) * 0.5
        total_loss.backward()
        total_loss_sum += total_loss.item()

        clip_gradient(model, 10.0)
        optimizer.step()

        if step % args.disp_interval == 0:
            end = time.time()
            total_loss_sum /= args.disp_interval
            source_loss_sum /= args.disp_interval
            target_loss_sum /= args.disp_interval
            log_message = "[%s][session %d][iter %4d] loss: %.8f, src_loss: %.8f, tar_loss: %.8f, lr: %.2e, time: %.1f" % \
                (args.net, args.session, step, total_loss_sum, source_loss_sum,
                 target_loss_sum, lr, end - start)
            print(log_message)
            log_file.write(log_message + '\n')
            log_file.flush()
            total_loss_sum = 0
            source_loss_sum = 0
            target_loss_sum = 0
            start = time.time()

        if step in (args.max_iter * 4 // 7, args.max_iter * 6 // 7):
            adjust_learning_rate(optimizer, 0.1)
            lr *= 0.1

        if step % args.save_interval == 0 or step == args.max_iter:
            save_name = os.path.join(output_dir,
                                     '{}_{}_{}.pth'.format(args.net, args.session, step))
            checkpoint = dict()
            checkpoint['net'] = args.net
            checkpoint['session'] = args.session
            checkpoint['iterations'] = step
            checkpoint['model'] = model.state_dict()
            save_checkpoint(checkpoint, save_name)
            print('save model: {}'.format(save_name))

    log_file.close()
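# The two decay points above (4/7 and 6/7 of max_iter) can be read as a tiny
# step schedule; a sketch for reference, not part of the original script.
def lr_at_step(base_lr, step, max_iter, decay=0.1):
    milestones = (max_iter * 4 // 7, max_iter * 6 // 7)
    return base_lr * decay ** sum(step >= m for m in milestones)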
def main_worker(args):
    args.gpu = None

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)
    model = set_gpu(args, model)

    # Set up directories
    run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)

    # Loading pretrained model
    if args.pretrained:
        pretrained(args, model)

        # Saving a DenseConv (nn.Conv2d) compatible model
        if args.dense_conv_model:
            print(
                f"==> DenseConv compatible model, saving at {ckpt_base_dir / 'model_best.pth'}"
            )
            save_checkpoint(
                {
                    "epoch": 0,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                },
                True,
                filename=ckpt_base_dir / "epoch_pretrained.state",
                save=True,
            )
            return

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        best_acc1 = resume(args, model, optimizer)

    # Evaluation of a model
    if args.evaluate:
        acc1, acc5 = validate(data.val_loader, model, criterion, args,
                              writer=None, epoch=args.start_epoch)
        return

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(
        1, [epoch_time, validation_time, train_time], prefix="Overall Timing")

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    save_checkpoint(
        {
            "epoch": 0,
            "arch": args.arch,
            "state_dict": model.state_dict(),
            "best_acc1": best_acc1,
            "best_acc5": best_acc5,
            "best_train_acc1": best_train_acc1,
            "best_train_acc5": best_train_acc5,
            "optimizer": optimizer.state_dict(),
            "curr_acc1": acc1 if acc1 else "Not evaluated",
        },
        False,
        filename=ckpt_base_dir / "initial.state",
        save=False,
    )

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        lr_policy(epoch, iteration=None)
        cur_lr = get_lr(optimizer)

        # Gradual pruning in GMP experiments
        if (args.conv_type == "GMPConv"
                and epoch >= args.init_prune_epoch
                and epoch <= args.final_prune_epoch):
            total_prune_epochs = args.final_prune_epoch - args.init_prune_epoch + 1
            for n, m in model.named_modules():
                if hasattr(m, 'set_curr_prune_rate'):
                    prune_decay = (1 - ((args.curr_prune_epoch - args.init_prune_epoch)
                                        / total_prune_epochs)) ** 3
                    curr_prune_rate = m.prune_rate - (m.prune_rate * prune_decay)
                    m.set_curr_prune_rate(curr_prune_rate)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(data.train_loader, model, criterion,
                                       optimizer, epoch, args, writer=writer)
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()
        acc1, acc5 = validate(data.val_loader, model, criterion, args, writer, epoch)
        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = ((epoch % args.save_every) == 0) and args.save_every > 0
        if is_best or save or epoch == args.epochs - 1:
            if is_best:
                print(f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}")
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "best_acc5": best_acc5,
                    "best_train_acc1": best_train_acc1,
                    "best_train_acc5": best_train_acc5,
                    "optimizer": optimizer.state_dict(),
                    "curr_acc1": acc1,
                    "curr_acc5": acc5,
                },
                is_best,
                filename=ckpt_base_dir / f"epoch_{epoch}.state",
                save=save,
            )

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer, prefix="diagnostics",
                                              global_step=epoch)

        # Storing sparsity and threshold statistics for STRConv models
        if args.conv_type == "STRConv":
            count = 0
            sum_sparse = 0.0
            for n, m in model.named_modules():
                if isinstance(m, STRConv):
                    sparsity, total_params, thresh = m.getSparsity()
                    writer.add_scalar("sparsity/{}".format(n), sparsity, epoch)
                    writer.add_scalar("thresh/{}".format(n), thresh, epoch)
                    sum_sparse += int(((100 - sparsity) / 100) * total_params)
                    count += total_params
            total_sparsity = 100 - (100 * sum_sparse / count)
            writer.add_scalar("sparsity/total", total_sparsity, epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

    write_result_to_csv(
        best_acc1=best_acc1,
        best_acc5=best_acc5,
        best_train_acc1=best_train_acc1,
        best_train_acc5=best_train_acc5,
        prune_rate=args.prune_rate,
        curr_acc1=acc1,
        curr_acc5=acc5,
        base_config=args.config,
        name=args.name,
    )

    if args.conv_type == "STRConv":
        json_data = {}
        json_thres = {}
        # fresh accumulators for the end-of-run summary
        sum_sparse = 0.0
        count = 0
        for n, m in model.named_modules():
            if isinstance(m, STRConv):
                sparsity = m.getSparsity()
                json_data[n] = sparsity[0]
                sum_sparse += int(((100 - sparsity[0]) / 100) * sparsity[1])
                count += sparsity[1]
                json_thres[n] = sparsity[2]
        json_data["total"] = 100 - (100 * sum_sparse / count)
        if not os.path.exists("runs/layerwise_sparsity"):
            os.mkdir("runs/layerwise_sparsity")
        if not os.path.exists("runs/layerwise_threshold"):
            os.mkdir("runs/layerwise_threshold")
        with open("runs/layerwise_sparsity/{}.json".format(args.name), "w") as f:
            json.dump(json_data, f)
        with open("runs/layerwise_threshold/{}.json".format(args.name), "w") as f:
            json.dump(json_thres, f)
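# Usage sketch for the sparsity dumps written above, assuming the JSON layout
# produced by this function ({layer_name: sparsity_percent, ..., "total": ...});
# "my_run" stands in for args.name.
import json

with open("runs/layerwise_sparsity/my_run.json") as f:
    per_layer = json.load(f)
print(per_layer["total"])  # overall percent sparsity of the model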
def main_worker(args):
    args.gpu = None
    train, validate, modifier = get_trainer(args)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)
    model = set_gpu(args, model)

    if args.pretrained:
        pretrained(args, model)

    # SJT modification:
    if args.exp_mode:  # pretraining/pruning/finetuning
        exp_mode = args.exp_mode
        if exp_mode == "pretraining":
            # YHT modification: set the prune rate to 0
            print("exp_mode is pretraining; setting prune-rate to 0")
            args.prune_rate = 0
            unfreeze_model_weights(model)
            freeze_model_subnet(model)
    # End of SJT modification

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        # SJT modification
        if args.exp_mode:
            if args.exp_mode == "pruning":
                optimizer = resume_pruning(args, model)
            else:  # can only be "finetuning"
                if args.exp_mode != "finetuning":
                    print("resume should be combined with the pruning/finetuning exp_mode!")
                    return
                else:
                    optimizer = resume_finetuning(args, model)
                    # YHT: not sure whether it is needed
                    # lr_policy = get_policy(args.lr_policy)(optimizer, args)
                    # print("#####################DEBUG PRINT : VALIDATE FIRST#####################")
                    # validate(data.val_loader, model, criterion, args, writer=None, epoch=args.start_epoch)
        else:
            best_acc1 = resume(args, model, optimizer)
        # End of SJT modification
    else:
        # YHT modification
        if args.exp_mode:
            if args.exp_mode == "finetuning":
                # Here we assume the user wants to finetune the subnetwork from
                # the initial prune-rate vector.
                print("Using finetuning mode without resume, which is assumed "
                      "to be init finetune.")
                optimizer = resume_finetuning(args, model)
                # YHT: not sure whether it is needed
                lr_policy = get_policy(args.lr_policy)(optimizer, args)
        # End of modification

    # Data loading code
    if args.evaluate:
        acc1, acc5 = validate(data.val_loader, model, criterion, args,
                              writer=None, epoch=args.start_epoch)
        return

    # Set up directories
    run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)
    args.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(
        1, [epoch_time, validation_time, train_time], prefix="Overall Timing")

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    save_checkpoint(
        {
            "epoch": 0,
            "arch": args.arch,
            "state_dict": model.state_dict(),
            "best_acc1": best_acc1,
            "best_acc5": best_acc5,
            "best_train_acc1": best_train_acc1,
            "best_train_acc5": best_train_acc5,
            "optimizer": optimizer.state_dict(),
            "curr_acc1": acc1 if acc1 else "Not evaluated",
        },
        False,
        filename=ckpt_base_dir / "initial.state",
        save=False,
    )

    if args.gp_warm_up:
        record_prune_rate = args.prune_rate
    if args.print_more:
        print_global_layerwise_prune_rate(model, args.prune_rate)

    # YHT modification May 20
    # Up to here every prune rate is accurate; now create the mask if prandom is set.
    if args.prandom:
        make_prandom_mask(model)
    # End of modification

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        lr_policy(epoch, iteration=None)
        modifier(args, epoch, model)
        cur_lr = get_lr(optimizer)
        if args.print_more:
            print(f"In epoch {epoch}, lr = {cur_lr}")

        # train for one epoch
        start_train = time.time()

        # WHN modification: add global pruning
        if args.pscale == "global":
            if args.gp_warm_up:
                if epoch < args.gp_warm_up_epochs:
                    args.prune_rate = 0
                else:
                    args.prune_rate = record_prune_rate
            if not args.prandom:
                args.score_threshold = get_global_score_threshold(model, args.prune_rate)

        # YHT modification
        if args.print_more:
            print_global_layerwise_prune_rate(model, args.prune_rate)
        # End of modification

        train_acc1, train_acc5 = train(data.train_loader, model, criterion,
                                       optimizer, epoch, args, writer=writer)
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()
        # if labels are randomly shuffled, evaluate on the training set instead (by yty)
        if args.shuffle:
            acc1, acc5 = train_acc1, train_acc5
        else:
            acc1, acc5 = validate(data.val_loader, model, criterion, args, writer, epoch)
        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = ((epoch % args.save_every) == 0) and args.save_every > 0
        if is_best or save or epoch == args.epochs - 1:
            if is_best:
                print(f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}")
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "best_acc5": best_acc5,
                    "best_train_acc1": best_train_acc1,
                    "best_train_acc5": best_train_acc5,
                    "optimizer": optimizer.state_dict(),
                    "curr_acc1": acc1,
                    "curr_acc5": acc5,
                },
                is_best,
                filename=ckpt_base_dir / f"epoch_{epoch}.state",
                save=save,
            )

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer, prefix="diagnostics",
                                              global_step=epoch)

        if args.conv_type == "SampleSubnetConv":
            count = 0
            sum_pr = 0.0
            for n, m in model.named_modules():
                if isinstance(m, SampleSubnetConv):
                    # average prune rate across 10 sampled masks
                    pr = 0.0
                    for _ in range(10):
                        pr += ((torch.rand_like(m.clamped_scores) >= m.clamped_scores)
                               .float().mean().item())
                    pr /= 10.0
                    writer.add_scalar("pr/{}".format(n), pr, epoch)
                    sum_pr += pr
                    count += 1
            args.prune_rate = sum_pr / count
            writer.add_scalar("pr/average", args.prune_rate, epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

    write_result_to_csv(
        best_acc1=best_acc1,
        best_acc5=best_acc5,
        best_train_acc1=best_train_acc1,
        best_train_acc5=best_train_acc5,
        prune_rate=args.prune_rate,
        curr_acc1=acc1,
        curr_acc5=acc5,
        base_config=args.config,
        name=args.name,
    )
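# Hedged sketch of write_result_to_csv as called above (assumption: it appends
# the passed keyword metrics as one row of a shared results CSV).
import csv
import os

def write_result_to_csv(path="runs/results.csv", **kwargs):
    is_new = not os.path.exists(path)
    with open(path, "a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=sorted(kwargs))
        if is_new:
            writer.writeheader()
        writer.writerow(kwargs)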