import torch
import torch.backends.cudnn as cudnn
import torchvision.models as models

from losses import SupConLoss  # assumption: repo-local loss module; the import path varies by repo


def set_model(opt):
    model = models.__dict__['resnet50']()
    criterion = SupConLoss(temperature=opt.temp)

    # initialize the encoder from MoCo v1 pretrained weights
    ckpt_path = 'moco_v1_200ep_pretrain.pth.tar'
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    state_dict = checkpoint["state_dict"]
    for k in list(state_dict.keys()):
        # retain only the query encoder, dropping its projection head (fc)
        if k.startswith('module.encoder_q') and not k.startswith('module.encoder_q.fc'):
            # remove prefix
            state_dict[k[len("module.encoder_q."):]] = state_dict[k]
        # delete renamed or unused k
        del state_dict[k]
    # strict=False: the classifier head (fc.*) is expected to be missing
    msg = model.load_state_dict(state_dict, strict=False)

    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            # wrap the full model (a plain torchvision ResNet has no .encoder submodule)
            model = torch.nn.DataParallel(model)
        model = model.cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True

    return model, criterion
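# Every snippet below constructs a SupConLoss but none defines it. For
# reference, here is a minimal, self-contained sketch of the supervised
# contrastive loss of Khosla et al. (2020) -- an illustration of the expected
# interface (features of shape [bsz, n_views, dim], optional [bsz] labels),
# not necessarily the exact implementation these scripts import.
import torch
import torch.nn as nn
import torch.nn.functional as F


class SupConLossSketch(nn.Module):
    def __init__(self, temperature=0.07):
        super().__init__()
        self.temperature = temperature

    def forward(self, features, labels=None):
        bsz, n_views, _ = features.shape
        device = features.device
        # flatten views and L2-normalize so dot products are cosine similarities
        feats = F.normalize(features.reshape(bsz * n_views, -1), dim=1)
        if labels is None:
            # unsupervised (SimCLR) case: each sample is its own class
            labels = torch.arange(bsz, device=device)
        labels = labels.view(-1).repeat_interleave(n_views)

        logits = feats @ feats.T / self.temperature
        logits = logits - logits.max(dim=1, keepdim=True).values.detach()  # numerical stability

        self_mask = torch.eye(bsz * n_views, dtype=torch.bool, device=device)
        pos_mask = (labels[:, None] == labels[None, :]) & ~self_mask

        exp_logits = logits.exp().masked_fill(self_mask, 0.0)
        log_prob = logits - exp_logits.sum(dim=1, keepdim=True).log()
        # mean log-likelihood over each anchor's positives, averaged over anchors
        mean_log_prob_pos = (pos_mask * log_prob).sum(1) / pos_mask.sum(1).clamp(min=1)
        return -mean_log_prob_pos.mean()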
def set_model(opt):
    model = SupConResNet(name=opt.model)
    criterion = SupConLoss(temperature=opt.temp)

    # enable synchronized Batch Normalization
    if opt.syncBN:
        model = apex.parallel.convert_syncbn_model(model)

    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            model.encoder = torch.nn.DataParallel(model.encoder)
        model = model.cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True

    return model, criterion
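# Hypothetical call site for set_model() above, following the standard
# two-crop batch layout (the loader yields `images` as a list of two views).
# `train_step_sketch` and its argument names are illustrative, not from the
# source; the reshaping to [bsz, 2, dim] is the layout SupConLoss expects.
import torch


def train_step_sketch(images, labels, model, criterion):
    bsz = labels.shape[0]
    # stack both crops into a single forward pass: [2 * bsz, C, H, W]
    images = torch.cat([images[0], images[1]], dim=0).cuda(non_blocking=True)
    features = model(images)                                         # [2 * bsz, dim]
    f1, f2 = torch.split(features, [bsz, bsz], dim=0)
    features = torch.cat([f1.unsqueeze(1), f2.unsqueeze(1)], dim=1)  # [bsz, 2, dim]
    return criterion(features, labels.cuda(non_blocking=True))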
def set_model(opt):
    model = SupConResNet(name=opt.model)
    classifier = LinearClassifier(name=opt.model, num_classes=opt.n_cls)
    criterions = {
        'SupConLoss': SupConLoss(temperature=opt.temp),
        'CrossEntropyLoss': torch.nn.CrossEntropyLoss()
    }

    # enable synchronized Batch Normalization
    if opt.syncBN:
        model = apex.parallel.convert_syncbn_model(model)

    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            model.encoder = torch.nn.DataParallel(model.encoder)
        model = model.cuda()
        classifier = classifier.cuda()
        for name, criterion in criterions.items():
            criterions[name] = criterion.cuda()
        cudnn.benchmark = True

    return model, classifier, criterions
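# Hypothetical combination of the (model, classifier, criterions) triple
# returned above -- a sketch only: the source does not show the training
# step, and `joint_loss_sketch`, `alpha`, and the detach-based linear probe
# are illustrative assumptions, not the repo's method.
def joint_loss_sketch(two_view_features, images, labels, model, classifier, criterions, alpha=1.0):
    # SupCon on the projection features, shaped [bsz, n_views, dim]
    con_loss = criterions['SupConLoss'](two_view_features, labels)
    # cross-entropy on a linear probe over frozen (detached) encoder features
    logits = classifier(model.encoder(images).detach())
    ce_loss = criterions['CrossEntropyLoss'](logits, labels)
    return con_loss + alpha * ce_loss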
def main(opt):
    opt = setup_environment(opt)
    graph = Graph("coco")

    # Dataset
    transform = transforms.Compose(
        [
            MirrorPoses(opt.mirror_probability),
            FlipSequence(opt.flip_probability),
            RandomSelectSequence(opt.sequence_length),
            ShuffleSequence(opt.shuffle),
            PointNoise(std=opt.point_noise_std),
            JointNoise(std=opt.joint_noise_std),
            MultiInput(graph.connect_joint, opt.use_multi_branch),
            ToTensor()
        ],
    )

    dataset_class = dataset_factory(opt.dataset)
    dataset = dataset_class(
        opt.train_data_path,
        train=True,
        sequence_length=opt.sequence_length,
        transform=TwoNoiseTransform(transform),
    )
    dataset_valid = dataset_class(
        opt.valid_data_path,
        sequence_length=opt.sequence_length,
        transform=transforms.Compose(
            [
                SelectSequenceCenter(opt.sequence_length),
                MultiInput(graph.connect_joint, opt.use_multi_branch),
                ToTensor()
            ]
        ),
    )

    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=opt.batch_size,
        num_workers=opt.num_workers,
        pin_memory=True,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        dataset_valid,
        batch_size=opt.batch_size_validation,
        num_workers=opt.num_workers,
        pin_memory=True,
    )

    # Model & criterion
    model, model_args = get_model_resgcn(graph, opt)
    criterion = SupConLoss(temperature=opt.temp)

    print("# parameters: ", count_parameters(model))

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model, opt.gpus)

    if opt.cuda:
        model.cuda()
        criterion.cuda()

    # Trainer
    optimizer, scheduler, scaler = get_trainer(model, opt, len(train_loader))

    # Load checkpoint or weights
    load_checkpoint(model, optimizer, scheduler, scaler, opt)

    # Tensorboard
    writer = SummaryWriter(log_dir=opt.tb_path)
    sample_input = torch.zeros(opt.batch_size, model_args["num_input"],
                               model_args["num_channel"], opt.sequence_length,
                               graph.num_node).cuda()
    writer.add_graph(model, input_to_model=sample_input)

    best_acc = 0
    loss = 0
    for epoch in range(opt.start_epoch, opt.epochs + 1):
        # train for one epoch
        time1 = time.time()
        loss = train(train_loader, model, criterion, optimizer, scheduler, scaler, epoch, opt)
        time2 = time.time()
        print(f"epoch {epoch}, total time {time2 - time1:.2f}")

        # tensorboard logger
        writer.add_scalar("loss/train", loss, epoch)
        writer.add_scalar("learning_rate", optimizer.param_groups[0]["lr"], epoch)

        # evaluation
        result, accuracy_avg, sub_accuracies, dataframe = evaluate(
            val_loader, model, opt.evaluation_fn, use_flip=True)
        writer.add_text("accuracy/validation", dataframe.to_markdown(), epoch)
        writer.add_scalar("accuracy/validation", accuracy_avg, epoch)
        for key, sub_accuracy in sub_accuracies.items():
            writer.add_scalar(f"accuracy/validation/{key}", sub_accuracy, epoch)

        print(f"epoch {epoch}, avg accuracy {accuracy_avg:.4f}")
        is_best = accuracy_avg > best_acc
        if is_best:
            best_acc = accuracy_avg

        if opt.tune:
            tune.report(accuracy=accuracy_avg)

        if epoch % opt.save_interval == 0 or (is_best and epoch > opt.save_best_start * opt.epochs):
            save_file = os.path.join(
                opt.save_folder, f"ckpt_epoch_{'best' if is_best else epoch}.pth")
            save_model(model, optimizer, scheduler, scaler, opt, opt.epochs, save_file)

    # save the last model
    save_file = os.path.join(opt.save_folder, "last.pth")
    save_model(model, optimizer, scheduler, scaler, opt, opt.epochs, save_file)

    log_hyperparameter(writer, opt, best_acc, loss)

    print(f"best accuracy: {best_acc*100:.2f}")
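# TwoNoiseTransform is referenced above but not defined in this snippet. A
# likely minimal form, mirroring the common two-view pattern (cf.
# TwoCropTransform in SupContrast): apply the same stochastic pipeline twice
# so each sample yields two independently augmented views (a positive pair).
class TwoNoiseTransformSketch:
    def __init__(self, transform):
        self.transform = transform

    def __call__(self, x):
        # two independent draws of the random augmentations
        return [self.transform(x), self.transform(x)]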
def main():
    parser = argparse.ArgumentParser(description="SSD evaluation")

    parser.add_argument(
        "--results-dir",
        type=str,
        default="/data/data_vvikash/fall20/SSD/trained_models/",
    )  # change this
    parser.add_argument("--exp-name", type=str, default="temp")
    parser.add_argument("--training-mode", type=str, choices=("SimCLR", "SupCon", "SupCE"))

    # model
    parser.add_argument("--arch", type=str, default="resnet50")
    parser.add_argument("--num-classes", type=int, default=10)

    # training
    parser.add_argument("--dataset", type=str, default="cifar10")
    parser.add_argument("--data-dir", type=str, default="/data/data_vvikash/datasets/")
    parser.add_argument("--normalize", action="store_true", default=False)
    parser.add_argument("--batch-size", type=int, default=512)
    parser.add_argument("--size", type=int, default=32)
    parser.add_argument("--epochs", type=int, default=500)
    parser.add_argument("--lr", type=float, default=0.5)
    parser.add_argument("--momentum", type=float, default=0.9)
    parser.add_argument("--weight-decay", type=float, default=1e-4)
    parser.add_argument("--warmup", action="store_true")

    # ssl
    parser.add_argument("--method", type=str, default="SupCon", choices=["SupCon", "SimCLR", "SupCE"])
    parser.add_argument("--temperature", type=float, default=0.5)

    # misc
    parser.add_argument("--print-freq", type=int, default=100)
    parser.add_argument("--save-freq", type=int, default=50)
    parser.add_argument("--ckpt", type=str, help="checkpoint path")
    parser.add_argument("--seed", type=int, default=12345)

    args = parser.parse_args()
    device = "cuda:0"

    if args.batch_size > 256 and not args.warmup:
        warnings.warn("Use warmup training for batch sizes > 256")

    if not os.path.isdir(args.results_dir):
        os.mkdir(args.results_dir)

    # create results dir (for logs, checkpoints, etc.)
    result_main_dir = os.path.join(args.results_dir, args.exp_name)

    if os.path.exists(result_main_dir):
        # count previous experiments with the same name (os.walk yields
        # (dirpath, dirnames, filenames), so [-2] is the list of subdirs)
        n = len(next(os.walk(result_main_dir))[-2])
        result_sub_dir = os.path.join(
            result_main_dir,
            "{}--dataset-{}-arch-{}-lr-{}_epochs-{}".format(
                n + 1, args.dataset, args.arch, args.lr, args.epochs),
        )
    else:
        os.mkdir(result_main_dir)
        result_sub_dir = os.path.join(
            result_main_dir,
            "1--dataset-{}-arch-{}-lr-{}_epochs-{}".format(
                args.dataset, args.arch, args.lr, args.epochs),
        )
    create_subdirs(result_sub_dir)

    # add logger
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    logger = logging.getLogger()
    logger.addHandler(
        logging.FileHandler(os.path.join(result_sub_dir, "setup.log"), "a"))
    logger.info(args)

    # seed cuda
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)

    # Create model
    if args.training_mode in ["SimCLR", "SupCon"]:
        model = SSLResNet(arch=args.arch).to(device)
    elif args.training_mode == "SupCE":
        model = SupResNet(arch=args.arch, num_classes=args.num_classes).to(device)
    else:
        raise ValueError("training mode not supported")

    # load feature extractor on gpu
    model.encoder = torch.nn.DataParallel(model.encoder).to(device)

    # Dataloader
    train_loader, test_loader, _ = data.__dict__[args.dataset](
        args.data_dir,
        mode="ssl" if args.training_mode in ["SimCLR", "SupCon"] else "org",
        normalize=args.normalize,
        size=args.size,
        batch_size=args.batch_size,
    )

    criterion = (
        SupConLoss(temperature=args.temperature).cuda()
        if args.training_mode in ["SimCLR", "SupCon"]
        else nn.CrossEntropyLoss().cuda()
    )

    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    # select training and validation methods
    trainer = (trainers.ssl if args.training_mode in ["SimCLR", "SupCon"]
               else trainers.supervised)
    val = knn if args.training_mode in ["SimCLR", "SupCon"] else baseeval

    # warmup: ramp the LR up over the first few epochs for large batches
    if args.warmup:
        warmup_epochs = 10
        print(f"Warmup training for {warmup_epochs} epochs")
        warmup_lr_scheduler = torch.optim.lr_scheduler.CyclicLR(
            optimizer,
            base_lr=0.01,
            max_lr=args.lr,
            step_size_up=warmup_epochs * len(train_loader),
        )
        for epoch in range(warmup_epochs):
            trainer(
                model,
                device,
                train_loader,
                criterion,
                optimizer,
                warmup_lr_scheduler,
                epoch,
                args,
            )

    best_prec1 = 0

    # reset LR before the main schedule
    for p in optimizer.param_groups:
        p["lr"] = args.lr
        p["initial_lr"] = args.lr

    # T_max = epochs * steps-per-epoch, i.e. the scheduler is stepped per batch
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, args.epochs * len(train_loader), 1e-4)

    for epoch in range(0, args.epochs):
        trainer(model, device, train_loader, criterion, optimizer, lr_scheduler, epoch, args)
        prec1, _ = val(model, device, test_loader, criterion, args, epoch)

        # remember best accuracy and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)

        d = {
            "epoch": epoch + 1,
            "arch": args.arch,
            "state_dict": model.state_dict(),
            "best_prec1": best_prec1,
            "optimizer": optimizer.state_dict(),
        }

        save_checkpoint(
            d,
            is_best,
            os.path.join(result_sub_dir, "checkpoint"),
        )
        if not (epoch + 1) % args.save_freq:
            save_checkpoint(
                d,
                is_best,
                os.path.join(result_sub_dir, "checkpoint"),
                filename=f"checkpoint_{epoch + 1}.pth.tar",
            )

        logger.info(
            f"Epoch {epoch}, validation accuracy {prec1}, best_prec {best_prec1}")

        # clone results to latest subdir (sync after every epoch)
        clone_results_to_latest_subdir(
            result_sub_dir, os.path.join(result_main_dir, "latest_exp"))
    test_df = pd.read_csv(args.testdata)
else:
    # fragment: the matching if-branch (reading paths from args) is not shown
    print("local file reading")
    train_df = pd.read_csv('notebooks/files/train3.csv')
    test_df = pd.read_csv('notebooks/files/test3.csv')

Num_label = len(train_df.label_id.value_counts())
print('#label ', Num_label)

device = torch.device(args.device)

tokenizer = RobertaTokenizer.from_pretrained("./pretrained", do_lower_case=False)
model = ContraRobertaNet(path="./pretrained", embedding_dim=768, num_class=Num_label)
criterion = SupConLoss(temperature=1)
model.to(device)
criterion.to(device)

train_dataset = PetDataset(train_df)
train_loader = DataLoader(
    train_dataset, batch_size=args.batchsize, shuffle=True, num_workers=2
)

# index training rows by label for per-class sampling
df_dict = {}
for label in range(Num_label):
    df_dict[label] = train_df[train_df['label_id'] == label]

writer = SummaryWriter(args.logdir)
def train(name, df, VAL_FOLD=0, resume=None):
    dt_string = datetime.now().strftime("%d|%m_%H|%M|%S")
    print("Starting -->", dt_string)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs('checkpoint', exist_ok=True)

    run = f"{name}_[{dt_string}]"
    wandb.init(project="imanip", config=config_defaults, name=run)
    config = wandb.config

    model = SRM_Classifer(num_classes=1, encoder_checkpoint='weights/pretrain_[31|03_12|16|32].h5')
    # for name_, param in model.named_parameters():
    #     if 'classifier' in name_:
    #         continue
    #     else:
    #         param.requires_grad = False

    print("Parameters : ", sum(p.numel() for p in model.parameters() if p.requires_grad))

    wandb.save('segmentation/merged_net.py')
    wandb.save('dataset.py')

    train_imgaug, train_geo_aug = get_train_transforms()
    transforms_normalize = get_transforms_normalize()

    #region ########################-- CREATE DATASET and DATALOADER --########################
    train_dataset = DATASET(
        dataframe=df,
        mode="train",
        val_fold=VAL_FOLD,
        test_fold=TEST_FOLD,
        transforms_normalize=transforms_normalize,
        imgaug_augment=train_imgaug,
        geo_augment=train_geo_aug,
        supcon=True
    )
    train_loader = DataLoader(train_dataset, batch_size=config.train_batch_size,
                              shuffle=True, num_workers=4, pin_memory=True, drop_last=False)

    valid_dataset = DATASET(
        dataframe=df,
        mode="val",
        val_fold=VAL_FOLD,
        test_fold=TEST_FOLD,
        transforms_normalize=transforms_normalize,
        supcon=True
    )
    valid_loader = DataLoader(valid_dataset, batch_size=config.valid_batch_size,
                              shuffle=True, num_workers=4, pin_memory=True, drop_last=False)

    test_dataset = DATASET(
        dataframe=df,
        mode="test",
        val_fold=VAL_FOLD,
        test_fold=TEST_FOLD,
        transforms_normalize=transforms_normalize,
        supcon=True
    )
    test_loader = DataLoader(test_dataset, batch_size=config.valid_batch_size,
                             shuffle=True, num_workers=4, pin_memory=True, drop_last=False)
    #endregion ######################################################################

    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    #     optimizer,
    #     patience=config.schedule_patience,
    #     mode="min",
    #     factor=config.schedule_factor,
    # )
    criterion = SupConLoss().to(device)
    es = EarlyStopping(patience=20, mode="min")

    model = nn.DataParallel(model).to(device)
    # wandb.watch(model, log_freq=50, log='all')

    start_epoch = 0
    if resume is not None:
        checkpoint = torch.load(resume)
        # scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print("-----------> Resuming <------------")

    for epoch in range(start_epoch, config.epochs):
        print(f"Epoch = {epoch}/{config.epochs-1}")
        print("------------------")

        train_metrics_st1 = train_stage1(model, train_loader, optimizer, criterion, epoch)
        print(f"TRAIN_LOSS = {train_metrics_st1['train_loss']}")
        print("New LR", optimizer.param_groups[0]['lr'])

        es(
            train_metrics_st1['train_loss'],
            model,
            model_path=os.path.join(OUTPUT_DIR, f"{run}.h5"),
        )
        if es.early_stop:
            print("Early stopping")
            break

        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # 'scheduler_state_dict': scheduler.state_dict(),
        }
        torch.save(checkpoint, os.path.join('checkpoint', f"{run}.pt"))

    if os.path.exists(os.path.join(OUTPUT_DIR, f"{run}.h5")):
        print(model.load_state_dict(torch.load(os.path.join(OUTPUT_DIR, f"{run}.h5"))))
        print("LOADED FOR TEST")

    # test_metrics = test(model, test_loader, criterion)
    wandb.save(os.path.join(OUTPUT_DIR, f"{run}.h5"))