opt.is_smart_model = True smartModel = SmartModel(opt) smartModel = smartModel.to(device) # Parallelize model to multiple GPUs if torch.cuda.device_count() > 1: print("Using", torch.cuda.device_count(), "GPUs!") StudentModel = nn.DataParallel(StudentModel) parms = list(StudentModel.module.parameters()) if opt.isSource: smartModel = nn.DataParallel(smartModel) elif torch.cuda.device_count() == 1: print("Using", torch.cuda.device_count(), "GPU!") parms = list(StudentModel.parameters()) optimizer = torch.optim.SGD(parms, opt.lr, momentum=opt.momentum, weight_decay=opt.weight_decay) if opt.optim == "sgd": optimizer = torch.optim.SGD(parms, opt.lr, momentum=opt.momentum, weight_decay=opt.weight_decay) elif opt.optim == "adam": optimizer = torch.optim.Adam(parms, lr=opt.lr) if opt.dataset in ['cifar10', 'cifar100', 'stl10']:
# --- dataloaders, task-specific model construction, and optimizer ---
# NOTE(review): this chunk was whitespace-mangled; the nesting below is a
# reconstruction from statement order — confirm whether the task branches
# and optimizer belong inside `if args.do_train:` in the original file.
if args.do_train:
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=args.pool_size,
                                  shuffle=True,
                                  collate_fn=train_collate_fn)
    eval_dataloader = DataLoader(dataset=eval_dataset,
                                 batch_size=args.eval_pool_size,
                                 shuffle=False,
                                 collate_fn=eval_collate_fn)

if args.task == 'distill':
    # Frozen teacher provides distillation targets; the student plus one
    # Bridge per layer are the trainable parameters.
    t_model = TeacherModel(params=args,
                           pretrained_embedding=torch.tensor(pretrained_embedding).float())
    t_state_dict = torch.load(args.teacher_model_path)
    t_model.load_state_dict(t_state_dict)
    model = StudentModel(params=args,
                         pretrained_embedding=torch.tensor(pretrained_embedding).float())
    bridges = _get_clones(Bridge(params=args), args.num_layers)
    param_groups = [{'params': model.parameters(), 'lr': args.learning_rate},
                    {'params': bridges.parameters(), 'lr': args.learning_rate}]
    t_model = t_model.to(args.device)
    # Module.to() moves `bridges` in place; `bridge` is kept only as an
    # alias because later (unseen) code may refer to it.
    bridge = bridges.to(args.device)
    t_model.eval()  # teacher is inference-only during distillation
elif args.task == 'student':
    model = StudentModel(params=args,
                         pretrained_embedding=torch.tensor(pretrained_embedding).float())
    param_groups = [{'params': model.parameters(), 'lr': args.learning_rate}]
elif args.task == 'teacher':
    model = TeacherModel(params=args,
                         pretrained_embedding=torch.tensor(pretrained_embedding).float())
    param_groups = [{'params': model.parameters(), 'lr': args.learning_rate}]
else:
    # The original fell through to a NameError on `param_groups` at the
    # SGD(...) call below; fail loudly with the actual cause instead.
    raise ValueError(f"unknown task: {args.task!r}")

if args.classify_loss:
    classifier = PathClassifier(params=args)
    param_groups.append({'params': classifier.parameters(),
                         'lr': args.learning_rate})
    # .to() is in-place for Modules; `classifiers` is kept as an alias as in
    # the original (NOTE(review): probably meant to be `classifier`).
    classifiers = classifier.to(args.device)

# Per-group 'lr' entries above take precedence over this default lr.
optimizer = SGD(param_groups, lr=args.learning_rate)