def test_vgg_side_archs_torch(mocker, github, arch):
    """Check that ``hub.load`` yields a truthy result for ``arch``.

    ``torch.hub.load_state_dict_from_url`` is patched so that it returns
    the state dict of a locally built, non-pretrained ``models.<arch>``
    instance instead of whatever the real function would return.
    """
    build_arch = getattr(models, arch)
    reference = build_arch(pretrained=False)
    mocker.patch(
        "torch.hub.load_state_dict_from_url",
        return_value=reference.state_dict(),
    )

    loaded = hub.load(github, arch, pretrained=True, framework="torch")
    assert loaded
def test_load_from_github(self):
    """Load the ``mnist`` entry point from GitHub and compare its state-dict sum.

    The value of ``sum_of_state_dict`` over the loaded model's state dict
    must equal ``SUM_OF_HUB_EXAMPLE``.
    """
    loaded = hub.load(
        'ailzhang/torchhub_example',
        'mnist',
        pretrained=True,
        verbose=False,
    )
    checksum = sum_of_state_dict(loaded.state_dict())
    self.assertEqual(checksum, SUM_OF_HUB_EXAMPLE)
def test_load_from_github(self):
    """Load ``resnet18`` from ``pytorch/vision`` and compare its parameter sum.

    The scalar from ``sum_of_model_parameters`` must approximately equal
    ``SUM_OF_PRETRAINED_RESNET18_PARAMS`` (``pytest.approx`` defaults).
    """
    resnet = hub.load(
        "pytorch/vision",
        "resnet18",
        pretrained=True,
        progress=False,
    )
    param_sum = sum_of_model_parameters(resnet).item()
    assert param_sum == pytest.approx(SUM_OF_PRETRAINED_RESNET18_PARAMS)
def __init__(self, pretrained=True):
    """Wrap the hub ``shufflenet_v2_x1_0`` model and record feature metadata.

    :param pretrained: forwarded unchanged to ``hub.load``.
    """
    super(ShuffleNetV2, self).__init__()
    self.model = hub.load(
        'pytorch/vision:v0.5.1',
        'shufflenet_v2_x1_0',
        pretrained=pretrained,
    )
    self._out_features_channels = [116, 232, 464, 1024]
    # Strides are 2**3, 2**4, 2**5 followed by one more 2**5 entry.
    strides = [2**(level + 1) for level in range(2, 5)]
    strides.append(2**5)
    self._out_features_strides = strides
def test_set_dir(self):
    """Point the hub cache at the system temp dir and load ``resnet18`` there.

    Verifies the loaded state dict against ``self.resnet18_pretrained``,
    checks that ``<tempdir>/vision_master`` exists, then removes it.
    """
    cache_root = tempfile.gettempdir()
    repo_dir = cache_root + '/vision_master'
    hub.set_dir(cache_root)

    loaded = hub.load('pytorch/vision', 'resnet18', pretrained=True)
    self.assertEqual(self.resnet18_pretrained, loaded.state_dict())

    assert os.path.exists(repo_dir)
    shutil.rmtree(repo_dir)
def test_set_dir(self):
    """Point the hub cache at the system temp dir and load ``resnet18`` there.

    Compares ``sum_of_model_parameters`` of the loaded model with
    ``SUM_OF_PRETRAINED_RESNET18_PARAMS``, checks that
    ``<tempdir>/pytorch_vision_master`` exists, then removes it.
    """
    cache_root = tempfile.gettempdir()
    repo_dir = cache_root + '/pytorch_vision_master'
    hub.set_dir(cache_root)

    loaded = hub.load('pytorch/vision', 'resnet18', pretrained=True)
    param_sum = sum_of_model_parameters(loaded)
    self.assertEqual(param_sum, SUM_OF_PRETRAINED_RESNET18_PARAMS)

    assert os.path.exists(repo_dir)
    shutil.rmtree(repo_dir)
def test_load_zip_1_6_checkpoint(self):
    """Load the ``mnist_zip_1_6`` entry point and compare its state-dict sum.

    The repository is loaded with ``trust_repo=True``; the sum computed by
    ``sum_of_state_dict`` must equal ``SUM_OF_HUB_EXAMPLE``.
    """
    loaded = hub.load(
        'ailzhang/torchhub_example',
        'mnist_zip_1_6',
        pretrained=True,
        verbose=False,
        trust_repo=True,
    )
    checksum = sum_of_state_dict(loaded.state_dict())
    self.assertEqual(checksum, SUM_OF_HUB_EXAMPLE)
def get_drp_unet(in_channels, pretrained):
    """Load the hub ``unet`` model and swap in this module's ``forward``.

    :param in_channels: forwarded to the ``unet`` entry point.
    :param pretrained: forwarded to the ``unet`` entry point.
    :return: the loaded model.
    """
    unet = hub.load(
        'mateuszbuda/brain-segmentation-pytorch',
        'unet',
        in_channels=in_channels,
        pretrained=pretrained,
    )
    # The replacement is installed on the model's class, not on the
    # instance, so it applies to every instance of that class.
    unet_cls = unet.__class__
    unet_cls.forward = forward
    return unet
def test_load_from_github(self):
    """Load ``resnet18`` from ``pytorch/vision`` and compare its parameter sum.

    The scalar from ``sum_of_model_parameters`` must match
    ``SUM_OF_PRETRAINED_RESNET18_PARAMS`` to two decimal places.
    """
    resnet = hub.load(
        'pytorch/vision',
        'resnet18',
        pretrained=True,
        progress=False,
    )
    param_sum = sum_of_model_parameters(resnet).item()
    self.assertAlmostEqual(param_sum,
                           SUM_OF_PRETRAINED_RESNET18_PARAMS,
                           places=2)
def init():
    """Load Tacotron2 and WaveGlow from TorchHub into the module globals.

    Both models are moved to ``'cuda'`` and switched to eval mode. A
    newline is written to stderr after each model is ready.
    """
    global tacotron2
    global waveglow

    hub_repo = 'nvidia/DeepLearningExamples:torchhub'

    tacotron2 = hub.load(hub_repo, 'nvidia_tacotron2')
    tacotron2 = tacotron2.to('cuda')
    # Inference only: put the model in evaluation mode.
    tacotron2.eval()
    sys.stderr.write("\n")

    waveglow = hub.load(hub_repo, 'nvidia_waveglow')
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow = waveglow.to('cuda')
    # Same as for tacotron2: evaluation mode, no training behaviour.
    waveglow.eval()
    sys.stderr.write("\n")
def test_load_from_local_dir(self):
    """Load the ``mnist`` entry point from a local checkout with ``source='local'``.

    The directory comes from ``hub._get_cache_or_reload`` with
    ``force_reload=False``; the loaded model's state-dict sum must equal
    ``SUM_OF_HUB_EXAMPLE``.
    """
    repo_dir = hub._get_cache_or_reload('ailzhang/torchhub_example',
                                        force_reload=False)
    loaded = hub.load(
        repo_dir,
        'mnist',
        source='local',
        pretrained=True,
        verbose=False,
    )
    checksum = sum_of_state_dict(loaded.state_dict())
    self.assertEqual(checksum, SUM_OF_HUB_EXAMPLE)
def test_set_dir(self):
    """Point the hub cache at the system temp dir and load ``mnist`` there.

    Compares the loaded model's state-dict sum with ``SUM_OF_HUB_EXAMPLE``,
    checks that ``<tempdir>/ailzhang_torchhub_example_master`` exists, then
    removes it.
    """
    cache_root = tempfile.gettempdir()
    repo_dir = cache_root + '/ailzhang_torchhub_example_master'
    hub.set_dir(cache_root)

    loaded = hub.load(
        'ailzhang/torchhub_example',
        'mnist',
        pretrained=True,
        verbose=False,
    )
    checksum = sum_of_state_dict(loaded.state_dict())
    self.assertEqual(checksum, SUM_OF_HUB_EXAMPLE)

    assert os.path.exists(repo_dir)
    shutil.rmtree(repo_dir)
def test_set_dir(self):
    """Point the hub cache at the system temp dir and load ``resnet18`` there.

    The parameter sum must approximately equal
    ``SUM_OF_PRETRAINED_RESNET18_PARAMS``; afterwards
    ``<tempdir>/pytorch_vision_master`` must exist and is removed.
    """
    cache_root = tempfile.gettempdir()
    repo_dir = cache_root + '/pytorch_vision_master'
    hub.set_dir(cache_root)

    loaded = hub.load(
        'pytorch/vision',
        'resnet18',
        pretrained=True,
        progress=False,
    )
    param_sum = sum_of_model_parameters(loaded).item()
    assert param_sum == pytest.approx(SUM_OF_PRETRAINED_RESNET18_PARAMS)

    assert os.path.exists(repo_dir)
    shutil.rmtree(repo_dir)
def test_set_dir(self):
    """Point the hub cache at the system temp dir and load ``resnet18`` there.

    The model is requested with ``weights="DEFAULT"``. Its parameter sum
    must approximately equal ``SUM_OF_PRETRAINED_RESNET18_PARAMS``;
    afterwards ``<tempdir>/pytorch_vision_master`` must exist and is removed.
    """
    cache_root = tempfile.gettempdir()
    repo_dir = cache_root + "/pytorch_vision_master"
    hub.set_dir(cache_root)

    loaded = hub.load(
        "pytorch/vision",
        "resnet18",
        weights="DEFAULT",
        progress=False,
    )
    param_sum = sum_of_model_parameters(loaded).item()
    assert param_sum == pytest.approx(SUM_OF_PRETRAINED_RESNET18_PARAMS)

    assert os.path.exists(repo_dir)
    shutil.rmtree(repo_dir)
def test_load_legacy_zip_checkpoint(self):
    """Load the ``mnist_zip`` entry point and expect a deprecation warning.

    All warnings are recorded while loading. The state-dict sum must equal
    ``SUM_OF_HUB_EXAMPLE`` and at least one recorded warning must mention
    the deprecation of the legacy zip format.
    """
    expected_fragment = "will be deprecated in favor of default zipfile"
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        loaded = hub.load(
            'ailzhang/torchhub_example',
            'mnist_zip',
            pretrained=True,
            verbose=False,
        )
        checksum = sum_of_state_dict(loaded.state_dict())
        self.assertEqual(checksum, SUM_OF_HUB_EXAMPLE)
        assert any(expected_fragment in str(entry) for entry in caught)
def main():
    """Evaluate the hub ``cnn`` model on the MNIST test split.

    Parses the command line, seeds torch, builds a test ``DataLoader`` and
    loads ``johnhany/torchhub:master`` / ``cnn`` with ``force_reload=True``
    before handing everything to ``test``.

    The dataset root used to be a hard-coded, user-specific path. It is now
    configurable through ``--data-dir``; the default is the previous path,
    so existing invocations behave as before.
    """
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument(
        '--data-dir',
        type=str,
        default='/home/john/Data/mnist',
        metavar='DIR',
        help='root directory of the MNIST dataset '
        '(default: /home/john/Data/mnist)')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    # Extra DataLoader options are only passed when running on CUDA.
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        args.data_dir,
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                              batch_size=args.test_batch_size,
                                              shuffle=True,
                                              **kwargs)

    model = hub.load("johnhany/torchhub:master",
                     "cnn",
                     force_reload=True,
                     pretrained=True).to(device)
    test(args, model, device, test_loader)
def __init__(self, num_classes):
    """Build a GhostNet classifier with a 1-channel stem and a new head.

    Loads ``ghostnet_1x`` (pretrained) through ``load``, replaces
    ``conv_stem`` with a fresh ``nn.Conv2d`` that takes a single input
    channel but keeps the old layer's output channels, kernel size, stride
    and padding, and replaces the final ``classifier`` with a new
    ``nn.Linear`` producing ``num_classes`` outputs.

    :param num_classes: number of output features of the new linear head.
    """
    super(SelfDefinedModel, self).__init__()
    self.classifier = load(repo_or_dir='huawei-noah/ghostnet',
                           model='ghostnet_1x',
                           pretrained=True,
                           progress=True)

    old_stem = self.classifier.conv_stem
    self.classifier.conv_stem = nn.Conv2d(
        in_channels=1,
        out_channels=old_stem.out_channels,
        kernel_size=old_stem.kernel_size,
        stride=old_stem.stride,
        padding=old_stem.padding,
        # nn.Conv2d takes a bool flag for ``bias``. The old layer's
        # ``.bias`` attribute is either None or a tensor; passing the
        # tensor itself would fail when the constructor truth-tests it,
        # so convert it to "does the old layer have a bias?".
        bias=old_stem.bias is not None)

    old_head = self.classifier.classifier
    self.classifier.classifier = nn.Linear(
        in_features=old_head.in_features, out_features=num_classes)
def _load_model(self, path: str, bpe: str, bpe_filename:str) -> RobertaHubInterface:
    """Load an XLM-R model either from the hub or from a local directory.

    :param path: ``"xlmr.large"`` / ``"xlmr.base"`` to fetch from
        ``pytorch/fairseq`` via ``hub.load`` (always with
        ``force_reload=True``); anything else is treated as a directory.
    :param bpe: BPE type forwarded to ``hub_utils.from_pretrained``.
    :param bpe_filename: file name inside ``path`` used for both the
        sentencepiece vocab and the sentencepiece model.
    :return: the hub model, or a ``RobertaHubInterface`` built from the
        locally loaded args, task and first model.
    """
    if path in ("xlmr.large", "xlmr.base"):
        return hub.load("pytorch/fairseq", path, force_reload=True)

    # Prefer "model.pt" when present, otherwise use "checkpoint_best.pt".
    if os.path.exists(os.path.join(path, "model.pt")):
        checkpoint_file = "model.pt"
    else:
        checkpoint_file = "checkpoint_best.pt"

    sentencepiece_path = os.path.join(path, bpe_filename)
    loaded = hub_utils.from_pretrained(
        model_name_or_path=path,
        checkpoint_file=checkpoint_file,
        data_name_or_path=path,
        bpe=bpe,
        sentencepiece_vocab=sentencepiece_path,
        sentencepiece_model=sentencepiece_path,
        load_checkpoint_heads=True,
        archive_map=RobertaModel.hub_models(),
        cpu=False,
    )
    first_model = loaded['models'][0]
    return RobertaHubInterface(loaded['args'], loaded['task'], first_model)
def test_hub_johnson_alahi_li_2016_transformer(subtests, github):
    """Load the transformer through the hub for every row of ``model_urls.csv``.

    Each CSV row (minus its ``url`` column, with ``impl_params`` and
    ``instance_norm`` converted by ``str_to_bool``) is used as keyword
    arguments of ``hub.load`` inside its own subtest.
    """

    def configs():
        csv_path = path.join(pystiche_papers.__path__[0],
                             "johnson_alahi_li_2016", "model_urls.csv")
        with open(csv_path, "r") as fh:
            for row in csv.DictReader(fh):
                for flag in ("impl_params", "instance_norm"):
                    row[flag] = str_to_bool(row[flag])
                del row["url"]
                yield row

    for config in configs():
        with subtests.test(**config):
            loaded = hub.load(github, "johnson_alahi_li_2016_transformer",
                              **config)
            assert loaded
def __init__(self):
    """Build a truncated pretrained ResNet-101 plus a convolutional head.

    ``self.resnet101`` holds all child modules of the hub model except the
    last three; ``self.classifier`` is a stack of convolutions and ReLUs
    followed by ``Flatten`` and a ``Linear(1024, 196)``.
    """
    super().__init__()

    backbone = hub.load('pytorch/vision:v0.6.0',
                        'resnet101',
                        pretrained=True)
    # Keep every child module of the hub model except the final three.
    kept_children = list(backbone.children())[:-3]
    self.resnet101 = nn.Sequential(*kept_children)

    head_layers = [
        nn.Conv2d(1024, 2048, 3, stride=2, padding=1),
        nn.ReLU(),
        nn.Conv2d(2048, 1024, 3),
        nn.ReLU(),
        nn.Conv2d(1024, 1024, 3),
        nn.ReLU(),
        nn.Conv2d(1024, 1024, 3),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(1024, 196),
    ]
    self.classifier = nn.Sequential(*head_layers)
def test_get_set_dir(self):
    """Exercise ``torch.hub.set_dir`` / ``get_dir`` and loading into a custom dir.

    Steps:
      * point the hub dir at a fresh temporary directory and check that
        ``get_dir`` reports it (and that it differs from the previous one);
      * load the ``mnist`` example, compare its state-dict sum with
        ``SUM_OF_HUB_EXAMPLE`` and check that the repository was cached
        under the temporary directory;
      * check that ``set_dir`` expands ``~`` (non-regression test for
        https://github.com/pytorch/pytorch/issues/69761).

    The previous hub dir is restored in ``finally``. Before this fix the
    process-wide hub dir stayed at ``~/hub`` (or at an already deleted
    temporary directory if an assertion failed), leaking into later tests.
    """
    previous_hub_dir = torch.hub.get_dir()
    try:
        with tempfile.TemporaryDirectory('hub_dir') as tmpdir:
            torch.hub.set_dir(tmpdir)
            self.assertEqual(torch.hub.get_dir(), tmpdir)
            self.assertNotEqual(previous_hub_dir, tmpdir)

            hub_model = hub.load('ailzhang/torchhub_example',
                                 'mnist',
                                 pretrained=True,
                                 verbose=False)
            self.assertEqual(sum_of_state_dict(hub_model.state_dict()),
                             SUM_OF_HUB_EXAMPLE)
            assert os.path.exists(
                os.path.join(tmpdir, 'ailzhang_torchhub_example_master'))

        # Test that set_dir properly calls expanduser()
        # non-regression test for https://github.com/pytorch/pytorch/issues/69761
        new_dir = os.path.join("~", "hub")
        torch.hub.set_dir(new_dir)
        self.assertEqual(torch.hub.get_dir(), os.path.expanduser(new_dir))
    finally:
        # Undo the global side effect regardless of the test outcome.
        torch.hub.set_dir(previous_hub_dir)
def test_load_from_github(self):
    """Load ``resnet18`` from ``pytorch/vision`` and compare state dicts.

    The loaded model's state dict must equal ``self.resnet18_pretrained``.
    """
    loaded = hub.load('pytorch/vision', 'resnet18', pretrained=True)
    loaded_state = loaded.state_dict()
    self.assertEqual(self.resnet18_pretrained, loaded_state)
torch.cuda.empty_cache() basedir = '../data/kitti_raw' date = '2011_09_26' drive = '0005' device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') dataset = pykitti.raw(basedir, date, drive) idx = 16 file_name = "{}_{}_{}.png".format( date, drive, os.path.basename(dataset.cam2_files[idx])[:-4]) model = hub.load('TheCodez/pytorch-GoogLeNet-FCN', 'googlenet_fcn', pretrained='cityscapes') model = model.to(device) model.eval() img = dataset.get_cam2(idx) pc_velo = dataset.get_velo(idx) print("Inference") pred = autolabel.semantic_segmentation(model, img, device) pc_velo = autolabel.get_points_in_fov_90(pc_velo) print("Transferring labels") pc_labels = autolabel.transfer_labels(pc_velo, pred, dataset.calib.T_cam0_velo,
def _get_resnet_partially_trained(cls, num_classes=None, pretrained=True, freeze=6, net_version=18, to_grayscale=False): ''' Obtains the pretrained resnet18 model from the Web if not cached. Then: o Freezes the leftmost freeze layers so that they are unaffected by subsequent training. o Modifies the number of output classes from resnet's defaulat 1000 to num_classes. o Modifies the input layer to expect grayscale, i.e. only one channel, instead of three. The weights are retained from the pretrained model. If freeze is zero, all layers of the pretrained model are allowed to change during training. If running under Distributed Data Processing (DDP) protocol, only the master node will download, and then share with the others. :param num_classes: number of target classes :type num_classes: int :param pretrained: if true, the pre-trained version is obtained. Else initial weights are undefined :type pretrained: bool :param freeze: how many layers to freeze, protecting them from training. :type freeze: int :param net_version: which Resnet to return: 18 or 50 :type net_version: int :return: a fresh model :rtype: pytorch.nn ''' if num_classes is None: raise ValueError("Num_classes argument is required") net_name = 'resnet' available_versions = (18, 34, 50) if net_version not in available_versions: raise ValueError( f"{net_name} version must be one of {available_versions}") model = hub.load('pytorch/vision:v0.6.0', f'{net_name}{net_version}', pretrained=pretrained) if to_grayscale: model = cls._first_layer_to_in_channel1(model, net_name) # Keep as many layers as requested # in the config file protected against # further learning: cls.freeze_model_layers(model, freeze) num_in_features = model.fc.in_features model.fc = nn.Linear(num_in_features, num_classes) # Create a property on the model that # returns the number of output classes: model.num_classes = model.fc.out_features return model
def test_load_from_github(self):
    """Load ``resnet18`` from ``pytorch/vision`` and compare its parameter sum.

    ``sum_of_model_parameters`` of the loaded model must equal
    ``SUM_OF_PRETRAINED_RESNET18_PARAMS``.
    """
    loaded = hub.load('pytorch/vision', 'resnet18', pretrained=True)
    param_sum = sum_of_model_parameters(loaded)
    self.assertEqual(param_sum, SUM_OF_PRETRAINED_RESNET18_PARAMS)
def test_hub_load_smoke(github, model):
    """Smoke test: ``hub.load(github, model)`` must return an ``nn.Module``."""
    loaded = hub.load(github, model)
    assert isinstance(loaded, nn.Module)
def main_worker(gpu, ngpus_per_node, args):
    """Per-process training entry point.

    In order, this function:

    1. stores ``gpu`` on ``args`` and, when ``args.distributed`` is set,
       initialises the process group;
    2. builds the model selected by ``args.arch`` (several pretrained
       branches, or a plain ``models`` constructor otherwise);
    3. moves the model to CUDA and wraps it in ``DistributedDataParallel``
       or ``DataParallel`` depending on ``args.distributed`` / ``args.gpu``;
    4. creates the loss and the ``RangerVA`` optimizer and optionally
       resumes from ``args.resume``;
    5. trains from ``args.start_epoch`` to ``args.epochs``, rebuilding the
       data loaders every epoch because input size and batch size change
       every ``switch_freq`` epochs, and validating / checkpointing every
       ``args.print_freq`` epochs.

    :param gpu: value stored in ``args.gpu``; also used to derive the
        global rank under multiprocessing-distributed training.
    :param ngpus_per_node: used for the global rank and to split the batch
        size and worker count per process.
    :param args: parsed options namespace; several fields (``gpu``,
        ``rank``, ``batch_size``, ``workers``, ``start_epoch``) are
        overwritten here.
    """
    global best_acc1
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    # NOTE(review): this branch treats args.gpu as a comma-separated string,
    # while the code further down passes it to torch.cuda.set_device() and
    # formats it into 'cuda:{}'. The truthiness test also skips a value of
    # 0. `gpus` is only referenced by the commented-out code below.
    # TODO confirm the intended type of args.gpu.
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        gpus = list(range(len(args.gpu.split(','))))
    else:
        gpus = [0]  # [1,2]
    # if args.arch not in model_names:
    #     raise NotImplementedError('Other optimizer is not implemented')
    # # elif args.arch == 'DPN26' or 'DPN92':
    # #     from model import MultiModalNet
    # #     model = MultiModalNet("se_resnext101_32x4d","dpn26",0.5)
    # else:
    #     Net = getattr(modelZoo, args.arch)
    #     model = Net(num_classes=args.num_classes)
    #     model = torch.nn.DataParallel(model, device_ids=gpus).cuda()
    if args.pretrained:
        if args.arch == 'se_resnext101_32x4d':
            print('=> creating model {}'.format(args.arch))
            Net = getattr(modelZoo, args.arch)
            model = Net(
                args.arch, 0.5,
                args.num_classes)  # here only use 'se_resnext101_32x4d'
        elif args.arch == 'pnasnet5large' or args.arch == 'se-resnext152':
            print('=> creating model {}'.format(args.arch))
            model = pretrainedmodels.__dict__[args.arch](num_classes=1000,
                                                         pretrained='imagenet')
            # Replace the final layer so it outputs args.num_classes values.
            model.last_linear = nn.Linear(model.last_linear.in_features,
                                          args.num_classes)
        elif args.arch in resnext101_groups:
            print('=> creating model {}'.format(args.arch))
            model = hub.load('facebookresearch/WSL-Images', args.arch)
            # Freeze every loaded parameter; only the new head created
            # below is left trainable.
            for param in model.parameters():
                param.requires_grad = False
            # NOTE(review): the head size is hard-coded to 54 outputs here
            # instead of args.num_classes. TODO confirm.
            model.fc = nn.Sequential(nn.Dropout(p=0.5), nn.Linear(2048, 1024),
                                     nn.LeakyReLU(inplace=True),
                                     nn.Linear(1024, 54))
        elif args.arch in efficientnet_groups:
            print('=> creating model {}'.format(args.arch))
            # model = modelZoo.EfficientNet_CBAM(args.arch)
            Net = getattr(modelZoo, 'MultiNet')
            model = Net(args.arch)
            # Net = getattr(modelZoo, 'EfficientNet')
            # model = Net.from_pretrained(args.arch)
            # model._fc = nn.Linear(model._fc.in_features, args.num_classes)
            # model._fc = nn.Sequential(
            #     nn.BatchNorm1d(model._fc.in_features),
            #     nn.Linear(model._fc.in_features, 256),
            #     nn.LeakyReLU(inplace=True),
            #     nn.Dropout(0.4),
            #     nn.Linear(256, args.num_classes)
            # )
        else:
            print('=> creating model {}'.format(args.arch))
            Net = getattr(models, args.arch)
            model = Net(pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()
        model.fc = nn.Linear(model.fc.in_features, args.num_classes)
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    # criterion = FocalLoss_v2(num_class=args.num_classes).cuda(args.gpu)
    # criterion = LabelSmoothSoftmaxCEV2(lb_smooth=0.1, lb_ignore=-100)
    # criterion.cuda()
    # optimizer = torch.optim.SGD(model.parameters(), args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    optimizer = RangerVA(model.parameters(),
                         lr=args.lr,
                         betas=(0.95, 0.999),
                         eps=1e-6)
    # optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # optionally resume from a checkpoint
    if args.resume:
        # if os.path.isfile(args.resume):
        if os.path.exists(args.resume) and (not os.path.isdir(args.resume)):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            # Checkpoints located under /cache/tmp/ are deleted once read.
            if args.resume.startswith('/cache/tmp/'):
                os.remove(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                # NOTE(review): the checkpoints written at the end of this
                # function store best_acc1 as max(acc1.item(), best_acc1);
                # confirm that value still supports .to().
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # Data loading code
    traindir = os.path.join(args.data_local, args.train_data)
    valdir = os.path.join(args.data_local, args.val_data)
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    # ImageFolder maps the name of each sub-directory of traindir to a label
    # id and uses that id as the label during training.
    # For example, if the sub-directories of traindir are named 0~53,
    # ImageFolder treats those names as class_name and builds a class_to_idx
    # mapping from them.
    # The resulting class_to_idx looks like:
    # {"0": 0, "1":1, "10":2, "11":3, ..., "19": 11, "2": 12, ...}
    # where the key is the class_name and the value is the idx, i.e. the
    # label used during training.
    # So when saving the trained model we also have to save this
    # idx <-> class_name mapping, so that inference can recover the correct
    # class_name from the predicted idx.
    # if args.eval_pth != '':
    #     if mox.file.exists(args.eval_pth) and (not mox.file.is_directory(args.eval_pth)):
    #         if args.eval_pth.startswith('s3://'):
    #             model_name = args.eval_pth.rsplit('/', 1)[1]
    #             mox.file.copy(args.eval_pth, '/cache/tmp/' + model_name)
    #             args.eval_pth = '/cache/tmp/' + model_name
    #         print("=> loading checkpoint '{}'".format(args.eval_pth))
    #         if args.gpu is None:
    #             checkpoint = torch.load(args.eval_pth)
    #         else:
    #             # Map model to be loaded to specified single gpu.
    #             loc = 'cuda:{}'.format(args.gpu)
    #             checkpoint = torch.load(args.eval_pth, map_location=loc)
    #         if args.eval_pth.startswith('/cache/tmp/'):
    #             os.remove(args.eval_pth)
    #
    #         args.start_epoch = checkpoint['epoch']
    #         best_acc1 = checkpoint['best_acc1']
    #         if args.gpu is not None:
    #             # best_acc1 may be from a checkpoint from a different GPU
    #             best_acc1 = best_acc1.to(args.gpu)
    #         model.load_state_dict(checkpoint['state_dict'])
    #         optimizer.load_state_dict(checkpoint['optimizer'])
    #         print("=> loaded checkpoint '{}' (epoch {})"
    #               .format(args.eval_pth, checkpoint['epoch']))
    #     else:
    #         print("=> no checkpoint found at '{}'".format(args.eval_pth))
    #
    #     validate(val_loader, model, criterion, args)
    #     return
    # NOTE(review): `plots` is only assigned when args.visdom is set but is
    # passed to train() unconditionally below. Unless a module-level `plots`
    # exists, running without visdom raises NameError. TODO confirm.
    if args.visdom:
        vis_title = 'PyTorch on DCNN'
        vis_legend = ['Train Loss', 'Val Loss']
        vis_legend_1 = ['Acc-Top1', 'Acc-Top5']
        iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend)
        epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend)
        acc_plot = create_vis_plot('Iteration', 'Acc', vis_title, vis_legend_1)
        plots = [iter_plot, epoch_plot, acc_plot]
    for epoch in range(args.start_epoch, args.epochs):
        # load data ========================================================
        # Input size and batch size cycle through three settings, changing
        # every `switch_freq` epochs; this overwrites args.batch_size.
        switch_freq = 10
        if (epoch // switch_freq) % 3 == 0:
            input_size = 256  # bz=18
            args.batch_size = 18
        elif (epoch // switch_freq) % 3 == 1:
            input_size = 380  # bz=8
            args.batch_size = 8
        elif (epoch // switch_freq) % 3 == 2:
            input_size = 456  # bz=6
            args.batch_size = 6
        print('input_size, bz:', input_size, args.batch_size)
        train_dataset = datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.RandomResizedCrop(input_size),
                # PowerPIL(),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))
        # Invert class_to_idx so that the checkpoint can carry an
        # idx -> class_name mapping.
        idx_to_class = OrderedDict()
        for key, value in train_dataset.class_to_idx.items():
            idx_to_class[value] = key
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_dataset)
        else:
            train_sampler = None
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=(train_sampler is None),
            num_workers=args.workers,
            pin_memory=True,
            sampler=train_sampler)
        val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(input_size),
                transforms.CenterCrop(input_size),
                transforms.ToTensor(),
                normalize,
            ])),
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 num_workers=args.workers,
                                                 pin_memory=True)
        # ===========================================================================
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, plots)
        # evaluate on validation set
        if (epoch + 1) % args.print_freq == 0:
            acc1 = validate(val_loader, model, criterion, args)
            # remember best acc@1 and save checkpoint
            is_best = False
            best_acc1 = max(acc1.item(), best_acc1)
            # The checkpoint file name carries the epoch number and the
            # top-1 accuracy rounded to three decimals.
            pth_file_name = os.path.join(
                args.train_local, 'epoch_%s_%s.pth' %
                (str(epoch + 1), str(round(acc1.item(), 3))))
            # Under multiprocessing-distributed training only the process
            # whose rank is a multiple of ngpus_per_node saves.
            if not args.multiprocessing_distributed or (
                    args.multiprocessing_distributed
                    and args.rank % ngpus_per_node == 0):
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': model.state_dict(),
                        'best_acc1': best_acc1,
                        'optimizer': optimizer.state_dict(),
                        'idx_to_class': idx_to_class
                    }, is_best, pth_file_name, args)
    if args.epochs >= args.print_freq:
        save_best_checkpoint(best_acc1, args)
raise ValueError("model_repo or model_name not provided") chk_point_files = None model_repo_str = "_".join(model_repo.split("/")) if ":" in model_repo_str: model_repo_str = "_".join(model_repo_str.split(":")) else: model_repo_str = f"{model_repo_str}_master" model_dir = isdir(os.path.join(cache_dir, model_repo_str)) chk_point_dir = join(cache_dir, "checkpoints") try: chk_point_files = any([ f.startswith(model_name) for f in listdir(chk_point_dir) if isfile(join(chk_point_dir, f)) ]) except FileNotFoundError as e: logger.error(e) if model_dir and chk_point_files: return True else: return False if __name__ == "__main__": from torch import hub hub.load("pytorch/vision:v0.6.0", "resnet18", pretrained=True) two = is_model_cached("pytorch/vision:v0.6.0", "resnet18") print("checking if resnet18 v0.6.0 is cached: ", two) hub.load("pytorch/vision", "resnet18", pretrained=True) two = is_model_cached("pytorch/vision", "resnet18") print("checking if resnet18 master is cached: ", two)
def train_model(
    self,
    train_csv,
    train_data_dir,
    val_csv,
    val_data_dir,
    num_epochs=2,
    batch_size=64,
    save_graph=True,
    graph_fname="train-val-loss.png",
    save_model=True,
    model_fname="resnest_model",
):
    """Fine-tune a pretrained ResNeSt-50 head on the given image datasets.

    All parameters of the hub model are frozen, ``fc`` is replaced by a new
    head ending in ``Linear(256, 17)``, and only that head is optimised
    with SGD via ``Trainer.train``.

    :param train_csv: passed to ``ImageDataset`` and ``DataStats.pos_weights``.
    :param train_data_dir: passed to the training ``ImageDataset``.
    :param val_csv: passed to the validation ``ImageDataset``.
    :param val_data_dir: passed to the validation ``ImageDataset``.
    :param num_epochs: forwarded to ``Trainer.train``.
    :param batch_size: batch size of both data loaders.
    :param save_graph: if true, plot losses to ``graph_fname``.
    :param graph_fname: target of ``PlotUtils.plot_losses``.
    :param save_model: if true, call ``self._save_model`` with ``model_fname``.
    :param model_fname: name handed to ``self._save_model``.
    """
    train_loader = DataLoader(
        ImageDataset(train_csv, train_data_dir, self.transform.training),
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )
    validation_loader = DataLoader(
        ImageDataset(val_csv, val_data_dir, self.transform.validation),
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
    )

    # Listing with force_reload=True before loading; the returned list
    # itself is not used.
    hub.list("zhanghang1989/ResNeSt", force_reload=True)
    net = hub.load("zhanghang1989/ResNeSt", "resnest50", pretrained=True)

    # Freeze every parameter of the loaded network.
    for weight in net.parameters():
        weight.requires_grad = False

    # New trainable head; its input width is taken from the old ``fc``.
    head_in = net.fc.in_features
    head_layers = [
        nn.Dropout(p=0.5),
        nn.Linear(head_in, 512),
        nn.BatchNorm1d(512),
        nn.ReLU(),
        nn.Dropout(p=0.5),
        nn.Linear(512, 256),
        nn.BatchNorm1d(256),
        nn.ReLU(),
        nn.Linear(256, 17),
    ]
    net.fc = nn.Sequential(*head_layers)
    net = net.to(self.device)

    optimizer = optim.SGD(net.fc.parameters(),
                          lr=0.01,
                          momentum=0.9,
                          weight_decay=0.01)

    # Log of the per-class positive weights, used as the loss weighting.
    raw_weights = DataStats.pos_weights(train_csv)
    pos_weights = log(as_tensor(raw_weights, dtype=float)).to(self.device)

    training_losses, validation_losses = Trainer.train(
        net,
        optimizer,
        train_loader,
        validation_loader,
        num_epochs=num_epochs,
        # scheduler=scheduler,
        verbose=True,
        pos_weights=pos_weights,
        device=self.device,
    )

    if save_graph:
        PlotUtils.plot_losses(training_losses, validation_losses,
                              graph_fname)
    if save_model:
        self._save_model(net, "", model_fname)
    if self.device == "cuda":
        cuda.empty_cache()
def get_model_ddp(
        cls,
        rank,
        local_leader_rank,
        log,
        net_version,
        pretrained,
        freeze,
):  # @DontTrace
    '''
    Load a resnet through the hub, coordinating with the other
    processes on this machine, then pass it through
    cls.freeze_model_layers().

    Three cases are distinguished:

        o No CUDA available: the model is loaded directly.
        o CUDA available and rank == local_leader_rank: this
          process loads the model first, then enters
          dist.barrier().
        o CUDA available, any other rank: this process enters
          dist.barrier() first and loads the model afterwards.

    :param rank: this process' rank in the distributed
        data processing sense
    :type rank: int
    :param local_leader_rank: the lowest rank on this machine
    :type local_leader_rank: int
    :param log: logging service to log to
    :type log: LoggingService
    :param net_version: which resnet version to obtain: 18 or 50
    :type net_version: int
    :param pretrained: forwarded to hub.load()
    :type pretrained: bool
    :param freeze: forwarded to cls.freeze_model_layers()
    :type freeze: int
    :return: the result of cls.freeze_model_layers()
    :raises ValueError: if net_version is neither 18 nor 50
    '''
    if net_version not in (18, 50):
        raise ValueError("Resnet version must be 18 or 50")

    hostname = socket.gethostname()
    model_name = 'resnet18' if net_version == 18 else 'resnet50'

    def load_from_hub():
        # Single place for the hub call shared by all three cases.
        return hub.load('pytorch/vision:v0.6.0',
                        model_name,
                        pretrained=pretrained)

    # The local must not be called `device`: assigning to that name
    # inside this function would shadow the imported device()
    # constructor and make the call below raise UnboundLocalError.
    run_device = device('cuda' if cuda.is_available() else 'cpu')

    # Case 1: not on a GPU machine:
    if run_device == device('cpu'):
        model = load_from_hub()

    # Case 2a: GPU machine, and this is this machine's
    #          leader process. So it is responsible for
    #          downloading the model if it is not cached:
    elif rank == local_leader_rank:
        log.info(f"Procss with rank {rank} on {hostname} loading model")
        model = load_from_hub()

        # Allow the others on this machine
        # to load the model:
        log.info(
            f"Procss with rank {rank} on {hostname} waiting for others to laod model"
        )
        dist.barrier()

    # Case 2b: GPU machine, but not the local leader. Just
    #          wait for the local leader to be done downloading:
    else:
        log.info(
            f"Process with rank {rank} on {hostname} waiting for leader to laod model"
        )
        dist.barrier()
        # Load after the barrier, i.e. after the leader's load:
        log.info(f"Procss with rank {rank} on {hostname} laoding model")
        model = load_from_hub()

    model = cls.freeze_model_layers(model, freeze)
    return model