def build_data(self):
    """Create, initialize and load the MNIST data pipeline for this estimator.

    Stores the ready `dt.data.Mnist` instance on `self._data`.
    Returns True to signal success to the build driver.
    """
    dt.trace(dt.DC.MODEL, "[{}] ({}) build data".format(self.tag, type(self).__name__))
    args = self._ctx.args
    # pin_memory only pays off when tensors are copied to a CUDA device
    mnist_data = dt.data.Mnist(batch_size=args.batch_size,
                               valid_size=args.valid_size,
                               num_workers=1,
                               pin_memory=self.use_cuda)
    mnist_data.init_data()
    mnist_data.load_data()
    self._data = mnist_data
    return True
def build_model(self):
    """Instantiate the network for MNIST classification on `self._model`.

    Returns True to signal success to the build driver.
    """
    dt.trace(dt.DC.MODEL, "[{}] ({}) build model".format(self.tag, type(self).__name__))
    self._model = MnistNet()
    # Alternative (kept for reference): a ResNet-50 adapted to 1-channel input.
    #model = torchvision.models.resnet50(False)
    # Have ResNet model take in grayscale rather than RGB
    #model.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
    return True
def __init__(self, ctx):
    """Base estimator: record the run context and reset all build products.

    Args:
        ctx: run context holding args/config shared across the training run.
    """
    self.tag = "EST::BASE"
    dt.trace(dt.DC.MODEL, "[{}] ({}) __init__".format(self.tag, type(self).__name__))
    self._ctx = ctx
    # Populated later by the build_*/bind_* phases.
    self._trainer = None
    self._data = None
    self._model = None
    self._criterion = None
    self._optimizer = None
    # Hooks fired around the train / validation loops.
    self._train_hooks = []
    self._valid_hooks = []
def load_data(self):
    """Build CIFAR-10 datasets and DataLoaders for train/valid/test splits.

    Train gets augmentation (random crop + horizontal flip); valid and test
    only normalize. Under Horovod multi-processing each split is partitioned
    with a DistributedSampler. Returns self for call chaining.
    """
    dt.trace(dt.DC.DATA, "[{}] load data".format(self.tag))
    # Augmented pipeline for training; constants are the standard CIFAR-10
    # per-channel mean/std.
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    # Evaluation pipeline: normalization only, no augmentation.
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    # NOTE(review): num_workers is only applied when pin_memory is enabled —
    # confirm that coupling is intentional.
    kwargs = {'num_workers': self._num_workers, 'pin_memory': True} if self._pin_memory else {}
    self.train.dataset = datasets.CIFAR10(self._data_dir, train=True, download=True,
                                          transform=transform_train)
    if dt.train.is_mp():
        # Horovod: use DistributedSampler to partition the training data.
        # Shuffling is delegated to the sampler, so the loader's shuffle is False.
        self.train.sampler = torch.utils.data.distributed.DistributedSampler(
            self.train.dataset, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=self._shuffle)
        self.train.loader = torch.utils.data.DataLoader(self.train.dataset,
                                                        batch_size=self._batch_size,
                                                        shuffle=False,
                                                        sampler=self.train.sampler,
                                                        **kwargs)
    else:
        self.train.loader = torch.utils.data.DataLoader(self.train.dataset,
                                                        batch_size=self._batch_size,
                                                        shuffle=self._shuffle,
                                                        **kwargs)
    self.valid.dataset = datasets.CIFAR10(self._data_dir, train=False, transform=transform_test)
    if dt.train.is_mp():
        # Horovod: use DistributedSampler to partition the validation data.
        self.valid.sampler = torch.utils.data.distributed.DistributedSampler(
            self.valid.dataset, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)
        self.valid.loader = torch.utils.data.DataLoader(self.valid.dataset,
                                                        batch_size=self._valid_size,
                                                        shuffle=False,
                                                        sampler=self.valid.sampler,
                                                        **kwargs)
    else:
        self.valid.loader = torch.utils.data.DataLoader(self.valid.dataset,
                                                        batch_size=self._valid_size,
                                                        shuffle=False,
                                                        **kwargs)
    # NOTE(review): CIFAR-10 has no separate test split here — test reuses the
    # train=False dataset, same as valid.
    self.test.dataset = datasets.CIFAR10(self._data_dir, train=False, transform=transform_test)
    if dt.train.is_mp():
        # Horovod: use DistributedSampler to partition the test data.
        self.test.sampler = torch.utils.data.distributed.DistributedSampler(
            self.test.dataset, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)
        self.test.loader = torch.utils.data.DataLoader(self.test.dataset,
                                                       batch_size=self._test_size,
                                                       shuffle=False,
                                                       sampler=self.test.sampler,
                                                       **kwargs)
    else:
        self.test.loader = torch.utils.data.DataLoader(self.test.dataset,
                                                       batch_size=self._test_size,
                                                       shuffle=False,
                                                       **kwargs)
    return self
def bind_estimator(self, est):
    """Attach an estimator to this trainer, restore checkpoints, and wrap for Horovod.

    Sequence: cross-link trainer/estimator, build the training graph, restore
    the latest model/optimizer checkpoint on the chief, broadcast the resumed
    epoch/step to all workers, move the model to GPU if enabled, then wrap the
    optimizer in Horovod's DistributedOptimizer.
    """
    # Estimiator
    self._est = est
    est.bind_trainer(self)
    est.build_train()
    # Load checkpoint
    sync_params = {
        'epoch_done': self._ctx.epoch_done,
        'global_step': self.global_step
    }
    if is_chief():
        # Only the chief reads checkpoints from disk; results are broadcast below.
        model_params = dt.model.load(est.model, self._saver.model_latest)
        optimizer_params = dt.optimizer.load(est.optimizer, self._saver.optimizer_latest)
        if optimizer_params:
            # Resume bookkeeping from the optimizer checkpoint.
            sync_params['epoch_done'] = optimizer_params.epoch
            sync_params['global_step'] = optimizer_params.step
            #self._ctx.stats = optimizer_params.stats
    # Broadcast the chief's resume point so every worker agrees on epoch/step.
    sync_params = mp_broadcast(sync_params)
    self._ctx.epoch_done = int(sync_params['epoch_done'])
    self.set_global_step(int(sync_params['global_step']))
    set_mono_step(self.global_step)
    dt.trace(
        dt.DC.TRAIN,
        '[CHECKPOINT] epoch_done {}, global_step {}, mono_step {}'.format(
            self._ctx.epoch_done, self.global_step, mono_step()))
    if self.use_cuda:
        # Move model to GPU.
        est.model.cuda()
    # Make sure learning rate is up to date
    self.update_learning_rate(est.optimizer)
    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(est.model.state_dict(), root_rank=chief_rank())
    hvd.broadcast_optimizer_state(est.optimizer, root_rank=chief_rank())
    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if self._ctx.fp16_allreduce else hvd.Compression.none
    # Horovod: wrap optimizer with DistributedOptimizer.
    est.optimizer = hvd.DistributedOptimizer(
        est.optimizer,
        named_parameters=est.model.named_parameters(),
        compression=compression)
def load_data(self):
    """Build MNIST datasets and DataLoaders for the train/valid/test splits.

    All three splits share the same ToTensor + Normalize pipeline (MNIST
    mean/std). Valid and test both use the train=False split. Returns self
    for call chaining.
    """
    dt.trace(dt.DC.DATA, "[{}] load data".format(self.tag))
    kwargs = {
        'num_workers': 1,
        'pin_memory': True
    } if self._pin_memory else {}
    # Identical preprocessing for every split; transforms are stateless so a
    # single pipeline object can be shared.
    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307, ), (0.3081, ))
    ])
    self.train.dataset = datasets.MNIST(self._data_dir,
                                        train=True,
                                        download=True,
                                        transform=mnist_transform)
    self.train.loader = torch.utils.data.DataLoader(self.train.dataset,
                                                    batch_size=self._batch_size,
                                                    shuffle=self._shuffle,
                                                    **kwargs)
    self.valid.dataset = datasets.MNIST(self._data_dir,
                                        train=False,
                                        transform=mnist_transform)
    self.valid.loader = torch.utils.data.DataLoader(self.valid.dataset,
                                                    batch_size=self._valid_size,
                                                    shuffle=False,
                                                    **kwargs)
    self.test.dataset = datasets.MNIST(self._data_dir,
                                       train=False,
                                       transform=mnist_transform)
    self.test.loader = torch.utils.data.DataLoader(self.test.dataset,
                                                   batch_size=self._test_size,
                                                   shuffle=False,
                                                   **kwargs)
    return self
def init_data(self):
    """Initialize per-split bookkeeping (batch sizes, totals, batch counts) for ImageNet.

    Returns self for call chaining.
    """
    dt.trace(dt.DC.DATA, "[{}] init data".format(self.tag))
    # BUG FIX: `self.valid` was assigned the dt.Opt *class* (missing call
    # parentheses), so attribute writes landed on the class object and were
    # shared by every other use of dt.Opt. All three splits now get fresh
    # instances.
    self.train, self.valid, self.test = dt.Opt(), dt.Opt(), dt.Opt()
    self.train.batch_size = self._batch_size
    self.valid.batch_size = self._valid_size
    self.test.batch_size = self._test_size
    self.train.num_total = ImageNet.TRAIN_NUM_PER_EPOCH
    self.valid.num_total = ImageNet.VALID_NUM_PER_EPOCH
    self.test.num_total = ImageNet.TEST_NUM_PER_EPOCH
    # Each Horovod worker processes 1/hvd.size() of the epoch; round up so
    # every sample is covered.
    self.train.num_batch = int(math.ceil(ImageNet.TRAIN_NUM_PER_EPOCH / self._batch_size / hvd.size()))
    self.valid.num_batch = int(math.ceil(ImageNet.VALID_NUM_PER_EPOCH / self._valid_size / hvd.size()))
    self.test.num_batch = int(math.ceil(ImageNet.TEST_NUM_PER_EPOCH / self._test_size / hvd.size()))
    return self
def init_data(self):
    """Initialize per-split bookkeeping and class labels for CIFAR-10.

    Returns self for call chaining.
    """
    dt.trace(dt.DC.DATA, "[{}] init data".format(self.tag))
    # BUG FIX: `self.valid` was assigned the dt.Opt *class* (missing call
    # parentheses), so attribute writes landed on the class object and were
    # shared by every other use of dt.Opt. All three splits now get fresh
    # instances.
    self.train, self.valid, self.test = dt.Opt(), dt.Opt(), dt.Opt()
    self.train.batch_size = self._batch_size
    self.valid.batch_size = self._valid_size
    self.test.batch_size = self._test_size
    self.train.num_total = Cifar10.TRAIN_NUM_PER_EPOCH
    self.valid.num_total = Cifar10.VALID_NUM_PER_EPOCH
    self.test.num_total = Cifar10.TEST_NUM_PER_EPOCH
    # Each Horovod worker processes 1/hvd.size() of the epoch; round up so
    # every sample is covered.
    self.train.num_batch = int(math.ceil(Cifar10.TRAIN_NUM_PER_EPOCH / self._batch_size / hvd.size()))
    self.valid.num_batch = int(math.ceil(Cifar10.VALID_NUM_PER_EPOCH / self._valid_size / hvd.size()))
    self.test.num_batch = int(math.ceil(Cifar10.TEST_NUM_PER_EPOCH / self._test_size / hvd.size()))
    # Human-readable label for each of the 10 class indices.
    self.classes = ('plane', 'car', 'bird', 'cat', 'deer',
                    'dog', 'frog', 'horse', 'ship', 'truck')
    return self
def build_model(self):
    """Select and instantiate the CIFAR-10 network on `self._model`.

    ResNet18 is the active choice; the commented lines record the other
    architectures that were tried along with their target accuracies.
    Returns True to signal success to the build driver.
    """
    dt.trace(dt.DC.MODEL, "[{}] ({}) build model".format(self.tag, type(self).__name__))
    #self._model = dt.model.cifar.VGG('VGG19') # target 92.64%
    self._model = dt.model.cifar.ResNet18() # target 93.02%
    #self._model = dt.model.cifar.ResNet50() # target 93.62%
    #self._model = dt.model.cifar.ResNet101() # target 93.75%
    #self._model = dt.model.cifar.ResNet152() # 8-gpu 94.2+%
    #self._model = dt.model.cifar.PreActResNet18() # target 95.11%, NAN
    #self._model = dt.model.cifar.GoogLeNet()
    #self._model = dt.model.cifar.DenseNet121() # target 95.04%
    #self._model = dt.model.cifar.ResNeXt29_32x4d() # target 94.73%
    #self._model = dt.model.cifar.ResNeXt29_2x64d() # target 94.82%
    #self._model = dt.model.cifar.MobileNet()
    #self._model = dt.model.cifar.MobileNetV2() # target 94.43%
    #self._model = dt.model.cifar.DPN92() # target 95.16%
    #self._model = dt.model.cifar.ShuffleNetG2()
    #self._model = dt.model.cifar.SENet18()
    #self._model = dt.model.cifar.ShuffleNetV2(1)
    #self._model = dt.model.cifar.EfficientNetB0()
    return True
def build_model(self):
    """Instantiate the ImageNet network selected by args.model_name / args.model_type.

    Supports the local efficientnet family, EfficientNetLM, timm-backed
    efficientnet/resnet variants, FairNAS-A, and falls back to any
    torchvision.models entry named by args.model_name. Unrecognized
    model_type values leave `self._model` unset, matching prior behavior.
    Returns True to signal success to the build driver.
    """
    dt.trace(dt.DC.MODEL, "[{}] ({}) build model".format(self.tag, type(self).__name__))
    args = self._ctx.args
    pretrained = (args.pretrained > 0)
    efficientnet_variants = ('b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7')

    if args.model_name == 'efficientnet':
        if args.model_type in efficientnet_variants:
            # Dispatch to dt.model.efficientnet.efficientnet_b{N} by name.
            factory = getattr(dt.model.efficientnet, 'efficientnet_' + args.model_type)
            self._model = factory(pretrained=pretrained)
    elif args.model_name == 'efficientnet_lm':
        if args.model_type in efficientnet_variants:
            model_arch = "efficientnet-{}".format(args.model_type)
            if pretrained:
                self._model = dt.model.efficientnet.EfficientNetLM.from_pretrained(model_arch)
            else:
                self._model = dt.model.efficientnet.EfficientNetLM.from_name(model_arch)
    elif args.model_name == 'efficientnet_rw':
        if args.model_type in efficientnet_variants:
            model_arch = "efficientnet_{}".format(args.model_type)
            self._model = dt.model.timm.create_model(model_arch, pretrained=pretrained)
    elif args.model_name == 'fairnas':
        if args.model_type == 'a':
            self._model = dt.model.fairnas.FairNasA() # 8-gpu
    elif args.model_name == 'resnet_rw':
        #if dt.train.is_chief():
        #    dt.print_pp(dt.model.timm.list_models())
        if args.model_type == '34':
            self._model = dt.model.timm.create_model('resnet34', pretrained=pretrained)
        elif args.model_type == '50':
            self._model = dt.model.timm.create_model('resnet50', pretrained=pretrained)
    else:
        #if dt.train.is_chief():
        #    dt.print_pp(torchvision.models.__dict__)
        self._model = torchvision.models.__dict__[args.model_name](pretrained=pretrained)

    dt.info(dt.DC.TRAIN, "model {}, type {}, pretrained {}".format(args.model_name, args.model_type, args.pretrained))
    return True
def __init__(self, ctx):
    """ImageNet estimator: defer setup to the base class, then retag."""
    super().__init__(ctx)
    self.tag = "EST::IMAGENET"
    dt.trace(dt.DC.MODEL, "[{}] ({}) __init__".format(self.tag, type(self).__name__))
def __init__(self, ctx):
    """Classification estimator: defer setup to the base class, then retag."""
    super().__init__(ctx)
    self.tag = "EST::CLASS"
    dt.trace(dt.DC.MODEL, "[{}] ({}) __init__".format(self.tag, type(self).__name__))
def __init__(self):
    """Base data provider: set the trace tag and log construction."""
    self.tag = "DATA::BASE"
    dt.trace(dt.DC.DATA, "[{}] ({}) __init__".format(self.tag, type(self).__name__))
def __init__(self, opt, cfg):
    """MNIST estimator: defer setup to the base class, then retag."""
    super().__init__(opt, cfg)
    self.tag = "EST::MNIST"
    dt.trace(dt.DC.MODEL, "[{}] ({}) __init__".format(self.tag, type(self).__name__))
def load_data(self):
    """Build ImageNet datasets and DataLoaders for train/valid/test splits.

    Train is augmented (random resized crop, flip, color jitter); valid and
    test resize + center-crop only. Under Horovod multi-processing each split
    is partitioned with a DistributedSampler. Returns self for call chaining.
    """
    dt.trace(dt.DC.DATA, "[{}] load data".format(self.tag))
    # Training pipeline with standard ImageNet augmentation.
    transform_train = transforms.Compose([
        transforms.RandomResizedCrop(self._out_size, interpolation=PIL.Image.BICUBIC),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(
            brightness=0.4,
            contrast=0.4,
            saturation=0.4,
        ),
        transforms.ToTensor(),
        transforms.Normalize(mean=ImageNet.MEAN_RGB, std=ImageNet.VAR_RGB),
    ])
    # Evaluation pipeline: resize with padding then center crop, no augmentation.
    transform_test = transforms.Compose([
        transforms.Resize(self._out_size + ImageNet.CROP_PAD, interpolation=PIL.Image.BICUBIC),
        transforms.CenterCrop(self._out_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=ImageNet.MEAN_RGB, std=ImageNet.VAR_RGB),
    ])
    # NOTE(review): num_workers is only applied when pin_memory is enabled —
    # confirm that coupling is intentional.
    kwargs = {'num_workers': self._num_workers, 'pin_memory': True} if self._pin_memory else {}
    train_dataset_root = os.path.join(self._data_dir, ImageNet.TRAIN_DIR)
    self.train.dataset = datasets.ImageFolder(root=train_dataset_root, transform=transform_train)
    self.train.sampler = None
    if dt.train.is_mp():
        # Horovod: use DistributedSampler to partition the training data.
        # Shuffling is delegated to the sampler, so the loader's shuffle is False.
        self.train.sampler = torch.utils.data.distributed.DistributedSampler(
            self.train.dataset, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=self._shuffle)
        self.train.loader = torch.utils.data.DataLoader(self.train.dataset,
                                                        batch_size=self._batch_size,
                                                        shuffle=False,
                                                        sampler=self.train.sampler,
                                                        **kwargs)
    else:
        self.train.loader = torch.utils.data.DataLoader(self.train.dataset,
                                                        batch_size=self._batch_size,
                                                        shuffle=self._shuffle,
                                                        **kwargs)
    valid_dataset_root = os.path.join(self._data_dir, ImageNet.VALIDATION_DIR)
    self.valid.dataset = datasets.ImageFolder(root=valid_dataset_root, transform=transform_test)
    self.valid.sampler = None
    if dt.train.is_mp():
        # Horovod: use DistributedSampler to partition the validation data.
        self.valid.sampler = torch.utils.data.distributed.DistributedSampler(
            self.valid.dataset, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)
        self.valid.loader = torch.utils.data.DataLoader(self.valid.dataset,
                                                        batch_size=self._valid_size,
                                                        shuffle=False,
                                                        sampler=self.valid.sampler,
                                                        **kwargs)
    else:
        self.valid.loader = torch.utils.data.DataLoader(self.valid.dataset,
                                                        batch_size=self._valid_size,
                                                        shuffle=False,
                                                        **kwargs)
    test_dataset_root = os.path.join(self._data_dir, ImageNet.TEST_DIR)
    self.test.dataset = datasets.ImageFolder(root=test_dataset_root, transform=transform_test)
    self.test.sampler = None
    if dt.train.is_mp():
        # Horovod: use DistributedSampler to partition the test data.
        self.test.sampler = torch.utils.data.distributed.DistributedSampler(
            self.test.dataset, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)
        self.test.loader = torch.utils.data.DataLoader(self.test.dataset,
                                                       batch_size=self._test_size,
                                                       shuffle=False,
                                                       sampler=self.test.sampler,
                                                       **kwargs)
    else:
        self.test.loader = torch.utils.data.DataLoader(self.test.dataset,
                                                       batch_size=self._test_size,
                                                       shuffle=False,
                                                       **kwargs)
    dt.trace(dt.DC.DATA, "[{}] loaded, train {}, valid {}, test {}".format(
        self.tag, len(self.train.dataset), len(self.valid.dataset), len(self.test.dataset)))
    return self
def __init__(self, ctx):
    """CIFAR-10 estimator: defer setup to the base class, then retag."""
    super().__init__(ctx)
    self.tag = "EST::CIFAR10"
    dt.trace(dt.DC.MODEL, "[{}] ({}) __init__".format(self.tag, type(self).__name__))