def test_torchmodel_constructor(self):
    class TwoInputModel(nn.Module):
        def __init__(self):
            super(TwoInputModel, self).__init__()
            self.dense1 = nn.Linear(2, 2)
            self.dense2 = nn.Linear(3, 1)

        def forward(self, x1, x2):
            x1 = self.dense1(x1)
            x2 = self.dense2(x2)
            return x1, x2

    TorchModel.from_pytorch(TwoInputModel())
def load(self, model_path):
    """
    Load the Estimator state (the model and, if available, the optimizer) from the
    provided model_path. The model file should be generated by the save method of this
    estimator, or by ``torch.save(state_dict, model_path)``, where ``state_dict`` can be
    obtained from the ``state_dict()`` method of a PyTorch model.

    :param model_path: path to the saved model.
    :return:
    """
    from bigdl.orca.torch import TorchModel
    import os

    try:
        pytorch_model = self.get_model()
        pytorch_model.load_state_dict(torch.load(model_path))
        self.model = TorchModel.from_pytorch(pytorch_model)
    except Exception:
        raise ValueError(
            "Cannot load the PyTorch model. Please check your model path.")

    optim_path = self._get_optimizer_path(model_path)
    if os.path.isfile(optim_path):
        try:
            self.optimizer = OptimMethod.load(optim_path)
        except Exception:
            raise ValueError(
                "Cannot load the optimizer. Only `bigdl.dllib.optim.optimizer."
                "OptimMethod` is supported for loading.")
    else:
        self.optimizer = None

    self.estimator = SparkEstimator(self.model, self.optimizer, self.model_dir)
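# Usage sketch for load() above (hedged: `est` and the file path are hypothetical
# illustration names; the save format follows the docstring, i.e. a state_dict
# written with torch.save):
#
#   torch.save(pytorch_model.state_dict(), "/tmp/model.pt")
#   est.load("/tmp/model.pt")   # rebuilds self.model from the state_dict; the
#                               # optimizer is reloaded only if its file exists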
def test_train_model_with_bn(self):
    class SimpleTorchModel(nn.Module):
        def __init__(self):
            super(SimpleTorchModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.bn1 = torch.nn.BatchNorm1d(4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = self.bn1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    torch_model = SimpleTorchModel()
    loss_fn = torch.nn.BCELoss()
    az_model = TorchModel.from_pytorch(torch_model)
    zoo_loss = TorchLoss.from_pytorch(loss_fn)

    inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
    targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
    train_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
    train_featureset = FeatureSet.pytorch_dataloader(train_loader)
    val_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
    val_featureset = FeatureSet.pytorch_dataloader(val_loader)

    zooOptimizer = Adam()
    estimator = Estimator(az_model, optim_methods=zooOptimizer)
    estimator.train_minibatch(train_featureset, zoo_loss,
                              end_trigger=MaxEpoch(4),
                              checkpoint_trigger=EveryEpoch(),
                              validation_set=val_featureset,
                              validation_method=[Accuracy()])

    trained_model = az_model.to_pytorch()
def test_torch_net_predict_resnet(self):
    torch.random.manual_seed(1)
    pytorch_model = torchvision.models.resnet18(pretrained=False).eval()
    zoo_model = TorchModel.from_pytorch(pytorch_model)
    zoo_model.evaluate()

    dummy_input = torch.ones(1, 3, 224, 224)
    pytorch_result = pytorch_model(dummy_input).data.numpy()
    zoo_result = zoo_model.forward(dummy_input.numpy())
    print(pytorch_result)
    print(zoo_result)
    assert np.allclose(pytorch_result, zoo_result, rtol=1.e-6, atol=1.e-6)
def test_model_save_and_load(self):
    class SimpleTorchModel(nn.Module):
        def __init__(self):
            super(SimpleTorchModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    torch_model = SimpleTorchModel()
    az_model = TorchModel.from_pytorch(torch_model)

    with tempfile.TemporaryDirectory() as tmp_dir_name:
        path = tmp_dir_name + "/model.obj"
        az_model.save(path, True)
        loaded_model = Model.load(path)
        loaded_torchModel = TorchModel.from_value(loaded_model.value)
        dummy_input = torch.ones(16, 2)
        loaded_torchModel.forward(dummy_input.numpy())
        loaded_torchModel.to_pytorch()
def __init__(self, model, loss, optimizer, config=None, metrics=None,
             model_dir=None, bigdl_type="float"):
    from bigdl.orca.torch import TorchModel, TorchLoss, TorchOptim

    self.loss = loss
    self.optimizer = optimizer
    self.config = {} if config is None else config

    if self.loss is None:
        self.loss = TorchLoss()
    else:
        self.loss = TorchLoss.from_pytorch(loss)

    if isinstance(model, types.FunctionType):
        def model_creator(self):
            return model(self.config)

        model = model_creator(self)

    if self.optimizer is None:
        from bigdl.orca.learn.optimizers.schedule import Default
        self.optimizer = SGD(learningrate_schedule=Default()).get_optimizer()
    elif isinstance(self.optimizer, TorchOptimizer):
        self.optimizer = TorchOptim.from_pytorch(self.optimizer)
    elif isinstance(self.optimizer, OrcaOptimizer):
        self.optimizer = self.optimizer.get_optimizer()
    else:
        raise ValueError(
            "Only PyTorch optimizer and orca optimizer are supported")

    from bigdl.orca.learn.metrics import Metric
    self.metrics = Metric.convert_metrics_list(metrics)
    self.log_dir = None
    self.app_name = None
    self.model_dir = model_dir
    self.model = TorchModel.from_pytorch(model)
    self.estimator = SparkEstimator(self.model, self.optimizer, model_dir,
                                    bigdl_type=bigdl_type)
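# Hedged sketch of the optimizer branches handled by the constructor above. The
# class name `PyTorchSparkEstimator` is an assumption for illustration only; the
# three branches (None -> SGD with Default schedule, torch.optim.Optimizer ->
# TorchOptim.from_pytorch, Orca optimizer -> get_optimizer()) come from the code:
#
#   net = SimpleTorchModel()
#   est1 = PyTorchSparkEstimator(net, nn.BCELoss(),
#                                torch.optim.Adam(net.parameters(), lr=1e-3))
#   est2 = PyTorchSparkEstimator(net, nn.BCELoss(), optimizer=None)  # SGD(Default()) fallback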
def test_train_model_function_with_bn(self):
    class SimpleTorchModel(nn.Module):
        def __init__(self):
            super(SimpleTorchModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    torch_model = SimpleTorchModel()
    az_model = TorchModel.from_pytorch(torch_model)
    weights = az_model.get_weights()
    weights[0][0] = 1.0
    az_model.set_weights(weights)
    exported_model = az_model.to_pytorch()
def load_orca_checkpoint(self, path, version=None, prefix=None):
    """
    Load an existing checkpoint. To load a specific checkpoint, please provide both
    `version` and `prefix`. If `version` is None, the latest checkpoint will be loaded.

    :param path: Path to the existing checkpoint (or directory containing Orca
           checkpoint files).
    :param version: checkpoint version, which is the suffix of the model.* file, i.e.,
           for the model.4 file, the version is 4. If it is None, the latest checkpoint
           will be loaded.
    :param prefix: optimMethod prefix, for example 'optimMethod-TorchModelf53bddcc'.
    :return:
    """
    import os
    from bigdl.dllib.nn.layer import Model
    from bigdl.dllib.optim.optimizer import OptimMethod
    from bigdl.orca.learn.utils import find_latest_checkpoint
    from bigdl.orca.torch import TorchModel

    if version is None:
        path, prefix, version = find_latest_checkpoint(path, model_type="pytorch")
        if path is None:
            raise ValueError(
                "Cannot find PyTorch checkpoint, please check your checkpoint"
                " path.")
    else:
        assert prefix is not None, "You should provide optimMethod prefix, " \
                                   "for example 'optimMethod-TorchModelf53bddcc'"

    try:
        loaded_model = Model.load(os.path.join(path, "model.{}".format(version)))
        self.model = TorchModel.from_value(loaded_model.value)
        self.optimizer = OptimMethod.load(
            os.path.join(path, "{}.{}".format(prefix, version)))
    except Exception as e:
        raise ValueError(
            "Cannot load PyTorch checkpoint, please check your checkpoint path "
            "and checkpoint type." + str(e))

    self.estimator = SparkEstimator(self.model, self.optimizer, self.model_dir)
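# Usage sketch for load_orca_checkpoint() above (hedged: `est` and the checkpoint
# directory are hypothetical names; the prefix example comes from the docstring):
#
#   est.load_orca_checkpoint("/tmp/orca_ckpt")    # loads the latest checkpoint found in the directory
#   est.load_orca_checkpoint("/tmp/orca_ckpt", version=4,
#                            prefix="optimMethod-TorchModelf53bddcc")  # loads model.4 plus the matching optimMethod file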
def test_model_with_bn_to_pytorch(self):
    class SimpleTorchModel(nn.Module):
        def __init__(self):
            super(SimpleTorchModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.bn1 = torch.nn.BatchNorm1d(4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = self.bn1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    torch_model = SimpleTorchModel()
    az_model = TorchModel.from_pytorch(torch_model)
    dummy_input = torch.ones(16, 2)
    zoo_result = az_model.forward(dummy_input.numpy())
    exported_model = az_model.to_pytorch()
    assert len(list(exported_model.named_buffers())) != 0
def test_model_to_pytorch(self):
    class SimpleTorchModel(nn.Module):
        def __init__(self):
            super(SimpleTorchModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    torch_model = SimpleTorchModel()
    az_model = TorchModel.from_pytorch(torch_model)
    weights = az_model.get_weights()
    weights[0][0] = 1.0
    az_model.set_weights(weights)
    exported_model = az_model.to_pytorch()
    p = list(exported_model.parameters())
    assert p[0][0][0] == 1.0
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir', default='/tmp/data', metavar='N',
                        help='the folder to store MNIST data')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training per executor (default: 256)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing per executor (default: 1000)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='for saving the current model')
    parser.add_argument('--deploy-mode', default="local",
                        help='supported deploy modes are local, yarn-client and yarn-cluster')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=False)

    # init on YARN when HADOOP_CONF_DIR and ZOO_CONDA_NAME are provided.
    if args.deploy_mode == "local":
        sc = init_orca_context()
    else:
        sc = init_orca_context(cluster_mode=args.deploy_mode, cores=2,
                               memory="2g", num_nodes=4)

    model = Net()
    model.train()
    criterion = nn.NLLLoss()
    adam = torch.optim.Adam(model.parameters(), lr=args.lr)

    zoo_model = TorchModel.from_pytorch(model)
    zoo_criterion = TorchLoss.from_pytorch(criterion)
    zoo_optim = TorchOptim.from_pytorch(adam)
    zoo_estimator = Estimator(zoo_model, optim_methods=zoo_optim)

    train_featureset = FeatureSet.pytorch_dataloader(train_loader)
    test_featureset = FeatureSet.pytorch_dataloader(test_loader)

    from bigdl.dllib.optim.optimizer import MaxEpoch, EveryEpoch
    zoo_estimator.train_minibatch(train_featureset, zoo_criterion,
                                  end_trigger=MaxEpoch(args.epochs),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=test_featureset,
                                  validation_method=[Accuracy()])
    num_cores_per_executor = 4
    zoo_conda_name = detect_conda_env_name()  # auto detect current conda env name
    sc = init_spark_on_yarn(
        hadoop_conf=hadoop_conf_dir,
        conda_name=zoo_conda_name,
        num_executors=num_executors,
        executor_cores=num_cores_per_executor,
        executor_memory="8g",
        driver_memory="2g",
        driver_cores=1)
else:
    num_cores_per_executor = 4
    sc = init_spark_on_local(cores=num_cores_per_executor,
                             conf={"spark.driver.memory": "10g"})

model = CatDogModel()
zoo_model = TorchModel.from_pytorch(model)

def lossFunc(input, target):
    return nn.NLLLoss().forward(input, target.flatten().long())

zoo_loss = TorchLoss.from_pytorch(lossFunc)

# prepare training data as a Spark DataFrame
image_path = sys.argv[1]
imageDF = NNImageReader.readImages(image_path, sc, resizeH=256, resizeW=256,
                                   image_codec=1)
getName = udf(lambda row: os.path.basename(row[0]), StringType())
getLabel = udf(lambda name: 1.0 if name.startswith('cat') else 0.0, DoubleType())
labelDF = imageDF.withColumn("name", getName(col("image"))) \
    .withColumn("label", getLabel(col('name'))).cache()
(trainingDF, validationDF) = labelDF.randomSplit([0.9, 0.1])
def main():
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('data', metavar='DIR',
                        help='path to dataset')
    parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                        choices=model_names,
                        help='model architecture: ' + ' | '.join(model_names) +
                             ' (default: resnet18)')
    parser.add_argument('--epochs', default=90, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--max_epochs', default=90, type=int, metavar='N',
                        help='number of max epochs to run')
    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                        help='manual epoch number (useful on restarts)')
    parser.add_argument('-b', '--batch-size', default=256, type=int, metavar='N',
                        help='mini-batch size (default: 256), this is the total '
                             'batch size of all GPUs on the current node when '
                             'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                        metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                        help='momentum')
    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                        metavar='W', help='weight decay (default: 1e-4)',
                        dest='weight_decay')
    parser.add_argument('-p', '--print-freq', default=10, type=int, metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                        help='evaluate model on validation set')
    parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                        help='use pre-trained model')
    parser.add_argument('--world-size', default=-1, type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--rank', default=-1, type=int,
                        help='node rank for distributed training')
    parser.add_argument('--seed', default=None, type=int,
                        help='seed for initializing training.')
    parser.add_argument('--cores', default=4, type=int,
                        help='num of CPUs to use.')
    parser.add_argument('--nodes', default=1, type=int,
                        help='num of nodes to use.')
    parser.add_argument('--executor_memory', default='20g', type=str,
                        help='size of executor memory.')
    parser.add_argument('--driver_memory', default='20g', type=str,
                        help='size of driver memory.')
    parser.add_argument('--driver_cores', default=1, type=int,
                        help='num of driver cores to use.')
    args = parser.parse_args()

    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=args.cores,
                                 conf={"spark.driver.memory": "20g"})
    else:
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        num_executors = args.nodes
        executor_memory = args.executor_memory
        driver_memory = args.driver_memory
        driver_cores = args.driver_cores
        num_cores_per_executor = args.cores
        os.environ['ZOO_MKL_NUMTHREADS'] = str(num_cores_per_executor)
        os.environ['OMP_NUM_THREADS'] = str(num_cores_per_executor)
        sc = init_spark_on_yarn(
            hadoop_conf=hadoop_conf_dir,
            conda_name=detect_conda_env_name(),  # auto detect current conda env name
            num_executors=num_executors,
            executor_cores=num_cores_per_executor,
            executor_memory=executor_memory,
            driver_memory=driver_memory,
            driver_cores=driver_cores,
            conf={"spark.rpc.message.maxSize": "1024",
                  "spark.task.maxFailures": "1",
                  "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"})

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True)

    model = torchvision.models.resnet50()

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False)

    iterationPerEpoch = int(math.ceil(float(1281167) / args.batch_size))
    step = Step(iterationPerEpoch * 30, 0.1)
    zooOptimizer = SGD(args.lr, momentum=args.momentum, dampening=0.0,
                       leaningrate_schedule=step, weightdecay=args.weight_decay)
    zooModel = TorchModel.from_pytorch(model)
    criterion = torch.nn.CrossEntropyLoss()
    zooCriterion = TorchLoss.from_pytorch(criterion)
    estimator = Estimator(zooModel, optim_methods=zooOptimizer)
    train_featureSet = FeatureSet.pytorch_dataloader(train_loader)
    test_featureSet = FeatureSet.pytorch_dataloader(val_loader)
    estimator.train_minibatch(train_featureSet, zooCriterion,
                              end_trigger=MaxEpoch(args.max_epochs),
                              checkpoint_trigger=EveryEpoch(),
                              validation_set=test_featureSet,
                              validation_method=[Accuracy(), Top5Accuracy()])