Example 1
    def test_torchmodel_constructor(self):
        class TwoInputModel(nn.Module):
            def __init__(self):
                super(TwoInputModel, self).__init__()
                self.dense1 = nn.Linear(2, 2)
                self.dense2 = nn.Linear(3, 1)

            def forward(self, x1, x2):
                x1 = self.dense1(x1)
                x2 = self.dense2(x2)
                return x1, x2

        TorchModel.from_pytorch(TwoInputModel())
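
A hedged sketch of calling the multi-input model through the wrapper (not part of the original test): it assumes BigDL's Layer.forward accepts a list of numpy arrays, one per forward() argument, and reuses the TwoInputModel defined above.

import numpy as np

two_input = TorchModel.from_pytorch(TwoInputModel())
# one array per forward() argument: dense1 expects 2 features, dense2 expects 3
outputs = two_input.forward([np.ones([1, 2], dtype="float32"),
                             np.ones([1, 3], dtype="float32")])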
Example 2
    def load(self, model_path):
        """
        Load the Estimator state (the model and, if available, the optimizer) from the
        provided model_path.
        The model file should be generated either by the ``save`` method of this estimator
        or by ``torch.save(state_dict, model_path)``, where ``state_dict`` can be obtained
        from the ``state_dict()`` method of a PyTorch model.

        :param model_path: path to the saved model.
        :return:
        """

        from bigdl.orca.torch import TorchModel
        import os

        try:
            pytorch_model = self.get_model()
            pytorch_model.load_state_dict(torch.load(model_path))
            self.model = TorchModel.from_pytorch(pytorch_model)
        except Exception as e:
            raise ValueError(
                "Cannot load the PyTorch model. Please check your model path.") from e

        optim_path = self._get_optimizer_path(model_path)
        if os.path.isfile(optim_path):
            try:
                self.optimizer = OptimMethod.load(optim_path)
            except Exception as e:
                raise ValueError(
                    "Cannot load the optimizer. Only `bigdl.dllib.optim.optimizer."
                    "OptimMethod` is supported for loading.") from e
        else:
            self.optimizer = None

        self.estimator = SparkEstimator(self.model, self.optimizer,
                                        self.model_dir)
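
A minimal usage sketch for load (hedged: `est` stands for an already-constructed estimator wrapping the same architecture, and the path is illustrative):

import torch
import torch.nn as nn

model = nn.Linear(2, 1)
# save only the state_dict, as the docstring above requires
torch.save(model.state_dict(), "/tmp/model.pt")
# restores the model weights, plus the optimizer if its file sits next to the model file
est.load("/tmp/model.pt")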
Example 3
    def test_train_model_with_bn(self):
        class SimpleTorchModel(nn.Module):
            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.bn1 = torch.nn.BatchNorm1d(4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = self.bn1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        torch_model = SimpleTorchModel()
        loss_fn = torch.nn.BCELoss()
        az_model = TorchModel.from_pytorch(torch_model)
        zoo_loss = TorchLoss.from_pytorch(loss_fn)
        inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
        targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
        train_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
        train_featureset = FeatureSet.pytorch_dataloader(train_loader)
        val_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
        val_featureset = FeatureSet.pytorch_dataloader(val_loader)

        zooOptimizer = Adam()
        estimator = Estimator(az_model, optim_methods=zooOptimizer)
        estimator.train_minibatch(train_featureset,
                                  zoo_loss,
                                  end_trigger=MaxEpoch(4),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=val_featureset,
                                  validation_method=[Accuracy()])

        trained_model = az_model.to_pytorch()
Example 4
    def test_torch_net_predict_resnet(self):
        torch.random.manual_seed(1)
        pytorch_model = torchvision.models.resnet18(pretrained=False).eval()
        zoo_model = TorchModel.from_pytorch(pytorch_model)
        zoo_model.evaluate()

        dummy_input = torch.ones(1, 3, 224, 224)
        pytorch_result = pytorch_model(dummy_input).data.numpy()
        zoo_result = zoo_model.forward(dummy_input.numpy())
        print(pytorch_result)
        print(zoo_result)
        assert np.allclose(pytorch_result, zoo_result, rtol=1.e-6, atol=1.e-6)
Example 5
    def test_model_save_and_load(self):
        class SimpleTorchModel(nn.Module):
            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        torch_model = SimpleTorchModel()
        az_model = TorchModel.from_pytorch(torch_model)

        with tempfile.TemporaryDirectory() as tmp_dir_name:
            path = tmp_dir_name + "/model.obj"
            az_model.save(path, True)  # True = over_write an existing file
            loaded_model = Model.load(path)
            loaded_torchModel = TorchModel.from_value(loaded_model.value)
            dummy_input = torch.ones(16, 2)
            loaded_torchModel.forward(dummy_input.numpy())
            loaded_torchModel.to_pytorch()
Example 6
    def __init__(self,
                 model,
                 loss,
                 optimizer,
                 config=None,
                 metrics=None,
                 model_dir=None,
                 bigdl_type="float"):
        from bigdl.orca.torch import TorchModel, TorchLoss, TorchOptim
        self.loss = loss
        self.optimizer = optimizer
        self.config = {} if config is None else config

        if self.loss is None:
            self.loss = TorchLoss()
        else:
            self.loss = TorchLoss.from_pytorch(loss)
        if isinstance(model, types.FunctionType):
            def model_creator(self):
                return model(self.config)

            model = model_creator(self)
        if self.optimizer is None:
            from bigdl.orca.learn.optimizers.schedule import Default
            self.optimizer = SGD(
                learningrate_schedule=Default()).get_optimizer()
        elif isinstance(self.optimizer, TorchOptimizer):
            self.optimizer = TorchOptim.from_pytorch(self.optimizer)
        elif isinstance(self.optimizer, OrcaOptimizer):
            self.optimizer = self.optimizer.get_optimizer()
        else:
            raise ValueError(
                "Only PyTorch optimizers and Orca optimizers are supported")
        from bigdl.orca.learn.metrics import Metric
        self.metrics = Metric.convert_metrics_list(metrics)
        self.log_dir = None
        self.app_name = None
        self.model_dir = model_dir
        self.model = TorchModel.from_pytorch(model)
        self.estimator = SparkEstimator(self.model,
                                        self.optimizer,
                                        model_dir,
                                        bigdl_type=bigdl_type)
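
A minimal construction sketch under stated assumptions: a plain nn.Module, a PyTorch loss, and a torch.optim optimizer (which the `__init__` above converts through TorchLoss.from_pytorch and TorchOptim.from_pytorch, assuming the TorchOptimizer check matches torch.optim optimizers); all names below are illustrative.

import torch
import torch.nn as nn

model = nn.Linear(2, 1)
loss = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

est = Estimator(model, loss, optimizer, config={}, metrics=None, model_dir="/tmp/ckpt")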
Example 7
    def test_train_model_function_with_bn(self):
        class SimpleTorchModel(nn.Module):
            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        torch_model = SimpleTorchModel()
        az_model = TorchModel.from_pytorch(torch_model)

        weights = az_model.get_weights()
        weights[0][0] = 1.0
        az_model.set_weights(weights)

        exported_model = az_model.to_pytorch()
Example 8
    def load_orca_checkpoint(self, path, version=None, prefix=None):
        """
        Load an existing checkpoint. To load a specific checkpoint, please provide both
        `version` and `prefix`. If `version` is None, the latest checkpoint will be loaded.

        :param path: Path to the existing checkpoint (or directory containing Orca checkpoint
               files).
        :param version: checkpoint version, which is the suffix of the model.* file, e.g., for
               the model.4 file, the version is 4. If it is None, the latest checkpoint is
               loaded.
        :param prefix: optimMethod prefix, for example 'optimMethod-TorchModelf53bddcc'.
        :return:
        """
        import os
        from bigdl.dllib.nn.layer import Model
        from bigdl.dllib.optim.optimizer import OptimMethod
        from bigdl.orca.learn.utils import find_latest_checkpoint
        from bigdl.orca.torch import TorchModel

        if version is None:
            path, prefix, version = find_latest_checkpoint(
                path, model_type="pytorch")
            if path is None:
                raise ValueError(
                    "Cannot find PyTorch checkpoint, please check your checkpoint"
                    " path.")
        else:
            assert prefix is not None, "You should provide optimMethod prefix, " \
                                       "for example 'optimMethod-TorchModelf53bddcc'"

        try:
            loaded_model = Model.load(
                os.path.join(path, "model.{}".format(version)))
            self.model = TorchModel.from_value(loaded_model.value)
            self.optimizer = OptimMethod.load(
                os.path.join(path, "{}.{}".format(prefix, version)))
        except Exception as e:
            raise ValueError(
                "Cannot load the PyTorch checkpoint, please check your checkpoint path "
                "and checkpoint type. " + str(e)) from e
        self.estimator = SparkEstimator(self.model, self.optimizer,
                                        self.model_dir)
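
A short usage sketch for load_orca_checkpoint (hedged: `est` is an existing estimator, the directory is illustrative, and the prefix string is the example one from the docstring):

# load the latest checkpoint under a directory; version and prefix are resolved
est.load_orca_checkpoint("/tmp/ckpt")

# or pin a specific snapshot; both version and prefix must then be given
est.load_orca_checkpoint("/tmp/ckpt",
                         version=4,
                         prefix="optimMethod-TorchModelf53bddcc")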
Example 9
    def test_model_with_bn_to_pytorch(self):
        class SimpleTorchModel(nn.Module):
            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.bn1 = torch.nn.BatchNorm1d(4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = self.bn1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        torch_model = SimpleTorchModel()
        az_model = TorchModel.from_pytorch(torch_model)
        dummy_input = torch.ones(16, 2)
        zoo_result = az_model.forward(dummy_input.numpy())

        exported_model = az_model.to_pytorch()
        assert len(list(exported_model.named_buffers())) != 0
Example 10
    def test_model_to_pytorch(self):
        class SimpleTorchModel(nn.Module):
            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        torch_model = SimpleTorchModel()
        az_model = TorchModel.from_pytorch(torch_model)

        weights = az_model.get_weights()
        weights[0][0] = 1.0
        az_model.set_weights(weights)

        exported_model = az_model.to_pytorch()
        p = list(exported_model.parameters())
        assert p[0][0][0] == 1.0
Example 11
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir',
                        default='/tmp/data',
                        metavar='N',
                        help='the folder to store the MNIST data')
    parser.add_argument(
        '--batch-size',
        type=int,
        default=256,
        metavar='N',
        help='input batch size for training per executor (default: 256)')
    parser.add_argument(
        '--test-batch-size',
        type=int,
        default=1000,
        metavar='N',
        help='input batch size for testing per executor (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=2,
                        metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='for saving the current model')
    parser.add_argument(
        '--deploy-mode',
        default="local",
        help='supported deploy modes are local, yarn-client and yarn-cluster')

    args = parser.parse_args()

    torch.manual_seed(args.seed)

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir,
                       train=True,
                       download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size,
        shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir,
                       train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size,
        shuffle=False)

    # init on YARN when HADOOP_CONF_DIR and ZOO_CONDA_NAME are provided.
    if args.deploy_mode == "local":
        sc = init_orca_context()
    else:
        sc = init_orca_context(cluster_mode=args.deploy_mode,
                               cores=2,
                               memory="2g",
                               num_nodes=4)

    model = Net()
    model.train()
    criterion = nn.NLLLoss()

    adam = torch.optim.Adam(model.parameters(), lr=args.lr)
    zoo_model = TorchModel.from_pytorch(model)
    zoo_criterion = TorchLoss.from_pytorch(criterion)
    zoo_optim = TorchOptim.from_pytorch(adam)
    zoo_estimator = Estimator(zoo_model, optim_methods=zoo_optim)
    train_featureset = FeatureSet.pytorch_dataloader(train_loader)
    test_featureset = FeatureSet.pytorch_dataloader(test_loader)
    from bigdl.dllib.optim.optimizer import MaxEpoch, EveryEpoch
    zoo_estimator.train_minibatch(train_featureset,
                                  zoo_criterion,
                                  end_trigger=MaxEpoch(args.epochs),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=test_featureset,
                                  validation_method=[Accuracy()])
Example 12
        num_cores_per_executor = 4
        zoo_conda_name = detect_conda_env_name()  # auto detect current conda env name
        sc = init_spark_on_yarn(
            hadoop_conf=hadoop_conf_dir,
            conda_name=zoo_conda_name,
            num_executors=num_executors,
            executor_cores=num_cores_per_executor,
            executor_memory="8g",
            driver_memory="2g",
            driver_cores=1)
    else:
        num_cores_per_executor = 4
        sc = init_spark_on_local(cores=num_cores_per_executor, conf={"spark.driver.memory": "10g"})

    model = CatDogModel()
    zoo_model = TorchModel.from_pytorch(model)

    def lossFunc(input, target):
        return nn.NLLLoss()(input, target.flatten().long())

    zoo_loss = TorchLoss.from_pytorch(lossFunc)

    # prepare training data as Spark DataFrame
    image_path = sys.argv[1]
    imageDF = NNImageReader.readImages(image_path, sc, resizeH=256, resizeW=256, image_codec=1)
    getName = udf(lambda row: os.path.basename(row[0]), StringType())
    getLabel = udf(lambda name: 1.0 if name.startswith('cat') else 0.0, DoubleType())
    labelDF = imageDF.withColumn("name", getName(col("image"))) \
        .withColumn("label", getLabel(col('name'))).cache()
    (trainingDF, validationDF) = labelDF.randomSplit([0.9, 0.1])
Example 13
def main():
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('data', metavar='DIR',
                        help='path to dataset')
    parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                        choices=model_names,
                        help='model architecture: ' +
                             ' | '.join(model_names) +
                             ' (default: resnet18)')
    parser.add_argument('--epochs', default=90, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--max_epochs', default=90, type=int, metavar='N',
                        help='number of max epochs to run')
    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                        help='manual epoch number (useful on restarts)')
    parser.add_argument('-b', '--batch-size', default=256, type=int,
                        metavar='N',
                        help='mini-batch size (default: 256), this is the total '
                             'batch size of all GPUs on the current node when '
                             'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                        metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                        help='momentum')
    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                        metavar='W', help='weight decay (default: 1e-4)',
                        dest='weight_decay')
    parser.add_argument('-p', '--print-freq', default=10, type=int,
                        metavar='N', help='print frequency (default: 10)')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                        help='evaluate model on validation set')
    parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                        help='use pre-trained model')
    parser.add_argument('--world-size', default=-1, type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--rank', default=-1, type=int,
                        help='node rank for distributed training')
    parser.add_argument('--seed', default=None, type=int,
                        help='seed for initializing training. ')
    parser.add_argument('--cores', default=4, type=int,
                        help='num of CPUs to use.')
    parser.add_argument('--nodes', default=1, type=int,
                        help='num of nodes to use.')
    parser.add_argument('--executor_memory', default='20g', type=str,
                        help='size of executor memory.')
    parser.add_argument('--driver_memory', default='20g', type=str,
                        help='size of driver memory.')
    parser.add_argument('--driver_cores', default=1, type=int,
                        help='num of driver cores to use.')
    args = parser.parse_args()
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=args.cores, conf={"spark.driver.memory": "20g"})
    else:
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        num_executors = args.nodes
        executor_memory = args.executor_memory
        driver_memory = args.driver_memory
        driver_cores = args.driver_cores
        num_cores_per_executor = args.cores
        os.environ['ZOO_MKL_NUMTHREADS'] = str(num_cores_per_executor)
        os.environ['OMP_NUM_THREADS'] = str(num_cores_per_executor)
        sc = init_spark_on_yarn(
            hadoop_conf=hadoop_conf_dir,
            conda_name=detect_conda_env_name(),  # auto detect current conda env name
            num_executors=num_executors,
            executor_cores=num_cores_per_executor,
            executor_memory=executor_memory,
            driver_memory=driver_memory,
            driver_cores=driver_cores,
            conf={"spark.rpc.message.maxSize": "1024",
                  "spark.task.maxFailures": "1",
                  "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"})

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True)

    model = torchvision.models.resnet50()
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False)

    # 1281167 is the number of images in the ImageNet-1k training set
    iterationPerEpoch = int(math.ceil(float(1281167) / args.batch_size))
    # decay the learning rate by a factor of 0.1 every 30 epochs
    step = Step(iterationPerEpoch * 30, 0.1)
    # 'leaningrate_schedule' (sic) is the parameter name as spelled in BigDL's SGD
    zooOptimizer = SGD(args.lr, momentum=args.momentum, dampening=0.0,
                       leaningrate_schedule=step, weightdecay=args.weight_decay)
    zooModel = TorchModel.from_pytorch(model)
    criterion = torch.nn.CrossEntropyLoss()
    zooCriterion = TorchLoss.from_pytorch(criterion)
    estimator = Estimator(zooModel, optim_methods=zooOptimizer)
    train_featureSet = FeatureSet.pytorch_dataloader(train_loader)
    test_featureSet = FeatureSet.pytorch_dataloader(val_loader)
    estimator.train_minibatch(train_featureSet, zooCriterion, end_trigger=MaxEpoch(args.max_epochs),
                              checkpoint_trigger=EveryEpoch(), validation_set=test_featureSet,
                              validation_method=[Accuracy(), Top5Accuracy()])