Example #1
    def test_bigdl_pytorch_estimator_shard(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(2, 2)

            def forward(self, x):
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        model = SimpleModel()

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input,
                                                 target.flatten().long())

        def transform(df):
            result = {
                "x": [df['user'].to_numpy(), df['item'].to_numpy()],
                "y": df['label'].to_numpy()
            }
            return result

        OrcaContext.pandas_read_backend = "pandas"
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        data_shard = read_csv(file_path)
        data_shard = data_shard.transform_shard(transform)

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(model=model,
                                             loss=loss_func,
                                             optimizer=SGD(),
                                             model_dir=temp_dir_name,
                                             backend="bigdl")
            estimator.fit(data=data_shard,
                          epochs=4,
                          batch_size=2,
                          validation_data=data_shard,
                          validation_methods=[Accuracy()],
                          checkpoint_trigger=EveryEpoch())
            estimator.evaluate(data_shard,
                               validation_methods=[Accuracy()],
                               batch_size=2)
            est2 = Estimator.from_torch(model=model,
                                        loss=loss_func,
                                        optimizer=None,
                                        backend="bigdl")
            est2.load(temp_dir_name, loss=loss_func)
            est2.fit(data=data_shard,
                     epochs=8,
                     batch_size=2,
                     validation_data=data_shard,
                     validation_methods=[Accuracy()],
                     checkpoint_trigger=EveryEpoch())
            est2.evaluate(data_shard,
                          validation_methods=[Accuracy()],
                          batch_size=2)
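
Note: the transform above converts each pandas shard into the dict-of-arrays
layout that the Orca Estimator consumes, with a list of feature arrays under
"x" and a label array under "y". A minimal standalone sketch with made-up rows
(the real values live in the ncf.csv test resource):

import pandas as pd

# Hypothetical stand-in for one pandas shard of ncf.csv.
df = pd.DataFrame({"user": [1, 2, 3], "item": [4, 5, 6], "label": [0, 1, 0]})
result = {
    "x": [df["user"].to_numpy(), df["item"].to_numpy()],  # two feature arrays
    "y": df["label"].to_numpy()  # one label array
}
print([a.shape for a in result["x"]], result["y"].shape)  # [(3,), (3,)] (3,)
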
Example #2
    def test_bigdl_pytorch_estimator_shard(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(2, 2)

            def forward(self, x):
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        model = SimpleModel()

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input, target.flatten().long())

        def transform(df):
            result = {
                "x": np.stack([df['user'].to_numpy(), df['item'].to_numpy()], axis=1),
                "y": df['label'].to_numpy()
            }
            return result

        def transform_del_y(d):
            result = {"x": d["x"]}
            return result

        OrcaContext.pandas_read_backend = "pandas"
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        data_shard = read_csv(file_path)
        data_shard = data_shard.transform_shard(transform)

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(model=model, loss=loss_func,
                                             metrics=[Accuracy()],
                                             optimizer=SGD(learningrate_schedule=Default()),
                                             model_dir=temp_dir_name)
            estimator.fit(data=data_shard, epochs=4, batch_size=2, validation_data=data_shard,
                          checkpoint_trigger=EveryEpoch())
            estimator.evaluate(data_shard, batch_size=2)
            est2 = Estimator.from_torch(model=model, loss=loss_func,
                                        metrics=[Accuracy()],
                                        optimizer=None)
            est2.load(temp_dir_name, loss=loss_func)
            est2.fit(data=data_shard, epochs=8, batch_size=2, validation_data=data_shard,
                     checkpoint_trigger=EveryEpoch())
            est2.evaluate(data_shard, batch_size=2)
            pred_result = est2.predict(data_shard)
            pred_c = pred_result.collect()
            assert isinstance(pred_result, SparkXShards)
            pred_shard = data_shard.transform_shard(transform_del_y)
            pred_result2 = est2.predict(pred_shard)
            pred_c_2 = pred_result2.collect()
            assert (pred_c[0]["prediction"] == pred_c_2[0]["prediction"]).all()
Example #3
    def test_train(self):
        estimator = Estimator.from_torch(
            model_creator=model_creator,
            optimizer_creator=optimizer_creator,
            loss_creator=nn.MSELoss,
            scheduler_creator=scheduler_creator,
            config={
                "lr": 1e-2,  # used in optimizer_creator
                "hidden_size": 1,  # used in model_creator
                "batch_size": 4,  # used in data_creator
            })
        stats1 = estimator.fit(train_data_creator, epochs=5)
        train_loss1 = stats1[-1]["train_loss"]
        validation_loss1 = estimator.evaluate(
            validation_data_creator)["val_loss"]

        stats2 = estimator.fit(train_data_creator, epochs=3)
        train_loss2 = stats2[-1]["train_loss"]
        validation_loss2 = estimator.evaluate(
            validation_data_creator)["val_loss"]

        assert train_loss2 <= train_loss1, (train_loss2, train_loss1)
        assert validation_loss2 <= validation_loss1, (validation_loss2,
                                                      validation_loss1)
        estimator.shutdown()
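
Note: this test, like the other linear-regression examples below, assumes
creator functions (model_creator, optimizer_creator, scheduler_creator,
train_data_creator, validation_data_creator) defined in the surrounding test
fixtures. A self-contained sketch of what such creators could look like for a
toy linear task; shapes and hyperparameters here are illustrative assumptions,
not the fixtures' actual definitions:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


def model_creator(config):
    # A single linear layer; "hidden_size" sets its output width.
    return nn.Linear(1, config.get("hidden_size", 1))


def optimizer_creator(model, config):
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 1e-2))


def scheduler_creator(optimizer, config):
    # Halve the learning rate every 5 epochs.
    return torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)


def train_data_creator(config, batch_size=4):
    x = torch.randn(1000, 1)
    y = 2 * x + 1 + 0.1 * torch.randn(1000, 1)  # noisy linear target
    return DataLoader(TensorDataset(x, y),
                      batch_size=config.get("batch_size", batch_size),
                      shuffle=True)


validation_data_creator = train_data_creator
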
Example #4
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir', default='/tmp/data', metavar='N',
                        help='the folder to store MNIST data')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training per executor (default: 256)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing per executor (default: 1000)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--cluster_mode', type=str, default="local",
                        help='The mode for the Spark cluster. local or yarn.')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=False)

    if args.cluster_mode == "local":
        init_orca_context(cores=1, memory="2g")
    elif args.cluster_mode == "yarn":
        init_orca_context(
            cluster_mode="yarn-client", cores=4, num_nodes=2, memory="2g",
            driver_memory="10g", driver_cores=1,
            conf={"spark.rpc.message.maxSize": "1024",
                  "spark.task.maxFailures": "1",
                  "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"})

    model = LeNet()
    model.train()
    criterion = nn.NLLLoss()

    adam = torch.optim.Adam(model.parameters(), args.lr)
    est = Estimator.from_torch(model=model, optimizer=adam, loss=criterion)
    est.fit(data=train_loader, epochs=args.epochs, validation_data=test_loader,
            validation_metrics=[Accuracy()], checkpoint_trigger=EveryEpoch())
    result = est.evaluate(data=test_loader, validation_metrics=[Accuracy()])
    for r in result:
        print(str(r))
    stop_orca_context()
Example #5
    def test_bigdl_pytorch_estimator_dataloader_creator(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.bn1 = torch.nn.BatchNorm1d(4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = self.bn1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        model = SimpleModel()

        estimator = Estimator.from_torch(model=model, loss=nn.BCELoss(),
                                         optimizer=Adam())

        def get_dataloader():
            inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
            targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
            return torch.utils.data.DataLoader(TensorDataset(inputs, targets), batch_size=2)

        estimator.fit(data=get_dataloader, epochs=2, validation_data=get_dataloader,
                      validation_metrics=[Accuracy()], checkpoint_trigger=EveryEpoch())
        estimator.evaluate(data=get_dataloader, validation_metrics=[Accuracy()])
        model = estimator.get_model()
        assert isinstance(model, nn.Module)
Example #6
    def test_horovod_initialized_correctly(self):
        estimator = Estimator.from_torch(
            model=model_creator,
            optimizer=optimizer_creator,
            loss=nn.MSELoss(),
            scheduler_creator=scheduler_creator,
            config={
                "lr": 1e-2,  # used in optimizer_creator
                "hidden_size": 1  # used in model_creator
            },
            backend="horovod",
            workers_per_node=2)

        def get_size():
            import horovod.torch as hvd
            return hvd.size()

        results = estimator.estimator.horovod_runner.run(get_size)
        assert results == [2, 2]

        def get_rank():
            import horovod.torch as hvd
            return hvd.rank()

        results = estimator.estimator.horovod_runner.run(get_rank)
        results = sorted(results)
        assert results == [0, 1]
Example #7
    def test_bigdl_pytorch_estimator_dataframe_predict(self):
        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input,
                                                 target.flatten().long())

        class IdentityNet(nn.Module):
            def __init__(self):
                super().__init__()
                # needed so the optimizer does not raise an empty parameter list error
                self.fc1 = nn.Linear(5, 5)

            def forward(self, input_):
                return input_

        model = IdentityNet()
        rdd = self.sc.range(0, 100)
        df = rdd.map(lambda x: ([float(x)] * 5,
                                [int(np.random.randint(0, 2, size=()))])).toDF(
                                    ["feature", "label"])

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(
                model=model,
                loss=loss_func,
                optimizer=SGD(learningrate_schedule=Default()),
                model_dir=temp_dir_name)
            result = estimator.predict(df, feature_cols=["feature"])
            expr = "sum(cast(feature <> to_array(prediction) as int)) as error"
            assert result.selectExpr(expr).first()["error"] == 0
Example #8
    def test_data_creator(self):
        estimator = Estimator.from_torch(model=get_model,
                                         optimizer=get_optimizer,
                                         loss=nn.BCELoss(),
                                         config={"lr": 1e-2},
                                         workers_per_node=2,
                                         backend="torch_distributed")
        train_stats = estimator.fit(train_data_loader,
                                    epochs=2,
                                    batch_size=128)
        print(train_stats)
        val_stats = estimator.evaluate(val_data_loader, batch_size=64)
        print(val_stats)
        assert 0 < val_stats["val_accuracy"] < 1
        assert estimator.get_model()

        # Verify syncing weights, i.e. the two workers have the same weights after training
        import ray
        remote_workers = estimator.estimator.remote_workers
        state_dicts = ray.get(
            [worker.state_dict.remote() for worker in remote_workers])
        weights = [state["models"] for state in state_dicts]
        worker1_weights = weights[0][0]
        worker2_weights = weights[1][0]
        for layer in list(worker1_weights.keys()):
            assert np.allclose(worker1_weights[layer].numpy(),
                               worker2_weights[layer].numpy())
        estimator.shutdown()
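
Note: get_model, get_optimizer, train_data_loader and val_data_loader come
from shared test utilities. A rough self-contained sketch under the same
contract (model/optimizer creators take a config dict, data creators take
config and batch size); the network and data below are invented for
illustration:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


def get_model(config):
    torch.manual_seed(0)
    return nn.Sequential(nn.Linear(50, 50), nn.ReLU(),
                         nn.Linear(50, 1), nn.Sigmoid())


def get_optimizer(model, config):
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 1e-2))


def train_data_loader(config, batch_size):
    x = torch.randn(2000, 50)
    # Labels depend on the features so BCELoss has signal to learn from.
    y = (x.sum(dim=1, keepdim=True) > 0).float()
    return DataLoader(TensorDataset(x, y), batch_size=batch_size, shuffle=True)


val_data_loader = train_data_loader
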
Example #9
def get_estimator(workers_per_node=1, model_fn=get_model):
    estimator = Estimator.from_torch(model=model_fn,
                                     optimizer=get_optimizer,
                                     loss=nn.BCELoss(),
                                     config={"lr": 1e-2},
                                     workers_per_node=workers_per_node,
                                     backend="torch_distributed")
    return estimator
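
A hypothetical call site for this helper, reusing the fixture sketch above:

est = get_estimator(workers_per_node=2)
stats = est.fit(train_data_loader, epochs=1, batch_size=64)
est.shutdown()
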
Example #10
    def test_bigdl_pytorch_estimator_pandas_dataframe(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(1, 10)

            def forward(self, x):
                x = torch.unsqueeze(x, dim=1)
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input,
                                                 target.flatten().long())

        model = SimpleModel()

        OrcaContext.pandas_read_backend = "pandas"
        file_path = os.path.join(resource_path,
                                 "orca/learn/simple_feature_label.csv")
        data_shard = read_csv(file_path)

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(
                model=model,
                loss=loss_func,
                metrics=[Accuracy()],
                optimizer=SGD(learningrate_schedule=Default()),
                model_dir=temp_dir_name)
            estimator.fit(data=data_shard,
                          epochs=1,
                          batch_size=4,
                          feature_cols=['feature'],
                          label_cols=['label'],
                          validation_data=data_shard,
                          checkpoint_trigger=EveryEpoch())
            estimator.evaluate(data_shard,
                               batch_size=4,
                               feature_cols=['feature'],
                               label_cols=['label'])
            est2 = Estimator.from_torch(model=model,
                                        loss=loss_func,
                                        metrics=[Accuracy()],
                                        optimizer=None)
            est2.load_orca_checkpoint(temp_dir_name)
            est2.predict(data_shard, batch_size=4, feature_cols=['feature'])
Example #11
    def test_save_and_restore(self):
        estimator1 = Estimator.from_torch(
            model=model_creator,
            optimizer=optimizer_creator,
            loss=nn.MSELoss(),
            scheduler_creator=scheduler_creator,
            config={
                "lr": 1e-2,  # used in optimizer_creator
                "hidden_size": 1,  # used in model_creator
                "batch_size": 4,  # used in data_creator
            },
            backend="horovod")
        with TemporaryDirectory() as tmp_path:
            estimator1.fit(train_data_creator, epochs=1)
            checkpoint_path = os.path.join(tmp_path, "checkpoint")
            estimator1.save(checkpoint_path)

            model1 = estimator1.get_model()

            estimator1.shutdown()

            estimator2 = Estimator.from_torch(
                model=model_creator,
                optimizer=optimizer_creator,
                loss=nn.MSELoss(),
                scheduler_creator=scheduler_creator,
                config={
                    "lr": 1e-2,  # used in optimizer_creator
                    "hidden_size": 1,  # used in model_creator
                    "batch_size": 4,  # used in data_creator
                },
                backend="horovod")
            estimator2.load(checkpoint_path)

            model2 = estimator2.get_model()

        model1_state_dict = model1.state_dict()
        model2_state_dict = model2.state_dict()

        assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())

        for k in model1_state_dict:
            assert torch.equal(model1_state_dict[k], model2_state_dict[k])
        estimator2.shutdown()
Example #12
def main():
    parser = argparse.ArgumentParser(description='PyTorch Tensorboard Example')

    parser.add_argument('--cluster_mode',
                        type=str,
                        default="local",
                        help='The cluster mode, such as local, yarn or k8s.')
    args = parser.parse_args()
    if args.cluster_mode == "local":
        init_orca_context()
    elif args.cluster_mode == "yarn":
        init_orca_context(cluster_mode=args.cluster_mode, cores=4, num_nodes=2)

    writer = SummaryWriter('runs/fashion_mnist_experiment_1')
    # constant for classes
    classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal',
               'Shirt', 'Sneaker', 'Bag', 'Ankle Boot')

    # plot some random training images
    dataiter = iter(train_data_creator(config={}))
    images, labels = next(dataiter)

    # create grid of images
    img_grid = torchvision.utils.make_grid(images)

    # show images
    matplotlib_imshow(img_grid, one_channel=True)

    # write to tensorboard
    writer.add_image('four_fashion_mnist_images', img_grid)

    # inspect the model using tensorboard
    writer.add_graph(model_creator(config={}), images)
    writer.close()

    # training loss vs. epochs
    criterion = nn.CrossEntropyLoss()
    orca_estimator = Estimator.from_torch(model=model_creator,
                                          optimizer=optimizer_creator,
                                          loss=criterion,
                                          backend="torch_distributed")
    stats = orca_estimator.fit(train_data_creator, epochs=5, batch_size=4)

    for stat in stats:
        writer.add_scalar("training_loss", stat['train_loss'], stat['epoch'])
    print("Train stats: {}".format(stats))
    val_stats = orca_estimator.evaluate(validation_data_creator)
    print("Validation stats: {}".format(val_stats))
    orca_estimator.shutdown()

    stop_orca_context()
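
Note: matplotlib_imshow is a small display helper these TensorBoard examples
borrow from the PyTorch TensorBoard tutorial. A sketch of it, assuming images
normalized with mean and std 0.5 (the actual fixture may differ):

import matplotlib.pyplot as plt
import numpy as np


def matplotlib_imshow(img, one_channel=False):
    if one_channel:
        img = img.mean(dim=0)  # collapse channels for grayscale display
    img = img / 2 + 0.5  # unnormalize
    npimg = img.numpy()
    if one_channel:
        plt.imshow(npimg, cmap="Greys")
    else:
        plt.imshow(np.transpose(npimg, (1, 2, 0)))
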
Example #13
    def test_linear(self):
        estimator = Estimator.from_torch(model=get_model,
                                         optimizer=get_optimizer,
                                         loss=nn.BCELoss(),
                                         config={"lr": 1e-2},
                                         backend="torch_distributed")
        train_stats = estimator.fit(train_data_loader,
                                    epochs=2,
                                    batch_size=128)
        print(train_stats)
        val_stats = estimator.evaluate(val_data_loader, batch_size=64)
        print(val_stats)
        assert 0 < val_stats["val_accuracy"] < 1
        assert estimator.get_model()
        estimator.shutdown()
Example #14
    def test_bigdl_pytorch_estimator_dataloader_creator(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.bn1 = torch.nn.BatchNorm1d(4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = self.bn1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        def model_creator(config):
            model = SimpleModel()
            return model

        def optim_creator(model, config):
            return optim.Adam(model.parameters(), lr=config.get("lr", 0.01))

        estimator = Estimator.from_torch(model=model_creator,
                                         loss=nn.BCELoss(),
                                         metrics=[Accuracy()],
                                         optimizer=optim_creator,
                                         config={"lr": 0.001})

        def get_dataloader(config, batch_size):
            inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9],
                                   [1, 9]])
            targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
            data_loader = torch.utils.data.DataLoader(
                TensorDataset(inputs, targets),
                batch_size=batch_size,
                num_workers=config.get("threads", 1))
            return data_loader

        estimator.fit(data=get_dataloader,
                      epochs=2,
                      batch_size=2,
                      validation_data=get_dataloader,
                      checkpoint_trigger=EveryEpoch())
        estimator.evaluate(data=get_dataloader, batch_size=2)
        model = estimator.get_model()
        assert isinstance(model, nn.Module)
Example #15
def train_yseq_hvd(workers_per_node, epochs, **config):
    from zoo.orca.learn.pytorch import Estimator
    estimator = Estimator.from_torch(model=model_creator,
                                     optimizer=optimizer_creator,
                                     loss=loss_creator,
                                     workers_per_node=workers_per_node,
                                     config=config)

    stats = estimator.fit(train_data_creator, epochs=epochs)
    for s in stats:
        print(pretty_print(s))
    val_stats = estimator.evaluate(val_data_creator)
    val_loss = val_stats['val_loss']

    # retrieve the model
    yseq = estimator.get_model()
    estimator.shutdown()
    return yseq, val_loss
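
A hypothetical invocation, assuming the yseq creator functions are in scope:

yseq_model, val_loss = train_yseq_hvd(workers_per_node=2, epochs=2, lr=1e-3)
print("validation loss:", val_loss)
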
Example #16
    def test_spark_xshards(self):
        from zoo import init_nncontext
        from zoo.orca.data import SparkXShards
        estimator = Estimator.from_torch(model=get_model,
                                         optimizer=get_optimizer,
                                         loss=nn.BCELoss(),
                                         config={"lr": 1e-1},
                                         backend="torch_distributed")
        sc = init_nncontext()
        x_rdd = sc.parallelize(np.random.rand(4000, 1, 50).astype(np.float32))
        y_rdd = sc.parallelize(
            np.random.randint(0, 2, size=(4000, 1)).astype(np.float32))
        rdd = x_rdd.zip(y_rdd).map(lambda x_y: {'x': x_y[0], 'y': x_y[1]})
        train_rdd, val_rdd = rdd.randomSplit([0.9, 0.1])
        train_xshards = SparkXShards(train_rdd)
        val_xshards = SparkXShards(val_rdd)
        train_stats = estimator.fit(train_xshards, batch_size=256, epochs=2)
        print(train_stats)
        val_stats = estimator.evaluate(val_xshards, batch_size=128)
        print(val_stats)
        estimator.shutdown()
Example #17
    def test_bigdl_pytorch_estimator_dataframe_fit_evaluate(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(5, 5)

            def forward(self, x):
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        model = SimpleModel()

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input,
                                                 target.flatten().long())

        rdd = self.sc.range(0, 100)
        df = rdd.map(lambda x: ([float(x)] * 5,
                                [int(np.random.randint(0, 2, size=()))])).toDF(
                                    ["feature", "label"])

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(
                model=model,
                loss=loss_func,
                metrics=[Accuracy()],
                optimizer=SGD(learningrate_schedule=Default()),
                model_dir=temp_dir_name)
            estimator.fit(data=df,
                          epochs=4,
                          batch_size=2,
                          validation_data=df,
                          checkpoint_trigger=EveryEpoch(),
                          feature_cols=["feature"],
                          label_cols=["label"])
            eval_result = estimator.evaluate(df,
                                             batch_size=2,
                                             feature_cols=["feature"],
                                             label_cols=["label"])
            assert isinstance(eval_result, dict)
Example #18
    def test_train(self):
        estimator = Estimator.from_torch(
            model=model_creator,
            optimizer=optimizer_creator,
            loss=nn.MSELoss(),
            scheduler_creator=scheduler_creator,
            config={
                "lr": 1e-2,  # used in optimizer_creator
                "hidden_size": 1  # used in model_creator
            },
            backend="horovod",
            workers_per_node=2)
        stats1 = estimator.fit(train_data_creator, batch_size=4, epochs=5)
        train_loss1 = stats1[-1]["train_loss"]
        validation_loss1 = estimator.evaluate(
            validation_data_creator)["val_loss"]

        stats2 = estimator.fit(train_data_creator, batch_size=4, epochs=3)
        train_loss2 = stats2[-1]["train_loss"]
        validation_loss2 = estimator.evaluate(
            validation_data_creator)["val_loss"]

        # Verify syncing weights, i.e. the two workers have the same weights after training
        import ray
        import numpy as np
        remote_workers = estimator.estimator.remote_workers
        state_dicts = ray.get(
            [worker.state_dict.remote() for worker in remote_workers])
        weights = [state["models"] for state in state_dicts]
        worker1_weights = weights[0][0]
        worker2_weights = weights[1][0]
        for layer in list(worker1_weights.keys()):
            assert np.allclose(worker1_weights[layer].numpy(),
                               worker2_weights[layer].numpy())

        assert train_loss2 <= train_loss1, (train_loss2, train_loss1)
        # todo this test maybe too strict, need to further check
        # assert validation_loss2 <= validation_loss1, (validation_loss2,
        #                                               validation_loss1)
        estimator.shutdown()
Example #19
def train_example():
    estimator = Estimator.from_torch(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        })

    # train 5 epochs
    stats = estimator.fit(train_data_creator, epochs=5)
    print("train stats: {}".format(stats))
    val_stats = estimator.evaluate(validation_data_creator)
    print("validation stats: {}".format(val_stats))

    # retrieve the model
    model = estimator.estimator.get_model()
    print("trained weight: % .2f, bias: % .2f" %
          (model.weight.item(), model.bias.item()))
Example #20
def train_example(workers_per_node):
    estimator = Estimator.from_torch(
        model=model_creator,
        optimizer=optimizer_creator,
        loss=nn.MSELoss(),
        scheduler_creator=scheduler_creator,
        workers_per_node=workers_per_node,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1  # used in model_creator
        }, backend="horovod")

    # train 5 epochs
    stats = estimator.fit(train_data_creator, batch_size=4, epochs=5)
    print("train stats: {}".format(stats))
    val_stats = estimator.evaluate(validation_data_creator)
    print("validation stats: {}".format(val_stats))

    # retrieve the model
    model = estimator.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        model.weight.item(), model.bias.item()))
Example #21
imshow(torchvision.utils.make_grid(images))
# print labels
print(' '.join('%5s' % classes[labels[j]] for j in range(batch_size)))

dataiter = iter(test_loader)
images, labels = next(dataiter)
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ',
      ' '.join('%5s' % classes[labels[j]] for j in range(batch_size)))

if args.backend == "bigdl":
    net = model_creator(config={})
    optimizer = optim_creator(model=net, config={"lr": 0.001})
    orca_estimator = Estimator.from_torch(model=net,
                                          optimizer=optimizer,
                                          loss=criterion,
                                          metrics=[Accuracy()],
                                          backend="bigdl")

    orca_estimator.fit(data=train_loader,
                       epochs=2,
                       validation_data=test_loader,
                       checkpoint_trigger=EveryEpoch())

    res = orca_estimator.evaluate(data=test_loader)
    print("Accuracy of the network on the test images: %s" % res)
elif args.backend == "torch_distributed":
    orca_estimator = Estimator.from_torch(model=model_creator,
                                          optimizer=optim_creator,
                                          loss=criterion,
                                          metrics=[Accuracy()],
Example #22
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

net.train()
orca_estimator = Estimator.from_torch(model=net,
                                      optimizer=optimizer,
                                      loss=criterion,
                                      metrics=[Accuracy()],
                                      backend="bigdl")
orca_estimator.fit(data=trainloader,
                   epochs=2,
                   validation_data=testloader,
                   checkpoint_trigger=EveryEpoch())
print('Finished Training')
dataiter = iter(testloader)
images, labels = next(dataiter)

# print images
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))

res = orca_estimator.evaluate(data=testloader)
Example #23
    return net


criterion = nn.MSELoss()


def optim_creator(model, config):
    return optim.Adam(model.parameters(), lr=config.get("lr", 0.01))


estimator = Estimator.from_torch(
    model=model_creator,
    optimizer=optim_creator,
    loss=nn.MSELoss(),
    backend="torch_distributed",
    config={
        "lr": opt.lr,
        "upscale_factor": opt.upscale_factor,
        "threads": opt.threads,
        "seed": opt.seed
    }
)


def train(epoch):
    stats = estimator.fit(data=train_data_creator, epochs=1, batch_size=opt.batch_size)
    for epochinfo in stats:
        print("===> Epoch {} Complete: Avg. Loss: {:.4f}"
              .format(epoch, epochinfo["train_loss"]))


def test():
Example #24
def main():
    parser = argparse.ArgumentParser(description='PyTorch Tensorboard Example')
    parser.add_argument('--cluster_mode',
                        type=str,
                        default="local",
                        help='The cluster mode, such as local, yarn or k8s.')
    parser.add_argument('--backend',
                        type=str,
                        default="bigdl",
                        help='The backend of PyTorch Estimator; '
                        'bigdl and torch_distributed are supported.')
    args = parser.parse_args()

    if args.cluster_mode == "local":
        init_orca_context()
    elif args.cluster_mode == "yarn":
        init_orca_context(cluster_mode=args.cluster_mode, cores=4, num_nodes=2)

    tensorboard_dir = "runs"
    writer = SummaryWriter(tensorboard_dir + '/fashion_mnist_experiment_1')
    # constant for classes
    classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal',
               'Shirt', 'Sneaker', 'Bag', 'Ankle Boot')

    # plot some random training images
    dataiter = iter(train_data_creator(config={}, batch_size=4))
    images, labels = next(dataiter)

    # create grid of images
    img_grid = torchvision.utils.make_grid(images)

    # show images
    matplotlib_imshow(img_grid, one_channel=True)

    # write to tensorboard
    writer.add_image('four_fashion_mnist_images', img_grid)

    # inspect the model using tensorboard
    writer.add_graph(model_creator(config={}), images)
    writer.close()

    # training loss vs. epochs
    criterion = nn.CrossEntropyLoss()
    batch_size = 4
    epochs = 5
    if args.backend == "bigdl":
        train_loader = train_data_creator(config={}, batch_size=batch_size)
        test_loader = validation_data_creator(config={}, batch_size=batch_size)

        net = model_creator(config={})
        optimizer = optimizer_creator(model=net, config={"lr": 0.001})
        orca_estimator = Estimator.from_torch(model=net,
                                              optimizer=optimizer,
                                              loss=criterion,
                                              metrics=[Accuracy()],
                                              backend="bigdl")

        orca_estimator.set_tensorboard(tensorboard_dir, "bigdl")

        orca_estimator.fit(data=train_loader,
                           epochs=epochs,
                           validation_data=test_loader,
                           checkpoint_trigger=EveryEpoch())

        res = orca_estimator.evaluate(data=test_loader)
        print("Accuracy of the network on the test images: %s" % res)
    elif args.backend == "torch_distributed":
        orca_estimator = Estimator.from_torch(model=model_creator,
                                              optimizer=optimizer_creator,
                                              loss=criterion,
                                              metrics=[Accuracy()],
                                              backend="torch_distributed")
        stats = orca_estimator.fit(train_data_creator,
                                   epochs=epochs,
                                   batch_size=batch_size)

        for stat in stats:
            writer.add_scalar("training_loss", stat['train_loss'],
                              stat['epoch'])
        print("Train stats: {}".format(stats))
        val_stats = orca_estimator.evaluate(validation_data_creator,
                                            batch_size=batch_size)
        print("Validation stats: {}".format(val_stats))
        orca_estimator.shutdown()
    else:
        raise NotImplementedError(
            "Only bigdl and torch_distributed are supported "
            "as the backend, but got {}".format(args.backend))

    stop_orca_context()
Example #25
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

net.train()
orca_estimator = Estimator.from_torch(model=net,
                                      optimizer=optimizer,
                                      loss=criterion,
                                      backend="bigdl")
orca_estimator.fit(data=trainloader,
                   epochs=2,
                   validation_data=testloader,
                   validation_metrics=[Accuracy()],
                   checkpoint_trigger=EveryEpoch())
print('Finished Training')
dataiter = iter(testloader)
images, labels = next(dataiter)

# print images
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))

res = orca_estimator.evaluate(data=testloader,
Example #26
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir',
                        default='/tmp/data',
                        metavar='N',
                        help='the folder to store MNIST data')
    parser.add_argument(
        '--batch-size',
        type=int,
        default=256,
        metavar='N',
        help='input batch size for training per executor (default: 256)')
    parser.add_argument(
        '--test-batch-size',
        type=int,
        default=1000,
        metavar='N',
        help='input batch size for testing per executor (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=2,
                        metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        args.dir,
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                               batch_size=args.batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        args.dir,
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                              batch_size=args.test_batch_size,
                                              shuffle=False)

    # Initialize on YARN when HADOOP_CONF_DIR and ZOO_CONDA_NAME are provided.
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=1, conf={"spark.driver.memory": "20g"})
    else:
        num_executors = 2
        num_cores_per_executor = 4
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        zoo_conda_name = os.environ.get(
            'ZOO_CONDA_NAME')  # The name of the created conda-env
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                                conda_name=zoo_conda_name,
                                num_executors=num_executors,
                                executor_cores=num_cores_per_executor,
                                executor_memory="2g",
                                driver_memory="10g",
                                driver_cores=1,
                                conf={
                                    "spark.rpc.message.maxSize":
                                    "1024",
                                    "spark.task.maxFailures":
                                    "1",
                                    "spark.driver.extraJavaOptions":
                                    "-Dbigdl.failure.retryTimes=1"
                                })

    model = LeNet()
    model.train()
    criterion = nn.NLLLoss()

    adam = Adam(args.lr)
    zoo_estimator = Estimator.from_torch(model=model,
                                         optimizer=adam,
                                         loss=criterion,
                                         backend="bigdl")
    from bigdl.optim.optimizer import EveryEpoch
    zoo_estimator.fit(data=train_loader,
                      epochs=args.epochs,
                      validation_data=test_loader,
                      validation_methods=[Accuracy()],
                      checkpoint_trigger=EveryEpoch())
    zoo_estimator.evaluate(data=test_loader, validation_methods=[Accuracy()])
Example #27
    def test_bigdl_pytorch_estimator_save_and_load(self):
        class Network(nn.Module):
            def __init__(self):
                super(Network, self).__init__()

                self.fc1 = nn.Linear(28 * 28, 500)
                self.fc2 = nn.Linear(500, 10)

            def forward(self, x):
                x = x.view(-1, 28 * 28)
                x = F.relu(self.fc1(x))
                x = self.fc2(x)
                return F.log_softmax(x, dim=1)

        model = Network()
        model.train()
        criterion = nn.NLLLoss()
        adam = torch.optim.Adam(model.parameters(), 0.001)

        dir = "./dataset"
        batch_size = 320
        train_loader = torch.utils.data.DataLoader(datasets.MNIST(
            dir,
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ])),
                                                   batch_size=batch_size,
                                                   shuffle=True)

        test_loader = torch.utils.data.DataLoader(datasets.MNIST(
            dir,
            train=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ])),
                                                  batch_size=batch_size,
                                                  shuffle=False)

        # epoch 1
        est = Estimator.from_torch(model=model,
                                   optimizer=adam,
                                   loss=criterion,
                                   metrics=[Accuracy()])

        est.fit(data=train_loader,
                epochs=1,
                validation_data=test_loader,
                batch_size=batch_size,
                checkpoint_trigger=EveryEpoch())
        paras1 = list(est.get_model().named_parameters())
        est.save("model_epoch_1")

        # epoch 2
        est.fit(data=train_loader,
                epochs=2,
                validation_data=test_loader,
                batch_size=batch_size,
                checkpoint_trigger=EveryEpoch())
        paras2 = list(est.get_model().named_parameters())
        est.load("model_epoch_1")
        paras3 = list(est.get_model().named_parameters())

        load_success = 0
        for i in range(len(paras2)):
            name2, para2 = paras2[i]
            name3, para3 = paras3[i]
            if not torch.all(torch.eq(para2, para3)):
                load_success = 1
                break
        if not load_success:
            raise Exception(
                "Load failed. Parameters did not change after loading.")

        for i in range(len(paras1)):
            name1, para1 = paras1[i]
            name3, para3 = paras3[i]
            if not torch.all(torch.eq(para1, para3)):
                raise Exception("After reloading the model," + name1 +
                                "does not match.")
        print("pass")
Example #28
def optim_creator(model, config):
    return optim.Adam(model.parameters(), lr=config.get("lr", 0.01))


criterion = nn.MSELoss()
model_dir = "models"

if opt.backend == "bigdl":
    model = model_creator(config={
        "upscale_factor": opt.upscale_factor,
        "seed": opt.seed
    })
    optimizer = optim_creator(model, config={"lr": opt.lr})
    estimator = Estimator.from_torch(model=model,
                                     optimizer=optimizer,
                                     loss=criterion,
                                     metrics=[MSE()],
                                     model_dir=model_dir,
                                     backend="bigdl")

    train_loader = train_data_creator(config={
        "upscale_factor": opt.upscale_factor,
        "threads": opt.threads
    },
                                      batch_size=opt.batch_size)
    test_loader = validation_data_creator(config={
        "upscale_factor": opt.upscale_factor,
        "threads": opt.threads
    },
                                          batch_size=opt.batch_size)

    estimator.fit(data=train_loader,