def test_bigdl_pytorch_estimator_shard(self):
    """Fit/evaluate an XShards-fed Estimator on the BigDL backend, then
    resume training from the checkpoint with a second Estimator.

    Uses the legacy ``validation_methods``/``backend="bigdl"`` API.
    """
    class SimpleModel(nn.Module):
        # Tiny 2-in/2-out linear classifier; log-softmax output for NLL-style loss.
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(2, 2)

        def forward(self, x):
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    model = SimpleModel()

    def loss_func(input, target):
        # CrossEntropyLoss expects 1-D long targets, hence flatten().long().
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    def transform(df):
        # Map a pandas partition to the {'x', 'y'} dict layout Orca expects.
        # NOTE(review): 'x' is a list of two 1-D arrays here (user, item columns).
        result = {
            "x": [df['user'].to_numpy(), df['item'].to_numpy()],
            "y": df['label'].to_numpy()
        }
        return result

    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)
    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_torch(model=model, loss=loss_func,
                                         optimizer=SGD(),
                                         model_dir=temp_dir_name,
                                         backend="bigdl")
        estimator.fit(data=data_shard, epochs=4, batch_size=2,
                      validation_data=data_shard,
                      validation_methods=[Accuracy()],
                      checkpoint_trigger=EveryEpoch())
        estimator.evaluate(data_shard, validation_methods=[Accuracy()],
                           batch_size=2)
        # A fresh estimator (no optimizer) must be able to load the
        # checkpoint written above and continue training.
        est2 = Estimator.from_torch(model=model, loss=loss_func,
                                    optimizer=None, backend="bigdl")
        est2.load(temp_dir_name, loss=loss_func)
        est2.fit(data=data_shard, epochs=8, batch_size=2,
                 validation_data=data_shard,
                 validation_methods=[Accuracy()],
                 checkpoint_trigger=EveryEpoch())
        est2.evaluate(data_shard, validation_methods=[Accuracy()],
                      batch_size=2)
def test_bigdl_pytorch_estimator_shard(self):
    """XShards fit/evaluate/predict round-trip on the BigDL backend.

    Also checks that predict() returns a SparkXShards and that predictions
    are identical whether or not the input shards carry a 'y' key.
    """
    class SimpleModel(nn.Module):
        # Tiny 2-in/2-out linear classifier; log-softmax output.
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(2, 2)

        def forward(self, x):
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    model = SimpleModel()

    def loss_func(input, target):
        # CrossEntropyLoss expects 1-D long targets, hence flatten().long().
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    def transform(df):
        # Stack (user, item) columns into an (n, 2) feature matrix.
        result = {
            "x": np.stack([df['user'].to_numpy(), df['item'].to_numpy()], axis=1),
            "y": df['label'].to_numpy()
        }
        return result

    def transform_del_y(d):
        # Drop labels to exercise the label-free predict path.
        result = {"x": d["x"]}
        return result

    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)
    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_torch(model=model, loss=loss_func,
                                         metrics=[Accuracy()],
                                         optimizer=SGD(learningrate_schedule=Default()),
                                         model_dir=temp_dir_name)
        estimator.fit(data=data_shard, epochs=4, batch_size=2,
                      validation_data=data_shard,
                      checkpoint_trigger=EveryEpoch())
        estimator.evaluate(data_shard, batch_size=2)
        # Resume training from the checkpoint with a fresh estimator.
        est2 = Estimator.from_torch(model=model, loss=loss_func,
                                    metrics=[Accuracy()], optimizer=None)
        est2.load(temp_dir_name, loss=loss_func)
        est2.fit(data=data_shard, epochs=8, batch_size=2,
                 validation_data=data_shard,
                 checkpoint_trigger=EveryEpoch())
        est2.evaluate(data_shard, batch_size=2)
        pred_result = est2.predict(data_shard)
        pred_c = pred_result.collect()
        # BUG FIX: the original `assert(pred_result, SparkXShards)` asserted a
        # non-empty tuple, which is always true. Assert the intended type check.
        assert isinstance(pred_result, SparkXShards)
        pred_shard = data_shard.transform_shard(transform_del_y)
        pred_result2 = est2.predict(pred_shard)
        pred_c_2 = pred_result2.collect()
        # Predictions must not depend on the presence of labels in the shards.
        assert (pred_c[0]["prediction"] == pred_c_2[0]["prediction"]).all()
def test_train(self):
    """Continuing training on the same estimator must not increase losses."""
    config = {
        "lr": 1e-2,          # consumed by optimizer_creator
        "hidden_size": 1,    # consumed by model_creator
        "batch_size": 4,     # consumed by data_creator
    }
    estimator = Estimator.from_torch(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator,
        config=config)

    first_stats = estimator.fit(train_data_creator, epochs=5)
    first_train_loss = first_stats[-1]["train_loss"]
    first_val_loss = estimator.evaluate(validation_data_creator)["val_loss"]

    second_stats = estimator.fit(train_data_creator, epochs=3)
    second_train_loss = second_stats[-1]["train_loss"]
    second_val_loss = estimator.evaluate(validation_data_creator)["val_loss"]

    assert second_train_loss <= first_train_loss, (second_train_loss,
                                                   first_train_loss)
    assert second_val_loss <= first_val_loss, (second_val_loss,
                                               first_val_loss)
    estimator.shutdown()
def main():
    """Train LeNet on MNIST with the Orca Estimator (local or YARN cluster)."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir', default='/tmp/data', metavar='N',
                        help='the folder store mnist data')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training per executor(default: 256)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing per executor(default: 1000)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    # BUG FIX: the help literal was split across physical lines in the source
    # (a syntax error); rejoined into a single string.
    parser.add_argument('--cluster_mode', type=str, default="local",
                        help='The mode for the Spark cluster. local or yarn.')
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=False)
    if args.cluster_mode == "local":
        init_orca_context(cores=1, memory="2g")
    elif args.cluster_mode == "yarn":
        init_orca_context(
            cluster_mode="yarn-client", cores=4, num_nodes=2, memory="2g",
            driver_memory="10g", driver_cores=1,
            conf={"spark.rpc.message.maxSize": "1024",
                  "spark.task.maxFailures": "1",
                  "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"})
    model = LeNet()
    model.train()
    criterion = nn.NLLLoss()
    adam = torch.optim.Adam(model.parameters(), args.lr)
    est = Estimator.from_torch(model=model, optimizer=adam, loss=criterion)
    est.fit(data=train_loader, epochs=args.epochs,
            validation_data=test_loader, validation_metrics=[Accuracy()],
            checkpoint_trigger=EveryEpoch())
    result = est.evaluate(data=test_loader, validation_metrics=[Accuracy()])
    for r in result:
        print(str(r))
    stop_orca_context()
def test_bigdl_pytorch_estimator_dataloader_creator(self):
    """Fit/evaluate a BigDL-backed Estimator fed by a DataLoader-creator function."""
    class SimpleModel(nn.Module):
        # 2 -> 4 -> 1 MLP with batch-norm; sigmoid output to pair with BCELoss.
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.bn1 = torch.nn.BatchNorm1d(4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = self.bn1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    model = SimpleModel()
    estimator = Estimator.from_torch(model=model, loss=nn.BCELoss(),
                                     optimizer=Adam())

    def get_dataloader():
        # Six hand-written samples with linearly separable binary labels.
        inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
        targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
        return torch.utils.data.DataLoader(TensorDataset(inputs, targets),
                                           batch_size=2)

    # The callable (not a materialized loader) is passed so each worker can
    # build its own DataLoader.
    estimator.fit(data=get_dataloader, epochs=2,
                  validation_data=get_dataloader,
                  validation_metrics=[Accuracy()],
                  checkpoint_trigger=EveryEpoch())
    estimator.evaluate(data=get_dataloader, validation_metrics=[Accuracy()])
    model = estimator.get_model()
    assert isinstance(model, nn.Module)
def test_horovod_initialized_correctly(self):
    """The horovod backend should spin up exactly two ranked workers."""
    estimator = Estimator.from_torch(
        model=model_creator,
        optimizer=optimizer_creator,
        loss=nn.MSELoss(),
        scheduler_creator=scheduler_creator,
        config={
            "lr": 1e-2,        # consumed by optimizer_creator
            "hidden_size": 1   # consumed by model_creator
        },
        backend="horovod",
        workers_per_node=2)

    def horovod_size():
        import horovod.torch as hvd
        return hvd.size()

    # Every worker should report the same world size of 2.
    size_results = estimator.estimator.horovod_runner.run(horovod_size)
    assert size_results == [2, 2]

    def horovod_rank():
        import horovod.torch as hvd
        return hvd.rank()

    # Ranks must be the distinct values 0 and 1 (order not guaranteed).
    rank_results = sorted(estimator.estimator.horovod_runner.run(horovod_rank))
    assert rank_results == [0, 1]
def test_bigdl_pytorch_estimator_dataframe_predict(self):
    """predict() on a Spark DataFrame must echo features through IdentityNet."""
    def loss_func(input, target):
        # CrossEntropyLoss expects 1-D long targets.
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    class IdentityNet(nn.Module):
        def __init__(self):
            super().__init__()
            # need this line to avoid optimizer raise empty variable list
            self.fc1 = nn.Linear(5, 5)

        def forward(self, input_):
            return input_

    model = IdentityNet()
    rdd = self.sc.range(0, 100)
    # 100 rows: 5-dim feature [x, x, x, x, x] plus a random binary label.
    df = rdd.map(lambda x: ([float(x)] * 5,
                            [int(np.random.randint(0, 2, size=()))])).toDF(
        ["feature", "label"])
    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_torch(
            model=model,
            loss=loss_func,
            optimizer=SGD(learningrate_schedule=Default()),
            model_dir=temp_dir_name)
        result = estimator.predict(df, feature_cols=["feature"])
        # Identity model => every prediction must equal its feature vector,
        # so the count of mismatching rows must be zero.
        expr = "sum(cast(feature <> to_array(prediction) as int)) as error"
        assert result.selectExpr(expr).first()["error"] == 0
def test_data_creator(self):
    """Two torch_distributed workers train via data creators and end in sync."""
    estimator = Estimator.from_torch(model=get_model,
                                     optimizer=get_optimizer,
                                     loss=nn.BCELoss(),
                                     config={"lr": 1e-2},
                                     workers_per_node=2,
                                     backend="torch_distributed")
    fit_stats = estimator.fit(train_data_loader, epochs=2, batch_size=128)
    print(fit_stats)
    eval_stats = estimator.evaluate(val_data_loader, batch_size=64)
    print(eval_stats)
    assert 0 < eval_stats["val_accuracy"] < 1
    assert estimator.get_model()

    # Verify syncing weights, i.e. the two workers have the same weights after training
    import ray
    workers = estimator.estimator.remote_workers
    states = ray.get([w.state_dict.remote() for w in workers])
    first_weights, second_weights = (s["models"][0] for s in states)
    for layer in first_weights:
        assert np.allclose(first_weights[layer].numpy(),
                           second_weights[layer].numpy())
    estimator.shutdown()
def get_estimator(workers_per_node=1, model_fn=get_model):
    """Build a torch_distributed Estimator around *model_fn*.

    :param workers_per_node: number of training workers per node.
    :param model_fn: model-creator callable passed to ``Estimator.from_torch``.
    :return: a configured Estimator instance.
    """
    return Estimator.from_torch(model=model_fn,
                                optimizer=get_optimizer,
                                loss=nn.BCELoss(),
                                config={"lr": 1e-2},
                                workers_per_node=workers_per_node,
                                backend="torch_distributed")
def test_bigdl_pytorch_estimator_pandas_dataframe(self):
    """Train/evaluate/predict from pandas-backed XShards using named
    feature/label columns, then resume prediction from a saved checkpoint."""
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(1, 10)

        def forward(self, x):
            # Features arrive as a flat batch of scalars; add the feature dim.
            x = torch.unsqueeze(x, dim=1)
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    def loss_func(input, target):
        # CrossEntropyLoss expects 1-D long targets.
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    model = SimpleModel()
    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/simple_feature_label.csv")
    data_shard = read_csv(file_path)
    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_torch(
            model=model,
            loss=loss_func,
            metrics=[Accuracy()],
            optimizer=SGD(learningrate_schedule=Default()),
            model_dir=temp_dir_name)
        estimator.fit(data=data_shard, epochs=1, batch_size=4,
                      feature_cols=['feature'], label_cols=['label'],
                      validation_data=data_shard,
                      checkpoint_trigger=EveryEpoch())
        estimator.evaluate(data_shard, batch_size=4,
                           feature_cols=['feature'], label_cols=['label'])
        # A fresh estimator (no optimizer) must be able to load the Orca
        # checkpoint written above and run inference.
        est2 = Estimator.from_torch(model=model, loss=loss_func,
                                    metrics=[Accuracy()], optimizer=None)
        est2.load_orca_checkpoint(temp_dir_name)
        est2.predict(data_shard, batch_size=4, feature_cols=['feature'])
def test_save_and_restore(self):
    """save()/load() round-trip on the horovod backend must reproduce
    identical model weights in a second, independent estimator."""
    estimator1 = Estimator.from_torch(
        model=model_creator,
        optimizer=optimizer_creator,
        loss=nn.MSELoss(),
        scheduler_creator=scheduler_creator,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="horovod")
    with TemporaryDirectory() as tmp_path:
        estimator1.fit(train_data_creator, epochs=1)
        checkpoint_path = os.path.join(tmp_path, "checkpoint")
        estimator1.save(checkpoint_path)
        # Snapshot the trained model before tearing the workers down.
        model1 = estimator1.get_model()
        estimator1.shutdown()
        # Second estimator starts from scratch with the same creators and
        # restores the checkpoint saved above.
        estimator2 = Estimator.from_torch(
            model=model_creator,
            optimizer=optimizer_creator,
            loss=nn.MSELoss(),
            scheduler_creator=scheduler_creator,
            config={
                "lr": 1e-2,  # used in optimizer_creator
                "hidden_size": 1,  # used in model_creator
                "batch_size": 4,  # used in data_creator
            },
            backend="horovod")
        estimator2.load(checkpoint_path)
        model2 = estimator2.get_model()
        # Both state dicts must have the same keys and equal tensors.
        model1_state_dict = model1.state_dict()
        model2_state_dict = model2.state_dict()
        assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())
        for k in model1_state_dict:
            assert torch.equal(model1_state_dict[k], model2_state_dict[k])
        estimator2.shutdown()
def main():
    """Fashion-MNIST tensorboard example on the torch_distributed backend.

    Logs sample images and the model graph, trains for 5 epochs while writing
    per-epoch training loss to tensorboard, then evaluates.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tensorboard Example')
    parser.add_argument('--cluster_mode', type=str, default="local",
                        help='The cluster mode, such as local, yarn or k8s.')
    args = parser.parse_args()
    if args.cluster_mode == "local":
        init_orca_context()
    elif args.cluster_mode == "yarn":
        init_orca_context(cluster_mode=args.cluster_mode, cores=4, num_nodes=2)
    writer = SummaryWriter('runs/fashion_mnist_experiment_1')
    # constant for classes
    classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle Boot')
    # plot some random training images
    dataiter = iter(train_data_creator(config={}))
    # BUG FIX: `dataiter.next()` relies on the removed Python-2 iterator
    # method; the builtin next() works on any Python-3 iterator.
    images, labels = next(dataiter)
    # create grid of images
    img_grid = torchvision.utils.make_grid(images)
    # show images
    matplotlib_imshow(img_grid, one_channel=True)
    # write to tensorboard
    writer.add_image('four_fashion_mnist_images', img_grid)
    # inspect the model using tensorboard
    writer.add_graph(model_creator(config={}), images)
    writer.close()
    # training loss vs. epochs
    criterion = nn.CrossEntropyLoss()
    orca_estimator = Estimator.from_torch(model=model_creator,
                                          optimizer=optimizer_creator,
                                          loss=criterion,
                                          backend="torch_distributed")
    stats = orca_estimator.fit(train_data_creator, epochs=5, batch_size=4)
    for stat in stats:
        writer.add_scalar("training_loss", stat['train_loss'], stat['epoch'])
    print("Train stats: {}".format(stats))
    val_stats = orca_estimator.evaluate(validation_data_creator)
    print("Validation stats: {}".format(val_stats))
    orca_estimator.shutdown()
    stop_orca_context()
def test_linear(self):
    """Single-worker torch_distributed training reaches a sane accuracy."""
    est = Estimator.from_torch(model=get_model,
                               optimizer=get_optimizer,
                               loss=nn.BCELoss(),
                               config={"lr": 1e-2},
                               backend="torch_distributed")
    fit_stats = est.fit(train_data_loader, epochs=2, batch_size=128)
    print(fit_stats)
    eval_stats = est.evaluate(val_data_loader, batch_size=64)
    print(eval_stats)
    # Accuracy strictly between 0 and 1 => the model actually learned
    # something without being degenerate.
    assert 0 < eval_stats["val_accuracy"] < 1
    assert est.get_model()
    est.shutdown()
def test_bigdl_pytorch_estimator_dataloader_creator(self):
    """Creator-function variant: model, optimizer, and DataLoader are all
    built from config callables rather than passed as instances."""
    class SimpleModel(nn.Module):
        # 2 -> 4 -> 1 MLP with batch-norm; sigmoid output to pair with BCELoss.
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.bn1 = torch.nn.BatchNorm1d(4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = self.bn1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    def model_creator(config):
        model = SimpleModel()
        return model

    def optim_creator(model, config):
        return optim.Adam(model.parameters(), lr=config.get("lr", 0.01))

    estimator = Estimator.from_torch(model=model_creator,
                                     loss=nn.BCELoss(),
                                     metrics=[Accuracy()],
                                     optimizer=optim_creator,
                                     config={"lr": 0.001})

    def get_dataloader(config, batch_size):
        # Six hand-written samples with linearly separable binary labels.
        inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
        targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
        data_loader = torch.utils.data.DataLoader(
            TensorDataset(inputs, targets),
            batch_size=batch_size,
            num_workers=config.get("threads", 1))
        return data_loader

    estimator.fit(data=get_dataloader, epochs=2, batch_size=2,
                  validation_data=get_dataloader,
                  checkpoint_trigger=EveryEpoch())
    estimator.evaluate(data=get_dataloader, batch_size=2)
    model = estimator.get_model()
    assert isinstance(model, nn.Module)
def train_yseq_hvd(workers_per_node, epochs, **config):
    """Train the yseq model on Horovod workers.

    :param workers_per_node: number of training workers per node.
    :param epochs: number of epochs to fit.
    :param config: forwarded to the model/optimizer/loss creators.
    :return: tuple of (trained model, final validation loss).
    """
    from zoo.orca.learn.pytorch import Estimator
    est = Estimator.from_torch(model=model_creator,
                               optimizer=optimizer_creator,
                               loss=loss_creator,
                               workers_per_node=workers_per_node,
                               config=config)
    for epoch_stats in est.fit(train_data_creator, epochs=epochs):
        print(pretty_print(epoch_stats))
    val_loss = est.evaluate(val_data_creator)['val_loss']
    # Grab the trained weights before tearing the workers down.
    yseq = est.get_model()
    est.shutdown()
    return yseq, val_loss
def test_spark_xshards(self):
    """The estimator should accept SparkXShards of {'x', 'y'} numpy dicts."""
    from zoo import init_nncontext
    from zoo.orca.data import SparkXShards
    est = Estimator.from_torch(model=get_model,
                               optimizer=get_optimizer,
                               loss=nn.BCELoss(),
                               config={"lr": 1e-1},
                               backend="torch_distributed")
    sc = init_nncontext()
    # 4000 random samples of shape (1, 50) with random binary labels.
    feature_rdd = sc.parallelize(np.random.rand(4000, 1, 50).astype(np.float32))
    label_rdd = sc.parallelize(
        np.random.randint(0, 2, size=(4000, 1)).astype(np.float32))
    sample_rdd = feature_rdd.zip(label_rdd).map(
        lambda pair: {'x': pair[0], 'y': pair[1]})
    train_part, val_part = sample_rdd.randomSplit([0.9, 0.1])
    train_stats = est.fit(SparkXShards(train_part), batch_size=256, epochs=2)
    print(train_stats)
    val_stats = est.evaluate(SparkXShards(val_part), batch_size=128)
    print(val_stats)
    est.shutdown()
def test_bigdl_pytorch_estimator_dataframe_fit_evaluate(self):
    """fit()/evaluate() directly on a Spark DataFrame using named
    feature/label columns; evaluate must return a metrics dict."""
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(5, 5)

        def forward(self, x):
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    model = SimpleModel()

    def loss_func(input, target):
        # CrossEntropyLoss expects 1-D long targets.
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    rdd = self.sc.range(0, 100)
    # 100 rows: 5-dim feature [x, x, x, x, x] plus a random binary label.
    df = rdd.map(lambda x: ([float(x)] * 5,
                            [int(np.random.randint(0, 2, size=()))])).toDF(
        ["feature", "label"])
    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_torch(
            model=model,
            loss=loss_func,
            metrics=[Accuracy()],
            optimizer=SGD(learningrate_schedule=Default()),
            model_dir=temp_dir_name)
        estimator.fit(data=df, epochs=4, batch_size=2, validation_data=df,
                      checkpoint_trigger=EveryEpoch(),
                      feature_cols=["feature"], label_cols=["label"])
        eval_result = estimator.evaluate(df, batch_size=2,
                                         feature_cols=["feature"],
                                         label_cols=["label"])
        assert isinstance(eval_result, dict)
def test_train(self):
    """Horovod training on two workers: training loss must not increase
    across fit() calls and worker weights must stay synchronized."""
    estimator = Estimator.from_torch(
        model=model_creator,
        optimizer=optimizer_creator,
        loss=nn.MSELoss(),
        scheduler_creator=scheduler_creator,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1  # used in model_creator
        },
        backend="horovod",
        workers_per_node=2)
    stats1 = estimator.fit(train_data_creator, batch_size=4, epochs=5)
    train_loss1 = stats1[-1]["train_loss"]
    validation_loss1 = estimator.evaluate(
        validation_data_creator)["val_loss"]
    stats2 = estimator.fit(train_data_creator, batch_size=4, epochs=3)
    train_loss2 = stats2[-1]["train_loss"]
    validation_loss2 = estimator.evaluate(
        validation_data_creator)["val_loss"]
    # Verify syncing weights, i.e. the two workers have the same weights after training
    import ray
    import numpy as np
    remote_workers = estimator.estimator.remote_workers
    state_dicts = ray.get(
        [worker.state_dict.remote() for worker in remote_workers])
    weights = [state["models"] for state in state_dicts]
    worker1_weights = weights[0][0]
    worker2_weights = weights[1][0]
    for layer in list(worker1_weights.keys()):
        assert np.allclose(worker1_weights[layer].numpy(),
                           worker2_weights[layer].numpy())
    assert train_loss2 <= train_loss1, (train_loss2, train_loss1)
    # todo this test maybe too strict, need to further check
    # assert validation_loss2 <= validation_loss1, (validation_loss2,
    #                                               validation_loss1)
    estimator.shutdown()
def train_example():
    """Fit for five epochs, report stats, and print the learned parameters."""
    estimator = Estimator.from_torch(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator,
        config={
            "lr": 1e-2,          # consumed by optimizer_creator
            "hidden_size": 1,    # consumed by model_creator
            "batch_size": 4,     # consumed by data_creator
        })
    # train 5 epochs
    train_stats = estimator.fit(train_data_creator, epochs=5)
    print("train stats: {}".format(train_stats))
    val_stats = estimator.evaluate(validation_data_creator)
    print("validation stats: {}".format(val_stats))
    # retrieve the model
    trained = estimator.estimator.get_model()
    print("trained weight: % .2f, bias: % .2f"
          % (trained.weight.item(), trained.bias.item()))
def train_example(workers_per_node):
    """Horovod variant: train five epochs and print the fitted line.

    :param workers_per_node: number of Horovod workers per node.
    """
    est = Estimator.from_torch(
        model=model_creator,
        optimizer=optimizer_creator,
        loss=nn.MSELoss(),
        scheduler_creator=scheduler_creator,
        workers_per_node=workers_per_node,
        config={
            "lr": 1e-2,        # consumed by optimizer_creator
            "hidden_size": 1   # consumed by model_creator
        },
        backend="horovod")
    # train 5 epochs
    train_stats = est.fit(train_data_creator, batch_size=4, epochs=5)
    print("train stats: {}".format(train_stats))
    val_stats = est.evaluate(validation_data_creator)
    print("validation stats: {}".format(val_stats))
    # retrieve the model
    fitted = est.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        fitted.weight.item(), fitted.bias.item()))
imshow(torchvision.utils.make_grid(images)) # print labels print(' '.join('%5s' % classes[labels[j]] for j in range(batch_size))) dataiter = iter(test_loader) images, labels = dataiter.next() imshow(torchvision.utils.make_grid(images)) print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(batch_size))) if args.backend == "bigdl": net = model_creator(config={}) optimizer = optim_creator(model=net, config={"lr": 0.001}) orca_estimator = Estimator.from_torch(model=net, optimizer=optimizer, loss=criterion, metrics=[Accuracy()], backend="bigdl") orca_estimator.fit(data=train_loader, epochs=2, validation_data=test_loader, checkpoint_trigger=EveryEpoch()) res = orca_estimator.evaluate(data=test_loader) print("Accuracy of the network on the test images: %s" % res) elif args.backend == "torch_distributed": orca_estimator = Estimator.from_torch(model=model_creator, optimizer=optim_creator, loss=criterion, metrics=[Accuracy()],
x = self.pool(F.relu(self.conv2(x))) x = x.view(-1, 16 * 5 * 5) x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x net = Net() criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) net.train() orca_estimator = Estimator.from_torch(model=net, optimizer=optimizer, loss=criterion, metrics=[Accuracy()], backend="bigdl") orca_estimator.fit(data=trainloader, epochs=2, validation_data=testloader, checkpoint_trigger=EveryEpoch()) print('Finished Training') dataiter = iter(testloader) images, labels = dataiter.next() # print images imshow(torchvision.utils.make_grid(images)) print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4))) res = orca_estimator.evaluate(data=testloader)
return net criterion = nn.MSELoss() def optim_creator(model, config): return optim.Adam(model.parameters(), lr=config.get("lr", 0.01)) estimator = Estimator.from_torch( model=model_creator, optimizer=optim_creator, loss=nn.MSELoss(), backend="torch_distributed", config={ "lr": opt.lr, "upscale_factor": opt.upscale_factor, "threads": opt.threads, "seed": opt.seed } ) def train(epoch): stats = estimator.fit(data=train_data_creator, epochs=1, batch_size=opt.batch_size) for epochinfo in stats: print("===> Epoch {} Complete: Avg. Loss: {:.4f}" .format(epoch, epochinfo["train_loss"])) def test():
def main():
    """Fashion-MNIST tensorboard example with a selectable Orca backend.

    Logs sample images and the model graph, then trains/evaluates on either
    the bigdl or the torch_distributed backend depending on ``--backend``.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tensorboard Example')
    parser.add_argument('--cluster_mode', type=str, default="local",
                        help='The cluster mode, such as local, yarn or k8s.')
    parser.add_argument('--backend', type=str, default="bigdl",
                        help='The backend of PyTorch Estimator; '
                             'bigdl and torch_distributed are supported.')
    args = parser.parse_args()
    if args.cluster_mode == "local":
        init_orca_context()
    elif args.cluster_mode == "yarn":
        init_orca_context(cluster_mode=args.cluster_mode, cores=4, num_nodes=2)
    tensorboard_dir = "runs"
    writer = SummaryWriter(tensorboard_dir + '/fashion_mnist_experiment_1')
    # constant for classes
    classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle Boot')
    # plot some random training images
    dataiter = iter(train_data_creator(config={}, batch_size=4))
    # BUG FIX: `dataiter.next()` relies on the removed Python-2 iterator
    # method; the builtin next() works on any Python-3 iterator.
    images, labels = next(dataiter)
    # create grid of images
    img_grid = torchvision.utils.make_grid(images)
    # show images
    matplotlib_imshow(img_grid, one_channel=True)
    # write to tensorboard
    writer.add_image('four_fashion_mnist_images', img_grid)
    # inspect the model using tensorboard
    writer.add_graph(model_creator(config={}), images)
    writer.close()
    # training loss vs. epochs
    criterion = nn.CrossEntropyLoss()
    batch_size = 4
    epochs = 5
    if args.backend == "bigdl":
        train_loader = train_data_creator(config={}, batch_size=batch_size)
        test_loader = validation_data_creator(config={}, batch_size=batch_size)
        net = model_creator(config={})
        optimizer = optimizer_creator(model=net, config={"lr": 0.001})
        orca_estimator = Estimator.from_torch(model=net,
                                              optimizer=optimizer,
                                              loss=criterion,
                                              metrics=[Accuracy()],
                                              backend="bigdl")
        orca_estimator.set_tensorboard(tensorboard_dir, "bigdl")
        orca_estimator.fit(data=train_loader, epochs=epochs,
                           validation_data=test_loader,
                           checkpoint_trigger=EveryEpoch())
        res = orca_estimator.evaluate(data=test_loader)
        print("Accuracy of the network on the test images: %s" % res)
    elif args.backend == "torch_distributed":
        orca_estimator = Estimator.from_torch(model=model_creator,
                                              optimizer=optimizer_creator,
                                              loss=criterion,
                                              metrics=[Accuracy()],
                                              backend="torch_distributed")
        stats = orca_estimator.fit(train_data_creator, epochs=epochs,
                                   batch_size=batch_size)
        for stat in stats:
            writer.add_scalar("training_loss", stat['train_loss'],
                              stat['epoch'])
        print("Train stats: {}".format(stats))
        val_stats = orca_estimator.evaluate(validation_data_creator,
                                            batch_size=batch_size)
        print("Validation stats: {}".format(val_stats))
        orca_estimator.shutdown()
    else:
        raise NotImplementedError(
            "Only bigdl and torch_distributed are supported "
            "as the backend, but got {}".format(args.backend))
    stop_orca_context()
x = self.pool(F.relu(self.conv1(x))) x = self.pool(F.relu(self.conv2(x))) x = x.view(-1, 16 * 5 * 5) x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x net = Net() criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) net.train() orca_estimator = Estimator.from_torch(model=net, optimizer=optimizer, loss=criterion, backend="bigdl") orca_estimator.fit(data=trainloader, epochs=2, validation_data=testloader, validation_metrics=[Accuracy()], checkpoint_trigger=EveryEpoch()) print('Finished Training') dataiter = iter(testloader) images, labels = dataiter.next() # print images imshow(torchvision.utils.make_grid(images)) print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4))) res = orca_estimator.evaluate(data=testloader,
def main():
    """Train LeNet on MNIST via the Analytics Zoo BigDL Estimator,
    running locally or on YARN depending on the environment."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir', default='/tmp/data', metavar='N',
                        help='the folder store mnist data')
    parser.add_argument(
        '--batch-size', type=int, default=256, metavar='N',
        help='input batch size for training per executor(default: 256)')
    parser.add_argument(
        '--test-batch-size', type=int, default=1000, metavar='N',
        help='input batch size for testing per executor(default: 1000)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        args.dir, train=True, download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])), batch_size=args.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        args.dir, train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])), batch_size=args.test_batch_size, shuffle=False)
    # init on yarn when HADOOP_CONF_DIR and ZOO_CONDA_NAME is provided.
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=1, conf={"spark.driver.memory": "20g"})
    else:
        num_executors = 2
        num_cores_per_executor = 4
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        zoo_conda_name = os.environ.get(
            'ZOO_CONDA_NAME')  # The name of the created conda-env
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                                conda_name=zoo_conda_name,
                                num_executors=num_executors,
                                executor_cores=num_cores_per_executor,
                                executor_memory="2g",
                                driver_memory="10g",
                                driver_cores=1,
                                conf={
                                    "spark.rpc.message.maxSize": "1024",
                                    "spark.task.maxFailures": "1",
                                    "spark.driver.extraJavaOptions":
                                        "-Dbigdl.failure.retryTimes=1"
                                })
    model = LeNet()
    model.train()
    criterion = nn.NLLLoss()
    # BigDL Adam (takes the learning rate directly), not torch.optim.Adam.
    adam = Adam(args.lr)
    zoo_estimator = Estimator.from_torch(model=model,
                                         optimizer=adam,
                                         loss=criterion,
                                         backend="bigdl")
    from bigdl.optim.optimizer import EveryEpoch
    zoo_estimator.fit(data=train_loader, epochs=args.epochs,
                      validation_data=test_loader,
                      validation_methods=[Accuracy()],
                      checkpoint_trigger=EveryEpoch())
    zoo_estimator.evaluate(data=test_loader,
                           validation_methods=[Accuracy()])
def test_bigdl_pytorch_estimator_save_and_load(self):
    """save()/load() must restore the epoch-1 weights over later training:
    after load, parameters differ from the epoch-2 state and exactly match
    the snapshot taken when the model was saved."""
    class Network(nn.Module):
        # 784 -> 500 -> 10 MLP for MNIST classification.
        def __init__(self):
            super(Network, self).__init__()
            self.fc1 = nn.Linear(28 * 28, 500)
            self.fc2 = nn.Linear(500, 10)

        def forward(self, x):
            x = x.view(-1, 28 * 28)  # flatten the image
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return F.log_softmax(x, dim=1)

    model = Network()
    model.train()
    criterion = nn.NLLLoss()
    adam = torch.optim.Adam(model.parameters(), 0.001)
    dir = "./dataset"
    batch_size = 320
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        dir, train=True, download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])), batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        dir, train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])), batch_size=batch_size, shuffle=False)
    # epoch 1
    est = Estimator.from_torch(model=model, optimizer=adam, loss=criterion,
                               metrics=[Accuracy()])
    est.fit(data=train_loader, epochs=1, validation_data=test_loader,
            batch_size=batch_size, checkpoint_trigger=EveryEpoch())
    paras1 = list(est.get_model().named_parameters())
    est.save("model_epoch_1")
    # epoch 2
    est.fit(data=train_loader, epochs=2, validation_data=test_loader,
            batch_size=batch_size, checkpoint_trigger=EveryEpoch())
    paras2 = list(est.get_model().named_parameters())
    est.load("model_epoch_1")
    paras3 = list(est.get_model().named_parameters())
    # Load only succeeded if at least one parameter changed away from the
    # epoch-2 state.
    load_success = 0
    for i in range(len(paras2)):
        name2, para2 = paras2[i]
        name3, para3 = paras3[i]
        if not torch.all(torch.eq(para2, para3)):
            load_success = 1
            break
    if not load_success:
        raise Exception(
            "Load failed. Parameters did not change after loading.")
    # Restored parameters must exactly match the snapshot taken after
    # epoch 1 (i.e. at save time).
    for i in range(len(paras1)):
        name1, para1 = paras1[i]
        name3, para3 = paras3[i]
        if not torch.all(torch.eq(para1, para3)):
            raise Exception("After reloading the model," + name1 +
                            "does not match.")
    print("pass")
def optim_creator(model, config): return optim.Adam(model.parameters(), lr=config.get("lr", 0.01)) criterion = nn.MSELoss() model_dir = "models" if opt.backend == "bigdl": model = model_creator(config={ "upscale_factor": opt.upscale_factor, "seed": opt.seed }) optimizer = optim_creator(model, config={"lr": opt.lr}) estimator = Estimator.from_torch(model=model, optimizer=optimizer, loss=criterion, metrics=[MSE()], model_dir=model_dir, backend="bigdl") train_loader = train_data_creator(config={ "upscale_factor": opt.upscale_factor, "threads": opt.threads }, batch_size=opt.batch_size) test_loader = validation_data_creator(config={ "upscale_factor": opt.upscale_factor, "threads": opt.threads }, batch_size=opt.batch_size) estimator.fit(data=train_loader,