def test_bigdl_pytorch_estimator_shard(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(2, 2)

        def forward(self, x):
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    model = SimpleModel()

    def loss_func(input, target):
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    def transform(df):
        result = {
            "x": [df['user'].to_numpy(), df['item'].to_numpy()],
            "y": df['label'].to_numpy()
        }
        return result

    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)

    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_torch(model=model, loss=loss_func,
                                         optimizer=SGD(),
                                         model_dir=temp_dir_name,
                                         backend="bigdl")
        estimator.fit(data=data_shard, epochs=4, batch_size=2,
                      validation_data=data_shard,
                      validation_methods=[Accuracy()],
                      checkpoint_trigger=EveryEpoch())
        estimator.evaluate(data_shard, validation_methods=[Accuracy()], batch_size=2)

        est2 = Estimator.from_torch(model=model, loss=loss_func, optimizer=None,
                                    backend="bigdl")
        est2.load(temp_dir_name, loss=loss_func)
        est2.fit(data=data_shard, epochs=8, batch_size=2,
                 validation_data=data_shard,
                 validation_methods=[Accuracy()],
                 checkpoint_trigger=EveryEpoch())
        est2.evaluate(data_shard, validation_methods=[Accuracy()], batch_size=2)
def test_bigdl_pytorch_estimator_shard(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(2, 2)

        def forward(self, x):
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    model = SimpleModel()

    def loss_func(input, target):
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    def transform(df):
        result = {
            "x": np.stack([df['user'].to_numpy(), df['item'].to_numpy()], axis=1),
            "y": df['label'].to_numpy()
        }
        return result

    def transform_del_y(d):
        result = {"x": d["x"]}
        return result

    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)

    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_torch(model=model, loss=loss_func,
                                         metrics=[Accuracy()],
                                         optimizer=SGD(learningrate_schedule=Default()),
                                         model_dir=temp_dir_name)
        estimator.fit(data=data_shard, epochs=4, batch_size=2,
                      validation_data=data_shard,
                      checkpoint_trigger=EveryEpoch())
        estimator.evaluate(data_shard, batch_size=2)

        est2 = Estimator.from_torch(model=model, loss=loss_func,
                                    metrics=[Accuracy()], optimizer=None)
        est2.load(temp_dir_name, loss=loss_func)
        est2.fit(data=data_shard, epochs=8, batch_size=2,
                 validation_data=data_shard,
                 checkpoint_trigger=EveryEpoch())
        est2.evaluate(data_shard, batch_size=2)

        pred_result = est2.predict(data_shard)
        pred_c = pred_result.collect()
        assert isinstance(pred_result, SparkXShards)

        # Prediction should give the same result on shards without the "y" key.
        pred_shard = data_shard.transform_shard(transform_del_y)
        pred_result2 = est2.predict(pred_shard)
        pred_c_2 = pred_result2.collect()
        assert (pred_c[0]["prediction"] == pred_c_2[0]["prediction"]).all()
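# A small standalone sketch (not part of the test above) illustrating the data
# layout that fit/predict rely on: after transform_shard(transform), each shard of
# the SparkXShards is a dict of numpy arrays with an (n, 2) feature matrix under
# "x" and an (n,) label vector under "y". The function name is illustrative only.
def inspect_first_shard(shards):
    first = shards.collect()[0]
    print("x shape:", first["x"].shape)  # e.g. (n, 2) stacked user/item columns
    print("y shape:", first["y"].shape)  # e.g. (n,) labels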
def test_bigdl_pytorch_estimator_dataloader_creator(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.bn1 = torch.nn.BatchNorm1d(4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = self.bn1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    model = SimpleModel()
    estimator = Estimator.from_torch(model=model, loss=nn.BCELoss(),
                                     optimizer=Adam())

    def get_dataloader():
        inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
        targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
        return torch.utils.data.DataLoader(TensorDataset(inputs, targets),
                                           batch_size=2)

    estimator.fit(data=get_dataloader, epochs=2, validation_data=get_dataloader,
                  validation_metrics=[Accuracy()], checkpoint_trigger=EveryEpoch())
    estimator.evaluate(data=get_dataloader, validation_metrics=[Accuracy()])
    model = estimator.get_model()
    assert isinstance(model, nn.Module)
def test_nnEstimator_fit_with_train_val_summary(self):
    model = Sequential().add(Linear(2, 2))
    criterion = MSECriterion()
    df, val_df = self.get_estimator_df()
    from zoo.orca.learn.metrics import MAE
    est = Estimator.from_bigdl(model=model, loss=criterion, optimizer=Adam(),
                               metrics=[MAE()],
                               feature_preprocessing=SeqToTensor([2]),
                               label_preprocessing=SeqToTensor([2]))
    tmp_dir = tempfile.mkdtemp()
    est.set_tensorboard(log_dir=tmp_dir, app_name="estTest")
    est.fit(df, epochs=5, batch_size=4, validation_data=val_df,
            validation_trigger=EveryEpoch(),
            checkpoint_trigger=SeveralIteration(1))
    res = est.predict(df)
    loss_result = est.get_train_summary("Loss")
    mae_result = est.get_validation_summary("MAE")
    assert type(res).__name__ == 'DataFrame'
    assert len(loss_result) == 5
    assert len(mae_result) == 4
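# A minimal follow-up sketch (assumption: each record returned by
# get_train_summary/get_validation_summary is an (iteration, value, wall-clock-time)
# triple, as read from BigDL's training summaries) for turning the collected loss
# records into a simple plot; the function name is illustrative only.
import matplotlib.pyplot as plt

def plot_loss(loss_records, out_file="loss_curve.png"):
    steps = [r[0] for r in loss_records]
    values = [r[1] for r in loss_records]
    plt.plot(steps, values)
    plt.xlabel("iteration")
    plt.ylabel("training loss")
    plt.savefig(out_file)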
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir', default='/tmp/data', metavar='N',
                        help='the folder to store the MNIST data')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training per executor (default: 256)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing per executor (default: 1000)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='for saving the current model')
    parser.add_argument('--cluster_mode', type=str, default="local",
                        help='the mode for the Spark cluster: local or yarn')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=False)

    if args.cluster_mode == "local":
        init_orca_context(cores=1, memory="2g")
    elif args.cluster_mode == "yarn":
        init_orca_context(
            cluster_mode="yarn-client", cores=4, num_nodes=2, memory="2g",
            driver_memory="10g", driver_cores=1,
            conf={"spark.rpc.message.maxSize": "1024",
                  "spark.task.maxFailures": "1",
                  "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"})

    model = LeNet()
    model.train()
    criterion = nn.NLLLoss()
    adam = torch.optim.Adam(model.parameters(), args.lr)

    est = Estimator.from_torch(model=model, optimizer=adam, loss=criterion)
    est.fit(data=train_loader, epochs=args.epochs, validation_data=test_loader,
            validation_metrics=[Accuracy()], checkpoint_trigger=EveryEpoch())
    result = est.evaluate(data=test_loader, validation_metrics=[Accuracy()])
    for r in result:
        print(str(r))
    stop_orca_context()
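# The --save-model flag above is parsed but never used. A minimal sketch (not part
# of the original example) of how it could be honored before stop_orca_context(),
# assuming est.get_model() returns the trained nn.Module as it does for the bigdl
# backend in the tests above; the file name is illustrative only.
def save_trained_model(est, path="mnist_cnn.pt"):
    torch.save(est.get_model().state_dict(), path)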
def test_bigdl_pytorch_estimator_pandas_dataframe(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(1, 10)

        def forward(self, x):
            x = torch.unsqueeze(x, dim=1)
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    def loss_func(input, target):
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    model = SimpleModel()

    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/simple_feature_label.csv")
    data_shard = read_csv(file_path)

    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_torch(model=model, loss=loss_func,
                                         metrics=[Accuracy()],
                                         optimizer=SGD(learningrate_schedule=Default()),
                                         model_dir=temp_dir_name)
        estimator.fit(data=data_shard, epochs=1, batch_size=4,
                      feature_cols=['feature'], label_cols=['label'],
                      validation_data=data_shard,
                      checkpoint_trigger=EveryEpoch())
        estimator.evaluate(data_shard, batch_size=4, feature_cols=['feature'],
                           label_cols=['label'])
        est2 = Estimator.from_torch(model=model, loss=loss_func,
                                    metrics=[Accuracy()], optimizer=None)
        est2.load_orca_checkpoint(temp_dir_name)
        est2.predict(data_shard, batch_size=4, feature_cols=['feature'])
def test_bigdl_pytorch_estimator_dataloader_creator(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.bn1 = torch.nn.BatchNorm1d(4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = self.bn1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    def model_creator(config):
        model = SimpleModel()
        return model

    def optim_creator(model, config):
        return optim.Adam(model.parameters(), lr=config.get("lr", 0.01))

    estimator = Estimator.from_torch(model=model_creator, loss=nn.BCELoss(),
                                     metrics=[Accuracy()],
                                     optimizer=optim_creator,
                                     config={"lr": 0.001})

    def get_dataloader(config, batch_size):
        inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
        targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
        data_loader = torch.utils.data.DataLoader(
            TensorDataset(inputs, targets), batch_size=batch_size,
            num_workers=config.get("threads", 1))
        return data_loader

    estimator.fit(data=get_dataloader, epochs=2, batch_size=2,
                  validation_data=get_dataloader,
                  checkpoint_trigger=EveryEpoch())
    estimator.evaluate(data=get_dataloader, batch_size=2)
    model = estimator.get_model()
    assert isinstance(model, nn.Module)
def test_xshards_spark_estimator_multi_inputs(self):
    resource_path = os.path.join(os.path.split(__file__)[0], "../../../resources")

    def transform(df):
        result = {
            "x": [
                np.expand_dims(df['user'].to_numpy(), axis=1),
                np.expand_dims(df['item'].to_numpy(), axis=1)
            ],
            "y": df['label'].to_numpy()
        }
        return result

    file_path = os.path.join(resource_path, "orca/learn/ncf2.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)

    zx1 = ZLayer.Input(shape=(1, ))
    zx2 = ZLayer.Input(shape=(1, ))
    zz = ZLayer.merge([zx1, zx2], mode="concat")
    zy = ZLayer.Dense(2)(zz)
    model = ZModel([zx1, zx2], zy)
    optim_method = SGD(learningrate=0.01)

    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_bigdl(model=model, optimizer=optim_method,
                                         loss=ClassNLLCriterion(),
                                         metrics=[Accuracy()],
                                         model_dir=temp_dir_name)
        estimator.set_constant_gradient_clipping(0.1, 1.2)
        r1 = estimator.predict(data=data_shard)
        r_c = r1.collect()
        estimator.set_tensorboard(log_dir=temp_dir_name, app_name="test")
        estimator.fit(data=data_shard, epochs=5, batch_size=8,
                      validation_data=data_shard,
                      checkpoint_trigger=EveryEpoch())
        summary = estimator.get_train_summary(tag="Loss")
        temp_path = os.path.join(temp_dir_name, "save_model")
        estimator.save(temp_path)
        eval_result = estimator.evaluate(data=data_shard, batch_size=8)
def test_bigdl_pytorch_estimator_dataframe_fit_evaluate(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(5, 5)

        def forward(self, x):
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    model = SimpleModel()

    def loss_func(input, target):
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    rdd = self.sc.range(0, 100)
    df = rdd.map(lambda x: ([float(x)] * 5,
                            [int(np.random.randint(0, 2, size=()))])).toDF(
        ["feature", "label"])

    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_torch(model=model, loss=loss_func,
                                         metrics=[Accuracy()],
                                         optimizer=SGD(learningrate_schedule=Default()),
                                         model_dir=temp_dir_name)
        estimator.fit(data=df, epochs=4, batch_size=2, validation_data=df,
                      checkpoint_trigger=EveryEpoch(),
                      feature_cols=["feature"], label_cols=["label"])
        eval_result = estimator.evaluate(df, batch_size=2,
                                         feature_cols=["feature"],
                                         label_cols=["label"])
        assert isinstance(eval_result, dict)
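# Usage note (a sketch, not part of the test above): evaluate returns a plain dict
# mapping metric names to values, so the result can be printed directly.
def print_eval_result(eval_result):
    for metric_name, metric_value in eval_result.items():
        print(metric_name, metric_value)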
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
net.train()

orca_estimator = Estimator.from_torch(model=net, optimizer=optimizer,
                                      loss=criterion, backend="bigdl")
orca_estimator.fit(data=trainloader, epochs=2, validation_data=testloader,
                   validation_metrics=[Accuracy()],
                   checkpoint_trigger=EveryEpoch())

print('Finished Training')

dataiter = iter(testloader)
images, labels = dataiter.next()

# print images
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))

res = orca_estimator.evaluate(data=testloader, validation_metrics=[Accuracy()])[0]
total_num = res.total_num
result = res.result
print("Accuracy of the network on the %s test images: %s" % (total_num, result))

stop_orca_context()
def test_xshards_spark_estimator(self):
    resource_path = os.path.join(os.path.split(__file__)[0], "../../../resources")

    def transform(df):
        result = {
            "x": [df['user'].to_numpy(), df['item'].to_numpy()],
            "y": df['label'].to_numpy()
        }
        return result

    file_path = os.path.join(resource_path, "orca/learn/ncf2.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)

    model = Sequential()
    model.add(Linear(2, 2))
    model.add(LogSoftMax())
    optim_method = SGD(learningrate=0.01)

    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_bigdl(model=model, optimizer=optim_method,
                                         loss=ClassNLLCriterion(),
                                         model_dir=temp_dir_name,
                                         feature_preprocessing=SeqToTensor([2]),
                                         label_preprocessing=SeqToTensor([1]))
        estimator.set_constant_gradient_clipping(0.1, 1.2)
        r1 = estimator.predict(data=data_shard)
        r_c = r1.collect()
        estimator.set_tensorboard(log_dir=temp_dir_name, app_name="test")
        estimator.fit(data=data_shard, epochs=5, batch_size=8,
                      validation_data=data_shard,
                      validation_metrics=[Accuracy()],
                      checkpoint_trigger=EveryEpoch())
        summary = estimator.get_train_summary(tag="Loss")
        temp_path = os.path.join(temp_dir_name, "save_model")
        estimator.save(temp_path)
        estimator.evaluate(data=data_shard, validation_metrics=[Accuracy()],
                           batch_size=8)
        result = estimator.predict(data=data_shard)
        assert type(result).__name__ == 'SparkXShards'
        result_c = result.collect()

        df = self.get_estimator_df2()
        r0 = estimator.predict(df)
        r0_c = r0.collect()
        assert type(r0).__name__ == 'DataFrame'
        for idx in range(len(r0_c)):
            assert abs(r0_c[idx]["prediction"][0]
                       - result_c[0]["prediction"][idx][0]) == 0
            assert abs(r0_c[idx]["prediction"][1]
                       - result_c[0]["prediction"][idx][1]) == 0

        estimator.fit(data=df, epochs=6, batch_size=8, validation_data=df,
                      validation_metrics=[Accuracy()],
                      validation_trigger=EveryEpoch())
        summary = estimator.get_train_summary()

        # test load from checkpoint
        est2 = Estimator.from_bigdl(model=Sequential(), optimizer=None, loss=None,
                                    model_dir=None)
        est2.load(temp_dir_name, loss=ClassNLLCriterion(), is_checkpoint=True)
        r2 = est2.predict(data=data_shard)
        r2_c = r2.collect()
        assert (result_c[0]["prediction"] == r2_c[0]["prediction"]).all()
        # resume training
        est2.fit(data=data_shard, epochs=10, batch_size=8,
                 validation_data=data_shard,
                 validation_metrics=[Accuracy()],
                 checkpoint_trigger=EveryEpoch())
        est2.evaluate(data=data_shard, validation_metrics=[Accuracy()], batch_size=8)

        # test load from saved model
        est3 = Estimator.from_bigdl(model=Sequential(), optimizer=None, loss=None,
                                    model_dir=None)
        est3.load(temp_path, optimizer=optim_method, loss=ClassNLLCriterion())
        r3 = est3.predict(data=data_shard)
        r3_c = r3.collect()
        assert (r3_c[0]["prediction"] == r2_c[0]["prediction"]).all()
def main():
    parser = argparse.ArgumentParser(description='PyTorch Tensorboard Example')
    parser.add_argument('--cluster_mode', type=str, default="local",
                        help='The cluster mode, such as local, yarn or k8s.')
    parser.add_argument('--backend', type=str, default="bigdl",
                        help='The backend of PyTorch Estimator; '
                             'bigdl and torch_distributed are supported.')
    args = parser.parse_args()

    if args.cluster_mode == "local":
        init_orca_context()
    elif args.cluster_mode == "yarn":
        init_orca_context(cluster_mode=args.cluster_mode, cores=4, num_nodes=2)

    tensorboard_dir = "runs"
    writer = SummaryWriter(tensorboard_dir + '/fashion_mnist_experiment_1')

    # constant for classes
    classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle Boot')

    # plot some random training images
    dataiter = iter(train_data_creator(config={}, batch_size=4))
    images, labels = dataiter.next()

    # create grid of images
    img_grid = torchvision.utils.make_grid(images)
    # show images
    matplotlib_imshow(img_grid, one_channel=True)
    # write to tensorboard
    writer.add_image('four_fashion_mnist_images', img_grid)
    # inspect the model using tensorboard
    writer.add_graph(model_creator(config={}), images)
    writer.close()

    # training loss vs. epochs
    criterion = nn.CrossEntropyLoss()
    batch_size = 4
    epochs = 5
    if args.backend == "bigdl":
        train_loader = train_data_creator(config={}, batch_size=batch_size)
        test_loader = validation_data_creator(config={}, batch_size=batch_size)

        net = model_creator(config={})
        optimizer = optimizer_creator(model=net, config={"lr": 0.001})
        orca_estimator = Estimator.from_torch(model=net,
                                              optimizer=optimizer,
                                              loss=criterion,
                                              metrics=[Accuracy()],
                                              backend="bigdl")
        orca_estimator.set_tensorboard(tensorboard_dir, "bigdl")
        orca_estimator.fit(data=train_loader, epochs=epochs,
                           validation_data=test_loader,
                           checkpoint_trigger=EveryEpoch())
        res = orca_estimator.evaluate(data=test_loader)
        print("Accuracy of the network on the test images: %s" % res)
    elif args.backend == "torch_distributed":
        orca_estimator = Estimator.from_torch(model=model_creator,
                                              optimizer=optimizer_creator,
                                              loss=criterion,
                                              metrics=[Accuracy()],
                                              backend="torch_distributed")
        stats = orca_estimator.fit(train_data_creator, epochs=epochs,
                                   batch_size=batch_size)
        for stat in stats:
            writer.add_scalar("training_loss", stat['train_loss'], stat['epoch'])
        print("Train stats: {}".format(stats))
        val_stats = orca_estimator.evaluate(validation_data_creator,
                                            batch_size=batch_size)
        print("Validation stats: {}".format(val_stats))
        orca_estimator.shutdown()
    else:
        raise NotImplementedError("Only bigdl and torch_distributed are supported "
                                  "as the backend, but got {}".format(args.backend))

    stop_orca_context()
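# Follow-up sketch (assumptions: the run used the bigdl backend, and the
# bigdl-backend PyTorch estimator exposes the same get_train_summary API as the
# from_bigdl estimators in the tests above): pull the loss records written via
# set_tensorboard back into Python, or simply point TensorBoard at the log
# directory with `tensorboard --logdir runs`.
def print_loss_records(estimator):
    for record in estimator.get_train_summary("Loss"):
        print(record)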
def test_bigdl_pytorch_estimator_save_and_load(self):
    class Network(nn.Module):
        def __init__(self):
            super(Network, self).__init__()
            self.fc1 = nn.Linear(28 * 28, 500)
            self.fc2 = nn.Linear(500, 10)

        def forward(self, x):
            x = x.view(-1, 28 * 28)
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return F.log_softmax(x, dim=1)

    model = Network()
    model.train()
    criterion = nn.NLLLoss()
    adam = torch.optim.Adam(model.parameters(), 0.001)

    dir = "./dataset"
    batch_size = 320
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(dir, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=False)

    # epoch 1
    est = Estimator.from_torch(model=model, optimizer=adam, loss=criterion,
                               metrics=[Accuracy()])
    est.fit(data=train_loader, epochs=1, validation_data=test_loader,
            batch_size=batch_size, checkpoint_trigger=EveryEpoch())
    paras1 = list(est.get_model().named_parameters())
    est.save("model_epoch_1")

    # epoch 2
    est.fit(data=train_loader, epochs=2, validation_data=test_loader,
            batch_size=batch_size, checkpoint_trigger=EveryEpoch())
    paras2 = list(est.get_model().named_parameters())

    est.load("model_epoch_1")
    paras3 = list(est.get_model().named_parameters())

    # The loaded parameters should differ from the epoch-2 parameters...
    load_success = 0
    for i in range(len(paras2)):
        name2, para2 = paras2[i]
        name3, para3 = paras3[i]
        if not torch.all(torch.eq(para2, para3)):
            load_success = 1
            break
    if not load_success:
        raise Exception("Load failed. Parameters did not change after loading.")

    # ...and should match the epoch-1 parameters that were saved.
    for i in range(len(paras1)):
        name1, para1 = paras1[i]
        name3, para3 = paras3[i]
        if not torch.all(torch.eq(para1, para3)):
            raise Exception("After reloading the model, " + name1 +
                            " does not match.")
    print("pass")
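# A minimal sketch (not part of the test above) of restoring the "model_epoch_1"
# snapshot into a brand-new estimator rather than the original one, mirroring the
# est.save/est.load calls above. It reuses the Network class and criterion defined
# in the test; the variable names below are illustrative only.
model2 = Network()
adam2 = torch.optim.Adam(model2.parameters(), 0.001)
est_restored = Estimator.from_torch(model=model2, optimizer=adam2,
                                    loss=criterion, metrics=[Accuracy()])
est_restored.load("model_epoch_1")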
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
net.train()

orca_estimator = Estimator.from_torch(model=net, optimizer=optimizer,
                                      loss=criterion, backend="bigdl")
orca_estimator.fit(data=trainloader, epochs=2, validation_data=testloader,
                   validation_methods=[Accuracy()],
                   checkpoint_trigger=EveryEpoch())

print('Finished Training')

dataiter = iter(testloader)
images, labels = dataiter.next()

# print images
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))

res = orca_estimator.evaluate(data=testloader, validation_methods=[Accuracy()])[0]
total_num = res.total_num
result = res.result
print("Accuracy of the network on the %s test images: %s" % (total_num, result))

stop_orca_context()