def test_bigdl_pytorch_estimator_dataloader_creator(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.bn1 = torch.nn.BatchNorm1d(4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = self.bn1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    model = SimpleModel()
    estimator = Estimator.from_torch(model=model, loss=nn.BCELoss(),
                                     optimizer=Adam(), backend="bigdl")

    def get_dataloader():
        inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
        targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
        return torch.utils.data.DataLoader(TensorDataset(inputs, targets), batch_size=2)

    estimator.fit(data=get_dataloader, epochs=2, validation_data=get_dataloader,
                  validation_methods=[Accuracy()], checkpoint_trigger=EveryEpoch())
    estimator.evaluate(data=get_dataloader, validation_methods=[Accuracy()])
    model = estimator.get_model()
    assert isinstance(model, nn.Module)
def test_tf_optimizer_metrics(self):
    features = np.random.randn(20, 10)
    labels = np.random.randint(0, 10, size=[20])
    with tf.Graph().as_default():
        dataset = TFDataset.from_ndarrays((features, labels), batch_size=4,
                                          val_tensors=(features, labels))
        feature_tensor, label_tensor = dataset.tensors
        features = tf.layers.dense(feature_tensor, 8)
        output = tf.layers.dense(features, 10)
        loss = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(logits=output, labels=label_tensor))

        optimizer = TFOptimizer.from_loss(loss,
                                          {"dense/": Adam(1e-3), "dense_1/": SGD(0.0)},
                                          val_outputs=[output],
                                          val_labels=[label_tensor],
                                          val_method=Accuracy(),
                                          metrics={"loss": loss})
        initial_weights = optimizer.tf_model.training_helper_layer.get_weights()
        optimizer.optimize(end_trigger=MaxEpoch(1))
        updated_weights = optimizer.tf_model.training_helper_layer.get_weights()
        for i in [0, 1]:  # the weights and bias of the "dense/" layer should be updated
            assert not np.allclose(initial_weights[i], updated_weights[i])
        for i in [2, 3]:  # the weights and bias of the "dense_1/" layer should be unchanged
            assert np.allclose(initial_weights[i], updated_weights[i])
        optimizer.sess.close()
def test_train_model_with_bn(self):
    class SimpleTorchModel(nn.Module):
        def __init__(self):
            super(SimpleTorchModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.bn1 = torch.nn.BatchNorm1d(4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = self.bn1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    torch_model = SimpleTorchModel()
    loss_fn = torch.nn.BCELoss()
    az_model = TorchModel.from_pytorch(torch_model)
    zoo_loss = TorchLoss.from_pytorch(loss_fn)

    inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
    targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
    train_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
    train_featureset = FeatureSet.pytorch_dataloader(train_loader)
    val_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
    val_featureset = FeatureSet.pytorch_dataloader(val_loader)

    zooOptimizer = Adam()
    estimator = Estimator(az_model, optim_methods=zooOptimizer)
    estimator.train_minibatch(train_featureset, zoo_loss, end_trigger=MaxEpoch(4),
                              checkpoint_trigger=EveryEpoch(),
                              validation_set=val_featureset,
                              validation_method=[Accuracy()])

    trained_model = az_model.to_pytorch()
def test_control_inputs(self):
    features = np.random.randn(20, 10)
    labels = np.random.randint(0, 10, size=[20])
    with tf.Graph().as_default():
        dataset = TFDataset.from_ndarrays((features, labels), batch_size=4,
                                          val_tensors=(features, labels))
        is_training = tf.placeholder(dtype=tf.bool, shape=())
        feature_tensor, label_tensor = dataset.tensors
        features = tf.layers.dense(feature_tensor, 8)
        features = tf.layers.dropout(features, training=is_training)
        output = tf.layers.dense(features, 10)
        loss = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(logits=output, labels=label_tensor))
        optimizer = TFOptimizer.from_loss(loss, Adam(),
                                          val_outputs=[output],
                                          val_labels=[label_tensor],
                                          val_method=Accuracy(),
                                          tensor_with_value={is_training: (True, False)},
                                          metrics={"loss": loss})
        optimizer.optimize(end_trigger=MaxEpoch(1))
        optimizer.sess.close()
def test_bigdl_pytorch_estimator_shard(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(2, 2)

        def forward(self, x):
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    model = SimpleModel()

    def loss_func(input, target):
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    def transform(df):
        result = {
            "x": [df['user'].to_numpy(), df['item'].to_numpy()],
            "y": df['label'].to_numpy()
        }
        return result

    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)

    estimator = Estimator.from_torch(model=model, loss=loss_func,
                                     optimizer=SGD(), backend="bigdl")
    estimator.fit(data=data_shard, epochs=4, batch_size=2,
                  validation_data=data_shard,
                  validation_methods=[Accuracy()],
                  checkpoint_trigger=EveryEpoch())
    estimator.evaluate(data_shard, validation_methods=[Accuracy()], batch_size=2)
def test_checkpoint(self):
    features = np.random.randn(20, 10)
    labels = np.random.randint(0, 10, size=[20])
    with tf.Graph().as_default():
        dataset = TFDataset.from_ndarrays((features, labels), batch_size=4,
                                          val_tensors=(features, labels))
        feature_tensor, label_tensor = dataset.tensors
        features = tf.layers.dense(feature_tensor, 8)
        output = tf.layers.dense(features, 10)
        loss = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(logits=output, labels=label_tensor))

        model_dir = tempfile.mkdtemp()
        try:
            optimizer = TFOptimizer.from_loss(loss, Adam(),
                                              val_outputs=[output],
                                              val_labels=[label_tensor],
                                              val_method=Accuracy(),
                                              metrics={"loss": loss},
                                              model_dir=model_dir)
            optimizer.optimize(end_trigger=MaxEpoch(1))

            import re
            ckpt_path = None
            versions = []
            for (root, dirs, files) in os.walk(model_dir, topdown=True):
                temp_versions = []
                for file_name in files:
                    if re.match(r"^optimMethod-TFParkTraining\.[0-9]+$", file_name) is not None:
                        version = int(file_name.split(".")[1])
                        temp_versions.append(version)
                if temp_versions:
                    ckpt_path = root
                    versions = temp_versions
                    break
            assert ckpt_path is not None, "Cannot find checkpoint file"
            optimizer.load_checkpoint(ckpt_path, max(versions))
            optimizer.optimize(end_trigger=MaxEpoch(1))
            optimizer.sess.close()
        finally:
            import shutil
            shutil.rmtree(model_dir)
def test_tf_optimizer_metrics(self):
    features = np.random.randn(20, 10)
    labels = np.random.randint(0, 10, size=[20])
    with tf.Graph().as_default():
        dataset = TFDataset.from_ndarrays((features, labels), batch_size=4,
                                          val_tensors=(features, labels))
        feature_tensor, label_tensor = dataset.tensors
        output = tf.layers.dense(feature_tensor, 10)
        loss = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(logits=output, labels=label_tensor))
        optimizer = TFOptimizer.from_loss(loss, Adam(1e-3),
                                          val_outputs=[output],
                                          val_labels=[label_tensor],
                                          val_method=Accuracy(),
                                          metrics={"loss": loss})
        optimizer.optimize(end_trigger=MaxEpoch(1))
        optimizer.sess.close()
.withColumn("label", getLabel(col('name'))).cache() (trainingDF, validationDF) = labelDF.randomSplit([0.9, 0.1]) # run training and evaluation featureTransformer = ChainedPreprocessing( [RowToImageFeature(), ImageCenterCrop(224, 224), ImageChannelNormalize(123.0, 117.0, 104.0, 255.0, 255.0, 255.0), ImageMatToTensor(), ImageFeatureToTensor()]) classifier = NNClassifier(torchnet, torchcriterion, featureTransformer) \ .setLearningRate(0.001) \ .setBatchSize(8) \ .setMaxEpoch(2) \ .setFeaturesCol("image") \ .setCachingSample(False) \ .setValidation(EveryEpoch(), validationDF, [Accuracy()], 8) catdogModel = classifier.fit(trainingDF) shift = udf(lambda p: p - 1, DoubleType()) predictionDF = catdogModel.transform(validationDF) \ .withColumn("prediction", shift(col('prediction'))).cache() predictionDF.sample(False, 0.1).show() correct = predictionDF.filter("label=prediction").count() overall = predictionDF.count() accuracy = correct * 1.0 / overall # expecting: accuracy > 96% print("Validation accuracy = %g " % accuracy)
model = MLP(x_train.shape[1], 512, classes, dropout=0.4)
loss_fn = torch.nn.BCELoss()
zooOptimizer = Adam(learningrate=learning_rate)
zooModel = TorchModel.from_pytorch(model)
zooLoss = TorchLoss.from_pytorch(loss_fn)

train_featureSet = get_featureset(x_train, y_train, shuffle=True)
val_featureSet = get_featureset(x_valid, y_valid, shuffle=False)

estimator = Estimator(zooModel, optim_methods=zooOptimizer)
estimator.train(train_featureSet, zooLoss, end_trigger=MaxEpoch(epochs),
                checkpoint_trigger=EveryEpoch(),
                validation_set=val_featureSet,
                validation_method=[Accuracy()],
                batch_size=batch_size)

# Predict
def get_rdd(x, y, shuffle=False):
    x = np.split(x.data.numpy(), x.shape[0])
    y = np.split(y.data.numpy(), y.shape[0])
    samples = [
        Sample.from_ndarray(np.squeeze(x[i]), np.squeeze(y[i]))
        for i in range(len(x))
    ]
    sample_rdd = sc.parallelize(samples)
    return sample_rdd
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir', default='/tmp/data', metavar='N',
                        help='the folder to store mnist data')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training per executor (default: 256)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing per executor (default: 1000)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=False)

    # init on yarn when HADOOP_CONF_DIR and ZOO_CONDA_NAME are provided.
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=1, conf={"spark.driver.memory": "20g"})
    else:
        num_executors = 2
        num_cores_per_executor = 4
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        zoo_conda_name = os.environ.get('ZOO_CONDA_NAME')  # The name of the created conda-env
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                                conda_name=zoo_conda_name,
                                num_executors=num_executors,
                                executor_cores=num_cores_per_executor,
                                executor_memory="2g",
                                driver_memory="10g",
                                driver_cores=1,
                                conf={
                                    "spark.rpc.message.maxSize": "1024",
                                    "spark.task.maxFailures": "1",
                                    "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"
                                })

    model = LeNet()
    model.train()
    criterion = nn.NLLLoss()
    adam = Adam(args.lr)
    zoo_estimator = Estimator.from_torch(model=model, optimizer=adam,
                                         loss=criterion, backend="bigdl")
    from bigdl.optim.optimizer import EveryEpoch
    zoo_estimator.fit(data=train_loader, epochs=args.epochs,
                      validation_data=test_loader,
                      validation_methods=[Accuracy()],
                      checkpoint_trigger=EveryEpoch())
    zoo_estimator.evaluate(data=test_loader, validation_methods=[Accuracy()])
model = TextClassifier(CLASS_NUM, token_length, sequence_len,
                       options.encoder, int(options.encoder_output_dim))

optimizer = Optimizer(model=model,
                      training_rdd=train_rdd,
                      criterion=SparseCategoricalCrossEntropy(),
                      end_trigger=MaxEpoch(int(options.nb_epoch)),
                      batch_size=batch_size,
                      optim_method=Adagrad(learningrate=float(options.learning_rate),
                                           learningrate_decay=0.001))
optimizer.set_validation(batch_size=batch_size,
                         val_rdd=val_rdd,
                         trigger=EveryEpoch(),
                         val_method=[Accuracy()])

log_dir = options.log_dir
app_name = 'adam-' + dt.datetime.now().strftime("%Y%m%d-%H%M%S")
train_summary = TrainSummary(log_dir=log_dir, app_name=app_name)
train_summary.set_summary_trigger("Parameters", SeveralIteration(50))
val_summary = ValidationSummary(log_dir=log_dir, app_name=app_name)
optimizer.set_train_summary(train_summary)
optimizer.set_val_summary(val_summary)

optimizer.optimize()

# Predict for probability distributions
results = model.predict(val_rdd)
results.take(5)

# Predict for labels
def main():
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('data', metavar='DIR', help='path to dataset')
    parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                        choices=model_names,
                        help='model architecture: ' + ' | '.join(model_names) +
                             ' (default: resnet18)')
    parser.add_argument('--epochs', default=90, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                        help='manual epoch number (useful on restarts)')
    parser.add_argument('-b', '--batch-size', default=256, type=int, metavar='N',
                        help='mini-batch size (default: 256), this is the total '
                             'batch size of all GPUs on the current node when '
                             'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                        metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                        help='momentum')
    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                        metavar='W', help='weight decay (default: 1e-4)',
                        dest='weight_decay')
    parser.add_argument('-p', '--print-freq', default=10, type=int, metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                        help='evaluate model on validation set')
    parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                        help='use pre-trained model')
    parser.add_argument('--world-size', default=-1, type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--rank', default=-1, type=int,
                        help='node rank for distributed training')
    parser.add_argument('--seed', default=None, type=int,
                        help='seed for initializing training.')
    parser.add_argument('--cores', default=4, type=int,
                        help='num of CPUs to use.')
    parser.add_argument('--nodes', default=1, type=int,
                        help='num of nodes to use.')
    parser.add_argument('--executor_memory', default='20g', type=str,
                        help='size of executor memory.')
    parser.add_argument('--driver_memory', default='20g', type=str,
                        help='size of driver memory.')
    parser.add_argument('--driver_cores', default=1, type=int,
                        help='num of driver cores to use.')
    args = parser.parse_args()

    # sc = init_nncontext()
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=args.cores,
                                 conf={"spark.driver.memory": "20g"})
    else:
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        num_executors = args.nodes
        executor_memory = args.executor_memory
        driver_memory = args.driver_memory
        driver_cores = args.driver_cores
        num_cores_per_executor = args.cores
        os.environ['ZOO_MKL_NUMTHREADS'] = str(num_cores_per_executor)
        os.environ['OMP_NUM_THREADS'] = str(num_cores_per_executor)
        sc = init_spark_on_yarn(
            hadoop_conf=hadoop_conf_dir,
            conda_name=detect_python_location().split("/")[-3],  # The name of the created conda-env
            num_executors=num_executors,
            executor_cores=num_cores_per_executor,
            executor_memory=executor_memory,
            driver_memory=driver_memory,
            driver_cores=driver_cores,
            conf={
                "spark.rpc.message.maxSize": "1024",
                "spark.task.maxFailures": "1",
                "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"
            })

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True)

    model = torchvision.models.resnet50()

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.batch_size, shuffle=False)

    iterationPerEpoch = int(math.ceil(float(1281167) / args.batch_size))
    step = Step(iterationPerEpoch * 30, 0.1)
    zooOptimizer = SGD(args.lr, momentum=args.momentum, dampening=0.0,
                       leaningrate_schedule=step, weightdecay=args.weight_decay)
    zooModel = TorchModel.from_pytorch(model)
    criterion = torch.nn.CrossEntropyLoss()
    zooCriterion = TorchLoss.from_pytorch(criterion)
    estimator = Estimator(zooModel, optim_methods=zooOptimizer)
    train_featureSet = FeatureSet.pytorch_dataloader(train_loader)
    test_featureSet = FeatureSet.pytorch_dataloader(val_loader)
    estimator.train_minibatch(train_featureSet, zooCriterion,
                              end_trigger=MaxEpoch(90),
                              checkpoint_trigger=EveryEpoch(),
                              validation_set=test_featureSet,
                              validation_method=[Accuracy(), Top5Accuracy()])
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    num_executors = 4
    num_cores_per_executor = 1
    hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
    sc = init_spark_on_yarn(
        hadoop_conf=hadoop_conf_dir,
        conda_name=os.environ["ZOO_CONDA_NAME"],  # The name of the created conda-env
        num_executor=num_executors,
        executor_cores=num_cores_per_executor,
        executor_memory="10g",
        driver_memory="10g",
        driver_cores=1,
        spark_conf={"spark.rpc.message.maxSize": "1024"})

    model.train()
    sgd = Adam()
    zooModel = TorchNet.from_pytorch(model, [64, 1, 28, 28])

    def lossFunc(input, target):
        return nn.NLLLoss().forward(input, target.flatten().long())

    zooCriterion = TorchCriterion.from_pytorch(lossFunc, [1, 2], torch.LongTensor([1]))
    # zooCriterion = SparseCategoricalCrossEntropy(zero_based_label=True)
    estimator = Estimator(zooModel, optim_methods=sgd)

    v_input = []
    v_target = []
    for data, target in test_loader:
        v_input.append([data.numpy()])
        v_target.append([target.numpy()])
    test_featureset = FeatureSet.minibatch(v_input, v_target)

    for epoch in range(1, args.epochs + 1):
        train(args, estimator, zooCriterion, train_loader, epoch)
        # test(args, estimator, zooCriterion, test_featureset)
        estimator.evaluate_minibatch(test_featureset, [Loss(zooCriterion), Accuracy()])

    if (args.save_model):
        torch.save(model.state_dict(), "mnist_cnn.pt")
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Example')
    parser.add_argument('--dir', default='/tmp/data', metavar='N',
                        help='the folder to store cifar10 data')
    parser.add_argument('--batch-size', type=int, default=128, metavar='N',
                        help='input batch size for training per executor (default: 128)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing per executor (default: 1000)')
    parser.add_argument('--epochs', type=int, default=135, metavar='N',
                        help='number of epochs to train (default: 135)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--lrd', type=float, default=0.0, metavar='LRD',
                        help='learning rate decay (default: 0.0)')
    parser.add_argument('--wd', type=float, default=5e-4, metavar='WD',
                        help='weight decay (default: 5e-4)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='momentum',
                        help='momentum (default: 0.9)')
    parser.add_argument('--dampening', type=float, default=0.0, metavar='dampening',
                        help='dampening (default: 0.0)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    # Prepare and preprocess the data
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),  # zero-pad 4 pixels on each side, then randomly crop back to 32x32
        transforms.RandomHorizontalFlip(),  # flip the image horizontally with probability 0.5
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),  # per-channel (R, G, B) mean and std for normalization
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    train_set = datasets.CIFAR10(args.dir, train=True, download=True,
                                 transform=transform_train)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True, num_workers=2)
    test_set = datasets.CIFAR10(args.dir, train=False, transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=args.test_batch_size,
                                              shuffle=False, num_workers=2)

    # init on yarn when HADOOP_CONF_DIR and ZOO_CONDA_NAME are provided.
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=1, conf={"spark.driver.memory": "20g"})
    else:
        num_executors = 2
        num_cores_per_executor = 4
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        zoo_conda_name = os.environ.get('ZOO_CONDA_NAME')  # The name of the created conda-env
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                                conda_name=zoo_conda_name,
                                num_executor=num_executors,
                                executor_cores=num_cores_per_executor,
                                executor_memory="2g",
                                driver_memory="10g",
                                driver_cores=1,
                                spark_conf={
                                    "spark.rpc.message.maxSize": "1024",
                                    "spark.task.maxFailures": "1",
                                    "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"
                                })

    model = ResNet18()
    model.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = SGD(args.lr, args.lrd, args.wd, args.momentum, args.dampening)
    zoo_model = TorchModel.from_pytorch(model)
    zoo_criterion = TorchLoss.from_pytorch(criterion)
    zoo_estimator = Estimator(zoo_model, optim_methods=optimizer)
    train_featureset = FeatureSet.pytorch_dataloader(train_loader)
    test_featureset = FeatureSet.pytorch_dataloader(test_loader)

    from bigdl.optim.optimizer import MaxEpoch, EveryEpoch
    zoo_estimator.train_minibatch(train_featureset, zoo_criterion,
                                  end_trigger=MaxEpoch(args.epochs),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=test_featureset,
                                  validation_method=[Accuracy()])