    def test_bigdl_pytorch_estimator_dataloader_creator(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.bn1 = torch.nn.BatchNorm1d(4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = self.bn1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        model = SimpleModel()

        estimator = Estimator.from_torch(model=model, loss=nn.BCELoss(),
                                         optimizer=Adam(), backend="bigdl")

        def get_dataloader():
            inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
            targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
            return torch.utils.data.DataLoader(TensorDataset(inputs, targets), batch_size=2)

        estimator.fit(data=get_dataloader, epochs=2, validation_data=get_dataloader,
                      validation_methods=[Accuracy()], checkpoint_trigger=EveryEpoch())
        model = estimator.get_model()
        assert isinstance(model, nn.Module)
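A note on the pattern above: fit accepts a zero-argument callable rather than a DataLoader, so each worker can construct its own loader. A minimal sketch of how such a creator is consumed (plain PyTorch, illustrative only; no estimator involved):

    loader = get_dataloader()  # the framework invokes the creator on each worker
    for batch_x, batch_y in loader:
        print(batch_x.shape, batch_y.shape)  # torch.Size([2, 2]) torch.Size([2, 1])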
Example #2
    def optimize(self, end_trigger=None, checkpoint_trigger=None):
        """
        Run the training loop of this optimizer.
        :param end_trigger: BigDL's Trigger to indicate when to stop the training.
        :param checkpoint_trigger: when to save a checkpoint and evaluate the model.
        """
        if end_trigger is None:
            end_trigger = MaxEpoch(1)

        if checkpoint_trigger is None:
            checkpoint_trigger = EveryEpoch()

        if self.tf_model.val_methods is not None and self.val_rdd is not None:
            self.estimator.train(train_set=self.training_rdd,
                                 criterion=IdentityCriterion(),
                                 end_trigger=end_trigger,
                                 checkpoint_trigger=checkpoint_trigger,
                                 validation_set=self.val_rdd,
                                 validation_method=self.tf_model.val_methods,
                                 batch_size=self.batch_size)
        else:
            self.estimator.train(train_set=self.training_rdd,
                                 criterion=IdentityCriterion(),
                                 end_trigger=end_trigger,
                                 batch_size=self.batch_size)

        self.tf_model.training_helper_layer.get_weights_to_python()
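A hypothetical usage sketch for the method above, assuming a constructed optimizer instance named optimizer; both triggers fall back to sensible defaults when omitted:

    optimizer.optimize()  # defaults to end_trigger=MaxEpoch(1), checkpoint_trigger=EveryEpoch()
    optimizer.optimize(end_trigger=MaxEpoch(5), checkpoint_trigger=EveryEpoch())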
Example #3
    def test_train_model_with_bn(self):
        class SimpleTorchModel(nn.Module):
            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.bn1 = torch.nn.BatchNorm1d(4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = self.bn1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        torch_model = SimpleTorchModel()
        loss_fn = torch.nn.BCELoss()
        az_model = TorchModel.from_pytorch(torch_model)
        zoo_loss = TorchLoss.from_pytorch(loss_fn)
        inputs = torch.Tensor([[1, 2], [1, 3], [3, 2],
                               [5, 6], [8, 9], [1, 9]])
        targets = torch.Tensor([[0], [0], [0],
                                [1], [1], [1]])
        train_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
        train_featureset = FeatureSet.pytorch_dataloader(train_loader)
        val_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
        val_featureset = FeatureSet.pytorch_dataloader(val_loader)

        zooOptimizer = Adam()
        estimator = Estimator(az_model, optim_methods=zooOptimizer)
        estimator.train_minibatch(train_featureset, zoo_loss, end_trigger=MaxEpoch(4),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=val_featureset,
                                  validation_method=[Accuracy()])

        trained_model = az_model.to_pytorch()
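Since to_pytorch returns a plain nn.Module, standard PyTorch inference applies to the result; a minimal follow-up sketch reusing the inputs tensor defined above:

    trained_model.eval()  # switch BatchNorm to inference statistics
    with torch.no_grad():
        predictions = trained_model(inputs)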
Example #4
    def optimize(self, end_trigger=None, checkpoint_trigger=None):
        """
        Run the training loop of this optimizer.
        :param end_trigger: BigDL's Trigger to indicate when to stop the training.
        :param checkpoint_trigger: when to save a checkpoint and evaluate the model.
        """
        if end_trigger is None:
            end_trigger = MaxEpoch(1)

        if checkpoint_trigger is None:
            checkpoint_trigger = EveryEpoch()

        if isinstance(self.train_data, FeatureSet):
            if self.train_data.value.getNumOfSlice() != 1:
                if isinstance(checkpoint_trigger, EveryEpoch):
                    checkpoint_trigger = ZEveryEpoch()
                elif not isinstance(checkpoint_trigger, ZooTrigger):
                    raise Exception("Please use a trigger defined in zoo.util.triggers")

        if self.tf_model.val_methods and self.val_data is not None:
            self.estimator.train_minibatch(train_set=self.train_data,
                                           criterion=self.tf_model.criterion,
                                           end_trigger=end_trigger,
                                           checkpoint_trigger=checkpoint_trigger,
                                           validation_set=self.val_data,
                                           validation_method=self.tf_model.val_methods)
        else:
            self.estimator.train_minibatch(train_set=self.train_data,
                                           criterion=self.tf_model.criterion,
                                           end_trigger=end_trigger,
                                           checkpoint_trigger=checkpoint_trigger)

        self.tf_model.training_helper_layer.get_weights_to_python()
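A hedged sketch of the multi-slice branch above: when the training FeatureSet holds more than one slice, a plain EveryEpoch checkpoint trigger is silently upgraded to ZEveryEpoch, and any other non-Zoo trigger is rejected (assuming ZEveryEpoch is importable from zoo.util.triggers, as the error message suggests):

    from zoo.util.triggers import ZEveryEpoch
    optimizer.optimize(end_trigger=MaxEpoch(2), checkpoint_trigger=ZEveryEpoch())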
Example #5
def convert_trigger(trigger):
    if trigger is None:
        return None
    if isinstance(trigger, str):
        if trigger.lower() == "everyepoch":
            return EveryEpoch().get_trigger()
        else:
            raise ValueError(
                "Only 'EveryEpoch', orca triggers and bigdl triggers are "
                "supported now")
    elif isinstance(trigger, Trigger):
        return trigger.get_trigger()
    else:
        return trigger
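A quick usage sketch for convert_trigger, assuming EveryEpoch here is the orca Trigger wrapper exposing get_trigger:

    assert convert_trigger(None) is None
    jvm_trigger = convert_trigger("EveryEpoch")  # case-insensitive string form
    jvm_trigger = convert_trigger(EveryEpoch())  # Trigger instances are unwrapped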
Example #6
    def test_bigdl_pytorch_estimator_shard(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(2, 2)

            def forward(self, x):
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        model = SimpleModel()

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input,
                                                 target.flatten().long())

        def transform(df):
            result = {
                "x": [df['user'].to_numpy(), df['item'].to_numpy()],
                "y": df['label'].to_numpy()
            }
            return result

        OrcaContext.pandas_read_backend = "pandas"
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        data_shard = read_csv(file_path)
        data_shard = data_shard.transform_shard(transform)

        estimator = Estimator.from_torch(model=model,
                                         loss=loss_func,
                                         optimizer=SGD(),
                                         backend="bigdl")
        estimator.fit(data=data_shard,
                      epochs=4,
                      batch_size=2,
                      validation_data=data_shard,
                      validation_methods=[Accuracy()],
                      checkpoint_trigger=EveryEpoch())
        estimator.evaluate(data_shard,
                           validation_methods=[Accuracy()],
                           batch_size=2)
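For reference, the transform above maps each shard's pandas DataFrame to a feature/label dict; an illustrative standalone run on hypothetical data:

    import pandas as pd
    df = pd.DataFrame({"user": [1, 2], "item": [3, 4], "label": [0, 1]})
    result = transform(df)
    # result["x"] is [user_array, item_array]; result["y"] is the label array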
Example #7
    def __init__(self, loss, optim_method, sess=None, dataset=None, inputs=None,
                 grads=None, variables=None, graph=None,
                 val_outputs=None, val_labels=None, val_method=None, val_split=0.0,
                 tensors_with_value=None, session_config=None):
        '''
        TFOptimizer is used for distributed training of TensorFlow
        on Spark/BigDL.

        :param loss: The loss tensor of the TensorFlow model, should be a scalar
        :param optim_method: the optimization method to be used, such as bigdl.optim.optimizer.Adam
        :param sess: the current TensorFlow Session. If you want to use a pre-trained model,
        use the Session to load the pre-trained variables and pass it to TFOptimizer.
        '''

        import tensorflow as tf
        from tensorflow.python.util import nest
        from zoo.util.tf import export_tf

        if dataset is None:
            args = TFOptimizer._get_arguments_from_loss(loss, optim_method, sess,
                                                        val_outputs, val_labels, val_method)
            loss, optim_method, sess, dataset, inputs = args[:5]
            grads, variables, graph, val_outputs, val_labels, val_method = args[5:]

        additional_inputs = []
        additional_values = []
        all_required_inputs = _find_placeholders([loss])
        all_required_inputs_names = [v.name for v in all_required_inputs]
        if tensors_with_value:
            for t, v in tensors_with_value.items():
                if t.name in all_required_inputs_names:
                    additional_inputs.append(t)
                    additional_values.append(v)

        if not isinstance(inputs, list):
            inputs = nest.flatten(inputs)

        self.optim_method = optim_method
        self.sess = sess
        self.dataset = dataset
        self.inputs = inputs + additional_inputs
        self.graph = graph
        self.session_config = session_config

        from zoo.util.tf import process_grad
        grads = [process_grad(grad) for grad in grads]

        if self.dataset.batch_size <= 0:
            raise ValueError("You should set batch_size instead of batch_per_thread for training")

        if val_outputs is not None and val_labels is not None:
            with self.graph.as_default():
                val_labels = [tf.identity(v) for v in val_labels]
            outputs = val_outputs + val_labels + [loss]
        else:
            outputs = [loss]

        self.grads = grads
        self.outputs = outputs

        self.export_dir = tempfile.mkdtemp()
        export_tf(self.sess, self.export_dir,
                  inputs=self.inputs,
                  outputs=self.grads + self.outputs)

        variable_names = [v.name for v in variables]
        grad_names = [g.name for g in grads]
        output_names = [o.name for o in outputs]

        def to_floats(vs):
            return [float(v) for v in vs]

        meta = {
            "input_names": [i.name for i in self.inputs],
            "output_names": output_names,
            "variables": variable_names,
            "grad_variables": grad_names,
            "default_tensor_values": [to_floats(v) for v in additional_values]
        }

        with open(os.path.join(self.export_dir, "training_meta.json"), "w") as f:
            f.write(json.dumps(meta))

        self.variable_placeholders = []
        with self.graph.as_default():
            assigns = []
            for v in variables:
                p = tf.placeholder(dtype=tf.float32, shape=v.shape)
                a = tf.assign(v, p)
                self.variable_placeholders.append(p)
                assigns.append(a)
            assign = tf.group(*assigns)
        self.assign = assign
        try:
            self.training_helper_layer = TFTrainingHelper(self.export_dir, session_config)
        except Py4JJavaError as e:
            if "expects to be colocated with unknown node" in str(e):
                raise Exception("""
If you are using the embedding layer in tf.keras, then this is a
known issue of TensorFlow, see https://github.com/tensorflow/tensorflow/issues/21889.
Please add zoo.util.tf.variable_creator_scope before model construction.
For example:
from zoo.util.tf import variable_creator_scope
with variable_creator_scope():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(1, 1, input_length=1)])
                """)
            else:
                raise e

        data = self.dataset.rdd
        batch_size = self.dataset.batch_size

        def to_sample(t):
            if isinstance(t, list):
                t = tuple(t)
            return Sample.from_ndarray(nest.flatten(t), [np.array([0.0])])

        sample_rdd = data.map(to_sample)
        if val_outputs is not None and val_labels is not None:
            if self.dataset.val_rdd is not None:
                val_rdd = self.dataset.val_rdd.map(to_sample)
                val_method = [TFValidationMethod(m, len(val_outputs), len(val_labels))
                              for m in to_list(val_method)]
                training_rdd = sample_rdd

            elif val_split != 0.0:
                training_rdd, val_rdd = sample_rdd.randomSplit([1 - val_split, val_split])
                val_method = [TFValidationMethod(m, len(val_outputs), len(val_labels))
                              for m in to_list(val_method)]
            else:
                raise ValueError("Validation data is not specified. Please set " +
                                 "val rdd in TFDataset, or set val_split larger than zero")

            self.optimizer = Optimizer.create(self.training_helper_layer,
                                              training_rdd,
                                              IdentityCriterion(),
                                              batch_size=batch_size,
                                              optim_method=self.optim_method)
            self.optimizer.set_validation(self.dataset.batch_size,
                                          val_rdd,
                                          EveryEpoch(),
                                          val_method)
        else:
            training_rdd = sample_rdd
            self.optimizer = Optimizer.create(self.training_helper_layer,
                                              training_rdd,
                                              IdentityCriterion(),
                                              batch_size=batch_size,
                                              optim_method=self.optim_method)
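A hedged end-to-end sketch of how this constructor is typically driven (cf. the optimize loops in Examples #2 and #4), assuming loss is a scalar TensorFlow tensor wired to a TFDataset and Adam comes from bigdl.optim.optimizer:

    optimizer = TFOptimizer(loss, Adam(), sess=sess)
    optimizer.optimize(end_trigger=MaxEpoch(5))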
Example #8
File: net.py Project: zfxu/analytics-zoo
    def __init__(self,
                 loss,
                 optim_method,
                 sess=None,
                 val_outputs=None,
                 val_labels=None,
                 val_method=None):
        '''
        TFOptimizer is used for distributed training of TensorFlow
        on Spark/BigDL.

        :param loss: The loss tensor of the TensorFlow model, should be a scalar
        :param optim_method: the optimization method to be used, such as bigdl.optim.optimizer.Adam
        :param sess: the current TensorFlow Session. If you want to use a pre-trained model,
        use the Session to load the pre-trained variables and pass it to TFOptimizer.
        '''
        import tensorflow as tf
        from zoo.util.tf import export_tf
        self.optim_method = optim_method
        if sess is None:
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
        else:
            self.sess = sess
        grads_vars = tf.train.GradientDescentOptimizer(0).compute_gradients(
            loss)
        variables = []
        grads = []
        for (grad, var) in grads_vars:
            variables.append(var)
            grads.append(grad)
        self.export_dir = tempfile.mkdtemp()
        all_required_inputs = _find_placeholders([loss])
        self.dataset = tf.get_collection(all_required_inputs[0].name)[0]
        if self.dataset.batch_size <= 0:
            raise ValueError(
                "You should set batch_size instead of batch_per_thread for training"
            )
        self.inputs = self.dataset.tensors

        _check_the_same(all_required_inputs, self.inputs)

        if val_outputs is not None and val_labels is not None:
            outputs = val_outputs + val_labels + [loss]
        else:
            outputs = [loss]

        export_tf(self.sess,
                  self.export_dir,
                  inputs=self.inputs,
                  outputs=grads + outputs)

        variable_names = [v.name for v in variables]
        grad_names = [g.name for g in grads]
        output_names = [o.name for o in outputs]

        meta = {
            "input_names": [i.name for i in self.inputs],
            "output_names": output_names,
            "variables": variable_names,
            "grad_variables": grad_names
        }

        with open(os.path.join(self.export_dir, "training_meta.json"),
                  "w") as f:
            f.write(json.dumps(meta))

        self.training_helper_layer = TFTrainingHelper(self.export_dir)

        self.variable_placeholders = []
        assigns = []
        for v in variables:
            p = tf.placeholder(dtype=tf.float32, shape=v.shape)
            a = tf.assign(v, p)
            self.variable_placeholders.append(p)
            assigns.append(a)
        self.assign = tf.group(*assigns)

        data = self.dataset.rdd
        batch_size = self.dataset.batch_size
        sample_rdd = data.map(
            lambda t: Sample.from_ndarray(t, [np.array([0.0])]))

        self.optimizer = Optimizer.create(self.training_helper_layer,
                                          sample_rdd,
                                          IdentityCriterion(),
                                          batch_size=batch_size,
                                          optim_method=self.optim_method)

        if val_outputs is not None and val_labels is not None:
            val_sample_rdd = self.dataset.val_rdd\
                .map(lambda t: Sample.from_ndarray(t, [np.array([0.0])]))
            val_method = TFValidationMethod(val_method, len(val_outputs),
                                            len(val_labels))
            self.optimizer.set_validation(self.dataset.batch_size,
                                          val_sample_rdd, EveryEpoch(),
                                          val_method)
Example #9
    y_valid = Variable(torch.Tensor(train_Y[val_.astype(int), np.newaxis]))

    model = MLP(x_train.shape[1], 512, classes, dropout=0.4)
    loss_fn = torch.nn.BCELoss()
    zooOptimizer = Adam(learningrate=learning_rate)
    zooModel = TorchModel.from_pytorch(model)
    zooLoss = TorchLoss.from_pytorch(loss_fn)

    train_featureSet = get_featureset(x_train, y_train, shuffle=True)
    val_featureSet = get_featureset(x_valid, y_valid, shuffle=False)

    estimator = Estimator(zooModel, optim_methods=zooOptimizer)
    estimator.train(train_featureSet,
                    zooLoss,
                    end_trigger=MaxEpoch(epochs),
                    checkpoint_trigger=EveryEpoch(),
                    validation_set=val_featureSet,
                    validation_method=[Accuracy()],
                    batch_size=batch_size)


# Predict
def get_rdd(x, y, shuffle=False):
    x = np.split(x.data.numpy(), x.shape[0])
    y = np.split(y.data.numpy(), y.shape[0])
    samples = [
        Sample.from_ndarray(np.squeeze(x[i]), np.squeeze(y[i]))
        for i in range(len(x))
    ]
    sample_rdd = sc.parallelize(samples)
    return sample_rdd
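A short usage sketch for get_rdd, following the "# Predict" comment above (names are the surrounding example's; illustrative only):

    val_rdd = get_rdd(x_valid, y_valid)
    # each element is a Sample whose feature and label come from one row of x and y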
Example #10
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir',
                        default='/tmp/data',
                        metavar='N',
                        help='the folder store mnist data')
    parser.add_argument(
        '--batch-size',
        type=int,
        default=256,
        metavar='N',
        help='input batch size for training per executor(default: 256)')
    parser.add_argument(
        '--test-batch-size',
        type=int,
        default=1000,
        metavar='N',
        help='input batch size for testing per executor(default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=2,
                        metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        args.dir,
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                               batch_size=args.batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        args.dir,
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                              batch_size=args.test_batch_size,
                                              shuffle=False)

    # init on yarn when HADOOP_CONF_DIR and ZOO_CONDA_NAME are provided.
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=1, conf={"spark.driver.memory": "20g"})
    else:
        num_executors = 2
        num_cores_per_executor = 4
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        zoo_conda_name = os.environ.get(
            'ZOO_CONDA_NAME')  # The name of the created conda-env
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                                conda_name=zoo_conda_name,
                                num_executors=num_executors,
                                executor_cores=num_cores_per_executor,
                                executor_memory="2g",
                                driver_memory="10g",
                                driver_cores=1,
                                conf={
                                    "spark.rpc.message.maxSize":
                                    "1024",
                                    "spark.task.maxFailures":
                                    "1",
                                    "spark.driver.extraJavaOptions":
                                    "-Dbigdl.failure.retryTimes=1"
                                })

    model = LeNet()
    model.train()
    criterion = nn.NLLLoss()

    adam = Adam(args.lr)
    zoo_estimator = Estimator.from_torch(model=model,
                                         optimizer=adam,
                                         loss=criterion,
                                         backend="bigdl")
    from bigdl.optim.optimizer import EveryEpoch
    zoo_estimator.fit(data=train_loader,
                      epochs=args.epochs,
                      validation_data=test_loader,
                      validation_methods=[Accuracy()],
                      checkpoint_trigger=EveryEpoch())
    zoo_estimator.evaluate(data=test_loader, validation_methods=[Accuracy()])
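The script presumably runs as a standalone entry point; a conventional guard (illustrative):

    if __name__ == '__main__':
        main()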
Example #11
                               input_shape=input_shape)(both_input)

encode_left = both_feature.index_select(1, 0)
encode_right = both_feature.index_select(1, 1)

distance = autograd.abs(encode_left - encode_right)
predict = Dense(output_dim=NUM_CLASS_LABEL,
                activation="sigmoid",
                W_regularizer=L2Regularizer(args.penalty_rate))(distance)

siamese_net = Model(input=both_input, output=predict)

# Declare the optimizer, then train and test the model.
optimizer = Optimizer(model=siamese_net,
                      training_rdd=train_rdd,
                      optim_method=Adam(args.learning_rate),
                      criterion=CrossEntropyCriterion(),
                      end_trigger=MaxEpoch(args.num_epoch),
                      batch_size=args.batch_size)
optimizer.set_validation(batch_size=args.batch_size,
                         val_rdd=test_rdd,
                         trigger=EveryEpoch(),
                         val_method=[Top1Accuracy()])

# Set up training logs; they can be viewed with TensorBoard.
app_name = "logs"
optimizer.set_train_summary(TrainSummary(log_dir=".", app_name=app_name))
optimizer.set_val_summary(ValidationSummary(log_dir=".", app_name=app_name))

optimizer.optimize()
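After optimize() returns, the summaries configured above can be read back programmatically; a hedged sketch assuming BigDL's TrainSummary.read_scalar API:

    train_summary = TrainSummary(log_dir=".", app_name=app_name)
    loss_records = train_summary.read_scalar("Loss")  # (step, value, timestamp) records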
Example #12
    def __init__(self):
        from bigdl.optim.optimizer import EveryEpoch
        self.trigger = EveryEpoch()
Example #13
File: main.py Project: GavinGu07/cifar10
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Example')
    parser.add_argument('--dir',
                        default='/tmp/data',
                        metavar='N',
                        help='the folder store cifar10 data')
    parser.add_argument(
        '--batch-size',
        type=int,
        default=128,
        metavar='N',
        help='input batch size for training per executor(default: 128)')
    parser.add_argument(
        '--test-batch-size',
        type=int,
        default=1000,
        metavar='N',
        help='input batch size for testing per executor(default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=135,
                        metavar='N',
                        help='number of epochs to train (default: 135)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--lrd',
                        type=float,
                        default=0.0,
                        metavar='LRD',
                        help='learning rate decay(default: 0.0)')
    parser.add_argument('--wd',
                        type=float,
                        default=5e-4,
                        metavar='WD',
                        help='weight decay(default: 5e-4)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.9,
                        metavar='momentum',
                        help='momentum (default: 0.9)')
    parser.add_argument('--dampening',
                        type=float,
                        default=0.0,
                        metavar='dampening',
                        help='dampening (default: 0.0)')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    # Prepare and preprocess the data
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),  # zero-pad 4 pixels on each side, then randomly crop to 32*32
        transforms.RandomHorizontalFlip(),  # flip horizontally with probability 0.5
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),  # per-channel (R, G, B) mean and std for normalization
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    train_set = datasets.CIFAR10(args.dir,
                                 train=True,
                                 download=True,
                                 transform=transform_train)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=2)

    test_set = datasets.CIFAR10(args.dir,
                                train=False,
                                transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=args.test_batch_size,
                                              shuffle=False,
                                              num_workers=2)

    # init on yarn when HADOOP_CONF_DIR and ZOO_CONDA_NAME are provided.
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=1, conf={"spark.driver.memory": "20g"})
    else:
        num_executors = 2
        num_cores_per_executor = 4
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        zoo_conda_name = os.environ.get(
            'ZOO_CONDA_NAME')  # The name of the created conda-env
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                                conda_name=zoo_conda_name,
                                num_executor=num_executors,
                                executor_cores=num_cores_per_executor,
                                executor_memory="2g",
                                driver_memory="10g",
                                driver_cores=1,
                                spark_conf={
                                    "spark.rpc.message.maxSize":
                                    "1024",
                                    "spark.task.maxFailures":
                                    "1",
                                    "spark.driver.extraJavaOptions":
                                    "-Dbigdl.failure.retryTimes=1"
                                })

    model = ResNet18()
    model.train()
    criterion = nn.CrossEntropyLoss()

    optimizer = SGD(args.lr, args.lrd, args.wd, args.momentum, args.dampening)
    zoo_model = TorchModel.from_pytorch(model)
    zoo_criterion = TorchLoss.from_pytorch(criterion)
    zoo_estimator = Estimator(zoo_model, optim_methods=optimizer)
    train_featureset = FeatureSet.pytorch_dataloader(train_loader)
    test_featureset = FeatureSet.pytorch_dataloader(test_loader)
    from bigdl.optim.optimizer import MaxEpoch, EveryEpoch
    zoo_estimator.train_minibatch(train_featureset,
                                  zoo_criterion,
                                  end_trigger=MaxEpoch(args.epochs),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=test_featureset,
                                  validation_method=[Accuracy()])
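As in Example #3, the trained weights can be pulled back into PyTorch once training finishes; a one-line follow-up sketch:

    trained_model = zoo_model.to_pytorch()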
Example #14
    def __init__(self,
                 loss,
                 optim_method,
                 sess=None,
                 dataset=None,
                 inputs=None,
                 grads=None,
                 variables=None,
                 graph=None,
                 val_outputs=None,
                 val_labels=None,
                 val_method=None,
                 val_split=0.0,
                 tensors_with_value=None,
                 session_config=None,
                 clip_norm=None,
                 clip_value=None,
                 metrics=None):
        '''
        TFOptimizer is used for distributed training of TensorFlow
        on Spark/BigDL.

        :param loss: The loss tensor of the TensorFlow model, should be a scalar
        :param optim_method: the optimization method to be used, such as bigdl.optim.optimizer.Adam
        :param sess: the current TensorFlow Session. If you want to use a pre-trained model,
        use the Session to load the pre-trained variables and pass it to TFOptimizer.
        '''

        if dataset is None:
            args = TFOptimizer._get_arguments_from_loss(
                loss, optim_method, sess, val_outputs, val_labels, val_method)
            loss, optim_method, sess, dataset, inputs = args[:5]
            grads, variables, graph, val_outputs, val_labels, val_method = args[
                5:]

        self.optim_method = optim_method
        self.sess = sess
        self.dataset = dataset
        self.graph = graph

        self.clip_norm = clip_norm
        if clip_value is not None and not isinstance(clip_value, tuple):
            raise ValueError(
                "The clip_value argument should be a tuple (min_value, max_value)"
            )
        self.clip_constant = clip_value

        if self.dataset.batch_size <= 0:
            raise ValueError(
                "You should set batch_size instead of batch_per_thread for training"
            )

        if val_method is not None:
            val_methods = to_list(val_method)
            if metrics is None:
                metrics = {}

            for i, method in enumerate(val_methods):
                metrics['bigdl_metric_' + str(i)] = BigDLMetric(
                    method, val_outputs, val_labels)

        self.tf_model = TFModel.create(loss, sess, inputs, grads, variables,
                                       graph, tensors_with_value,
                                       session_config, metrics)

        batch_size = self.dataset.batch_size

        sample_rdd = self.dataset.get_training_data()

        if val_split != 0.0:
            training_rdd, val_rdd = sample_rdd.randomSplit(
                [1 - val_split, val_split])
        else:
            training_rdd = sample_rdd
            val_rdd = self.dataset.get_validation_data()

        if self.tf_model.val_methods is not None and val_rdd is not None:

            self.optimizer = Optimizer.create(
                self.tf_model.training_helper_layer,
                training_rdd,
                IdentityCriterion(),
                batch_size=batch_size,
                optim_method=self.optim_method)
            self.optimizer.set_validation(self.dataset.batch_size, val_rdd,
                                          EveryEpoch(),
                                          self.tf_model.val_methods)
        else:
            self.optimizer = Optimizer.create(
                self.tf_model.training_helper_layer,
                training_rdd,
                IdentityCriterion(),
                batch_size=batch_size,
                optim_method=self.optim_method)

        if self.clip_norm:
            self.optimizer.set_gradclip_l2norm(self.clip_norm)
        if self.clip_constant:
            min_value, max_value = self.clip_constant
            self.optimizer.set_gradclip_const(min_value, max_value)
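A hedged construction sketch for the clipping options validated above: clip_value must be a (min_value, max_value) tuple, and each option is forwarded to the underlying Optimizer at the end of __init__:

    optimizer = TFOptimizer(loss, Adam(), clip_norm=5.0, clip_value=(-1.0, 1.0))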
Example #15
    def __init__(self,
                 loss,
                 optim_method,
                 sess=None,
                 dataset=None,
                 inputs=None,
                 grads=None,
                 variables=None,
                 graph=None,
                 val_outputs=None,
                 val_labels=None,
                 val_method=None,
                 add_sample_weights_num=0):
        '''
        TFOptimizer is used for distributed training of TensorFlow
        on Spark/BigDL.

        :param loss: The loss tensor of the TensorFlow model, should be a scalar
        :param optim_method: the optimization method to be used, such as bigdl.optim.optimizer.Adam
        :param sess: the current TensorFlow Session. If you want to use a pre-trained model,
        use the Session to load the pre-trained variables and pass it to TFOptimizer.
        '''
        import tensorflow as tf
        from zoo.util.tf import export_tf

        if dataset is None:
            args = TFOptimizer._get_arguments_from_loss(
                loss, optim_method, sess, val_outputs, val_labels, val_method)
            loss, optim_method, sess, dataset, inputs = args[:5]
            grads, variables, graph, val_outputs, val_labels, val_method = args[
                5:]

        self.optim_method = optim_method
        self.sess = sess
        self.dataset = dataset
        self.inputs = inputs
        self.graph = graph

        if self.dataset.batch_size <= 0:
            raise ValueError(
                "You should set batch_size instead of batch_per_thread for training"
            )

        if val_outputs is not None and val_labels is not None:
            with self.graph.as_default():
                val_labels = [tf.identity(v) for v in val_labels]
            outputs = val_outputs + val_labels + [loss]
        else:
            outputs = [loss]

        self.export_dir = tempfile.mkdtemp()
        export_tf(self.sess,
                  self.export_dir,
                  inputs=self.inputs,
                  outputs=grads + outputs)

        variable_names = [v.name for v in variables]
        grad_names = [g.name for g in grads]
        output_names = [o.name for o in outputs]

        meta = {
            "input_names": [i.name for i in self.inputs],
            "output_names": output_names,
            "variables": variable_names,
            "grad_variables": grad_names
        }

        with open(os.path.join(self.export_dir, "training_meta.json"),
                  "w") as f:
            f.write(json.dumps(meta))

        self.variable_placeholders = []
        with self.graph.as_default():
            assigns = []
            for v in variables:
                p = tf.placeholder(dtype=tf.float32, shape=v.shape)
                a = tf.assign(v, p)
                self.variable_placeholders.append(p)
                assigns.append(a)
            assign = tf.group(*assigns)
        self.assign = assign

        self.training_helper_layer = TFTrainingHelper(self.export_dir)

        data = self.dataset.rdd
        batch_size = self.dataset.batch_size
        sample_rdd = data.map(lambda t: Sample.from_ndarray(
            t + [np.array(1.0)] * add_sample_weights_num, [np.array([0.0])]))

        self.optimizer = Optimizer.create(self.training_helper_layer,
                                          sample_rdd,
                                          IdentityCriterion(),
                                          batch_size=batch_size,
                                          optim_method=self.optim_method)

        if val_outputs is not None and val_labels is not None:
            val_sample_rdd = self.dataset.val_rdd\
                .map(lambda t: Sample.from_ndarray(t + [np.array(1.0)] * add_sample_weights_num,
                                                   [np.array([0.0])]))
            val_method = [
                TFValidationMethod(m, len(val_outputs), len(val_labels))
                for m in to_list(val_method)
            ]
            self.optimizer.set_validation(self.dataset.batch_size,
                                          val_sample_rdd, EveryEpoch(),
                                          val_method)
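A hedged note on add_sample_weights_num above: each record is padded with that many constant 1.0 arrays before being wrapped into a Sample, e.g. to emulate Keras-style sample weights:

    optimizer = TFOptimizer(loss, Adam(), add_sample_weights_num=1)
    # features become t + [np.array(1.0)], matching the map in the constructor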