Example #1
    def test_train_model_with_bn(self):
        class SimpleTorchModel(nn.Module):
            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.bn1 = torch.nn.BatchNorm1d(4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = self.bn1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        torch_model = SimpleTorchModel()
        loss_fn = torch.nn.BCELoss()
        az_model = TorchModel.from_pytorch(torch_model)
        zoo_loss = TorchLoss.from_pytorch(loss_fn)
        inputs = torch.Tensor([[1, 2], [1, 3], [3, 2],
                               [5, 6], [8, 9], [1, 9]])
        targets = torch.Tensor([[0], [0], [0],
                               [1], [1], [1]])
        train_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
        train_featureset = FeatureSet.pytorch_dataloader(train_loader)
        val_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
        val_featureset = FeatureSet.pytorch_dataloader(val_loader)

        zoo_optimizer = Adam()
        estimator = Estimator(az_model, optim_methods=zoo_optimizer)
        estimator.train_minibatch(train_featureset, zoo_loss, end_trigger=MaxEpoch(4),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=val_featureset,
                                  validation_method=[Accuracy()])

        trained_model = az_model.to_pytorch()
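
A hedged follow-up to the test above: once `to_pytorch()` returns, the trained weights (including the BatchNorm running statistics) can be used for plain PyTorch inference.

    # Illustrative sketch, not part of the original test: eval() switches
    # BatchNorm1d to its running statistics instead of per-batch statistics.
    trained_model.eval()
    with torch.no_grad():
        probs = trained_model(torch.Tensor([[1, 2], [8, 9]]))
        preds = (probs > 0.5).float()  # threshold the sigmoid outputs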
Example #2
    def evaluate(self,
                 data,
                 batch_size=32,
                 feature_cols=None,
                 label_cols=None):
        from zoo.orca.data.utils import xshard_to_sample

        assert data is not None, "validation data shouldn't be None"
        assert self.metrics is not None, "metrics shouldn't be None, please specify the metrics" \
                                         " argument when creating this estimator."

        if isinstance(data, SparkXShards):
            val_feature_set = FeatureSet.sample_rdd(
                data.rdd.flatMap(xshard_to_sample))
            result = self.estimator.evaluate(val_feature_set, self.metrics,
                                             batch_size)
        elif isinstance(data, DataFrame):
            schema = data.schema
            val_feature_set = FeatureSet.sample_rdd(
                data.rdd.map(lambda row: row_to_sample(
                    row, schema, feature_cols, label_cols)))
            result = self.estimator.evaluate(val_feature_set, self.metrics,
                                             batch_size)
        elif isinstance(data, DataLoader) or callable(data):
            val_feature_set = FeatureSet.pytorch_dataloader(data)
            result = self.estimator.evaluate_minibatch(val_feature_set,
                                                       self.metrics)
        else:
            raise ValueError(
                "Data should be a SparkXShards, a DataLoader or a callable "
                "data_creator, but get " + data.__class__.__name__)
        return bigdl_metric_results_to_dict(result)
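
A minimal, hedged usage sketch for the DataLoader branch above; `est` is a hypothetical object exposing this `evaluate`.

    # Hypothetical usage (est is an assumed estimator instance).
    import torch
    from torch.utils.data import DataLoader, TensorDataset

    inputs = torch.Tensor([[1, 2], [3, 4]])
    targets = torch.Tensor([[0], [1]])
    loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
    print(est.evaluate(loader))  # a dict of metric name -> value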
Example #3
    def evaluate(self,
                 data,
                 batch_size=32,
                 feature_cols=None,
                 labels_cols=None,
                 validation_metrics=None):
        from zoo.orca.data.utils import to_sample
        from zoo.orca.learn.metrics import Metrics

        assert data is not None, "validation data shouldn't be None"
        validation_metrics = Metrics.convert_metrics_list(validation_metrics)

        if isinstance(data, SparkXShards):
            val_feature_set = FeatureSet.sample_rdd(
                data.rdd.flatMap(to_sample))
            return self.estimator.evaluate(val_feature_set, validation_metrics,
                                           batch_size)
        elif isinstance(data, DataLoader) or callable(data):
            val_feature_set = FeatureSet.pytorch_dataloader(data)
            return self.estimator.evaluate_minibatch(val_feature_set,
                                                     validation_metrics)
        else:
            raise ValueError(
                "Data should be a SparkXShards, a DataLoader or a callable "
                "data_creator, but get " + data.__class__.__name__)
Example #4
    def fit(self, data, epochs=1, batch_size=32, validation_data=None, validation_methods=None,
            checkpoint_trigger=None):
        from zoo.orca.data.utils import to_sample

        end_trigger = MaxEpoch(epochs)
        assert batch_size > 0, "batch_size should be greater than 0"

        if isinstance(data, SparkXShards):
            train_rdd = data.rdd.flatMap(to_sample)
            train_feature_set = FeatureSet.sample_rdd(train_rdd)
            if validation_data is None:
                val_feature_set = None
            else:
                assert isinstance(validation_data, SparkXShards), "validation_data should be a " \
                                                                  "SparkXShards"
                val_feature_set = FeatureSet.sample_rdd(validation_data.rdd.flatMap(to_sample))

            self.estimator.train(train_feature_set, self.loss, end_trigger, checkpoint_trigger,
                                 val_feature_set, validation_methods, batch_size)
        elif isinstance(data, DataLoader) or callable(data):
            train_feature_set = FeatureSet.pytorch_dataloader(data, "", "")
            if validation_data is None:
                val_feature_set = None
            else:
                assert isinstance(validation_data, DataLoader) or callable(validation_data), \
                    "validation_data should be a pytorch DataLoader or a callable data_creator"
                val_feature_set = FeatureSet.pytorch_dataloader(validation_data)

            self.estimator.train_minibatch(train_feature_set, self.loss, end_trigger,
                                           checkpoint_trigger, val_feature_set, validation_methods)
        else:
            raise ValueError("Data and validation data should be SparkXShards, DataLoaders or "
                             "callable data_creators but get " + data.__class__.__name__)
        return self
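
A hedged sketch of the DataLoader branch of this `fit`; `est` is a hypothetical estimator instance.

    # Illustrative only: train and validate with PyTorch DataLoaders.
    import torch
    from torch.utils.data import DataLoader, TensorDataset

    train_loader = DataLoader(
        TensorDataset(torch.randn(8, 2), torch.randn(8, 1)), batch_size=4)
    val_loader = DataLoader(
        TensorDataset(torch.randn(4, 2), torch.randn(4, 1)), batch_size=4)
    est.fit(train_loader, epochs=2, validation_data=val_loader)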
Example #5
 def _handle_xshards(self, data, validation_data):
     train_rdd = data.rdd.flatMap(xshard_to_sample)
     train_feature_set = FeatureSet.sample_rdd(train_rdd)
     if validation_data is None:
         val_feature_set = None
     else:
         assert isinstance(validation_data, SparkXShards), "validation_data should be a " \
                                                           "SparkXShards"
         val_feature_set = FeatureSet.sample_rdd(validation_data.rdd.flatMap(xshard_to_sample))
     return train_feature_set, val_feature_set
Example #6
    def _handle_data_loader(self, data, validation_data):
        train_feature_set = FeatureSet.pytorch_dataloader(data, "", "")
        if validation_data is None:
            val_feature_set = None
        else:
            assert isinstance(validation_data, DataLoader) or callable(validation_data), \
                "validation_data should be a pytorch DataLoader or a callable data_creator"
            val_feature_set = FeatureSet.pytorch_dataloader(validation_data)

        return train_feature_set, val_feature_set
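
The `callable(...)` branch above accepts a data_creator rather than a concrete DataLoader. A hedged sketch of that pattern:

    # A data_creator builds a fresh DataLoader on each call, so every
    # worker can construct its own copy lazily.
    import torch
    from torch.utils.data import DataLoader, TensorDataset

    def train_data_creator():
        dataset = TensorDataset(torch.randn(8, 2), torch.randn(8, 1))
        return DataLoader(dataset, batch_size=4, shuffle=True)

    # FeatureSet.pytorch_dataloader accepts the callable directly,
    # just as it accepts a DataLoader in the code above.
    fs = FeatureSet.pytorch_dataloader(train_data_creator)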
Example #7
    def fit(self,
            data,
            epochs=1,
            batch_size=32,
            feature_cols=None,
            labels_cols=None,
            validation_data=None,
            validation_metrics=None,
            checkpoint_trigger=None):
        from zoo.orca.data.utils import to_sample
        from zoo.orca.learn.metrics import Metrics
        from zoo.orca.learn.trigger import Trigger

        end_trigger = MaxEpoch(epochs)
        assert batch_size > 0, "batch_size should be greater than 0"
        validation_metrics = Metrics.convert_metrics_list(validation_metrics)
        checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger)

        if self.log_dir is not None and self.app_name is not None:
            self.estimator.set_tensorboard(self.log_dir, self.app_name)

        if isinstance(data, SparkXShards):
            train_rdd = data.rdd.flatMap(to_sample)
            train_feature_set = FeatureSet.sample_rdd(train_rdd)
            if validation_data is None:
                val_feature_set = None
            else:
                assert isinstance(validation_data, SparkXShards), "validation_data should be a " \
                                                                  "SparkXShards"
                val_feature_set = FeatureSet.sample_rdd(
                    validation_data.rdd.flatMap(to_sample))

            self.estimator.train(train_feature_set, self.loss, end_trigger,
                                 checkpoint_trigger, val_feature_set,
                                 validation_metrics, batch_size)
        elif isinstance(data, DataLoader) or callable(data):
            train_feature_set = FeatureSet.pytorch_dataloader(data, "", "")
            if validation_data is None:
                val_feature_set = None
            else:
                assert isinstance(validation_data, DataLoader) or callable(validation_data), \
                    "validation_data should be a pytorch DataLoader or a callable data_creator"
                val_feature_set = FeatureSet.pytorch_dataloader(
                    validation_data)

            self.estimator.train_minibatch(train_feature_set, self.loss,
                                           end_trigger, checkpoint_trigger,
                                           val_feature_set, validation_metrics)
        else:
            raise ValueError(
                "Data and validation data should be SparkXShards, DataLoaders or "
                "callable data_creators but get " + data.__class__.__name__)
        return self
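
A hedged sketch of the SparkXShards branch of this `fit`; `est` is a hypothetical estimator instance, and the metric/trigger imports follow the converters used above.

    # Illustrative only: fit on an in-memory XShards with validation.
    import numpy as np
    from zoo.orca.data import XShards
    from zoo.orca.learn.metrics import Accuracy
    from zoo.orca.learn.trigger import EveryEpoch

    shards = XShards.partition({
        "x": np.random.randn(32, 2).astype(np.float32),
        "y": np.random.randint(0, 2, (32, 1)).astype(np.float32)})
    est.fit(shards, epochs=2, batch_size=8,
            validation_data=shards,
            validation_metrics=[Accuracy()],
            checkpoint_trigger=EveryEpoch())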
Example #8
    def evaluate(self,
                 data,
                 batch_size=32,
                 feature_cols=None,
                 label_cols=None,
                 validation_metrics=None):
        """
        Evaluate model.

        :param data: evaluation data. It can be an XShards, a Spark DataFrame,
               a PyTorch DataLoader or a PyTorch DataLoader creator function.
               If data is an XShards, each partition can be a Pandas DataFrame or a dictionary of
               {'x': feature, 'y': label}, where feature(label) is a numpy array or a list of
               numpy arrays.
        :param batch_size: Batch size used for evaluation. Only used when data is a SparkXShards.
        :param feature_cols: Feature column name(s) of data. Only used when data
               is a Spark DataFrame or an XShards of Pandas DataFrame. Default: None.
        :param label_cols: Label column name(s) of data. Only used when data is
               a Spark DataFrame or an XShards of Pandas DataFrame. Default: None.
        :param validation_metrics: Orca validation metrics to be computed on validation_data.
        :return: validation results.
        """
        from zoo.orca.data.utils import xshard_to_sample

        assert data is not None, "validation data shouldn't be None"
        assert self.metrics is not None, "metrics shouldn't be None, please specify the metrics" \
                                         " argument when creating this estimator."

        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                data = process_xshards_of_pandas_dataframe(
                    data, feature_cols, label_cols)
            val_feature_set = FeatureSet.sample_rdd(
                data.rdd.flatMap(xshard_to_sample))
            result = self.estimator.evaluate(val_feature_set, self.metrics,
                                             batch_size)
        elif isinstance(data, DataFrame):
            schema = data.schema
            val_feature_set = FeatureSet.sample_rdd(
                data.rdd.map(lambda row: row_to_sample(
                    row, schema, feature_cols, label_cols)))
            result = self.estimator.evaluate(val_feature_set, self.metrics,
                                             batch_size)
        elif isinstance(data, DataLoader) or callable(data):
            val_feature_set = FeatureSet.pytorch_dataloader(data)
            result = self.estimator.evaluate_minibatch(val_feature_set,
                                                       self.metrics)
        else:
            raise ValueError(
                "Data should be a SparkXShards, a DataLoader or a callable "
                "data_creator, but get " + data.__class__.__name__)
        return bigdl_metric_results_to_dict(result)
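
A hedged sketch of the pandas branch above; the file path and column names are illustrative assumptions, and `est` is a hypothetical estimator instance.

    # Illustrative only: an XShards of pandas DataFrames is resolved via
    # process_xshards_of_pandas_dataframe using the named columns.
    from zoo.orca.data.pandas import read_csv

    shards = read_csv("hdfs://path/to/data.csv")
    result = est.evaluate(shards, batch_size=32,
                          feature_cols=["f1", "f2"], label_cols=["label"])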
Example #9
    def _handle_dataframe(self, data, validation_data, feature_cols, label_cols):
        schema = data.schema
        train_rdd = data.rdd.map(lambda row: row_to_sample(row, schema, feature_cols, label_cols))
        train_feature_set = FeatureSet.sample_rdd(train_rdd)
        if validation_data is None:
            val_feature_set = None
        else:
            assert isinstance(validation_data, DataFrame), "validation_data should also be a " \
                                                           "DataFrame"
            val_feature_set = FeatureSet.sample_rdd(validation_data.rdd.map(
                lambda row: row_to_sample(row, schema, feature_cols, label_cols)))

        return train_feature_set, val_feature_set
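
A hedged sketch of the kind of DataFrame this helper expects; the column names and the `est` instance are illustrative assumptions.

    # Illustrative only: each Row is mapped through row_to_sample with the
    # given feature and label columns.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [(1.0, 2.0, 0.0), (3.0, 4.0, 1.0)], ["f1", "f2", "label"])
    train_fs, val_fs = est._handle_dataframe(
        df, None, feature_cols=["f1", "f2"], label_cols=["label"])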
Example #10
    def test_estimator_train(self):
        batch_size = 8
        epoch_num = 5

        images, labels = TestEstimator._generate_image_data(data_num=8,
                                                            img_shape=(3, 224,
                                                                       224))

        image_rdd = self.sc.parallelize(images)
        labels = self.sc.parallelize(labels)

        sample_rdd = image_rdd.zip(labels).map(
            lambda img_label: zoo.common.Sample.from_ndarray(
                img_label[0], img_label[1]))

        data_set = FeatureSet.sample_rdd(sample_rdd)

        model = TestEstimator._create_cnn_model()

        optim_method = SGD(learningrate=0.01)

        estimator = Estimator(model, optim_method, "")
        estimator.set_constant_gradient_clipping(0.1, 1.2)
        estimator.train(train_set=data_set,
                        criterion=ClassNLLCriterion(),
                        end_trigger=MaxEpoch(epoch_num),
                        checkpoint_trigger=EveryEpoch(),
                        validation_set=data_set,
                        validation_method=[Top1Accuracy()],
                        batch_size=batch_size)
        predict_result = model.predict(sample_rdd)
        assert predict_result.count() == 8
Example #11
    def evaluate(self,
                 data,
                 batch_size=32,
                 feature_cols=None,
                 label_cols=None):
        """
        Evaluate model.

        :param data: validation data. It can be an XShards, where each partition is a dictionary
               of {'x': feature, 'y': label} and feature(label) is a numpy array or a list of
               numpy arrays.
        :param batch_size: Batch size used for validation. Default: 32.
        :param feature_cols: (Not supported yet) Feature column name(s) of data. Only used when
               data is a Spark DataFrame. Default: None.
        :param label_cols: (Not supported yet) Label column name(s) of data. Only used when data
               is a Spark DataFrame. Default: None.
        :return: validation results.
        """
        assert data is not None, "validation data shouldn't be None"
        assert self.metrics is not None, "metrics shouldn't be None, please specify the metrics" \
                                         " argument when creating this estimator."

        if isinstance(data, DataFrame):
            raise NotImplementedError
        elif isinstance(data, SparkXShards):
            from zoo.orca.data.utils import xshard_to_sample
            val_feature_set = FeatureSet.sample_rdd(
                data.rdd.flatMap(xshard_to_sample))
            result = self.estimator.evaluate(val_feature_set, self.metrics,
                                             batch_size)
        else:
            raise ValueError(
                "Data should be XShards or Spark DataFrame, but get " +
                data.__class__.__name__)

        return bigdl_metric_results_to_dict(result)
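
A hedged sketch of the XShards layout the docstring describes; `est` is a hypothetical estimator instance.

    # Illustrative only: an in-memory XShards with the {'x', 'y'} layout.
    import numpy as np
    from zoo.orca.data import XShards

    shards = XShards.partition({
        "x": np.random.randn(8, 2).astype(np.float32),
        "y": np.random.randint(0, 2, (8, 1)).astype(np.float32)})
    print(est.evaluate(shards, batch_size=4))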
Example #12
 def _get_training_data(self):
     jvalue = callZooFunc("float", "createTFDataFeatureSet",
                          self.rdd.map(lambda x: x[0]), self.init_op_name,
                          self.table_init_op, self.output_names,
                          self.output_types, self.shard_index_op_name,
                          self.inter_threads, self.intra_threads)
     return FeatureSet(jvalue=jvalue)
Example #13
 def input_fn(mode):
     if mode == tf.estimator.ModeKeys.TRAIN:
         image_set = self.get_raw_image_set(with_label=True)
         feature_set = FeatureSet.image_frame(
             image_set.to_image_frame())
         train_transformer = ChainedPreprocessing([
             ImageBytesToMat(),
             ImageResize(256, 256),
             ImageRandomCrop(224, 224),
             ImageRandomPreprocessing(ImageHFlip(), 0.5),
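             # 0.485/0.456/0.406 and 0.229/0.224/0.225 are the standard
             # ImageNet per-channel mean and std (0-1 pixel scale)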
             ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224,
                                   0.225),
             ImageMatToTensor(to_RGB=True, format="NHWC"),
             ImageSetToSample(input_keys=["imageTensor"],
                              target_keys=["label"])
         ])
         feature_set = feature_set.transform(train_transformer)
         feature_set = feature_set.transform(ImageFeatureToSample())
         training_dataset = TFDataset.from_feature_set(
             feature_set,
             features=(tf.float32, [224, 224, 3]),
             labels=(tf.int32, [1]),
             batch_size=8)
         return training_dataset
     else:
         raise NotImplementedError
Example #14
 def get_training_data(self):
     sample_rdd = self.rdd.map(
         lambda t: Sample.from_ndarray(nest.flatten(t), np.array([0.0])))
     fs = FeatureSet.sample_rdd(sample_rdd,
                                sequential_order=self.sequential_order,
                                shuffle=self.shuffle)
     return fs
Example #15
 def get_training_data(self):
     jvalue = callZooFunc("float", "createMiniBatchRDDFromStringRDD",
                          self.train_rdd, self.batch_size)
     rdd = jvalue.value().toJavaRDD()
     fs = FeatureSet.rdd(rdd,
                         sequential_order=self.sequential_order,
                         shuffle=self.shuffle)
     return fs
Example #16
 def _get_validation_data(self):
     if self.validation_dataset is not None:
         jvalue = callZooFunc("float", "createTFDataFeatureSet",
                              self.val_rdd.map(lambda x: x[0]), self.init_op_name,
                              self.table_init_op, self.output_names,
                              self.output_types, self.shard_index_op_name)
         return FeatureSet(jvalue=jvalue)
     return None
Example #17
 def get_training_data(self):
     sample_rdd = self.text_set.get_samples().map(
         lambda sample: Sample.from_jtensor(
             features=sample.features + sample.labels,
             labels=JTensor.from_ndarray(np.array([0.0]))))
     return FeatureSet.sample_rdd(sample_rdd,
                                  sequential_order=self.sequential_order,
                                  shuffle=self.shuffle)
Example #18
    def get_training_data(self):
        fs = FeatureSet.image_set(self.image_set,
                                  sequential_order=self.sequential_order,
                                  shuffle=self.shuffle)
        fs = fs.transform(MergeFeatureLabelImagePreprocessing())
        fs = fs.transform(ImageFeatureToSample())

        return fs
Example #19
 def get_validation_data(self):
     if self.validation_rdd is not None:
         jvalue = callZooFunc("float",
                              "createMiniBatchFeatureSetFromStringRDD",
                              self.validation_rdd, self.batch_size,
                              self.sequential_order, self.shuffle)
         fs = FeatureSet(jvalue)
         return fs
     return None
Example #20
    def evaluate(self,
                 data,
                 batch_size=32,
                 feature_cols="features",
                 label_cols="label"):
        """
        Evaluate model.

        :param data: validation data. It can be an XShards or a Spark DataFrame. If an XShards,
               each partition is a dictionary of {'x': feature, 'y': label}, where feature(label)
               is a numpy array or a list of numpy arrays.
        :param batch_size: Batch size used for validation. Default: 32.
        :param feature_cols: Feature column name(s) of data. Only used when data is a
               Spark DataFrame. Default: "features".
        :param label_cols: Label column name(s) of data. Only used when data is a
               Spark DataFrame. Default: "label".
        :return: validation results.
        """
        assert data is not None, "validation data shouldn't be None"
        assert self.metrics is not None, "metrics shouldn't be None, please specify the metrics" \
                                         " argument when creating this estimator."

        if isinstance(data, DataFrame):
            if isinstance(feature_cols, list):
                data, _, feature_cols = \
                    BigDLEstimator._combine_cols(data, [feature_cols], col_name="features")

            if isinstance(label_cols, list):
                data, _, label_cols = \
                    BigDLEstimator._combine_cols(data, label_cols, col_name="label")

            self.nn_estimator._setNNBatchSize(batch_size)._setNNFeaturesCol(feature_cols) \
                ._setNNLabelCol(label_cols)

            self.nn_estimator.setValidation(None, None, self.metrics,
                                            batch_size)
            if self.log_dir is not None and self.app_name is not None:
                from bigdl.optim.optimizer import TrainSummary
                from bigdl.optim.optimizer import ValidationSummary
                val_summary = ValidationSummary(log_dir=self.log_dir,
                                                app_name=self.app_name)
                self.nn_estimator.setValidationSummary(val_summary)

            result = self.nn_estimator._eval(data)

        elif isinstance(data, SparkXShards):
            from zoo.orca.data.utils import xshard_to_sample
            val_feature_set = FeatureSet.sample_rdd(
                data.rdd.flatMap(xshard_to_sample))
            result = self.estimator.evaluate(val_feature_set, self.metrics,
                                             batch_size)
        else:
            raise ValueError(
                "Data should be XShards or Spark DataFrame, but get " +
                data.__class__.__name__)

        return bigdl_metric_results_to_dict(result)
Example #21
 def get_validation_data(self):
     if self.val_rdd is not None:
         sample_rdd = self.val_rdd.map(lambda t: Sample.from_ndarray(
             nest.flatten(t), np.array([0.0])))
         return FeatureSet.sample_rdd(
             sample_rdd,
             sequential_order=self.sequential_order,
             shuffle=self.shuffle)
     return None
Example #22
 def get_validation_data(self):
     if self.val_rdd is not None:
         sample_rdd = self.val_rdd.map(lambda t: Sample.from_ndarray(
             nest.flatten(t), np.array([0.0])))
         fs = FeatureSet.sample_rdd(sample_rdd,
                                    sequential_order=self.sequential_order,
                                    shuffle=self.shuffle)
         fs = fs.transform(SampleToMiniBatch(self.batch_size))
         return fs
     return None
Example #23
 def get_validation_data(self):
     if self.validation_rdd is not None:
         jvalue = callZooFunc("float", "createMiniBatchRDDFromStringRDD",
                              self.validation_rdd, self.batch_size)
         rdd = jvalue.value().toJavaRDD()
         fs = FeatureSet.rdd(rdd,
                             sequential_order=self.sequential_order,
                             shuffle=self.shuffle)
         return fs
     return None
Example #24
 def get_validation_data(self):
     if self.validation_image_set is not None:
         fs = FeatureSet.image_set(self.validation_image_set,
                                   sequential_order=self.sequential_order,
                                   shuffle=self.shuffle)
         fs = fs.transform(MergeFeatureLabelImagePreprocessing())
         fs = fs.transform(ImageFeatureToSample())
         fs = fs.transform(SampleToMiniBatch(self.batch_size))
         return fs
     return None
Example #25
 def get_validation_data(self):
     if self.validation_image_set is not None:
         fs = FeatureSet.image_set(
             self.validation_image_set,
             sequential_order=self.sequential_order,
             shuffle=self.shuffle).transform(ChainedPreprocessing([
                 MergeFeatureLabelImagePreprocessing(),
                 ImageFeatureToSample()
             ]))
         return fs
     return None
Example #26
 def get_validation_data(self):
     if self.validation_text_set is not None:
         sample_rdd = self.validation_text_set.get_samples().map(
             lambda sample: Sample.from_jtensor(
                 features=sample.features + sample.labels,
                 labels=JTensor.from_ndarray(np.array([0.0]))))
         return FeatureSet.sample_rdd(
             sample_rdd,
             sequential_order=self.sequential_order,
             shuffle=self.shuffle)
     return None
Example #27
def get_featureset(x, y, shuffle=True):
    x = np.split(x.data.numpy(), x.shape[0])
    y = np.split(y.data.numpy(), y.shape[0])
    print(x[0].shape)
    print(y[0].shape)
    samples = [
        Sample.from_ndarray(np.squeeze(x[i]), np.squeeze(y[i]))
        for i in range(len(x))
    ]
    sample_rdd = sc.parallelize(samples)
    return FeatureSet.sample_rdd(sample_rdd, shuffle=shuffle)
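
A hedged usage sketch: `x` and `y` are torch tensors, and the function assumes a SparkContext named `sc` already exists in the enclosing scope.

    # Illustrative only.
    import torch

    x = torch.randn(16, 4)
    y = torch.randint(0, 2, (16, 1)).float()
    fs = get_featureset(x, y, shuffle=False)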
Example #28
 def get_validation_data(self):
     if self.validation_text_set is not None:
         sample_rdd = self.validation_text_set.get_samples().map(
             lambda sample: Sample.from_jtensor(
                 features=sample.features + sample.labels,
                 labels=JTensor.from_ndarray(np.array([0.0]))))
         fs = FeatureSet.sample_rdd(sample_rdd,
                                    sequential_order=self.sequential_order,
                                    shuffle=self.shuffle)
         fs = fs.transform(SampleToMiniBatch(self.batch_size))
         return fs
     return None
Example #29
 def create_train_feature_set(self):
     image_set = self.get_raw_image_set(with_label=True)
     feature_set = FeatureSet.image_frame(image_set.to_image_frame())
     train_transformer = ChainedPreprocessing([
         ImageBytesToMat(),
         ImageResize(256, 256),
         ImageRandomCrop(224, 224),
         ImageRandomPreprocessing(ImageHFlip(), 0.5),
         ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224, 0.225),
         ImageMatToTensor(to_RGB=True, format="NHWC"),
         ImageSetToSample(input_keys=["imageTensor"], target_keys=["label"])
     ])
     feature_set = feature_set.transform(train_transformer)
     return feature_set
Example #30
    def evaluate(self, data, batch_size=32, feature_cols=None, label_cols=None):
        assert data is not None, "validation data shouldn't be None"
        assert self.metrics is not None, "metrics shouldn't be None, please specify the metrics" \
                                         " argument when creating this estimator."

        if isinstance(data, DataFrame):
            raise NotImplementedError
        elif isinstance(data, SparkXShards):
            from zoo.orca.data.utils import xshard_to_sample
            from zoo.orca.learn.metrics import Metrics

            val_feature_set = FeatureSet.sample_rdd(data.rdd.flatMap(xshard_to_sample))
            result = self.estimator.evaluate(val_feature_set, self.metrics, batch_size)
        else:
            raise ValueError("Data should be XShards or Spark DataFrame, but get " +
                             data.__class__.__name__)

        return bigdl_metric_results_to_dict(result)