def test_dataset_without_batch(self):
    """
    Check that KerasModel fit/evaluate/predict each report an error when the
    TFDataset they receive was built without the corresponding batch argument.
    """
    fit_error = ("The batch_size of TFDataset must be"
                 " specified when used in KerasModel fit.")
    evaluate_error = ("The batch_per_thread of TFDataset must be "
                      "specified when used in KerasModel evaluate.")
    predict_error = ("The batch_per_thread of TFDataset must be"
                     " specified when used in KerasModel predict.")

    # 20 samples with 10 random features each, and one 0/1 label per sample.
    feature_array = np.random.rand(20, 10)
    label_array = np.random.randint(0, 2, size=20)
    feature_rdd = self.sc.parallelize(feature_array)
    label_rdd = self.sc.parallelize(label_array)
    sample_rdd = feature_rdd.zip(label_rdd)

    # Training: dataset carries a validation rdd but no batch_size.
    dataset = TFDataset.from_rdd(
        sample_rdd,
        features=(tf.float32, [10]),
        labels=(tf.int32, []),
        names=["features", "labels"],
        val_rdd=sample_rdd,
    )
    model = KerasModel(self.create_model())
    self.intercept(lambda: model.fit(dataset), fit_error)

    # Evaluation: features and labels, no batch_per_thread.
    dataset = TFDataset.from_rdd(
        sample_rdd,
        features=(tf.float32, [10]),
        labels=(tf.int32, []),
        names=["features", "labels"],
    )
    self.intercept(lambda: model.evaluate(dataset), evaluate_error)

    # Prediction: features only, no batch_per_thread.
    dataset = TFDataset.from_rdd(
        feature_rdd,
        features=(tf.float32, [10]),
        names=["features", "labels"],
    )
    self.intercept(lambda: model.predict(dataset), predict_error)
class KerasEstimator(Estimator):
    """
    Estimator that wraps a keras model in a KerasModel and drives training through
    TFOptimizer.from_keras, and prediction/evaluation through the wrapped KerasModel.
    """

    def __init__(self, keras_model, metrics, model_dir, optimizer):
        """
        Create the estimator.

        :param keras_model: keras model; it is wrapped, together with model_dir, in a KerasModel.
        :param metrics: metrics forwarded to TFOptimizer.from_keras when fit is called.
        :param model_dir: model directory handed to KerasModel and to TFOptimizer.from_keras.
        :param optimizer: optimizer forwarded to TFOptimizer.from_keras. If it is a
               zoo.orca.learn.optimizers.Optimizer it is unwrapped with get_optimizer().
               May be None.
        """
        self.model = KerasModel(keras_model, model_dir)
        self.load_checkpoint = False
        self.metrics = metrics
        self.tf_optimizer = None
        self.optimizer = optimizer
        # Imported here, as in the original code, rather than at module level.
        from zoo.orca.learn.optimizers import Optimizer
        if self.optimizer is not None and isinstance(self.optimizer, Optimizer):
            self.optimizer = self.optimizer.get_optimizer()
        self.log_dir = None
        self.app_name = None
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def fit(self, data,
            epochs=1,
            batch_size=32,
            feature_cols=None,
            label_cols=None,
            validation_data=None,
            session_config=None,
            checkpoint_trigger=None,
            auto_shard_files=True):
        """
        Train this keras model with train data.

        :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
               If data is XShards, each partition can be Pandas Dataframe or a dictionary of
               {'x': feature, 'y': label}, where feature(label) is a numpy array or a tuple of
               numpy arrays.
               If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor
               tuple]
        :param epochs: number of epochs to train.
        :param batch_size: total batch size for each iteration.
        :param feature_cols: feature column names if train data is Spark DataFrame or XShards
               of Pandas DataFrame.
        :param label_cols: label column names if train data is Spark DataFrame or XShards of
               Pandas DataFrame.
        :param validation_data: validation data. Validation data type should be the same
               as train data. May be None.
        :param session_config: tensorflow session configuration for training.
               Should be object of tf.ConfigProto
        :param checkpoint_trigger: when to trigger checkpoint during training.
               Should be a zoo.orca.learn.trigger, like EveryEpoch(), SeveralIteration(
               num_iterations),etc.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based
               and apply sharding on files, otherwise sharding on records. Default is True.
        :return: this estimator.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in training"
            assert label_cols is not None, \
                "label columns is None; it should not be None in training"

        if isinstance(data, tf.data.Dataset):
            assert isinstance(data.element_spec, tuple), \
                "If data is tf.data.Dataset, each element should be " \
                "(feature tensors, label tensor), where each feature/label tensor can be " \
                "either a single tensor or a tuple of tensors"
            if validation_data is not None:
                assert isinstance(validation_data, tf.data.Dataset), \
                    "train data and validation data should be both tf.data.Dataset"
                assert isinstance(validation_data.element_spec, tuple), \
                    "If validation_data is tf.data.Dataset, each element should be " \
                    "(feature tensors, label tensor), where each feature/label tensor can be " \
                    "either a single tensor or a tuple of tensors"

        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                assert feature_cols is not None, \
                    "feature columns is None; it should not be None in training"
                assert label_cols is not None, \
                    "label columns is None; it should not be None in training"
                data, validation_data = process_xshards_of_pandas_dataframe(
                    data, feature_cols, label_cols, validation_data, "fit")

        if checkpoint_trigger is not None:
            checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger)

        if is_tf_data_dataset(data):
            data = data.map(_standardize_keras_target_data)
            # validation_data is optional; mapping None would raise AttributeError.
            if validation_data is not None:
                validation_data = validation_data.map(_standardize_keras_target_data)

        memory_type = OrcaContext.train_data_store
        dataset = to_dataset(data, batch_size=batch_size, batch_per_thread=-1,
                             validation_data=validation_data,
                             feature_cols=feature_cols, label_cols=label_cols,
                             hard_code_batch_size=False,
                             sequential_order=False, shuffle=True,
                             auto_shard_files=auto_shard_files,
                             memory_type=memory_type)

        self.tf_optimizer = TFOptimizer.from_keras(
            self.model.model, dataset,
            model_dir=self.model.model_dir,
            session_config=session_config,
            metrics=self.metrics,
            optimizer=self.optimizer)

        if self.clip_norm:
            self.tf_optimizer.set_gradient_clipping_by_l2_norm(clip_norm=self.clip_norm)
        if self.clip_min and self.clip_max:
            self.tf_optimizer.set_constant_gradient_clipping(self.clip_min, self.clip_max)

        if self.load_checkpoint:
            # NOTE(review): checkpoint_path / checkpoint_version are not set in this class;
            # presumably whoever sets load_checkpoint to True also sets them - confirm.
            self.tf_optimizer.load_checkpoint(self.checkpoint_path, self.checkpoint_version)

        if self.log_dir and self.app_name:
            self.tf_optimizer.estimator.set_tensorboard(self.log_dir, self.app_name)

        self.tf_optimizer.optimize(MaxEpoch(epochs), checkpoint_trigger=checkpoint_trigger)

        return self

    def predict(self, data, batch_size=4,
                feature_cols=None,
                auto_shard_files=False,
                ):
        """
        Predict input data

        :param data: data to be predicted. It can be XShards or Spark DataFrame.
               If data is XShards, each partition can be Pandas Dataframe or a dictionary of
               {'x': feature}, where feature is a numpy array or a tuple of numpy arrays.
               tf.data.Dataset is currently rejected by an assertion.
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame
               or XShards of Pandas DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based
               and apply sharding on files, otherwise sharding on records. Default is False.
        :return: predicted result.
                 If input data is XShards, the predict result is also a XShards, and the schema
                 for each result is: {'prediction': predicted numpy array or list of predicted
                 numpy arrays}.
                 If input data is Spark DataFrame, the predict result is a DataFrame which
                 includes original columns plus 'prediction' column. The 'prediction' column
                 can be FloatType, VectorUDT or Array of VectorUDT depending on model outputs
                 shape.
                 For any other input the raw prediction rdd is returned.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"
        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                assert feature_cols is not None, \
                    "feature columns is None; it should not be None in prediction"
                data = process_xshards_of_pandas_dataframe(data, feature_cols)

        assert not is_tf_data_dataset(data), \
            "tf.data.Dataset currently cannot be used for " \
            "estimator prediction"

        dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols, label_cols=None,
                             hard_code_batch_size=False,
                             sequential_order=True,
                             shuffle=False,
                             auto_shard_files=auto_shard_files,
                             )

        predicted_rdd = self.model.predict(dataset, batch_size)
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd

    def evaluate(self, data, batch_size=32,
                 feature_cols=None,
                 label_cols=None,
                 auto_shard_files=False):
        """
        Evaluate model.

        :param data: evaluation data. It can be XShards, Spark DataFrame, tf.data.Dataset.
               If data is XShards, each partition can be Pandas Dataframe or a dictionary of
               {'x': feature, 'y': label}, where feature(label) is a numpy array or a tuple of
               numpy arrays.
               If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor
               tuple]
        :param batch_size: batch size per thread.
        :param feature_cols: feature column names if evaluation data is Spark DataFrame or
               XShards of Pandas DataFrame.
        :param label_cols: label column names if evaluation data is Spark DataFrame or XShards
               of Pandas DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based
               and apply sharding on files, otherwise sharding on records. Default is False.
        :return: evaluation result as a dictionary of {'metric name': metric value}
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in evaluation"
            assert label_cols is not None, \
                "label columns is None; it should not be None in evaluation"
        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                assert feature_cols is not None, \
                    "feature columns is None; it should not be None in evaluation"
                assert label_cols is not None, \
                    "label columns is None; it should not be None in evaluation"
                data = process_xshards_of_pandas_dataframe(data, feature_cols, label_cols)

        dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols, label_cols=label_cols,
                             hard_code_batch_size=False,
                             sequential_order=True,
                             shuffle=False,
                             auto_shard_files=auto_shard_files)

        return self.model.evaluate(dataset, batch_per_thread=batch_size)

    def save_keras_model(self, path, overwrite=True):
        """
        Save tensorflow keras model in this estimator.

        :param path: keras model save path.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        """
        self.model.save_model(path, overwrite=overwrite)

    def get_model(self):
        """
        Get the trained Keras model

        :return: The trained Keras model
        """
        return self.model.model

    def save(self, model_path, overwrite=True):
        """
        Save model to model_path. Delegates to save_keras_model.

        :param model_path: path to save the trained model.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        :return:
        """
        self.save_keras_model(model_path, overwrite=overwrite)

    def clear_gradient_clipping(self):
        """
        Clear gradient clipping parameters. In this case, gradient clipping will not be applied.
        In order to take effect, it needs to be called before fit.

        :return:
        """
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def set_constant_gradient_clipping(self, min, max):
        """
        Set constant gradient clipping during the training process.
        In order to take effect, it needs to be called before fit.

        :param min: The minimum value to clip by. Must be larger than 0 and smaller than max.
        :param max: The maximum value to clip by.
        :return:
        """
        assert min > 0, "clip value should be larger than 0"
        assert min < max, "clip max should be larger than clip min"
        self.clip_min = min
        self.clip_max = max

    def set_l2_norm_gradient_clipping(self, clip_norm):
        """
        Clip gradient to a maximum L2-Norm during the training process.
        In order to take effect, it needs to be called before fit.

        :param clip_norm: Gradient L2-Norm threshold.
        :return:
        """
        self.clip_norm = clip_norm

    def save_keras_weights(self, filepath, overwrite=True, save_format=None):
        """
        Save tensorflow keras model weights in this estimator.

        :param filepath: keras model weights save path.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        :param save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or
               '.keras' will default to HDF5 if `save_format` is `None`. Otherwise
               `None` defaults to 'tf'.
        """
        self.model.save_weights(filepath, overwrite, save_format)

    def load_keras_weights(self, filepath, by_name=False):
        """
        Load tensorflow keras model weights into this estimator.

        :param filepath: keras model weights save path.
        :param by_name: Boolean, whether to load weights by name or by topological
               order. Only topological loading is supported for weight files in
               TensorFlow format.
        """
        self.model.load_weights(filepath, by_name)
class TFKerasWrapper(Estimator):
    """
    Estimator that wraps a keras model in a KerasModel and drives training through
    TFOptimizer.from_keras, and prediction/evaluation through the wrapped KerasModel.
    """

    def __init__(self, keras_model, metrics, model_dir):
        """
        Create the wrapper.

        :param keras_model: keras model; it is wrapped, together with model_dir, in a KerasModel.
        :param metrics: metrics forwarded to TFOptimizer.from_keras when fit is called.
        :param model_dir: model directory handed to KerasModel and to TFOptimizer.from_keras.
        """
        self.model = KerasModel(keras_model, model_dir)
        self.load_checkpoint = False
        self.metrics = metrics
        self.tf_optimizer = None
        self.log_dir = None
        self.app_name = None

    def fit(self, data,
            epochs=1,
            batch_size=32,
            feature_cols=None,
            labels_cols=None,
            validation_data=None,
            hard_code_batch_size=False,
            session_config=None,
            checkpoint_trigger=None):
        """
        Train this keras model with train data.

        :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
               If data is XShards, each element needs to be {'x': a feature numpy array
               or a tuple of feature numpy arrays, 'y': a label numpy array or a tuple of
               label numpy arrays}
               If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor
               tuple]
        :param epochs: number of epochs to train.
        :param batch_size: total batch size for each iteration.
        :param feature_cols: feature column names if train data is Spark DataFrame.
        :param labels_cols: label column names if train data is Spark DataFrame.
        :param validation_data: validation data. Validation data type should be the same
               as train data.
        :param hard_code_batch_size: whether hard code batch size for training. Default is False.
        :param session_config: tensorflow session configuration for training.
               Should be object of tf.ConfigProto
        :param checkpoint_trigger: when to trigger checkpoint during training.
               Should be bigdl optimizer trigger, like EveryEpoch(),
               SeveralIteration(num_iterations),etc.
        :return: this estimator.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in training"
            assert labels_cols is not None, \
                "label columns is None; it should not be None in training"

        if isinstance(data, tf.data.Dataset):
            assert isinstance(data.element_spec, tuple), \
                "If data is tf.data.Dataset, each element should be " \
                "(feature tensors, label tensor), where each feature/label tensor can be " \
                "either a single tensor or a tuple of tensors"
            if validation_data is not None:
                assert isinstance(validation_data, tf.data.Dataset), \
                    "train data and validation data should be both tf.data.Dataset"
                assert isinstance(validation_data.element_spec, tuple), \
                    "If validation_data is tf.data.Dataset, each element should be " \
                    "(feature tensors, label tensor), where each feature/label tensor can be " \
                    "either a single tensor or a tuple of tensors"

        dataset = to_dataset(data, batch_size=batch_size, batch_per_thread=-1,
                             validation_data=validation_data,
                             feature_cols=feature_cols, labels_cols=labels_cols,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=False, shuffle=True)

        self.tf_optimizer = TFOptimizer.from_keras(self.model.model, dataset,
                                                   model_dir=self.model.model_dir,
                                                   session_config=session_config,
                                                   metrics=self.metrics)

        if self.load_checkpoint:
            # NOTE(review): checkpoint_path / checkpoint_version are not set in this class;
            # presumably whoever sets load_checkpoint to True also sets them - confirm.
            self.tf_optimizer.load_checkpoint(self.checkpoint_path, self.checkpoint_version)

        if self.log_dir and self.app_name:
            # Was misspelled "set_tensorboad", which would fail with AttributeError as soon
            # as tensorboard logging was configured.
            self.tf_optimizer.estimator.set_tensorboard(self.log_dir, self.app_name)

        self.tf_optimizer.optimize(MaxEpoch(epochs), checkpoint_trigger=checkpoint_trigger)

        return self

    def predict(self, data, batch_size=4,
                feature_cols=None,
                hard_code_batch_size=False):
        """
        Predict input data

        :param data: data to be predicted. It can be XShards, Spark DataFrame, or tf.data.Dataset.
               If data is XShards, each element needs to be {'x': a feature numpy array
               or a tuple of feature numpy arrays}.
               If data is tf.data.Dataset, each element is feature tensor tuple
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param hard_code_batch_size: if require hard code batch size for prediction.
               The default value is False.
        :return: predicted result.
                 If input data is XShards or tf.data.Dataset, the predict result is also a
                 XShards, and the schema for each result is: {'prediction': predicted numpy
                 array or list of predicted numpy arrays}.
                 If input data is Spark DataFrame, the predict result is a DataFrame which
                 includes original columns plus 'prediction' column. The 'prediction' column
                 can be FloatType, VectorUDT or Array of VectorUDT depending on model outputs
                 shape.
                 For any other input the raw prediction rdd is returned.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"

        dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols, labels_cols=None,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=True,
                             shuffle=False)

        predicted_rdd = self.model.predict(dataset, batch_size)
        if isinstance(data, DataFrame):
            return convert_predict_to_dataframe(data, predicted_rdd)
        elif isinstance(data, (SparkXShards, tf.data.Dataset)):
            return convert_predict_to_xshard(predicted_rdd)
        else:
            return predicted_rdd

    def evaluate(self, data, batch_size=4,
                 feature_cols=None,
                 labels_cols=None,
                 hard_code_batch_size=False):
        """
        Evaluate model.

        :param data: evaluation data. It can be XShards, Spark DataFrame, tf.data.Dataset.
               If data is XShards, each element needs to be {'x': a feature numpy array
               or a tuple of feature numpy arrays, 'y': a label numpy array or a tuple of
               label numpy arrays}
               If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor
               tuple]
        :param batch_size: batch size per thread.
        :param feature_cols: feature column names if evaluation data is Spark DataFrame.
        :param labels_cols: label column names if evaluation data is Spark DataFrame.
        :param hard_code_batch_size: whether to hard code batch size for evaluation.
        :return: evaluation result as a dictionary of {'metric name': metric value}
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in evaluation"
            assert labels_cols is not None, \
                "label columns is None; it should not be None in evaluation"

        dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols, labels_cols=labels_cols,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=True,
                             shuffle=False)

        return self.model.evaluate(dataset, batch_per_thread=batch_size)

    def save_keras_model(self, path):
        """
        Save the wrapped keras model by delegating to KerasModel.save_model.

        :param path: keras model save path.
        """
        self.model.save_model(path)
class KerasEstimator(Estimator):
    """
    Estimator that wraps a keras model in a KerasModel and drives training through
    TFOptimizer.from_keras, and prediction/evaluation through the wrapped KerasModel.
    """

    def __init__(self, keras_model, metrics, model_dir, optimizer):
        """
        Create the estimator.

        :param keras_model: keras model; it is wrapped, together with model_dir, in a KerasModel.
        :param metrics: metrics forwarded to TFOptimizer.from_keras when fit is called.
        :param model_dir: model directory handed to KerasModel and to TFOptimizer.from_keras.
        :param optimizer: optimizer forwarded to TFOptimizer.from_keras. If it is a
               zoo.orca.learn.optimizers.Optimizer it is unwrapped with get_optimizer().
               May be None.
        """
        self.model = KerasModel(keras_model, model_dir)
        self.load_checkpoint = False
        self.metrics = metrics
        self.tf_optimizer = None
        self.optimizer = optimizer
        # Imported here, as in the original code, rather than at module level.
        from zoo.orca.learn.optimizers import Optimizer
        if self.optimizer is not None and isinstance(self.optimizer, Optimizer):
            self.optimizer = self.optimizer.get_optimizer()
        self.log_dir = None
        self.app_name = None
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def fit(self, data,
            epochs=1,
            batch_size=32,
            feature_cols=None,
            labels_cols=None,
            validation_data=None,
            hard_code_batch_size=False,
            session_config=None,
            checkpoint_trigger=None,
            auto_shard_files=True
            ):
        """
        Train this keras model with train data.

        :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
               If data is XShards, each element needs to be {'x': a feature numpy array
               or a tuple of feature numpy arrays, 'y': a label numpy array or a tuple of
               label numpy arrays}
               If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor
               tuple]
        :param epochs: number of epochs to train.
        :param batch_size: total batch size for each iteration.
        :param feature_cols: feature column names if train data is Spark DataFrame.
        :param labels_cols: label column names if train data is Spark DataFrame.
        :param validation_data: validation data. Validation data type should be the same
               as train data. May be None.
        :param hard_code_batch_size: whether hard code batch size for training. Default is False.
        :param session_config: tensorflow session configuration for training.
               Should be object of tf.ConfigProto
        :param checkpoint_trigger: when to trigger checkpoint during training.
               Should be a zoo.orca.learn.trigger, like EveryEpoch(),
               SeveralIteration(num_iterations),etc.
        :param auto_shard_files: forwarded unchanged to to_dataset. Default is True.
        :return: this estimator.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in training"
            assert labels_cols is not None, \
                "label columns is None; it should not be None in training"

        if isinstance(data, tf.data.Dataset):
            assert isinstance(data.element_spec, tuple), \
                "If data is tf.data.Dataset, each element should be " \
                "(feature tensors, label tensor), where each feature/label tensor can be " \
                "either a single tensor or a tuple of tensors"
            if validation_data is not None:
                assert isinstance(validation_data, tf.data.Dataset), \
                    "train data and validation data should be both tf.data.Dataset"
                assert isinstance(validation_data.element_spec, tuple), \
                    "If validation_data is tf.data.Dataset, each element should be " \
                    "(feature tensors, label tensor), where each feature/label tensor can be " \
                    "either a single tensor or a tuple of tensors"

        if checkpoint_trigger is not None:
            checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger)

        if is_tf_data_dataset(data):
            data = data.map(_standardize_keras_target_data)
            # validation_data is optional; mapping None would raise AttributeError.
            if validation_data is not None:
                validation_data = validation_data.map(_standardize_keras_target_data)

        memory_type = OrcaContext.train_data_store
        dataset = to_dataset(data, batch_size=batch_size, batch_per_thread=-1,
                             validation_data=validation_data,
                             feature_cols=feature_cols, labels_cols=labels_cols,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=False, shuffle=True,
                             auto_shard_files=auto_shard_files,
                             memory_type=memory_type)

        self.tf_optimizer = TFOptimizer.from_keras(self.model.model, dataset,
                                                   model_dir=self.model.model_dir,
                                                   session_config=session_config,
                                                   metrics=self.metrics,
                                                   optimizer=self.optimizer)

        if self.clip_norm:
            self.tf_optimizer.set_gradient_clipping_by_l2_norm(clip_norm=self.clip_norm)
        if self.clip_min and self.clip_max:
            self.tf_optimizer.set_constant_gradient_clipping(self.clip_min, self.clip_max)

        if self.load_checkpoint:
            # NOTE(review): checkpoint_path / checkpoint_version are not set in this class;
            # presumably whoever sets load_checkpoint to True also sets them - confirm.
            self.tf_optimizer.load_checkpoint(self.checkpoint_path, self.checkpoint_version)

        if self.log_dir and self.app_name:
            self.tf_optimizer.estimator.set_tensorboard(self.log_dir, self.app_name)

        self.tf_optimizer.optimize(MaxEpoch(epochs), checkpoint_trigger=checkpoint_trigger)

        return self

    def predict(self, data, batch_size=4,
                feature_cols=None,
                hard_code_batch_size=False,
                auto_shard_files=False,
                ):
        """
        Predict input data

        :param data: data to be predicted. It can be XShards, Spark DataFrame, or tf.data.Dataset.
               If data is XShards, each element needs to be {'x': a feature numpy array
               or a tuple of feature numpy arrays}.
               If data is tf.data.Dataset, each element is feature tensor tuple
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param hard_code_batch_size: if require hard code batch size for prediction.
               The default value is False.
        :param auto_shard_files: forwarded unchanged to to_dataset. Default is False.
        :return: predicted result.
                 If input data is XShards or tf.data.Dataset, the predict result is also a
                 XShards, and the schema for each result is: {'prediction': predicted numpy
                 array or list of predicted numpy arrays}.
                 If input data is Spark DataFrame, the predict result is a DataFrame which
                 includes original columns plus 'prediction' column. The 'prediction' column
                 can be FloatType, VectorUDT or Array of VectorUDT depending on model outputs
                 shape.
                 For any other input the raw prediction rdd is returned.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"

        dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols, labels_cols=None,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=True,
                             shuffle=False,
                             auto_shard_files=auto_shard_files,
                             )

        predicted_rdd = self.model.predict(dataset, batch_size)
        if isinstance(data, DataFrame):
            return convert_predict_to_dataframe(data, predicted_rdd)
        elif isinstance(data, (SparkXShards, tf.data.Dataset)):
            return convert_predict_to_xshard(predicted_rdd)
        else:
            return predicted_rdd

    def evaluate(self, data, batch_size=32,
                 feature_cols=None,
                 labels_cols=None,
                 hard_code_batch_size=False,
                 auto_shard_files=False
                 ):
        """
        Evaluate model.

        :param data: evaluation data. It can be XShards, Spark DataFrame, tf.data.Dataset.
               If data is XShards, each element needs to be {'x': a feature numpy array
               or a tuple of feature numpy arrays, 'y': a label numpy array or a tuple of
               label numpy arrays}
               If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor
               tuple]
        :param batch_size: batch size per thread.
        :param feature_cols: feature column names if evaluation data is Spark DataFrame.
        :param labels_cols: label column names if evaluation data is Spark DataFrame.
        :param hard_code_batch_size: whether to hard code batch size for evaluation.
        :param auto_shard_files: forwarded unchanged to to_dataset. Default is False.
        :return: evaluation result as a dictionary of {'metric name': metric value}
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in evaluation"
            assert labels_cols is not None, \
                "label columns is None; it should not be None in evaluation"

        dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols, labels_cols=labels_cols,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=True,
                             shuffle=False,
                             auto_shard_files=auto_shard_files
                             )

        return self.model.evaluate(dataset, batch_per_thread=batch_size)

    def save_keras_model(self, path, overwrite=True):
        """
        Save the wrapped keras model by delegating to KerasModel.save_model.

        :param path: keras model save path.
        :param overwrite: forwarded to KerasModel.save_model.
        """
        self.model.save_model(path, overwrite=overwrite)

    def get_model(self):
        """
        Get the keras model held by the wrapped KerasModel.

        :return: the keras model.
        """
        return self.model.model

    def save(self, model_path, overwrite=True):
        """
        Save model to model_path. Delegates to save_keras_model.

        :param model_path: path to save the model.
        :param overwrite: forwarded to save_keras_model.
        """
        # The caller's choice is forwarded; it used to be hard-coded to True.
        self.save_keras_model(model_path, overwrite=overwrite)

    def clear_gradient_clipping(self):
        """
        Reset all gradient clipping settings to None.
        fit only reads these settings when it is called, so call this before fit.
        """
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def set_constant_gradient_clipping(self, min, max):
        """
        Store constant gradient clipping bounds; fit applies them to the optimizer.

        :param min: the minimum value to clip by. Must be larger than 0 and smaller than max.
        :param max: the maximum value to clip by.
        """
        assert min > 0, "clip value should be larger than 0"
        assert min < max, "clip max should be larger than clip min"
        self.clip_min = min
        self.clip_max = max

    def set_l2_norm_gradient_clipping(self, clip_norm):
        """
        Store the L2-norm gradient clipping threshold; fit applies it to the optimizer.

        :param clip_norm: gradient L2-Norm threshold.
        """
        self.clip_norm = clip_norm

    def save_keras_weights(self, filepath, overwrite=True, save_format=None):
        """
        Save the model weights by delegating to KerasModel.save_weights.

        :param filepath: keras model weights save path.
        :param overwrite: forwarded to KerasModel.save_weights.
        :param save_format: forwarded to KerasModel.save_weights.
        """
        self.model.save_weights(filepath, overwrite, save_format)

    def load_keras_weights(self, filepath, by_name=False):
        """
        Load model weights by delegating to KerasModel.load_weights.

        :param filepath: keras model weights save path.
        :param by_name: forwarded to KerasModel.load_weights.
        """
        self.model.load_weights(filepath, by_name)
class TFKerasWrapper(Estimator):
    """
    Thin Estimator facade over a KerasModel: every call converts the input with
    to_dataset and then delegates to the matching KerasModel method.
    """

    def __init__(self, keras_model, model_dir):
        """
        :param keras_model: keras model, wrapped together with model_dir in a KerasModel.
        :param model_dir: model directory handed to KerasModel.
        """
        self.model = KerasModel(keras_model, model_dir)

    def fit(self, data,
            epochs=1,
            batch_size=32,
            feature_cols=None,
            labels_cols=None,
            validation_data=None,
            hard_code_batch_size=False,
            session_config=None):
        """
        Train the wrapped model on data and return this estimator.

        feature_cols and labels_cols are required when data is a DataFrame.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, (
                "feature columns is None; it should not be None in training")
            assert labels_cols is not None, (
                "label columns is None; it should not be None in training")

        # Training uses the global batch size, shuffled and without sequential order.
        conversion_args = {
            "batch_size": batch_size,
            "batch_per_thread": -1,
            "validation_data": validation_data,
            "feature_cols": feature_cols,
            "labels_cols": labels_cols,
            "hard_code_batch_size": hard_code_batch_size,
            "sequential_order": False,
            "shuffle": True,
        }
        train_set = to_dataset(data, **conversion_args)

        self.model.fit(train_set,
                       batch_size=batch_size,
                       epochs=epochs,
                       session_config=session_config)
        return self

    def predict(self, data, batch_size=4,
                feature_cols=None,
                hard_code_batch_size=False):
        """
        Run prediction on data.

        feature_cols is required when data is a DataFrame. A DataFrame input yields the
        result of convert_predict_to_dataframe; any other input yields the raw
        prediction rdd.
        """
        input_is_dataframe = isinstance(data, DataFrame)
        if input_is_dataframe:
            assert feature_cols is not None, (
                "feature columns is None; it should not be None in prediction")

        # Inference uses a per-thread batch size, in sequential order, unshuffled.
        conversion_args = {
            "batch_size": -1,
            "batch_per_thread": batch_size,
            "validation_data": None,
            "feature_cols": feature_cols,
            "labels_cols": None,
            "hard_code_batch_size": hard_code_batch_size,
            "sequential_order": True,
            "shuffle": False,
        }
        inference_set = to_dataset(data, **conversion_args)

        predictions = self.model.predict(inference_set, batch_size)
        if not input_is_dataframe:
            return predictions
        return convert_predict_to_dataframe(data, predictions)

    def evaluate(self, data, batch_size=4,
                 feature_cols=None,
                 labels_cols=None,
                 hard_code_batch_size=False):
        """
        Evaluate the wrapped model on data and return KerasModel.evaluate's result.

        feature_cols and labels_cols are required when data is a DataFrame.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, (
                "feature columns is None; it should not be None in evaluation")
            assert labels_cols is not None, (
                "label columns is None; it should not be None in evaluation")

        # Evaluation uses a per-thread batch size, in sequential order, unshuffled.
        conversion_args = {
            "batch_size": -1,
            "batch_per_thread": batch_size,
            "validation_data": None,
            "feature_cols": feature_cols,
            "labels_cols": labels_cols,
            "hard_code_batch_size": hard_code_batch_size,
            "sequential_order": True,
            "shuffle": False,
        }
        eval_set = to_dataset(data, **conversion_args)

        return self.model.evaluate(eval_set, batch_per_thread=batch_size)