def create_image_model(self):
    """
    Build and compile a minimal image classifier and wrap it in a KerasModel.

    The network takes a 224x224x3 input, flattens it and feeds it to a single
    10-unit softmax Dense layer. It is compiled with the 'rmsprop' optimizer,
    sparse categorical cross-entropy loss and the accuracy metric.

    :return: a KerasModel wrapping the compiled tf.keras model.
    """
    keras_layers = tf.keras.layers

    image_input = keras_layers.Input(shape=[224, 224, 3])
    flattened = keras_layers.Flatten()(image_input)
    class_probs = keras_layers.Dense(10, activation='softmax')(flattened)

    net = tf.keras.models.Model(inputs=image_input, outputs=class_probs)
    net.compile(
        optimizer='rmsprop',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return KerasModel(net)
def main(max_epoch):
    """
    Train, evaluate and save a small MNIST classifier fed from RDD-backed TFDatasets.

    :param max_epoch: number of epochs passed to ``KerasModel.fit``.
    """
    sc = init_nncontext()

    train_rdd = get_data_rdd("train", sc)
    test_rdd = get_data_rdd("test", sc)

    # Training dataset; the test RDD is also supplied as validation data.
    train_dataset = TFDataset.from_rdd(
        train_rdd,
        features=(tf.float32, [28, 28, 1]),
        labels=(tf.int32, []),
        batch_size=320,
        val_rdd=test_rdd,
    )

    layer_stack = [
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ]
    model = tf.keras.Sequential(layer_stack)
    model.compile(
        optimizer='rmsprop',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    keras_model = KerasModel(model)
    keras_model.fit(train_dataset, epochs=max_epoch, distributed=True)

    # Evaluation uses a per-thread batch size instead of a global batch size.
    eval_dataset = TFDataset.from_rdd(
        test_rdd,
        features=(tf.float32, [28, 28, 1]),
        labels=(tf.int32, []),
        batch_per_thread=80,
    )
    result = keras_model.evaluate(eval_dataset)

    print(model.metrics_names)
    print(result)
    # >> ['loss', 'acc']
    # >> [0.08865142822265625, 0.9722]

    model.save_weights("/tmp/mnist_keras.h5")
def check_dataset(self, create_ds):
    """
    Run fit, predict and evaluate against datasets produced by ``create_ds``.

    :param create_ds: callable invoked with "train", "predict" and "evaluate"
           (in that order); each call must return the dataset for that step.
    """
    layer_stack = [
        tf.keras.layers.Flatten(input_shape=(20, )),
        tf.keras.layers.Dense(10, activation="softmax"),
    ]
    seq = tf.keras.Sequential(layer_stack)
    seq.compile(
        optimizer=tf.keras.optimizers.RMSprop(),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    wrapped = KerasModel(seq)

    wrapped.fit(create_ds("train"))

    # Predictions are collected so that the prediction job actually executes.
    predictions = wrapped.predict(create_ds("predict"))
    predictions.collect()

    wrapped.evaluate(create_ds("evaluate"))
def test_dataset_without_batch(self):
    """
    Check that fit, evaluate and predict each reject a TFDataset that was
    created without the batch setting that step requires.
    """
    features = np.random.rand(20, 10)
    targets = np.random.randint(0, 2, 20)
    feature_rdd = self.sc.parallelize(features)
    target_rdd = self.sc.parallelize(targets)
    paired_rdd = feature_rdd.zip(target_rdd)

    # No batch_size given: fit must refuse this dataset.
    fit_dataset = TFDataset.from_rdd(
        paired_rdd,
        features=(tf.float32, [10]),
        labels=(tf.int32, []),
        names=["features", "labels"],
        val_rdd=paired_rdd,
    )
    model = KerasModel(self.create_model())
    self.intercept(
        lambda: model.fit(fit_dataset),
        "The batch_size of TFDataset must be"
        " specified when used in KerasModel fit.",
    )

    # No batch_per_thread given: evaluate must refuse this dataset.
    eval_dataset = TFDataset.from_rdd(
        paired_rdd,
        features=(tf.float32, [10]),
        labels=(tf.int32, []),
        names=["features", "labels"],
    )
    self.intercept(
        lambda: model.evaluate(eval_dataset),
        "The batch_per_thread of TFDataset must be "
        "specified when used in KerasModel evaluate.",
    )

    # Feature-only dataset without batch_per_thread: predict must refuse it.
    predict_dataset = TFDataset.from_rdd(
        feature_rdd,
        features=(tf.float32, [10]),
        names=["features", "labels"],
    )
    self.intercept(
        lambda: model.predict(predict_dataset),
        "The batch_per_thread of TFDataset must be"
        " specified when used in KerasModel predict.",
    )
def main(max_epoch):
    """
    Train, evaluate and save a small MNIST classifier fed from in-memory arrays.

    :param max_epoch: number of epochs passed to ``KerasModel.fit``.
    """
    _ = init_nncontext()

    train_images, train_labels = mnist.read_data_sets("/tmp/mnist", "train")
    test_images, test_labels = mnist.read_data_sets("/tmp/mnist", "test")

    # Both splits are normalised with the training-set statistics.
    train_images = (train_images - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    test_images = (test_images - mnist.TRAIN_MEAN) / mnist.TRAIN_STD

    layer_stack = [
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ]
    model = tf.keras.Sequential(layer_stack)
    model.compile(
        optimizer='rmsprop',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    keras_model = KerasModel(model)
    keras_model.fit(
        train_images,
        train_labels,
        validation_data=(test_images, test_labels),
        epochs=max_epoch,
        batch_size=320,
        distributed=True,
    )

    result = keras_model.evaluate(
        test_images,
        test_labels,
        distributed=True,
        batch_per_thread=80,
    )
    print(result)
    # >> [0.08865142822265625, 0.9722]

    # the following assert is used for internal testing
    assert result['acc Top1Accuracy'] > 0.95

    keras_model.save_weights("/tmp/mnist_keras.h5")
def test_evaluate_with_ndarray(self):
    """
    Check that training on ndarray data lowers the "loss" value reported by
    evaluate on the same data.
    """
    model = KerasModel(self.create_model())
    x, y = self.create_training_data()

    loss_before = model.evaluate(x, y)["loss"]
    model.fit(x, y, batch_size=4, epochs=10)
    loss_after = model.evaluate(x, y)["loss"]

    assert loss_before > loss_after
def test_evaluate_with_ndarray_distributed(self):
    """
    Check that training on ndarray data lowers the first value reported by
    evaluate, comparing a local evaluation before training with a
    distributed evaluation afterwards.
    """
    model = KerasModel(self.create_model())
    x, y = self.create_training_data()

    # NOTE(review): results are indexed positionally here; element 0 is
    # presumably the loss -- confirm against KerasModel.evaluate.
    first_before = model.evaluate(x, y)[0]
    model.fit(x, y, batch_size=4, epochs=10)
    first_after = model.evaluate(x, y, distributed=True)[0]

    assert first_before > first_after
def test_invalid_data_handling(self):
    """
    Check that distributed fit raises AssertionError, with the expected
    message, when the supplied data does not match the model's inputs/outputs.
    """
    model = KerasModel(self.create_multi_input_output_model())
    x, y = self.create_training_data()
    val_x, val_y = self.create_training_data()

    # Number doesn't match
    with pytest.raises(AssertionError) as mismatch_info:
        model.fit([x, x], [y, y, y], batch_size=4, distributed=True)
    mismatch_message = str(mismatch_info.value)
    assert "model_target number does not match data number" in mismatch_message

    # Dict as input
    with pytest.raises(AssertionError) as dict_info:
        model.fit({"input_1": x}, [y, y], batch_size=4, distributed=True)
    dict_message = str(dict_info.value)
    assert "all model_input names should exist in data" in dict_message
def test_tfdataset_with_tf_data_dataset(self):
    """
    Check that KerasModel can fit and evaluate on TFDatasets created from
    tf.data.Dataset objects.
    """

    def random_image_dataset():
        # 102 random 28x28x1 samples with integer labels drawn from [0, 10);
        # features are cast to float.
        images = np.random.randn(102, 28, 28, 1)
        labels = np.random.randint(0, 10, size=(102, ))
        raw = tf.data.Dataset.from_tensor_slices((images, labels))
        return raw.map(lambda feature, label: (tf.to_float(feature), label))

    train_dataset = TFDataset.from_tf_data_dataset(random_image_dataset(),
                                                   batch_size=16)

    layer_stack = [
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(10, activation="softmax"),
    ]
    seq = tf.keras.Sequential(layer_stack)
    seq.compile(
        optimizer=tf.keras.optimizers.RMSprop(),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    model = KerasModel(seq)
    model.fit(train_dataset)

    eval_dataset = TFDataset.from_tf_data_dataset(random_image_dataset(),
                                                  batch_per_thread=16)
    model.evaluate(eval_dataset)
def test_gradient_clipping(self):
    """
    Check that a very small ``clipvalue`` keeps the weights essentially
    unchanged after training.

    The model is compiled with SGD(lr=1, clipvalue=1e-8), trained on the
    dataset from ``create_training_dataset`` and the first weight array is
    compared element-wise with its pre-training value.
    """
    data = tf.keras.layers.Input(shape=[10])

    x = tf.keras.layers.Flatten()(data)
    x = tf.keras.layers.Dense(10, activation='relu')(x)
    predictions = tf.keras.layers.Dense(2, activation='softmax')(x)

    model = tf.keras.models.Model(inputs=data, outputs=predictions)
    model.compile(optimizer=tf.keras.optimizers.SGD(lr=1, clipvalue=1e-8),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model = KerasModel(model)

    pre_weights = model.get_weights()

    dataset = self.create_training_dataset()

    # 5 iterations
    model.fit(dataset)

    current_weight = model.get_weights()

    # The comparison result used to be computed and discarded, so the test
    # could never fail on it; assert it so the clipping is actually verified.
    assert np.all(np.abs(current_weight[0] - pre_weights[0]) < 1e-7)
class KerasEstimator(Estimator):
    """
    Estimator that trains, evaluates and runs prediction with a tf.keras model
    wrapped in a ``KerasModel``.
    """

    def __init__(self, keras_model, metrics, model_dir, optimizer):
        """
        :param keras_model: the tf.keras model to wrap.
        :param metrics: metrics forwarded to ``TFOptimizer.from_keras`` in fit.
        :param model_dir: model directory passed to ``KerasModel``.
        :param optimizer: optimizer forwarded to ``TFOptimizer.from_keras``. If it is a
               ``zoo.orca.learn.optimizers.Optimizer``, it is replaced by the result of
               its ``get_optimizer()``. May be None.
        """
        self.model = KerasModel(keras_model, model_dir)
        self.load_checkpoint = False
        self.metrics = metrics
        self.tf_optimizer = None
        self.optimizer = optimizer
        # Imported locally, as in the original code.
        from zoo.orca.learn.optimizers import Optimizer
        if self.optimizer is not None and isinstance(self.optimizer, Optimizer):
            self.optimizer = self.optimizer.get_optimizer()
        self.log_dir = None
        self.app_name = None
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def fit(self, data,
            epochs=1,
            batch_size=32,
            feature_cols=None,
            label_cols=None,
            validation_data=None,
            session_config=None,
            checkpoint_trigger=None,
            auto_shard_files=True):
        """
        Train this keras model with train data.

        :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
               If data is XShards, each partition can be Pandas Dataframe or a dictionary of
               {'x': feature, 'y': label}, where feature(label) is a numpy array or a tuple of
               numpy arrays.
               If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor
               tuple]
        :param epochs: number of epochs to train.
        :param batch_size: total batch size for each iteration.
        :param feature_cols: feature column names if train data is Spark DataFrame or XShards
               of Pandas DataFrame.
        :param label_cols: label column names if train data is Spark DataFrame or XShards of
               Pandas DataFrame.
        :param validation_data: validation data. Validation data type should be the same
               as train data. May be None.
        :param session_config: tensorflow session configuration for training.
               Should be object of tf.ConfigProto
        :param checkpoint_trigger: when to trigger checkpoint during training.
               Should be a zoo.orca.learn.trigger, like EveryEpoch(),
               SeveralIteration(num_iterations), etc.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based
               and apply sharding on files, otherwise sharding on records. Default is True.
        :return: this estimator.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in training"
            assert label_cols is not None, \
                "label columns is None; it should not be None in training"

        if isinstance(data, tf.data.Dataset):
            assert isinstance(data.element_spec, tuple), \
                "If data is tf.data.Dataset, each element should be " \
                "(feature tensors, label tensor), where each feature/label tensor can be " \
                "either a single tensor or a tuple of tensors"
            if validation_data is not None:
                assert isinstance(validation_data, tf.data.Dataset), \
                    "train data and validation data should be both tf.data.Dataset"
                assert isinstance(validation_data.element_spec, tuple), \
                    "If validation_data is tf.data.Dataset, each element should be " \
                    "(feature tensors, label tensor), where each feature/label tensor can be " \
                    "either a single tensor or a tuple of tensors"

        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                assert feature_cols is not None, \
                    "feature columns is None; it should not be None in training"
                assert label_cols is not None, \
                    "label columns is None; it should not be None in training"
                data, validation_data = process_xshards_of_pandas_dataframe(
                    data, feature_cols, label_cols, validation_data, "fit")

        if checkpoint_trigger is not None:
            checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger)

        if is_tf_data_dataset(data):
            data = data.map(_standardize_keras_target_data)
            # validation_data is optional; calling .map on None would raise
            # AttributeError, so only standardize it when it was supplied.
            if validation_data is not None:
                validation_data = validation_data.map(
                    _standardize_keras_target_data)

        memory_type = OrcaContext.train_data_store
        dataset = to_dataset(data,
                             batch_size=batch_size,
                             batch_per_thread=-1,
                             validation_data=validation_data,
                             feature_cols=feature_cols,
                             label_cols=label_cols,
                             hard_code_batch_size=False,
                             sequential_order=False,
                             shuffle=True,
                             auto_shard_files=auto_shard_files,
                             memory_type=memory_type)

        self.tf_optimizer = TFOptimizer.from_keras(
            self.model.model,
            dataset,
            model_dir=self.model.model_dir,
            session_config=session_config,
            metrics=self.metrics,
            optimizer=self.optimizer)

        if self.clip_norm:
            self.tf_optimizer.set_gradient_clipping_by_l2_norm(
                clip_norm=self.clip_norm)
        if self.clip_min and self.clip_max:
            self.tf_optimizer.set_constant_gradient_clipping(
                self.clip_min, self.clip_max)

        if self.load_checkpoint:
            # checkpoint_path / checkpoint_version are not assigned in this class;
            # presumably they are set elsewhere together with load_checkpoint.
            self.tf_optimizer.load_checkpoint(self.checkpoint_path,
                                              self.checkpoint_version)

        if self.log_dir and self.app_name:
            self.tf_optimizer.estimator.set_tensorboard(
                self.log_dir, self.app_name)

        self.tf_optimizer.optimize(MaxEpoch(epochs),
                                   checkpoint_trigger=checkpoint_trigger)

        return self

    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        auto_shard_files=False,
    ):
        """
        Predict input data

        :param data: data to be predicted. It can be XShards or Spark DataFrame.
               If data is XShards, each partition can be Pandas Dataframe or a dictionary of
               {'x': feature}, where feature is a numpy array or a tuple of numpy arrays.
               A tf.data.Dataset is currently rejected with an AssertionError.
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame
               or XShards of Pandas DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based
               and apply sharding on files, otherwise sharding on records. Default is False.
        :return: predicted result.
                 If input data is XShards, the predict result is also a XShards,
                 and the schema for each result is: {'prediction': predicted numpy array or
                 list of predicted numpy arrays}.
                 If input data is Spark DataFrame, the predict result is a DataFrame which
                 includes original columns plus 'prediction' column. The 'prediction' column
                 can be FloatType, VectorUDT or Array of VectorUDT depending on model outputs
                 shape.
                 For any other input the raw prediction result of the wrapped model is
                 returned.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"

        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                assert feature_cols is not None, \
                    "feature columns is None; it should not be None in prediction"
                data = process_xshards_of_pandas_dataframe(data, feature_cols)

        # The two literals are concatenated; the trailing space keeps the
        # message readable ("... be used for estimator prediction").
        assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used for " \
                                             "estimator prediction"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=None,
            hard_code_batch_size=False,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        predicted_rdd = self.model.predict(dataset, batch_size)
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd

    def evaluate(self, data, batch_size=32,
                 feature_cols=None,
                 label_cols=None,
                 auto_shard_files=False):
        """
        Evaluate model.

        :param data: evaluation data. It can be XShards, Spark DataFrame, tf.data.Dataset.
               If data is XShards, each partition can be Pandas Dataframe or a dictionary of
               {'x': feature, 'y': label}, where feature(label) is a numpy array or a tuple of
               numpy arrays.
               If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor
               tuple]
        :param batch_size: batch size per thread.
        :param feature_cols: feature column names if evaluation data is Spark DataFrame or
               XShards of Pandas DataFrame.
        :param label_cols: label column names if evaluation data is Spark DataFrame or XShards
               of Pandas DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based
               and apply sharding on files, otherwise sharding on records. Default is False.
        :return: evaluation result as a dictionary of {'metric name': metric value}
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in evaluation"
            assert label_cols is not None, \
                "label columns is None; it should not be None in evaluation"

        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                assert feature_cols is not None, \
                    "feature columns is None; it should not be None in evaluation"
                assert label_cols is not None, \
                    "label columns is None; it should not be None in evaluation"
                data = process_xshards_of_pandas_dataframe(
                    data, feature_cols, label_cols)

        dataset = to_dataset(data,
                             batch_size=-1,
                             batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols,
                             label_cols=label_cols,
                             hard_code_batch_size=False,
                             sequential_order=True,
                             shuffle=False,
                             auto_shard_files=auto_shard_files)

        return self.model.evaluate(dataset, batch_per_thread=batch_size)

    def save_keras_model(self, path, overwrite=True):
        """
        Save tensorflow keras model in this estimator.

        :param path: keras model save path.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        """
        self.model.save_model(path, overwrite=overwrite)

    def get_model(self):
        """
        Get the trained Keras model

        :return: The trained Keras model
        """
        return self.model.model

    def save(self, model_path, overwrite=True):
        """
        Save model to model_path

        :param model_path: path to save the trained model.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        :return:
        """
        self.save_keras_model(model_path, overwrite=overwrite)

    def clear_gradient_clipping(self):
        """
        Clear gradient clipping parameters. In this case, gradient clipping will not be applied.
        In order to take effect, it needs to be called before fit.

        :return:
        """
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def set_constant_gradient_clipping(self, min, max):
        """
        Set constant gradient clipping during the training process.
        In order to take effect, it needs to be called before fit.

        :param min: The minimum value to clip by. Must be greater than 0 and smaller than max
               (both are enforced by assertions).
        :param max: The maximum value to clip by.
        :return:
        """
        assert min > 0, "clip value should be larger than 0"
        assert min < max, "clip max should be larger than clip min"
        self.clip_min = min
        self.clip_max = max

    def set_l2_norm_gradient_clipping(self, clip_norm):
        """
        Clip gradient to a maximum L2-Norm during the training process.
        In order to take effect, it needs to be called before fit.

        :param clip_norm: Gradient L2-Norm threshold.
        :return:
        """
        self.clip_norm = clip_norm

    def save_keras_weights(self, filepath, overwrite=True, save_format=None):
        """
        Save tensorflow keras model weights in this estimator.

        :param filepath: keras model weights save path.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        :param save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or
               '.keras' will default to HDF5 if `save_format` is `None`. Otherwise
               `None` defaults to 'tf'.
        """
        self.model.save_weights(filepath, overwrite, save_format)

    def load_keras_weights(self, filepath, by_name=False):
        """
        Load tensorflow keras model weights into this estimator.

        :param filepath: keras model weights save path.
        :param by_name: Boolean, whether to load weights by name or by topological
               order. Only topological loading is supported for weight files in
               TensorFlow format.
        """
        self.model.load_weights(filepath, by_name)
class TFKerasWrapper(Estimator):
    """
    Estimator wrapper that trains, evaluates and runs prediction with a
    tf.keras model held in a ``KerasModel``.
    """

    def __init__(self, keras_model, metrics, model_dir):
        """
        :param keras_model: the tf.keras model to wrap.
        :param metrics: metrics forwarded to ``TFOptimizer.from_keras`` in fit.
        :param model_dir: model directory passed to ``KerasModel``.
        """
        self.model = KerasModel(keras_model, model_dir)
        self.metrics = metrics
        self.tf_optimizer = None
        self.load_checkpoint = False
        self.log_dir = None
        self.app_name = None

    def fit(self, data,
            epochs=1,
            batch_size=32,
            feature_cols=None,
            labels_cols=None,
            validation_data=None,
            hard_code_batch_size=False,
            session_config=None,
            checkpoint_trigger=None):
        """
        Train the wrapped keras model.

        :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
               If data is XShards, each element needs to be {'x': a feature numpy array or a
               tuple of feature numpy arrays, 'y': a label numpy array or a tuple of label
               numpy arrays}
               If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor
               tuple]
        :param epochs: number of epochs to train.
        :param batch_size: total batch size for each iteration.
        :param feature_cols: feature column names if train data is Spark DataFrame.
        :param labels_cols: label column names if train data is Spark DataFrame.
        :param validation_data: validation data. Validation data type should be the same
               as train data.
        :param hard_code_batch_size: whether hard code batch size for training.
               Default is False.
        :param session_config: tensorflow session configuration for training.
               Should be object of tf.ConfigProto
        :param checkpoint_trigger: when to trigger checkpoint during training.
               Should be bigdl optimizer trigger, like EveryEpoch(),
               SeveralIteration(num_iterations), etc.
        :return: this estimator.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in training"
            assert labels_cols is not None, \
                "label columns is None; it should not be None in training"

        if isinstance(data, tf.data.Dataset):
            # Shared tail of the two element-structure messages below.
            element_hint = "(feature tensors, label tensor), where each feature/label " \
                           "tensor can be either a single tensor or a tuple of tensors"
            assert isinstance(data.element_spec, tuple), \
                "If data is tf.data.Dataset, each element should be " + element_hint
            if validation_data is not None:
                assert isinstance(validation_data, tf.data.Dataset), \
                    "train data and validation data should be both tf.data.Dataset"
                assert isinstance(validation_data.element_spec, tuple), \
                    "If validation_data is tf.data.Dataset, each element should be " \
                    + element_hint

        # Training: global batch size, shuffled, no fixed ordering.
        dataset_options = dict(
            batch_size=batch_size,
            batch_per_thread=-1,
            validation_data=validation_data,
            feature_cols=feature_cols,
            labels_cols=labels_cols,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=False,
            shuffle=True,
        )
        dataset = to_dataset(data, **dataset_options)

        keras_wrapper = self.model
        self.tf_optimizer = TFOptimizer.from_keras(keras_wrapper.model,
                                                   dataset,
                                                   model_dir=keras_wrapper.model_dir,
                                                   session_config=session_config,
                                                   metrics=self.metrics)

        if self.load_checkpoint:
            self.tf_optimizer.load_checkpoint(self.checkpoint_path, self.checkpoint_version)

        if self.log_dir and self.app_name:
            # NOTE(review): the method name is spelled `set_tensorboad` here;
            # confirm it matches the estimator API this wrapper targets.
            self.tf_optimizer.estimator.set_tensorboad(self.log_dir, self.app_name)

        self.tf_optimizer.optimize(MaxEpoch(epochs), checkpoint_trigger=checkpoint_trigger)

        return self

    def predict(self, data,
                batch_size=4,
                feature_cols=None,
                hard_code_batch_size=False):
        """
        Run prediction on the given data.

        :param data: data to be predicted. It can be XShards, Spark DataFrame, or
               tf.data.Dataset.
               If data is XShard, each element needs to be {'x': a feature numpy array
               or a tuple of feature numpy arrays}.
               If data is tf.data.Dataset, each element is feature tensor tuple
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param hard_code_batch_size: if require hard code batch size for prediction.
               The default value is False.
        :return: predicted result.
                 If input data is XShards or tf.data.Dataset, the predict result is also a
                 XShards, and the schema for each result is: {'prediction': predicted numpy
                 array or list of predicted numpy arrays}.
                 If input data is Spark DataFrame, the predict result is a DataFrame which
                 includes original columns plus 'prediction' column. The 'prediction' column
                 can be FloatType, VectorUDT or Array of VectorUDT depending on model outputs
                 shape.
        """
        is_dataframe = isinstance(data, DataFrame)
        if is_dataframe:
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"

        # Inference: per-thread batch size, original order preserved, no labels.
        dataset_options = dict(
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            labels_cols=None,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=True,
            shuffle=False,
        )
        dataset = to_dataset(data, **dataset_options)

        predicted_rdd = self.model.predict(dataset, batch_size)

        if is_dataframe:
            return convert_predict_to_dataframe(data, predicted_rdd)
        if isinstance(data, (SparkXShards, tf.data.Dataset)):
            return convert_predict_to_xshard(predicted_rdd)
        return predicted_rdd

    def evaluate(self, data,
                 batch_size=4,
                 feature_cols=None,
                 labels_cols=None,
                 hard_code_batch_size=False):
        """
        Evaluate the wrapped model.

        :param data: evaluation data. It can be XShards, Spark DataFrame, tf.data.Dataset.
               If data is XShards, each element needs to be {'x': a feature numpy array or a
               tuple of feature numpy arrays, 'y': a label numpy array or a tuple of label
               numpy arrays}
               If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor
               tuple]
        :param batch_size: batch size per thread.
        :param feature_cols: feature column names if evaluation data is Spark DataFrame.
        :param labels_cols: label column names if evaluation data is Spark DataFrame.
        :param hard_code_batch_size: whether to hard code batch size for evaluation.
        :return: evaluation result as a dictionary of {'metric name': metric value}
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in evaluation"
            assert labels_cols is not None, \
                "label columns is None; it should not be None in evaluation"

        # Evaluation: per-thread batch size, original order preserved.
        dataset_options = dict(
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            labels_cols=labels_cols,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=True,
            shuffle=False,
        )
        dataset = to_dataset(data, **dataset_options)

        return self.model.evaluate(dataset, batch_per_thread=batch_size)

    def save_keras_model(self, path):
        """
        Save the wrapped keras model.

        :param path: keras model save path.
        """
        self.model.save_model(path)
def test_tfdataset_with_tfrecord(self):
    """
    Check that KerasModel can fit on, and predict from, TFDatasets created
    from TFRecord files.
    """
    layer_stack = [
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(10, activation='softmax'),
    ]
    model = tf.keras.Sequential(layer_stack)
    model.compile(
        optimizer='rmsprop',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    keras_model = KerasModel(model)

    def parse_fn(example):
        """Decode one serialized example into a (feature, label) pair."""
        slim_decoder = tf.contrib.slim.tfexample_decoder

        keys_to_features = {
            'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
            'image/format': tf.FixedLenFeature((), tf.string, default_value='raw'),
            'image/class/label': tf.FixedLenFeature(
                [1], tf.int64, default_value=tf.zeros([1], dtype=tf.int64)),
        }
        items_to_handlers = {
            'image': slim_decoder.Image(shape=[28, 28, 1], channels=1),
            'label': slim_decoder.Tensor('image/class/label', shape=[]),
        }
        decoder = slim_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)

        decoded = decoder.decode(example)
        first, second = decoded[0], decoded[1]
        # Whichever decoded tensor has a non-empty shape is treated as the
        # feature; the other one is the label.
        if len(first.shape) > 0:
            return first, second
        return second, first

    train_path = os.path.join(resource_path, "tfrecord/mnist_train.tfrecord")
    test_path = os.path.join(resource_path, "tfrecord/mnist_test.tfrecord")

    train_dataset = TFDataset.from_tfrecord(
        train_path,
        parse_fn=parse_fn,
        batch_size=8,
        validation_file_path=test_path,
    )
    keras_model.fit(train_dataset)

    # Prediction only needs the feature, wrapped in a one-element tuple.
    predict_dataset = TFDataset.from_tfrecord(
        test_path,
        parse_fn=lambda x: (parse_fn(x)[0], ),
        batch_per_thread=1,
    )
    predictions = keras_model.predict(predict_dataset)
    predictions.collect()
class KerasEstimator(Estimator):
    """
    Estimator that trains, evaluates and runs prediction with a tf.keras model
    wrapped in a ``KerasModel``.
    """

    def __init__(self, keras_model, metrics, model_dir, optimizer):
        """
        :param keras_model: the tf.keras model to wrap.
        :param metrics: metrics forwarded to ``TFOptimizer.from_keras`` in fit.
        :param model_dir: model directory passed to ``KerasModel``.
        :param optimizer: optimizer forwarded to ``TFOptimizer.from_keras``. If it is a
               ``zoo.orca.learn.optimizers.Optimizer``, it is replaced by the result of
               its ``get_optimizer()``. May be None.
        """
        self.model = KerasModel(keras_model, model_dir)
        self.load_checkpoint = False
        self.metrics = metrics
        self.tf_optimizer = None
        self.optimizer = optimizer
        # Imported locally, as in the original code.
        from zoo.orca.learn.optimizers import Optimizer
        if self.optimizer is not None and isinstance(self.optimizer, Optimizer):
            self.optimizer = self.optimizer.get_optimizer()
        self.log_dir = None
        self.app_name = None
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def fit(self, data,
            epochs=1,
            batch_size=32,
            feature_cols=None,
            labels_cols=None,
            validation_data=None,
            hard_code_batch_size=False,
            session_config=None,
            checkpoint_trigger=None,
            auto_shard_files=True
            ):
        """
        Train this keras model with train data.

        :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
               If data is XShards, each element needs to be {'x': a feature numpy array or a
               tuple of feature numpy arrays, 'y': a label numpy array or a tuple of label
               numpy arrays}
               If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor
               tuple]
        :param epochs: number of epochs to train.
        :param batch_size: total batch size for each iteration.
        :param feature_cols: feature column names if train data is Spark DataFrame.
        :param labels_cols: label column names if train data is Spark DataFrame.
        :param validation_data: validation data. Validation data type should be the same
               as train data. May be None.
        :param hard_code_batch_size: whether hard code batch size for training.
               Default is False.
        :param session_config: tensorflow session configuration for training.
               Should be object of tf.ConfigProto
        :param checkpoint_trigger: when to trigger checkpoint during training.
               Should be a zoo.orca.learn.trigger, like EveryEpoch(),
               SeveralIteration(num_iterations), etc.
        :param auto_shard_files: forwarded to ``to_dataset``. Default is True.
        :return: this estimator.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in training"
            assert labels_cols is not None, \
                "label columns is None; it should not be None in training"

        if isinstance(data, tf.data.Dataset):
            assert isinstance(data.element_spec, tuple), \
                "If data is tf.data.Dataset, each element should be " \
                "(feature tensors, label tensor), where each feature/label tensor can be " \
                "either a single tensor or a tuple of tensors"
            if validation_data is not None:
                assert isinstance(validation_data, tf.data.Dataset), \
                    "train data and validation data should be both tf.data.Dataset"
                assert isinstance(validation_data.element_spec, tuple), \
                    "If validation_data is tf.data.Dataset, each element should be " \
                    "(feature tensors, label tensor), where each feature/label tensor can be " \
                    "either a single tensor or a tuple of tensors"

        if checkpoint_trigger is not None:
            checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger)

        if is_tf_data_dataset(data):
            data = data.map(_standardize_keras_target_data)
            # validation_data is optional; calling .map on None would raise
            # AttributeError, so only standardize it when it was supplied.
            if validation_data is not None:
                validation_data = validation_data.map(_standardize_keras_target_data)

        memory_type = OrcaContext.train_data_store
        dataset = to_dataset(data, batch_size=batch_size, batch_per_thread=-1,
                             validation_data=validation_data,
                             feature_cols=feature_cols, labels_cols=labels_cols,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=False, shuffle=True,
                             auto_shard_files=auto_shard_files,
                             memory_type=memory_type)

        self.tf_optimizer = TFOptimizer.from_keras(self.model.model, dataset,
                                                   model_dir=self.model.model_dir,
                                                   session_config=session_config,
                                                   metrics=self.metrics,
                                                   optimizer=self.optimizer)

        if self.clip_norm:
            self.tf_optimizer.set_gradient_clipping_by_l2_norm(clip_norm=self.clip_norm)
        if self.clip_min and self.clip_max:
            self.tf_optimizer.set_constant_gradient_clipping(self.clip_min, self.clip_max)

        if self.load_checkpoint:
            # checkpoint_path / checkpoint_version are not assigned in this class;
            # presumably they are set elsewhere together with load_checkpoint.
            self.tf_optimizer.load_checkpoint(self.checkpoint_path, self.checkpoint_version)

        if self.log_dir and self.app_name:
            self.tf_optimizer.estimator.set_tensorboard(self.log_dir, self.app_name)

        self.tf_optimizer.optimize(MaxEpoch(epochs), checkpoint_trigger=checkpoint_trigger)

        return self

    def predict(self, data, batch_size=4,
                feature_cols=None,
                hard_code_batch_size=False,
                auto_shard_files=False,
                ):
        """
        Predict input data

        :param data: data to be predicted. It can be XShards, Spark DataFrame, or
               tf.data.Dataset.
               If data is XShard, each element needs to be {'x': a feature numpy array
               or a tuple of feature numpy arrays}.
               If data is tf.data.Dataset, each element is feature tensor tuple
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param hard_code_batch_size: if require hard code batch size for prediction.
               The default value is False.
        :param auto_shard_files: forwarded to ``to_dataset``. Default is False.
        :return: predicted result.
                 If input data is XShards or tf.data.Dataset, the predict result is also a
                 XShards, and the schema for each result is: {'prediction': predicted numpy
                 array or list of predicted numpy arrays}.
                 If input data is Spark DataFrame, the predict result is a DataFrame which
                 includes original columns plus 'prediction' column. The 'prediction' column
                 can be FloatType, VectorUDT or Array of VectorUDT depending on model outputs
                 shape.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"

        dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols, labels_cols=None,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=True,
                             shuffle=False,
                             auto_shard_files=auto_shard_files,
                             )

        predicted_rdd = self.model.predict(dataset, batch_size)
        if isinstance(data, DataFrame):
            return convert_predict_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards) or isinstance(data, tf.data.Dataset):
            return convert_predict_to_xshard(predicted_rdd)
        else:
            return predicted_rdd

    def evaluate(self, data, batch_size=32,
                 feature_cols=None,
                 labels_cols=None,
                 hard_code_batch_size=False,
                 auto_shard_files=False
                 ):
        """
        Evaluate model.

        :param data: evaluation data. It can be XShards, Spark DataFrame, tf.data.Dataset.
               If data is XShards, each element needs to be {'x': a feature numpy array or a
               tuple of feature numpy arrays, 'y': a label numpy array or a tuple of label
               numpy arrays}
               If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor
               tuple]
        :param batch_size: batch size per thread.
        :param feature_cols: feature column names if evaluation data is Spark DataFrame.
        :param labels_cols: label column names if evaluation data is Spark DataFrame.
        :param hard_code_batch_size: whether to hard code batch size for evaluation.
        :param auto_shard_files: forwarded to ``to_dataset``. Default is False.
        :return: evaluation result as a dictionary of {'metric name': metric value}
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in evaluation"
            assert labels_cols is not None, \
                "label columns is None; it should not be None in evaluation"

        dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols, labels_cols=labels_cols,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=True,
                             shuffle=False,
                             auto_shard_files=auto_shard_files
                             )

        return self.model.evaluate(dataset, batch_per_thread=batch_size)

    def save_keras_model(self, path, overwrite=True):
        """
        Save tensorflow keras model in this estimator.

        :param path: keras model save path.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        """
        self.model.save_model(path, overwrite=overwrite)

    def get_model(self):
        """
        Get the wrapped Keras model.

        :return: the tf.keras model held by this estimator.
        """
        return self.model.model

    def save(self, model_path, overwrite=True):
        """
        Save model to model_path

        :param model_path: path to save the trained model.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        """
        # Forward the caller's choice; it used to be replaced by a hard-coded True.
        self.save_keras_model(model_path, overwrite=overwrite)

    def clear_gradient_clipping(self):
        """
        Clear gradient clipping parameters, so that fit applies no gradient clipping.
        In order to take effect, it needs to be called before fit.
        """
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def set_constant_gradient_clipping(self, min, max):
        """
        Set constant gradient clipping during the training process.
        In order to take effect, it needs to be called before fit.

        :param min: The minimum value to clip by. Must be greater than 0 and smaller than max
               (both are enforced by assertions).
        :param max: The maximum value to clip by.
        """
        assert min > 0, "clip value should be larger than 0"
        assert min < max, "clip max should be larger than clip min"
        self.clip_min = min
        self.clip_max = max

    def set_l2_norm_gradient_clipping(self, clip_norm):
        """
        Clip gradient to a maximum L2-Norm during the training process.
        In order to take effect, it needs to be called before fit.

        :param clip_norm: Gradient L2-Norm threshold.
        """
        self.clip_norm = clip_norm

    def save_keras_weights(self, filepath, overwrite=True, save_format=None):
        """
        Save tensorflow keras model weights in this estimator.

        :param filepath: keras model weights save path.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        :param save_format: forwarded unchanged to the wrapped model's ``save_weights``.
        """
        self.model.save_weights(filepath, overwrite, save_format)

    def load_keras_weights(self, filepath, by_name=False):
        """
        Load tensorflow keras model weights into this estimator.

        :param filepath: keras model weights path.
        :param by_name: forwarded unchanged to the wrapped model's ``load_weights``.
        """
        self.model.load_weights(filepath, by_name)
def __init__(self, keras_model, model_dir):
    """Wrap ``keras_model`` in a ``KerasModel`` and keep it on ``self.model``.

    :param keras_model: first positional argument handed to ``KerasModel``.
    :param model_dir: second positional argument handed to ``KerasModel``.
    """
    wrapped = KerasModel(keras_model, model_dir)
    self.model = wrapped
class TFKerasWrapper(Estimator):
    """Estimator facade that drives a ``KerasModel`` built from a Keras model.

    Each operation converts its input with ``to_dataset`` and then delegates
    to the matching method of the wrapped ``KerasModel`` held in ``self.model``.
    """

    def __init__(self, keras_model, model_dir):
        """Build the wrapped ``KerasModel`` from ``keras_model`` and ``model_dir``."""
        self.model = KerasModel(keras_model, model_dir)

    def fit(self, data, epochs=1, batch_size=32, feature_cols=None,
            labels_cols=None, validation_data=None,
            hard_code_batch_size=False, session_config=None):
        """Train the wrapped model on ``data`` and return this wrapper.

        :param data: training input handed to ``to_dataset``; when it is a
               DataFrame both column arguments are required.
        :param epochs: forwarded to the wrapped model's ``fit``.
        :param batch_size: used for dataset creation and forwarded to ``fit``.
        :param feature_cols: feature column names (required for DataFrame input).
        :param labels_cols: label column names (required for DataFrame input).
        :param validation_data: forwarded to ``to_dataset``.
        :param hard_code_batch_size: forwarded to ``to_dataset``.
        :param session_config: forwarded to the wrapped model's ``fit``.
        :return: ``self``.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, (
                "feature columns is None; it should not be None in training")
            assert labels_cols is not None, (
                "label columns is None; it should not be None in training")

        # Training uses a global batch size, hence batch_per_thread=-1, and
        # asks for shuffled, non-sequential data.
        train_set = to_dataset(
            data,
            batch_size=batch_size,
            batch_per_thread=-1,
            validation_data=validation_data,
            feature_cols=feature_cols,
            labels_cols=labels_cols,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=False,
            shuffle=True,
        )

        self.model.fit(
            train_set,
            batch_size=batch_size,
            epochs=epochs,
            session_config=session_config,
        )
        return self

    def predict(self, data, batch_size=4, feature_cols=None,
                hard_code_batch_size=False):
        """Run prediction on ``data`` with the wrapped model.

        :param data: input handed to ``to_dataset``; when it is a DataFrame
               ``feature_cols`` is required.
        :param batch_size: used as ``batch_per_thread`` for dataset creation
               and passed to the wrapped model's ``predict``.
        :param feature_cols: feature column names (required for DataFrame input).
        :param hard_code_batch_size: forwarded to ``to_dataset``.
        :return: the result of ``convert_predict_to_dataframe`` for DataFrame
                 input, otherwise the raw value returned by the wrapped model.
        """
        is_frame = isinstance(data, DataFrame)
        if is_frame:
            assert feature_cols is not None, (
                "feature columns is None; it should not be None in prediction")

        # Inference keeps the input order: sequential, unshuffled, no labels.
        inference_set = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            labels_cols=None,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=True,
            shuffle=False,
        )

        predictions = self.model.predict(inference_set, batch_size)
        if not is_frame:
            return predictions
        return convert_predict_to_dataframe(data, predictions)

    def evaluate(self, data, batch_size=4, feature_cols=None, labels_cols=None,
                 hard_code_batch_size=False):
        """Evaluate the wrapped model on ``data``.

        :param data: input handed to ``to_dataset``; when it is a DataFrame
               both column arguments are required.
        :param batch_size: used as ``batch_per_thread`` for dataset creation
               and for the wrapped model's ``evaluate``.
        :param feature_cols: feature column names (required for DataFrame input).
        :param labels_cols: label column names (required for DataFrame input).
        :param hard_code_batch_size: forwarded to ``to_dataset``.
        :return: whatever the wrapped model's ``evaluate`` returns.
        """
        if isinstance(data, DataFrame):
            assert feature_cols is not None, (
                "feature columns is None; it should not be None in evaluation")
            assert labels_cols is not None, (
                "label columns is None; it should not be None in evaluation")

        eval_set = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            labels_cols=labels_cols,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=True,
            shuffle=False,
        )

        return self.model.evaluate(eval_set, batch_per_thread=batch_size)
# training_dataset = TFDataset.from_ndarrays(tensors=x.values,batch_size=32) # print("Created TF Dataset\n") model = tf.keras.Sequential([ tf.keras.layers.Dense(inputDim, activation="relu", input_shape=(2, )), tf.keras.layers.Dense(inputDim, activation='relu'), tf.keras.layers.Dense(outputDim), ]) optimizer = tf.keras.optimizers.Adam() model.compile( optimizer=optimizer, loss='mean_squared_error', ) keras_model = KerasModel(model) print("Created Keras Model! \n") # print("batchSize TFDataset: {}".format(training_dataset.batch_size)) # keras_model.fit(x=x.values, y=y.values, epochs=5) print("Training Complete!\n") # keras_model.save_model("../resources/savedModels/tfParkModel.h5") weights = keras_model.get_weights() # weights = np.array(weights, dtype=object) # print(weights, type(weights)) kModel = Model() keras_model.save_weights("../resources/savedModels/keras/weights/wt.h5")
def _load_model(labor, path):
    """Load ``labor`` from ``path`` and wrap its model in a ``KerasModel``.

    :param labor: object exposing ``load(path)`` and a ``model`` attribute.
    :param path: location passed to ``labor.load``.
    :return: a ``KerasModel`` built from ``labor.model``, with ``labor``
             attached as its ``labor`` attribute.
    """
    # NOTE(review): the original formatting was lost; the whole body is assumed
    # to sit inside the variable_creator_scope() block - confirm.
    with variable_creator_scope():
        labor.load(path)
        model = KerasModel(labor.model)
        # Keep a back-reference to the labor on the wrapper.
        model.labor = labor
        return model
def _load_model(labor, path):
    """Restore ``labor`` from ``path`` and return it wrapped as a ``KerasModel``.

    :param labor: object exposing ``load(path)`` and a ``model`` attribute.
    :param path: location passed to ``labor.load``.
    :return: a ``KerasModel`` built from ``labor.model``, with ``labor``
             attached as its ``labor`` attribute.
    """
    labor.load(path)
    wrapped = KerasModel(labor.model)
    # Keep a back-reference to the labor on the wrapper.
    wrapped.labor = labor
    return wrapped