def test_nnEstimator_fit_with_train_val_summary(self):
    model = Sequential().add(Linear(2, 2))
    criterion = MSECriterion()
    df, val_df = self.get_estimator_df()
    from bigdl.orca.learn.metrics import MAE
    est = Estimator.from_bigdl(model=model, loss=criterion, optimizer=Adam(),
                               metrics=[MAE()],
                               feature_preprocessing=SeqToTensor([2]),
                               label_preprocessing=SeqToTensor([2]))
    tmp_dir = tempfile.mkdtemp()
    est.set_tensorboard(log_dir=tmp_dir, app_name="estTest")
    est.fit(df, epochs=5, batch_size=4, validation_data=val_df,
            validation_trigger=EveryEpoch(),
            checkpoint_trigger=SeveralIteration(1))
    res = est.predict(df)
    loss_result = est.get_train_summary("Loss")
    mae_result = est.get_validation_summary("MAE")
    assert type(res).__name__ == 'DataFrame'
    assert len(loss_result) == 5
    assert len(mae_result) == 4
def test_control_inputs(self):
    features = np.random.randn(20, 10)
    labels = np.random.randint(0, 10, size=[20])
    with tf.Graph().as_default():
        dataset = TFDataset.from_ndarrays((features, labels), batch_size=4,
                                          val_tensors=(features, labels))
        is_training = tf.placeholder(dtype=tf.bool, shape=())
        feature_tensor, label_tensor = dataset.tensors
        features = tf.layers.dense(feature_tensor, 8)
        features = tf.layers.dropout(features, training=is_training)
        output = tf.layers.dense(features, 10)
        loss = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(logits=output,
                                                   labels=label_tensor))
        optimizer = TFOptimizer.from_loss(
            loss, Adam(),
            val_outputs=[output],
            val_labels=[label_tensor],
            val_method=Accuracy(),
            tensor_with_value={is_training: (True, False)},
            metrics={"loss": loss})
        optimizer.optimize(end_trigger=MaxEpoch(1))
        optimizer.sess.close()
def test_nnEstimator_multiOutput_cols(self):
    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession
    spark = SparkSession \
        .builder \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, 2.0, 1.0, 2.0),
                                (2.0, 2.0, 2.0, 1.0),
                                (3.0, 2.0, 1.0, 2.0),
                                (4.0, 1.0, 1.0, 2.0)],
                               ["user", "age", "label1", "label2"])
    linear_model = Sequential().add(Linear(2, 2))
    mse_criterion = MSECriterion()
    est = Estimator.from_bigdl(model=linear_model, loss=mse_criterion,
                               optimizer=Adam(),
                               feature_preprocessing=SeqToTensor([2]),
                               label_preprocessing=SeqToTensor([2]))
    est.fit(df, 1, batch_size=4, feature_cols=["user", "age"],
            label_cols=["label1", "label2"])
    result = est.predict(df, feature_cols=["user", "age"])
    result_c = result.collect()
    assert type(result).__name__ == 'DataFrame'
def test_nnEstimator_multiInput_cols(self):
    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession
    spark = SparkSession \
        .builder \
        .getOrCreate()
    df = spark.createDataFrame(
        [(1, 35, 109.0, Vectors.dense([2.0, 5.0, 0.5, 0.5]), 1.0),
         (2, 58, 2998.0, Vectors.dense([4.0, 10.0, 0.5, 0.5]), 2.0),
         (3, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0),
         (4, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0)],
        ["user", "age", "income", "history", "label"])

    x1 = ZLayer.Input(shape=(1,))
    x2 = ZLayer.Input(shape=(2,))
    x3 = ZLayer.Input(shape=(2, 2))

    user_embedding = ZLayer.Embedding(5, 10)(x1)
    flatten = ZLayer.Flatten()(user_embedding)
    dense1 = ZLayer.Dense(2)(x2)
    lstm = ZLayer.LSTM(4, input_shape=(2, 2))(x3)
    merged = ZLayer.merge([flatten, dense1, lstm], mode="concat")
    zy = ZLayer.Dense(2)(merged)
    zmodel = ZModel([x1, x2, x3], zy)

    criterion = ClassNLLCriterion()
    est = Estimator.from_bigdl(model=zmodel, loss=criterion,
                               optimizer=Adam(learningrate=0.1),
                               feature_preprocessing=[[1], [2], [2, 2]])
    est.fit(df, epochs=1, batch_size=4,
            feature_cols=["user", "age", "income", "history"])
    res = est.predict(df, feature_cols=["user", "age", "income", "history"])
    res_c = res.collect()
    assert type(res).__name__ == 'DataFrame'
def test_nnEstimator_evaluation(self):
    df = self.get_estimator_df2()
    linear_model = Sequential().add(Linear(2, 2)).add(LogSoftMax())
    est = Estimator.from_bigdl(model=linear_model, loss=ClassNLLCriterion(),
                               optimizer=Adam(),
                               feature_preprocessing=SeqToTensor([2]),
                               label_preprocessing=SeqToTensor([1]),
                               metrics=Accuracy())
    est.fit(data=df, epochs=10, batch_size=8)
    result = est.evaluate(df, batch_size=8)
    shift = udf(lambda p: float(p.index(max(p))), DoubleType())
    pred = est.predict(df).withColumn("prediction", shift(col('prediction'))).cache()
    correct = pred.filter("label=prediction").count()
    overall = pred.count()
    accuracy = correct * 1.0 / overall
    assert accuracy == round(result['Top1Accuracy'], 2)
def test_tf_optimizer_with_sparse_gradient(self):
    ids = np.random.randint(0, 10, size=[40])
    labels = np.random.randint(0, 5, size=[40])
    id_rdd = self.sc.parallelize(ids)
    label_rdd = self.sc.parallelize(labels)
    training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]])
    with tf.Graph().as_default():
        dataset = TFDataset.from_rdd(training_rdd,
                                     names=["ids", "labels"],
                                     shapes=[[], []],
                                     types=[tf.int32, tf.int32],
                                     batch_size=8)
        id_tensor, label_tensor = dataset.tensors
        embedding_table = tf.get_variable(name="word_embedding", shape=[10, 5])
        embedding = tf.nn.embedding_lookup(embedding_table, id_tensor)
        loss = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(logits=embedding,
                                                   labels=label_tensor))
        optimizer = TFOptimizer.from_loss(loss, Adam(1e-3))
        optimizer.optimize(end_trigger=MaxEpoch(1))
        optimizer.sess.close()
def test_tf_optimizer_metrics(self):
    features = np.random.randn(20, 10)
    labels = np.random.randint(0, 10, size=[20])
    with tf.Graph().as_default():
        dataset = TFDataset.from_ndarrays((features, labels), batch_size=4,
                                          val_tensors=(features, labels))
        feature_tensor, label_tensor = dataset.tensors
        features = tf.layers.dense(feature_tensor, 8)
        output = tf.layers.dense(features, 10)
        loss = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(logits=output,
                                                   labels=label_tensor))
        optimizer = TFOptimizer.from_loss(loss,
                                          {"dense/": Adam(1e-3),
                                           "dense_1/": SGD(0.0)},
                                          val_outputs=[output],
                                          val_labels=[label_tensor],
                                          val_method=Accuracy(),
                                          metrics={"loss": loss})
        initial_weights = optimizer.tf_model.training_helper_layer.get_weights()
        optimizer.optimize(end_trigger=MaxEpoch(1))
        updated_weights = optimizer.tf_model.training_helper_layer.get_weights()
        for i in [0, 1]:  # weights and bias under the "dense/" prefix should be updated
            assert not np.allclose(initial_weights[i], updated_weights[i])
        for i in [2, 3]:  # weights and bias under the "dense_1/" prefix should be unchanged
            assert np.allclose(initial_weights[i], updated_weights[i])
        optimizer.sess.close()
def test_checkpoint(self):
    features = np.random.randn(20, 10)
    labels = np.random.randint(0, 10, size=[20])
    with tf.Graph().as_default():
        dataset = TFDataset.from_ndarrays((features, labels), batch_size=4,
                                          val_tensors=(features, labels))
        feature_tensor, label_tensor = dataset.tensors
        features = tf.layers.dense(feature_tensor, 8)
        output = tf.layers.dense(features, 10)
        loss = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(logits=output,
                                                   labels=label_tensor))

        model_dir = tempfile.mkdtemp()
        try:
            optimizer = TFOptimizer.from_loss(loss, Adam(),
                                              val_outputs=[output],
                                              val_labels=[label_tensor],
                                              val_method=Accuracy(),
                                              metrics={"loss": loss},
                                              model_dir=model_dir)
            optimizer.optimize(end_trigger=MaxEpoch(1))

            first_weights = optimizer.sess.run(tf.trainable_variables()[0])
            import re
            ckpt_path = None
            versions = []
            for (root, dirs, files) in os.walk(model_dir, topdown=True):
                temp_versions = []
                for file_name in files:
                    if re.match(r"^optimMethod-TFParkTraining\.[0-9]+$", file_name) is not None:
                        version = int(file_name.split(".")[1])
                        temp_versions.append(version)
                if temp_versions:
                    ckpt_path = root
                    versions = temp_versions
                    break
            assert ckpt_path is not None, "Cannot find checkpoint file"

            optimizer.sess.run(tf.global_variables_initializer())  # reset variables
            optimizer_load = TFOptimizer.from_loss(loss, Adam(),
                                                   session=optimizer.sess,
                                                   val_outputs=[output],
                                                   val_labels=[label_tensor],
                                                   val_method=Accuracy(),
                                                   metrics={"loss": loss},
                                                   model_dir=model_dir)
            optimizer_load.load_checkpoint(ckpt_path, max(versions))
            loaded_first_weights_before_train = optimizer.sess.run(tf.trainable_variables()[0])
            assert np.allclose(first_weights, loaded_first_weights_before_train)

            # max epoch is still 1, so no further training should happen
            optimizer_load.optimize(end_trigger=MaxEpoch(1))
            loaded_first_weights = optimizer.sess.run(tf.trainable_variables()[0])
            assert np.allclose(first_weights, loaded_first_weights)

            # max epoch increased to 2, so one more epoch should be trained
            optimizer_load.optimize(end_trigger=MaxEpoch(2))
            loaded_first_weights_2 = optimizer.sess.run(tf.trainable_variables()[0])
            assert not np.allclose(first_weights, loaded_first_weights_2)
            optimizer_load.sess.close()
        finally:
            import shutil
            shutil.rmtree(model_dir)
def test_nnEstimator(self):
    from bigdl.dllib.nnframes import NNModel
    linear_model = Sequential().add(Linear(2, 2))
    mse_criterion = MSECriterion()
    df, _ = self.get_estimator_df()
    est = Estimator.from_bigdl(model=linear_model, loss=mse_criterion,
                               optimizer=Adam(),
                               feature_preprocessing=SeqToTensor([2]),
                               label_preprocessing=SeqToTensor([2]))
    res0 = est.predict(df)
    res0_c = res0.collect()
    est.fit(df, 2, batch_size=4)
    nn_model = NNModel(est.get_model(), feature_preprocessing=SeqToTensor([2]))
    res1 = nn_model.transform(df)
    res2 = est.predict(df)
    res1_c = res1.collect()
    res2_c = res2.collect()
    assert type(res1).__name__ == 'DataFrame'
    assert type(res2).__name__ == 'DataFrame'
    assert len(res1_c) == len(res2_c)
    for idx in range(len(res1_c)):
        assert res1_c[idx]["prediction"] == res2_c[idx]["prediction"]

    with tempfile.TemporaryDirectory() as tempdirname:
        temp_path = os.path.join(tempdirname, "model")
        est.save(temp_path)
        est2 = Estimator.from_bigdl(model=linear_model, loss=mse_criterion)
        est2.load(temp_path, optimizer=Adam(), loss=mse_criterion,
                  feature_preprocessing=SeqToTensor([2]),
                  label_preprocessing=SeqToTensor([2]))
        est2.set_constant_gradient_clipping(0.1, 1.2)
        est2.clear_gradient_clipping()
        res3 = est2.predict(df)
        res3_c = res3.collect()
        assert type(res3).__name__ == 'DataFrame'
        assert len(res1_c) == len(res3_c)
        for idx in range(len(res1_c)):
            assert res1_c[idx]["prediction"] == res3_c[idx]["prediction"]
        est2.fit(df, 4, batch_size=4)

    data = self.sc.parallelize([((2.0, 1.0), (1.0, 2.0)),
                                ((1.0, 2.0), (2.0, 1.0)),
                                ((2.0, 1.0), (1.0, 2.0)),
                                ((1.0, 2.0), (2.0, 1.0))])
    data_shard = SparkXShards(data)
    # pack each (feature, label) tuple into 1 x 2 numpy arrays for XShards training
    data_shard = data_shard.transform_shard(
        lambda feature_label_tuple: {
            "x": np.stack([
                np.expand_dims(np.array(feature_label_tuple[0][0]), axis=0),
                np.expand_dims(np.array(feature_label_tuple[0][1]), axis=0)
            ], axis=1),
            "y": np.stack([
                np.expand_dims(np.array(feature_label_tuple[1][0]), axis=0),
                np.expand_dims(np.array(feature_label_tuple[1][1]), axis=0)
            ], axis=1)
        })

    res4 = est.predict(data_shard)
    res4_c = res4.collect()
    assert type(res4).__name__ == 'SparkXShards'
    for idx in range(len(res4_c)):
        assert abs(res4_c[idx]["prediction"][0][0] - res3_c[idx]["prediction"][0]) == 0
        assert abs(res4_c[idx]["prediction"][0][1] - res3_c[idx]["prediction"][1]) == 0

    est.fit(data_shard, 1, batch_size=4)
    res5 = est.predict(data_shard)
    res5_c = res5.collect()
    res6 = est.predict(df)
    res6_c = res6.collect()
    for idx in range(len(res5_c)):
        assert abs(res5_c[idx]["prediction"][0][0] - res6_c[idx]["prediction"][0]) == 0
        assert abs(res5_c[idx]["prediction"][0][1] - res6_c[idx]["prediction"][1]) == 0