def test_NNEstimator_multi_input(self):
    zx1 = ZLayer.Input(shape=(1, ))
    zx2 = ZLayer.Input(shape=(1, ))
    zz = ZLayer.merge([zx1, zx2], mode="concat")
    zy = ZLayer.Dense(2)(zz)
    zmodel = ZModel([zx1, zx2], zy)
    criterion = MSECriterion()

    df = self.get_estimator_df()
    estimator = NNEstimator(zmodel, criterion, [[1], [1]]).setMaxEpoch(5) \
        .setBatchSize(4)
    nnmodel = estimator.fit(df)
    nnmodel.transform(df).collect()
def test_NNEstimator_works_with_VectorAssembler_multi_input(self):
    if self.sc.version.startswith("2"):
        from pyspark.ml.linalg import Vectors
        from pyspark.ml.feature import VectorAssembler
        from pyspark.sql import SparkSession

        spark = SparkSession \
            .builder \
            .getOrCreate()
        df = spark.createDataFrame(
            [(1, 35, 109.0, Vectors.dense([2.0, 5.0, 0.5, 0.5]), 1.0),
             (2, 58, 2998.0, Vectors.dense([4.0, 10.0, 0.5, 0.5]), 2.0),
             (3, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0)],
            ["user", "age", "income", "history", "label"])
        assembler = VectorAssembler(
            inputCols=["user", "age", "income", "history"],
            outputCol="features")
        df = assembler.transform(df)

        # Three-input graph: an embedding branch, a dense branch and an
        # LSTM branch, concatenated before the final Dense layer.
        x1 = ZLayer.Input(shape=(1, ))
        x2 = ZLayer.Input(shape=(2, ))
        x3 = ZLayer.Input(shape=(2, 2))
        user_embedding = ZLayer.Embedding(5, 10)(x1)
        flatten = ZLayer.Flatten()(user_embedding)
        dense1 = ZLayer.Dense(2)(x2)
        lstm = ZLayer.LSTM(4, input_shape=(2, 2))(x3)
        merged = ZLayer.merge([flatten, dense1, lstm], mode="concat")
        zy = ZLayer.Dense(2)(merged)
        zmodel = ZModel([x1, x2, x3], zy)

        criterion = ClassNLLCriterion()
        classifier = NNClassifier(zmodel, criterion, [[1], [2], [2, 2]]) \
            .setOptimMethod(Adam()) \
            .setLearningRate(0.1) \
            .setBatchSize(2) \
            .setMaxEpoch(10)
        nnClassifierModel = classifier.fit(df)
        print(nnClassifierModel.getBatchSize())
        res = nnClassifierModel.transform(df).collect()
def test_nnEstimator_multiInput(self):
    zx1 = ZLayer.Input(shape=(1, ))
    zx2 = ZLayer.Input(shape=(1, ))
    zz = ZLayer.merge([zx1, zx2], mode="concat")
    zy = ZLayer.Dense(2)(zz)
    zmodel = ZModel([zx1, zx2], zy)
    criterion = MSECriterion()

    df, _ = self.get_estimator_df()
    estimator = Estimator.from_bigdl(model=zmodel, loss=criterion,
                                     feature_preprocessing=[[1], [1]])
    estimator.fit(df, epochs=5, batch_size=4)
    pred = estimator.predict(df)
    pred_data = pred.collect()
    assert type(pred).__name__ == 'DataFrame'
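# A minimal companion sketch, not part of the original suite: pulling the
# predicted values out of rows collected as in the test above. It assumes
# predictions land in the default "prediction" column that the NNModel-backed
# transform appends; sketch_read_predictions is a hypothetical helper name.
def sketch_read_predictions(self, pred_rows):
    # Each Row carries the original columns plus the appended prediction.
    predictions = [row["prediction"] for row in pred_rows]
    assert len(predictions) == len(pred_rows)
    return predictions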
def test_merge_method_sum(self):
    zx1 = ZLayer.Input(shape=(8, ))
    zx2 = ZLayer.Input(shape=(6, ))
    zy1 = ZLayer.Dense(10)(zx1)
    zy2 = ZLayer.Dense(10)(zx2)
    zz = ZLayer.merge([zy1, zy2], mode="sum")
    zmodel = ZModel([zx1, zx2], zz, name="graph1")

    kx1 = KLayer.Input(shape=(8, ))
    kx2 = KLayer.Input(shape=(6, ))
    ky1 = KLayer.Dense(10)(kx1)
    ky2 = KLayer.Dense(10)(kx2)
    kz = kmerge([ky1, ky2], mode="sum")
    kmodel = KModel([kx1, kx2], kz)

    input_data = [np.random.random([2, 8]), np.random.random([2, 6])]
    self.compare_layer(kmodel, zmodel, input_data, self.convert_two_dense)
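# A minimal companion sketch, not in the original suite: the same two-branch
# comparison exercised with an element-wise "mul" merge. It assumes both
# ZLayer.merge and Keras 1.2.2 kmerge accept mode="mul"; element-wise modes
# require the branch outputs to have identical shapes, which the two
# Dense(10) layers guarantee here.
def sketch_merge_method_mul(self):
    zx1 = ZLayer.Input(shape=(8, ))
    zx2 = ZLayer.Input(shape=(6, ))
    zy1 = ZLayer.Dense(10)(zx1)
    zy2 = ZLayer.Dense(10)(zx2)
    zz = ZLayer.merge([zy1, zy2], mode="mul")
    zmodel = ZModel([zx1, zx2], zz)

    kx1 = KLayer.Input(shape=(8, ))
    kx2 = KLayer.Input(shape=(6, ))
    ky1 = KLayer.Dense(10)(kx1)
    ky2 = KLayer.Dense(10)(kx2)
    kz = kmerge([ky1, ky2], mode="mul")
    kmodel = KModel([kx1, kx2], kz)

    input_data = [np.random.random([2, 8]), np.random.random([2, 6])]
    self.compare_layer(kmodel, zmodel, input_data, self.convert_two_dense)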
def test_nnEstimator_multiInput_cols(self):
    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession

    spark = SparkSession \
        .builder \
        .getOrCreate()
    df = spark.createDataFrame(
        [(1, 35, 109.0, Vectors.dense([2.0, 5.0, 0.5, 0.5]), 1.0),
         (2, 58, 2998.0, Vectors.dense([4.0, 10.0, 0.5, 0.5]), 2.0),
         (3, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0),
         (4, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0)],
        ["user", "age", "income", "history", "label"])

    # Three-input graph fed directly from DataFrame columns via
    # feature_cols, one column (or column group) per model input.
    x1 = ZLayer.Input(shape=(1, ))
    x2 = ZLayer.Input(shape=(2, ))
    x3 = ZLayer.Input(shape=(2, 2))
    user_embedding = ZLayer.Embedding(5, 10)(x1)
    flatten = ZLayer.Flatten()(user_embedding)
    dense1 = ZLayer.Dense(2)(x2)
    lstm = ZLayer.LSTM(4, input_shape=(2, 2))(x3)
    merged = ZLayer.merge([flatten, dense1, lstm], mode="concat")
    zy = ZLayer.Dense(2)(merged)
    zmodel = ZModel([x1, x2, x3], zy)

    criterion = ClassNLLCriterion()
    est = Estimator.from_bigdl(model=zmodel, loss=criterion,
                               optimizer=Adam(learningrate=0.1),
                               feature_preprocessing=[[1], [2], [2, 2]])
    est.fit(df, epochs=1, batch_size=4,
            feature_cols=["user", "age", "income", "history"])
    res = est.predict(df, feature_cols=["user", "age", "income", "history"])
    res_c = res.collect()
    assert type(res).__name__ == 'DataFrame'
def test_xshards_spark_estimator_multi_inputs(self):
    resource_path = os.path.join(os.path.split(__file__)[0], "../../resources")

    def transform(df):
        # Reshape each feature column to (n, 1) so the shard provides one
        # array per model input.
        result = {
            "x": [np.expand_dims(df['user'].to_numpy(), axis=1),
                  np.expand_dims(df['item'].to_numpy(), axis=1)],
            "y": df['label'].to_numpy()
        }
        return result

    file_path = os.path.join(resource_path, "orca/learn/ncf2.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)

    zx1 = ZLayer.Input(shape=(1, ))
    zx2 = ZLayer.Input(shape=(1, ))
    zz = ZLayer.merge([zx1, zx2], mode="concat")
    zy = ZLayer.Dense(2)(zz)
    model = ZModel([zx1, zx2], zy)
    optim_method = SGD(learningrate=0.01)

    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_bigdl(model=model,
                                         optimizer=optim_method,
                                         loss=ClassNLLCriterion(),
                                         metrics=[Accuracy()],
                                         model_dir=temp_dir_name)
        estimator.set_constant_gradient_clipping(0.1, 1.2)
        r1 = estimator.predict(data=data_shard)
        r_c = r1.collect()
        estimator.set_tensorboard(log_dir=temp_dir_name, app_name="test")
        estimator.fit(data=data_shard,
                      epochs=5,
                      batch_size=8,
                      validation_data=data_shard,
                      checkpoint_trigger=EveryEpoch())
        summary = estimator.get_train_summary(tag="Loss")
        temp_path = os.path.join(temp_dir_name, "save_model")
        estimator.save(temp_path)
        eval_result = estimator.evaluate(data=data_shard, batch_size=8)
def test_merge_method_model_concat(self):
    zx1 = ZLayer.Input(shape=(4, ))
    zx2 = ZLayer.Input(shape=(5, ))
    zy1 = ZLayer.Dense(6, activation="sigmoid")(zx1)
    zbranch1 = ZModel(zx1, zy1)(zx1)
    zbranch2 = ZLayer.Dense(8)(zx2)
    zz = ZLayer.merge([zbranch1, zbranch2], mode="concat")
    zmodel = ZModel([zx1, zx2], zz)

    kx1 = KLayer.Input(shape=(4, ))
    kx2 = KLayer.Input(shape=(5, ))
    ky1 = KLayer.Dense(6, activation="sigmoid")(kx1)
    kbranch1 = KModel(kx1, ky1)(kx1)
    kbranch2 = KLayer.Dense(8)(kx2)
    kz = KLayer.merge([kbranch1, kbranch2], mode="concat")
    kmodel = KModel([kx1, kx2], kz)

    input_data = [np.random.random([2, 4]), np.random.random([2, 5])]
    self.compare_layer(kmodel, zmodel, input_data, self.convert_two_dense)
def test_merge_method_seq_concat(self):
    zx1 = ZLayer.Input(shape=(10, ))
    zx2 = ZLayer.Input(shape=(10, ))
    zy1 = ZLayer.Dense(12, activation="sigmoid")(zx1)
    zbranch1_node = ZModel(zx1, zy1)(zx1)
    zbranch2 = ZSequential()
    zbranch2.add(ZLayer.Dense(12, input_dim=10))
    zbranch2_node = zbranch2(zx2)
    zz = ZLayer.merge([zbranch1_node, zbranch2_node], mode="concat")
    zmodel = ZModel([zx1, zx2], zz)

    kx1 = KLayer.Input(shape=(10, ))
    kx2 = KLayer.Input(shape=(10, ))
    ky1 = KLayer.Dense(12, activation="sigmoid")(kx1)
    kbranch1_node = KModel(kx1, ky1)(kx1)
    kbranch2 = KSequential()
    kbranch2.add(KLayer.Dense(12, input_dim=10))
    kbranch2_node = kbranch2(kx2)
    kz = KLayer.merge([kbranch1_node, kbranch2_node], mode="concat")
    kmodel = KModel([kx1, kx2], kz)

    input_data = [np.random.random([2, 10]), np.random.random([2, 10])]
    self.compare_layer(kmodel, zmodel, input_data, self.convert_two_dense)