def test_NNEstimator_works_with_VectorAssembler_multi_input(self):
    """Fit and apply an NNClassifier on VectorAssembler output with a 3-input model.

    The body only runs when ``self.sc.version`` starts with ``"2"``; for any
    other version the test returns immediately without touching Spark.

    The ``user``, ``age``, ``income`` and ``history`` columns are assembled
    into a single ``features`` column, a three-input model is trained on it,
    and the fitted model is applied back to the same DataFrame. The test
    checks that one output row is produced per input row.
    """
    if not self.sc.version.startswith("2"):
        return

    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import VectorAssembler
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    rows = [
        (1, 35, 109.0, Vectors.dense([2.0, 5.0, 0.5, 0.5]), 1.0),
        (2, 58, 2998.0, Vectors.dense([4.0, 10.0, 0.5, 0.5]), 2.0),
        (3, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0),
    ]
    df = spark.createDataFrame(rows, ["user", "age", "income", "history", "label"])

    assembler = VectorAssembler(
        inputCols=["user", "age", "income", "history"],
        outputCol="features")
    df = assembler.transform(df)

    # Three model inputs; their shapes mirror the [[1], [2], [2, 2]] spec
    # handed to NNClassifier below.
    x1 = ZLayer.Input(shape=(1, ))
    x2 = ZLayer.Input(shape=(2, ))
    x3 = ZLayer.Input(shape=(2, 2, ))

    user_embedding = ZLayer.Embedding(5, 10)(x1)
    flatten = ZLayer.Flatten()(user_embedding)
    dense1 = ZLayer.Dense(2)(x2)
    lstm = ZLayer.LSTM(4, input_shape=(2, 2))(x3)

    merged = ZLayer.merge([flatten, dense1, lstm], mode="concat")
    zy = ZLayer.Dense(2)(merged)
    zmodel = ZModel([x1, x2, x3], zy)

    criterion = ClassNLLCriterion()
    classifier = NNClassifier(zmodel, criterion, [[1], [2], [2, 2]]) \
        .setOptimMethod(Adam()) \
        .setLearningRate(0.1) \
        .setBatchSize(2) \
        .setMaxEpoch(10)

    nnClassifierModel = classifier.fit(df)
    # Exercises the getter on the fitted model; the value is only printed.
    print(nnClassifierModel.getBatchSize())

    res = nnClassifierModel.transform(df).collect()
    # Previously the collected rows were discarded; make sure the transform
    # yields exactly one row per input row.
    assert len(res) == len(rows)
def test_nnEstimator_multiInput_cols(self):
    """Train and predict with a 3-input model fed from separate feature columns.

    Builds a four-row DataFrame, creates an ``Estimator`` via
    ``Estimator.from_bigdl`` with the ``[[1], [2], [2, 2]]`` feature
    preprocessing spec, fits it for one epoch using the ``user``, ``age``,
    ``income`` and ``history`` columns as features, and then predicts on the
    same DataFrame. The prediction result must be a Spark DataFrame with one
    row per input row.
    """
    from pyspark.ml.linalg import Vectors
    from pyspark.sql import DataFrame, SparkSession

    spark = SparkSession.builder.getOrCreate()

    rows = [
        (1, 35, 109.0, Vectors.dense([2.0, 5.0, 0.5, 0.5]), 1.0),
        (2, 58, 2998.0, Vectors.dense([4.0, 10.0, 0.5, 0.5]), 2.0),
        (3, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0),
        (4, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0),
    ]
    df = spark.createDataFrame(rows, ["user", "age", "income", "history", "label"])

    # Three model inputs; their shapes mirror the feature_preprocessing spec.
    x1 = ZLayer.Input(shape=(1, ))
    x2 = ZLayer.Input(shape=(2, ))
    x3 = ZLayer.Input(shape=(2, 2, ))

    user_embedding = ZLayer.Embedding(5, 10)(x1)
    flatten = ZLayer.Flatten()(user_embedding)
    dense1 = ZLayer.Dense(2)(x2)
    lstm = ZLayer.LSTM(4, input_shape=(2, 2))(x3)

    merged = ZLayer.merge([flatten, dense1, lstm], mode="concat")
    zy = ZLayer.Dense(2)(merged)
    zmodel = ZModel([x1, x2, x3], zy)

    criterion = ClassNLLCriterion()
    est = Estimator.from_bigdl(model=zmodel, loss=criterion,
                               optimizer=Adam(learningrate=0.1),
                               feature_preprocessing=[[1], [2], [2, 2]])

    feature_cols = ["user", "age", "income", "history"]
    est.fit(df, epochs=1, batch_size=4, feature_cols=feature_cols)
    res = est.predict(df, feature_cols=feature_cols)

    # Check the actual type instead of comparing the class name as a string.
    assert isinstance(res, DataFrame)

    # Collecting forces the prediction to be computed; every input row must
    # come back.
    res_c = res.collect()
    assert len(res_c) == len(rows)
def _to_tensor(self):
    """Build the output tensor from the first two model inputs.

    The first input supplies the data and the second the indices.

    * When ``self._initializer`` is truthy and the data is a
      ``zautograd.Parameter``, the lookup is expressed as a
      ``zlayers.Embedding`` initialised with the parameter's weights and
      applied to the indices.
    * Otherwise a single index is selected along ``self.onnx_attr['axis']``
      and the selected axis is re-inserted with ``expand_dims``.

    Raises:
        AssertionError: in the second case, if the axis is smaller than 1 or
            the indices' shape is not ``(1,)``.
    """
    source = self.model_inputs[0].zvalue
    lookup = self.model_inputs[1].zvalue

    use_embedding = self._initializer and isinstance(source, zautograd.Parameter)
    if not use_embedding:
        axis = int(self.onnx_attr['axis'])
        assert axis >= 1, "Currently only dim>=1 is supported."
        assert lookup.shape == (
            1, ), "Currently only one index is supported."
        # Exactly one index is present (checked above), so max() extracts it.
        position = int(lookup.get_weight().max())
        selected = source.index_select(dim=axis, index=position)
        return zautograd.expand_dims(selected, axis=axis)

    table = zlayers.Embedding(input_dim=source.shape[0],
                              output_dim=source.shape[1],
                              weights=source.get_weight(),
                              input_length=lookup.shape[1])
    return table(lookup)
def test_embedding(self):
    """Compare the ZLayer and KLayer Embedding layers on random integer input.

    Both layers are built with the same vocabulary size, output dimension and
    sequence length, then handed to ``self.compare_layer`` together with
    ``WeightsConverter.convert_embedding``.
    """
    vocab_size = 1000
    output_dim = 64
    seq_len = 10
    batch = 32

    # Random integers in [0, vocab_size), one row of seq_len values per sample.
    input_data = np.random.randint(vocab_size, size=(batch, seq_len))

    zlayer = ZLayer.Embedding(vocab_size, output_dim, input_shape=(seq_len, ))
    klayer = KLayer.Embedding(vocab_size, output_dim, input_length=seq_len)

    self.compare_layer(klayer, zlayer, input_data,
                       WeightsConverter.convert_embedding)