def test_dataframe(self):
    """End-to-end fit/evaluate/predict of an Estimator on a Spark DataFrame.

    Builds a 10-row DataFrame with a DenseVector "feature" column and an
    int "label" column, then runs one epoch of training, an evaluation
    pass, and a predict-collect. No return value; passes if no call raises.
    """
    sc = init_nncontext()
    rdd = sc.range(0, 10)
    from pyspark.sql import SparkSession
    # Instantiating SparkSession registers it as the active session so
    # toDF() below works; the local name itself is not used afterwards.
    spark = SparkSession(sc)
    from pyspark.ml.linalg import DenseVector
    # np.float was deprecated in NumPy 1.20 and removed in 1.24 — it now
    # raises AttributeError. The builtin float is the documented replacement.
    # np.random.randint(0, 1) always yields 0, which is fine for dummy labels.
    df = rdd.map(lambda x: (DenseVector(np.random.randn(1, ).astype(float)),
                            int(np.random.randint(0, 1, size=())))).toDF(
        ["feature", "label"])

    config = {"batch_size": 4, "lr": 0.8}
    trainer = Estimator(model_creator=model_creator,
                        verbose=True,
                        config=config,
                        workers_per_node=2)

    trainer.fit(df, epochs=1, steps_per_epoch=25,
                feature_cols=["feature"],
                label_cols=["label"])
    trainer.evaluate(df, steps=25,
                     feature_cols=["feature"],
                     label_cols=["label"])
    trainer.predict(df, feature_cols=["feature"]).collect()
def test_predict_xshards(self):
    """predict() through an identity model must reproduce the input features.

    Partitions a 100x1 random array into XShards, runs batched prediction
    with an identity model, and asserts the concatenated predictions match
    the concatenated inputs.
    """
    train_data_shard = XShards.partition({
        "x": np.random.randn(100, 1),
        "y": np.random.randint(0, 1, size=(100,))
    })
    # Removed a leftover debug loop that printed each shard's shape; it
    # produced noisy output and asserted nothing.
    expected = np.concatenate(
        [shard["x"] for shard in train_data_shard.collect()])

    config = {}
    trainer = Estimator(model_creator=identity_model_creator,
                        verbose=True,
                        config=config,
                        workers_per_node=2)

    result_shards = trainer.predict(train_data_shard, batch_size=10).collect()
    result = np.concatenate([shard["prediction"] for shard in result_shards])
    assert np.allclose(expected, result)