def test_tfdataset_with_dataframe(self):
    """Run fit, predict and evaluate of a KerasModel on DataFrame-backed TFDatasets.

    Builds a 1000-row DataFrame with a 20-element random ``feature`` vector
    and a ``label`` equal to ``x % 10``, splits it 70/30, and drives a small
    softmax classifier through training, prediction and evaluation.

    The test makes no assertions on the results; it passes when every call
    completes without raising.
    """
    rdd = self.sc.range(0, 1000)
    # np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24; np.float64 yields the same dtype on every NumPy version.
    df = rdd.map(
        lambda x: (DenseVector(np.random.rand(20).astype(np.float64)), x % 10)
    ).toDF(["feature", "label"])
    train_df, val_df = df.randomSplit([0.7, 0.3])

    seq = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(20, )),
        tf.keras.layers.Dense(10, activation="softmax"),
    ])
    seq.compile(optimizer=tf.keras.optimizers.RMSprop(),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
    model = KerasModel(seq)

    # Training: labelled features with a global batch size and a validation set.
    train_dataset = TFDataset.from_dataframe(train_df,
                                             feature_cols=["feature"],
                                             labels_cols=["label"],
                                             batch_size=32,
                                             validation_df=val_df)
    model.fit(train_dataset)

    # Prediction: features only; collect() pulls the predictions back to the driver.
    predict_dataset = TFDataset.from_dataframe(val_df,
                                               feature_cols=["feature"],
                                               batch_per_thread=32)
    model.predict(predict_dataset).collect()

    # Evaluation: labelled features, batched per thread.
    evaluate_dataset = TFDataset.from_dataframe(val_df,
                                                feature_cols=["feature"],
                                                labels_cols=["label"],
                                                batch_per_thread=32)
    model.evaluate(evaluate_dataset)
def create_ds(mode):
    """Return the TFDataset matching ``mode``.

    ``"train"`` reads ``train_df`` with labels, ``batch_size=32`` and
    ``val_df`` as the validation frame. ``"predict"`` reads ``val_df``
    features only. ``"evaluate"`` reads ``val_df`` with labels. The latter
    two use ``batch_per_thread=32``.

    ``train_df`` and ``val_df`` are not defined in this function; they are
    presumably supplied by an enclosing scope (confirm against the caller).

    Raises:
        ValueError: if ``mode`` is none of the three recognized values.
    """
    if mode == "train":
        return TFDataset.from_dataframe(train_df,
                                        feature_cols=["feature"],
                                        labels_cols=["label"],
                                        batch_size=32,
                                        validation_df=val_df)

    if mode == "predict":
        return TFDataset.from_dataframe(val_df,
                                        feature_cols=["feature"],
                                        batch_per_thread=32)

    if mode == "evaluate":
        return TFDataset.from_dataframe(val_df,
                                        feature_cols=["feature"],
                                        labels_cols=["label"],
                                        batch_per_thread=32)

    raise ValueError("unrecognized mode: {}".format(mode))