def test_write_dataframe_invalid_params(self, target_df, format, mode):
    """write_dataframe must reject invalid format/mode combinations with ValueError."""
    # arrange
    client = SparkClient()

    # act & assert: bad parameters are expected to raise, not write
    with pytest.raises(ValueError):
        client.write_dataframe(dataframe=target_df, format_=format, mode=mode)
def write(
    self,
    feature_set: FeatureSet,
    dataframe: DataFrame,
    spark_client: SparkClient,
) -> Optional[StreamingQuery]:
    """Loads the latest data from a feature set into the Feature Store.

    Args:
        feature_set: object processed with feature set metadata.
        dataframe: Spark dataframe containing data from a feature set.
        spark_client: client for Spark connections with external services.

    Returns:
        Streaming handler if writing streaming df, None otherwise.

    If the debug_mode is set to True, a temporary table with a name in the
    format: online_feature_store__{feature_set.name} will be created instead
    of writing to the real online feature store. If dataframe is streaming
    this temporary table will be updated in real time.

    """
    if dataframe.isStreaming:
        # streaming path: route to the debug sink or the real stream writer
        stream_writer = (
            self._write_in_debug_mode if self.debug_mode else self._write_stream
        )
        return stream_writer(
            feature_set=feature_set,
            dataframe=dataframe,
            spark_client=spark_client,
        )

    # batch path: keep only the most recent row per key before writing
    deduplicated_df = self.filter_latest(
        dataframe=dataframe, id_columns=feature_set.keys_columns
    )

    if self.debug_mode:
        return self._write_in_debug_mode(
            feature_set=feature_set,
            dataframe=deduplicated_df,
            spark_client=spark_client,
        )

    # TODO: Refactor this logic using the Sink
    # the same deduplicated data is written once per target table
    for table in [feature_set.name, feature_set.entity]:
        spark_client.write_dataframe(
            dataframe=deduplicated_df,
            format_=self.db_config.format_,
            mode=self.db_config.mode,
            **self.db_config.get_options(table=table),
        )
def test_write_stream(self, feature_set, has_checkpoint, monkeypatch):
    """Streaming dataframes must go through write_stream, never write_dataframe."""
    # arrange
    client = SparkClient()
    client.write_stream = Mock()
    client.write_dataframe = Mock()
    client.write_stream.return_value = Mock(spec=StreamingQuery)

    streaming_df = Mock(spec=DataFrame)
    streaming_df.isStreaming = True

    if has_checkpoint:
        monkeypatch.setenv("STREAM_CHECKPOINT_PATH", "test")

    cassandra_config = CassandraConfig(keyspace="feature_set")
    expected_checkpoint = (
        "test/entity/feature_set"
        if cassandra_config.stream_checkpoint_path
        else None
    )

    writer = OnlineFeatureStoreWriter(cassandra_config)
    writer.filter_latest = Mock()

    # act
    handler = writer.write(feature_set, streaming_df, client)

    # assert: a streaming handler is returned and the stream writer was used
    assert isinstance(handler, StreamingQuery)
    client.write_stream.assert_any_call(
        streaming_df,
        processing_time=cassandra_config.stream_processing_time,
        output_mode=cassandra_config.stream_output_mode,
        checkpoint_path=expected_checkpoint,
        format_=cassandra_config.format_,
        mode=cassandra_config.mode,
        **cassandra_config.get_options(table=feature_set.name),
    )
    # the batch-only paths must not have been touched
    writer.filter_latest.assert_not_called()
    client.write_dataframe.assert_not_called()
def test_write_dataframe(self, format, mode, mocked_spark_write):
    """write_dataframe must forward format and mode straight to save()."""
    # act
    SparkClient.write_dataframe(mocked_spark_write, format, mode)

    # assert
    mocked_spark_write.save.assert_called_with(format=format, mode=mode)