Example no. 1
0
    def test_write_dataframe_invalid_params(self, target_df, format, mode):
        """Writing with an invalid format/mode combination must raise ValueError."""
        # arrange
        client = SparkClient()

        # act and assert
        with pytest.raises(ValueError):
            client.write_dataframe(
                dataframe=target_df, format_=format, mode=mode
            )
    def write(
        self,
        feature_set: FeatureSet,
        dataframe: DataFrame,
        spark_client: SparkClient,
    ) -> Optional[StreamingQuery]:
        """Loads the latest data from a feature set into the Feature Store.

        Args:
            feature_set: object processed with feature set metadata.
            dataframe: Spark dataframe containing data from a feature set.
            spark_client: client for Spark connections with external services.

        Returns:
            Streaming handler if writing streaming df, None otherwise.

        If the debug_mode is set to True, a temporary table with a name in the
        format: online_feature_store__{feature_set.name} will be created
        instead of writing to the real online feature store. If dataframe is
        streaming this temporary table will be updated in real time.

        """
        if dataframe.isStreaming:
            # Streaming input: hand off to the debug sink or the real
            # stream writer and return the streaming handler.
            stream_writer = (
                self._write_in_debug_mode if self.debug_mode else self._write_stream
            )
            return stream_writer(
                feature_set=feature_set,
                dataframe=dataframe,
                spark_client=spark_client,
            )

        # Batch input: keep only the most recent row per key before writing.
        latest_df = self.filter_latest(
            dataframe=dataframe, id_columns=feature_set.keys_columns
        )

        if self.debug_mode:
            return self._write_in_debug_mode(
                feature_set=feature_set,
                dataframe=latest_df,
                spark_client=spark_client,
            )

        # TODO: Refactor this logic using the Sink
        # The same snapshot is written once per target table.
        for table_name in (feature_set.name, feature_set.entity):
            spark_client.write_dataframe(
                dataframe=latest_df,
                format_=self.db_config.format_,
                mode=self.db_config.mode,
                **self.db_config.get_options(table=table_name),
            )
Example no. 3
0
    def test_write_stream(self, feature_set, has_checkpoint, monkeypatch):
        """Streaming dataframes must be routed to SparkClient.write_stream."""
        # arrange
        client = SparkClient()
        client.write_stream = Mock()
        client.write_dataframe = Mock()
        client.write_stream.return_value = Mock(spec=StreamingQuery)

        streaming_df = Mock(spec=DataFrame)
        streaming_df.isStreaming = True

        if has_checkpoint:
            monkeypatch.setenv("STREAM_CHECKPOINT_PATH", "test")

        cassandra_config = CassandraConfig(keyspace="feature_set")
        if cassandra_config.stream_checkpoint_path:
            target_checkpoint_path = "test/entity/feature_set"
        else:
            target_checkpoint_path = None

        writer = OnlineFeatureStoreWriter(cassandra_config)
        writer.filter_latest = Mock()

        # act
        handler = writer.write(feature_set, streaming_df, client)

        # assert
        assert isinstance(handler, StreamingQuery)
        client.write_stream.assert_any_call(
            streaming_df,
            processing_time=cassandra_config.stream_processing_time,
            output_mode=cassandra_config.stream_output_mode,
            checkpoint_path=target_checkpoint_path,
            format_=cassandra_config.format_,
            mode=cassandra_config.mode,
            **cassandra_config.get_options(table=feature_set.name),
        )
        writer.filter_latest.assert_not_called()
        client.write_dataframe.assert_not_called()
Example no. 4
0
 def test_write_dataframe(self, format, mode, mocked_spark_write):
     """write_dataframe must forward format and mode to DataFrameWriter.save."""
     # act
     SparkClient.write_dataframe(mocked_spark_write, format, mode)

     # assert
     mocked_spark_write.save.assert_called_with(format=format, mode=mode)