Ejemplo n.º 1
0
    def test_get_schema(self):
        expected_schema = [
            {"column_name": "id", "type": LongType(), "primary_key": True},
            {"column_name": "timestamp", "type": TimestampType(), "primary_key": False},
            {
                "column_name": "feature1__avg_over_2_minutes_fixed_windows",
                "type": FloatType(),
                "primary_key": False,
            },
            {
                "column_name": "feature1__avg_over_15_minutes_fixed_windows",
                "type": FloatType(),
                "primary_key": False,
            },
            {
                "column_name": "feature1__stddev_pop_over_2_minutes_fixed_windows",
                "type": DoubleType(),
                "primary_key": False,
            },
            {
                "column_name": "feature1__stddev_pop_over_15_minutes_fixed_windows",
                "type": DoubleType(),
                "primary_key": False,
            },
        ]

        feature_set = FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.DOUBLE),
                        ]
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.BIGINT,
                )
            ],
            timestamp=TimestampFeature(),
        )

        schema = feature_set.get_schema()

        assert schema == expected_schema
    def test_feature_set_with_invalid_feature(self, key_id, timestamp_c,
                                              dataframe):
        spark_client = SparkClient()

        with pytest.raises(ValueError):
            AggregatedFeatureSet(
                name="name",
                entity="entity",
                description="description",
                features=[
                    Feature(
                        name="feature1",
                        description="test",
                        transformation=SparkFunctionTransform(functions=[
                            Function(functions.avg, DataType.FLOAT)
                        ], ).with_window(
                            partition_by="id",
                            mode="row_windows",
                            window_definition=["2 events"],
                        ),
                    ),
                ],
                keys=[key_id],
                timestamp=timestamp_c,
            ).construct(dataframe, spark_client)
Ejemplo n.º 3
0
    def test_construct(
        self, feature_set_dataframe, fixed_windows_output_feature_set_dataframe
    ):
        # given

        spark_client = SparkClient()

        # arrange

        feature_set = FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ]
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
                Feature(
                    name="divided_feature",
                    description="unit test",
                    dtype=DataType.FLOAT,
                    transformation=CustomTransform(
                        transformer=divide, column1="feature1", column2="feature2",
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        )

        output_df = (
            feature_set.construct(feature_set_dataframe, client=spark_client)
            .orderBy(feature_set.timestamp_column)
            .select(feature_set.columns)
        )

        target_df = fixed_windows_output_feature_set_dataframe.orderBy(
            feature_set.timestamp_column
        ).select(feature_set.columns)

        # assert
        assert_dataframe_equality(output_df, target_df)
Ejemplo n.º 4
0
def feature_set():
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(functions=[
                    Function(functions.avg, DataType.FLOAT),
                    Function(functions.stddev_pop, DataType.DOUBLE),
                ]).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )

    return feature_set
    def test_feature_transform(self, feature_set_dataframe, target_df_spark):
        test_feature = Feature(
            name="feature",
            description="unit test",
            transformation=SparkFunctionTransform(
                functions=[Function(functions.cos, DataType.DOUBLE)], ),
            from_column="feature1",
        )

        output_df = test_feature.transform(feature_set_dataframe)

        assert_dataframe_equality(output_df, target_df_spark)
 def test_negative_windows(self, feature_set_dataframe):
     with pytest.raises(KeyError):
         Feature(
             name="feature1",
             description="unit test",
             transformation=SparkFunctionTransform(
                 functions=[Function(functions.avg,
                                     DataType.DOUBLE)], ).with_window(
                                         partition_by="id",
                                         mode="fixed_windows",
                                         window_definition=["-2 weeks"],
                                     ),
         ).transform(feature_set_dataframe)
Ejemplo n.º 7
0
 def test_source_raise(self):
     with pytest.raises(ValueError,
                        match="source must be a Source instance"):
         FeatureSetPipeline(
             spark_client=SparkClient(),
             source=Mock(
                 spark_client=SparkClient(),
                 readers=[
                     TableReader(
                         id="source_a",
                         database="db",
                         table="table",
                     ),
                 ],
                 query="select * from source_a",
             ),
             feature_set=Mock(
                 spec=FeatureSet,
                 name="feature_set",
                 entity="entity",
                 description="description",
                 keys=[
                     KeyFeature(
                         name="user_id",
                         description="The user's Main ID or device ID",
                         dtype=DataType.INTEGER,
                     )
                 ],
                 timestamp=TimestampFeature(from_column="ts"),
                 features=[
                     Feature(
                         name="listing_page_viewed__rent_per_month",
                         description="Average of something.",
                         transformation=SparkFunctionTransform(functions=[
                             Function(functions.avg, DataType.FLOAT),
                             Function(functions.stddev_pop, DataType.FLOAT),
                         ], ).with_window(
                             partition_by="user_id",
                             order_by=TIMESTAMP_COLUMN,
                             window_definition=["7 days", "2 weeks"],
                             mode="fixed_windows",
                         ),
                     ),
                 ],
             ),
             sink=Mock(
                 spec=Sink,
                 writers=[HistoricalFeatureStoreWriter(db_config=None)],
             ),
         )
Ejemplo n.º 8
0
def feature_set_pipeline(
    spark_context, spark_session,
):

    feature_set_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(id="b_source", table="b_table",).with_incremental_strategy(
                    incremental_strategy=IncrementalStrategy(column="timestamp")
                ),
            ],
            query=f"select * from b_source ",  # noqa
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["1 day"],
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(writers=[HistoricalFeatureStoreWriter(debug_mode=True)]),
    )

    return feature_set_pipeline
    def test_feature_transform_with_window(self, feature_set_dataframe,
                                           target_df_rows_agg):
        test_feature = Feature(
            name="feature1",
            description="unit test",
            transformation=SparkFunctionTransform(functions=[
                Function(functions.avg, DataType.DOUBLE)
            ], ).with_window(
                partition_by="id",
                mode="row_windows",
                window_definition=["2 events", "3 events"],
            ),
        )

        output_df = test_feature.transform(feature_set_dataframe)

        assert_dataframe_equality(output_df, target_df_rows_agg)
    def test_output_columns(self):
        test_feature = Feature(
            name="feature1",
            description="unit test",
            transformation=SparkFunctionTransform(functions=[
                Function(functions.avg, DataType.DOUBLE)
            ], ).with_window(
                partition_by="id",
                mode="fixed_windows",
                window_definition=["7 days", "2 weeks"],
            ),
        )

        df_columns = test_feature.get_output_columns()

        assert all([
            a == b for a, b in zip(
                df_columns,
                [
                    "feature1__avg_over_7_days_fixed_windows",
                    "feature1__avg_over_2_weeks_fixed_windows",
                ],
            )
        ])
Ejemplo n.º 11
0
    def test_feature_set_args(self):
        # arrange and act
        out_columns = [
            "user_id",
            "timestamp",
            "listing_page_viewed__rent_per_month__avg_over_7_days_fixed_windows",
            "listing_page_viewed__rent_per_month__avg_over_2_weeks_fixed_windows",
            "listing_page_viewed__rent_per_month__stddev_pop_over_7_days_fixed_windows",
            "listing_page_viewed__rent_per_month__"
            "stddev_pop_over_2_weeks_fixed_windows",
            # noqa
        ]
        pipeline = FeatureSetPipeline(
            source=Source(
                readers=[
                    TableReader(
                        id="source_a",
                        database="db",
                        table="table",
                    ),
                    FileReader(
                        id="source_b",
                        path="path",
                        format="parquet",
                    ),
                ],
                query="select a.*, b.specific_feature "
                "from source_a left join source_b on a.id=b.id",
            ),
            feature_set=FeatureSet(
                name="feature_set",
                entity="entity",
                description="description",
                keys=[
                    KeyFeature(
                        name="user_id",
                        description="The user's Main ID or device ID",
                        dtype=DataType.INTEGER,
                    )
                ],
                timestamp=TimestampFeature(from_column="ts"),
                features=[
                    Feature(
                        name="listing_page_viewed__rent_per_month",
                        description="Average of something.",
                        transformation=SparkFunctionTransform(functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ], ).with_window(
                            partition_by="user_id",
                            order_by=TIMESTAMP_COLUMN,
                            window_definition=["7 days", "2 weeks"],
                            mode="fixed_windows",
                        ),
                    ),
                ],
            ),
            sink=Sink(writers=[
                HistoricalFeatureStoreWriter(db_config=None),
                OnlineFeatureStoreWriter(db_config=None),
            ], ),
        )

        assert isinstance(pipeline.spark_client, SparkClient)
        assert len(pipeline.source.readers) == 2
        assert all(
            isinstance(reader, Reader) for reader in pipeline.source.readers)
        assert isinstance(pipeline.source.query, str)
        assert pipeline.feature_set.name == "feature_set"
        assert pipeline.feature_set.entity == "entity"
        assert pipeline.feature_set.description == "description"
        assert isinstance(pipeline.feature_set.timestamp, TimestampFeature)
        assert len(pipeline.feature_set.keys) == 1
        assert all(
            isinstance(k, KeyFeature) for k in pipeline.feature_set.keys)
        assert len(pipeline.feature_set.features) == 1
        assert all(
            isinstance(feature, Feature)
            for feature in pipeline.feature_set.features)
        assert pipeline.feature_set.columns == out_columns
        assert len(pipeline.sink.writers) == 2
        assert all(
            isinstance(writer, Writer) for writer in pipeline.sink.writers)
Ejemplo n.º 12
0
    def test_run_with_repartition(self, spark_session):
        test_pipeline = FeatureSetPipeline(
            spark_client=SparkClient(),
            source=Mock(
                spec=Source,
                readers=[
                    TableReader(
                        id="source_a",
                        database="db",
                        table="table",
                    )
                ],
                query="select * from source_a",
            ),
            feature_set=Mock(
                spec=FeatureSet,
                name="feature_set",
                entity="entity",
                description="description",
                keys=[
                    KeyFeature(
                        name="user_id",
                        description="The user's Main ID or device ID",
                        dtype=DataType.INTEGER,
                    )
                ],
                timestamp=TimestampFeature(from_column="ts"),
                features=[
                    Feature(
                        name="listing_page_viewed__rent_per_month",
                        description="Average of something.",
                        transformation=SparkFunctionTransform(functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ], ).with_window(
                            partition_by="user_id",
                            order_by=TIMESTAMP_COLUMN,
                            window_definition=["7 days", "2 weeks"],
                            mode="fixed_windows",
                        ),
                    ),
                ],
            ),
            sink=Mock(
                spec=Sink,
                writers=[HistoricalFeatureStoreWriter(db_config=None)],
            ),
        )

        # feature_set need to return a real df for streaming validation
        sample_df = spark_session.createDataFrame([{
            "a": "x",
            "b": "y",
            "c": "3"
        }])
        test_pipeline.feature_set.construct.return_value = sample_df

        test_pipeline.run(partition_by=["id"])

        test_pipeline.source.construct.assert_called_once()
        test_pipeline.feature_set.construct.assert_called_once()
        test_pipeline.sink.flush.assert_called_once()
        test_pipeline.sink.validate.assert_called_once()
    def test_feature_set_pipeline(
        self,
        mocked_df,
        spark_session,
        fixed_windows_output_feature_set_dataframe,
    ):
        # arrange
        table_reader_id = "a_source"
        table_reader_table = "table"
        table_reader_db = environment.get_variable(
            "FEATURE_STORE_HISTORICAL_DATABASE")
        create_temp_view(dataframe=mocked_df, name=table_reader_id)
        create_db_and_table(
            spark=spark_session,
            table_reader_id=table_reader_id,
            table_reader_db=table_reader_db,
            table_reader_table=table_reader_table,
        )

        dbconfig = Mock()
        dbconfig.mode = "overwrite"
        dbconfig.format_ = "parquet"
        dbconfig.get_options = Mock(
            return_value={"path": "test_folder/historical/entity/feature_set"})

        historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig)

        # act
        test_pipeline = FeatureSetPipeline(
            source=Source(
                readers=[
                    TableReader(
                        id=table_reader_id,
                        database=table_reader_db,
                        table=table_reader_table,
                    ),
                ],
                query=f"select * from {table_reader_id} ",  # noqa
            ),
            feature_set=FeatureSet(
                name="feature_set",
                entity="entity",
                description="description",
                features=[
                    Feature(
                        name="feature1",
                        description="test",
                        transformation=SparkFunctionTransform(functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ], ).with_window(
                            partition_by="id",
                            order_by=TIMESTAMP_COLUMN,
                            mode="fixed_windows",
                            window_definition=["2 minutes", "15 minutes"],
                        ),
                    ),
                    Feature(
                        name="divided_feature",
                        description="unit test",
                        dtype=DataType.FLOAT,
                        transformation=CustomTransform(
                            transformer=divide,
                            column1="feature1",
                            column2="feature2",
                        ),
                    ),
                ],
                keys=[
                    KeyFeature(
                        name="id",
                        description="The user's Main ID or device ID",
                        dtype=DataType.INTEGER,
                    )
                ],
                timestamp=TimestampFeature(),
            ),
            sink=Sink(writers=[historical_writer]),
        )
        test_pipeline.run()

        # assert
        path = dbconfig.get_options("historical/entity/feature_set").get(
            "path")
        df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN)

        target_df = fixed_windows_output_feature_set_dataframe.orderBy(
            test_pipeline.feature_set.timestamp_column)

        # assert
        assert_dataframe_equality(df, target_df)

        # tear down
        shutil.rmtree("test_folder")