Example #1
    def test_filtering(
        self,
        filtering_dataframe,
        key_id,
        timestamp_c,
        feature1,
        feature2,
        feature3,
        output_filtering_dataframe,
    ):
        spark_client = Mock()

        # arrange
        feature_set = FeatureSet(
            "name",
            "entity",
            "description",
            [key_id],
            timestamp_c,
            [feature1, feature2, feature3],
        )

        # act
        result_df = (feature_set.construct(
            filtering_dataframe, spark_client).orderBy("timestamp").collect())

        # assert
        assert (result_df == output_filtering_dataframe.orderBy(
            "timestamp").select(feature_set.columns).collect())
Example #2
    def test_construct_transformations(
        self,
        dataframe,
        feature_set_dataframe,
        key_id,
        timestamp_c,
        feature_add,
        feature_divide,
    ):
        spark_client = Mock()

        # arrange
        feature_set = FeatureSet(
            "name",
            "entity",
            "description",
            [key_id],
            timestamp_c,
            [feature_add, feature_divide],
        )

        # act
        result_df = feature_set.construct(dataframe, spark_client)

        # assert
        assert_dataframe_equality(result_df, feature_set_dataframe)
Example #3
    def test_get_schema(self):
        expected_schema = [
            {"column_name": "id", "type": LongType(), "primary_key": True},
            {"column_name": "timestamp", "type": TimestampType(), "primary_key": False},
            {
                "column_name": "feature1__avg_over_2_minutes_fixed_windows",
                "type": FloatType(),
                "primary_key": False,
            },
            {
                "column_name": "feature1__avg_over_15_minutes_fixed_windows",
                "type": FloatType(),
                "primary_key": False,
            },
            {
                "column_name": "feature1__stddev_pop_over_2_minutes_fixed_windows",
                "type": DoubleType(),
                "primary_key": False,
            },
            {
                "column_name": "feature1__stddev_pop_over_15_minutes_fixed_windows",
                "type": DoubleType(),
                "primary_key": False,
            },
        ]

        feature_set = FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.DOUBLE),
                        ]
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.BIGINT,
                )
            ],
            timestamp=TimestampFeature(),
        )

        schema = feature_set.get_schema()

        assert schema == expected_schema
Example #4
    def test_construct(
        self, feature_set_dataframe, fixed_windows_output_feature_set_dataframe
    ):
        # given

        spark_client = SparkClient()

        # arrange

        feature_set = FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ]
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
                Feature(
                    name="divided_feature",
                    description="unit test",
                    dtype=DataType.FLOAT,
                    transformation=CustomTransform(
                        transformer=divide, column1="feature1", column2="feature2",
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        )

        output_df = (
            feature_set.construct(feature_set_dataframe, client=spark_client)
            .orderBy(feature_set.timestamp_column)
            .select(feature_set.columns)
        )

        target_df = fixed_windows_output_feature_set_dataframe.orderBy(
            feature_set.timestamp_column
        ).select(feature_set.columns)

        # assert
        assert_dataframe_equality(output_df, target_df)
Example #5
    def test_construct(
        self,
        dataframe,
        feature_set_dataframe,
        key_id,
        timestamp_c,
        feature_add,
        feature_divide,
    ):
        spark_client = Mock()

        # arrange
        feature_set = FeatureSet(
            "name",
            "entity",
            "description",
            [key_id],
            timestamp_c,
            [feature_add, feature_divide],
        )

        # act
        result_df = feature_set.construct(dataframe, spark_client)
        result_columns = result_df.columns

        # assert
        assert (result_columns == key_id.get_output_columns() +
                timestamp_c.get_output_columns() +
                feature_add.get_output_columns() +
                feature_divide.get_output_columns())
        assert_dataframe_equality(result_df, feature_set_dataframe)
        assert result_df.is_cached
Example #6
    def test_columns(self, key_id, timestamp_c, feature_add, feature_divide):
        # arrange
        name = "name"
        entity = "entity"
        description = "description"

        # act
        fs = FeatureSet(
            name,
            entity,
            description,
            [key_id],
            timestamp_c,
            [feature_add, feature_divide],
        )
        out_columns = fs.columns

        # assert
        assert (
            out_columns
            == key_id.get_output_columns()
            + timestamp_c.get_output_columns()
            + feature_add.get_output_columns()
            + feature_divide.get_output_columns()
        )
Example #7
def transformer():

    # primary key
    keys = [
        KeyFeature(
            name="customer_id",
            description="Unique identificator code for customer.",
            from_column="customer_id",
            dtype=DataType.STRING,
        )
    ]

    ts_feature = TimestampFeature(from_column="order_created_at")

    # features transformations
    features = [
        #order_total_amount(),
        count_items_in_order(),
        avg_order_total_amount_from_last_1_month(),
        ratio_order_amount_and_items(),
        ratio_order_amount_and_average_ticket()
    ]

    # joining all together
    feature_set = FeatureSet(
        name="orders_feature_master_table",
        entity="orders_feature_master_table",  # the "business context" this feature set belongs to
        description="Features describing events about the ifood store.",
        keys=keys,
        timestamp=ts_feature,
        features=features,
    )

    return feature_set
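A rough sketch of how this builder could be wired into a pipeline; the reader, query, and writer below are illustrative assumptions, not part of the original example:

# Hypothetical wiring: read an orders table, build the feature set returned by
# transformer(), and write it to the historical feature store in debug mode.
pipeline = FeatureSetPipeline(
    source=Source(
        readers=[TableReader(id="orders", database="db", table="orders")],
        query="select * from orders",
    ),
    feature_set=transformer(),
    sink=Sink(writers=[HistoricalFeatureStoreWriter(debug_mode=True)]),
)
pipeline.run()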
Example #8
def feature_set():
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(functions=[
                    Function(functions.avg, DataType.FLOAT),
                    Function(functions.stddev_pop, DataType.DOUBLE),
                ]).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )

    return feature_set
Example #9
    def test_getters(self, feature_add, feature_divide, key_id, timestamp_c):
        # arrange
        name = "name"
        entity = "entity"
        description = "description"

        # act
        feature_set = FeatureSet(
            name,
            entity,
            description,
            [key_id],
            timestamp_c,
            [feature_add, feature_divide],
        )

        # assert
        assert name == feature_set.name
        assert entity == feature_set.entity
        assert description == feature_set.description
        assert [key_id] == feature_set.keys
        assert timestamp_c == feature_set.timestamp
        assert [feature_add, feature_divide] == feature_set.features
        assert "timestamp" == feature_set.timestamp_column
        assert ["id"] == feature_set.keys_columns
Example #10
 def __init__(self):
     super(FirstPipeline, self).__init__(
         source=Source(
             readers=[TableReader(id="t", database="db", table="table",)],
             query=f"select * from t",  # noqa
         ),
         feature_set=FeatureSet(
             name="first",
             entity="entity",
             description="description",
             features=[
                 Feature(name="feature1", description="test", dtype=DataType.FLOAT,),
                 Feature(
                     name="feature2",
                     description="another test",
                     dtype=DataType.STRING,
                 ),
             ],
             keys=[
                 KeyFeature(
                     name="id", description="identifier", dtype=DataType.BIGINT,
                 )
             ],
             timestamp=TimestampFeature(),
         ),
         sink=Sink(
             writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()]
         ),
     )
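Assuming the db.table source referenced above exists, this pipeline can presumably be executed like the other pipelines in these examples:

# Illustrative usage only, not part of the original snippet.
pipeline = FirstPipeline()
pipeline.run()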
Example #11
    def apply_migration(self, feature_set: FeatureSet, writer: Writer,
                        debug_mode: bool) -> None:
        """Apply the migration in the respective database.

        Args:
            feature_set: the feature set.
            writer: the writer being used to load the feature set.
            debug_mode: if active, only print the generated queries instead of applying them.
        """
        logger.info(f"Migrating feature set: {feature_set.name}")

        table_name = (feature_set.name
                      if not writer.write_to_entity else feature_set.entity)

        fs_schema = writer.db_config.translate(feature_set.get_schema())
        db_schema = self._get_schema(table_name, writer.database)

        queries = self.create_query(fs_schema, table_name, db_schema,
                                    writer.write_to_entity)

        if debug_mode:
            print("#### DEBUG MODE ###\n"
                  f"Feature set: {feature_set.name}\n"
                  "Queries:\n"
                  f"{queries}")
        else:
            for q in queries:
                logger.info(f"Applying this query: {q} ...")
                self._client.sql(q)

            logger.info(f"Feature Set migration finished successfully.")

            # inform in drone console which feature set was migrated
            print(f"The {feature_set.name} feature set was migrated.")
Example #12
    def test_construct_invalid_df(self, key_id, timestamp_c, feature_add,
                                  feature_divide):
        spark_client = Mock()

        # arrange
        feature_set = FeatureSet(
            "name",
            "entity",
            "description",
            [key_id],
            timestamp_c,
            [feature_add, feature_divide],
        )

        # act and assert
        with pytest.raises(ValueError):
            _ = feature_set.construct("not a dataframe", spark_client)
Example #13
    def test_pipeline_with_hooks(self, spark_session):
        # arrange
        hook1 = AddHook(value=1)

        spark_session.sql(
            "select 1 as id, timestamp('2020-01-01') as timestamp, 0 as feature"
        ).createOrReplaceTempView("test")

        target_df = spark_session.sql(
            "select 1 as id, timestamp('2020-01-01') as timestamp, 6 as feature, 2020 "
            "as year, 1 as month, 1 as day")

        historical_writer = HistoricalFeatureStoreWriter(debug_mode=True)

        test_pipeline = FeatureSetPipeline(
            source=Source(
                readers=[
                    TableReader(
                        id="reader",
                        table="test",
                    ).add_post_hook(hook1)
                ],
                query="select * from reader",
            ).add_post_hook(hook1),
            feature_set=FeatureSet(
                name="feature_set",
                entity="entity",
                description="description",
                features=[
                    Feature(
                        name="feature",
                        description="test",
                        transformation=SQLExpressionTransform(
                            expression="feature + 1"),
                        dtype=DataType.INTEGER,
                    ),
                ],
                keys=[
                    KeyFeature(
                        name="id",
                        description="The user's Main ID or device ID",
                        dtype=DataType.INTEGER,
                    )
                ],
                timestamp=TimestampFeature(),
            ).add_pre_hook(hook1).add_post_hook(hook1),
            sink=Sink(writers=[historical_writer], ).add_pre_hook(hook1),
        )

        # act
        test_pipeline.run()
        output_df = spark_session.table(
            "historical_feature_store__feature_set")

        # assert
        output_df.show()
        assert_dataframe_equality(output_df, target_df)
Example #14
    def test_construct_with_date_boundaries(
            self, feature_set_dates_dataframe,
            feature_set_dates_output_dataframe):
        # given

        spark_client = SparkClient()

        # arrange

        feature_set = FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    dtype=DataType.FLOAT,
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        )

        output_df = (feature_set.construct(
            feature_set_dates_dataframe,
            client=spark_client,
            start_date="2016-04-11",
            end_date="2016-04-12",
        ).orderBy(feature_set.timestamp_column).select(feature_set.columns))

        target_df = feature_set_dates_output_dataframe.orderBy(
            feature_set.timestamp_column).select(feature_set.columns)

        # assert
        assert_dataframe_equality(output_df, target_df)
Example #15
    def get_db_schema(self, feature_set: FeatureSet) -> List[Dict[Any, Any]]:
        """Get desired database schema.

        Args:
            feature_set: object processed with feature set metadata.

        Returns:
            Desired database schema.

        """
        db_schema = self.db_config.translate(feature_set.get_schema())
        return db_schema
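Given a feature set like the fixture in Example #8, the call would presumably look as follows, where writer stands in for any object exposing this method and a db_config:

# Illustrative only: translate the feature set schema into the target
# database's column representation.
db_schema = writer.get_db_schema(feature_set)
for column in db_schema:
    print(column)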
Example #16
    def test_multiple_timestamps(self, feature_add, key_id, timestamp_c):
        # arrange
        name = "name"
        entity = "entity"
        description = "description"
        timestamp_c.get_output_columns = Mock(
            return_value=["timestamp1", "timestamp2"])

        # act and assert
        with pytest.raises(ValueError):
            _ = FeatureSet(name, entity, description, [key_id], timestamp_c,
                           [feature_add])
Example #17
def feature_set():
    key_features = [
        KeyFeature(name="id", description="Description", dtype=DataType.INTEGER)
    ]
    ts_feature = TimestampFeature(from_column=TIMESTAMP_COLUMN)
    features = [
        Feature(name="feature", description="Description", dtype=DataType.BIGINT,)
    ]
    return FeatureSet(
        "feature_set",
        "entity",
        "description",
        keys=key_features,
        timestamp=ts_feature,
        features=features,
    )
Example #18
    def test_duplicate_features(self, feature_add, key_id, timestamp_c):
        # arrange
        name = "name"
        entity = "entity"
        description = "description"

        # act and assert
        with pytest.raises(KeyError):
            _ = FeatureSet(
                name,
                entity,
                description,
                [key_id],
                timestamp_c,
                [feature_add, feature_add],
            )
Example #19
def feature_set_pipeline(
    spark_context, spark_session,
):

    feature_set_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(id="b_source", table="b_table",).with_incremental_strategy(
                    incremental_strategy=IncrementalStrategy(column="timestamp")
                ),
            ],
            query=f"select * from b_source ",  # noqa
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["1 day"],
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(writers=[HistoricalFeatureStoreWriter(debug_mode=True)]),
    )

    return feature_set_pipeline
Example #20
 def test_feature_without_datatype(self, key_id, timestamp_c, dataframe):
     spark_client = SparkClient()
     with pytest.raises(ValueError):
         FeatureSet(
             name="name",
             entity="entity",
             description="description",
             features=[
                 Feature(
                     name="feature1",
                     description="test",
                     transformation=SQLExpressionTransform(
                         expression="feature1 + a"),
                 ),
             ],
             keys=[key_id],
             timestamp=timestamp_c,
         ).construct(dataframe, spark_client)
Example #21
    def test__get_features_columns(self):
        # arrange
        feature_1 = Feature("feature1", "description", DataType.FLOAT)
        feature_1.get_output_columns = Mock(return_value=["col_a", "col_b"])

        feature_2 = Feature("feature2", "description", DataType.FLOAT)
        feature_2.get_output_columns = Mock(return_value=["col_c"])

        feature_3 = Feature("feature3", "description", DataType.FLOAT)
        feature_3.get_output_columns = Mock(return_value=["col_d"])

        target_features_columns = ["col_a", "col_b", "col_c", "col_d"]

        # act
        result_features_columns = FeatureSet._get_features_columns(
            feature_1, feature_2, feature_3)

        # assert
        assert target_features_columns == result_features_columns
Example #22
 def test_feature_set_with_invalid_feature(self, key_id, timestamp_c,
                                           dataframe):
     spark_client = SparkClient()
     with pytest.raises(ValueError):
         FeatureSet(
             name="name",
             entity="entity",
             description="description",
             features=[
                 Feature(
                     name="feature1",
                     description="test",
                     transformation=AggregatedTransform(
                         functions=[Function(F.avg, DataType.FLOAT)]),
                 ),
             ],
             keys=[key_id],
             timestamp=timestamp_c,
         ).construct(dataframe, spark_client)
Example #23
def feature_set():
    key_features = [
        KeyFeature(name="id",
                   description="Description",
                   dtype=DataType.INTEGER)
    ]
    ts_feature = TimestampFeature(from_column="timestamp")
    features = [
        Feature(name="feature",
                description="Description",
                dtype=DataType.FLOAT),
    ]
    return FeatureSet(
        "test_sink_feature_set",
        "test_sink_entity",
        "description",
        keys=key_features,
        timestamp=ts_feature,
        features=features,
    )
Example #24
    def test_feature_set_args(self):
        # arrange and act
        out_columns = [
            "user_id",
            "timestamp",
            "listing_page_viewed__rent_per_month__avg_over_7_days_fixed_windows",
            "listing_page_viewed__rent_per_month__avg_over_2_weeks_fixed_windows",
            "listing_page_viewed__rent_per_month__stddev_pop_over_7_days_fixed_windows",
            "listing_page_viewed__rent_per_month__"
            "stddev_pop_over_2_weeks_fixed_windows",
            # noqa
        ]
        pipeline = FeatureSetPipeline(
            source=Source(
                readers=[
                    TableReader(
                        id="source_a",
                        database="db",
                        table="table",
                    ),
                    FileReader(
                        id="source_b",
                        path="path",
                        format="parquet",
                    ),
                ],
                query="select a.*, b.specific_feature "
                "from source_a left join source_b on a.id=b.id",
            ),
            feature_set=FeatureSet(
                name="feature_set",
                entity="entity",
                description="description",
                keys=[
                    KeyFeature(
                        name="user_id",
                        description="The user's Main ID or device ID",
                        dtype=DataType.INTEGER,
                    )
                ],
                timestamp=TimestampFeature(from_column="ts"),
                features=[
                    Feature(
                        name="listing_page_viewed__rent_per_month",
                        description="Average of something.",
                        transformation=SparkFunctionTransform(functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ], ).with_window(
                            partition_by="user_id",
                            order_by=TIMESTAMP_COLUMN,
                            window_definition=["7 days", "2 weeks"],
                            mode="fixed_windows",
                        ),
                    ),
                ],
            ),
            sink=Sink(writers=[
                HistoricalFeatureStoreWriter(db_config=None),
                OnlineFeatureStoreWriter(db_config=None),
            ], ),
        )

        assert isinstance(pipeline.spark_client, SparkClient)
        assert len(pipeline.source.readers) == 2
        assert all(
            isinstance(reader, Reader) for reader in pipeline.source.readers)
        assert isinstance(pipeline.source.query, str)
        assert pipeline.feature_set.name == "feature_set"
        assert pipeline.feature_set.entity == "entity"
        assert pipeline.feature_set.description == "description"
        assert isinstance(pipeline.feature_set.timestamp, TimestampFeature)
        assert len(pipeline.feature_set.keys) == 1
        assert all(
            isinstance(k, KeyFeature) for k in pipeline.feature_set.keys)
        assert len(pipeline.feature_set.features) == 1
        assert all(
            isinstance(feature, Feature)
            for feature in pipeline.feature_set.features)
        assert pipeline.feature_set.columns == out_columns
        assert len(pipeline.sink.writers) == 2
        assert all(
            isinstance(writer, Writer) for writer in pipeline.sink.writers)
Example #25
    def test_feature_set_pipeline(
        self,
        mocked_df,
        spark_session,
        fixed_windows_output_feature_set_dataframe,
    ):
        # arrange
        table_reader_id = "a_source"
        table_reader_table = "table"
        table_reader_db = environment.get_variable(
            "FEATURE_STORE_HISTORICAL_DATABASE")
        create_temp_view(dataframe=mocked_df, name=table_reader_id)
        create_db_and_table(
            spark=spark_session,
            table_reader_id=table_reader_id,
            table_reader_db=table_reader_db,
            table_reader_table=table_reader_table,
        )

        dbconfig = Mock()
        dbconfig.mode = "overwrite"
        dbconfig.format_ = "parquet"
        dbconfig.get_options = Mock(
            return_value={"path": "test_folder/historical/entity/feature_set"})

        historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig)

        # act
        test_pipeline = FeatureSetPipeline(
            source=Source(
                readers=[
                    TableReader(
                        id=table_reader_id,
                        database=table_reader_db,
                        table=table_reader_table,
                    ),
                ],
                query=f"select * from {table_reader_id} ",  # noqa
            ),
            feature_set=FeatureSet(
                name="feature_set",
                entity="entity",
                description="description",
                features=[
                    Feature(
                        name="feature1",
                        description="test",
                        transformation=SparkFunctionTransform(functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ], ).with_window(
                            partition_by="id",
                            order_by=TIMESTAMP_COLUMN,
                            mode="fixed_windows",
                            window_definition=["2 minutes", "15 minutes"],
                        ),
                    ),
                    Feature(
                        name="divided_feature",
                        description="unit test",
                        dtype=DataType.FLOAT,
                        transformation=CustomTransform(
                            transformer=divide,
                            column1="feature1",
                            column2="feature2",
                        ),
                    ),
                ],
                keys=[
                    KeyFeature(
                        name="id",
                        description="The user's Main ID or device ID",
                        dtype=DataType.INTEGER,
                    )
                ],
                timestamp=TimestampFeature(),
            ),
            sink=Sink(writers=[historical_writer]),
        )
        test_pipeline.run()

        # assert
        path = dbconfig.get_options("historical/entity/feature_set").get(
            "path")
        df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN)

        target_df = fixed_windows_output_feature_set_dataframe.orderBy(
            test_pipeline.feature_set.timestamp_column)

        # assert
        assert_dataframe_equality(df, target_df)

        # tear down
        shutil.rmtree("test_folder")
Example #26
    def test_pipeline_interval_run(self, mocked_date_df,
                                   pipeline_interval_run_target_dfs,
                                   spark_session):
        """Testing pipeline's idempotent interval run feature.
        Source data:
        +-------+---+-------------------+-------------------+
        |feature| id|                 ts|          timestamp|
        +-------+---+-------------------+-------------------+
        |    200|  1|2016-04-11 11:31:11|2016-04-11 11:31:11|
        |    300|  1|2016-04-12 11:44:12|2016-04-12 11:44:12|
        |    400|  1|2016-04-13 11:46:24|2016-04-13 11:46:24|
        |    500|  1|2016-04-14 12:03:21|2016-04-14 12:03:21|
        +-------+---+-------------------+-------------------+
        The test executes 3 runs for different time intervals. The input data has 4 data
        points: 2016-04-11, 2016-04-12, 2016-04-13 and 2016-04-14. The run
        specifications are as follows:
        1)  Interval: from 2016-04-11 to 2016-04-13
            Target table result:
            +---+-------+---+-----+------+-------------------+----+
            |day|feature| id|month|run_id|          timestamp|year|
            +---+-------+---+-----+------+-------------------+----+
            | 11|    200|  1|    4|     1|2016-04-11 11:31:11|2016|
            | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
            | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
            +---+-------+---+-----+------+-------------------+----+
        2)  Interval: only 2016-04-14.
            Target table result:
            +---+-------+---+-----+------+-------------------+----+
            |day|feature| id|month|run_id|          timestamp|year|
            +---+-------+---+-----+------+-------------------+----+
            | 11|    200|  1|    4|     1|2016-04-11 11:31:11|2016|
            | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
            | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
            | 14|    500|  1|    4|     2|2016-04-14 12:03:21|2016|
            +---+-------+---+-----+------+-------------------+----+
        3)  Interval: only 2016-04-11.
            Target table result:
            +---+-------+---+-----+------+-------------------+----+
            |day|feature| id|month|run_id|          timestamp|year|
            +---+-------+---+-----+------+-------------------+----+
            | 11|    200|  1|    4|     3|2016-04-11 11:31:11|2016|
            | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
            | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
            | 14|    500|  1|    4|     2|2016-04-14 12:03:21|2016|
            +---+-------+---+-----+------+-------------------+----+
        """
        # arrange
        create_temp_view(dataframe=mocked_date_df, name="input_data")

        db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE")
        path = "test_folder/historical/entity/feature_set"

        spark_session.conf.set("spark.sql.sources.partitionOverwriteMode",
                               "dynamic")
        spark_session.sql(f"create database if not exists {db}")
        spark_session.sql(
            f"create table if not exists {db}.feature_set_interval "
            f"(id int, timestamp timestamp, feature int, "
            f"run_id int, year int, month int, day int);")

        dbconfig = MetastoreConfig()
        dbconfig.get_options = Mock(return_value={
            "mode": "overwrite",
            "format_": "parquet",
            "path": path
        })

        historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig,
                                                         interval_mode=True)

        first_run_hook = RunHook(id=1)
        second_run_hook = RunHook(id=2)
        third_run_hook = RunHook(id=3)

        (
            first_run_target_df,
            second_run_target_df,
            third_run_target_df,
        ) = pipeline_interval_run_target_dfs

        test_pipeline = FeatureSetPipeline(
            source=Source(
                readers=[
                    TableReader(
                        id="id",
                        table="input_data",
                    ).with_incremental_strategy(IncrementalStrategy("ts")),
                ],
                query="select * from id ",
            ),
            feature_set=FeatureSet(
                name="feature_set_interval",
                entity="entity",
                description="",
                keys=[
                    KeyFeature(
                        name="id",
                        description="",
                        dtype=DataType.INTEGER,
                    )
                ],
                timestamp=TimestampFeature(from_column="ts"),
                features=[
                    Feature(name="feature",
                            description="",
                            dtype=DataType.INTEGER),
                    Feature(name="run_id",
                            description="",
                            dtype=DataType.INTEGER),
                ],
            ),
            sink=Sink([historical_writer], ),
        )

        # act and assert
        dbconfig.get_path_with_partitions = Mock(return_value=[
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=11",
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=12",
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=13",
        ])
        test_pipeline.feature_set.add_pre_hook(first_run_hook)
        test_pipeline.run(end_date="2016-04-13", start_date="2016-04-11")
        first_run_output_df = spark_session.read.parquet(path)
        assert_dataframe_equality(first_run_output_df, first_run_target_df)

        dbconfig.get_path_with_partitions = Mock(return_value=[
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=14",
        ])
        test_pipeline.feature_set.add_pre_hook(second_run_hook)
        test_pipeline.run_for_date("2016-04-14")
        second_run_output_df = spark_session.read.parquet(path)
        assert_dataframe_equality(second_run_output_df, second_run_target_df)

        dbconfig.get_path_with_partitions = Mock(return_value=[
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=11",
        ])
        test_pipeline.feature_set.add_pre_hook(third_run_hook)
        test_pipeline.run_for_date("2016-04-11")
        third_run_output_df = spark_session.read.parquet(path)
        assert_dataframe_equality(third_run_output_df, third_run_target_df)

        # tear down
        shutil.rmtree("test_folder")
Example #27
    def __init__(self):
        super(AwesomeDatasetPipeline, self).__init__(
            source=Source(
                readers=[
                    FileReader(
                        id="order_events",
                        path="data/order_events/input.csv",
                        format="csv",
                        format_options={"header": True},
                    ),
                    FileReader(
                        id="user_chargebacks",
                        path="data/feature_store/historical/user/user_chargebacks",
                        format="parquet",
                    ),
                    FileReader(
                        id="user_orders",
                        path="data/feature_store/historical/user/user_orders",
                        format="parquet",
                    ),
                ],
                query="""
with feature_sets_merge as(
    select
        user_orders.cpf,
        user_orders.timestamp,
        user_chargebacks.timestamp as chargeback_timestamp,
        cpf_orders__count_over_3_days_rolling_windows,
        cpf_orders__count_over_7_days_rolling_windows,
        cpf_orders__count_over_30_days_rolling_windows,
        cpf_chargebacks__count_over_3_days_rolling_windows,
        cpf_chargebacks__count_over_7_days_rolling_windows,
        cpf_chargebacks__count_over_30_days_rolling_windows,
        row_number() over (
            partition by (user_orders.cpf, user_orders.timestamp)
            order by user_chargebacks.timestamp desc
        ) as rn
    from
        user_orders
        left join user_chargebacks
            on  user_orders.cpf = user_chargebacks.cpf
            and user_orders.timestamp >= user_chargebacks.timestamp
),
feature_sets_rn_filter as(
    select
        *
    from
        feature_sets_merge
    where
        rn = 1
),
orders_with_feature_sets as(
    select
        order_events.order_id,
        timestamp(order_events.order_timestamp) as timestamp,
        timestamp(order_events.chargeback_timestamp) as chargeback_timestamp,
        order_events.cpf,
        feature_sets_rn_filter.cpf_orders__count_over_3_days_rolling_windows,
        feature_sets_rn_filter.cpf_orders__count_over_7_days_rolling_windows,
        feature_sets_rn_filter.cpf_orders__count_over_30_days_rolling_windows,
        feature_sets_rn_filter.cpf_chargebacks__count_over_3_days_rolling_windows,
        feature_sets_rn_filter.cpf_chargebacks__count_over_7_days_rolling_windows,
        feature_sets_rn_filter.cpf_chargebacks__count_over_30_days_rolling_windows,
        row_number() over (
            partition by (order_events.cpf, order_events.order_timestamp)
            order by feature_sets_rn_filter.timestamp desc
        ) as rn
    from
        order_events
        join feature_sets_rn_filter
            on order_events.cpf = feature_sets_rn_filter.cpf
            and timestamp(order_events.order_timestamp) >=
            feature_sets_rn_filter.timestamp
)
select
    order_id,
    timestamp,
    chargeback_timestamp,
    cpf,
    cpf_orders__count_over_3_days_rolling_windows,
    cpf_orders__count_over_7_days_rolling_windows,
    cpf_orders__count_over_30_days_rolling_windows,
    coalesce(
        cpf_chargebacks__count_over_3_days_rolling_windows,
    0) as cpf_chargebacks__count_over_3_days_rolling_windows,
    coalesce(
        cpf_chargebacks__count_over_7_days_rolling_windows,
    0) as cpf_chargebacks__count_over_7_days_rolling_windows,
    coalesce(
        cpf_chargebacks__count_over_30_days_rolling_windows,
    0) as cpf_chargebacks__count_over_30_days_rolling_windows
from
    orders_with_feature_sets
where
    rn = 1
                """,
            ),
            feature_set=FeatureSet(
                name="awesome_dataset",
                entity="user",
                description="Dataset enriching orders events with aggregated features "
                "on total of orders and chargebacks by user.",
                keys=[
                    KeyFeature(
                        name="order_id",
                        description="Orders unique identifier.",
                        dtype=DataType.STRING,
                    )
                ],
                timestamp=TimestampFeature(),
                features=[
                    Feature(
                        name="chargeback_timestamp",
                        description="Timestamp for the order creation.",
                        dtype=DataType.TIMESTAMP,
                    ),
                    Feature(
                        name="cpf",
                        description="User unique identifier, user entity key.",
                        dtype=DataType.STRING,
                    ),
                    Feature(
                        name="cpf_orders__count_over_3_days_rolling_windows",
                        description="Count of orders over 3 days rolling windows group "
                        "by user (identified by CPF)",
                        dtype=DataType.INTEGER,
                    ),
                    Feature(
                        name="cpf_orders__count_over_7_days_rolling_windows",
                        description="Count of orders over 7 days rolling windows group "
                        "by user (identified by CPF)",
                        dtype=DataType.INTEGER,
                    ),
                    Feature(
                        name="cpf_orders__count_over_30_days_rolling_windows",
                        description="Count of orders over 30 days rolling windows group"
                        " by user (identified by CPF)",
                        dtype=DataType.INTEGER,
                    ),
                    Feature(
                        name="cpf_chargebacks__count_over_3_days_rolling_windows",
                        description="Count of chargebacks over 3 days rolling windows "
                        "group by user (identified by CPF)",
                        dtype=DataType.INTEGER,
                    ),
                    Feature(
                        name="cpf_chargebacks__count_over_7_days_rolling_windows",
                        description="Count of chargebacks over 7 days rolling windows "
                        "group by user (identified by CPF)",
                        dtype=DataType.INTEGER,
                    ),
                    Feature(
                        name="cpf_chargebacks__count_over_30_days_rolling_windows",
                        description="Count of chargebacks over 30 days rolling windows "
                        "group by user (identified by CPF)",
                        dtype=DataType.INTEGER,
                    ),
                ],
            ),
            sink=Sink(writers=[DatasetWriter()]),
        )
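Like FirstPipeline in Example #10, this dataset pipeline would presumably be executed with a plain run call:

# Illustrative usage only, not part of the original snippet.
AwesomeDatasetPipeline().run()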
Example #28
 def test_cannot_instantiate(self, name, entity, description, keys,
                             timestamp, features):
     # act and assert
     with pytest.raises(ValueError):
         FeatureSet(name, entity, description, keys, timestamp, features)