Code Example #1
def test_get_variable_out_of_spec_fails(monkeypatch):
    # given
    not_specified_variable = "not_specified_variable"
    monkeypatch.setenv(not_specified_variable, "anything")
    if not_specified_variable in environment.specification:
        del environment.specification[not_specified_variable]

    # then
    with pytest.raises(environment.UnspecifiedVariableError,
                       match="not listed in the environment"):
        environment.get_variable(not_specified_variable, "anything")
Code Example #2
 def kafka_connection_string(self, value: str) -> None:
     input_value = value or environment.get_variable(
         "KAFKA_CONSUMER_CONNECTION_STRING")
     if input_value is None:
         raise ValueError(
             "Config 'kafka connection string' cannot be empty.")
     self.__kafka_connection_string = input_value
Code Example #3
 def __init__(
     self,
     id: str,
     topic: str,
     value_schema: StructType,
     connection_string: str = None,
     topic_options: dict = None,
     stream: bool = True,
 ):
     super().__init__(id)
     if not isinstance(topic, str):
         raise ValueError("topic must be a string with the topic name")
     if not isinstance(value_schema, StructType):
         raise ValueError(
             "value_schema must be a StructType with the schema "
             'of the JSON presented in "value" Kafka column')
     self.topic = topic
     self.value_schema = value_schema
     self.connection_string = connection_string or environment.get_variable(
         "KAFKA_CONSUMER_CONNECTION_STRING")
     self.options = dict(
         {
             "kafka.bootstrap.servers": self.connection_string,
             "subscribe": self.topic,
         },
         **topic_options if topic_options else {},
     )
     self.stream = stream
Code Example #4
 def __init__(
     self,
     database: str = None,
 ) -> None:
     self._db_config = MetastoreConfig()
     self.database = database or environment.get_variable(
         "FEATURE_STORE_HISTORICAL_DATABASE")
     super(MetastoreMigration, self).__init__(SparkClient())
Code Example #5
def test_get_variable_success(monkeypatch):
    # given
    specified_variable = "specified_variable"
    effective_value = "effective_value"
    monkeypatch.setenv(specified_variable, effective_value)
    environment.specification[specified_variable] = "spec_default_value"

    # when
    return_value = environment.get_variable(specified_variable, "anything")

    # then
    assert return_value == effective_value
Code Example #6
def test_get_variable_default(monkeypatch):
    # given
    default = "default_value"
    variable = "environment_variable"
    environment.specification[variable] = None
    monkeypatch.setenv(variable, "overwrite")
    monkeypatch.delenv(variable)

    # when
    return_value = environment.get_variable(variable, default)

    # then
    assert return_value == default
Code Example #7
def test_get_variable_from_spec_default(monkeypatch):
    # given
    specified_variable = "specified_variable"
    spec_default_value = "default_value"
    monkeypatch.setenv(specified_variable, "overwrite")
    monkeypatch.delenv(specified_variable)
    environment.specification[specified_variable] = spec_default_value

    # when
    return_value = environment.get_variable(specified_variable, "anything")

    # then
    assert return_value == spec_default_value
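
Read together, the tests in examples #1 and #5-#7 pin down the contract of environment.get_variable: a value set in the process environment wins, an unset variable falls back to the default recorded in environment.specification, the explicit default argument is used only when both are missing, and asking for a variable that is not listed in the specification raises UnspecifiedVariableError. Below is a minimal sketch of a module that satisfies that contract; it illustrates the behaviour the tests exercise rather than butterfree's actual implementation, and the entries shown in specification are assumptions.

import os
from typing import Optional

# Illustrative specification: maps the variable names a project allows to their
# fallback values (None means "no built-in fallback"). Not butterfree's real spec.
specification = {
    "FEATURE_STORE_S3_BUCKET": None,
    "FEATURE_STORE_HISTORICAL_DATABASE": "test",
    "KAFKA_CONSUMER_CONNECTION_STRING": None,
}


class UnspecifiedVariableError(RuntimeError):
    """Raised when a requested variable is not listed in the environment specification."""

    def __init__(self, variable_name: str) -> None:
        super().__init__(
            f"Variable '{variable_name}' is not listed in the environment specification."
        )


def get_variable(variable_name: str, default_value: Optional[str] = None) -> Optional[str]:
    """Resolve a variable: process environment first, then the spec default, then default_value."""
    if variable_name not in specification:
        raise UnspecifiedVariableError(variable_name)
    return os.getenv(variable_name) or specification[variable_name] or default_value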
Code Example #8
 def __init__(
     self,
     db_config=None,
     database=None,
     num_partitions=None,
     validation_threshold: float = DEFAULT_VALIDATION_THRESHOLD,
     debug_mode: bool = False,
 ):
     self.db_config = db_config or S3Config()
     self.database = database or environment.get_variable(
         "FEATURE_STORE_HISTORICAL_DATABASE"
     )
     self.num_partitions = num_partitions or DEFAULT_NUM_PARTITIONS
     self.validation_threshold = validation_threshold
     self.debug_mode = debug_mode
Code Example #9
 def __init__(
     self,
     db_config: Union[AbstractWriteConfig, MetastoreConfig] = None,
     database: str = None,
     num_partitions: int = None,
     validation_threshold: float = DEFAULT_VALIDATION_THRESHOLD,
     debug_mode: bool = False,
 ):
     super(HistoricalFeatureStoreWriter, self).__init__()
     self.db_config = db_config or MetastoreConfig()
     self.database = database or environment.get_variable(
         "FEATURE_STORE_HISTORICAL_DATABASE")
     self.num_partitions = num_partitions or DEFAULT_NUM_PARTITIONS
     self.validation_threshold = validation_threshold
     self.debug_mode = debug_mode
Code Example #10
File: migrate.py  Project: quintoandar/butterfree
    def _send_logs_to_s3(self, file_local: bool, debug_mode: bool) -> None:
        """Send all migration logs to S3."""
        file_name = "../logging.json"

        if not file_local and os.path.exists(file_name):
            s3_client = boto3.client("s3")

            timestamp = datetime.datetime.now()

            if debug_mode:
                object_name = (
                    f"logs/migrate-debug-mode/"
                    f"{timestamp.strftime('%Y-%m-%d')}"
                    f"/logging-{timestamp.strftime('%H:%M:%S')}.json")
            else:
                object_name = (
                    f"logs/migrate/"
                    f"{timestamp.strftime('%Y-%m-%d')}"
                    f"/logging-{timestamp.strftime('%H:%M:%S')}.json")
            bucket = environment.get_variable("FEATURE_STORE_S3_BUCKET")

            try:
                s3_client.upload_file(
                    file_name,
                    bucket,
                    object_name,
                    ExtraArgs={"ACL": "bucket-owner-full-control"},
                )
            except ClientError:
                raise

            os.remove(file_name)
        elif os.path.exists(file_name):
            print("Logs written to ../logging.json")
        else:
            print("No logs were generated.")
Code Example #11
 def test_bucket(self, s3_config):
     # expecting
     default = environment.get_variable("FEATURE_STORE_S3_BUCKET")
     assert s3_config.bucket == default
Code Example #12
 def stream_checkpoint_path(self, value: str) -> None:
     self.__stream_checkpoint_path = value or environment.get_variable(
         "STREAM_CHECKPOINT_PATH")
Code Example #13
 def path(self, value: str) -> None:
     self.__path = value or environment.get_variable("FEATURE_STORE_S3_BUCKET")
Code Example #14
 def write_consistency_level(self, value: str) -> None:
     self.__write_consistency_level = value or environment.get_variable(
         "CASSANDRA_WRITE_CONSISTENCY_LEVEL", "LOCAL_QUORUM")
Code Example #15
 def local_dc(self, value: str) -> None:
     self.__local_dc = value or environment.get_variable(
         "CASSANDRA_LOCAL_DC")
Code Example #16
    def test_pipeline_interval_run(self, mocked_date_df,
                                   pipeline_interval_run_target_dfs,
                                   spark_session):
        """Testing pipeline's idempotent interval run feature.
        Source data:
        +-------+---+-------------------+-------------------+
        |feature| id|                 ts|          timestamp|
        +-------+---+-------------------+-------------------+
        |    200|  1|2016-04-11 11:31:11|2016-04-11 11:31:11|
        |    300|  1|2016-04-12 11:44:12|2016-04-12 11:44:12|
        |    400|  1|2016-04-13 11:46:24|2016-04-13 11:46:24|
        |    500|  1|2016-04-14 12:03:21|2016-04-14 12:03:21|
        +-------+---+-------------------+-------------------+
        The test executes 3 runs for different time intervals. The input data has 4 data
        points: 2016-04-11, 2016-04-12, 2016-04-13 and 2016-04-14. The following run
        specifications are:
        1)  Interval: from 2016-04-11 to 2016-04-13
            Target table result:
            +---+-------+---+-----+------+-------------------+----+
            |day|feature| id|month|run_id|          timestamp|year|
            +---+-------+---+-----+------+-------------------+----+
            | 11|    200|  1|    4|     1|2016-04-11 11:31:11|2016|
            | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
            | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
            +---+-------+---+-----+------+-------------------+----+
        2)  Interval: only 2016-04-14.
            Target table result:
            +---+-------+---+-----+------+-------------------+----+
            |day|feature| id|month|run_id|          timestamp|year|
            +---+-------+---+-----+------+-------------------+----+
            | 11|    200|  1|    4|     1|2016-04-11 11:31:11|2016|
            | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
            | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
            | 14|    500|  1|    4|     2|2016-04-14 12:03:21|2016|
            +---+-------+---+-----+------+-------------------+----+
        3)  Interval: only 2016-04-11.
            Target table result:
            +---+-------+---+-----+------+-------------------+----+
            |day|feature| id|month|run_id|          timestamp|year|
            +---+-------+---+-----+------+-------------------+----+
            | 11|    200|  1|    4|     3|2016-04-11 11:31:11|2016|
            | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
            | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
            | 14|    500|  1|    4|     2|2016-04-14 12:03:21|2016|
            +---+-------+---+-----+------+-------------------+----+
        """
        # arrange
        create_temp_view(dataframe=mocked_date_df, name="input_data")

        db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE")
        path = "test_folder/historical/entity/feature_set"

        spark_session.conf.set("spark.sql.sources.partitionOverwriteMode",
                               "dynamic")
        spark_session.sql(f"create database if not exists {db}")
        spark_session.sql(
            f"create table if not exists {db}.feature_set_interval "
            f"(id int, timestamp timestamp, feature int, "
            f"run_id int, year int, month int, day int);")

        dbconfig = MetastoreConfig()
        dbconfig.get_options = Mock(return_value={
            "mode": "overwrite",
            "format_": "parquet",
            "path": path
        })

        historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig,
                                                         interval_mode=True)

        first_run_hook = RunHook(id=1)
        second_run_hook = RunHook(id=2)
        third_run_hook = RunHook(id=3)

        (
            first_run_target_df,
            second_run_target_df,
            third_run_target_df,
        ) = pipeline_interval_run_target_dfs

        test_pipeline = FeatureSetPipeline(
            source=Source(
                readers=[
                    TableReader(
                        id="id",
                        table="input_data",
                    ).with_incremental_strategy(IncrementalStrategy("ts")),
                ],
                query="select * from id ",
            ),
            feature_set=FeatureSet(
                name="feature_set_interval",
                entity="entity",
                description="",
                keys=[
                    KeyFeature(
                        name="id",
                        description="",
                        dtype=DataType.INTEGER,
                    )
                ],
                timestamp=TimestampFeature(from_column="ts"),
                features=[
                    Feature(name="feature",
                            description="",
                            dtype=DataType.INTEGER),
                    Feature(name="run_id",
                            description="",
                            dtype=DataType.INTEGER),
                ],
            ),
            sink=Sink([historical_writer], ),
        )

        # act and assert
        dbconfig.get_path_with_partitions = Mock(return_value=[
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=11",
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=12",
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=13",
        ])
        test_pipeline.feature_set.add_pre_hook(first_run_hook)
        test_pipeline.run(end_date="2016-04-13", start_date="2016-04-11")
        first_run_output_df = spark_session.read.parquet(path)
        assert_dataframe_equality(first_run_output_df, first_run_target_df)

        dbconfig.get_path_with_partitions = Mock(return_value=[
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=14",
        ])
        test_pipeline.feature_set.add_pre_hook(second_run_hook)
        test_pipeline.run_for_date("2016-04-14")
        second_run_output_df = spark_session.read.parquet(path)
        assert_dataframe_equality(second_run_output_df, second_run_target_df)

        dbconfig.get_path_with_partitions = Mock(return_value=[
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=11",
        ])
        test_pipeline.feature_set.add_pre_hook(third_run_hook)
        test_pipeline.run_for_date("2016-04-11")
        third_run_output_df = spark_session.read.parquet(path)
        assert_dataframe_equality(third_run_output_df, third_run_target_df)

        # tear down
        shutil.rmtree("test_folder")
Code Example #17
 def read_consistency_level(self, value: str) -> None:
     self.__read_consistency_level = value or environment.get_variable(
         "CASSANDRA_READ_CONSISTENCY_LEVEL", "LOCAL_ONE")
Code Example #18
 def test_path(self, metastore_config):
     # expecting
     default = environment.get_variable("FEATURE_STORE_S3_BUCKET")
     assert metastore_config.path == default
Code Example #19
 def bucket(self, value: str):
     self.__bucket = value or environment.get_variable(
         "FEATURE_STORE_S3_BUCKET")
Code Example #20
 def host(self, value: str) -> None:
     input_value = value or environment.get_variable("CASSANDRA_HOST")
     if input_value is None:
         raise ValueError("Config 'host' cannot be empty.")
     self.__host = input_value
Code Example #21
 def password(self, value: str) -> None:
     input_value = value or environment.get_variable("CASSANDRA_PASSWORD")
     if input_value is None:
         raise ValueError("Config 'password' cannot be empty.")
     self.__password = input_value
Code Example #22
 def username(self, value: str) -> None:
     input_value = value or environment.get_variable("CASSANDRA_USERNAME")
     if input_value is None:
         raise ValueError("Config 'username' cannot be empty.")
     self.__username = input_value
Code Example #23
    def test_feature_set_pipeline(
        self,
        mocked_df,
        spark_session,
        fixed_windows_output_feature_set_dataframe,
    ):
        # arrange
        table_reader_id = "a_source"
        table_reader_table = "table"
        table_reader_db = environment.get_variable(
            "FEATURE_STORE_HISTORICAL_DATABASE")
        create_temp_view(dataframe=mocked_df, name=table_reader_id)
        create_db_and_table(
            spark=spark_session,
            table_reader_id=table_reader_id,
            table_reader_db=table_reader_db,
            table_reader_table=table_reader_table,
        )

        dbconfig = Mock()
        dbconfig.mode = "overwrite"
        dbconfig.format_ = "parquet"
        dbconfig.get_options = Mock(
            return_value={"path": "test_folder/historical/entity/feature_set"})

        historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig)

        # act
        test_pipeline = FeatureSetPipeline(
            source=Source(
                readers=[
                    TableReader(
                        id=table_reader_id,
                        database=table_reader_db,
                        table=table_reader_table,
                    ),
                ],
                query=f"select * from {table_reader_id} ",  # noqa
            ),
            feature_set=FeatureSet(
                name="feature_set",
                entity="entity",
                description="description",
                features=[
                    Feature(
                        name="feature1",
                        description="test",
                        transformation=SparkFunctionTransform(functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ], ).with_window(
                            partition_by="id",
                            order_by=TIMESTAMP_COLUMN,
                            mode="fixed_windows",
                            window_definition=["2 minutes", "15 minutes"],
                        ),
                    ),
                    Feature(
                        name="divided_feature",
                        description="unit test",
                        dtype=DataType.FLOAT,
                        transformation=CustomTransform(
                            transformer=divide,
                            column1="feature1",
                            column2="feature2",
                        ),
                    ),
                ],
                keys=[
                    KeyFeature(
                        name="id",
                        description="The user's Main ID or device ID",
                        dtype=DataType.INTEGER,
                    )
                ],
                timestamp=TimestampFeature(),
            ),
            sink=Sink(writers=[historical_writer]),
        )
        test_pipeline.run()

        # assert
        path = dbconfig.get_options("historical/entity/feature_set").get(
            "path")
        df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN)

        target_df = fixed_windows_output_feature_set_dataframe.orderBy(
            test_pipeline.feature_set.timestamp_column)

        # assert
        assert_dataframe_equality(df, target_df)

        # tear down
        shutil.rmtree("test_folder")
Code Example #24
 def keyspace(self, value: str) -> None:
     input_value = value or environment.get_variable("CASSANDRA_KEYSPACE")
     if not input_value:
         raise ValueError("Config 'keyspace' cannot be empty.")
     self.__keyspace = input_value
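
All of the constructors and setters above share one fallback idiom: an explicitly passed value takes precedence, and environment.get_variable fills the gap from the process environment, the specification default, or a hard-coded fallback such as "LOCAL_QUORUM" in examples #14 and #17; several of them then reject an empty result with a ValueError. Below is a self-contained sketch of that idiom, assuming the environment module behaves as in the tests above; ExampleConfig is a toy class written for illustration, not part of butterfree, and the import path is an assumption.

import os

from butterfree.configs import environment  # assumed import path


class ExampleConfig:
    """Toy config class illustrating the value-or-environment fallback idiom."""

    def __init__(self, bucket: str = None) -> None:
        # An explicit argument wins; otherwise fall back to the environment/spec value.
        self.bucket = bucket or environment.get_variable("FEATURE_STORE_S3_BUCKET")
        if self.bucket is None:
            raise ValueError("Config 'bucket' cannot be empty.")


os.environ["FEATURE_STORE_S3_BUCKET"] = "s3a://my-feature-store"
print(ExampleConfig().bucket)                  # s3a://my-feature-store
print(ExampleConfig("s3a://override").bucket)  # s3a://override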