def test_get_variable_out_of_spec_fails(monkeypatch):
    # given
    not_specified_variable = "not_specified_variable"
    monkeypatch.setenv(not_specified_variable, "anything")
    if not_specified_variable in environment.specification:
        del environment.specification[not_specified_variable]

    # then
    with pytest.raises(
        environment.UnspecifiedVariableError,
        match="not listed in the environment",
    ):
        environment.get_variable(not_specified_variable, "anything")
def kafka_connection_string(self, value: str) -> None:
    input_value = value or environment.get_variable(
        "KAFKA_CONSUMER_CONNECTION_STRING"
    )
    if input_value is None:
        raise ValueError("Config 'kafka connection string' cannot be empty.")
    self.__kafka_connection_string = input_value
def __init__(
    self,
    id: str,
    topic: str,
    value_schema: StructType,
    connection_string: str = None,
    topic_options: dict = None,
    stream: bool = True,
):
    super().__init__(id)
    if not isinstance(topic, str):
        raise ValueError("topic must be a string with the topic name")
    if not isinstance(value_schema, StructType):
        raise ValueError(
            "value_schema must be a StructType with the schema "
            'of the JSON presented in "value" Kafka column'
        )
    self.topic = topic
    self.value_schema = value_schema
    self.connection_string = connection_string or environment.get_variable(
        "KAFKA_CONSUMER_CONNECTION_STRING"
    )
    self.options = dict(
        {
            "kafka.bootstrap.servers": self.connection_string,
            "subscribe": self.topic,
        },
        **topic_options if topic_options else {},
    )
    self.stream = stream
def __init__(
    self,
    database: str = None,
) -> None:
    self._db_config = MetastoreConfig()
    self.database = database or environment.get_variable(
        "FEATURE_STORE_HISTORICAL_DATABASE"
    )
    super(MetastoreMigration, self).__init__(SparkClient())
def test_get_variable_success(monkeypatch):
    # given
    specified_variable = "specified_variable"
    effective_value = "effective_value"
    monkeypatch.setenv(specified_variable, effective_value)
    environment.specification[specified_variable] = "spec_default_value"

    # when
    return_value = environment.get_variable(specified_variable, "anything")

    # then
    assert return_value == effective_value
def test_get_variable_default(monkeypatch):
    # given
    default = "default_value"
    variable = "environment_variable"
    environment.specification[variable] = None
    monkeypatch.setenv(variable, "overwrite")
    monkeypatch.delenv(variable)

    # when
    return_value = environment.get_variable(variable, default)

    # then
    assert return_value == default
def test_get_variable_from_spec_default(monkeypatch):
    # given
    specified_variable = "specified_variable"
    spec_default_value = "default_value"
    monkeypatch.setenv(specified_variable, "overwrite")
    monkeypatch.delenv(specified_variable)
    environment.specification[specified_variable] = spec_default_value

    # when
    return_value = environment.get_variable(specified_variable, "anything")

    # then
    assert return_value == spec_default_value
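Read together, the four get_variable tests above pin down the helper's contract: a variable must be declared in environment.specification; the process environment wins when set; otherwise the spec-level default applies; and the caller's fallback is used only when both are missing. A minimal sketch of a module satisfying those tests (an assumed shape, not necessarily the real implementation):

import os

specification = {}  # variable name -> spec-level default (or None)


class UnspecifiedVariableError(Exception):
    """Raised when a requested variable is not listed in the specification."""


def get_variable(variable_name: str, default: str = None) -> str:
    # Only variables declared in the specification may be read.
    if variable_name not in specification:
        raise UnspecifiedVariableError(
            f"Variable '{variable_name}' is not listed in the environment "
            f"specification."
        )
    # Resolution order: process environment, then spec default, then fallback.
    return os.getenv(variable_name) or specification[variable_name] or default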
def __init__(
    self,
    db_config=None,
    database=None,
    num_partitions=None,
    validation_threshold: float = DEFAULT_VALIDATION_THRESHOLD,
    debug_mode: bool = False,
):
    self.db_config = db_config or S3Config()
    self.database = database or environment.get_variable(
        "FEATURE_STORE_HISTORICAL_DATABASE"
    )
    self.num_partitions = num_partitions or DEFAULT_NUM_PARTITIONS
    self.validation_threshold = validation_threshold
    self.debug_mode = debug_mode
def __init__(
    self,
    db_config: Union[AbstractWriteConfig, MetastoreConfig] = None,
    database: str = None,
    num_partitions: int = None,
    validation_threshold: float = DEFAULT_VALIDATION_THRESHOLD,
    debug_mode: bool = False,
):
    super(HistoricalFeatureStoreWriter, self).__init__()
    self.db_config = db_config or MetastoreConfig()
    self.database = database or environment.get_variable(
        "FEATURE_STORE_HISTORICAL_DATABASE"
    )
    self.num_partitions = num_partitions or DEFAULT_NUM_PARTITIONS
    self.validation_threshold = validation_threshold
    self.debug_mode = debug_mode
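For context, a hypothetical usage of the writer above; the import path is an assumption about the package layout. With no arguments, db_config falls back to MetastoreConfig() and the database name is resolved from FEATURE_STORE_HISTORICAL_DATABASE:

import os

# Hypothetical setup: set the variable so the env fallback is observable.
os.environ["FEATURE_STORE_HISTORICAL_DATABASE"] = "feature_store"

from butterfree.load.writers import HistoricalFeatureStoreWriter  # assumed path

writer = HistoricalFeatureStoreWriter()  # db_config defaults to MetastoreConfig()
print(writer.database)  # -> "feature_store"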
def _send_logs_to_s3(self, file_local: bool, debug_mode: bool) -> None:
    """Send all migration logs to S3."""
    file_name = "../logging.json"
    if not file_local and os.path.exists(file_name):
        s3_client = boto3.client("s3")

        timestamp = datetime.datetime.now()
        if debug_mode:
            object_name = (
                f"logs/migrate-debug-mode/"
                f"{timestamp.strftime('%Y-%m-%d')}"
                f"/logging-{timestamp.strftime('%H:%M:%S')}.json"
            )
        else:
            object_name = (
                f"logs/migrate/"
                f"{timestamp.strftime('%Y-%m-%d')}"
                f"/logging-{timestamp.strftime('%H:%M:%S')}.json"
            )
        bucket = environment.get_variable("FEATURE_STORE_S3_BUCKET")

        try:
            s3_client.upload_file(
                file_name,
                bucket,
                object_name,
                ExtraArgs={"ACL": "bucket-owner-full-control"},
            )
        except ClientError:
            raise

        os.remove(file_name)
    elif os.path.exists(file_name):
        print("Logs written to ../logging.json")
    else:
        print("No logs were generated.")
def test_bucket(self, s3_config):
    # expecting
    default = environment.get_variable("FEATURE_STORE_S3_BUCKET")
    assert s3_config.bucket == default
def stream_checkpoint_path(self, value: str) -> None:
    self.__stream_checkpoint_path = value or environment.get_variable(
        "STREAM_CHECKPOINT_PATH"
    )
def path(self, value: str) -> None:
    self.__path = value or environment.get_variable("FEATURE_STORE_S3_BUCKET")
def write_consistency_level(self, value: str) -> None:
    self.__write_consistency_level = value or environment.get_variable(
        "CASSANDRA_WRITE_CONSISTENCY_LEVEL", "LOCAL_QUORUM"
    )
def local_dc(self, value: str) -> None:
    self.__local_dc = value or environment.get_variable("CASSANDRA_LOCAL_DC")
def test_pipeline_interval_run(
    self, mocked_date_df, pipeline_interval_run_target_dfs, spark_session
):
    """Testing pipeline's idempotent interval run feature.

    Source data:
    +-------+---+-------------------+-------------------+
    |feature| id|                 ts|          timestamp|
    +-------+---+-------------------+-------------------+
    |    200|  1|2016-04-11 11:31:11|2016-04-11 11:31:11|
    |    300|  1|2016-04-12 11:44:12|2016-04-12 11:44:12|
    |    400|  1|2016-04-13 11:46:24|2016-04-13 11:46:24|
    |    500|  1|2016-04-14 12:03:21|2016-04-14 12:03:21|
    +-------+---+-------------------+-------------------+

    The test executes 3 runs for different time intervals. The input data has
    4 data points: 2016-04-11, 2016-04-12, 2016-04-13 and 2016-04-14. The
    run specifications are:

    1) Interval: from 2016-04-11 to 2016-04-13
    Target table result:
    +---+-------+---+-----+------+-------------------+----+
    |day|feature| id|month|run_id|          timestamp|year|
    +---+-------+---+-----+------+-------------------+----+
    | 11|    200|  1|    4|     1|2016-04-11 11:31:11|2016|
    | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
    | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
    +---+-------+---+-----+------+-------------------+----+

    2) Interval: only 2016-04-14.
    Target table result:
    +---+-------+---+-----+------+-------------------+----+
    |day|feature| id|month|run_id|          timestamp|year|
    +---+-------+---+-----+------+-------------------+----+
    | 11|    200|  1|    4|     1|2016-04-11 11:31:11|2016|
    | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
    | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
    | 14|    500|  1|    4|     2|2016-04-14 12:03:21|2016|
    +---+-------+---+-----+------+-------------------+----+

    3) Interval: only 2016-04-11.
    Target table result:
    +---+-------+---+-----+------+-------------------+----+
    |day|feature| id|month|run_id|          timestamp|year|
    +---+-------+---+-----+------+-------------------+----+
    | 11|    200|  1|    4|     3|2016-04-11 11:31:11|2016|
    | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
    | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
    | 14|    500|  1|    4|     2|2016-04-14 12:03:21|2016|
    +---+-------+---+-----+------+-------------------+----+
    """
    # arrange
    create_temp_view(dataframe=mocked_date_df, name="input_data")
    db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE")
    path = "test_folder/historical/entity/feature_set"

    spark_session.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
    spark_session.sql(f"create database if not exists {db}")
    spark_session.sql(
        f"create table if not exists {db}.feature_set_interval "
        f"(id int, timestamp timestamp, feature int, "
        f"run_id int, year int, month int, day int);"
    )

    dbconfig = MetastoreConfig()
    dbconfig.get_options = Mock(
        return_value={"mode": "overwrite", "format_": "parquet", "path": path}
    )

    historical_writer = HistoricalFeatureStoreWriter(
        db_config=dbconfig, interval_mode=True
    )

    first_run_hook = RunHook(id=1)
    second_run_hook = RunHook(id=2)
    third_run_hook = RunHook(id=3)

    (
        first_run_target_df,
        second_run_target_df,
        third_run_target_df,
    ) = pipeline_interval_run_target_dfs

    test_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id="id",
                    table="input_data",
                ).with_incremental_strategy(IncrementalStrategy("ts")),
            ],
            query="select * from id ",
        ),
        feature_set=FeatureSet(
            name="feature_set_interval",
            entity="entity",
            description="",
            keys=[
                KeyFeature(
                    name="id",
                    description="",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(from_column="ts"),
            features=[
                Feature(name="feature", description="", dtype=DataType.INTEGER),
                Feature(name="run_id", description="", dtype=DataType.INTEGER),
            ],
        ),
        sink=Sink([historical_writer]),
    )

    # act and assert
    dbconfig.get_path_with_partitions = Mock(
        return_value=[
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=11",
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=12",
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=13",
        ]
    )
    test_pipeline.feature_set.add_pre_hook(first_run_hook)
    test_pipeline.run(end_date="2016-04-13", start_date="2016-04-11")
    first_run_output_df = spark_session.read.parquet(path)
    assert_dataframe_equality(first_run_output_df, first_run_target_df)

    dbconfig.get_path_with_partitions = Mock(
        return_value=[
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=14",
        ]
    )
    test_pipeline.feature_set.add_pre_hook(second_run_hook)
    test_pipeline.run_for_date("2016-04-14")
    second_run_output_df = spark_session.read.parquet(path)
    assert_dataframe_equality(second_run_output_df, second_run_target_df)

    dbconfig.get_path_with_partitions = Mock(
        return_value=[
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=11",
        ]
    )
    test_pipeline.feature_set.add_pre_hook(third_run_hook)
    test_pipeline.run_for_date("2016-04-11")
    third_run_output_df = spark_session.read.parquet(path)
    assert_dataframe_equality(third_run_output_df, third_run_target_df)

    # tear down
    shutil.rmtree("test_folder")
def read_consistency_level(self, value: str) -> None:
    self.__read_consistency_level = value or environment.get_variable(
        "CASSANDRA_READ_CONSISTENCY_LEVEL", "LOCAL_ONE"
    )
def test_path(self, metastore_config):
    # expecting
    default = environment.get_variable("FEATURE_STORE_S3_BUCKET")
    assert metastore_config.path == default
def bucket(self, value: str):
    self.__bucket = value or environment.get_variable("FEATURE_STORE_S3_BUCKET")
def host(self, value: str) -> None:
    input_value = value or environment.get_variable("CASSANDRA_HOST")
    if input_value is None:
        raise ValueError("Config 'host' cannot be empty.")
    self.__host = input_value
def password(self, value: str) -> None:
    input_value = value or environment.get_variable("CASSANDRA_PASSWORD")
    if input_value is None:
        raise ValueError("Config 'password' cannot be empty.")
    self.__password = input_value
def username(self, value: str) -> None:
    input_value = value or environment.get_variable("CASSANDRA_USERNAME")
    if input_value is None:
        raise ValueError("Config 'username' cannot be empty.")
    self.__username = input_value
def test_feature_set_pipeline(
    self,
    mocked_df,
    spark_session,
    fixed_windows_output_feature_set_dataframe,
):
    # arrange
    table_reader_id = "a_source"
    table_reader_table = "table"
    table_reader_db = environment.get_variable(
        "FEATURE_STORE_HISTORICAL_DATABASE"
    )

    create_temp_view(dataframe=mocked_df, name=table_reader_id)
    create_db_and_table(
        spark=spark_session,
        table_reader_id=table_reader_id,
        table_reader_db=table_reader_db,
        table_reader_table=table_reader_table,
    )

    dbconfig = Mock()
    dbconfig.mode = "overwrite"
    dbconfig.format_ = "parquet"
    dbconfig.get_options = Mock(
        return_value={"path": "test_folder/historical/entity/feature_set"}
    )

    historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig)

    # act
    test_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id=table_reader_id,
                    database=table_reader_db,
                    table=table_reader_table,
                ),
            ],
            query=f"select * from {table_reader_id} ",  # noqa
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
                Feature(
                    name="divided_feature",
                    description="unit test",
                    dtype=DataType.FLOAT,
                    transformation=CustomTransform(
                        transformer=divide,
                        column1="feature1",
                        column2="feature2",
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(writers=[historical_writer]),
    )
    test_pipeline.run()

    # assert
    path = dbconfig.get_options("historical/entity/feature_set").get("path")
    df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN)

    target_df = fixed_windows_output_feature_set_dataframe.orderBy(
        test_pipeline.feature_set.timestamp_column
    )

    assert_dataframe_equality(df, target_df)

    # tear down
    shutil.rmtree("test_folder")
def keyspace(self, value: str) -> None:
    input_value = value or environment.get_variable("CASSANDRA_KEYSPACE")
    if not input_value:
        raise ValueError("Config 'keyspace' cannot be empty.")
    self.__keyspace = input_value
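The setter bodies throughout this section (host, username, password, keyspace, bucket, and the consistency levels) presumably sit under @property/@<name>.setter pairs on their config classes; that is what makes the `value or environment.get_variable(...)` fallback fire on plain attribute assignment. A minimal sketch of the pattern, using a hypothetical ExampleConfig class and EXAMPLE_HOST variable:

import os


class ExampleConfig:
    """Hypothetical config class illustrating the property-setter pattern."""

    def __init__(self, host: str = None) -> None:
        self.host = host  # plain assignment routes through the setter below

    @property
    def host(self) -> str:
        return self.__host

    @host.setter
    def host(self, value: str) -> None:
        # An explicit value wins; otherwise fall back to the environment.
        input_value = value or os.environ.get("EXAMPLE_HOST")
        if input_value is None:
            raise ValueError("Config 'host' cannot be empty.")
        self.__host = input_value


os.environ["EXAMPLE_HOST"] = "cassandra.internal"
print(ExampleConfig().host)  # -> "cassandra.internal"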