def _synapse_launcher(config: Config) -> JobLauncher:
    from feast_spark.pyspark.launchers import synapse

    return synapse.SynapseJobLauncher(
        synapse_dev_url=config.get(opt.AZURE_SYNAPSE_DEV_URL),
        pool_name=config.get(opt.AZURE_SYNAPSE_POOL_NAME),
        datalake_dir=config.get(opt.AZURE_SYNAPSE_DATALAKE_DIR),
        executor_size=config.get(opt.AZURE_SYNAPSE_EXECUTOR_SIZE),
        executors=int(config.get(opt.AZURE_SYNAPSE_EXECUTORS)),
    )
def _k8s_launcher(config: Config) -> JobLauncher:
    from feast_spark.pyspark.launchers import k8s

    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    return k8s.KubernetesJobLauncher(
        namespace=config.get(opt.SPARK_K8S_NAMESPACE),
        generic_resource_template_path=config.get(opt.SPARK_K8S_JOB_TEMPLATE_PATH),
        batch_ingestion_resource_template_path=config.get(
            opt.SPARK_K8S_BATCH_INGESTION_TEMPLATE_PATH, None
        ),
        stream_ingestion_resource_template_path=config.get(
            opt.SPARK_K8S_STREAM_INGESTION_TEMPLATE_PATH, None
        ),
        historical_retrieval_resource_template_path=config.get(
            opt.SPARK_K8S_HISTORICAL_RETRIEVAL_TEMPLATE_PATH, None
        ),
        staging_location=staging_location,
        incluster=config.getboolean(opt.SPARK_K8S_USE_INCLUSTER_CONFIG),
        staging_client=get_staging_client(staging_uri.scheme, config),
        # Azure-related arguments are None if not using Azure Blob Storage
        azure_account_name=config.get(opt.AZURE_BLOB_ACCOUNT_NAME, None),
        azure_account_key=config.get(opt.AZURE_BLOB_ACCOUNT_ACCESS_KEY, None),
    )
def test_init_options_precedence(self):
    """
    Init options > env var > file options > default options
    """
    fd, path = mkstemp()
    os.environ["FEAST_CORE_URL"] = "env"
    options = {"core_url": "init", "serving_url": "init"}
    configuration_string = "[general]\nCORE_URL = file\n"
    with open(fd, "w") as f:
        f.write(configuration_string)
    config = Config(options, path)

    assert config.get("core_url") == "init"
    del os.environ["FEAST_CORE_URL"]
def test_env_var_precedence(self):
    """
    Env vars > file options > default options
    """
    fd, path = mkstemp()
    os.environ["FEAST_CORE_URL"] = "env"
    configuration_string = "[general]\nCORE_URL = file\n"
    with open(fd, "w") as f:
        f.write(configuration_string)
    config = Config(path=path)

    assert config.get("CORE_URL") == "env"
    del os.environ["FEAST_CORE_URL"]
def test_type_casting(self):
    """
    Test type casting of strings to other types
    """
    fd, path = mkstemp()
    os.environ["FEAST_INT_VAR"] = "1"
    os.environ["FEAST_FLOAT_VAR"] = "1.0"
    os.environ["FEAST_BOOLEAN_VAR"] = "True"
    config = Config(path=path)

    assert config.getint("INT_VAR") == 1
    assert config.getfloat("FLOAT_VAR") == 1.0
    assert config.getboolean("BOOLEAN_VAR") is True
def _emr_launcher(config: Config) -> JobLauncher:
    from feast.pyspark.launchers import aws

    def _get_optional(option):
        if config.exists(option):
            return config.get(option)
        return None

    return aws.EmrClusterLauncher(
        region=config.get(opt.EMR_REGION),
        existing_cluster_id=_get_optional(opt.EMR_CLUSTER_ID),
        new_cluster_template_path=_get_optional(opt.EMR_CLUSTER_TEMPLATE_PATH),
        staging_location=config.get(opt.SPARK_STAGING_LOCATION),
        emr_log_location=config.get(opt.EMR_LOG_LOCATION),
    )
def config_google():
    config_dict = {
        "core_url": "localhost:50051",
        "enable_auth": True,
        "auth_provider": "google",
    }
    return Config(config_dict)
def stage_dataframe(df, event_timestamp_column: str, config: Config) -> FileSource:
    """
    Helper function to upload a pandas dataframe in parquet format to a
    temporary location (under SPARK_STAGING_LOCATION) and return it wrapped
    in a FileSource.

    Args:
        event_timestamp_column(str): the name of the timestamp column in the dataframe.
        config(Config): feast config.
    """
    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    with tempfile.NamedTemporaryFile() as f:
        df.to_parquet(f)
        file_url = urlunparse(
            get_staging_client(staging_uri.scheme, config).upload_fileobj(
                f,
                f.name,
                remote_path_prefix=os.path.join(staging_location, "dataframes"),
                remote_path_suffix=".parquet",
            )
        )

    return FileSource(
        event_timestamp_column=event_timestamp_column,
        file_format=ParquetFormat(),
        file_url=file_url,
    )
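# Hypothetical usage sketch for stage_dataframe: the staging location and the
# "spark_staging_location" config key spelling are assumptions, not guaranteed
# by this snippet.
import pandas as pd

staging_config = Config({"spark_staging_location": "file:///tmp/staging"})
df = pd.DataFrame({
    "driver_id": [1001, 1002],
    "event_timestamp": [pd.Timestamp.now(), pd.Timestamp.now()],
})
# Returns a FileSource pointing at the uploaded parquet file.
source = stage_dataframe(df, "event_timestamp", staging_config)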
def __init__(self, config: Config):
    """
    Initializes a GoogleOpenIDAuthMetadataPlugin, used to sign gRPC requests

    Args:
        config: Feast Configuration object
    """
    super(GoogleOpenIDAuthMetadataPlugin, self).__init__()

    self._static_token = None
    self._token = None

    # If provided, set a static token
    if config.exists(opt.AUTH_TOKEN):
        self._static_token = config.get(opt.AUTH_TOKEN)

    self._request = RequestWithTimeout(timeout=5)
    self._refresh_token()
def __init__(self, config: Config):
    """
    Initializes an OAuthMetadataPlugin, used to sign gRPC requests

    Args:
        config: Feast Configuration object
    """
    super(OAuthMetadataPlugin, self).__init__()

    self._static_token = None
    self._token = None

    # If provided, set a static token
    if config.exists(CONFIG_CORE_ENABLE_AUTH_TOKEN_KEY):
        self._static_token = config.get(CONFIG_CORE_ENABLE_AUTH_TOKEN_KEY)
        self._refresh_token(config)
    elif (
        config.exists(CONFIG_OAUTH_GRANT_TYPE_KEY)
        and config.exists(CONFIG_OAUTH_CLIENT_ID_KEY)
        and config.exists(CONFIG_OAUTH_CLIENT_SECRET_KEY)
        and config.exists(CONFIG_OAUTH_AUDIENCE_KEY)
        and config.exists(CONFIG_OAUTH_TOKEN_REQUEST_URL_KEY)
    ):
        self._refresh_token(config)
    else:
        raise RuntimeError(
            "Please ensure that the necessary parameters are passed to the "
            "client: oauth_grant_type, oauth_client_id, oauth_client_secret, "
            "oauth_audience, oauth_token_request_url."
        )
def _source_to_argument(source: DataSource, config: Config):
    common_properties = {
        "field_mapping": dict(source.field_mapping),
        "event_timestamp_column": source.event_timestamp_column,
        "created_timestamp_column": source.created_timestamp_column,
        "date_partition_column": source.date_partition_column,
    }

    properties = {**common_properties}

    if isinstance(source, FileSource):
        properties["path"] = source.file_options.file_url
        properties["format"] = dict(
            json_class=source.file_options.file_format.__class__.__name__
        )
        return {"file": properties}

    if isinstance(source, BigQuerySource):
        project, dataset_and_table = source.bigquery_options.table_ref.split(":")
        dataset, table = dataset_and_table.split(".")
        properties["project"] = project
        properties["dataset"] = dataset
        properties["table"] = table
        if config.exists(opt.SPARK_BQ_MATERIALIZATION_PROJECT) and config.exists(
            opt.SPARK_BQ_MATERIALIZATION_DATASET
        ):
            properties["materialization"] = dict(
                project=config.get(opt.SPARK_BQ_MATERIALIZATION_PROJECT),
                dataset=config.get(opt.SPARK_BQ_MATERIALIZATION_DATASET),
            )
        return {"bq": properties}

    if isinstance(source, KafkaSource):
        properties["bootstrap_servers"] = source.kafka_options.bootstrap_servers
        properties["topic"] = source.kafka_options.topic
        properties["format"] = {
            **source.kafka_options.message_format.__dict__,
            "json_class": source.kafka_options.message_format.__class__.__name__,
        }
        return {"kafka": properties}

    raise NotImplementedError(f"Unsupported Datasource: {type(source)}")
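# Illustrative sketch: the shape _source_to_argument returns for a FileSource.
# The file URL is a placeholder, and the exact default values of the common
# properties are assumptions.
file_source = FileSource(
    event_timestamp_column="event_timestamp",
    file_format=ParquetFormat(),
    file_url="file:///tmp/staging/df.parquet",
)
# Roughly: {"file": {"path": "file:///tmp/staging/df.parquet",
#                    "format": {"json_class": "ParquetFormat"}, ...}}
file_args = _source_to_argument(file_source, Config())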
def __init__(self, config: Config):
    """
    Initializes a GoogleOpenIDAuthMetadataPlugin, used to sign gRPC requests

    Args:
        config: Feast Configuration object
    """
    super(GoogleOpenIDAuthMetadataPlugin, self).__init__()
    from google.auth.transport import requests

    self._static_token = None
    self._token = None

    # If provided, set a static token
    if config.exists(CONFIG_CORE_ENABLE_AUTH_TOKEN_KEY):
        self._static_token = config.get(CONFIG_CORE_ENABLE_AUTH_TOKEN_KEY)

    self._request = requests.Request()
    self._refresh_token()
def config_list():
    """
    List Feast properties for the currently active configuration
    """
    try:
        print(Config())
    except Exception as e:
        _logger.error("Error occurred when reading Feast configuration file")
        _logger.exception(e)
        sys.exit(1)
def config_with_missing_variable():
    # "oauth_audience" is intentionally omitted to exercise the
    # missing-variable error path.
    config_dict = {
        "core_url": "localhost:50051",
        "core_enable_auth": True,
        "core_auth_provider": "oauth",
        "oauth_grant_type": "client_credentials",
        "oauth_client_id": "fakeID",
        "oauth_client_secret": "fakeSecret",
        "oauth_token_request_url": AUTH_URL,
    }
    return Config(config_dict)
def __init__(self, options=None, **kwargs):
    """
    JobControllerClient should be initialized with jobcontroller_url:
    Feast JobController address

    :param options: Configuration options to initialize client with
    :param kwargs: options in kwargs style
    """
    if options is None:
        options = dict()
    self._config = Config(options={**options, **kwargs})

    self._jobcontroller_service_stub: Optional[JobControllerServiceStub] = None
    self._auth_metadata: Optional[grpc.AuthMetadataPlugin] = None

    # Configure Auth Metadata Plugin if auth is enabled
    if self._config.getboolean(CONFIG_ENABLE_AUTH_KEY):
        self._auth_metadata = feast_auth.get_auth_metadata_plugin(self._config)
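# Hypothetical usage sketch: "jobcontroller_url" is the option named in the
# docstring above; the address is a placeholder.
jc_client = JobControllerClient(jobcontroller_url="localhost:6570")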
def _parse_additional_spark_options(config: Config) -> Dict[str, str]:
    options_string = config.get(opt.SPARK_ADDITIONAL_OPTS, None)
    if options_string is None:
        return {}

    try:
        return dict(
            _quoted_split(opt_val, "=")
            for opt_val in _quoted_split(options_string, ";")
        )
    except ValueError:
        raise ValueError(
            f"Cannot parse {opt.SPARK_ADDITIONAL_OPTS}: {options_string}"
        )
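# Illustrative sketch: _parse_additional_spark_options expects a
# semicolon-separated list of "key=value" pairs. The "spark_additional_opts"
# key is an assumption about how opt.SPARK_ADDITIONAL_OPTS is spelled.
_opts_config = Config(
    {"spark_additional_opts": "spark.executor.memory=2g;spark.driver.memory=1g"}
)
# Yields {"spark.executor.memory": "2g", "spark.driver.memory": "1g"}
_spark_opts = _parse_additional_spark_options(_opts_config)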
def get_auth_metadata_plugin(config: Config) -> grpc.AuthMetadataPlugin:
    """
    Get an Authentication Metadata Plugin. This plugin is used in gRPC to
    sign requests. Please see the following URL for more details
    https://grpc.github.io/grpc/python/_modules/grpc.html#AuthMetadataPlugin

    New plugins can be added to this function. For the time being we support
    Google Open ID and OAuth authentication.

    Returns:
        Returns an implementation of grpc.AuthMetadataPlugin

    Args:
        config: Feast Configuration object
    """
    if AuthProvider(config.get(opt.AUTH_PROVIDER)) == AuthProvider.GOOGLE:
        return GoogleOpenIDAuthMetadataPlugin(config)
    elif AuthProvider(config.get(opt.AUTH_PROVIDER)) == AuthProvider.OAUTH:
        return OAuthMetadataPlugin(config)
    else:
        raise RuntimeError(
            "Could not determine auth provider. "
            'Must be set to either "google" or "oauth".'
        )
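# Hypothetical usage sketch: wiring the plugin into gRPC channel credentials.
# grpc.metadata_call_credentials and grpc.composite_channel_credentials are
# standard grpc APIs; the target address is a placeholder, and constructing
# the Google plugin assumes Google application default credentials exist.
auth_config = Config({"enable_auth": True, "auth_provider": "google"})
plugin = get_auth_metadata_plugin(auth_config)
channel_creds = grpc.composite_channel_credentials(
    grpc.ssl_channel_credentials(), grpc.metadata_call_credentials(plugin)
)
channel = grpc.secure_channel("localhost:50051", channel_creds)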
def config_google():
    config_dict = {
        "core_url": "localhost:50051",
        "core_enable_auth": True,
        "core_auth_provider": "google",
        "oauth_grant_type": "client_credentials",
        "oauth_client_id": "fakeID",
        "oauth_client_secret": "fakeSecret",
        "oauth_audience": AUDIENCE,
        "oauth_token_request_url": AUTH_URL,
    }
    return Config(config_dict)
def _dataproc_launcher(config: Config) -> JobLauncher:
    from feast.pyspark.launchers import gcloud

    return gcloud.DataprocClusterLauncher(
        cluster_name=config.get(opt.DATAPROC_CLUSTER_NAME),
        staging_location=config.get(opt.SPARK_STAGING_LOCATION),
        region=config.get(opt.DATAPROC_REGION),
        project_id=config.get(opt.DATAPROC_PROJECT),
        executor_instances=config.get(opt.DATAPROC_EXECUTOR_INSTANCES),
        executor_cores=config.get(opt.DATAPROC_EXECUTOR_CORES),
        executor_memory=config.get(opt.DATAPROC_EXECUTOR_MEMORY),
    )
def get_hash(self) -> str:
    source = _source_to_argument(self._feature_table.stream_source, Config())
    feature_table = _feature_table_to_argument(
        None, "default", self._feature_table
    )  # type: ignore
    job_json = json.dumps(
        {"source": source, "feature_table": feature_table}, sort_keys=True
    )
    return hashlib.md5(job_json.encode()).hexdigest()
def __init__(self, options: Optional[Dict[str, str]] = None, **kwargs):
    """
    The Feast Client should be initialized with at least one service url

    Args:
        core_url: Feast Core URL. Used to manage features
        serving_url: Feast Serving URL. Used to retrieve features
        project: Sets the active project. This field is optional.
        core_secure: Use client-side SSL/TLS for Core gRPC API
        serving_secure: Use client-side SSL/TLS for Serving gRPC API
        options: Configuration options to initialize client with
        **kwargs: Additional keyword arguments that will be used as
            configuration options along with "options"
    """
    if options is None:
        options = dict()
    self._config = Config(options={**options, **kwargs})

    self.__core_channel: Optional[grpc.Channel] = None
    self.__serving_channel: Optional[grpc.Channel] = None
    self._core_service_stub: Optional[CoreServiceStub] = None
    self._serving_service_stub: Optional[ServingServiceStub] = None
def stop_stream_to_online(feature_table: str):
    """
    Stop stream to online sync job
    """
    spark_launcher = Config().get(CONFIG_SPARK_LAUNCHER)

    if spark_launcher == "emr":
        import feast.pyspark.aws.jobs

        feast.pyspark.aws.jobs.stop_stream_to_online(feature_table)
    else:
        raise NotImplementedError(
            f"Feast currently does not provide support for the specified spark launcher: {spark_launcher}"
        )
def __init__(self, options: Optional[Dict[str, str]] = None, **kwargs):
    """
    The Feast Client should be initialized with at least one service url.
    Please see constants.py for configuration options. Commonly used options
    or arguments include:
        core_url: Feast Core URL. Used to manage features
        serving_url: Feast Serving URL. Used to retrieve features
        project: Sets the active project. This field is optional.
        core_secure: Use client-side SSL/TLS for Core gRPC API
        serving_secure: Use client-side SSL/TLS for Serving gRPC API
        enable_auth: Enable authentication and authorization
        auth_provider: Authentication provider - "google" or "oauth"
        if auth_provider is "oauth", the following fields are mandatory -
        oauth_grant_type, oauth_client_id, oauth_client_secret,
        oauth_audience, oauth_token_request_url

    Args:
        options: Configuration options to initialize client with
        **kwargs: Additional keyword arguments that will be used as
            configuration options along with "options"
    """
    if options is None:
        options = dict()
    self._config = Config(options={**options, **kwargs})

    self._core_service_stub: Optional[CoreServiceStub] = None
    self._serving_service_stub: Optional[ServingServiceStub] = None
    self._auth_metadata: Optional[grpc.AuthMetadataPlugin] = None
    self._registry_impl: Optional[Registry] = None

    # Configure Auth Metadata Plugin if auth is enabled
    if self._config.getboolean(opt.ENABLE_AUTH):
        self._auth_metadata = feast_auth.get_auth_metadata_plugin(self._config)

    self._configure_telemetry()
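# Hypothetical usage sketch: the URLs are placeholders, and the option names
# come from the docstring above. Client here refers to the Feast Client whose
# __init__ is shown above.
client = Client(
    core_url="localhost:50051",
    serving_url="localhost:50052",
    project="default",
)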
def test_defaults_are_not_written(self):
    """
    Default values are not written to config file
    """
    fd, path = mkstemp()
    config = Config(path=path)
    config.set("option", "value")
    config.save()

    with open(path) as f:
        assert f.read() == "[general]\noption = value\n\n"
def test_exists(self):
    """
    Test whether config options are reported as existing or missing
    """
    fd, path = mkstemp()
    config = Config(path=path)
    config.set("my_val_exist", 1)

    assert config.exists("my_val_exist") is True
    assert config.exists("my_val_not_exist") is False
def config_set(prop, value):
    """
    Set a Feast property for the currently active configuration
    """
    try:
        conf = Config()
        conf.set(option=prop.strip(), value=value.strip())
        conf.save()
    except Exception as e:
        _logger.error("Error in reading config file")
        _logger.exception(e)
        sys.exit(1)
def list_jobs():
    """
    List jobs
    """
    from tabulate import tabulate

    spark_launcher = Config().get(CONFIG_SPARK_LAUNCHER)

    if spark_launcher == "emr":
        import feast.pyspark.aws.jobs

        jobs = feast.pyspark.aws.jobs.list_jobs(None, None)
        print(
            tabulate(
                jobs, headers=feast.pyspark.aws.jobs.JobInfo._fields, tablefmt="plain"
            )
        )
    else:
        raise NotImplementedError(
            f"Feast currently does not provide support for the specified spark launcher: {spark_launcher}"
        )
def _k8s_launcher(config: Config) -> JobLauncher:
    from feast.pyspark.launchers import k8s

    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    return k8s.KubernetesJobLauncher(
        namespace=config.get(opt.SPARK_K8S_NAMESPACE),
        resource_template_path=config.get(opt.SPARK_K8S_JOB_TEMPLATE_PATH, None),
        staging_location=staging_location,
        incluster=config.getboolean(opt.SPARK_K8S_USE_INCLUSTER_CONFIG),
        staging_client=get_staging_client(staging_uri.scheme, config),
        # Azure-related arguments are None if not using Azure Blob Storage
        azure_account_name=config.get(opt.AZURE_BLOB_ACCOUNT_NAME),
        azure_account_key=config.get(opt.AZURE_BLOB_ACCOUNT_ACCESS_KEY),
    )
def resolve_launcher(config: Config) -> JobLauncher:
    return _launchers[config.get(opt.SPARK_LAUNCHER)](config)
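# A minimal sketch of the _launchers dispatch table that resolve_launcher
# assumes; the key strings are assumptions, while the factories are the ones
# defined in this section.
_launchers = {
    "dataproc": _dataproc_launcher,
    "emr": _emr_launcher,
    "standalone": _standalone_launcher,
    "k8s": _k8s_launcher,
}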
def _standalone_launcher(config: Config) -> JobLauncher:
    from feast.pyspark.launchers import standalone

    return standalone.StandaloneClusterLauncher(
        config.get(opt.SPARK_STANDALONE_MASTER),
        config.get(opt.SPARK_HOME),
    )