def stage_dataframe(df, event_timestamp_column: str, config: Config) -> FileSource:
    """
    Helper function to upload a pandas dataframe in parquet format to a temporary
    location (under SPARK_STAGING_LOCATION) and return it wrapped in a FileSource.

    Args:
        df (pandas.DataFrame): the dataframe to upload.
        event_timestamp_column (str): the name of the timestamp column in the dataframe.
        config (Config): feast config.
    """
    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    with tempfile.NamedTemporaryFile() as f:
        df.to_parquet(f)

        file_url = urlunparse(
            get_staging_client(staging_uri.scheme, config).upload_fileobj(
                f,
                f.name,
                remote_path_prefix=os.path.join(staging_location, "dataframes"),
                remote_path_suffix=".parquet",
            )
        )

    return FileSource(
        event_timestamp_column=event_timestamp_column,
        file_format=ParquetFormat(),
        file_url=file_url,
    )
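# --- Illustrative usage sketch for stage_dataframe (not part of the module above) ---
# Assumes feast is installed and `config` is an existing feast Config whose
# SPARK_STAGING_LOCATION points at a writable location such as "s3://bucket/staging";
# the dataframe contents are made up for the example.
import pandas as pd

df = pd.DataFrame(
    {
        "customer_id": [1001, 1002],
        "event_timestamp": pd.to_datetime(["2021-05-01 10:00", "2021-05-01 11:00"]),
    }
)
staged_source = stage_dataframe(df, event_timestamp_column="event_timestamp", config=config)
# staged_source.file_url now points at a parquet file under
# <SPARK_STAGING_LOCATION>/dataframes/ and can be used wherever a FileSource is expected.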
def _k8s_launcher(config: Config) -> JobLauncher:
    from feast_spark.pyspark.launchers import k8s

    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    return k8s.KubernetesJobLauncher(
        namespace=config.get(opt.SPARK_K8S_NAMESPACE),
        generic_resource_template_path=config.get(opt.SPARK_K8S_JOB_TEMPLATE_PATH),
        batch_ingestion_resource_template_path=config.get(
            opt.SPARK_K8S_BATCH_INGESTION_TEMPLATE_PATH, None
        ),
        stream_ingestion_resource_template_path=config.get(
            opt.SPARK_K8S_STREAM_INGESTION_TEMPLATE_PATH, None
        ),
        historical_retrieval_resource_template_path=config.get(
            opt.SPARK_K8S_HISTORICAL_RETRIEVAL_TEMPLATE_PATH, None
        ),
        staging_location=staging_location,
        incluster=config.getboolean(opt.SPARK_K8S_USE_INCLUSTER_CONFIG),
        staging_client=get_staging_client(staging_uri.scheme, config),
        # azure-related arguments are None if not using Azure blob storage
        azure_account_name=config.get(opt.AZURE_BLOB_ACCOUNT_NAME, None),
        azure_account_key=config.get(opt.AZURE_BLOB_ACCOUNT_ACCESS_KEY, None),
    )
def stage_entities_to_fs(
    entity_source: pd.DataFrame, staging_location: str, config: Config
) -> FileSource:
    """
    Dumps the given (entities) dataframe as a parquet file and stages it to remote
    file storage (a subdirectory of staging_location).

    :return: FileSource with remote destination path
    """
    entity_staging_uri = urlparse(os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme, config)
    with tempfile.NamedTemporaryFile() as df_export_path:
        # prevent casting ns -> ms exception inside pyarrow
        entity_source["event_timestamp"] = entity_source["event_timestamp"].dt.floor("ms")

        entity_source.to_parquet(df_export_path.name)

        with open(df_export_path.name, "rb") as f:
            staging_client.upload_fileobj(
                f, df_export_path.name, remote_uri=entity_staging_uri
            )

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
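# --- Standalone illustration of the ns -> ms flooring above (pandas only) ---
# pyarrow raises when a nanosecond timestamp would be truncated while casting to
# millisecond precision, which is why the helper floors the column first; the
# value below is made up.
import pandas as pd

ts = pd.Series(pd.to_datetime(["2021-05-01 10:00:00.123456789"]))
print(ts.dt.floor("ms"))  # 2021-05-01 10:00:00.123000 -- sub-millisecond part dropped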
def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    with open(job_params.get_main_file_path()) as f:
        pyspark_script = f.read()

    pyspark_script_path = urlunparse(
        get_staging_client("s3").upload_fileobj(
            BytesIO(pyspark_script.encode("utf8")),
            local_path="historical_retrieval.py",
            remote_path_prefix=self._staging_location,
            remote_path_suffix=".py",
        )
    )

    step = _historical_retrieval_step(
        pyspark_script_path,
        args=job_params.get_arguments(),
        output_file_uri=job_params.get_destination_path(),
    )

    job_ref = self._submit_emr_job(step)

    return EmrRetrievalJob(
        self._emr_client(),
        job_ref,
        job_params.get_destination_path(),
    )
def _upload_to_file_source(file_url: str, with_partitions: bool, dest_path: str) -> None:
    """
    Uploads data into a FileSource. Currently supports GCS, S3 and Local FS.

    Args:
        file_url: file url of FileSource defined for FeatureTable
        with_partitions: whether to treat dest_path as dir with partitioned table
        dest_path: path to file or dir to be uploaded
    """
    from urllib.parse import urlparse

    uri = urlparse(file_url)
    staging_client = get_staging_client(uri.scheme)

    if with_partitions:
        for path in glob.glob(os.path.join(dest_path, "**/*")):
            file_name = path.split("/")[-1]
            partition_col = path.split("/")[-2]
            staging_client.upload_file(
                path,
                uri.hostname,
                str(uri.path).strip("/") + "/" + partition_col + "/" + file_name,
            )
    else:
        file_name = dest_path.split("/")[-1]
        staging_client.upload_file(
            dest_path,
            uri.hostname,
            str(uri.path).strip("/") + "/" + file_name,
        )
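# --- Hedged usage sketch for _upload_to_file_source (bucket and paths are assumptions) ---
# With a partitioned local export such as
#     /tmp/export/date=2021-05-01/part-0000.parquet
#     /tmp/export/date=2021-05-02/part-0000.parquet
# each file would be uploaded to <file_url>/<partition dir>/<file name>, e.g.
# gs://bucket/features/date=2021-05-01/part-0000.parquet.
_upload_to_file_source(
    file_url="gs://bucket/features",
    with_partitions=True,
    dest_path="/tmp/export",
)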
def _upload_jar(jar_s3_prefix: str, local_path: str) -> str:
    with open(local_path, "rb") as f:
        uri = urlparse(os.path.join(jar_s3_prefix, os.path.basename(local_path)))
        return urlunparse(
            get_staging_client(uri.scheme).upload_fileobj(
                f,
                local_path,
                remote_uri=uri,
            )
        )
def _stage_file(self, file_path: str, job_id: str) -> str: if not os.path.isfile(file_path): return file_path staging_client = get_staging_client("gs") blob_path = os.path.join(self.remote_path, job_id, os.path.basename(file_path),) staging_client.upload_file(file_path, self.staging_bucket, blob_path) return f"gs://{self.staging_bucket}/{blob_path}"
def _upload_jar(jar_s3_prefix: str, jar_path: str) -> str:
    if jar_path.startswith("https://"):
        return jar_path
    with open(jar_path, "rb") as f:
        uri = urlparse(os.path.join(jar_s3_prefix, os.path.basename(jar_path)))
        return urlunparse(
            get_staging_client(uri.scheme).upload_fileobj(f, jar_path, remote_uri=uri)
        )
def _stage_files(self, pyspark_script: str, job_id: str) -> str:
    staging_client = get_staging_client("gs")
    blob_path = os.path.join(
        self.remote_path, job_id, os.path.basename(pyspark_script),
    )
    staging_client.upload_file(pyspark_script, self.staging_bucket, blob_path)

    return f"gs://{self.staging_bucket}/{blob_path}"
def _download_jar(remote_jar: str) -> str:
    remote_jar_parts = urlparse(remote_jar)

    local_temp_jar = tempfile.NamedTemporaryFile(suffix=".jar", delete=False)
    with local_temp_jar:
        shutil.copyfileobj(
            get_staging_client(remote_jar_parts.scheme).download_file(remote_jar_parts),
            local_temp_jar,
        )

    return local_temp_jar.name
def _stage_file(self, file_path: str, job_id: str) -> str: if not os.path.isfile(file_path): return file_path staging_client = get_staging_client("gs") blob_path = os.path.join( self.remote_path, job_id, os.path.basename(file_path), ).lstrip("/") blob_uri_str = f"gs://{self.staging_bucket}/{blob_path}" with open(file_path, "rb") as f: staging_client.upload_fileobj( f, file_path, remote_uri=urlparse(blob_uri_str) ) return blob_uri_str
def _k8s_launcher(config: Config) -> JobLauncher:
    from feast.pyspark.launchers import k8s

    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    return k8s.KubernetesJobLauncher(
        namespace=config.get(opt.SPARK_K8S_NAMESPACE),
        resource_template_path=config.get(opt.SPARK_K8S_JOB_TEMPLATE_PATH, None),
        staging_location=staging_location,
        incluster=config.getboolean(opt.SPARK_K8S_USE_INCLUSTER_CONFIG),
        staging_client=get_staging_client(staging_uri.scheme, config),
        # azure-related arguments are None if not using Azure blob storage
        azure_account_name=config.get(opt.AZURE_BLOB_ACCOUNT_NAME),
        azure_account_key=config.get(opt.AZURE_BLOB_ACCOUNT_ACCESS_KEY),
    )
def _upload_to_file_source(
    file_url: str, with_partitions: bool, dest_path: str, config: Config
) -> None:
    """
    Uploads data into a FileSource. Currently supports GCS, S3 and Local FS.

    Args:
        file_url: file url of FileSource defined for FeatureTable
        with_partitions: whether to treat dest_path as dir with partitioned table
        dest_path: path to file or dir to be uploaded
        config: Config instance to configure FileSource
    """
    from urllib.parse import urlparse

    uri = urlparse(file_url)
    staging_client = get_staging_client(uri.scheme, config)

    if with_partitions:
        for path in glob.glob(os.path.join(dest_path, "**/*")):
            file_name = path.split("/")[-1]
            partition_col = path.split("/")[-2]
            with open(path, "rb") as f:
                staging_client.upload_fileobj(
                    f,
                    path,
                    remote_uri=uri._replace(
                        path=str(uri.path).rstrip("/")
                        + "/"
                        + partition_col
                        + "/"
                        + file_name
                    ),
                )
    else:
        file_name = dest_path.split("/")[-1]
        with open(dest_path, "rb") as f:
            staging_client.upload_fileobj(
                f,
                dest_path,
                remote_uri=uri._replace(
                    path=str(uri.path).rstrip("/") + "/" + file_name
                ),
            )
def stage_dataframe(self, df: pandas.DataFrame, event_timestamp: str) -> FileSource:
    with tempfile.NamedTemporaryFile() as f:
        df.to_parquet(f)

        file_url = urlunparse(
            get_staging_client("s3").upload_fileobj(
                f,
                f.name,
                remote_path_prefix=os.path.join(self._staging_location, "dataframes"),
                remote_path_suffix=".parquet",
            )
        )

    return FileSource(
        event_timestamp_column=event_timestamp,
        file_format=ParquetFormat(),
        file_url=file_url,
    )
def result(self, timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY])):
    """
    Wait until the job is done to get an iterable of result rows.
    In Feast 0.3, each row can only be an Avro row.

    Args:
        timeout_sec (int): Maximum number of seconds to wait until the job is done.
            If "timeout_sec" is exceeded, an exception will be raised.

    Returns:
        Iterable of Avro rows.
    """
    uris = self.get_avro_files(timeout_sec)
    for file_uri in uris:
        file_obj = get_staging_client(file_uri.scheme).download_file(file_uri)
        file_obj.seek(0)
        avro_reader = fastavro.reader(file_obj)

        for record in avro_reader:
            yield record
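# --- Sketch of consuming the generator returned by result() ---
# `job` stands for any retrieval job object exposing this method; the 300-second
# timeout is an arbitrary value chosen for the example.
for row in job.result(timeout_sec=300):
    # each `row` is a dict-like Avro record, e.g. {"customer_id": 1001, ...}
    print(row)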
def apply_validation(
    client: "Client",
    feature_table: "FeatureTable",
    udf: ValidationUDF,
    validation_window_secs: int,
    include_py_libs=_UNSET,
):
    """
    Uploads validation udf code to staging location & stores path to udf code
    and required python libraries as FeatureTable labels.
    """
    include_py_libs = (
        include_py_libs if include_py_libs is not _UNSET else GE_PACKED_ARCHIVE
    )

    staging_location = client._config.get(ConfigOptions.SPARK_STAGING_LOCATION).rstrip("/")
    staging_scheme = urlparse(staging_location).scheme
    staging_client = get_staging_client(staging_scheme, client._config)

    pickled_code_fp = io.BytesIO(udf.pickled_code)
    remote_path = f"{staging_location}/udfs/{feature_table.name}/{udf.name}.pickle"
    staging_client.upload_fileobj(
        pickled_code_fp, f"{udf.name}.pickle", remote_uri=urlparse(remote_path)
    )

    feature_table.labels.update(
        {
            "_validation": json.dumps(
                dict(
                    name=udf.name,
                    pickled_code_path=remote_path,
                    include_archive_path=include_py_libs,
                )
            ),
            "_streaming_trigger_secs": str(validation_window_secs),
        }
    )
    client.apply_feature_table(feature_table)
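# --- Roughly what the labels written by apply_validation look like afterwards ---
# Illustrative values only: the real pickled_code_path depends on
# SPARK_STAGING_LOCATION and the feature table / udf names, and the archive path
# defaults to GE_PACKED_ARCHIVE.
import json

example_labels = {
    "_validation": json.dumps(
        {
            "name": "my_udf",
            "pickled_code_path": "s3://bucket/staging/udfs/my_table/my_udf.pickle",
            "include_archive_path": "<GE_PACKED_ARCHIVE>",
        }
    ),
    "_streaming_trigger_secs": "60",
}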
def stage_entities_to_fs(entity_source: pd.DataFrame, staging_location: str) -> FileSource:
    """
    Dumps the given (entities) dataframe as a parquet file and stages it to remote
    file storage (a subdirectory of staging_location).

    :return: FileSource with remote destination path
    """
    entity_staging_uri = urlparse(os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme)
    with tempfile.NamedTemporaryFile() as df_export_path:
        entity_source.to_parquet(df_export_path.name)
        bucket = (
            None if entity_staging_uri.scheme == "file" else entity_staging_uri.netloc
        )
        staging_client.upload_file(
            df_export_path.name, bucket, entity_staging_uri.path.lstrip("/")
        )

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
def export_source_to_staging_location(
    source: Union[pd.DataFrame, str], staging_location_uri: str
) -> List[str]:
    """
    Uploads a DataFrame as an Avro file to a remote staging location.

    The local staging location specified in this function is used for E2E tests;
    please do not use it.

    Args:
        source (Union[pd.DataFrame, str]): Source of data to be staged. Can be a
            pandas DataFrame or a file path. Only the following source types are allowed:
            * Pandas DataFrame
            * Local Avro file
            * GCS Avro file
            * S3 Avro file
            * Azure Blob storage Avro file

        staging_location_uri (str): Remote staging location where the DataFrame
            should be written. Examples:
            * gs://bucket/path/
            * s3://bucket/path/
            * wasbs://bucket@account_name.blob.core.windows.net/path/
            * file:///data/subfolder/

    Returns:
        List[str]: Returns a list containing the full path to the file(s) in the
            remote staging location.
    """
    uri = urlparse(staging_location_uri)

    # Prepare Avro file to be exported to staging location
    if isinstance(source, pd.DataFrame):
        # Remote gs staging location provided by serving
        dir_path, file_name, source_path = export_dataframe_to_local(df=source)
    elif isinstance(source, str):
        source_uri = urlparse(source)
        if source_uri.scheme in ["", "file"]:
            # Local file provided as a source
            dir_path = ""
            file_name = os.path.basename(source)
            source_path = os.path.abspath(
                os.path.join(source_uri.netloc, source_uri.path)
            )
        else:
            # gs, s3, azure blob file provided as a source.
            assert source_uri.hostname is not None
            return get_staging_client(source_uri.scheme).list_files(uri=source_uri)
    else:
        raise Exception(
            f"Only string and DataFrame types are allowed as a "
            f"source, {type(source)} was provided."
        )

    # Push data to required staging location
    with open(source_path, "rb") as f:
        get_staging_client(uri.scheme).upload_fileobj(
            f,
            source_path,
            remote_uri=uri._replace(path=str(uri.path).strip("/") + "/" + file_name),
        )

    # Clean up, remove local staging file
    if dir_path and isinstance(source, pd.DataFrame) and len(dir_path) > 4:
        shutil.rmtree(dir_path)

    return [staging_location_uri.rstrip("/") + "/" + file_name]
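# --- Hedged usage sketch for export_source_to_staging_location ---
# The bucket name is an assumption; the returned list holds the full remote
# path(s) of the staged Avro file(s).
import pandas as pd

entities = pd.DataFrame({"customer_id": [1001, 1002]})
staged_files = export_source_to_staging_location(entities, "gs://bucket/staging/")
print(staged_files)  # e.g. ["gs://bucket/staging/<generated avro file name>"]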
def _get_staging_client(self):
    uri = urlparse(self._staging_location)
    return get_staging_client(uri.scheme)
def get_historical_features(
    self,
    feature_refs: List[str],
    entity_source: Union[pd.DataFrame, FileSource, BigQuerySource],
    project: str = None,
) -> RetrievalJob:
    """
    Launch a historical feature retrieval job.

    Args:
        feature_refs: List of feature references that will be returned for each entity.
            Each feature reference should have the following format:
            "feature_table:feature" where "feature_table" & "feature" refer to
            the feature and feature table names respectively.
        entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the
            entity rows. If entity_source is a Pandas DataFrame, the dataframe will be
            exported to the staging location as a parquet file. It is also assumed that
            the column event_timestamp is present in the dataframe, and is of type
            datetime without timezone information.

            The user needs to make sure that the source (or staging location, if
            entity_source is a Pandas DataFrame) is accessible from the Spark cluster
            that will be used for the retrieval job.
        project: Specifies the project that contains the feature tables which the
            requested features belong to.

    Returns:
        Returns a retrieval job object that can be used to monitor retrieval
        progress asynchronously, and can be used to materialize the results.

    Examples:
        >>> from feast import Client
        >>> from datetime import datetime
        >>> feast_client = Client(core_url="localhost:6565")
        >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
        >>> entity_source = FileSource("event_timestamp", "parquet", "gs://some-bucket/customer")
        >>> feature_retrieval_job = feast_client.get_historical_features(
        >>>     feature_refs, entity_source, project="my_project")
        >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
            "gs://some-bucket/output/"
    """
    feature_tables = self._get_feature_tables_from_feature_refs(feature_refs, project)
    output_location = os.path.join(
        self._config.get(CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_LOCATION),
        str(uuid.uuid4()),
    )
    output_format = self._config.get(CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_FORMAT)

    if isinstance(entity_source, pd.DataFrame):
        staging_location = self._config.get(CONFIG_SPARK_STAGING_LOCATION)
        entity_staging_uri = urlparse(
            os.path.join(staging_location, str(uuid.uuid4()))
        )
        staging_client = get_staging_client(entity_staging_uri.scheme)
        with tempfile.NamedTemporaryFile() as df_export_path:
            entity_source.to_parquet(df_export_path.name)
            bucket = (
                None
                if entity_staging_uri.scheme == "file"
                else entity_staging_uri.netloc
            )
            staging_client.upload_file(
                df_export_path.name, bucket, entity_staging_uri.path.lstrip("/")
            )
            entity_source = FileSource(
                "event_timestamp",
                "created_timestamp",
                ParquetFormat(),
                entity_staging_uri.geturl(),
            )

    return start_historical_feature_retrieval_job(
        self,
        entity_source,
        feature_tables,
        output_format,
        os.path.join(output_location, str(uuid.uuid4())),
    )