def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    with open(job_params.get_main_file_path()) as f:
        pyspark_script = f.read()

    # Stage the retrieval script on S3 so the EMR step can reference it.
    pyspark_script_path = urlunparse(
        get_staging_client("s3").upload_fileobj(
            BytesIO(pyspark_script.encode("utf8")),
            local_path="historical_retrieval.py",
            remote_path_prefix=self._staging_location,
            remote_path_suffix=".py",
        )
    )

    step = _historical_retrieval_step(
        pyspark_script_path,
        args=job_params.get_arguments(),
        output_file_uri=job_params.get_destination_path(),
    )

    # Submit the EMR step and wrap it so callers can track the result.
    job_ref = self._submit_emr_job(step)

    return EmrRetrievalJob(
        self._emr_client(),
        job_ref,
        job_params.get_destination_path(),
    )

def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    with open(job_params.get_main_file_path()) as f:
        pyspark_script = f.read()

    pyspark_script_path = _s3_upload(
        BytesIO(pyspark_script.encode("utf8")),
        local_path="historical_retrieval.py",
        remote_path_prefix=self._staging_location,
        remote_path_suffix=".py",
    )

    step = _historical_retrieval_step(
        pyspark_script_path, args=job_params.get_arguments()
    )

    job_ref = self._submit_emr_job(step)

    return EmrRetrievalJob(
        self._emr_client(),
        job_ref,
        os.path.join(job_params.get_destination_path(), _random_string(8)),
    )

def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    job, refresh_fn, cancel_fn = self.dataproc_submit(
        job_params, {"dev.feast.outputuri": job_params.get_destination_path()}
    )
    return DataprocRetrievalJob(
        job, refresh_fn, cancel_fn, job_params.get_destination_path()
    )

def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    job_id = str(uuid.uuid4())
    return StandaloneClusterRetrievalJob(
        job_id,
        job_params.get_name(),
        self.spark_submit(job_params),
        job_params.get_destination_path(),
    )

def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    job, refresh_fn, cancel_fn = self.dataproc_submit(
        job_params, {"dev.feast.outputuri": job_params.get_destination_path()}
    )
    return DataprocRetrievalJob(
        job=job,
        refresh_fn=refresh_fn,
        cancel_fn=cancel_fn,
        project=self.project_id,
        region=self.region,
        output_file_uri=job_params.get_destination_path(),
    )

def historical_feature_retrieval( self, job_params: RetrievalJobParameters) -> RetrievalJob: """ Submits a historical feature retrieval job to a Spark cluster. Raises: SparkJobFailure: The spark job submission failed, encountered error during execution, or timeout. Returns: RetrievalJob: wrapper around remote job that returns file uri to the result file. """ with open(job_params.get_main_file_path()) as f: pyspark_script = f.read() pyspark_script_path = urlunparse( self._get_staging_client().upload_fileobj( BytesIO(pyspark_script.encode("utf8")), local_path="historical_retrieval.py", remote_path_prefix=self._staging_location, remote_path_suffix=".py", )) job_id = _generate_job_id() resource = _prepare_job_resource( job_template=self._resource_template, job_id=job_id, job_type=HISTORICAL_RETRIEVAL_JOB_TYPE, main_application_file=pyspark_script_path, main_class=None, packages=[], jars=[], extra_metadata={ METADATA_OUTPUT_URI: job_params.get_destination_path() }, arguments=job_params.get_arguments(), namespace=self._namespace, ) job_info = _submit_job( api=self._api, resource=resource, namespace=self._namespace, ) return cast(RetrievalJob, self._job_from_job_info(job_info))
def start_historical_feature_retrieval_job( client: "Client", project: str, entity_source: Union[FileSource, BigQuerySource], feature_tables: List[FeatureTable], output_format: str, output_path: str, ) -> RetrievalJob: launcher = resolve_launcher(client._config) feature_sources = [ _source_to_argument( replace_bq_table_with_joined_view(feature_table, entity_source), client._config, ) for feature_table in feature_tables ] return launcher.historical_feature_retrieval( RetrievalJobParameters( entity_source=_source_to_argument(entity_source, client._config), feature_tables_sources=feature_sources, feature_tables=[ _feature_table_to_argument(client, project, feature_table) for feature_table in feature_tables ], destination={"format": output_format, "path": output_path}, ) )
def start_historical_feature_retrieval_job( client: "Client", project: str, entity_source: Union[FileSource, BigQuerySource], feature_tables: List[FeatureTable], output_format: str, output_path: str, ) -> RetrievalJob: launcher = resolve_launcher(client._config) feature_sources = [ _source_to_argument( replace_bq_table_with_joined_view(feature_table, entity_source), client._config, ) for feature_table in feature_tables ] extra_packages = [] if output_format == "tfrecord": extra_packages.append( "com.linkedin.sparktfrecord:spark-tfrecord_2.12:0.3.0") return launcher.historical_feature_retrieval( RetrievalJobParameters( entity_source=_source_to_argument(entity_source, client._config), feature_tables_sources=feature_sources, feature_tables=[ _feature_table_to_argument(client, project, feature_table) for feature_table in feature_tables ], destination={ "format": output_format, "path": output_path }, extra_packages=extra_packages, ))
def start_historical_feature_retrieval_job( client: "Client", entity_source: Union[FileSource, BigQuerySource], feature_tables: List[FeatureTable], output_format: str, output_path: str, ) -> RetrievalJob: launcher = resolve_launcher(client._config) return launcher.historical_feature_retrieval( RetrievalJobParameters( entity_source=_source_to_argument(entity_source), feature_tables_sources=[ _source_to_argument(feature_table.batch_source) for feature_table in feature_tables ], feature_tables=[ _feature_table_to_argument(client, feature_table) for feature_table in feature_tables ], destination={ "format": output_format, "path": output_path }, extra_options=client._config.get(CONFIG_SPARK_EXTRA_OPTIONS), ))
def new_retrieval_job_params(
    entity_source_uri: str,
    feature_source_uri: str,
    destination_uri: str,
    output_format: str,
) -> RetrievalJobParameters:
    entity_source = {
        "file": {
            "format": {"json_class": "ParquetFormat"},
            "path": entity_source_uri,
            "event_timestamp_column": "event_timestamp",
        }
    }

    feature_tables_sources = [
        {
            "file": {
                "format": {"json_class": "ParquetFormat"},
                "path": feature_source_uri,
                "event_timestamp_column": "event_timestamp",
                "created_timestamp_column": "created_timestamp",
            }
        }
    ]

    feature_tables = [
        {
            "name": "customer_transactions",
            "entities": [{"name": "customer", "type": "int64"}],
            "features": [{"name": "total_transactions", "type": "double"}],
        }
    ]

    destination = {"format": output_format, "path": destination_uri}

    return RetrievalJobParameters(
        feature_tables=feature_tables,
        feature_tables_sources=feature_tables_sources,
        entity_source=entity_source,
        destination=destination,
        extra_packages=["com.linkedin.sparktfrecord:spark-tfrecord_2.12:0.3.0"],
    )

def new_retrieval_job_params(
    entity_source_uri: str, feature_source_uri: str, destination_uri: str
) -> RetrievalJobParameters:
    entity_source = {
        "file": {
            "format": "parquet",
            "path": entity_source_uri,
            "event_timestamp_column": "event_timestamp",
        }
    }

    feature_tables_sources = [
        {
            "file": {
                "format": "parquet",
                "path": feature_source_uri,
                "event_timestamp_column": "event_timestamp",
                "created_timestamp_column": "created_timestamp",
            }
        }
    ]

    feature_tables = [
        {
            "name": "customer_transactions",
            "entities": [{"name": "customer", "type": "int32"}],
        }
    ]

    destination = {"format": "parquet", "path": destination_uri}

    return RetrievalJobParameters(
        feature_tables=feature_tables,
        feature_tables_sources=feature_tables_sources,
        entity_source=entity_source,
        destination=destination,
    )

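# Hedged usage sketch (assumed local parquet paths, and an existing `launcher`
# instance assumed to be one of the EMR / Dataproc / standalone launchers shown
# earlier): build the parameters and hand them to historical_feature_retrieval.
params = new_retrieval_job_params(
    entity_source_uri="file:///tmp/entities.parquet",
    feature_source_uri="file:///tmp/customer_transactions.parquet",
    destination_uri="file:///tmp/retrieval_output",
)
job = launcher.historical_feature_retrieval(params)
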
def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    operation = self.dataproc_submit(job_params)
    cancel_fn = partial(self.dataproc_cancel, operation.metadata.job_id)
    return DataprocRetrievalJob(
        operation, cancel_fn, job_params.get_destination_path()
    )

def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    return DataprocRetrievalJob(
        self.dataproc_submit(job_params), job_params.get_destination_path()
    )