def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    """
    Submits a historical feature retrieval job to a Spark cluster.

    Raises:
        SparkJobFailure: The spark job submission failed, encountered error
            during execution, or timeout.

    Returns:
        RetrievalJob: wrapper around remote job that returns file uri
            to the result file.
    """
    with open(job_params.get_main_file_path()) as f:
        pyspark_script = f.read()

    # Stage the retrieval script to S3 so the EMR step can reference it by URI.
    pyspark_script_path = urlunparse(
        get_staging_client("s3").upload_fileobj(
            BytesIO(pyspark_script.encode("utf8")),
            local_path="historical_retrieval.py",
            remote_path_prefix=self._staging_location,
            remote_path_suffix=".py",
        )
    )

    step = _historical_retrieval_step(
        pyspark_script_path,
        args=job_params.get_arguments(),
        output_file_uri=job_params.get_destination_path(),
        packages=job_params.get_extra_packages(),
    )

    job_ref = self._submit_emr_job(step)

    return EmrRetrievalJob(
        self._emr_client(),
        job_ref,
        job_params.get_destination_path(),
    )
def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    """
    Submits a historical feature retrieval job to a Spark cluster.

    Raises:
        SparkJobFailure: The spark job submission failed, encountered error
            during execution, or timeout.

    Returns:
        RetrievalJob: wrapper around remote job that returns file uri to the result file.
    """
    # Stage the main application file so the remote cluster can access it.
    staged_main_file = self._datalake.upload_file(job_params.get_main_file_path())

    # Tags let the job be identified as a historical retrieval and carry the
    # output location for later lookup.
    job_tags = {
        LABEL_JOBTYPE: HISTORICAL_RETRIEVAL_JOB_TYPE,
        METADATA_OUTPUT_URI: job_params.get_destination_path(),
    }

    submitted = _submit_job(
        self._api,
        "Historical-Retrieval",
        staged_main_file,
        arguments=job_params.get_arguments(),
        tags=job_tags,
    )
    return cast(RetrievalJob, self._job_from_job_info(submitted))
def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    """
    Submits a historical feature retrieval job to a Spark cluster.

    Raises:
        SparkJobFailure: The spark job submission failed, encountered error
            during execution, or timeout.

    Returns:
        RetrievalJob: wrapper around remote job that returns file uri
            to the result file.
    """
    job_id = str(uuid.uuid4())
    job = StandaloneClusterRetrievalJob(
        job_id,
        job_params.get_name(),
        self.spark_submit(job_params),
        job_params.get_destination_path(),
    )
    # Register the job in the global cache so it can be found later
    # (e.g. for listing or cancellation).
    global_job_cache.add_job(job)
    return job
def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    """
    Submits a historical feature retrieval job to a Spark cluster.

    Raises:
        SparkJobFailure: The spark job submission failed, encountered error
            during execution, or timeout.

    Returns:
        RetrievalJob: wrapper around remote job that returns file uri
            to the result file.
    """
    # The output URI is attached as a Dataproc job label so it can be
    # recovered from the job metadata later.
    job, refresh_fn, cancel_fn = self.dataproc_submit(
        job_params, {"dev.feast.outputuri": job_params.get_destination_path()}
    )
    return DataprocRetrievalJob(
        job=job,
        refresh_fn=refresh_fn,
        cancel_fn=cancel_fn,
        project=self.project_id,
        region=self.region,
        output_file_uri=job_params.get_destination_path(),
    )
def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    """
    Submits a historical feature retrieval job to a Spark cluster.

    Raises:
        SparkJobFailure: The spark job submission failed, encountered error
            during execution, or timeout.

    Returns:
        RetrievalJob: wrapper around remote job that returns file uri to the result file.
    """
    with open(job_params.get_main_file_path()) as script_file:
        script_source = script_file.read()

    # Upload the retrieval script to the staging location; the submitted job
    # references it by the returned URI.
    staged_script_uri = urlunparse(
        self._staging_client.upload_fileobj(
            BytesIO(script_source.encode("utf8")),
            local_path="historical_retrieval.py",
            remote_path_prefix=self._staging_location,
            remote_path_suffix=".py",
        )
    )

    job_id = _generate_job_id()
    resource = _prepare_job_resource(
        job_template=self._historical_retrieval_template,
        job_id=job_id,
        job_type=HISTORICAL_RETRIEVAL_JOB_TYPE,
        main_application_file=staged_script_uri,
        main_class=None,
        packages=[],
        jars=[],
        extra_metadata={METADATA_OUTPUT_URI: job_params.get_destination_path()},
        azure_credentials=self._get_azure_credentials(),
        arguments=job_params.get_arguments(),
        namespace=self._namespace,
    )
    job_info = _submit_job(
        api=self._api,
        resource=resource,
        namespace=self._namespace,
    )
    return cast(RetrievalJob, self._job_from_job_info(job_info))
def start_historical_feature_retrieval_job(
    client: "Client",
    project: str,
    entity_source: Union[FileSource, BigQuerySource],
    feature_tables: List[FeatureTable],
    output_format: str,
    output_path: str,
) -> RetrievalJob:
    """
    Launch a historical feature retrieval job via the launcher resolved
    from the client configuration, writing results to ``output_path`` in
    ``output_format``.
    """
    launcher = resolve_launcher(client.config)

    # Each feature table is first passed through
    # replace_bq_table_with_joined_view — presumably to join BigQuery-backed
    # tables against the entity source — before conversion to a launcher arg.
    feature_sources = [
        _source_to_argument(
            replace_bq_table_with_joined_view(table, entity_source),
            client.config,
        )
        for table in feature_tables
    ]

    # TFRecord output needs the spark-tfrecord connector on the classpath.
    extra_packages = (
        ["com.linkedin.sparktfrecord:spark-tfrecord_2.12:0.3.0"]
        if output_format == "tfrecord"
        else []
    )

    return launcher.historical_feature_retrieval(
        RetrievalJobParameters(
            entity_source=_source_to_argument(entity_source, client.config),
            feature_tables_sources=feature_sources,
            feature_tables=[
                _feature_table_to_argument(client, project, table)
                for table in feature_tables
            ],
            destination={"format": output_format, "path": output_path},
            extra_packages=extra_packages,
        )
    )