def start_stream_to_online_ingestion(
    self, ingestion_job_params: StreamIngestionJobParameters
) -> StreamIngestionJob:
    """
    Starts a stream ingestion job on a Spark cluster.

    Raises:
        SparkJobFailure: The Spark job submission failed, encountered an error
            during execution, or timed out.

    Returns:
        StreamIngestionJob: wrapper around the remote job.
    """
    jar_s3_path = self._upload_jar(ingestion_job_params.get_main_file_path())

    extra_jar_paths: List[str] = []
    for extra_jar in ingestion_job_params.get_extra_jar_paths():
        extra_jar_paths.append(self._upload_jar(extra_jar))

    job_hash = ingestion_job_params.get_job_hash()
    job_id = _generate_job_id()

    resource = _prepare_job_resource(
        job_template=self._stream_ingestion_template,
        job_id=job_id,
        job_type=STREAM_TO_ONLINE_JOB_TYPE,
        main_application_file=jar_s3_path,
        main_class=ingestion_job_params.get_class_name(),
        packages=[],
        jars=extra_jar_paths,
        extra_metadata={METADATA_JOBHASH: job_hash},
        azure_credentials=self._get_azure_credentials(),
        arguments=ingestion_job_params.get_arguments(),
        namespace=self._namespace,
        extra_labels={
            LABEL_FEATURE_TABLE: _truncate_label(
                ingestion_job_params.get_feature_table_name()
            ),
            LABEL_FEATURE_TABLE_HASH: _generate_project_table_hash(
                ingestion_job_params.get_project(),
                ingestion_job_params.get_feature_table_name(),
            ),
            LABEL_PROJECT: ingestion_job_params.get_project(),
        },
    )

    job_info = _submit_job(
        api=self._api,
        resource=resource,
        namespace=self._namespace,
    )

    return cast(StreamIngestionJob, self._job_from_job_info(job_info))
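# A minimal usage sketch for the launcher method above (illustrative only:
# `launcher`, `params`, and the SparkJobStatus polling assume a launcher
# instance and job parameters constructed elsewhere with valid credentials):
#
#     params = StreamIngestionJobParameters(...)  # main jar, class, arguments
#     job = launcher.start_stream_to_online_ingestion(params)
#     while job.get_status() == SparkJobStatus.STARTING:
#         time.sleep(5)  # wait for the driver to come up
#     print(job.get_id(), job.get_status())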
def start_stream_to_online_ingestion(
    self, ingestion_job_params: StreamIngestionJobParameters
) -> StreamIngestionJob:
    """
    Starts a stream ingestion job on a Spark cluster.

    Raises:
        SparkJobFailure: The Spark job submission failed, encountered an error
            during execution, or timed out.

    Returns:
        StreamIngestionJob: wrapper around the remote job.
    """
    main_file = self._datalake.upload_file(
        ingestion_job_params.get_main_file_path()
    )

    extra_jar_paths: List[str] = []
    for extra_jar in ingestion_job_params.get_extra_jar_paths():
        extra_jar_paths.append(self._datalake.upload_file(extra_jar))

    tags = _prepare_job_tags(ingestion_job_params, STREAM_TO_ONLINE_JOB_TYPE)
    tags[METADATA_JOBHASH] = ingestion_job_params.get_job_hash()

    job_info = _submit_job(
        self._api,
        ingestion_job_params.get_project() + "_stream_to_online_ingestion",
        main_file,
        main_class=ingestion_job_params.get_class_name(),
        arguments=ingestion_job_params.get_arguments(),
        reference_files=extra_jar_paths,
        configuration=None,
        tags=tags,
    )

    return cast(StreamIngestionJob, self._job_from_job_info(job_info))
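# A hedged sketch of how the METADATA_JOBHASH tag recorded above is typically
# consumed (the list_jobs/get_hash calls below assume the standard launcher and
# StreamIngestionJob interfaces; `launcher` and `params` are illustrative): it
# lets a caller check whether an equivalent stream ingestion job is already
# running before submitting a new one.
#
#     running = [
#         j for j in launcher.list_jobs(include_terminated=False)
#         if isinstance(j, StreamIngestionJob)
#         and j.get_hash() == params.get_job_hash()
#     ]
#     if not running:
#         launcher.start_stream_to_online_ingestion(params)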
def start_stream_to_online_ingestion(
    self, ingestion_job_params: StreamIngestionJobParameters
) -> StreamIngestionJob:
    """
    Starts a stream ingestion job on a Spark cluster.

    Returns:
        StreamIngestionJob: wrapper around remote job that can be used to
            check on the job.
    """
    jar_s3_path = _upload_jar(
        self._staging_location, ingestion_job_params.get_main_file_path()
    )

    extra_jar_paths: List[str] = []
    for extra_jar in ingestion_job_params.get_extra_jar_paths():
        if extra_jar.startswith("s3://"):
            extra_jar_paths.append(extra_jar)
        else:
            extra_jar_paths.append(_upload_jar(self._staging_location, extra_jar))

    job_hash = ingestion_job_params.get_job_hash()

    step = _stream_ingestion_step(
        jar_path=jar_s3_path,
        extra_jar_paths=extra_jar_paths,
        project=ingestion_job_params.get_project(),
        feature_table_name=ingestion_job_params.get_feature_table_name(),
        args=ingestion_job_params.get_arguments(),
        job_hash=job_hash,
    )

    job_ref = self._submit_emr_job(step)

    return EmrStreamIngestionJob(
        self._emr_client(),
        job_ref,
        job_hash,
        ingestion_job_params.get_project(),
        ingestion_job_params.get_feature_table_name(),
    )
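# A standalone sketch of the jar-staging rule used above (illustration only;
# `_resolve_jar` is a hypothetical helper, not part of this module): jars that
# already live on S3 are passed through untouched, while local paths are
# uploaded to the staging location first.
#
#     def _resolve_jar(staging_location: str, path: str) -> str:
#         if path.startswith("s3://"):
#             return path  # already remote, use as-is
#         return _upload_jar(staging_location, path)  # stage local jar first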