Ejemplo n.º 1
0
    def offline_to_online_ingestion(
        self, ingestion_job_params: BatchIngestionJobParameters
    ) -> BatchIngestionJob:
        """
        Run the offline-to-online ingestion application as an EMR step.

        The main application file named by the parameters is handed to
        ``_upload_jar`` together with this launcher's staging location, a
        step is built for the target feature table with the job's
        arguments, and the step is submitted via ``_submit_emr_job``.

        Args:
            ingestion_job_params: Source of the main file path, the feature
                table name and the job arguments.

        Raises:
            SparkJobFailure: Documented contract of this method (submission
                failure, error during execution, or timeout). Nothing in
                this body raises it directly, so it is presumably
                propagated from the helpers -- TODO confirm.

        Returns:
            BatchIngestionJob: An ``EmrBatchIngestionJob`` built from the
                EMR client, the submitted job reference and the feature
                table name.
        """
        params = ingestion_job_params

        # Stage the application file first; the step needs its uploaded path.
        staging_location = self._staging_location
        uploaded_jar = _upload_jar(staging_location, params.get_main_file_path())

        emr_step = _sync_offline_to_online_step(
            uploaded_jar,
            params.get_feature_table_name(),
            args=params.get_arguments(),
        )
        submitted_ref = self._submit_emr_job(emr_step)

        # Wrap the submitted reference so callers can track the remote job.
        emr_client = self._emr_client()
        table_name = params.get_feature_table_name()
        return EmrBatchIngestionJob(emr_client, submitted_ref, table_name)
Ejemplo n.º 2
0
    def offline_to_online_ingestion(
        self, ingestion_job_params: BatchIngestionJobParameters
    ) -> BatchIngestionJob:
        """
        Build a batch ingestion job resource and submit it.

        The main application file is uploaded with ``self._upload_jar``, a
        fresh job id is generated, and a job resource is prepared from the
        batch ingestion template. The resource is labelled with the
        (truncated) feature table name, a project/table hash and the
        project, then submitted through ``_submit_job`` in this launcher's
        namespace.

        Args:
            ingestion_job_params: Source of the main file path, class name,
                arguments, project and feature table name.

        Raises:
            SparkJobFailure: Documented contract of this method (submission
                failure, error during execution, or timeout). Nothing in
                this body raises it directly, so it is presumably
                propagated from the helpers -- TODO confirm.

        Returns:
            BatchIngestionJob: The job object that ``_job_from_job_info``
                produces for the submitted job, cast to
                ``BatchIngestionJob`` for the type checker.
        """
        params = ingestion_job_params

        uploaded_jar = self._upload_jar(params.get_main_file_path())
        new_job_id = _generate_job_id()

        # Gather the inputs in the same order the resource builder receives them.
        template = self._batch_ingestion_template
        entry_class = params.get_class_name()
        credentials = self._get_azure_credentials()
        job_arguments = params.get_arguments()
        target_namespace = self._namespace

        # Labels attached to the resource: table name, project/table hash, project.
        labels = {
            LABEL_FEATURE_TABLE: _truncate_label(params.get_feature_table_name()),
            LABEL_FEATURE_TABLE_HASH: _generate_project_table_hash(
                params.get_project(),
                params.get_feature_table_name(),
            ),
            LABEL_PROJECT: params.get_project(),
        }

        job_resource = _prepare_job_resource(
            job_template=template,
            job_id=new_job_id,
            job_type=OFFLINE_TO_ONLINE_JOB_TYPE,
            main_application_file=uploaded_jar,
            main_class=entry_class,
            packages=[],
            jars=[],
            extra_metadata={},
            azure_credentials=credentials,
            arguments=job_arguments,
            namespace=target_namespace,
            extra_labels=labels,
        )

        submitted_info = _submit_job(
            api=self._api,
            resource=job_resource,
            namespace=target_namespace,
        )

        ingestion_job = self._job_from_job_info(submitted_info)
        return cast(BatchIngestionJob, ingestion_job)
Ejemplo n.º 3
0
 def offline_to_online_ingestion(
     self, ingestion_job_params: BatchIngestionJobParameters
 ) -> BatchIngestionJob:
     """
     Start a batch ingestion job via ``spark_submit`` and register it.

     A random UUID string is used as the job id and ``_find_free_port``
     supplies the UI port. The resulting
     ``StandaloneClusterBatchIngestionJob`` is added to
     ``global_job_cache`` before being returned.

     Args:
         ingestion_job_params: Source of the job name and feature table
             name; also passed through to ``spark_submit``.

     Returns:
         BatchIngestionJob: The newly created and cached
             ``StandaloneClusterBatchIngestionJob``.
     """
     params = ingestion_job_params

     new_job_id = str(uuid.uuid4())
     port = _find_free_port()

     # Collect constructor arguments in the order the original evaluated them.
     job_name = params.get_name()
     # NOTE(review): presumably a handle to the launched spark-submit
     # process -- confirm against spark_submit.
     submission = self.spark_submit(params, port)
     table_name = params.get_feature_table_name()

     ingestion_job = StandaloneClusterBatchIngestionJob(
         new_job_id, job_name, submission, port, table_name
     )
     global_job_cache.add_job(ingestion_job)
     return ingestion_job