def get_os_client_ex(spark, region):
    # Assumes a module-level `import tempfile` and the globals
    # USE_INSTANCE_PRINCIPLE, OCI_KEY and OCI_CONFIG set by the job loader.
    from oci_core import dfapp_get_os_client, get_delegation_token, get_os_client
    if USE_INSTANCE_PRINCIPLE:
        # running inside Data Flow: authenticate with the delegation token
        delegation_token = get_delegation_token(spark)
        os_client = dfapp_get_os_client(region, delegation_token)
    else:
        # API-key auth: materialize the key into a temp file and point a
        # copy of the OCI config at it
        with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as key_f:
            key_f.write(OCI_KEY)
        _oci_config = dict(OCI_CONFIG)
        _oci_config['key_file'] = key_f.name
        os_client = get_os_client(None, config=_oci_config)
    return os_client
def run(self, deployment_location, options={}, args={}, handlers=None,
        on_job_submitted=None, cli_mode=False):
    # options fields
    #     num_executors    : number
    #     driver_shape     : string
    #     executor_shape   : string
    #     lib_url_duration : number (the number of minutes)
    #     on_job_submitted : callback, on_job_submitted(run_id, vendor_info={'oci_run_id': 'xxxyyy'})
    o = urlparse(deployment_location)
    if o.scheme != 'oci':
        raise SparkETLLaunchFailure("deployment_location must be in OCI")

    run_dir = self.config.get('run_dir') or self.config.get('run_base_dir')
    run_id = str(uuid.uuid4())

    namespace = o.netloc.split('@')[1]
    bucket = o.netloc.split('@')[0]
    root_path = o.path[1:]  # remove the leading "/"

    # let's get the deployment.json
    os_client = get_os_client(self.region, self.config.get("oci_config"))
    deployment = os_download_json(
        os_client, namespace, bucket, os.path.join(root_path, "deployment.json"))

    # let's upload the args
    client_channel = ClientChannel(
        self.region, self.config.get("oci_config"), run_dir, run_id)
    client_channel.write_json("args.json", args)

    o = urlparse(run_dir)
    namespace = o.netloc.split('@')[1]
    bucket = o.netloc.split('@')[0]
    root_path = o.path[1:]  # remove the leading "/"
    # note: this writes the same args.json object as client_channel.write_json above
    os_upload_json(os_client, args, namespace, bucket, f"{root_path}/{run_id}/args.json")

    df_client = get_df_client(self.region, self.config.get("oci_config"))
    crd_argv = {
        'compartment_id': deployment['compartment_id'],
        'application_id': deployment['application_id'],
        'display_name': options["display_name"],
        'arguments': [
            "--deployment-location", deployment_location,
            "--run-id", run_id,
            "--run-dir", os.path.join(run_dir, run_id),
            "--app-region", self.region,
        ],
    }
    for key in ['num_executors', 'driver_shape', 'executor_shape']:
        if key in options:
            crd_argv[key] = options[key]

    create_run_details = oci.data_flow.models.CreateRunDetails(**crd_argv)
    r = df_client.create_run(create_run_details=create_run_details)
    check_response(r, lambda: SparkETLLaunchFailure(
        "dataflow failed to run the application"))
    run = r.data
    oci_run_id = run.id
    print(f"Job launched, run_id = {run_id}, oci_run_id = {oci_run_id}")
    if on_job_submitted is not None:
        on_job_submitted(run_id, vendor_info={'oci_run_id': oci_run_id})

    # poll the Data Flow run until it reaches a terminal state
    cli_entered = False
    while True:
        time.sleep(10)
        r = df_client.get_run(run_id=run.id)
        check_response(r, lambda: SparkETLGetStatusFailure(
            "dataflow failed to get run status"))
        run = r.data
        print(f"Status: {run.lifecycle_state}")
        if run.lifecycle_state in ('FAILED', 'SUCCEEDED', 'CANCELED'):
            break
        handle_server_ask(client_channel, handlers)
        if cli_mode and not cli_entered and run.lifecycle_state == 'IN_PROGRESS':
            cli_entered = True
            cli_handler = CLIHandler(client_channel, None, handlers)
            cli_handler.loop()

    if run.lifecycle_state in ('FAILED', 'CANCELED'):
        raise Exception(f"Job failed with status: {run.lifecycle_state}")
    return client_channel.read_json('result.json')
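# Usage sketch (illustrative, not from the original source): assuming a deployer
# class exposing the run() method above, configured with a region, a run_dir and
# an optional oci_config, a launch might look roughly like this. The class name
# `OCIDataflowDeployer`, the bucket/namespace values and the args payload are
# assumptions made for the example only.
#
#   deployer = OCIDataflowDeployer({
#       "region": "us-ashburn-1",
#       "run_dir": "oci://runs@mynamespace/runs",
#       "oci_config": None,   # None -> rely on instance principal / delegation token
#   })
#   result = deployer.run(
#       "oci://apps@mynamespace/myapp/1.0.0.1",
#       options={"display_name": "myapp", "num_executors": 3},
#       args={"action": "load", "dt": "2021-01-01"},
#   )
#   print(result)             # contents of result.json written by the job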
def delete_json(self, name):
    os_client = get_os_client(self.region, self.oci_config)
    object_name = os.path.join(self.root_path, self.run_id, name)
    os_delete_object(os_client, self.namespace, self.bucket, object_name)
def write_json(self, name, payload):
    os_client = get_os_client(self.region, self.oci_config)
    object_name = os.path.join(self.root_path, self.run_id, name)
    os_upload_json(os_client, payload, self.namespace, self.bucket, object_name)
def read_json(self, name):
    os_client = get_os_client(self.region, self.oci_config)
    object_name = os.path.join(self.root_path, self.run_id, name)
    result = os_download_json(os_client, self.namespace, self.bucket, object_name)
    return result
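# Usage sketch (illustrative): the three helpers above give the launcher a simple
# JSON "mailbox" under <root_path>/<run_id>/ in Object Storage. A hypothetical
# request/response exchange with the running job could look like the lines below;
# the object names "ask.json" and "answer.json" are assumptions for the example,
# not part of the channel's API.
#
#   channel = ClientChannel(region, oci_config, run_dir, run_id)
#   ask = channel.read_json("ask.json")                    # read the job's request
#   channel.write_json("answer.json", {"status": "ok"})    # reply to the job
#   channel.delete_json("ask.json")                        # acknowledge / clean up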
def deploy(self, build_dir, destination_location):
    o = urlparse(destination_location)
    if o.scheme != 'oci':
        raise SparkETLDeploymentFailure("destination_location must be in OCI")

    namespace = o.netloc.split('@')[1]
    bucket = o.netloc.split('@')[0]
    root_path = o.path[1:]  # remove the leading "/"

    build = Build(build_dir)
    print("Uploading files:")
    # Data Flow expects the python library archive to be named python.zip
    os_client = get_os_client(self.region, config=self.config.get("oci_config"))
    for artifact in build.artifacts:
        os_upload(
            os_client,
            f"{build_dir}/{artifact}",
            namespace,
            bucket,
            f"{root_path}/{build.version}/{artifact}"
        )

    # let's upload the job loader
    job_loader_filename = get_job_loader(self.config.get("oci_config"))
    os_upload(
        os_client,
        job_loader_filename,
        namespace,
        bucket,
        f"{root_path}/{build.version}/job_loader.py"
    )

    # create the Data Flow application and record its ids in deployment.json
    application = self.create_application(build.manifest, destination_location)
    app_info = {
        "application_id": application.id,
        "compartment_id": application.compartment_id
    }
    os_upload_json(
        os_client, app_info, namespace, bucket,
        f"{root_path}/{build.version}/deployment.json"
    )

    # if an API-key based OCI config is provided, ship the config and key so
    # the job loader can authenticate without an instance principal
    oci_config = self.config.get("oci_config")
    if oci_config is not None:
        os_upload(
            os_client,
            _save_json_temp(oci_config),
            namespace,
            bucket,
            "oci_config.json"
        )
        os_upload(
            os_client,
            oci_config['key_file'],
            namespace,
            bucket,
            "oci_api_key.pem",
        )
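# Usage sketch (illustrative): deploying a built application to Object Storage.
# The deployer class name, config keys and paths are assumptions; the contract
# shown above is a build_dir containing the artifacts plus a manifest, and an
# oci:// destination of the form oci://<bucket>@<namespace>/<path>, under which
# a <build.version> subdirectory is created.
#
#   deployer = OCIDataflowDeployer({
#       "region": "us-ashburn-1",
#       "oci_config": None,   # or a dict with user/tenancy/key_file for API-key auth
#   })
#   deployer.deploy(
#       ".builds/myapp",                   # output of the build step
#       "oci://apps@mynamespace/myapp",    # versioned subdir is appended per build
#   )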