def _upload_artifacts(self, log, step_run_ref, run_id, step_key):
    """Stage the files needed to run the step as a job on DBFS.

    Uploads, in this order, to the locations produced by ``self._dbfs_path``
    for ``run_id`` and ``step_key``:

    1. the main file found at ``self._main_file_local_path()``,
    2. a zip of ``self.local_pipeline_package_path`` built in a temporary directory,
    3. the pickled ``step_run_ref``.

    Args:
        log: Logger used to report progress at info level.
        step_run_ref: Object that is pickled and uploaded.
        run_id: Run identifier forwarded to ``self._dbfs_path``.
        step_key: Step identifier forwarded to ``self._dbfs_path``.
    """
    log.info("Uploading main file to DBFS")
    entry_point_path = self._main_file_local_path()
    with open(entry_point_path, "rb") as entry_point_stream:
        put_file = self.databricks_runner.client.put_file
        put_file(
            entry_point_stream,
            self._dbfs_path(run_id, step_key, self._main_file_name()),
        )

    log.info("Uploading pipeline to DBFS")
    with seven.TemporaryDirectory() as scratch_dir:
        # The archive only has to exist long enough to be streamed to DBFS.
        archive_path = os.path.join(scratch_dir, CODE_ZIP_NAME)
        build_pyspark_zip(archive_path, self.local_pipeline_package_path)
        with open(archive_path, "rb") as archive_stream:
            put_file = self.databricks_runner.client.put_file
            put_file(
                archive_stream,
                self._dbfs_path(run_id, step_key, CODE_ZIP_NAME),
            )

    log.info("Uploading step run ref file to DBFS")
    # Serialize in memory; a BytesIO built from bytes starts at offset 0,
    # so it can be handed to put_file without rewinding.
    pickled_ref_stream = io.BytesIO(pickle.dumps(step_run_ref))
    put_file = self.databricks_runner.client.put_file
    put_file(
        pickled_ref_stream,
        self._dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),
    )
def _post_artifacts(self, log, step_run_ref, run_id, step_key):
    """Copy the step run ref and pyspark code to the S3 staging bucket for use on EMR.

    Three artifacts are uploaded under the key derived by
    ``self._artifact_s3_key`` from ``run_id`` and ``step_key``: the main file,
    optionally a zip of the local pipeline package (only when
    ``self.deploy_local_pipeline_package`` is truthy), and the pickled
    ``step_run_ref``.

    To see how the zip file is used, take this toy project:

        # Folder: my_pyspark_project/
        # a.py
        def foo():
            print(1)

        # b.py
        def bar():
            print(2)

        # main.py
        from a import foo
        from b import bar

        foo()
        bar()

    `my_pyspark_project/` is zipped up as `my_pyspark_project.zip`. Running
    `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR
    then prints 1, 2.

    Args:
        log: Logger used to report each upload at debug level.
        step_run_ref: Object that is pickled and uploaded.
        run_id: Run identifier used to build the S3 key and URI.
        step_key: Step identifier used to build the S3 key and URI.
    """
    with seven.TemporaryDirectory() as scratch_dir:
        s3_client = boto3.client("s3", region_name=self.region_name)

        def _push_to_staging(source_path, remote_name):
            # Send one local file to the staging bucket, logging its destination URI.
            object_key = self._artifact_s3_key(run_id, step_key, remote_name)
            destination_uri = self._artifact_s3_uri(run_id, step_key, remote_name)
            message = "Uploading file {local_path} to {s3_uri}".format(
                local_path=source_path, s3_uri=destination_uri
            )
            log.debug(message)
            s3_client.upload_file(
                Filename=source_path, Bucket=self.staging_bucket, Key=object_key
            )

        # Main file: the remote Dagster installation should have it as well, but
        # locating it there could be a pain, so ship the local copy.
        entry_point_path = self._main_file_local_path()
        _push_to_staging(entry_point_path, self._main_file_name())

        if self.deploy_local_pipeline_package:
            # Package containing the pipeline, zipped into the scratch directory.
            archive_path = os.path.join(scratch_dir, CODE_ZIP_NAME)
            build_pyspark_zip(archive_path, self.local_pipeline_package_path)
            _push_to_staging(archive_path, CODE_ZIP_NAME)

        # Step run ref: pickle to a scratch file first, since the upload helper
        # takes a local path.
        pickled_ref_path = os.path.join(scratch_dir, PICKLED_STEP_RUN_REF_FILE_NAME)
        with open(pickled_ref_path, "wb") as pickled_ref_file:
            pickle.dump(step_run_ref, pickled_ref_file)
        _push_to_staging(pickled_ref_path, PICKLED_STEP_RUN_REF_FILE_NAME)
def _upload_artifacts(self, log, step_run_ref, run_id, step_key):
    """Stage the files needed to run the step as a job on DBFS.

    Uploads, in this order, to the locations produced by ``self._dbfs_path``
    for ``run_id`` and ``step_key``, each with ``overwrite=True``:

    1. the main file found at ``self._main_file_local_path()``,
    2. a zip of ``self.local_dagster_job_package_path`` built in a temporary directory,
    3. the pickled ``step_run_ref``,
    4. a pickled ``DatabricksConfig`` built from ``self.storage`` and ``self.secrets``.

    Args:
        log: Logger used to report progress at info level.
        step_run_ref: Object that is pickled and uploaded.
        run_id: Run identifier forwarded to ``self._dbfs_path``.
        step_key: Step identifier forwarded to ``self._dbfs_path``.
    """
    log.info("Uploading main file to DBFS")
    entry_point_path = self._main_file_local_path()
    with open(entry_point_path, "rb") as entry_point_stream:
        put_file = self.databricks_runner.client.put_file
        put_file(
            entry_point_stream,
            self._dbfs_path(run_id, step_key, self._main_file_name()),
            overwrite=True,
        )

    log.info("Uploading dagster job to DBFS")
    with tempfile.TemporaryDirectory() as scratch_dir:
        # The archive only has to exist long enough to be streamed to DBFS.
        archive_path = os.path.join(scratch_dir, CODE_ZIP_NAME)
        build_pyspark_zip(archive_path, self.local_dagster_job_package_path)
        with open(archive_path, "rb") as archive_stream:
            put_file = self.databricks_runner.client.put_file
            put_file(
                archive_stream,
                self._dbfs_path(run_id, step_key, CODE_ZIP_NAME),
                overwrite=True,
            )

    log.info("Uploading step run ref file to DBFS")
    # Serialize in memory; a BytesIO built from bytes starts at offset 0,
    # so it can be handed to put_file without rewinding.
    pickled_ref_stream = io.BytesIO(pickle.dumps(step_run_ref))
    put_file = self.databricks_runner.client.put_file
    put_file(
        pickled_ref_stream,
        self._dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),
        overwrite=True,
    )

    job_config = DatabricksConfig(storage=self.storage, secrets=self.secrets)

    log.info("Uploading Databricks configuration to DBFS")
    pickled_config_stream = io.BytesIO(pickle.dumps(job_config))
    put_file = self.databricks_runner.client.put_file
    put_file(
        pickled_config_stream,
        self._dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),
        overwrite=True,
    )