    def _upload_artifacts(self, log, step_run_ref, run_id, step_key):
        '''Upload the step run ref and pyspark code to DBFS to run as a job.'''

        log.info('Uploading main file to DBFS')
        main_local_path = self._main_file_local_path()
        with open(main_local_path, 'rb') as infile:
            self.databricks_runner.client.put_file(
                infile,
                self._dbfs_path(run_id, step_key, self._main_file_name()))

        log.info('Uploading pipeline to DBFS')
        with seven.TemporaryDirectory() as temp_dir:
            # Zip and upload package containing pipeline
            zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)
            build_pyspark_zip(zip_local_path, self.local_pipeline_package_path)
            with open(zip_local_path, 'rb') as infile:
                self.databricks_runner.client.put_file(
                    infile, self._dbfs_path(run_id, step_key, CODE_ZIP_NAME))

        log.info('Uploading step run ref file to DBFS')
        step_pickle_file = io.BytesIO()

        pickle.dump(step_run_ref, step_pickle_file)
        step_pickle_file.seek(0)
        self.databricks_runner.client.put_file(
            step_pickle_file,
            self._dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),
        )
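
# ---------------------------------------------------------------------------
# Hedged sketch (not dagster's client code): what a DBFS upload like the
# `put_file` calls above boils down to. It streams a file-like object to DBFS
# using the public DBFS REST API (create / add-block / close). The `host`,
# `token`, and 1 MB chunk size are illustrative assumptions, not values taken
# from the launcher.
import base64

import requests


def dbfs_put_file(host, token, file_obj, dbfs_path, overwrite=True, chunk_size=1024 * 1024):
    """Stream `file_obj` to `dbfs_path` via the DBFS REST API."""
    headers = {"Authorization": "Bearer {}".format(token)}
    api = "{}/api/2.0/dbfs".format(host.rstrip("/"))

    # Open a streaming handle; `overwrite=True` mirrors the newer snippet below.
    resp = requests.post(
        api + "/create", headers=headers, json={"path": dbfs_path, "overwrite": overwrite}
    )
    resp.raise_for_status()
    handle = resp.json()["handle"]

    # add-block accepts at most 1 MB of data per call, sent base64-encoded.
    while True:
        chunk = file_obj.read(chunk_size)
        if not chunk:
            break
        requests.post(
            api + "/add-block",
            headers=headers,
            json={"handle": handle, "data": base64.b64encode(chunk).decode("utf-8")},
        ).raise_for_status()

    requests.post(api + "/close", headers=headers, json={"handle": handle}).raise_for_status()
# ---------------------------------------------------------------------------
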
    def _post_artifacts(self, log, step_run_ref, run_id, step_key):
        """
        Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.

        For the zip file, consider the following toy example:

            # Folder: my_pyspark_project/
            # a.py
            def foo():
                print(1)

            # b.py
            def bar():
                print(2)

            # main.py
            from a import foo
            from b import bar

            foo()
            bar()

        This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. Then, when running
        `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR, this will
        print 1 and 2. (A minimal sketch of such a zip-building helper follows this method.)
        """

        with seven.TemporaryDirectory() as temp_dir:
            s3 = boto3.client("s3", region_name=self.region_name)

            # Helper: upload a local file to the staging bucket under this run/step's prefix
            def _upload_file_to_s3(local_path, s3_filename):
                key = self._artifact_s3_key(run_id, step_key, s3_filename)
                s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)
                log.debug(
                    "Uploading file {local_path} to {s3_uri}".format(
                        local_path=local_path, s3_uri=s3_uri
                    )
                )
                s3.upload_file(Filename=local_path, Bucket=self.staging_bucket, Key=key)

            # Upload main file.
            # The remote Dagster installation should also have the file, but locating it there
            # could be a pain.
            main_local_path = self._main_file_local_path()
            _upload_file_to_s3(main_local_path, self._main_file_name())

            if self.deploy_local_pipeline_package:
                # Zip and upload package containing pipeline
                zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)
                build_pyspark_zip(zip_local_path, self.local_pipeline_package_path)
                _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)

            # Create step run ref pickle file
            step_run_ref_local_path = os.path.join(temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)
            with open(step_run_ref_local_path, "wb") as step_pickle_file:
                pickle.dump(step_run_ref, step_pickle_file)

            _upload_file_to_s3(step_run_ref_local_path, PICKLED_STEP_RUN_REF_FILE_NAME)
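
# ---------------------------------------------------------------------------
# Hedged sketch (not the actual dagster helper): what `build_pyspark_zip` is
# assumed to do. Both launchers call it before uploading CODE_ZIP_NAME; a
# minimal version walks the package directory and writes each file into the
# archive under its path relative to the package root, so that
# `spark-submit --py-files code.zip ...` can import the modules exactly as in
# the `my_pyspark_project` docstring example above. Skipping __pycache__ and
# `.pyc` files is an assumption, not a requirement stated in the snippets.
import os
import zipfile


def build_pyspark_zip_sketch(zip_local_path, package_path):
    """Zip `package_path` so its modules are importable from the archive root."""
    root = os.path.abspath(package_path)
    with zipfile.ZipFile(zip_local_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for dirpath, _dirnames, filenames in os.walk(root):
            if "__pycache__" in dirpath:
                continue
            for filename in filenames:
                if filename.endswith(".pyc"):
                    continue
                abs_path = os.path.join(dirpath, filename)
                # Store paths relative to the package root so `from a import foo`
                # resolves once the zip is on the PYTHONPATH.
                zf.write(abs_path, os.path.relpath(abs_path, root))
# ---------------------------------------------------------------------------
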
    def _upload_artifacts(self, log, step_run_ref, run_id, step_key):
        """Upload the step run ref and pyspark code to DBFS to run as a job."""

        log.info("Uploading main file to DBFS")
        main_local_path = self._main_file_local_path()
        with open(main_local_path, "rb") as infile:
            self.databricks_runner.client.put_file(
                infile, self._dbfs_path(run_id, step_key, self._main_file_name()), overwrite=True
            )

        log.info("Uploading dagster job to DBFS")
        with tempfile.TemporaryDirectory() as temp_dir:
            # Zip and upload package containing dagster job
            zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)
            build_pyspark_zip(zip_local_path, self.local_dagster_job_package_path)
            with open(zip_local_path, "rb") as infile:
                self.databricks_runner.client.put_file(
                    infile, self._dbfs_path(run_id, step_key, CODE_ZIP_NAME), overwrite=True
                )

        log.info("Uploading step run ref file to DBFS")
        step_pickle_file = io.BytesIO()

        pickle.dump(step_run_ref, step_pickle_file)
        step_pickle_file.seek(0)
        self.databricks_runner.client.put_file(
            step_pickle_file,
            self._dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),
            overwrite=True,
        )

        databricks_config = DatabricksConfig(
            storage=self.storage,
            secrets=self.secrets,
        )
        log.info("Uploading Databricks configuration to DBFS")
        databricks_config_file = io.BytesIO()

        pickle.dump(databricks_config, databricks_config_file)
        databricks_config_file.seek(0)
        self.databricks_runner.client.put_file(
            databricks_config_file,
            self._dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),
            overwrite=True,
        )
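
# ---------------------------------------------------------------------------
# Hedged sketch (not dagster's step main script): how the pickles uploaded
# above might be read back on the Databricks cluster. The entry-point name and
# argument handling are assumptions; the only facts taken from the snippet are
# that a step run ref and a DatabricksConfig are pickled to DBFS paths derived
# from (run_id, step_key). `/dbfs/...` is the FUSE mount Databricks exposes
# for DBFS paths on the driver.
import pickle
import sys


def _fuse_path(dbfs_path):
    """Map a `dbfs:/...` (or bare) DBFS path to its `/dbfs/...` FUSE location."""
    return "/dbfs/" + dbfs_path.replace("dbfs:/", "").lstrip("/")


def load_step_artifacts(step_run_ref_dbfs_path, config_dbfs_path):
    """Unpickle the step run ref and Databricks config written by `_upload_artifacts`."""
    with open(_fuse_path(step_run_ref_dbfs_path), "rb") as f:
        step_run_ref = pickle.load(f)
    with open(_fuse_path(config_dbfs_path), "rb") as f:
        databricks_config = pickle.load(f)
    return step_run_ref, databricks_config


if __name__ == "__main__":
    # Hypothetical invocation: the job passes the two DBFS paths as arguments.
    step_run_ref, databricks_config = load_step_artifacts(sys.argv[1], sys.argv[2])
# ---------------------------------------------------------------------------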