def _get_execute_steps(self, context, solid_name):
    '''From the local Dagster instance, construct EMR steps that will kick off execution on a
    remote EMR cluster.
    '''
    action_on_failure = self.config['action_on_failure']
    staging_bucket = self.config['staging_bucket']
    run_id = context.run_id
    local_root = os.path.dirname(os.path.abspath(self.config['pipeline_file']))

    steps = []

    # Install Python dependencies if a requirements file exists
    requirements_file = self.config.get('requirements_file_path')
    if requirements_file and not os.path.exists(requirements_file):
        raise DagsterInvalidDefinitionError(
            'The requirements.txt file that was specified does not exist'
        )
    if not requirements_file:
        requirements_file = os.path.join(local_root, 'requirements.txt')

    if os.path.exists(requirements_file):
        with open(requirements_file, 'rb') as f:
            python_dependencies = six.ensure_str(f.read()).split('\n')

        steps.append(
            get_install_requirements_step(python_dependencies, action_on_failure)
        )

    # Execute Solid via spark-submit
    conf = dict(flatten_dict(self.config.get('spark_conf')))
    conf['spark.app.name'] = conf.get('spark.app.name', solid_name)

    check.invariant(
        conf.get('spark.master', 'yarn') == 'yarn',
        desc='spark.master is configured as %s; cannot set Spark master on EMR to anything '
        'other than "yarn"' % conf.get('spark.master'),
    )

    steps.append(
        {
            'Name': 'Execute Solid %s' % solid_name,
            'ActionOnFailure': action_on_failure,
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    EMR_SPARK_HOME + 'bin/spark-submit',
                    '--master',
                    'yarn',
                    '--deploy-mode',
                    conf.get('spark.submit.deployMode', 'client'),
                ]
                + format_for_cli(list(flatten_dict(conf)))
                + [
                    '--py-files',
                    's3://%s/%s/pyspark.zip' % (staging_bucket, run_id),
                    's3://%s/%s/main.py' % (staging_bucket, run_id),
                ],
            },
        }
    )

    return steps
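
# Illustrative only: a minimal, self-contained sketch of submitting the step dicts produced by
# _get_execute_steps to an already-running EMR cluster via boto3. The cluster id and region are
# placeholders; the real integration routes submission through its own EMR job-runner abstraction
# rather than calling boto3 directly like this.
def submit_steps_to_emr(steps, cluster_id, region_name='us-west-2'):
    import boto3

    emr_client = boto3.client('emr', region_name=region_name)
    # add_job_flow_steps queues the steps on the running cluster and returns their step ids
    response = emr_client.add_job_flow_steps(JobFlowId=cluster_id, Steps=steps)
    return response['StepIds']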
def spark_session_from_config(spark_conf=None):
    spark_conf = check.opt_dict_param(spark_conf, 'spark_conf')
    builder = SparkSession.builder
    # flatten_dict collapses the nested spark_conf into (dotted key, value) pairs,
    # e.g. ('spark.app.name', ...), which are applied to the builder one at a time
    flat = flatten_dict(spark_conf)
    for key, value in flat:
        builder = builder.config(key, value)

    return builder.getOrCreate()
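
# Illustrative only: a usage sketch for spark_session_from_config, assuming pyspark is installed
# locally. The nested dict below is flattened into the dotted keys 'spark.app.name' and
# 'spark.executor.memory' before being applied to the SparkSession builder.
def example_local_session():
    example_conf = {
        'spark': {'app': {'name': 'example_app'}, 'executor': {'memory': '2g'}},
    }
    session = spark_session_from_config(example_conf)
    print(session.sparkContext.appName)  # -> 'example_app'
    session.stop()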
def spark_session_resource(init_context):
    builder = SparkSession.builder
    # Apply the flattened spark_conf from the resource config to the builder
    flat = flatten_dict(init_context.resource_config['spark_conf'])
    for key, value in flat:
        builder = builder.config(key, value)

    spark = builder.getOrCreate()
    try:
        yield spark
    finally:
        # Ensure the SparkSession is torn down when the run that used the resource finishes
        spark.stop()
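
# Illustrative only: a minimal sketch of consuming spark_session_resource from a solid in the
# legacy solid/pipeline API. It assumes the generator above has already been wrapped with
# dagster's @resource decorator (as the real package does) and is bound to the 'spark' resource
# key; the solid and pipeline names here are made up for the example.
from dagster import ModeDefinition, pipeline, solid

@solid(required_resource_keys={'spark'})
def count_rows(context):
    # The resource yields a live SparkSession, exposed as context.resources.spark
    return context.resources.spark.range(100).count()

@pipeline(mode_defs=[ModeDefinition(resource_defs={'spark': spark_session_resource})])
def example_spark_pipeline():
    count_rows()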
def _get_emr_step_def(self, run_id, step_key, solid_name):
    """From the local Dagster instance, construct EMR steps that will kick off execution on a
    remote EMR cluster.
    """
    from dagster_spark.utils import flatten_dict, format_for_cli

    action_on_failure = self.action_on_failure

    # Execute Solid via spark-submit
    conf = dict(flatten_dict(self.spark_config))
    conf["spark.app.name"] = conf.get("spark.app.name", solid_name)

    check.invariant(
        conf.get("spark.master", "yarn") == "yarn",
        desc="spark.master is configured as %s; cannot set Spark master on EMR to anything "
        'other than "yarn"' % conf.get("spark.master"),
    )

    command = (
        [
            EMR_SPARK_HOME + "bin/spark-submit",
            "--master",
            "yarn",
            "--deploy-mode",
            conf.get("spark.submit.deployMode", "client"),
        ]
        + format_for_cli(list(flatten_dict(conf)))
        + [
            "--py-files",
            self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),
            self._artifact_s3_uri(run_id, step_key, self._main_file_name()),
            self.staging_bucket,
            self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),
        ]
    )

    return EmrJobRunner.construct_step_dict_for_command(
        "Execute Solid %s" % solid_name, command, action_on_failure=action_on_failure
    )
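
# Illustrative only: construct_step_dict_for_command is not reproduced here, but judging from the
# explicit step construction in the older _get_execute_steps above, the step dict it returns has
# roughly this shape (an approximation for illustration, not the helper's actual source):
def _approx_step_dict(name, command, action_on_failure):
    return {
        "Name": name,
        "ActionOnFailure": action_on_failure,
        "HadoopJarStep": {"Jar": "command-runner.jar", "Args": command},
    }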
def _get_emr_step_def(self, run_id, step_key, solid_name):
    '''From the local Dagster instance, construct EMR steps that will kick off execution on a
    remote EMR cluster.
    '''
    action_on_failure = self.action_on_failure

    # Execute Solid via spark-submit
    conf = dict(flatten_dict(self.spark_config))
    conf['spark.app.name'] = conf.get('spark.app.name', solid_name)

    check.invariant(
        conf.get('spark.master', 'yarn') == 'yarn',
        desc='spark.master is configured as %s; cannot set Spark master on EMR to anything '
        'other than "yarn"' % conf.get('spark.master'),
    )

    command = (
        [
            EMR_SPARK_HOME + 'bin/spark-submit',
            '--master',
            'yarn',
            '--deploy-mode',
            conf.get('spark.submit.deployMode', 'client'),
        ]
        + format_for_cli(list(flatten_dict(conf)))
        + [
            '--py-files',
            self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),
            self._artifact_s3_uri(run_id, step_key, self._main_file_name()),
            self.staging_bucket,
            self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),
        ]
    )

    return EmrJobRunner.construct_step_dict_for_command(
        'Execute Solid %s' % solid_name, command, action_on_failure=action_on_failure
    )