Code example #1
    def stop(self):
        self._spark_session.stop()

    @abc.abstractmethod
    def get_compute_fn(self, fn, solid_name):
        pass


class SystemPySparkResource(PySparkResourceDefinition):
    def get_compute_fn(self, fn, solid_name):
        return fn


@resource({
    'spark_conf': spark_config(),
    'stop_session': Field(
        bool,
        is_optional=True,
        default_value=True,
        description='Whether to stop the Spark session on pipeline completion. Defaults to True.',
    ),
})
def pyspark_resource(init_context):
    pyspark = SystemPySparkResource(init_context.resource_config['spark_conf'])
    try:
        yield pyspark
    finally:
        if init_context.resource_config['stop_session']:
            pyspark.stop()
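
A minimal usage sketch for the resource above, assuming the legacy dagster solid/pipeline API this code targets; the solid and pipeline names here are hypothetical:

from dagster import ModeDefinition, pipeline, solid

@solid(required_resource_keys={'pyspark'})
def row_count(context):
    # Use the SparkSession provided by pyspark_resource and log a row count.
    df = context.resources.pyspark.spark_session.range(100)
    context.log.info('rows: {}'.format(df.count()))

@pipeline(mode_defs=[ModeDefinition(resource_defs={'pyspark': pyspark_resource})])
def counting_pipeline():
    row_count()
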
Code example #2
    }))
def write_rdd(context, file_type, file_options, spark_rdd):
    if file_type == 'csv':
        df = context.resources.spark.createDataFrame(spark_rdd)
        context.log.info('DF: {}'.format(df))
        df.write.csv(file_options['path'],
                     header=file_options.get('header'),
                     sep=file_options.get('sep'))
    else:
        check.failed('Unsupported file type: {}'.format(file_type))


SparkRDD = as_dagster_type(RDD,
                           'SparkRDD',
                           input_schema=load_rdd,
                           output_schema=write_rdd)


@resource(config_field=Field(Dict({'spark_conf': spark_config()})))
def spark_session_resource(init_context):
    builder = SparkSession.builder
    flat = flatten_dict(init_context.resource_config['spark_conf'])
    for key, value in flat:
        builder = builder.config(key, value)

    spark = builder.getOrCreate()
    try:
        yield spark
    finally:
        spark.stop()
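
The for loop in spark_session_resource assumes flatten_dict returns (key, value) pairs whose keys are dotted Spark option names accepted by SparkSession.builder.config(). A hedged sketch of such a helper, for illustration only (not the actual dagster implementation):

def flatten_dict(d):
    # Collapse a nested dict such as {'spark': {'executor': {'memory': '2g'}}}
    # into a list of pairs like [('spark.executor.memory', '2g')].
    def _flatten(prefix, value):
        if isinstance(value, dict):
            for k, v in value.items():
                yield from _flatten(prefix + [k], v)
        else:
            yield '.'.join(prefix), value
    return list(_flatten([], d))
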
Code example #3
File: resources.py  Project: zuik/dagster

class PySparkResource:
    def __init__(self, spark_conf):
        self._spark_session = spark_session_from_config(spark_conf)

    @property
    def spark_session(self):
        return self._spark_session

    @property
    def spark_context(self):
        return self.spark_session.sparkContext


@resource({"spark_conf": spark_config()})
def pyspark_resource(init_context):
    """This resource provides access to a PySpark SparkSession for executing PySpark code within
    Dagster.

    Example:

        .. code-block:: python

            @solid(required_resource_keys={"pyspark"})
            def my_solid(context):
                spark_session = context.resources.pyspark.spark_session
                dataframe = spark_session.read.json("examples/src/main/resources/people.json")

            my_pyspark_resource = pyspark_resource.configured(
                {"spark_conf": {"spark.executor.memory": "2g"}}
Code example #4
                           'SparkRDD',
                           input_hydration_config=load_rdd,
                           output_materialization_config=write_rdd)


def spark_session_from_config(spark_conf=None):
    spark_conf = check.opt_dict_param(spark_conf, 'spark_conf')
    builder = SparkSession.builder
    flat = flatten_dict(spark_conf)
    for key, value in flat:
        builder = builder.config(key, value)

    return builder.getOrCreate()


@resource({'spark_conf': spark_config()})
def spark_session_resource(init_context):
    spark = spark_session_from_config(
        init_context.resource_config['spark_conf'])
    try:
        yield spark
    finally:
        spark.stop()


@output_selector_schema(
    Selector({
        'csv': Field(
            Dict({
                'path': Field(Path),
Code example #5
        return steps

    @property
    def running_on_emr(self):
        '''Detects whether we are running on the EMR cluster
        '''
        if os.path.exists('/mnt/var/lib/info/job-flow.json'):
            return True
        return False


@resource(
    {
        'pipeline_file': Field(str, description='Path to the file where the pipeline is defined'),
        'pipeline_fn_name': Field(str),
        'spark_config': spark_config(),
        'cluster_id': Field(str, description='Name of the job flow (cluster) on which to execute'),
        'region_name': Field(str),
        'action_on_failure': Field(str, is_required=False, default_value='CANCEL_AND_WAIT'),
        'staging_bucket': Field(
            str,
            is_required=True,
            description='S3 staging bucket to use for staging the produced main.py and zip file of'
            ' Python code',
        ),
        'requirements_file_path': Field(
            str,
            is_required=False,
            description='Path to a requirements.txt file; the current directory is searched if none'
            ' is specified.',
        ),