Example #1
0
@op(required_resource_keys={"pyspark_step_launcher"})
def filter_over_50(people: DataFrame) -> DataFrame:
    """Keep only the rows of *people* whose "age" column is greater than 50."""
    over_50 = people["age"] > 50
    return people.filter(over_50)


@op(required_resource_keys={"pyspark_step_launcher"})
def count_people(people: DataFrame) -> int:
    """Return the number of rows in the *people* DataFrame."""
    row_count = people.count()
    return row_count


# Step launcher that ships the local package to an EMR cluster and runs each
# op there. The cluster id is resolved from the environment at run time.
_emr_launcher = emr_pyspark_step_launcher.configured(
    {
        "cluster_id": {"env": "EMR_CLUSTER_ID"},
        "local_pipeline_package_path": str(Path(__file__).parent),
        "deploy_local_pipeline_package": True,
        "region_name": "us-west-1",
        "staging_bucket": "my_staging_bucket",
        "wait_for_logs": True,
    }
)

# Resource set for execution on EMR: remote step launching, a configured
# Spark session, S3 access, and pickled intermediates stored in S3.
emr_resource_defs = {
    "pyspark_step_launcher": _emr_launcher,
    "pyspark": pyspark_resource.configured({"spark_conf": {"spark.executor.memory": "2g"}}),
    "s3": s3_resource,
    "io_manager": s3_pickle_io_manager.configured(
        {"s3_bucket": "my_staging_bucket", "s3_prefix": "simple-pyspark"}
    ),
}

# Spark configuration for in-process local runs; a single partition keeps
# small example datasets cheap to process.
_local_spark_conf = {"spark.default.parallelism": 1}

# Resource set for local execution: no remote step launcher, local Spark.
local_resource_defs = {
    "pyspark_step_launcher": no_step_launcher,
    "pyspark": pyspark_resource.configured({"spark_conf": _local_spark_conf}),
}
Example #2
0
@solid(required_resource_keys={"pyspark_step_launcher"})
def count_people(_, people: DataFrame) -> int:
    """Return the number of rows in the *people* DataFrame.

    The first positional argument is the (unused) solid execution context.
    """
    total = people.count()
    return total


emr_mode = ModeDefinition(
    name="emr",
    resource_defs={
        "pyspark_step_launcher":
        emr_pyspark_step_launcher.configured({
            "cluster_id": {
                "env": "EMR_CLUSTER_ID"
            },
            "local_pipeline_package_path":
            ".",
            "deploy_local_pipeline_package":
            True,
            "region_name":
            "us-west-1",
            "staging_bucket":
            "dagster-scratch-80542c2",
        }),
        "pyspark":
        pyspark_resource,
        "s3":
        s3_resource,
    },
    intermediate_storage_defs=[
        s3_intermediate_storage.configured({
            "s3_bucket": "dagster-scratch-80542c2",
            "s3_prefix": "simple-pyspark"