@op(required_resource_keys={"pyspark_step_launcher"})
def filter_over_50(people: DataFrame) -> DataFrame:
    """Return only the rows of *people* whose "age" column exceeds 50."""
    is_over_50 = people["age"] > 50
    return people.filter(is_over_50)


@op(required_resource_keys={"pyspark_step_launcher"})
def count_people(people: DataFrame) -> int:
    """Return the total row count of *people*."""
    return people.count()


# Step launcher that runs each op as a step on an existing EMR cluster.
# The cluster id is read from the EMR_CLUSTER_ID environment variable, and the
# local package containing this file is zipped up and shipped to the cluster.
_emr_step_launcher = emr_pyspark_step_launcher.configured(
    {
        "cluster_id": {"env": "EMR_CLUSTER_ID"},
        "local_pipeline_package_path": str(Path(__file__).parent),
        "deploy_local_pipeline_package": True,
        "region_name": "us-west-1",
        "staging_bucket": "my_staging_bucket",
        "wait_for_logs": True,
    }
)

# Resources for executing on EMR: steps launch remotely and op outputs are
# pickled to S3 so they can be passed between steps.
emr_resource_defs = {
    "pyspark_step_launcher": _emr_step_launcher,
    "pyspark": pyspark_resource.configured(
        {"spark_conf": {"spark.executor.memory": "2g"}}
    ),
    "s3": s3_resource,
    "io_manager": s3_pickle_io_manager.configured(
        {"s3_bucket": "my_staging_bucket", "s3_prefix": "simple-pyspark"}
    ),
}

# Resources for local, in-process execution (no remote step launcher).
local_resource_defs = {
    "pyspark_step_launcher": no_step_launcher,
    "pyspark": pyspark_resource.configured(
        {"spark_conf": {"spark.default.parallelism": 1}}
    ),
}
@solid(required_resource_keys={"pyspark_step_launcher"}) def count_people(_, people: DataFrame) -> int: return people.count() emr_mode = ModeDefinition( name="emr", resource_defs={ "pyspark_step_launcher": emr_pyspark_step_launcher.configured({ "cluster_id": { "env": "EMR_CLUSTER_ID" }, "local_pipeline_package_path": ".", "deploy_local_pipeline_package": True, "region_name": "us-west-1", "staging_bucket": "dagster-scratch-80542c2", }), "pyspark": pyspark_resource, "s3": s3_resource, }, intermediate_storage_defs=[ s3_intermediate_storage.configured({ "s3_bucket": "dagster-scratch-80542c2", "s3_prefix": "simple-pyspark"