Example #1
import os

from dagster import ResourceDefinition
from dagster_pyspark import pyspark_resource

from .common_bucket_s3_pickle_io_manager import common_bucket_s3_pickle_io_manager
from .parquet_io_manager import (
    local_partitioned_parquet_io_manager,
    s3_partitioned_parquet_io_manager,
)
from .snowflake_io_manager import snowflake_io_manager

configured_pyspark = pyspark_resource.configured({
    "spark_conf": {
        "spark.jars.packages": ",".join([
            "net.snowflake:snowflake-jdbc:3.8.0",
            "net.snowflake:spark-snowflake_2.12:2.8.2-spark_3.0",
            "com.amazonaws:aws-java-sdk:1.7.4",
            "org.apache.hadoop:hadoop-aws:2.7.7",
        ]),
        "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
        "spark.hadoop.fs.s3.awsAccessKeyId": os.getenv("AWS_ACCESS_KEY_ID", ""),
        "spark.hadoop.fs.s3.awsSecretAccessKey": os.getenv("AWS_SECRET_ACCESS_KEY", ""),
        "spark.hadoop.fs.s3.buffer.dir": "/tmp",
    }
})

snowflake_io_manager_prod = snowflake_io_manager.configured(
    {"database": "DEMO_DB_ASSETS"})

RESOURCES_PROD = {
    "s3_bucket": ResourceDefinition.hardcoded_resource("hackernews-elementl-prod"),
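
The excerpt stops partway through RESOURCES_PROD. As a minimal sketch of how such a resource dictionary is typically used (assuming the legacy mode-based Dagster API this snippet appears to target; the mode name and the merge with configured_pyspark are illustrative, not from the original module):

from dagster import ModeDefinition

# Illustrative only: bind the prod resource set, plus the pre-configured Spark
# resource defined above, to a mode that a pipeline can select at launch time.
prod_mode = ModeDefinition(
    name="prod",
    resource_defs={**RESOURCES_PROD, "pyspark": configured_pyspark},
)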
Example #2
from pathlib import Path

from dagster import graph, op
# In newer Dagster releases this import path is dagster._core.definitions.no_step_launcher.
from dagster.core.definitions.no_step_launcher import no_step_launcher
from dagster_aws.emr import emr_pyspark_step_launcher
from dagster_aws.s3 import s3_pickle_io_manager, s3_resource
from dagster_pyspark import pyspark_resource
from pyspark.sql import DataFrame


# The step-launcher resource decides whether this op runs locally or on EMR.
@op(required_resource_keys={"pyspark_step_launcher"})
def count_people(people: DataFrame) -> int:
    return people.count()


emr_resource_defs = {
    "pyspark_step_launcher": emr_pyspark_step_launcher.configured(
        {
            "cluster_id": {"env": "EMR_CLUSTER_ID"},
            "local_pipeline_package_path": str(Path(__file__).parent),
            "deploy_local_pipeline_package": True,
            "region_name": "us-west-1",
            "staging_bucket": "my_staging_bucket",
            "wait_for_logs": True,
        }
    ),
    "pyspark": pyspark_resource.configured({"spark_conf": {"spark.executor.memory": "2g"}}),
    "s3": s3_resource,
    "io_manager": s3_pickle_io_manager.configured(
        {"s3_bucket": "my_staging_bucket", "s3_prefix": "simple-pyspark"}
    ),
}

local_resource_defs = {
    "pyspark_step_launcher": no_step_launcher,
    "pyspark": pyspark_resource.configured({"spark_conf": {"spark.default.parallelism": 1}}),
}


@graph
def count_people_over_50():
    # make_people and filter_over_50 are ops defined elsewhere in the full example;
    # only count_people is shown in this excerpt.
    count_people(filter_over_50(make_people()))
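
The graph above is resource-agnostic. As a minimal usage sketch (the job names are illustrative, not from the original example), each of the two resource dictionaries can be bound to it with to_job:

# Bind the same graph to the EMR-backed and the local resource sets.
count_people_over_50_emr = count_people_over_50.to_job(
    name="count_people_over_50_emr", resource_defs=emr_resource_defs
)
count_people_over_50_local = count_people_over_50.to_job(
    name="count_people_over_50_local", resource_defs=local_resource_defs
)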
Example #3
    "warehouse_io_manager": fs_io_manager,
    "pyspark": pyspark_resource,
    "hn_client": hn_api_subsample_client.configured({"sample_rate": 10}),
}


PROD_RESOURCES = {
    "io_manager": s3_pickle_io_manager.configured({"s3_bucket": "hackernews-elementl-prod"}),
    "s3": s3_resource,
    "partition_start": ResourceDefinition.string_resource(),
    "partition_end": ResourceDefinition.string_resource(),
    "parquet_io_manager": partitioned_parquet_io_manager.configured(
        {"base_path": "s3://hackernews-elementl-prod"}
    ),
    "warehouse_io_manager": time_partitioned_snowflake_io_manager_prod,
    "pyspark": pyspark_resource.configured(S3_SPARK_CONF),
    "hn_client": hn_api_subsample_client.configured({"sample_rate": 10}),
}

download_pipeline_properties = {
    "description": "#### Owners:\n"
    "[email protected], [email protected]\n "
    "#### About\n"
    "This pipeline downloads all items from the HN API for a given day, "
    "splits the items into stories and comment types using Spark, and uploads filtered items to "
    "the corresponding stories or comments Snowflake table",
    "tags": {
        "dagster-k8s/config": {
            "container_config": {
                "resources": {
                    "requests": {"cpu": "500m", "memory": "2Gi"},