def test_download():
    with tempfile.TemporaryDirectory() as temp_dir:
        test_job = build_assets_job(
            "test_job",
            assets=ASSETS,
            resource_defs={
                "io_manager": fs_io_manager,
                "partition_start": ResourceDefinition.string_resource(),
                "partition_end": ResourceDefinition.string_resource(),
                "parquet_io_manager": local_partitioned_parquet_io_manager.configured(
                    {"base_path": temp_dir}
                ),
                "warehouse_io_manager": mem_io_manager,
                "pyspark": pyspark_resource,
                "hn_client": hn_snapshot_client,
            },
        )
        result = test_job.execute_in_process(partition_key="2020-12-30-00:00")

        assert result.success
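The `partition_start` and `partition_end` entries above are plain string resources; the assets read them off `context.resources`. A minimal sketch of what a consumer might look like (the asset name and body here are illustrative, not taken from the project):

```python
from dagster import asset

# Hypothetical asset showing how a string resource is consumed.
@asset(required_resource_keys={"partition_start", "partition_end"})
def items_in_window(context):
    # Both resources resolve to the plain strings supplied via config.
    start, end = context.resources.partition_start, context.resources.partition_end
    context.log.info(f"Fetching items between {start} and {end}")
    return []
```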
def test_download():
    with tempfile.TemporaryDirectory() as temp_dir:
        result = download_comments_and_stories_dev.graph.execute_in_process(
            run_config={
                "resources": {
                    "partition_start": {"config": "2020-12-30 00:00:00"},
                    "partition_end": {"config": "2020-12-30 01:00:00"},
                    "parquet_io_manager": {"config": {"base_path": temp_dir}},
                }
            },
            resources={
                "io_manager": fs_io_manager,
                "partition_start": ResourceDefinition.string_resource(),
                "partition_end": ResourceDefinition.string_resource(),
                "parquet_io_manager": partitioned_parquet_io_manager,
                "warehouse_io_manager": mem_io_manager,
                "pyspark": pyspark_resource,
                "hn_client": hn_snapshot_client,
            },
        )

        assert result.success
Example #3
def test_string_resource():
    called = {}

    @solid(required_resource_keys={'test_string'})
    def solid_test_string(context):
        assert context.resources.test_string == 'foo'
        called['yup'] = True

    pipeline = PipelineDefinition(
        name='test_string_resource',
        solid_defs=[solid_test_string],
        mode_defs=[
            ModeDefinition(resource_defs={
                'test_string': ResourceDefinition.string_resource()
            })
        ],
    )

    result = execute_pipeline(
        pipeline, {'resources': {
            'test_string': {
                'config': 'foo'
            }
        }})

    assert result.success
    assert called['yup']
def test_string_resource():
    called = {}

    @solid(required_resource_keys={"test_string"})
    def solid_test_string(context):
        assert context.resources.test_string == "foo"
        called["yup"] = True

    the_pipeline = PipelineDefinition(
        name="test_string_resource",
        solid_defs=[solid_test_string],
        mode_defs=[
            ModeDefinition(resource_defs={
                "test_string": ResourceDefinition.string_resource()
            })
        ],
    )

    result = execute_pipeline(
        the_pipeline, {"resources": {
            "test_string": {
                "config": "foo"
            }
        }})

    assert result.success
    assert called["yup"]
def test_string_resource():
    called = {}

    @solid
    def solid_test_string(info):
        assert info.context.resources.test_string == 'foo'
        called['yup'] = True

    pipeline = PipelineDefinition(
        name='test_string_resource',
        solids=[solid_test_string],
        context_definitions={
            'default':
            PipelineContextDefinition(
                resources={
                    'test_string': ResourceDefinition.string_resource()
                })
        },
    )

    result = execute_pipeline(pipeline, {
        'context': {
            'default': {
                'resources': {
                    'test_string': {
                        'config': 'foo'
                    }
                }
            }
        }
    })

    assert result.success
    assert called['yup']
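The three variants above use the legacy solid/pipeline APIs. As a rough sketch, the same check written against the current op/job API could look like this (naming is illustrative):

```python
from dagster import ResourceDefinition, job, op


def test_string_resource_with_job():
    called = {}

    @op(required_resource_keys={"test_string"})
    def op_test_string(context):
        assert context.resources.test_string == "foo"
        called["yup"] = True

    @job(resource_defs={"test_string": ResourceDefinition.string_resource()})
    def string_resource_job():
        op_test_string()

    result = string_resource_job.execute_in_process(
        run_config={"resources": {"test_string": {"config": "foo"}}}
    )

    assert result.success
    assert called["yup"]
```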
Example #6
def test_download():
    with tempfile.TemporaryDirectory() as temp_dir:
        test_job = AssetGroup.from_package_name(
            "hacker_news_assets.assets",
            resource_defs={
                "io_manager": fs_io_manager,
                "partition_start": ResourceDefinition.string_resource(),
                "partition_end": ResourceDefinition.string_resource(),
                "parquet_io_manager": local_partitioned_parquet_io_manager.configured(
                    {"base_path": temp_dir}
                ),
                "warehouse_io_manager": mem_io_manager,
                "pyspark": pyspark_resource,
                "hn_client": hn_snapshot_client,
                "dbt": ResourceDefinition.none_resource(),
            },
        ).build_job(
            "test_job",
            selection=["*comments", "*stories"],
        )

        result = test_job.execute_in_process(partition_key="2020-12-30-00:00")

        assert result.success
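The AssetGroup example above binds resource config ahead of time with `.configured()`, so the test needs no run_config for the parquet IO manager. A small sketch of the same pattern applied to Dagster's built-in `fs_io_manager` (the path is arbitrary):

```python
from dagster import fs_io_manager

# configured() returns a copy of the resource with its config pre-bound,
# so callers no longer have to supply "base_dir" in run_config.
preconfigured_io_manager = fs_io_manager.configured({"base_dir": "/tmp/dagster_storage"})
```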
Example #7
                "net.snowflake:snowflake-jdbc:3.8.0",
                "net.snowflake:spark-snowflake_2.12:2.8.2-spark_3.0",
                "com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.7",
            ]
        ),
        "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
        "spark.hadoop.fs.s3.awsAccessKeyId": os.getenv("AWS_ACCESS_KEY_ID", ""),
        "spark.hadoop.fs.s3.awsSecretAccessKey": os.getenv("AWS_SECRET_ACCESS_KEY", ""),
        "spark.hadoop.fs.s3.buffer.dir": "/tmp",
    }
}


DEV_RESOURCES = {
    "io_manager": fs_io_manager,
    "partition_start": ResourceDefinition.string_resource(),
    "partition_end": ResourceDefinition.string_resource(),
    "parquet_io_manager": partitioned_parquet_io_manager.configured(
        {"base_path": get_system_temp_directory()}
    ),
    "warehouse_io_manager": fs_io_manager,
    "pyspark": pyspark_resource,
    "hn_client": hn_api_subsample_client.configured({"sample_rate": 10}),
}


PROD_RESOURCES = {
    "io_manager": s3_pickle_io_manager.configured({"s3_bucket": "hackernews-elementl-prod"}),
    "s3": s3_resource,
    "partition_start": ResourceDefinition.string_resource(),
    "partition_end": ResourceDefinition.string_resource(),
Example #8
        "spark.hadoop.fs.s3.awsSecretAccessKey":
        os.getenv("AWS_SECRET_ACCESS_KEY", ""),
        "spark.hadoop.fs.s3.buffer.dir":
        "/tmp",
    }
}

MODE_TEST = ModeDefinition(
    name="test_local_data",
    description="This mode queries snapshotted HN data and does all writes locally.",
    resource_defs={
        "io_manager": fs_io_manager,
        "partition_start": ResourceDefinition.string_resource(),
        "partition_end": ResourceDefinition.string_resource(),
        "parquet_io_manager": partitioned_parquet_io_manager,
        "db_io_manager": mem_io_manager,
        "pyspark": pyspark_resource,
        "hn_client": hn_snapshot_client,
        "slack": ResourceDefinition.mock_resource(),
        "base_url": ResourceDefinition.hardcoded_resource("http://localhost:3000", "Dagit URL"),