Exemple #1
0
def test_composite_config_driven_materialization():
    """Check that a composite solid's output can be materialized via run config.

    A constant-returning solid is wrapped in a composite solid, and the run
    config asks for the composite's ``result`` output to be written as JSON.
    The test passes when the requested file exists after execution.
    """

    @lambda_solid
    def one():
        return 1

    @composite_solid
    def wrap_one():
        return one()

    @pipeline
    def composite_config_driven_materialization_pipeline():
        wrap_one()

    with get_temp_dir() as write_directory:
        write_location = os.path.join(write_directory, 'wrap_one.json')

        # Assemble the config from the innermost entry outwards.
        json_target = {'json': {'path': write_location}}
        wrap_one_config = {'outputs': [{'result': json_target}]}

        execute_pipeline(
            composite_config_driven_materialization_pipeline,
            run_config={'solids': {'wrap_one': wrap_one_config}},
        )

        assert os.path.exists(write_location)
Exemple #2
0
def test_dataframe_outputs(file_type, read, kwargs):
    """Write a dask DataFrame through an output config entry and read it back.

    Args:
        file_type: Key placed in the output config to select the output format.
        read: Callable that loads the written data from a glob path.
        kwargs: Extra options merged into the output config next to ``path``.
    """
    df = create_dask_df()

    @solid(output_defs=[OutputDefinition(dagster_type=DataFrame, name='output_df')])
    def return_df(_):
        return df

    with get_temp_dir() as temp_path:
        # Remove the directory first so the output is written to a path that
        # does not exist yet.
        shutil.rmtree(temp_path)

        # 'path' goes in first; entries from kwargs are applied on top of it.
        write_options = {'path': temp_path}
        write_options.update(kwargs)

        output_entry = {'output_df': {file_type: write_options}}
        result = execute_solid(
            return_df,
            run_config={'solids': {'return_df': {'outputs': [output_entry]}}},
        )
        assert result.success

        written_glob = f"{temp_path}/*"
        actual = read(written_glob)
        assert assert_eq(actual, df)
Exemple #3
0
def test_missing_resources():
    """Expect a definition error when only ``file_cache`` is supplied.

    ``cache_file_from_s3`` is executed with a resources mapping that contains
    just ``file_cache``; the call must raise DagsterInvalidDefinitionError.
    """
    bucket_data = {'bucket': 'some-bucket', 'key': 'some-key'}

    with pytest.raises(DagsterInvalidDefinitionError), get_temp_dir() as temp_dir:
        solids_config = {
            'cache_file_from_s3': {'inputs': {'bucket_data': bucket_data}},
        }
        resources_config = {
            'file_cache': {'config': {'target_folder': temp_dir}},
        }
        execute_solid_with_resources(
            cache_file_from_s3,
            resources={'file_cache': fs_file_cache},
            environment_dict={
                'solids': solids_config,
                'resources': resources_config,
            },
        )
Exemple #4
0
def test_composite_config_driven_materialization(composition_decorator):
    """Check config-driven materialization of a composed solid's output.

    Args:
        composition_decorator: Decorator factory used to build ``wrap_one``;
            it is called with ``output_defs=[OutputDefinition()]``.
    """

    @lambda_solid
    def one():
        return 1

    @composition_decorator(output_defs=[OutputDefinition()])
    def wrap_one():
        return one()

    @pipeline
    def composite_config_driven_materialization_pipeline():
        wrap_one()

    with get_temp_dir() as write_directory:
        write_location = os.path.join(write_directory, "wrap_one.json")

        # Ask for the "result" output of wrap_one to be written as JSON.
        json_target = {"json": {"path": write_location}}
        solids_config = {"wrap_one": {"outputs": [{"result": json_target}]}}

        execute_pipeline(
            composite_config_driven_materialization_pipeline,
            run_config={"solids": solids_config},
        )

        assert os.path.exists(write_location)
Exemple #5
0
def test_fs_file_cache_write_binary_data():
    """Write raw bytes into an FSFileCache and inspect the resulting handle."""
    with get_temp_dir() as temp_dir:
        cache = FSFileCache(temp_dir)

        # Nothing is stored under the key until the write happens.
        assert not cache.has_file_object('foo')
        assert cache.write_binary_data('foo', b'bar')

        handle = cache.get_file_handle('foo')
        assert isinstance(handle, LocalFileHandle)
        assert handle.path_desc == os.path.join(temp_dir, 'foo')
Exemple #6
0
def test_fs_file_cache_write_data():
    """Write a file-like object into an FSFileCache and inspect the handle."""
    source = io.BytesIO(b'bar')
    with get_temp_dir() as temp_dir:
        cache = FSFileCache(temp_dir)

        # Nothing is stored under the key until the write happens.
        assert not cache.has_file_object('foo')
        assert cache.write_file_object('foo', source)

        handle = cache.get_file_handle('foo')
        assert isinstance(handle, LocalFileHandle)
        assert handle.path_desc == os.path.join(temp_dir, 'foo')
Exemple #7
0
def test_dataframe_outputs(file_type, read, other):
    """Write a PySpark DataFrame via output config twice and read it back.

    The second write adds ``mode: overwrite`` and ``compression: gzip`` to the
    same options and targets the same path.

    Args:
        file_type: Output format name; used as the config key unless ``other``
            is truthy.
        read: Callable that loads the written data from a path plus options.
        other: When truthy, the config key is ``"other"`` and ``file_type`` is
            passed as the ``format`` option instead.
    """
    df = create_pyspark_df()

    @solid(output_defs=[OutputDefinition(dagster_type=DagsterPySparkDataFrame, name="df")])
    def return_df(_):
        return df

    def run_with_output(output_key, output_options):
        # Execute the solid with a single config-driven output for "df".
        output_entry = {"df": {output_key: output_options}}
        return execute_solid(
            return_df,
            run_config={"solids": {"return_df": {"outputs": [output_entry]}}},
        )

    def assert_round_trip(read_options):
        # Read back with every option except the path itself, then compare rows.
        actual = read(read_options["path"], **dict_without_keys(read_options, "path"))
        assert sorted(df.collect()) == sorted(actual.collect())

    with get_temp_dir() as temp_path:
        # Remove the directory first so the first write targets a missing path.
        shutil.rmtree(temp_path)

        options = {"path": temp_path}
        output_key = file_type
        if other:
            options["format"] = file_type
            output_key = "other"

        first_run = run_with_output(output_key, options)
        assert first_run.success
        assert_round_trip(options)

        overwrite_options = {"mode": "overwrite", "compression": "gzip", **options}
        second_run = run_with_output(output_key, overwrite_options)
        assert second_run.success
        assert_round_trip(options)
Exemple #8
0
def test_basic_file_manager_copy_handle_to_local_temp():
    """Copy a file handle to a local temp file and verify path and contents."""
    foo_data = b'foo'
    with get_temp_dir() as temp_dir, \
            get_temp_file_handle_with_data(foo_data) as foo_handle:
        with local_file_manager(temp_dir) as manager:
            copied_path = manager.copy_handle_to_local_temp(foo_handle)

            # The copy must be a different path from the source handle's path.
            assert copied_path != foo_handle.path

            with open(copied_path, 'rb') as copied_file:
                copied_bytes = copied_file.read()
            assert copied_bytes == foo_data
Exemple #9
0
def test_dataframe_outputs(file_type, read):
    """Write a PySpark DataFrame via output config twice and read it back.

    The second write targets the same path with ``mode: overwrite`` and
    ``compression: gzip``.

    Args:
        file_type: Key placed in the output config to select the output format.
        read: Callable that loads the written data from a path.
    """
    df = create_pyspark_df()

    @solid(output_defs=[OutputDefinition(dagster_type=DagsterPySparkDataFrame, name='df')])
    def return_df(_):
        return df

    def run_with_options(output_options):
        # Execute the solid with a single config-driven output for 'df'.
        output_entry = {'df': {file_type: output_options}}
        return execute_solid(
            return_df,
            run_config={'solids': {'return_df': {'outputs': [output_entry]}}},
        )

    def assert_round_trip(path):
        # Compare the rows on disk with the rows of the source frame.
        actual = read(path)
        assert sorted(df.collect()) == sorted(actual.collect())

    with get_temp_dir() as temp_path:
        # Remove the directory first so the first write targets a missing path.
        shutil.rmtree(temp_path)

        first_run = run_with_options({'path': temp_path})
        assert first_run.success
        assert_round_trip(temp_path)

        second_run = run_with_options({
            'path': temp_path,
            'mode': 'overwrite',
            'compression': 'gzip',
        })
        assert second_run.success
        assert_round_trip(temp_path)
Exemple #10
0
def test_cache_file_from_s3_basic():
    """Run ``cache_file_from_s3`` once against a mocked S3 session.

    Checks that one download was requested, that the single expectation
    result points at an existing path, and that the solid returns a
    LocalFileHandle whose description contains the requested key.
    """
    s3_session = mock.MagicMock()
    with get_temp_dir() as temp_dir:
        resources = {
            'file_cache': fs_file_cache,
            's3': ResourceDefinition.hardcoded_resource(S3Resource(s3_session)),
        }
        environment = {
            'solids': {
                'cache_file_from_s3': {
                    'inputs': {
                        'bucket_data': {'bucket': 'some-bucket', 'key': 'some-key'},
                    },
                },
            },
            'resources': {
                'file_cache': {'config': {'target_folder': temp_dir}},
            },
        }
        pipeline_result = execute_solid_with_resources(
            cache_file_from_s3,
            resources=resources,
            environment_dict=environment,
        )

        # assert the download occurred
        assert s3_session.download_file.call_count == 1

        assert pipeline_result.success

        solid_result = pipeline_result.result_for_solid('cache_file_from_s3')
        assert solid_result.success

        # Exactly one expectation is recorded during compute.
        recorded = solid_result.expectation_results_during_compute
        assert len(recorded) == 1
        file_exists_check = recorded[0]
        assert file_exists_check.success
        assert file_exists_check.label == 'file_handle_exists'

        first_entry = file_exists_check.metadata_entries[0]
        path_in_metadata = first_entry.entry_data.path
        assert isinstance(path_in_metadata, str)
        assert os.path.exists(path_in_metadata)

        assert isinstance(solid_result.result_value(), LocalFileHandle)
        assert 'some-key' in solid_result.result_value().path_desc
Exemple #11
0
def test_cache_file_from_s3_specify_target_key():
    """Run ``cache_file_from_s3`` with an explicit ``file_key`` in solid config.

    Checks that one download was requested and that the returned
    LocalFileHandle's description contains the configured file key.
    """
    s3_session = mock.MagicMock()
    with get_temp_dir() as temp_dir:
        resources = {
            'file_cache': fs_file_cache,
            's3': ResourceDefinition.hardcoded_resource(S3Resource(s3_session)),
        }
        solid_config = {
            'inputs': {
                'bucket_data': {'bucket': 'some-bucket', 'key': 'some-key'},
            },
            'config': {'file_key': 'specified-file-key'},
        }
        environment = {
            'solids': {'cache_file_from_s3': solid_config},
            'resources': {
                'file_cache': {'config': {'target_folder': temp_dir}},
            },
        }
        pipeline_result = execute_solid_with_resources(
            cache_file_from_s3,
            resources=resources,
            environment_dict=environment,
        )

        # assert the download occurred
        assert s3_session.download_file.call_count == 1
        assert pipeline_result.success

        solid_result = pipeline_result.result_for_solid('cache_file_from_s3')
        assert solid_result.success
        assert isinstance(solid_result.result_value(), LocalFileHandle)
        assert 'specified-file-key' in solid_result.result_value().path_desc
Exemple #12
0
def test_dataframe_outputs(file_type, read, kwargs):
    """Write a dask DataFrame via the top-level output key and read it back.

    The whole run is expected to emit a UserWarning stating that specifying
    the format key directly is deprecated.

    Args:
        file_type: Key placed in the output config to select the output format.
        read: Callable that loads the written data from a glob path.
        kwargs: Extra options merged into the output config next to ``path``.
    """
    df = create_dask_df()

    @solid(output_defs=[OutputDefinition(dagster_type=DataFrame, name="output_df")])
    def return_df(_):
        return df

    # https://github.com/dagster-io/dagster/issues/2872
    deprecation_message = "Specifying {key}: is deprecated. Use to:{key}: instead.".format(
        key=file_type
    )
    warning_pattern = re.escape(deprecation_message)

    with pytest.warns(UserWarning, match=warning_pattern):
        with get_temp_dir() as temp_path:
            # Remove the directory first so the output is written to a path
            # that does not exist yet.
            shutil.rmtree(temp_path)

            # "path" goes in first; entries from kwargs are applied on top.
            write_options = {"path": temp_path}
            write_options.update(kwargs)

            output_entry = {"output_df": {file_type: write_options}}
            result = execute_solid(
                return_df,
                run_config={"solids": {"return_df": {"outputs": [output_entry]}}},
            )
            assert result.success

            written_glob = f"{temp_path}/*"
            actual = read(written_glob)
            assert assert_eq(actual, df)
Exemple #13
0
def test_empty_file_cache():
    """A newly created FSFileCache reports no object for an arbitrary key."""
    with get_temp_dir() as temp_dir:
        has_object = FSFileCache(temp_dir).has_file_object('kjdfkd')
        assert not has_object
Exemple #14
0
def test_cache_file_from_s3_overwrite():
    """Run ``cache_file_from_s3`` twice on one cache folder with ``overwrite`` set.

    The first run must succeed and request exactly one download. The second
    run must succeed and record no ``download_file`` call on its own mock.
    """

    def build_environment(target_folder):
        # Both runs use the same configuration; build a fresh dict for each.
        return {
            'solids': {
                'cache_file_from_s3': {
                    'inputs': {
                        'bucket_data': {'bucket': 'some-bucket', 'key': 'some-key'},
                    },
                },
            },
            'resources': {
                'file_cache': {
                    'config': {'target_folder': target_folder, 'overwrite': True},
                },
            },
        }

    with get_temp_dir() as temp_dir:
        first_session = mock.MagicMock()
        first_result = execute_solid_with_resources(
            cache_file_from_s3,
            resources={
                'file_cache': fs_file_cache,
                's3': ResourceDefinition.hardcoded_resource(S3Resource(first_session)),
            },
            environment_dict=build_environment(temp_dir),
        )

        assert first_result.success
        # assert the download occurred
        assert first_session.download_file.call_count == 1

        second_session = mock.MagicMock()
        # NOTE(review): unlike the first run, the mock is passed as the `s3`
        # resource directly rather than wrapped in S3Resource. If the solid
        # reaches download_file through the wrapper's session, the zero-call
        # assertion below can never fail. That expectation also sits oddly
        # next to 'overwrite': True -- confirm both against cache_file_from_s3
        # and fs_file_cache before relying on this test.
        second_result = execute_solid_with_resources(
            cache_file_from_s3,
            resources={
                'file_cache': fs_file_cache,
                's3': ResourceDefinition.hardcoded_resource(second_session),
            },
            environment_dict=build_environment(temp_dir),
        )

        assert second_result.success
        # assert the download did not occur because file is already there
        assert second_session.download_file.call_count == 0