def test_cache_file_from_s3_step_three_fake(snapshot):
    # https://github.com/spulec/moto/issues/3292
    s3 = boto3.client("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="some-bucket")
    s3.put_object(Bucket="some-bucket", Key="some-key", Body=b"foo")

    with get_temp_dir() as temp_dir:
        execute_solid(
            cache_file_from_s3,
            unittest_for_local_mode_def(temp_dir, s3),
            input_values={"s3_coord": {"bucket": "some-bucket", "key": "some-key"}},
        )

        target_file = os.path.join(temp_dir, "some-key")
        assert os.path.exists(target_file)

        with open(target_file, "rb") as ff:
            assert ff.read() == b"foo"

        snapshot.assert_match(
            {
                "some-bucket": {
                    k: s3.get_object(Bucket="some-bucket", Key=k)["Body"].read()
                    for k in [
                        obj["Key"] for obj in s3.list_objects(Bucket="some-bucket")["Contents"]
                    ]
                }
            }
        )
def get_external_repository_from_image(image):
    check.str_param(image, 'image')
    with get_temp_dir(in_directory=get_system_temp_directory()) as tmp_dir:
        output_file_name = "{}.json".format(uuid4())
        command = 'dagster api snapshot repository {output_file}'.format(
            output_file=os.path.join(DEFAULT_INTERNAL_VOLUME, output_file_name)
        )
        output = run_serialized_container_command(
            image=image,
            command=command,
            volumes={tmp_dir: {'bind': DEFAULT_INTERNAL_VOLUME, 'mode': DEFAULT_MODE}},
        )

        if len(output) != 1:
            print(output)
            raise DagsterInvariantViolationError(
                "Running command {command} in container {image} resulted in output of length "
                "{actual} lines, expected {expected} lines".format(
                    command=command, image=image, actual=len(output), expected=1
                )
            )

        serialized_external_repo_data = output[0]
        external_repo_data = deserialize_json_to_dagster_namedtuple(serialized_external_repo_data)

        if not isinstance(external_repo_data, ExternalRepositoryData):
            raise DagsterInvariantViolationError(
                "Deserialized snapshot is of type {received}; it must be an "
                "ExternalRepositoryData".format(received=type(external_repo_data))
            )
        return ExternalRepository(external_repo_data)
def test_execute_byfeature_parquet_lakehouse():
    with get_temp_dir() as temp_dir:
        lakehouse = ByFeatureParquetLakehouse(temp_dir)
        pipeline_def = construct_lakehouse_pipeline(
            name='test',
            lakehouse_tables=[TableOne, TableTwo, TableThree],
            mode_defs=[
                ModeDefinition(
                    resource_defs={
                        'spark': spark_session_resource,
                        'lakehouse': ResourceDefinition.hardcoded_resource(lakehouse),
                    }
                )
            ],
        )
        pipeline_result = execute_pipeline(pipeline_def)
        assert pipeline_result.success

        def get_table(table_def):
            spark = spark_session_from_config()
            return spark.read.parquet(
                os.path.join(temp_dir, table_def.metadata[FEATURE_AREA], table_def.name)
            ).collect()

        assert get_table(TableOne) == [Row(num=1)]
        assert get_table(TableTwo) == [Row(num=2)]
        assert set(get_table(TableThree)) == set([Row(num=1), Row(num=2)])
def test_execute_file_system_lakehouse(
    local_on_disk_spark_lakehouse, execute_spark_lakehouse_build
):
    with get_temp_dir() as temp_dir:
        pipeline_result = execute_spark_lakehouse_build(
            tables=[TableOne, TableTwo, TableThree],
            lakehouse=local_on_disk_spark_lakehouse(temp_dir),
            environment_dict={
                'solids': {'TableOne': {'inputs': {'num': {'value': 1}}}}
            },
        )
        assert pipeline_result.success

        def get_table(name):
            spark = spark_session_from_config()
            return spark.read.csv(
                os.path.join(temp_dir, name), header=True, inferSchema=True
            ).collect()

        assert get_table('TableOne') == [Row(num=1)]
        assert get_table('TableTwo') == [Row(num=2)]
        assert set(get_table('TableThree')) == set([Row(num=1), Row(num=2)])
def test_cache_file_from_s3_specify_target_key():
    s3_session = mock.MagicMock()
    with get_temp_dir() as temp_dir:
        solid_result = execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    'file_cache': fs_file_cache,
                    's3': ResourceDefinition.hardcoded_resource(S3Resource(s3_session)),
                }
            ),
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'s3_coordinate': {'bucket': 'some-bucket', 'key': 'some-key'}},
                        'config': {'file_key': 'specified-file-key'},
                    }
                },
                'resources': {'file_cache': {'config': {'target_folder': temp_dir}}},
            },
        )

        # assert the download occurred
        assert s3_session.download_file.call_count == 1
        assert solid_result.success
        assert isinstance(solid_result.output_value(), LocalFileHandle)
        assert 'specified-file-key' in solid_result.output_value().path_desc
def test_cache_file_from_s3_step_two_use_config():
    boto_s3 = mock.MagicMock()
    with get_temp_dir() as temp_dir, mock.patch(
        'boto3.client', new=lambda *_args, **_kwargs: boto_s3
    ):
        execute_solid(
            cache_file_from_s3,
            ModeDefinition(resource_defs={'file_cache': fs_file_cache}),
            environment_dict={
                'resources': {'file_cache': {'config': {'target_folder': temp_dir}}},
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'s3_coord': {'bucket': 'some-bucket', 'key': 'some-key'}}
                    }
                },
            },
        )

        assert boto_s3.download_file.call_count == 1
        assert os.path.exists(os.path.join(temp_dir, 'some-key'))
def test_missing_resources():
    with pytest.raises(DagsterInvalidDefinitionError):
        with get_temp_dir() as temp_dir:
            execute_solid(
                cache_file_from_s3,
                ModeDefinition(resource_defs={"file_cache": fs_file_cache}),
                run_config={
                    "solids": {
                        "cache_file_from_s3": {
                            "inputs": {
                                "s3_coordinate": {"bucket": "some-bucket", "key": "some-key"}
                            }
                        }
                    },
                    "resources": {"file_cache": {"config": {"target_folder": temp_dir}}},
                },
            )
def test_missing_resources():
    with pytest.raises(DagsterInvalidDefinitionError):
        with get_temp_dir() as temp_dir:
            execute_solid(
                cache_file_from_s3,
                ModeDefinition(resource_defs={'file_cache': fs_file_cache}),
                environment_dict={
                    'solids': {
                        'cache_file_from_s3': {
                            'inputs': {
                                's3_coordinate': {'bucket': 'some-bucket', 'key': 'some-key'}
                            }
                        }
                    },
                    'resources': {'file_cache': {'config': {'target_folder': temp_dir}}},
                },
            )
def get_container_snapshot(image):
    check.str_param(image, 'image')
    # Done to avoid memory leaks
    with get_temp_dir(in_directory=get_system_temp_directory()) as tmp_dir:
        # TODO: Add better error handling when we move towards integrating with dagit.
        output_file_name = "{}.json".format(uuid4())
        run_serialized_container_command(
            image=image,
            command='dagster repository snapshot {output_file}'.format(
                output_file=os.path.join(DEFAULT_INTERNAL_VOLUME, output_file_name)
            ),
            volumes={tmp_dir: {'bind': DEFAULT_INTERNAL_VOLUME, 'mode': DEFAULT_MODE}},
        )

        with open(os.path.join(tmp_dir, output_file_name), 'r') as fp:
            snapshot = deserialize_json_to_dagster_namedtuple(fp.read())

        if not isinstance(snapshot, RepositorySnapshot):
            raise DagsterInvariantViolationError(
                "Deserialized snapshot is of type {received}; it must be a "
                "RepositorySnapshot".format(received=type(snapshot))
            )
        return snapshot
def _execute_pipeline_command(
    repository_file, pipeline_name, environment_dict, mode=None, solid_subset=None
):
    with get_temp_dir(in_directory=get_system_temp_directory()) as tmp_dir:
        output_file_name = "{}.json".format(uuid4())
        output_file = os.path.join(tmp_dir, output_file_name)

        command = (
            "dagster api execute_pipeline -y {repository_file} {pipeline_name} "
            "{output_file} --environment-dict='{environment_dict}' --mode={mode}".format(
                repository_file=repository_file,
                pipeline_name=pipeline_name,
                output_file=output_file,
                environment_dict=json.dumps(environment_dict),
                mode=mode,
            )
        )

        if solid_subset:
            command += " --solid_subset={solid_subset}".format(solid_subset=",".join(solid_subset))

        os.popen(command)

        for message in ipc_read_event_stream(output_file):
            yield message
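# A minimal usage sketch (an assumption, not from the source): _execute_pipeline_command is a
# generator, so a caller drives it by iterating over the yielded event messages. The repository
# file name, pipeline name, and empty environment dict below are hypothetical placeholders used
# only for illustration.
def _example_collect_pipeline_events():
    return [
        message
        for message in _execute_pipeline_command(
            repository_file="repository.yaml",  # hypothetical path
            pipeline_name="my_pipeline",  # hypothetical pipeline name
            environment_dict={},
            mode="default",
        )
    ]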
def test_cache_file_from_s3_step_three_fake(snapshot):
    s3 = boto3.client('s3')
    s3.create_bucket(Bucket='some-bucket')
    s3.put_object(Bucket='some-bucket', Key='some-key', Body=b'foo')

    with get_temp_dir() as temp_dir:
        execute_solid(
            cache_file_from_s3,
            unittest_for_local_mode_def(temp_dir, s3),
            input_values={'s3_coord': {'bucket': 'some-bucket', 'key': 'some-key'}},
        )

        target_file = os.path.join(temp_dir, 'some-key')
        assert os.path.exists(target_file)

        with open(target_file, 'rb') as ff:
            assert ff.read() == b'foo'

        snapshot.assert_match(
            {
                'some-bucket': {
                    k: s3.get_object(Bucket='some-bucket', Key=k)['Body'].read()
                    for k in [
                        obj['Key'] for obj in s3.list_objects(Bucket='some-bucket')['Contents']
                    ]
                }
            }
        )
def test_pyspark_assets_pipeline():
    with get_temp_dir() as temp_dir:
        run_config = {
            "solids": {
                "get_max_temp_per_station": {
                    "config": {"temperature_file": "temperature.csv", "version_salt": "foo"}
                },
                "get_consolidated_location": {
                    "config": {"station_file": "stations.csv", "version_salt": "foo"}
                },
                "combine_dfs": {"config": {"version_salt": "foo"}},
                "pretty_output": {"config": {"version_salt": "foo"}},
            },
            "resources": {
                "source_data_dir": {
                    "config": {
                        "dir": "python_modules/dagster-test/dagster_test/toys/pyspark_assets/asset_pipeline_files"
                    }
                },
                "savedir": {"config": {"dir": temp_dir}},
            },
        }

        result = execute_pipeline(pyspark_assets_pipeline, run_config=run_config)
        assert result.success
def get_active_repository_data_from_image(image):
    check.str_param(image, 'image')
    with get_temp_dir(in_directory=get_system_temp_directory()) as tmp_dir:
        output_file_name = "{}.json".format(uuid4())
        run_serialized_container_command(
            image=image,
            command='dagster repository snapshot {output_file}'.format(
                output_file=os.path.join(DEFAULT_INTERNAL_VOLUME, output_file_name)
            ),
            volumes={tmp_dir: {'bind': DEFAULT_INTERNAL_VOLUME, 'mode': DEFAULT_MODE}},
        )

        active_repo_data = _get_active_repo_data(os.path.join(tmp_dir, output_file_name))

        if not isinstance(active_repo_data, ActiveRepositoryData):
            raise DagsterInvariantViolationError(
                "Deserialized snapshot is of type {received}; it must be an "
                "ActiveRepositoryData".format(received=type(active_repo_data))
            )
        return active_repo_data
def test_fs_file_cache_write_binary_data():
    with get_temp_dir() as temp_dir:
        file_store = FSFileCache(temp_dir)
        assert not file_store.has_file_object("foo")
        assert file_store.write_binary_data("foo", b"bar")

        file_handle = file_store.get_file_handle("foo")
        assert isinstance(file_handle, LocalFileHandle)
        assert file_handle.path_desc == os.path.join(temp_dir, "foo")
def test_fs_file_cache_write_binary_data():
    with get_temp_dir() as temp_dir:
        file_store = FSFileCache(temp_dir)
        assert not file_store.has_file_object('foo')
        assert file_store.write_binary_data('foo', 'bar'.encode())

        file_handle = file_store.get_file_handle('foo')
        assert isinstance(file_handle, LocalFileHandle)
        assert file_handle.path_desc == os.path.join(temp_dir, 'foo')
def test_basic_file_manager_copy_handle_to_local_temp():
    foo_data = 'foo'.encode()
    with get_temp_dir() as temp_dir:
        with get_temp_file_handle_with_data(foo_data) as foo_handle:
            with local_file_manager(temp_dir) as manager:
                local_temp = manager.copy_handle_to_local_temp(foo_handle)
                assert local_temp != foo_handle.path
                with open(local_temp, 'rb') as ff:
                    assert ff.read() == foo_data
def test_fs_file_cache_write_data():
    bytes_object = io.BytesIO(b"bar")

    with get_temp_dir() as temp_dir:
        file_cache = FSFileCache(temp_dir)
        assert not file_cache.has_file_object("foo")
        assert file_cache.write_file_object("foo", bytes_object)

        file_handle = file_cache.get_file_handle("foo")
        assert isinstance(file_handle, LocalFileHandle)
        assert file_handle.path_desc == os.path.join(temp_dir, "foo")
def test_cache_file_from_s3_step_three_mock():
    s3_session = mock.MagicMock()
    with get_temp_dir() as temp_dir:
        execute_solid(
            cache_file_from_s3,
            unittest_for_local_mode_def(temp_dir, s3_session),
            input_values={'s3_coord': {'bucket': 'some-bucket', 'key': 'some-key'}},
        )

        assert s3_session.download_file.call_count == 1
        assert os.path.exists(os.path.join(temp_dir, 'some-key'))
def test_cache_file_from_s3_basic():
    s3_session = mock.MagicMock()
    with get_temp_dir() as temp_dir:
        solid_result = execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    'file_cache': fs_file_cache,
                    's3': ResourceDefinition.hardcoded_resource(S3Resource(s3_session)),
                }
            ),
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'s3_coordinate': {'bucket': 'some-bucket', 'key': 'some-key'}}
                    }
                },
                'resources': {'file_cache': {'config': {'target_folder': temp_dir}}},
            },
        )

        # assert the download occurred
        assert s3_session.download_file.call_count == 1

        assert solid_result.success

        expectation_results = solid_result.expectation_results_during_compute
        assert len(expectation_results) == 1
        expectation_result = expectation_results[0]
        assert expectation_result.success
        assert expectation_result.label == 'file_handle_exists'
        path_in_metadata = expectation_result.metadata_entries[0].entry_data.path
        assert isinstance(path_in_metadata, str)
        assert os.path.exists(path_in_metadata)

        assert isinstance(solid_result.output_value(), LocalFileHandle)
        assert 'some-key' in solid_result.output_value().path_desc
def test_cache_file_from_s3_step_one_two():
    boto_s3 = mock.MagicMock()
    # mock.patch is difficult to get right and requires monkeypatching of global artifacts
    with get_temp_dir() as temp_dir, mock.patch(
        file_cache_folder.__module__ + ".file_cache_folder", new=lambda: temp_dir
    ), mock.patch("boto3.client", new=lambda *_args, **_kwargs: boto_s3):
        execute_solid(
            cache_file_from_s3,
            input_values=dict(s3_coord={"bucket": "some-bucket", "key": "some-key"}),
        )

        assert boto_s3.download_file.call_count == 1
        assert os.path.exists(os.path.join(temp_dir, "some-key"))
def test_pyspark_assets_job(executor_def):
    with get_temp_dir() as temp_dir:
        run_config = {
            "solids": {
                "get_max_temp_per_station": {
                    "config": {"temperature_file": "temperature.csv", "version_salt": "foo"}
                },
                "get_consolidated_location": {
                    "config": {"station_file": "stations.csv", "version_salt": "foo"}
                },
                "combine_dfs": {"config": {"version_salt": "foo"}},
                "pretty_output": {"config": {"version_salt": "foo"}},
            },
            "resources": {
                "source_data_dir": {
                    "config": {
                        "dir": file_relative_path(
                            __file__,
                            "../dagster_test/graph_job_op_toys/pyspark_assets/asset_job_files",
                        ),
                    }
                },
                "savedir": {"config": {"dir": temp_dir}},
            },
        }

        result = pyspark_assets.to_job(
            config=run_config, resource_defs=dir_resources, executor_def=executor_def
        ).execute_in_process()
        assert result.success
def test_cache_file_from_s3_step_two_skip_config():
    boto_s3 = mock.MagicMock()
    with get_temp_dir() as temp_dir, mock.patch(
        "boto3.client", new=lambda *_args, **_kwargs: boto_s3
    ):
        execute_solid(
            cache_file_from_s3,
            ModeDefinition.from_resources({"file_cache": FSFileCache(temp_dir)}),
            input_values={"s3_coord": {"bucket": "some-bucket", "key": "some-key"}},
        )

        assert boto_s3.download_file.call_count == 1
        assert os.path.exists(os.path.join(temp_dir, "some-key"))
def test_cache_file_from_s3_step_two_skip_config():
    boto_s3 = mock.MagicMock()
    with get_temp_dir() as temp_dir, mock.patch(
        'boto3.client', new=lambda *_args, **_kwargs: boto_s3
    ):
        execute_solid(
            cache_file_from_s3,
            ModeDefinition.from_resources({'file_cache': FSFileCache(temp_dir)}),
            input_values={'s3_coord': {'bucket': 'some-bucket', 'key': 'some-key'}},
        )

        assert boto_s3.download_file.call_count == 1
        assert os.path.exists(os.path.join(temp_dir, 'some-key'))
def test_cache_file_from_s3_overwrite():
    with get_temp_dir() as temp_dir:
        s3_session_one = mock.MagicMock()
        execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    'file_cache': fs_file_cache,
                    's3': ResourceDefinition.hardcoded_resource(S3Resource(s3_session_one)),
                }
            ),
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'s3_coordinate': {'bucket': 'some-bucket', 'key': 'some-key'}}
                    }
                },
                'resources': {
                    'file_cache': {'config': {'target_folder': temp_dir, 'overwrite': True}}
                },
            },
        )

        # assert the download occurred
        assert s3_session_one.download_file.call_count == 1

        s3_session_two = mock.MagicMock()
        execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    'file_cache': fs_file_cache,
                    's3': ResourceDefinition.hardcoded_resource(S3Resource(s3_session_two)),
                }
            ),
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'s3_coordinate': {'bucket': 'some-bucket', 'key': 'some-key'}}
                    }
                },
                'resources': {
                    'file_cache': {'config': {'target_folder': temp_dir, 'overwrite': True}}
                },
            },
        )

        # assert the download did not occur because the file is already there
        assert s3_session_two.download_file.call_count == 0
def test_cache_file_from_s3_basic():
    s3_session = mock.MagicMock()
    with get_temp_dir() as temp_dir:
        solid_result = execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    "file_cache": fs_file_cache,
                    "s3": ResourceDefinition.hardcoded_resource(s3_session),
                }
            ),
            run_config={
                "solids": {
                    "cache_file_from_s3": {
                        "inputs": {"s3_coordinate": {"bucket": "some-bucket", "key": "some-key"}}
                    }
                },
                "resources": {"file_cache": {"config": {"target_folder": temp_dir}}},
            },
        )

        # assert the download occurred
        assert s3_session.download_file.call_count == 1

        assert solid_result.success

        expectation_results = solid_result.expectation_results_during_compute
        assert len(expectation_results) == 1
        expectation_result = expectation_results[0]
        assert expectation_result.success
        assert expectation_result.label == "file_handle_exists"
        path_in_metadata = expectation_result.metadata_entries[0].entry_data.path
        assert isinstance(path_in_metadata, str)
        assert os.path.exists(path_in_metadata)

        assert isinstance(solid_result.output_value(), LocalFileHandle)
        assert "some-key" in solid_result.output_value().path_desc
def test_cache_file_from_s3_step_three_fake(snapshot):
    s3_session = S3FakeSession({'some-bucket': {'some-key': b'foo'}})

    with get_temp_dir() as temp_dir:
        execute_solid(
            cache_file_from_s3,
            unittest_for_local_mode_def(temp_dir, s3_session),
            input_values={'s3_coord': {'bucket': 'some-bucket', 'key': 'some-key'}},
        )

        target_file = os.path.join(temp_dir, 'some-key')
        assert os.path.exists(target_file)

        with open(target_file, 'rb') as ff:
            assert ff.read() == b'foo'

        snapshot.assert_match(s3_session.buckets)
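# Hypothetical sketch (an assumption, not part of the source): unittest_for_local_mode_def is
# used by the step_three tests above but never shown. It presumably wires the temp directory and
# the supplied S3 client (a MagicMock, an S3FakeSession, or a real boto3 client) into the
# resources that cache_file_from_s3 needs, e.g. via ModeDefinition.from_resources as in the
# step_two_skip_config tests. The exact name, signature, and resource keys are assumptions.
def unittest_for_local_mode_def(temp_dir, s3_session):
    return ModeDefinition.from_resources(
        {'file_cache': FSFileCache(temp_dir), 's3': s3_session}
    )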
def test_cache_file_from_s3_step_two_use_config():
    boto_s3 = mock.MagicMock()
    with get_temp_dir() as temp_dir, mock.patch(
        "boto3.client", new=lambda *_args, **_kwargs: boto_s3
    ):
        execute_solid(
            cache_file_from_s3,
            ModeDefinition(resource_defs={"file_cache": fs_file_cache}),
            run_config={
                "resources": {"file_cache": {"config": {"target_folder": temp_dir}}},
                "solids": {
                    "cache_file_from_s3": {
                        "inputs": {"s3_coord": {"bucket": "some-bucket", "key": "some-key"}}
                    }
                },
            },
        )

        assert boto_s3.download_file.call_count == 1
        assert os.path.exists(os.path.join(temp_dir, "some-key"))
def test_cache_file_from_s3_step_one_one():
    boto_s3 = mock.MagicMock()
    # mock.patch is difficult to get right and requires monkeypatching of global artifacts
    with get_temp_dir() as temp_dir, mock.patch(
        file_cache_folder.__module__ + ".file_cache_folder", new=lambda: temp_dir
    ), mock.patch("boto3.client", new=lambda *_args, **_kwargs: boto_s3):

        @solid
        def emit_value():
            return {"bucket": "some-bucket", "key": "some-key"}

        @pipeline
        def pipe():
            return cache_file_from_s3(emit_value())

        execute_pipeline(pipe)

        assert boto_s3.download_file.call_count == 1
        assert os.path.exists(os.path.join(temp_dir, "some-key"))
def test_cache_file_from_s3_specify_target_key():
    s3_session = mock.MagicMock()
    with get_temp_dir() as temp_dir:
        solid_result = execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    "file_cache": fs_file_cache,
                    "s3": ResourceDefinition.hardcoded_resource(s3_session),
                }
            ),
            run_config={
                "solids": {
                    "cache_file_from_s3": {
                        "inputs": {"s3_coordinate": {"bucket": "some-bucket", "key": "some-key"}},
                        "config": {"file_key": "specified-file-key"},
                    }
                },
                "resources": {"file_cache": {"config": {"target_folder": temp_dir}}},
            },
        )

        # assert the download occurred
        assert s3_session.download_file.call_count == 1
        assert solid_result.success
        assert isinstance(solid_result.output_value(), LocalFileHandle)
        assert "specified-file-key" in solid_result.output_value().path_desc
def test_cache_file_from_s3_step_one_one():
    boto_s3 = mock.MagicMock()
    # mock.patch is difficult to get right and requires monkeypatching of global artifacts
    with get_temp_dir() as temp_dir, mock.patch(
        file_cache_folder.__module__ + '.file_cache_folder', new=lambda: temp_dir
    ), mock.patch('boto3.client', new=lambda *_args, **_kwargs: boto_s3):

        @solid
        def emit_value(_):
            return {'bucket': 'some-bucket', 'key': 'some-key'}

        @pipeline
        def pipe():  # pylint: disable=no-value-for-parameter
            return cache_file_from_s3(emit_value())

        execute_pipeline(pipe)

        assert boto_s3.download_file.call_count == 1
        assert os.path.exists(os.path.join(temp_dir, 'some-key'))