# Standard-library and third-party imports used by these tests. The dagster
# helpers referenced below (execute_pipeline, config, DependencyDefinition,
# PipelineDefinition, DagsterEventType, EventMetadataEntry,
# TextMetadataEntryData, get_temp_file_names, script_relative_path,
# dagster_pd, the pipeline factory functions, and the execution-plan /
# marshalling APIs) are assumed to be imported from the surrounding test
# package.
import json
import os
from contextlib import contextmanager

import pandas as pd
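# The json-materialization tests below all share the same config shape: each
# entry under 'outputs' maps an output name to a materialization target, e.g.
# {'string': {'json': {'path': filename_one}}} writes the solid's 'string'
# output to filename_one as JSON.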
def test_basic_int_json_multiple_materializations():
    pipeline = single_int_output_pipeline()

    with get_temp_file_names(2) as file_tuple:
        filename_one, filename_two = file_tuple  # pylint: disable=E0632
        result = execute_pipeline(
            pipeline,
            {
                'solids': {
                    'return_one': {
                        'outputs': [
                            {'result': {'json': {'path': filename_one}}},
                            {'result': {'json': {'path': filename_two}}},
                        ]
                    }
                }
            },
        )

        assert result.success

        with open(filename_one, 'r') as ff:
            value = json.loads(ff.read())
            assert value == {'value': 1}

        with open(filename_two, 'r') as ff:
            value = json.loads(ff.read())
            assert value == {'value': 1}
def test_basic_int_and_string_json_materialization():
    pipeline = multiple_output_pipeline()

    with get_temp_file_names(2) as file_tuple:
        filename_one, filename_two = file_tuple  # pylint: disable=E0632
        result = execute_pipeline(
            pipeline,
            {
                'solids': {
                    'return_one_and_foo': {
                        'outputs': [
                            {'string': {'json': {'path': filename_one}}},
                            {'number': {'json': {'path': filename_two}}},
                        ]
                    }
                }
            },
        )

        assert result.success

        with open(filename_one, 'r') as ff_1:
            value = json.loads(ff_1.read())
            assert value == {'value': 'foo'}

        with open(filename_two, 'r') as ff_2:
            value = json.loads(ff_2.read())
            assert value == {'value': 1}
def test_pandas_multiple_outputs():
    with get_temp_file_names(2) as temp_tuple:
        # false positive on pylint error
        csv_file, parquet_file = temp_tuple  # pylint: disable=E0632
        write_sum_mult_csv = dagster_pd.to_csv_solid('write_sum_mult_csv')
        write_sum_mult_parquet = dagster_pd.to_parquet_solid('write_sum_mult_parquet')

        pipeline = create_diamond_pipeline(
            extra_solids=[write_sum_mult_csv, write_sum_mult_parquet],
            extra_dependencies={
                write_sum_mult_csv.name: {
                    'df': DependencyDefinition('sum_mult_table'),
                },
                write_sum_mult_parquet.name: {
                    'df': DependencyDefinition('sum_mult_table'),
                },
            },
        )

        environment = get_num_csv_environment({
            'load_csv': config.Solid({'path': script_relative_path('num.csv')}),
            write_sum_mult_csv.name: config.Solid({'path': csv_file}),
            write_sum_mult_parquet.name: config.Solid({'path': parquet_file}),
        })

        execute_pipeline(pipeline, environment)

        assert os.path.exists(csv_file)
        output_csv_df = pd.read_csv(csv_file)
        assert output_csv_df.to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }

        assert os.path.exists(parquet_file)
        output_parquet_df = pd.read_parquet(parquet_file)
        assert output_parquet_df.to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }
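# The num.csv fixture referenced via script_relative_path above is assumed
# (inferred from the expected sums and products in the assertions) to contain:
#
#     num1,num2
#     1,2
#     3,4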
def test_complex_type_materialization():
    pipeline = multiple_output_pipeline()

    with get_temp_file_names(2) as file_tuple:
        filename_one, filename_two = file_tuple  # pylint: disable=E0632
        result = execute_pipeline(
            pipeline,
            {
                'solids': {
                    'return_one_and_foo': {
                        'outputs': [
                            {'string': {'json': {'path': filename_one}}},
                            {'number': {'json': {'path': filename_two}}},
                        ]
                    }
                }
            },
        )

        assert result.success

        for event in result.event_list:
            if event.event_type == DagsterEventType.STEP_MATERIALIZATION:
                materialization = event.event_specific_data.materialization
                assert len(materialization.metadata_entries) == 3
                # The 'string' output has system type String; the 'number'
                # output has system type Int.
                assert materialization.metadata_entries[1] in (
                    EventMetadataEntry(
                        label='system-type-name',
                        description=None,
                        entry_data=TextMetadataEntryData(text='String'),
                    ),
                    EventMetadataEntry(
                        label='system-type-name',
                        description=None,
                        entry_data=TextMetadataEntryData(text='Int'),
                    ),
                )
                assert materialization.metadata_entries[2] == EventMetadataEntry(
                    label='system-type-description',
                    description=None,
                    entry_data=TextMetadataEntryData(text='Any'),
                )
def test_pandas_output_intermediate_parquet_files():
    with get_temp_file_names(2) as temp_tuple:
        # false positive on pylint error
        sum_file, mult_file = temp_tuple  # pylint: disable=E0632
        write_sum_table = dagster_pd.to_parquet_solid('write_sum_table')
        write_mult_table = dagster_pd.to_parquet_solid('write_mult_table')

        pipeline = create_diamond_pipeline(
            extra_solids=[write_sum_table, write_mult_table],
            extra_dependencies={
                write_sum_table.name: {
                    'df': DependencyDefinition('sum_table'),
                },
                write_mult_table.name: {
                    'df': DependencyDefinition('mult_table'),
                },
            },
        )

        environment = get_num_csv_environment({
            'load_csv': config.Solid({'path': script_relative_path('num.csv')}),
            write_sum_table.name: config.Solid({'path': sum_file}),
            write_mult_table.name: config.Solid({'path': mult_file}),
        })

        pipeline_result = execute_pipeline(pipeline, environment)

        assert pipeline_result.success

        expected_sum = {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
        }
        assert pd.read_parquet(sum_file).to_dict('list') == expected_sum
def test_basic_pipeline_external_plan_execution():
    pipeline = define_inty_pipeline()

    with get_temp_file_names(2) as temp_files:
        temp_path, write_path = temp_files  # pylint: disable=W0632

        int_type = resolve_to_runtime_type(Int)

        serialize_to_file(int_type.serialization_strategy, 5, temp_path)

        execution_plan = create_execution_plan(pipeline)

        results = execute_externalized_plan(
            pipeline,
            execution_plan,
            ['add_one.transform'],
            inputs_to_marshal={'add_one.transform': {'num': temp_path}},
            outputs_to_marshal={
                'add_one.transform': [{'output': 'result', 'path': write_path}]
            },
            execution_metadata=ExecutionMetadata(),
        )

        assert deserialize_from_file(int_type.serialization_strategy, write_path) == 6

        assert len(results) == 2

        thunk_step_result = results[0]
        assert thunk_step_result.kind == StepKind.VALUE_THUNK

        transform_step_result = results[1]
        assert transform_step_result.kind == StepKind.TRANSFORM
        assert transform_step_result.success
        assert transform_step_result.success_data.output_name == 'result'
        assert transform_step_result.success_data.value == 6
def test_basic_int_and_string_json_multiple_materialization():
    pipeline = multiple_output_pipeline()

    with get_temp_file_names(4) as file_tuple:
        # False positive for unbalanced tuple unpacking
        # pylint: disable=E0632
        filename_one, filename_two, filename_three, filename_four = file_tuple
        result = execute_pipeline(
            pipeline,
            {
                'solids': {
                    'return_one_and_foo': {
                        'outputs': [
                            {'string': {'json': {'path': filename_one}}},
                            {'string': {'json': {'path': filename_two}}},
                            {'number': {'json': {'path': filename_three}}},
                            {'number': {'json': {'path': filename_four}}},
                        ]
                    }
                }
            },
        )

        assert result.success

        with open(filename_one, 'r') as ff:
            value = json.loads(ff.read())
            assert value == {'value': 'foo'}

        with open(filename_two, 'r') as ff:
            value = json.loads(ff.read())
            assert value == {'value': 'foo'}

        with open(filename_three, 'r') as ff:
            value = json.loads(ff.read())
            assert value == {'value': 1}

        with open(filename_four, 'r') as ff:
            value = json.loads(ff.read())
            assert value == {'value': 1}
# Variant of the external-plan test above that exercises the
# execute_marshalling API; named distinctly so both tests run.
def test_basic_pipeline_external_plan_execution_via_marshalling():
    pipeline = define_inty_pipeline()

    with get_temp_file_names(2) as temp_files:
        temp_path, write_path = temp_files  # pylint: disable=W0632

        int_type = resolve_to_runtime_type(Int)

        serialize_to_file(int_type.serialization_strategy, 5, temp_path)

        step_events = execute_marshalling(
            pipeline,
            ['add_one.transform'],
            inputs_to_marshal={'add_one.transform': {'num': temp_path}},
            outputs_to_marshal={
                'add_one.transform': [MarshalledOutput('result', write_path)]
            },
        )

        assert deserialize_from_file(int_type.serialization_strategy, write_path) == 6

        assert len(step_events) == 2

        thunk_step_output_event = step_events[0]
        assert thunk_step_output_event.kind == StepKind.UNMARSHAL_INPUT

        transform_step_output_event = step_events[1]
        assert transform_step_output_event.kind == StepKind.TRANSFORM
        assert transform_step_output_event.is_successful_output
        assert transform_step_output_event.success_data.output_name == 'result'
        assert transform_step_output_event.success_data.value == 6
@contextmanager
def get_temp_file_locations(num):
    # Like get_temp_file_names, but yields paths that do not yet exist on
    # disk: the freshly created temp files are unlinked before being handed
    # to the caller.
    with get_temp_file_names(num) as paths:
        for path in paths:
            os.unlink(path)
        yield paths
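# A minimal sanity check for the helper above -- not part of the original
# suite, just a sketch of the intended contract (assumes the @contextmanager
# decorator added above):
def test_get_temp_file_locations_yields_fresh_paths():
    with get_temp_file_locations(2) as (path_one, path_two):
        # The helper unlinks the temp files before yielding, so the paths
        # are free for the code under test to create.
        assert not os.path.exists(path_one)
        assert not os.path.exists(path_two)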
def test_pandas_output_intermediate_csv_files():
    with get_temp_file_names(2) as temp_tuple:
        sum_file, mult_file = temp_tuple  # pylint: disable=E0632
        write_sum_table = dagster_pd.to_csv_solid('write_sum_table')
        write_mult_table = dagster_pd.to_csv_solid('write_mult_table')

        pipeline = create_diamond_pipeline(
            extra_solids=[write_sum_table, write_mult_table],
            extra_dependencies={
                write_sum_table.name: {
                    'df': DependencyDefinition('sum_table'),
                },
                write_mult_table.name: {
                    'df': DependencyDefinition('mult_table'),
                },
            },
        )

        environment = get_num_csv_environment({
            'load_csv': config.Solid({'path': script_relative_path('num.csv')}),
            write_sum_table.name: config.Solid({'path': sum_file}),
            write_mult_table.name: config.Solid({'path': mult_file}),
        })

        subgraph_one_result = execute_pipeline(pipeline, environment=environment)

        assert len(subgraph_one_result.result_list) == 5

        expected_sum = {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
        }
        assert pd.read_csv(sum_file).to_dict('list') == expected_sum
        sum_table_result = subgraph_one_result.result_for_solid('sum_table')
        assert sum_table_result.transformed_value().to_dict('list') == expected_sum

        expected_mult = {
            'num1': [1, 3],
            'num2': [2, 4],
            'mult': [2, 12],
        }
        assert pd.read_csv(mult_file).to_dict('list') == expected_mult
        mult_table_result = subgraph_one_result.result_for_solid('mult_table')
        assert mult_table_result.transformed_value().to_dict('list') == expected_mult

        # Re-execute just the sum_mult_table subgraph, injecting solids that
        # load the intermediate CSVs written above.
        injected_solids = {
            'sum_mult_table': {
                'sum_table': dagster_pd.load_csv_solid('load_sum_table'),
                'mult_table': dagster_pd.load_csv_solid('load_mult_table'),
            }
        }

        pipeline_result = execute_pipeline(
            PipelineDefinition.create_sub_pipeline(
                pipeline,
                ['sum_mult_table'],
                ['sum_mult_table'],
                injected_solids,
            ),
            environment=config.Environment(
                solids={
                    'load_sum_table': config.Solid({'path': sum_file}),
                    'load_mult_table': config.Solid({'path': mult_file}),
                },
            ),
        )

        assert pipeline_result.success

        subgraph_two_result_list = pipeline_result.result_list
        assert len(subgraph_two_result_list) == 3

        output_df = pipeline_result.result_for_solid('sum_mult_table').transformed_value()
        assert output_df.to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }