Code example #1
def test_pandas_output_csv_pipeline():
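    """Attach a to_csv solid to sum_mult_table, run the diamond pipeline with
    execute_pipeline_iterator, and check the CSV written to the temp file.
    """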
    with get_temp_file_name() as temp_file_name:
        write_solid = dagster_pd.to_csv_solid('write_sum_mult_table')
        pipeline = create_diamond_pipeline(
            extra_solids=[write_solid],
            extra_dependencies={
                write_solid.name: {
                    'df': DependencyDefinition('sum_mult_table')
                }
            })
        environment = get_num_csv_environment({
            'load_csv':
            config.Solid({
                'path': script_relative_path('num.csv'),
            }),
            write_solid.name:
            config.Solid({'path': temp_file_name}),
        })

        for _result in execute_pipeline_iterator(pipeline=pipeline,
                                                 environment=environment):
            pass

        assert os.path.exists(temp_file_name)
        output_df = pd.read_csv(temp_file_name)
        assert output_df.to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }
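
The asserted output above implies a two-row num.csv fixture feeding a sum/mult/sum_mult chain. Below is a minimal pandas-only reconstruction of that assumed fixture; the fixture file itself is not part of this listing, so treat the contents as an inference from the asserts.

import pandas as pd
from io import StringIO

# Assumed num.csv contents, inferred from the asserts above:
# sum = num1 + num2 -> [3, 7], mult = num1 * num2 -> [2, 12],
# sum_mult = sum * mult -> [6, 84].
NUM_CSV = "num1,num2\n1,2\n3,4\n"

num_csv_df = pd.read_csv(StringIO(NUM_CSV))
assert num_csv_df.to_dict('list') == {'num1': [1, 3], 'num2': [2, 4]}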
Code example #2
def execute_transform_in_temp_csv_files(solid_inst):
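    """Wire solid_inst between a CSV loader and a CSV writer, run it against
    num.csv, and return the DataFrame read back from the temp output file.
    """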
    load_csv_solid = dagster_pd.load_csv_solid('load_csv')
    to_csv_solid = dagster_pd.to_csv_solid('to_csv')

    key = solid_inst.input_defs[0].name

    pipeline = PipelineDefinition(
        solids=[load_csv_solid, solid_inst, to_csv_solid],
        dependencies={
            solid_inst.name: {
                key: DependencyDefinition('load_csv'),
            },
            'to_csv': {
                'df': DependencyDefinition(solid_inst.name),
            }
        })
    with get_temp_file_name() as temp_file_name:
        result = execute_pipeline(
            pipeline,
            get_num_csv_environment({
                load_csv_solid.name:
                config.Solid({'path': script_relative_path('num.csv')}),
                to_csv_solid.name:
                config.Solid({'path': temp_file_name}),
            }),
        )

        assert result.success

        output_df = pd.read_csv(temp_file_name)

    return output_df
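
The get_temp_file_name / get_temp_file_names context managers used throughout these examples come from the surrounding test utilities and are not shown in this listing. The following is a rough, hypothetical sketch of a compatible implementation; the real one may differ.

import os
import tempfile
from contextlib import contextmanager

@contextmanager
def get_temp_file_names(count):
    # Hypothetical stand-in: yield `count` temp file paths that do not exist
    # yet (so os.path.exists asserts are meaningful) and clean them up after.
    paths = []
    try:
        for _ in range(count):
            fd, path = tempfile.mkstemp()
            os.close(fd)
            os.unlink(path)
            paths.append(path)
        yield tuple(paths)
    finally:
        for path in paths:
            if os.path.exists(path):
                os.unlink(path)

@contextmanager
def get_temp_file_name():
    # Single-file variant used by the examples above.
    with get_temp_file_names(1) as (path,):
        yield path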
Code example #3
def test_pandas_multiple_outputs():
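    """Fan sum_mult_table out to both a CSV writer and a Parquet writer and
    verify the contents of both output files.
    """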
    with get_temp_file_names(2) as temp_tuple:
        # false positive on pylint error
        csv_file, parquet_file = temp_tuple  # pylint: disable=E0632

        write_sum_mult_csv = dagster_pd.to_csv_solid('write_sum_mult_csv')
        write_sum_mult_parquet = dagster_pd.to_parquet_solid(
            'write_sum_mult_parquet')

        pipeline = create_diamond_pipeline(
            extra_solids=[write_sum_mult_csv, write_sum_mult_parquet],
            extra_dependencies={
                write_sum_mult_csv.name: {
                    'df': DependencyDefinition('sum_mult_table'),
                },
                write_sum_mult_parquet.name: {
                    'df': DependencyDefinition('sum_mult_table'),
                }
            })

        environment = get_num_csv_environment({
            'load_csv':
            config.Solid({
                'path': script_relative_path('num.csv'),
            }),
            write_sum_mult_csv.name:
            config.Solid({
                'path': csv_file,
            }),
            write_sum_mult_parquet.name:
            config.Solid({
                'path': parquet_file,
            }),
        })

        execute_pipeline(pipeline, environment)

        assert os.path.exists(csv_file)
        output_csv_df = pd.read_csv(csv_file)
        assert output_csv_df.to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }

        assert os.path.exists(parquet_file)
        output_parquet_df = pd.read_parquet(parquet_file)
        assert output_parquet_df.to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }
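
Code example #3 checks both a CSV and a Parquet output. The sketch below covers just the file I/O being asserted; pd.read_parquet needs a Parquet engine (pyarrow or fastparquet) installed, and the DataFrame and paths are placeholders rather than the real sum_mult_table.

import os
import tempfile

import pandas as pd

df = pd.DataFrame({'num1': [1, 3], 'num2': [2, 4]})

tmp_dir = tempfile.mkdtemp()
csv_path = os.path.join(tmp_dir, 'out.csv')
parquet_path = os.path.join(tmp_dir, 'out.parquet')

# index=False matches the column set asserted above (no index column appears).
df.to_csv(csv_path, index=False)
df.to_parquet(parquet_path)

assert pd.read_csv(csv_path).to_dict('list') == {'num1': [1, 3], 'num2': [2, 4]}
assert pd.read_parquet(parquet_path).to_dict('list') == {'num1': [1, 3], 'num2': [2, 4]}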
Code example #4
def run_hello_world(hello_world):
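    """Run the given single-input hello_world solid downstream of a CSV loader,
    check its transformed value, then rebuild the pipeline with a to_csv solid
    appended and check the file it writes.
    """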
    assert len(hello_world.input_defs) == 1

    pipeline = PipelineDefinition(
        solids=[
            dagster_pd.load_csv_solid('load_csv'),
            hello_world,
        ],
        dependencies={
            'hello_world': {
                'num_csv': DependencyDefinition('load_csv'),
            },
        },
    )

    pipeline_result = execute_pipeline(
        pipeline,
        environment=create_num_csv_environment(),
    )

    result = pipeline_result.result_for_solid('hello_world')

    assert result.success

    assert result.transformed_value().to_dict('list') == {
        'num1': [1, 3],
        'num2': [2, 4],
        'sum': [3, 7],
    }

    pipeline_two = PipelineDefinition(
        solids=[
            dagster_pd.load_csv_solid('load_csv'),
            hello_world,
            dagster_pd.to_csv_solid('to_csv'),
        ],
        dependencies={
            'hello_world': {
                'num_csv': DependencyDefinition('load_csv'),
            },
            'to_csv': {
                'df': DependencyDefinition('hello_world'),
            }
        })

    with get_temp_file_name() as temp_file_name:
        environment = config.Environment(solids={
            'load_csv':
            config.Solid({
                'path': script_relative_path('num.csv'),
            }),
            'to_csv':
            config.Solid({
                'path': temp_file_name,
            })
        })
        pipeline_result = execute_pipeline(
            pipeline_two,
            environment,
        )

        output_result = pipeline_result.result_for_solid('hello_world')

        assert output_result.success

        assert pd.read_csv(temp_file_name).to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
        }
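
run_hello_world only asserts on the output, so any single-input solid whose transform adds a sum column satisfies it. Here is a pure-pandas stand-in for that expected computation; it is not the actual solid definition, which is supplied by the caller.

import pandas as pd

def hello_world_transform(num_csv):
    # Expected math: add a 'sum' column to the frame loaded from num.csv.
    return num_csv.assign(sum=num_csv['num1'] + num_csv['num2'])

frame = pd.DataFrame({'num1': [1, 3], 'num2': [2, 4]})
assert hello_world_transform(frame).to_dict('list') == {
    'num1': [1, 3],
    'num2': [2, 4],
    'sum': [3, 7],
}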
Code example #5
def test_pandas_output_intermediate_csv_files():
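    """Persist the sum_table and mult_table intermediates to CSV, then re-execute
    only sum_mult_table as a sub-pipeline with load solids injected for those files.
    """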

    with get_temp_file_names(2) as temp_tuple:
        sum_file, mult_file = temp_tuple  # pylint: disable=E0632

        write_sum_table = dagster_pd.to_csv_solid('write_sum_table')
        write_mult_table = dagster_pd.to_csv_solid('write_mult_table')

        pipeline = create_diamond_pipeline(
            extra_solids=[write_sum_table, write_mult_table],
            extra_dependencies={
                write_sum_table.name: {
                    'df': DependencyDefinition('sum_table'),
                },
                write_mult_table.name: {
                    'df': DependencyDefinition('mult_table'),
                }
            })

        environment = get_num_csv_environment({
            'load_csv':
            config.Solid({
                'path': script_relative_path('num.csv'),
            }),
            write_sum_table.name:
            config.Solid({'path': sum_file}),
            write_mult_table.name:
            config.Solid({'path': mult_file}),
        })

        subgraph_one_result = execute_pipeline(pipeline,
                                               environment=environment)

        assert len(subgraph_one_result.result_list) == 5

        expected_sum = {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
        }

        assert pd.read_csv(sum_file).to_dict('list') == expected_sum
        sum_table_result = subgraph_one_result.result_for_solid('sum_table')
        assert sum_table_result.transformed_value().to_dict(
            'list') == expected_sum

        expected_mult = {
            'num1': [1, 3],
            'num2': [2, 4],
            'mult': [2, 12],
        }
        assert pd.read_csv(mult_file).to_dict('list') == expected_mult
        mult_table_result = subgraph_one_result.result_for_solid('mult_table')
        assert mult_table_result.transformed_value().to_dict(
            'list') == expected_mult

        injected_solids = {
            'sum_mult_table': {
                'sum_table': dagster_pd.load_csv_solid('load_sum_table'),
                'mult_table': dagster_pd.load_csv_solid('load_mult_table'),
            }
        }

        pipeline_result = execute_pipeline(
            PipelineDefinition.create_sub_pipeline(
                pipeline,
                ['sum_mult_table'],
                ['sum_mult_table'],
                injected_solids,
            ),
            environment=config.Environment(solids={
                'load_sum_table':
                config.Solid({'path': sum_file}),
                'load_mult_table':
                config.Solid({'path': mult_file}),
            }),
        )

        assert pipeline_result.success

        subgraph_two_result_list = pipeline_result.result_list

        assert len(subgraph_two_result_list) == 3
        output_df = pipeline_result.result_for_solid(
            'sum_mult_table').transformed_value()
        assert output_df.to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }
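
The second half of code example #5 re-executes only sum_mult_table from the persisted intermediates, which works because those two CSVs carry everything the final solid needs. A pandas-only check of that claim, with the file contents written inline instead of read back from the temp files:

import pandas as pd

# Contents written by write_sum_table and write_mult_table, per the asserts above.
sum_df = pd.DataFrame({'num1': [1, 3], 'num2': [2, 4], 'sum': [3, 7]})
mult_df = pd.DataFrame({'num1': [1, 3], 'num2': [2, 4], 'mult': [2, 12]})

# Recompute what sum_mult_table is expected to produce from just those inputs.
combined = sum_df.assign(mult=mult_df['mult'])
combined = combined.assign(sum_mult=combined['sum'] * combined['mult'])

assert combined.to_dict('list') == {
    'num1': [1, 3],
    'num2': [2, 4],
    'sum': [3, 7],
    'mult': [2, 12],
    'sum_mult': [6, 84],
}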