def LR_solid():
    return dagstermill.define_dagstermill_solid(
        'linear_regression',
        nb_test_path('tutorial_LR'),
        input_defs=[InputDefinition(name='df', dagster_type=DataFrame)],
    )
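# Hedged usage sketch (not in the original module): one way the notebook solid
# above could be wired into a pipeline. `load_df`, `data.csv`, and the pipeline
# name are hypothetical; only `LR_solid` comes from the code above, and this
# assumes `pandas as pd`, `solid`, `pipeline`, and `OutputDefinition` are imported.
@solid(output_defs=[OutputDefinition(DataFrame)])
def load_df(_context):
    return pd.read_csv('data.csv')  # hypothetical data source


@pipeline
def linear_regression_pipeline():
    # Positional invocation maps load_df's output to the solid's 'df' input.
    LR_solid()(load_df())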
import re
from operator import add

from dagster_pyspark import SparkRDD, pyspark_resource

from dagster import (
    InputDefinition,
    Int,
    ModeDefinition,
    OutputDefinition,
    Path,
    pipeline,
    solid,
)


def parseNeighbors(urls):
    """Parses a urls-pair string into a (url, url) tuple."""
    parts = re.split(r'\s+', urls)
    return parts[0], parts[1]


@solid(
    input_defs=[InputDefinition('pagerank_data', Path)],
    output_defs=[OutputDefinition(SparkRDD)],
)
def parse_pagerank_data_step_five(context, pagerank_data):
    lines = context.resources.spark.spark_session.read.text(pagerank_data).rdd.map(
        lambda r: r[0]
    )
    return lines.map(parseNeighbors)


@solid(input_defs=[InputDefinition('urls', SparkRDD)], output_defs=[OutputDefinition(SparkRDD)])
def compute_links_step_five(_context, urls):
    return urls.distinct().groupByKey().cache()


def computeContribs(urls, rank):
    """Calculates URL contributions to the rank of other URLs."""
    num_urls = len(urls)
    for url in urls:
        # Body completed from the canonical Spark PageRank example, which this
        # module follows: each neighbor gets an equal share of this page's rank.
        yield url, rank / num_urls
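# Hedged sketch of the PageRank iteration step that plausibly follows in this
# module, adapted from the canonical Spark example (which is also where
# `computeContribs` and the otherwise-unused `add` import come from). The solid
# name, fixed iteration count, and damping constants are assumptions.
@solid(input_defs=[InputDefinition('links', SparkRDD)], output_defs=[OutputDefinition(SparkRDD)])
def compute_ranks_step_five(_context, links):
    # Initialize every URL's rank to 1.0.
    ranks = links.map(lambda url_neighbors: (url_neighbors[0], 1.0))
    for _ in range(10):  # fixed iteration count, for illustration only
        # Distribute each URL's rank across its neighbors...
        contribs = links.join(ranks).flatMap(
            lambda url_urls_rank: computeContribs(url_urls_rank[1][0], url_urls_rank[1][1])
        )
        # ...then sum contributions per URL and apply the standard damping.
        ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15)
    return ranks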
def test_failure_propagation():
    '''
      B =========== C
     //             \\
    A                F (skipped)
     \\             //
      D (fails) == E (skipped)
    '''
    solid_a = create_root_success_solid('A')

    def fail_fn(_context, inputs):
        check.failed('user error')
        return inputs  # unreachable: check.failed always raises CheckError

    def success_fn(_context, inputs):
        return inputs

    solid_b = single_output_solid(
        name='B',
        input_defs=[InputDefinition(name='A')],
        compute_fn=success_fn,
        output_def=OutputDefinition(),
    )

    solid_c = single_output_solid(
        name='C',
        input_defs=[InputDefinition(name='B')],
        compute_fn=success_fn,
        output_def=OutputDefinition(),
    )

    solid_d = single_output_solid(
        name='D',
        input_defs=[InputDefinition(name='A')],
        compute_fn=fail_fn,
        output_def=OutputDefinition(),
    )

    solid_e = single_output_solid(
        name='E',
        input_defs=[InputDefinition(name='D')],
        compute_fn=success_fn,
        output_def=OutputDefinition(),
    )

    solid_f = single_output_solid(
        name='F',
        input_defs=[InputDefinition(name='C'), InputDefinition(name='E')],
        compute_fn=success_fn,
        output_def=OutputDefinition(),
    )

    pipeline_def = PipelineDefinition(
        solid_defs=[solid_a, solid_b, solid_c, solid_d, solid_e, solid_f],
        dependencies={
            'B': {'A': DependencyDefinition(solid_a.name)},
            'D': {'A': DependencyDefinition(solid_a.name)},
            'C': {'B': DependencyDefinition(solid_b.name)},
            'E': {'D': DependencyDefinition(solid_d.name)},
            'F': {
                'C': DependencyDefinition(solid_c.name),
                'E': DependencyDefinition(solid_e.name),
            },
        },
    )

    pipeline_result = execute_pipeline(
        pipeline_def, run_config=RunConfig.nonthrowing_in_process()
    )

    # A, B, and C run and succeed; D fails, and everything downstream of the
    # failure (E, then F, which also depends on E) is skipped rather than run.
    assert pipeline_result.result_for_solid('A').success
    assert pipeline_result.result_for_solid('B').success
    assert pipeline_result.result_for_solid('C').success
    assert not pipeline_result.result_for_solid('D').success
    assert pipeline_result.result_for_solid('D').failure_data.error.cls_name == 'CheckError'
    assert not pipeline_result.result_for_solid('E').success
    assert pipeline_result.result_for_solid('E').skipped
    assert not pipeline_result.result_for_solid('F').success
    assert pipeline_result.result_for_solid('F').skipped
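# Hedged sketch of the helper factories the test above depends on; they are
# defined elsewhere in the test suite, and these bodies are inferred from the
# call sites, not copied from the source.
def single_output_solid(name, input_defs, compute_fn, output_def):
    def _compute(context, inputs):
        # Wrap the plain compute_fn so its return value becomes the solid's
        # single output.
        yield Output(compute_fn(context, inputs))

    return SolidDefinition(
        name=name, input_defs=input_defs, compute_fn=_compute, output_defs=[output_def]
    )


def create_root_success_solid(name):
    def root_fn(_context, _inputs):
        # Any value works here; downstream solids just pass inputs through.
        return {name: 'compute_called'}

    return single_output_solid(
        name=name, input_defs=[], compute_fn=root_fn, output_def=OutputDefinition()
    )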
    return {'solids': solids_config}


def create_sum_table():
    def compute(_context, inputs):
        num_csv = inputs['num_csv']
        check.inst_param(num_csv, 'num_csv', pd.DataFrame)
        num_csv['sum'] = num_csv['num1'] + num_csv['num2']
        return num_csv

    return _dataframe_solid(
        name='sum_table',
        input_defs=[InputDefinition('num_csv', DataFrame)],
        compute_fn=compute,
    )


@lambda_solid(
    input_defs=[InputDefinition('num_csv', DataFrame)],
    output_def=OutputDefinition(DataFrame),
)
def sum_table(num_csv):
    check.inst_param(num_csv, 'num_csv', pd.DataFrame)
    num_csv['sum'] = num_csv['num1'] + num_csv['num2']
    return num_csv


@lambda_solid(
    input_defs=[InputDefinition('sum_df', DataFrame)],
    output_def=OutputDefinition(DataFrame),
)
def sum_sq_table(sum_df):
    sum_df['sum_squared'] = sum_df['sum'] * sum_df['sum']
    return sum_df


@lambda_solid(
Row(name="John", age=19), Row(name="Jennifer", age=29), Row(name="Henry", age=50) ] return context.resources.pyspark.spark_session.createDataFrame( rows, schema) @solid( name="blah", description="this is a test", config_schema={ "foo": str, "bar": int }, input_defs=[InputDefinition("people", DataFrame)], output_defs=[OutputDefinition(DataFrame)], required_resource_keys={"pyspark_step_launcher"}, ) def filter_df_solid(_, people): return people.filter(people["age"] < 30) MODE_DEFS = [ ModeDefinition( "prod", resource_defs={ "pyspark_step_launcher": emr_pyspark_step_launcher, "pyspark": pyspark_resource, "s3": s3_resource, },
def test_solid_definition_errors():
    # *args is not allowed in a solid compute function.
    with pytest.raises(DagsterInvalidDefinitionError, match="positional vararg"):

        @solid(input_defs=[InputDefinition(name="foo")], output_defs=[OutputDefinition()])
        def vargs(context, foo, *args):
            pass

    # Parameter names must match the declared input names.
    with pytest.raises(DagsterInvalidDefinitionError):

        @solid(input_defs=[InputDefinition(name="foo")], output_defs=[OutputDefinition()])
        def wrong_name(context, bar):
            pass

    with pytest.raises(DagsterInvalidDefinitionError):

        @solid(
            input_defs=[InputDefinition(name="foo"), InputDefinition(name="bar")],
            output_defs=[OutputDefinition()],
        )
        def wrong_name_2(context, foo):
            pass

    # The compute function must accept the context argument...
    with pytest.raises(DagsterInvalidDefinitionError):

        @solid(input_defs=[InputDefinition(name="foo")], output_defs=[OutputDefinition()])
        def no_context(foo):
            pass

    # ...and must not take parameters beyond the declared inputs.
    with pytest.raises(DagsterInvalidDefinitionError):

        @solid(input_defs=[InputDefinition(name="foo")], output_defs=[OutputDefinition()])
        def extras(_context, foo, bar):
            pass

    # **kwargs, exact parameter matches, and signature-based inference are all valid.
    @solid(
        input_defs=[InputDefinition(name="foo"), InputDefinition(name="bar")],
        output_defs=[OutputDefinition()],
    )
    def valid_kwargs(context, **kwargs):
        pass

    @solid(
        input_defs=[InputDefinition(name="foo"), InputDefinition(name="bar")],
        output_defs=[OutputDefinition()],
    )
    def valid(context, foo, bar):
        pass

    @solid
    def valid_because_inference(context, foo, bar):
        pass
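# Hedged illustration (not part of the original test) of the inference path
# that makes the last case above valid: with no explicit input_defs, the
# decorator derives InputDefinitions from the function signature. The test
# name and the `input_defs` property access are assumptions about this API era.
def test_input_inference_sketch():
    @solid
    def inferred(context, foo, bar):
        return foo + bar

    # `context` is not an input; only `foo` and `bar` become InputDefinitions.
    assert {i.name for i in inferred.input_defs} == {"foo", "bar"}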
def _define_nothing_dep_pipeline():
    @lambda_solid(output_def=OutputDefinition(Nothing, "complete"))
    def start_nothing():
        pass

    @lambda_solid(
        input_defs=[
            InputDefinition("add_complete", Nothing),
            InputDefinition("yield_complete", Nothing),
        ]
    )
    def end_nothing():
        pass

    @lambda_solid(output_def=OutputDefinition(Int))
    def emit_value():
        return 1

    @lambda_solid(
        input_defs=[InputDefinition("on_complete", Nothing), InputDefinition("num", Int)],
        output_def=OutputDefinition(Int),
    )
    def add_value(num):
        return 1 + num

    @solid(
        name="yield_values",
        input_defs=[InputDefinition("on_complete", Nothing)],
        output_defs=[
            OutputDefinition(Int, "num_1"),
            OutputDefinition(Int, "num_2"),
            OutputDefinition(Nothing, "complete"),
        ],
    )
    def yield_values(_context):
        yield Output(1, "num_1")
        yield Output(2, "num_2")
        yield Output(None, "complete")

    return PipelineDefinition(
        name="simple_exc",
        solid_defs=[emit_value, add_value, start_nothing, end_nothing, yield_values],
        dependencies={
            "add_value": {
                "on_complete": DependencyDefinition("start_nothing", "complete"),
                "num": DependencyDefinition("emit_value"),
            },
            "yield_values": {
                "on_complete": DependencyDefinition("start_nothing", "complete")
            },
            "end_nothing": {
                "add_complete": DependencyDefinition("add_value"),
                "yield_complete": DependencyDefinition("yield_values", "complete"),
            },
        },
    )
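# Hedged usage sketch: executing the Nothing-dependency pipeline above. The
# result-inspection method names assume the execution API of this dagster era,
# and `execute_pipeline` is assumed to be imported.
def test_nothing_dep_pipeline_sketch():
    result = execute_pipeline(_define_nothing_dep_pipeline())
    assert result.success
    # emit_value() == 1, so add_value yields 1 + 1 once start_nothing completes.
    assert result.result_for_solid("add_value").output_value() == 2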
            working_directory=None,
            location_name=main_repo_location_name(),
        ),
    ) as workspace_process_context:
        yield workspace_process_context.create_request_context()


@contextmanager
def get_main_external_repo(instance):
    with get_main_workspace(instance) as workspace:
        location = workspace.get_repository_location(main_repo_location_name())
        yield location.get_repository(main_repo_name())


@lambda_solid(
    input_defs=[InputDefinition("num", PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_solid(num):
    # Each row behaves like a dict; add a 'sum' column row by row.
    sum_df = deepcopy(num)
    for x in sum_df:
        x["sum"] = int(x["num1"]) + int(x["num2"])
    return sum_df


@lambda_solid(
    input_defs=[InputDefinition("sum_df", PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_sq_solid(sum_df):
    sum_sq_df = deepcopy(sum_df)