def giver(context):
    chase_duration = context.solid_config
    yield Result(chase_duration, 'out_1')
    yield Result(chase_duration, 'out_2')
    yield Result(chase_duration, 'out_3')
    yield Result(chase_duration, 'out_4')
def test_multiple_double_result():
    mr = MultipleResults(
        Result('value_one', 'output_one'), Result('value_two', 'output_two')
    )
    assert mr.results == [
        Result('value_one', 'output_one'),
        Result('value_two', 'output_two'),
    ]
def conditional(context):
    if context.solid_config == 'out_one':
        yield Result(23, 'out_one')
    elif context.solid_config == 'out_two':
        yield Result(45, 'out_two')
    else:
        raise Exception('invalid config')
def conditional(info):
    if info.config == 'out_one':
        yield Result(23, 'out_one')
    elif info.config == 'out_two':
        yield Result(45, 'out_two')
    else:
        raise Exception('invalid config')
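# Editorial note (assumption): the two `conditional` variants above appear to
# reflect an early dagster API rename -- `info.config` on TransformExecutionInfo
# became `context.solid_config` on the compute context. The control flow is the
# same in both: a Result is yielded for exactly one of the two named outputs, so
# only the downstream solids attached to that output execute.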
def test_multiple_dict():
    mr = MultipleResults.from_dict(
        {'output_one': 'value_one', 'output_two': 'value_two'}
    )
    assert set(mr.results) == set(
        [Result('value_one', 'output_one'), Result('value_two', 'output_two')]
    )
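# Minimal usage sketch (assumption: Result is a namedtuple exposing `value` and
# `output_name`, as the equality checks in the tests above suggest). MultipleResults
# simply collects Result objects onto `.results`; from_dict builds them out of a
# plain dict, which is why test_multiple_dict compares with set() instead of
# relying on ordering.
mr = MultipleResults.from_dict({'output_one': 'value_one', 'output_two': 'value_two'})
for result in mr.results:
    print(result.output_name, result.value)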
def giver(context):
    units = context.solid_config
    queues = [[], [], [], []]
    for i, sec in enumerate(units):
        queues[i % 4].append(sec)

    yield Result(queues[0], 'out_1')
    yield Result(queues[1], 'out_2')
    yield Result(queues[2], 'out_3')
    yield Result(queues[3], 'out_4')
def _new_compute_fn(context, inputs):
    # compute_fn is the user-supplied function captured by the enclosing
    # decorator/factory.
    value = compute_fn(context, inputs)
    if isinstance(value, Result):
        raise DagsterInvariantViolationError(
            '''Single output compute Solid {name} returned a Result. Just return
            value directly without wrapping it in Result'''
        )
    yield Result(value=value)
def load_data_to_database_from_spark(context, data_frame):
    context.resources.db_info.load_table(data_frame, context.solid_config['table_name'])

    # TODO: Flow more information down to the client. We should be able to flow
    # multiple key-value pairs down to dagit.
    # See https://github.com/dagster-io/dagster/issues/1408
    yield Materialization(
        path='Persisted Db Table: {table_name}'.format(
            table_name=context.solid_config['table_name']
        )
    )
    yield Result(data_frame)
def process_q2_data(context, april_data, may_data, june_data, master_cord_data):
    dfs = {'april': april_data, 'may': may_data, 'june': june_data}

    missing_things = []
    for required_column in ['DestAirportSeqID', 'OriginAirportSeqID']:
        for month, df in dfs.items():
            if required_column not in df.columns:
                missing_things.append({'month': month, 'missing_column': required_column})

    yield ExpectationResult(
        success=not bool(missing_things),
        name='airport_ids_present',
        message='Sequence IDs present in incoming monthly flight data.',
        result_metadata={'missing_columns': missing_things},
    )

    yield ExpectationResult(
        success=set(april_data.columns) == set(may_data.columns) == set(june_data.columns),
        name='flight_data_same_shape',
        result_metadata={'columns': april_data.columns},
    )

    q2_data = april_data.union(may_data).union(june_data)
    sampled_q2_data = q2_data.sample(
        withReplacement=False, fraction=context.solid_config['subsample_pct'] / 100.0
    )
    sampled_q2_data.createOrReplaceTempView('q2_data')

    dest_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, 'DEST_')
    dest_prefixed_master_cord_data.createOrReplaceTempView('dest_cord_data')

    origin_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, 'ORIGIN_')
    origin_prefixed_master_cord_data.createOrReplaceTempView('origin_cord_data')

    full_data = context.resources.spark.sql(
        '''
        SELECT * FROM origin_cord_data
        LEFT JOIN (
            SELECT * FROM q2_data
            LEFT JOIN dest_cord_data ON
            q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data ON
        origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        '''
    )

    yield Result(rename_spark_dataframe_columns(full_data, lambda c: c.lower()))
def test_single_transform_returning_result():
    solid_inst = single_output_transform(
        'test_return_result',
        inputs=[],
        transform_fn=lambda *_args, **_kwargs: Result(None),
        output=OutputDefinition(),
    )

    with pytest.raises(DagsterInvariantViolationError):
        execute_single_solid_in_isolation(ExecutionContext(), solid_inst)
def _compute_fn(context, _):
    (dataset, delete_contents, not_found_ok) = [
        context.solid_config.get(k)
        for k in ('dataset', 'delete_contents', 'not_found_ok')
    ]

    context.log.info('executing BQ delete_dataset for dataset %s' % dataset)

    context.resources.bq.delete_dataset(
        dataset, delete_contents=delete_contents, not_found_ok=not_found_ok
    )
    yield Result(None)
def upload_to_s3(context, file_obj):
    '''Upload a file to s3.

    Args:
        context: Must expose a boto3 S3 client as its `s3` resource.
        file_obj: The file object to upload.

    Yields:
        Result: The bucket and the key to which the file was uploaded, as two
            string outputs named 'bucket' and 'key'.
    '''
    bucket = context.solid_config['bucket']
    key = context.solid_config['key']

    context.resources.s3.put_object(
        Bucket=bucket,
        Body=file_obj.read(),
        Key=key,
        **(context.solid_config.get('kwargs') or {})
    )
    yield Result(bucket, 'bucket')
    yield Result(key, 'key')
def upload_to_s3(info, file_path):
    '''Upload a file to s3.

    Args:
        info: Must expose a boto3 S3 client as its `s3` resource.
        file_path: Path to the file to upload.

    Yields:
        Result: The bucket and the key to which the file was uploaded, as two
            string outputs named 'bucket' and 'key'.
    '''
    bucket = info.config['bucket']
    key = info.config['key']

    with open(file_path, 'rb') as fd:
        info.resources.s3.put_object(
            Bucket=bucket, Body=fd, Key=key, **(info.config.get('kwargs') or {})
        )
    yield Result(bucket, 'bucket')
    yield Result(key, 'key')
def test_do_not_yield_result():
    solid_inst = SolidDefinition(
        name='do_not_yield_result',
        inputs=[],
        outputs=[OutputDefinition()],
        transform_fn=lambda *_args, **_kwargs: Result('foo'),
    )

    with pytest.raises(
        DagsterInvariantViolationError,
        match='Transform for solid do_not_yield_result returned a Result',
    ):
        execute_single_solid_in_isolation(ExecutionContext(), solid_inst)
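# Hedged sketch tying the two tests above together: a raw transform_fn handed to
# SolidDefinition or single_output_transform must be a generator that yields Result
# objects; returning a Result directly, as both lambdas above do, raises
# DagsterInvariantViolationError. The well-behaved form looks like this (the
# function name is illustrative, not from the source):
def well_behaved_transform_fn(_context, _inputs):
    yield Result('foo')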
def put_object_to_s3_bytes(context, file_obj):
    '''Upload file contents to s3.

    Args:
        file_obj (bytes): The bytes of a file object.

    Yields:
        Result: The bucket and the key to which the file was uploaded, as two
            string outputs named 'bucket' and 'key'.
    '''
    bucket = context.solid_config['Bucket']
    key = context.solid_config['Key']

    # The s3 put_object API expects the actual bytes to be on the 'Body' key in
    # kwargs; since we get all other fields from config, we copy the config object
    # and add 'Body' here.
    cfg = context.solid_config.copy()
    cfg['Body'] = file_obj.read()

    context.resources.s3.put_object(**cfg)

    yield Result(bucket, 'bucket')
    yield Result(key, 'key')
def _transform_fn(info, inputs):
    passed_rows = []
    seen = set()
    for row in inputs.values():
        for item in row:
            key = list(item.keys())[0]
            if key not in seen:
                seen.add(key)
                passed_rows.append(item)

    result = []
    result.extend(passed_rows)
    result.append({info.solid.name: 'transform_called'})
    yield Result(result)
def _compute_fn(context, inputs):
    destination = context.solid_config.get('destination')
    load_job_config = _preprocess_config(context.solid_config.get('load_job_config', {}))
    cfg = LoadJobConfig(**load_job_config) if load_job_config else None

    context.log.info(
        'executing BQ load with config: %s'
        % (cfg.to_api_repr() if cfg else '(no config provided)')
    )

    # `source` is a free variable here, presumably bound by the enclosing solid
    # factory function.
    context.resources.bq.load_table_from_source(
        source, inputs, destination, job_config=cfg
    ).result()

    yield Result(None)
def transform_fn(context, _inputs):
    '''Inner function defining the new solid.

    Args:
        context (TransformExecutionContext): Must expose a `db` resource with an
            `execute` method, like a SQLAlchemy engine, that can execute raw SQL
            against a database.

    Returns:
        str: The table name of the newly materialized SQL select statement.
    '''
    # `sql_statement` and `table_name` are closed over from the enclosing factory.
    context.log.info(
        'Executing sql statement:\n{sql_statement}'.format(sql_statement=sql_statement)
    )
    context.resources.db_info.engine.execute(text(sql_statement))
    yield Result(value=table_name, output_name='result')
def _compute_fn(context, _):
    query_job_config = _preprocess_config(context.solid_config.get('query_job_config', {}))

    # Retrieve results as pandas DataFrames
    results = []
    for sql_query in sql_queries:
        # We need to construct a new QueryJobConfig for each query.
        # See: https://bit.ly/2VjD6sl
        cfg = QueryJobConfig(**query_job_config) if query_job_config else None
        context.log.info(
            'executing query %s with config: %s'
            % (sql_query, cfg.to_api_repr() if cfg else '(no config provided)')
        )
        results.append(context.resources.bq.query(sql_query, job_config=cfg).to_dataframe())

    yield Result(results)
def _compute_fn(context, _):  # pylint: disable=too-many-locals
    client = boto3.client('emr', region_name=context.solid_config.get('aws_region'))

    # kick off the EMR job flow
    response = client.run_job_flow(**context.solid_config['job_config'])

    # Job flow IDs and cluster IDs are interchangeable
    job_flow_id = response.get('JobFlowId')

    context.log.info('waiting for EMR cluster job flow completion...')

    max_iter = max_wait_time_sec / poll_interval_sec

    # wait for the task
    done = False
    curr_iter = 0
    while not done and curr_iter < max_iter:
        cluster = client.describe_cluster(ClusterId=job_flow_id)
        status = cluster.get('Cluster', {}).get('Status', {})
        state = status.get('State')
        state_change_reason = status.get('StateChangeReason', {}).get('Message')

        context.log.info(
            'EMR cluster %s state: %s state change reason: %s'
            % (job_flow_id, state, state_change_reason)
        )

        # This will take a while... cluster creation usually > 5 minutes
        time.sleep(poll_interval_sec)

        # Note that the user can specify Instances.KeepJobFlowAliveWhenNoSteps, which will
        # keep the cluster alive after the job completes. In such cases where the cluster
        # continues in waiting state, we stop waiting here and yield.
        # See: https://bit.ly/2UUq1G9
        # pylint: disable=no-member
        done = state in {
            EmrClusterState.Waiting.value,
            EmrClusterState.Terminated.value,
            EmrClusterState.TerminatedWithErrors.value,
        }

        curr_iter += 1

    yield Result(job_flow_id)
def _spark_compute_fn(context, _):
    '''Define Spark execution.

    This function defines how we'll execute the Spark job and invokes spark-submit.
    '''
    spark_shell_cmd = create_spark_shell_cmd(context.solid_config, main_class)

    context.log.info("Running spark-submit: " + ' '.join(spark_shell_cmd))
    retcode = run_spark_subprocess(spark_shell_cmd, context.log)

    if retcode != 0:
        raise SparkSolidError('Spark job failed. Please consult your logs.')

    yield Result(context.solid_config.get('spark_outputs'), 'paths')
def _snowflake_compute_fn(context, _):  # pylint: disable=too-many-locals
    '''Define Snowflake execution.

    This function defines how we'll execute the Snowflake SQL query.
    '''
    with context.resources.snowflake.get_connection(context.log) as conn:
        with closing(conn.cursor()) as cursor:
            results = []
            for query in sql_queries:
                if sys.version_info[0] < 3:
                    query = query.encode('utf-8')

                context.log.info(
                    'Executing SQL query %s %s'
                    % (query, 'with parameters ' + str(parameters) if parameters else '')
                )
                cursor.execute(query, parameters)  # pylint: disable=E1101
                fetchall_results = cursor.fetchall()  # pylint: disable=E1101
                results.append(pd.DataFrame(fetchall_results))

            yield Result(results)
def _t_fn(info, inputs):
    base_dir = '/tmp/dagstermill/{run_id}/'.format(run_id=info.context.run_id)
    output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')

    if not os.path.exists(output_notebook_dir):
        os.makedirs(output_notebook_dir)

    temp_path = os.path.join(
        output_notebook_dir, '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4()))
    )

    try:
        _source_nb = pm.execute_notebook(
            notebook_path,
            temp_path,
            parameters=dict(dm_context=serialize_dm_context(info, inputs)),
        )

        output_nb = pm.read_notebook(temp_path)

        info.context.debug(
            'Notebook execution complete for {name}. Data is {data}'.format(
                name=name, data=output_nb.data
            )
        )

        for output_def in info.solid_def.output_defs:
            if output_def.name in output_nb.data:
                value = unmarshal_value(output_def.runtime_type, output_nb.data[output_def.name])
                yield Result(value, output_def.name)
    finally:
        if do_cleanup and os.path.exists(temp_path):
            os.remove(temp_path)
def _t_fn(info, inputs):
    if not os.path.exists('/tmp/dagstermill/'):
        os.mkdir('/tmp/dagstermill/')

    temp_path = '/tmp/dagstermill/{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4()))

    try:
        _source_nb = pm.execute_notebook(
            notebook_path,
            temp_path,
            parameters=dict(
                inputs=serialize_dm_object(inputs),
                config=serialize_dm_object(info.config),
            ),
        )

        output_nb = pm.read_notebook(temp_path)

        info.context.debug(
            'Notebook execution complete for {name}. Data is {data}'.format(
                name=name, data=output_nb.data
            )
        )

        for output_def in info.solid_def.output_defs:
            if output_def.name in output_nb.data:
                yield Result(
                    deserialize_dm_object(output_nb.data[output_def.name]),
                    output_def.name,
                )
    finally:
        if do_cleanup and os.path.exists(temp_path):
            os.remove(temp_path)
def _t_fn(info, inputs):
    check.param_invariant(
        isinstance(info.context.environment_config, dict),
        'info',
        'TransformExecutionInfo must have valid environment_config',
    )

    base_dir = '/tmp/dagstermill/{run_id}/'.format(run_id=info.context.run_id)
    output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')

    if not os.path.exists(output_notebook_dir):
        os.makedirs(output_notebook_dir)

    temp_path = os.path.join(
        output_notebook_dir, '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4()))
    )

    output_log_path = os.path.join(base_dir, 'run.log')

    try:
        nb = load_notebook_node(notebook_path)
        nb_no_parameters = replace_parameters(
            info, nb, get_papermill_parameters(info, inputs, output_log_path)
        )
        intermediate_path = os.path.join(
            output_notebook_dir, '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4()))
        )
        write_ipynb(nb_no_parameters, intermediate_path)

        # Create the output log file so we can tail it while papermill runs.
        with open(output_log_path, 'a') as f:
            f.close()

        process = subprocess.Popen(['papermill', intermediate_path, temp_path])

        while process.poll() is None:  # while subprocess alive
            if info.context.event_callback:
                with open(output_log_path, 'r') as ff:
                    current_time = os.path.getmtime(output_log_path)
                    while process.poll() is None:
                        new_time = os.path.getmtime(output_log_path)
                        if new_time != current_time:
                            line = ff.readline()
                            if not line:
                                break
                            event_record_dict = json.loads(line)
                            event_record_dict['event_type'] = EventType(
                                event_record_dict['event_type']
                            )
                            info.context.event_callback(EventRecord(**event_record_dict))
                            current_time = new_time

        if process.returncode != 0:
            # Throw event that is an execution error!
            info.log.debug('There was an error in Papermill!')
            info.log.debug('stderr was None' if process.stderr is None else process.stderr)
            exit()

        output_nb = pm.read_notebook(temp_path)

        info.log.debug(
            'Notebook execution complete for {name}. Data is {data}'.format(
                name=name, data=output_nb.data
            )
        )
        info.log.info('Output notebook path is {}'.format(output_notebook_dir))

        for output_def in info.solid_def.output_defs:
            if output_def.name in output_nb.data:
                value = read_value(output_def.runtime_type, output_nb.data[output_def.name])
                yield Result(value, output_def.name)
    finally:
        if do_cleanup and os.path.exists(temp_path):
            os.remove(temp_path)
def transform_fn(info, _args):
    yield Result(execute_sql_text_on_context(info, sql_text))
def hello_world(_info):
    yield Result(value={'foo': 'bar'})
def test_multiple_single_result():
    mr = MultipleResults(Result('value', 'output_one'))
    assert mr.results == [Result('value', 'output_one')]
def hello_world(_info):
    return MultipleResults(
        Result(value={'foo': 'left'}, output_name='left'),
        Result(value={'foo': 'right'}, output_name='right'),
    )
def hello_world(_info):
    return Result(value={'foo': 'bar'})
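# Hedged note: taken together with the wrapper in _new_compute_fn above, these
# hello_world variants suggest two distinct calling conventions in this API
# generation: solids that declare their outputs explicitly may return or yield
# Result / MultipleResults objects, while single-output solids wrapped by
# _new_compute_fn must return the raw value and let the framework wrap it
# (returning a Result there raises DagsterInvariantViolationError).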