Example #1
def giver(context):
    chase_duration = context.solid_config

    yield Result(chase_duration, 'out_1')
    yield Result(chase_duration, 'out_2')
    yield Result(chase_duration, 'out_3')
    yield Result(chase_duration, 'out_4')
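A compute function like giver above only yields Results; the output names it uses still have to be declared on the solid itself. Below is a hedged sketch of that wiring, reusing the SolidDefinition and OutputDefinition names that appear in Example #14 further down; the giver_solid name, the import path, and the exact signatures are assumptions and may differ between versions.

from dagster import OutputDefinition, Result, SolidDefinition  # assumed import path

def _giver_fn(context, _inputs):
    chase_duration = context.solid_config
    yield Result(chase_duration, 'out_1')
    yield Result(chase_duration, 'out_2')
    yield Result(chase_duration, 'out_3')
    yield Result(chase_duration, 'out_4')

# Hypothetical: one OutputDefinition per output name yielded above.
giver_solid = SolidDefinition(
    name='giver',
    inputs=[],
    outputs=[OutputDefinition(name=n) for n in ('out_1', 'out_2', 'out_3', 'out_4')],
    transform_fn=_giver_fn,
)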
Example #2
def test_multiple_double_result():
    mr = MultipleResults(Result('value_one', 'output_one'),
                         Result('value_two', 'output_two'))
    assert mr.results == [
        Result('value_one', 'output_one'),
        Result('value_two', 'output_two')
    ]
Example #3
def conditional(context):
    if context.solid_config == 'out_one':
        yield Result(23, 'out_one')
    elif context.solid_config == 'out_two':
        yield Result(45, 'out_two')
    else:
        raise Exception('invalid config')
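A run of conditional produces only one of its two declared outputs, chosen by its config value. As a hedged illustration, the selecting config might look like the sketch below; the environment-dict layout is an assumption about this API era, not taken from the original source.

# Hypothetical sketch: the solid's config value picks which output branch
# yields a Result; the other declared output is simply never emitted.
environment_dict = {
    'solids': {
        'conditional': {
            'config': 'out_one',  # switch to 'out_two' for the other branch
        },
    },
}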
Example #4
def conditional(info):
    if info.config == 'out_one':
        yield Result(23, 'out_one')
    elif info.config == 'out_two':
        yield Result(45, 'out_two')
    else:
        raise Exception('invalid config')
Example #5
def test_multiple_dict():
    mr = MultipleResults.from_dict({
        'output_one': 'value_one',
        'output_two': 'value_two'
    })
    assert set(mr.results) == set(
        [Result('value_one', 'output_one'),
         Result('value_two', 'output_two')])
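MultipleResults.from_dict maps output names to values, which is convenient when a compute function returns everything at once rather than yielding one Result per output, as in Examples #29 and #30 below. A hedged sketch of that usage (the function name is illustrative, not from the original source):

def emit_two_outputs(_context):
    # Hypothetical sketch: build both named outputs from a dict instead of
    # yielding two Result objects one by one.
    return MultipleResults.from_dict({
        'output_one': 'value_one',
        'output_two': 'value_two',
    })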
Example #6
def giver(context):
    units = context.solid_config
    queues = [[], [], [], []]
    for i, sec in enumerate(units):
        queues[i % 4].append(sec)

    yield Result(queues[0], 'out_1')
    yield Result(queues[1], 'out_2')
    yield Result(queues[2], 'out_3')
    yield Result(queues[3], 'out_4')
Example #7
def _new_compute_fn(context, inputs):
    value = compute_fn(context, inputs)
    if isinstance(value, Result):
        raise DagsterInvariantViolationError(
            '''Single output compute Solid {name} returned a Result. Just return
            value directly without wrapping it in Result''')
    yield Result(value=value)
Example #8
def load_data_to_database_from_spark(context, data_frame):
    context.resources.db_info.load_table(data_frame,
                                         context.solid_config['table_name'])
    # TODO Flow more information down to the client
    # We should be able to flow multiple key value pairs down to dagit
    # See https://github.com/dagster-io/dagster/issues/1408
    yield Materialization(path='Persisted Db Table: {table_name}'.format(
        table_name=context.solid_config['table_name']))
    yield Result(data_frame)
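The pattern above interleaves two kinds of yields: a Materialization describing a side effect (here, a persisted table) and a Result carrying the value to downstream solids. A stripped-down, hypothetical version of the same shape (the function name is illustrative):

def persist_and_pass_through(context, data_frame):
    # Hypothetical sketch: record where the data landed, then hand the
    # DataFrame to whatever solid consumes the default output.
    table_name = context.solid_config['table_name']
    yield Materialization(path='Persisted Db Table: {}'.format(table_name))
    yield Result(data_frame)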
Example #9
def process_q2_data(context, april_data, may_data, june_data,
                    master_cord_data):

    dfs = {'april': april_data, 'may': may_data, 'june': june_data}

    missing_things = []

    for required_column in ['DestAirportSeqID', 'OriginAirportSeqID']:
        for month, df in dfs.items():
            if required_column not in df.columns:
                missing_things.append({
                    'month': month,
                    'missing_column': required_column
                })

    yield ExpectationResult(
        success=not bool(missing_things),
        name='airport_ids_present',
        message='Sequence IDs present in incoming monthly flight data.',
        result_metadata={'missing_columns': missing_things},
    )

    yield ExpectationResult(
        success=set(april_data.columns) == set(may_data.columns) == set(
            june_data.columns),
        name='flight_data_same_shape',
        result_metadata={'columns': april_data.columns},
    )

    q2_data = april_data.union(may_data).union(june_data)
    sampled_q2_data = q2_data.sample(
        withReplacement=False,
        fraction=context.solid_config['subsample_pct'] / 100.0)
    sampled_q2_data.createOrReplaceTempView('q2_data')

    dest_prefixed_master_cord_data = do_prefix_column_names(
        master_cord_data, 'DEST_')
    dest_prefixed_master_cord_data.createOrReplaceTempView('dest_cord_data')

    origin_prefixed_master_cord_data = do_prefix_column_names(
        master_cord_data, 'ORIGIN_')
    origin_prefixed_master_cord_data.createOrReplaceTempView(
        'origin_cord_data')

    full_data = context.resources.spark.sql('''
        SELECT * FROM origin_cord_data
        LEFT JOIN (
            SELECT * FROM q2_data
            LEFT JOIN dest_cord_data ON
            q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data 
        ON origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        ''')

    yield Result(rename_spark_dataframe_columns(full_data,
                                                lambda c: c.lower()))
Example #10
def test_single_transform_returning_result():
    solid_inst = single_output_transform(
        'test_return_result',
        inputs=[],
        transform_fn=lambda *_args, **_kwargs: Result(None),
        output=OutputDefinition(),
    )

    with pytest.raises(DagsterInvariantViolationError):
        execute_single_solid_in_isolation(ExecutionContext(), solid_inst)
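The test above pins down the failure mode also guarded against in Example #7: a single-output transform must not wrap its return value in Result itself. The passing counterpart would return the bare value, roughly as in this hypothetical sketch:

# Hypothetical sketch of the allowed form: return the value directly and let
# the framework wrap it in the solid's single default output.
solid_inst = single_output_transform(
    'test_return_bare_value',
    inputs=[],
    transform_fn=lambda *_args, **_kwargs: None,
    output=OutputDefinition(),
)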
Example #11
        def _compute_fn(context, _):
            (dataset, delete_contents, not_found_ok) = [
                context.solid_config.get(k) for k in ('dataset', 'delete_contents', 'not_found_ok')
            ]

            context.log.info('executing BQ delete_dataset for dataset %s' % dataset)

            context.resources.bq.delete_dataset(
                dataset, delete_contents=delete_contents, not_found_ok=not_found_ok
            )
            yield Result(None)
Example #12
def upload_to_s3(context, file_obj):
    '''Upload a file to s3.

    Args:
        context: Must expose a boto3 S3 client as its `s3` resource.
        file_obj: A file-like object whose contents will be uploaded.

    Returns:
        (str, str):
            The bucket and key to which the file was uploaded.
    '''
    bucket = context.solid_config['bucket']
    key = context.solid_config['key']

    context.resources.s3.put_object(Bucket=bucket,
                                    Body=file_obj.read(),
                                    Key=key,
                                    **(context.solid_config.get('kwargs')
                                       or {}))
    yield Result(bucket, 'bucket')
    yield Result(key, 'key')
Example #13
def upload_to_s3(info, file_path):
    '''Upload a file to s3.

    Args:
        info (ExpectationExecutionInfo): Must expose a boto3 S3 client as its `s3` resource.
        file_path (str): Path of the local file to upload.

    Returns:
        (str, str):
            The bucket and key to which the file was uploaded.
    '''
    bucket = info.config['bucket']
    key = info.config['key']

    with open(file_path, 'rb') as fd:
        info.resources.s3.put_object(Bucket=bucket,
                                     Body=fd,
                                     Key=key,
                                     **(info.config.get('kwargs') or {}))
    yield Result(bucket, 'bucket')
    yield Result(key, 'key')
Example #14
def test_do_not_yield_result():
    solid_inst = SolidDefinition(
        name='do_not_yield_result',
        inputs=[],
        outputs=[OutputDefinition()],
        transform_fn=lambda *_args, **_kwargs: Result('foo'),
    )

    with pytest.raises(
            DagsterInvariantViolationError,
            match='Transform for solid do_not_yield_result returned a Result',
    ):
        execute_single_solid_in_isolation(ExecutionContext(), solid_inst)
Example #15
def put_object_to_s3_bytes(context, file_obj):
    '''Upload file contents to s3.

    Args:
        file_obj: A file-like object whose bytes will be uploaded.

    Returns:
        (str, str):
            The bucket and key to which the file was uploaded.
    '''
    bucket = context.solid_config['Bucket']
    key = context.solid_config['Key']

    # the s3 put_object API expects the actual bytes to be on the 'Body' key in kwargs; since we
    # get all other fields from config, we copy the config object and add 'Body' here.
    cfg = context.solid_config.copy()
    cfg['Body'] = file_obj.read()

    context.resources.s3.put_object(**cfg)

    yield Result(bucket, 'bucket')
    yield Result(key, 'key')
Example #16
def _transform_fn(info, inputs):
    passed_rows = []
    seen = set()
    for row in inputs.values():
        for item in row:
            key = list(item.keys())[0]
            if key not in seen:
                seen.add(key)
                passed_rows.append(item)

    result = []
    result.extend(passed_rows)
    result.append({info.solid.name: 'transform_called'})
    yield Result(result)
Example #17
        def _compute_fn(context, inputs):
            destination = context.solid_config.get('destination')
            load_job_config = _preprocess_config(context.solid_config.get('load_job_config', {}))
            cfg = LoadJobConfig(**load_job_config) if load_job_config else None

            context.log.info(
                'executing BQ load with config: %s'
                % (cfg.to_api_repr() if cfg else '(no config provided)')
            )

            context.resources.bq.load_table_from_source(
                source, inputs, destination, job_config=cfg
            ).result()

            yield Result(None)
Example #18
    def transform_fn(context, _inputs):
        '''Inner function defining the new solid.

        Args:
            context (TransformExecutionContext): Must expose a `db` resource with an `execute` method,
                like a SQLAlchemy engine, that can execute raw SQL against a database.

        Returns:
            str:
                The table name of the newly materialized SQL select statement.
        '''

        context.log.info('Executing sql statement:\n{sql_statement}'.format(
            sql_statement=sql_statement))
        context.resources.db_info.engine.execute(text(sql_statement))
        yield Result(value=table_name, output_name='result')
Example #19
        def _compute_fn(context, _):
            query_job_config = _preprocess_config(context.solid_config.get('query_job_config', {}))

            # Retrieve results as pandas DataFrames
            results = []
            for sql_query in sql_queries:
                # We need to construct a new QueryJobConfig for each query.
                # See: https://bit.ly/2VjD6sl
                cfg = QueryJobConfig(**query_job_config) if query_job_config else None
                context.log.info(
                    'executing query %s with config: %s'
                    % (sql_query, cfg.to_api_repr() if cfg else '(no config provided)')
                )
                results.append(context.resources.bq.query(sql_query, job_config=cfg).to_dataframe())

            yield Result(results)
Example #20
        def _compute_fn(context, _):  # pylint: disable=too-many-locals
            client = boto3.client(
                'emr', region_name=context.solid_config.get('aws_region'))

            # kick off the EMR job flow
            response = client.run_job_flow(
                **context.solid_config['job_config'])

            # Job flow IDs and cluster IDs are interchangeable
            job_flow_id = response.get('JobFlowId')

            context.log.info('waiting for EMR cluster job flow completion...')

            max_iter = max_wait_time_sec / poll_interval_sec

            # wait for the task
            done = False
            curr_iter = 0
            while not done and curr_iter < max_iter:
                cluster = client.describe_cluster(ClusterId=job_flow_id)
                status = cluster.get('Cluster', {}).get('Status', {})
                state = status.get('State')
                state_change_reason = status.get('StateChangeReason',
                                                 {}).get('Message')

                context.log.info(
                    'EMR cluster %s state: %s state change reason: %s' %
                    (job_flow_id, state, state_change_reason))

                # This will take a while... cluster creation usually > 5 minutes
                time.sleep(poll_interval_sec)

                # Note that the user can specify Instances.KeepJobFlowAliveWhenNoSteps, which will
                # keep the cluster alive after the job completes. In such cases where the cluster
                # continues in waiting state, we stop waiting here and yield.

                # See: https://bit.ly/2UUq1G9
                # pylint: disable=no-member
                done = state in {
                    EmrClusterState.Waiting.value,
                    EmrClusterState.Terminated.value,
                    EmrClusterState.TerminatedWithErrors.value,
                }

                curr_iter += 1

            yield Result(job_flow_id)
Example #21
        def _spark_compute_fn(context, _):
            '''Define Spark execution.

            This function defines how we'll execute the Spark job and invokes spark-submit.
            '''

            spark_shell_cmd = create_spark_shell_cmd(context.solid_config,
                                                     main_class)

            context.log.info("Running spark-submit: " +
                             ' '.join(spark_shell_cmd))
            retcode = run_spark_subprocess(spark_shell_cmd, context.log)

            if retcode != 0:
                raise SparkSolidError(
                    'Spark job failed. Please consult your logs.')

            yield Result(context.solid_config.get('spark_outputs'), 'paths')
Example #22
        def _snowflake_compute_fn(context, _):  # pylint: disable=too-many-locals
            '''Define Snowflake execution.

            This function defines how we'll execute the Snowflake SQL query.
            '''
            with context.resources.snowflake.get_connection(
                    context.log) as conn:
                with closing(conn.cursor()) as cursor:
                    results = []
                    for query in sql_queries:
                        if sys.version_info[0] < 3:
                            query = query.encode('utf-8')

                        context.log.info(
                            'Executing SQL query %s %s' %
                            (query, 'with parameters ' +
                             str(parameters) if parameters else ''))
                        cursor.execute(query, parameters)  # pylint: disable=E1101
                        fetchall_results = cursor.fetchall()  # pylint: disable=E1101
                        results.append(pd.DataFrame(fetchall_results))

                    yield Result(results)
Example #23
    def _t_fn(info, inputs):
        base_dir = '/tmp/dagstermill/{run_id}/'.format(
            run_id=info.context.run_id)
        output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')

        if not os.path.exists(output_notebook_dir):
            os.makedirs(output_notebook_dir)

        temp_path = os.path.join(
            output_notebook_dir,
            '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))

        try:
            _source_nb = pm.execute_notebook(
                notebook_path,
                temp_path,
                parameters=dict(dm_context=serialize_dm_context(info, inputs)),
            )

            output_nb = pm.read_notebook(temp_path)

            info.context.debug(
                'Notebook execution complete for {name}. Data is {data}'.
                format(name=name, data=output_nb.data))

            for output_def in info.solid_def.output_defs:
                if output_def.name in output_nb.data:

                    value = unmarshal_value(output_def.runtime_type,
                                            output_nb.data[output_def.name])

                    yield Result(value, output_def.name)

        finally:
            if do_cleanup and os.path.exists(temp_path):
                os.remove(temp_path)
Example #24
    def _t_fn(info, inputs):
        if not os.path.exists('/tmp/dagstermill/'):
            os.mkdir('/tmp/dagstermill/')

        temp_path = '/tmp/dagstermill/{prefix}-out.ipynb'.format(
            prefix=str(uuid.uuid4()))

        try:
            _source_nb = pm.execute_notebook(
                notebook_path,
                temp_path,
                parameters=dict(
                    inputs=serialize_dm_object(inputs),
                    config=serialize_dm_object(info.config),
                ),
            )

            output_nb = pm.read_notebook(temp_path)

            info.context.debug(
                'Notebook execution complete for {name}. Data is {data}'.
                format(
                    name=name,
                    data=output_nb.data,
                ))

            for output_def in info.solid_def.output_defs:
                if output_def.name in output_nb.data:
                    yield Result(
                        deserialize_dm_object(output_nb.data[output_def.name]),
                        output_def.name,
                    )

        finally:
            if do_cleanup and os.path.exists(temp_path):
                os.remove(temp_path)
Example #25
    def _t_fn(info, inputs):
        check.param_invariant(
            isinstance(info.context.environment_config, dict),
            'info',
            'TransformExecutionInfo must have valid environment_config',
        )

        base_dir = '/tmp/dagstermill/{run_id}/'.format(
            run_id=info.context.run_id)
        output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')

        if not os.path.exists(output_notebook_dir):
            os.makedirs(output_notebook_dir)

        temp_path = os.path.join(
            output_notebook_dir,
            '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))

        output_log_path = os.path.join(base_dir, 'run.log')

        try:
            nb = load_notebook_node(notebook_path)
            nb_no_parameters = replace_parameters(
                info, nb,
                get_papermill_parameters(info, inputs, output_log_path))
            intermediate_path = os.path.join(
                output_notebook_dir,
                '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))
            write_ipynb(nb_no_parameters, intermediate_path)

            with open(output_log_path, 'a') as f:
                f.close()

            # info.log.info("Output log path is {}".format(output_log_path))
            # info.log.info("info.context.event_callback {}".format(info.context.event_callback))

            process = subprocess.Popen(
                ["papermill", intermediate_path, temp_path])
            # _source_nb = pm.execute_notebook(intermediate_path, temp_path)

            while process.poll() is None:  # while subprocess alive
                if info.context.event_callback:
                    with open(output_log_path, 'r') as ff:
                        current_time = os.path.getmtime(output_log_path)
                        while process.poll() is None:
                            new_time = os.path.getmtime(output_log_path)
                            if new_time != current_time:
                                line = ff.readline()
                                if not line:
                                    break
                                event_record_dict = json.loads(line)

                                event_record_dict['event_type'] = EventType(
                                    event_record_dict['event_type'])
                                info.context.event_callback(
                                    EventRecord(**event_record_dict))
                                current_time = new_time

            if process.returncode != 0:
                # Throw event that is an execution error!
                info.log.debug("There was an error in Papermill!")
                info.log.debug('stderr was None'
                               if process.stderr is None else process.stderr)
                exit()

            output_nb = pm.read_notebook(temp_path)

            info.log.debug(
                'Notebook execution complete for {name}. Data is {data}'.
                format(name=name, data=output_nb.data))

            info.log.info(
                "Output notebook path is {}".format(output_notebook_dir))

            for output_def in info.solid_def.output_defs:
                if output_def.name in output_nb.data:

                    value = read_value(output_def.runtime_type,
                                       output_nb.data[output_def.name])

                    yield Result(value, output_def.name)

        finally:
            if do_cleanup and os.path.exists(temp_path):
                os.remove(temp_path)
Example #26
def transform_fn(info, _args):
    yield Result(execute_sql_text_on_context(info, sql_text))
Example #27
def hello_world(_info):
    yield Result(value={'foo': 'bar'})
Example #28
def test_multiple_single_result():
    mr = MultipleResults(Result('value', 'output_one'))
    assert mr.results == [Result('value', 'output_one')]
Example #29
def hello_world(_info):
    return MultipleResults(
        Result(value={'foo': 'left'}, output_name='left'),
        Result(value={'foo': 'right'}, output_name='right'),
    )
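Example #29 returns both outputs at once via MultipleResults; the yield-based equivalent, in the style used by most of the examples above, would be the following hypothetical sketch:

def hello_world(_info):
    # Hypothetical sketch: one Result per declared output instead of a single
    # MultipleResults return value.
    yield Result(value={'foo': 'left'}, output_name='left')
    yield Result(value={'foo': 'right'}, output_name='right')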
Example #30
def hello_world(_info):
    return Result(value={'foo': 'bar'})