Example #1

from dagster import Field, Int, String, composite_solid, pipeline, solid
from dagster.core.execution.api import create_execution_plan, execute_plan_iterator
from dagster.core.instance import DagsterInstance


@solid(config={'foo': Field(String)})
def node_a(context):
    return context.solid_config['foo']


@solid(config={'bar': Field(Int)})
def node_b(context, input_):
    return input_ * context.solid_config['bar']


@composite_solid
def composite_with_nested_config_solid():
    return node_b(node_a())


@pipeline
def composite_pipeline():
    return composite_with_nested_config_solid()


@composite_solid(
    config_fn=lambda _, cfg: {
        'node_a': {
            'config': {
                'foo': cfg['foo']
            }
Example #2

# pylint: disable=unused-argument, no-value-for-parameter

# start_marker
import os

from dagster import Field, pipeline, solid
from dagster.experimental import DynamicOutput, DynamicOutputDefinition
from dagster.utils import file_relative_path


@solid(
    config_schema={"path": Field(str, default_value=file_relative_path(__file__, "sample"))},
    output_defs=[DynamicOutputDefinition(str)],
)
def files_in_directory(context):
    path = context.solid_config["path"]
    dirname, _, filenames = next(os.walk(path))
    for file in filenames:
        yield DynamicOutput(
            value=os.path.join(dirname, file),
            # create a mapping key from the file name
            mapping_key=file.replace(".", "_").replace("-", "_"),
        )


@solid
def process_file(_, path: str) -> int:
    # simple example of calculating size
    return os.path.getsize(path)
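
The snippet above is cut off before the pipeline that consumes the dynamic outputs. A minimal sketch, assuming the same experimental dynamic-output API, could fan the files out over process_file like this (process_directory is an illustrative name, not from the source):

# Hypothetical pipeline wiring for the dynamic output above.
@pipeline
def process_directory():
    files = files_in_directory()
    files.map(process_file)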

Example #3

import datetime
import os

from dagster import Field, List, String, solid
from dagster_gcp import (
    bigquery_resource,
    import_gcs_paths_to_bq,
    bq_solid_for_queries,
    dataproc_resource,
    dataproc_solid,
)

PROJECT_ID = os.getenv('GCP_PROJECT_ID')
DEPLOY_BUCKET_PREFIX = os.getenv('GCP_DEPLOY_BUCKET_PREFIX')
REGION = 'us-west1'
LATEST_JAR_HASH = '214f4bff2eccb4e9c08578d96bd329409b7111c8'


@solid(
    config={'paths': Field(List[String])},
    description='pass configured output paths to BigQuery load command inputs',
)
def output_paths(context, start) -> List[String]:  # pylint: disable=unused-argument
    return context.solid_config['paths']


def events_dataproc_fn(context, cfg):
    dt = datetime.datetime.fromtimestamp(context.run_config.tags['execution_epoch_time'])

    return {
        'dataproc_solid': {
            'config': {
                'job_scoped_cluster': False,
                'job_config': {
                    'job': {
Example #4

from dagster import (
    Bool,
    Field,
    Int,
    String,
    execute_pipeline,
    pipeline,
    solid,
)


@solid(
    config={
        'delimiter':
        Field(
            String,
            default_value=',',
            is_optional=True,
            description=('A one-character string used to separate fields.'),
        ),
        'doublequote':
        Field(
            Bool,
            default_value=False,
            is_optional=True,
            description=(
                'Controls how instances of quotechar appearing inside a field '
                'should themselves be quoted. When True, the character is '
                'doubled. When False, the escapechar is used as a prefix to '
                'the quotechar.'),
        ),
        'escapechar':
        Field(
Example #5

from dagster import Field, RepositoryDefinition, Shape, composite_solid, pipeline, seven, solid


@solid(
    config_schema={
        'cluster_cfg':
        Shape({
            'num_mappers': Field(int),
            'num_reducers': Field(int),
            'master_heap_size_mb': Field(int),
            'worker_heap_size_mb': Field(int),
        }),
        'name':
        Field(str),
    })
def hello(context):
    context.log.info(seven.json.dumps(context.solid_config['cluster_cfg']))
    return 'Hello, %s!' % context.solid_config['name']


def config_mapping_fn(cfg):
    return {
        'hello': {
            'config': {
                'cluster_cfg': {
                    'num_mappers': 100,
                    'num_reducers': 20,
                    'master_heap_size_mb': 1024,
                    'worker_heap_size_mb': 8192,
                },
                'name': cfg['name'],
Example #6

"""isort:skip_file"""
import json
import logging

from dagster import Field, ModeDefinition, logger, pipeline, solid

# start_custom_logger_marker_0


@logger(
    {
        "log_level": Field(str, is_required=False, default_value="INFO"),
        "name": Field(str, is_required=False, default_value="dagster"),
    },
    description="A JSON-formatted console logger",
)
def json_console_logger(init_context):
    level = init_context.logger_config["log_level"]
    name = init_context.logger_config["name"]

    klass = logging.getLoggerClass()
    logger_ = klass(name, level=level)

    handler = logging.StreamHandler()

    class JsonFormatter(logging.Formatter):
        def format(self, record):
            return json.dumps(record.__dict__)

    handler.setFormatter(JsonFormatter())
    logger_.addHandler(handler)

    return logger_
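
A hedged usage sketch (hello_logs and demo_pipeline are illustrative names, not part of the source): the logger is attached to a pipeline through a ModeDefinition and selected via the "loggers" section of run_config.

@solid
def hello_logs(context):
    context.log.info("Hello, structured world!")


@pipeline(mode_defs=[ModeDefinition(logger_defs={"my_json_logger": json_console_logger})])
def demo_pipeline():
    hello_logs()

# run_config sketch: {"loggers": {"my_json_logger": {"config": {"log_level": "INFO"}}}}
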
Example #7

def csv_hello_world_solids_config_fs_storage():
    return {
        'solids': {
            'sum_solid': {
                'inputs': {
                    'num': file_relative_path(__file__, '../data/num.csv')
                }
            }
        },
        'storage': {
            'filesystem': {}
        },
    }


@solid(config={'file': Field(Path)})
def loop(context):
    with open(context.solid_config['file'], 'w') as ff:
        ff.write('yup')

    while True:
        time.sleep(0.1)


@pipeline
def infinite_loop_pipeline():
    loop()


@solid
def noop_solid(_):
Example #8

def test_solid_config():
    solid_config_type = define_solid_config_cls(Field(Int), None, None)
    solid_inst = throwing_evaluate_config_value(solid_config_type, {'config': 1})
    assert solid_inst['config'] == 1
    assert solid_config_type.type_attributes.is_system_config
Example #9

from dagster import Field, Int, String, composite_solid, pipeline, solid


@solid(config={'foo': Field(String)})
def basic(context):
    return context.solid_config


def inner_wrap_fn(_ctx, cfg):
    return {
        'basic': {
            'config': {
                'foo':
                'override here' + cfg['inner_first'] + ' : ' +
                cfg['inner_second']
            }
        }
    }


@composite_solid(config_fn=inner_wrap_fn,
                 config={
                     'inner_first': Field(String),
                     'inner_second': Field(String)
                 })
def inner_wrap():
    return basic()


def outer_wrap_fn(_ctx, cfg):
    return {
Example #10

            if len(insert_tracking_columns) > 0:
                insert_tracking_columns += ', '

            insert_tracking_columns += f"{names[idx]}"

    yield Output(create_tracking_columns, 'create_tracking_columns')
    yield Output(insert_tracking_columns, 'insert_tracking_columns')


@solid(
    required_resource_keys={'postgres_warehouse'},
    config={
        'fatal':
        Field(
            Bool,
            default_value=True,
            is_optional=True,
            description='Controls whether exceptions cause a Failure or not',
        )
    })
def upload_tracking_table(context, results: Dict, insert_columns: String,
                          table_name: String):
    """
    Upload a DataFrame to the Postgres server, creating the table if it doesn't exist
    :param context: execution context
    :param results: dict of results dicts with set ids as key
             { <set_id>: { 'uploaded': True|False,
                           'value': { 'fileset': <set_id>,
                                      'sj_pk_min': min value of sales journal primary key,
                                      'sj_pk_max': max value of sales journal primary key  }}}
    :param insert_columns: column names for the database table
    :param table_name: name of database table to upload to
Example #11

    backoff = 0.01

    while True:
        logs = instance.all_logs(run_id)
        if 'STEP_START' in (log_record.dagster_event.event_type_value
                            for log_record in logs):
            return
        else:
            time.sleep(backoff)
            total_time += backoff
            backoff = backoff * 2
            if total_time > timeout:
                raise Exception('Timed out')


@solid(config_schema={'length': Field(Int)}, output_defs=[])
def streamer(context):
    for i in range(context.solid_config['length']):
        yield Materialization(label=str(i))
        time.sleep(0.1)


@pipeline
def streaming_pipeline():
    streamer()


@repository
def test_repository():
    return [streaming_pipeline]
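
Usage sketch (assumption, not shown in the source): executing the pipeline above with a config value for the streamer solid.

from dagster import execute_pipeline

result = execute_pipeline(
    streaming_pipeline,
    run_config={"solids": {"streamer": {"config": {"length": 3}}}},
)
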
Example #12

from dagster import resource, Field


@resource(
    {
        "timezone": Field(str, is_required=True, description="Run timezone"),
    }
)
def timezone_config(context):
    return context.resource_config


@resource(
    {
        "endpoints": Field(
            dict,
            is_required=True,
            description="dicts of endpoint and key_column keyed by table_id",
        )
    }
)
def endpoints(context):
    return context.resource_config

@resource({"map": Field(dict, is_required=True, description='column map from api response to project structure')})
def mapping(context):
    return context.resource_config


@resource(
    {
Example #13

def _define_region_config():
    return Field(String, is_required=True)
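
Illustrative only (emr_resource and the surrounding schema are hypothetical, not from the source): a field factory like this would typically be dropped into a larger config schema, for example on a resource.

from dagster import resource

@resource({"region_name": _define_region_config()})
def emr_resource(context):
    return context.resource_config["region_name"]
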
Example #14

import requests
from dagster import solid, Output, Field, OutputDefinition, InputDefinition, Nothing
from dagster.utils import script_relative_path
from pandas import DataFrame
import sqlite3
import dagstermill as dm

from definitions.exchangerates.dagster_types import ExchangeRateDataFrame


@solid(
    config_schema={
        "base_currency": Field(str, is_required=False, default_value="EUR"),
        "date_from": str,
        "date_to": str,
    })
def extract(context):
    result = requests.get(f"https://api.exchangeratesapi.io/history?"
                          f"&start_at={context.solid_config['date_from']}"
                          f"&end_at={context.solid_config['date_to']}"
                          f"&base={context.solid_config['base_currency']}")
    if result.status_code != 200:
        raise ValueError("API didn't return valid result")
    return result.json()


@solid(output_defs=[OutputDefinition(ExchangeRateDataFrame)])
def transform(context, currency_json) -> DataFrame:
    data = []
    for day in currency_json["rates"]:
        for currency in currency_json["rates"][day]:
Example #15

                                          href=href)
        output = DbtCloudOutput(run_details=final_run_details,
                                result=self.get_run_results(run_id))
        if output.docs_url:
            self._log.info(
                f"Docs for this run can be viewed here: {output.docs_url}")
        return output


@resource(
    config_schema={
        "auth_token":
        Field(
            StringSource,
            is_required=True,
            description="dbt Cloud API Token. User tokens can be found in the "
            "[dbt Cloud UI](https://cloud.getdbt.com/#/profile/api/), or see the "
            "[dbt Cloud Docs](https://docs.getdbt.com/docs/dbt-cloud/dbt-cloud-api/service-tokens) "
            "for instructions on creating a Service Account token.",
        ),
        "account_id":
        Field(
            int,
            is_required=True,
            description=
            "dbt Cloud Account ID. This value can be found in the url of a variety of "
            "views in the dbt Cloud UI, e.g. https://cloud.getdbt.com/#/accounts/{account_id}/settings/.",
        ),
        "disable_schedule_on_trigger":
        Field(
            bool,
            default_value=True,
            ),
Example #16

            EventMetadataEntry.text(
                str(len(value[0].keys()) if len(value) > 0 else 0),
                'n_cols',
                'Number of columns seen in the data frame',
            ),
            EventMetadataEntry.text(
                str(list(value[0].keys()) if len(value) > 0 else []),
                'column_names',
                'Keys of columns seen in the data frame',
            ),
        ],
    )


@input_hydration_config(Selector({'csv': Field(String)}))
def less_simple_data_frame_input_hydration_config(context, selector):
    csv_path = os.path.join(os.path.dirname(__file__), selector['csv'])
    with open(csv_path, 'r') as fd:
        lines = [row for row in csv.DictReader(fd)]

    context.log.info('Read {n_lines} lines'.format(n_lines=len(lines)))
    return lines


LessSimpleDataFrame = DagsterType(
    name='LessSimpleDataFrame',
    description='A more sophisticated data frame that type checks its structure.',
    type_check_fn=less_simple_data_frame_type_check,
    input_hydration_config=less_simple_data_frame_input_hydration_config,
)
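
A hedged sketch of how the csv selector above is supplied at execution time; the solid and input names are hypothetical, not taken from the source.

run_config = {
    "solids": {
        "sort_by_calories": {  # hypothetical solid with a LessSimpleDataFrame input
            "inputs": {"cereals": {"csv": "cereal.csv"}}
        }
    }
}
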
Example #17

from dagster import Field, pipeline, solid
from dagster.core.definitions.events import DynamicOutput
from dagster.core.definitions.output import DynamicOutputDefinition


@solid
def multiply_by_two(context, y):
    context.log.info("echo_again is returning " + str(y * 2))
    return y * 2


@solid(config_schema={"fail_on_first_try": Field(bool, default_value=False)})
def multiply_inputs(context, y, ten):
    if context.solid_config["fail_on_first_try"]:
        current_run = context.instance.get_run_by_id(context.run_id)
        if y == 2 and current_run.parent_run_id is None:
            raise Exception()
    context.log.info("echo is returning " + str(y * ten))
    return y * ten


@solid
def emit_ten(_):
    return 10


@solid
def sum_numbers(_, base, nums):
    return base + sum(nums)
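
Usage sketch (assumption): the deliberate first-try failure in multiply_inputs is switched on through run_config when executing the pipeline (not shown here) that contains it.

run_config = {
    "solids": {
        "multiply_inputs": {"config": {"fail_on_first_try": True}}
    }
}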

Example #18

from dagster import AssetKey, AssetMaterialization, EventMetadata, Field, Output, pipeline, solid


@solid(
    config_schema={
        "bucket": Field(str, is_required=True),
        "s3_key": Field(str, is_required=True),
    })
def read_s3_key(context):
    s3_key = context.solid_config["s3_key"]
    bucket = context.solid_config["bucket"]
    path = f"s3://{bucket}/{s3_key}"
    context.log.info(f"Found file {path}")
    yield AssetMaterialization(
        asset_key=AssetKey(["log_s3", path]),
        metadata={"S3 path": EventMetadata.url(path)},
    )
    yield Output(path)


@pipeline(
    description="Demo pipeline that spits out some file info, given a path")
def log_s3_pipeline():
    read_s3_key()
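
Usage sketch (bucket and key values are placeholders; the execute_pipeline import is assumed):

from dagster import execute_pipeline

execute_pipeline(
    log_s3_pipeline,
    run_config={
        "solids": {
            "read_s3_key": {
                "config": {"bucket": "my-bucket", "s3_key": "path/to/file.txt"}
            }
        }
    },
)
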
Example #19

from dagster import Field, String, SystemStorageData, system_storage
from dagster.core.storage.intermediates_manager import IntermediateStoreIntermediatesManager
from dagster.core.storage.system_storage import fs_system_storage, mem_system_storage

from .file_manager import S3FileManager
from .intermediate_store import S3IntermediateStore


@system_storage(
    name='s3',
    is_persistent=True,
    config={
        's3_bucket': Field(String),
        's3_prefix': Field(String, is_required=False, default_value='dagster'),
    },
    required_resource_keys={'s3'},
)
def s3_system_storage(init_context):
    s3_session = init_context.resources.s3.session
    s3_key = '{prefix}/storage/{run_id}/files'.format(
        prefix=init_context.system_storage_config['s3_prefix'],
        run_id=init_context.pipeline_run.run_id,
    )
    return SystemStorageData(
        file_manager=S3FileManager(
            s3_session=s3_session,
            s3_bucket=init_context.system_storage_config['s3_bucket'],
            s3_base_key=s3_key,
        ),
        intermediates_manager=IntermediateStoreIntermediatesManager(
            S3IntermediateStore(
Example #20

from random import random

from dagster import Field, pipeline, solid

DEFAULT_EXCEPTION_RATE = 0.3


@solid
def unreliable_start(_):
    return 1


@solid(config={'rate': Field(float, is_required=False, default_value=DEFAULT_EXCEPTION_RATE)})
def unreliable(context, num):
    if random() < context.solid_config['rate']:
        raise Exception('blah')

    return num


@pipeline
def unreliable_pipeline():
    one = unreliable.alias('one')
    two = unreliable.alias('two')
    three = unreliable.alias('three')
    four = unreliable.alias('four')
    five = unreliable.alias('five')
    six = unreliable.alias('six')
    seven = unreliable.alias('seven')
    seven(six(five(four(three(two(one(unreliable_start())))))))
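
Because each alias is a separate solid invocation, the failure rate can be overridden per alias in run_config. A hedged sketch:

run_config = {"solids": {"three": {"config": {"rate": 0.9}}}}
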
Example #21

    try:
        from dagster_dask import dask_executor

        return default_executors + [dask_executor]
    except ImportError:
        return default_executors


@solid(
    input_defs=[InputDefinition('chase_duration', int)],
    output_defs=[OutputDefinition(int, 'total')],
    config={
        'chase_size':
        Field(
            int,
            default_value=100000,
            is_optional=True,
            description='How big should the pointer chase array be?',
        )
    },
)
def hammer(context, chase_duration):
    '''what better way to do a lot of gnarly work than to pointer chase?'''
    ptr_length = context.solid_config['chase_size']

    data = list(range(0, ptr_length))
    random.shuffle(data)

    curr = random.randint(0, ptr_length)
    # and away we go
    start_time = time.time()
    while (time.time() - start_time) < chase_duration:
Example #22

                }
            }
        },
    )
    assert res.output_value() == ["foo", 1, 3.1]


def test_dict_return_solid():
    res = execute_solid(dict_return_solid)
    assert res.output_value() == {"foo": "bar"}


######


@solid(config_schema=Field(Any))
def any_config(context):
    return context.solid_config


@solid(config_schema=Field(Bool))
def bool_config(context):
    return "true" if context.solid_config else "false"


@solid(config_schema=Int)
def add_n(context, x: Int) -> int:
    return x + context.solid_config


@solid(config_schema=Field(Float))
Example #23

                command,
                action_on_failure=action_on_failure))
        return steps

    @property
    def running_on_emr(self):
        '''Detects whether we are running on the EMR cluster
        '''
        if os.path.exists('/mnt/var/lib/info/job-flow.json'):
            return True
        return False


@resource({
    'pipeline_file':
    Field(str, description='Path to the file where the pipeline is defined'),
    'pipeline_fn_name':
    Field(str),
    'spark_config':
    spark_config(),
    'cluster_id':
    Field(str,
          description='Name of the job flow (cluster) on which to execute'),
    'region_name':
    Field(str),
    'action_on_failure':
    Field(str, is_required=False, default_value='CANCEL_AND_WAIT'),
    'staging_bucket':
    Field(
        str,
        is_required=True,
Example #24

from dagster import Field, List, PermissiveDict, String
from dagster.core.definitions.executor import executor

from .config import CeleryConfig


@executor(
    name='celery',
    config={
        'broker':
        Field(
            String,
            is_optional=True,
            description=(
                'The URL of the Celery broker. Default: '
                '\'pyamqp://guest@{os.getenv(\'DAGSTER_CELERY_BROKER_HOST\','
                '\'localhost\')}//\'.'),
        ),
        'backend':
        Field(
            String,
            is_optional=True,
            default_value='rpc://',
            description=
            'The URL of the Celery results backend. Default: \'rpc://\'.',
        ),
        'include':
        Field(List[String],
              is_optional=True,
              description='List of modules every worker should import'),
        'config_source':
Example #25

import dagstermill as dm
from docs_snippets.legacy.data_science.download_file import download_file

from dagster import Field, InputDefinition, Int, pipeline
from dagster.utils import script_relative_path

k_means_iris = dm.define_dagstermill_solid(
    'k_means_iris',
    script_relative_path('iris-kmeans_2.ipynb'),
    input_defs=[InputDefinition('path', str, description='Local path to the Iris dataset')],
    config_schema=Field(
        Int, default_value=3, is_required=False, description='The number of clusters to find'
    ),
)


@pipeline
def iris_pipeline():
    k_means_iris(download_file())
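
Hedged usage note: because the config_schema above is a bare Field(Int), the solid's config is a plain integer rather than a dict. A sketch of overriding the cluster count (download_file's own config, not shown here, would be supplied alongside):

run_config = {"solids": {"k_means_iris": {"config": 6}}}
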
Example #26

def test_custom_dagster_dataframe_parametrizable_input():
    @dagster_type_loader(
        Selector({
            "door_a": Field(str),
            "door_b": Field(str),
            "door_c": Field(str),
        }))
    def silly_loader(_, config):
        which_door = list(config.keys())[0]
        if which_door == "door_a":
            return DataFrame({"foo": ["goat"]})
        elif which_door == "door_b":
            return DataFrame({"foo": ["car"]})
        elif which_door == "door_c":
            return DataFrame({"foo": ["goat"]})
        raise DagsterInvariantViolationError(
            "You did not pick a door. You chose: {which_door}".format(
                which_door=which_door))

    @dagster_type_materializer(
        Selector({
            "devnull": Field(str),
            "nothing": Field(str)
        }))
    def silly_materializer(_, _config, _value):
        return AssetMaterialization(asset_key="nothing",
                                    description="just one of those days")

    TestDataFrame = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[
            PandasColumn.exists("foo"),
        ],
        loader=silly_loader,
        materializer=silly_materializer,
    )

    @solid(
        input_defs=[InputDefinition("df", TestDataFrame)],
        output_defs=[OutputDefinition(TestDataFrame)],
    )
    def did_i_win(_, df):
        return df

    solid_result = execute_solid(
        did_i_win,
        run_config={
            "solids": {
                "did_i_win": {
                    "inputs": {
                        "df": {
                            "door_a": "bar"
                        }
                    },
                    "outputs": [{
                        "result": {
                            "devnull": "baz"
                        }
                    }],
                }
            }
        },
    )
    assert solid_result.success
    output_df = solid_result.output_value()
    assert isinstance(output_df, DataFrame)
    assert output_df["foo"].tolist() == ["goat"]
    materialization_events = solid_result.materialization_events_during_compute
    assert len(materialization_events) == 1
    assert materialization_events[
        0].event_specific_data.materialization.label == "nothing"
Example #27

from dagster import (
    Field,
    InputDefinition,
    ModeDefinition,
    OutputDefinition,
    String,
    pipeline,
    repository,
    solid,
)
from dagster.core.storage.asset_store import versioned_filesystem_asset_store


@solid(
    version="create_string_version",
    config_schema={"input_str": Field(String)},
    output_defs=[
        OutputDefinition(name="created_string",
                         manager_key="object_manager",
                         asset_metadata={})
    ],
)
def create_string_1_asset(context):
    return context.solid_config["input_str"]


@solid(
    input_defs=[InputDefinition("_string_input", String)],
    version="take_string_version",
    config_schema={"input_str": Field(String)},
    output_defs=[
Example #28

@solid
def read_csv(context, csv_path):
    lines = []
    csv_path = os.path.join(os.path.dirname(__file__), csv_path)
    with open(csv_path, "r") as fd:
        for row in csv.DictReader(fd):
            row["calories"] = int(row["calories"])
            lines.append(row)

    context.log.info("Read {n_lines} lines".format(n_lines=len(lines)))
    return lines


@solid(
    config_schema={
        "process_hot": Field(Bool, is_required=False, default_value=True),
        "process_cold": Field(Bool, is_required=False, default_value=True),
    },
    output_defs=[
        OutputDefinition(name="hot_cereals",
                         dagster_type=DataFrame,
                         is_required=False),
        OutputDefinition(name="cold_cereals",
                         dagster_type=DataFrame,
                         is_required=False),
    ],
)
def split_cereals(context, cereals):
    if context.solid_config["process_hot"]:
        hot_cereals = [cereal for cereal in cereals if cereal["type"] == "H"]
        yield Output(hot_cereals, "hot_cereals")
Example #29

            'Persisted table {table_name} in database configured in the db_info resource.'
        ).format(table_name=table_name),
        metadata_entries=[
            EventMetadataEntry.text(label='Host', text=context.resources.db_info.host),
            EventMetadataEntry.text(label='Db', text=context.resources.db_info.db_name),
        ],
    )
    yield Output(value=table_name, output_name='table_name')


@solid(
    required_resource_keys={'pyspark_step_launcher'},
    description='Subsample a spark dataset via the configuration option.',
    config={
        'subsample_pct': Field(
            Int, description='The integer percentage of rows to sample from the input dataset.',
        )
    },
)
def subsample_spark_dataset(context, data_frame: DataFrame) -> DataFrame:
    return data_frame.sample(
        withReplacement=False, fraction=context.solid_config['subsample_pct'] / 100.0
    )


@composite_solid(
    description='''Ingest a zipped csv file from s3,
stash in a keyed file store (does not download if already
present by default), unzip that file, and load it into a
Spark Dataframe. See documentation in constituent solids for
more detail.'''
Example #30

from dagster import Field, StringSource, resource
from slack import WebClient


@resource(
    {
        "token": Field(
            StringSource,
            description="""To configure access to the Slack API, you'll need an access
                    token provisioned with access to your Slack workspace.

                    Tokens are typically either user tokens or bot tokens. For programmatic posting
                    to Slack from this resource, you probably want to provision and use a bot token.

                    More in the Slack API documentation here: https://api.slack.com/docs/token-types
                    """,
        )
    },
    description="This resource is for connecting to Slack",
)
def slack_resource(context):
    """This resource is for connecting to Slack.

    The resource object is a `slack.WebClient`.

    By configuring this Slack resource, you can post messages to Slack from any Dagster solid:

    Examples:

    .. code-block:: python