from dagster import Field, Int, String, composite_solid, pipeline, solid from dagster.core.execution.api import create_execution_plan, execute_plan_iterator from dagster.core.instance import DagsterInstance @solid(config={'foo': Field(String)}) def node_a(context): return context.solid_config['foo'] @solid(config={'bar': Field(Int)}) def node_b(context, input_): return input_ * context.solid_config['bar'] @composite_solid def composite_with_nested_config_solid(): return node_b(node_a()) @pipeline def composite_pipeline(): return composite_with_nested_config_solid() @composite_solid( config_fn=lambda _, cfg: { 'node_a': { 'config': { 'foo': cfg['foo'] }
# pylint: disable=unused-argument, no-value-for-parameter
# start_marker
import os

from dagster import Field, pipeline, solid
from dagster.experimental import DynamicOutput, DynamicOutputDefinition
from dagster.utils import file_relative_path


@solid(
    config_schema={"path": Field(str, default_value=file_relative_path(__file__, "sample"))},
    output_defs=[DynamicOutputDefinition(str)],
)
def files_in_directory(context):
    """Yield one dynamic output per file in the configured directory."""
    directory = context.solid_config["path"]
    dirname, _, filenames = next(os.walk(directory))
    for filename in filenames:
        # Mapping keys cannot contain "." or "-", so normalize them to "_".
        safe_key = filename.replace(".", "_").replace("-", "_")
        yield DynamicOutput(
            value=os.path.join(dirname, filename),
            mapping_key=safe_key,
        )


@solid
def process_file(_, path: str) -> int:
    """Return the size in bytes of the file at ``path`` (simple example work)."""
    return os.path.getsize(path)
from dagster_gcp import ( bigquery_resource, import_gcs_paths_to_bq, bq_solid_for_queries, dataproc_resource, dataproc_solid, ) PROJECT_ID = os.getenv('GCP_PROJECT_ID') DEPLOY_BUCKET_PREFIX = os.getenv('GCP_DEPLOY_BUCKET_PREFIX') REGION = 'us-west1' LATEST_JAR_HASH = '214f4bff2eccb4e9c08578d96bd329409b7111c8' @solid( config={'paths': Field(List[String])}, description='pass configured output paths to BigQuery load command inputs', ) def output_paths(context, start) -> List[String]: # pylint: disable=unused-argument return context.solid_config['paths'] def events_dataproc_fn(context, cfg): dt = datetime.datetime.fromtimestamp(context.run_config.tags['execution_epoch_time']) return { 'dataproc_solid': { 'config': { 'job_scoped_cluster': False, 'job_config': { 'job': {
Bool, Field, Int, String, execute_pipeline, pipeline, solid, ) @solid( config={ 'delimiter': Field( String, default_value=',', is_optional=True, description=('A one-character string used to separate fields.'), ), 'doublequote': Field( Bool, default_value=False, is_optional=True, description=( 'Controls how instances of quotechar appearing inside a field ' 'should themselves be quoted. When True, the character is ' 'doubled. When False, the escapechar is used as a prefix to ' 'the quotechar.'), ), 'escapechar': Field(
from dagster import Field, RepositoryDefinition, Shape, composite_solid, pipeline, seven, solid @solid( config_schema={ 'cluster_cfg': Shape({ 'num_mappers': Field(int), 'num_reducers': Field(int), 'master_heap_size_mb': Field(int), 'worker_heap_size_mb': Field(int), }), 'name': Field(str), }) def hello(context): context.log.info(seven.json.dumps(context.solid_config['cluster_cfg'])) return 'Hello, %s!' % context.solid_config['name'] def config_mapping_fn(cfg): return { 'hello': { 'config': { 'cluster_cfg': { 'num_mappers': 100, 'num_reducers': 20, 'master_heap_size_mb': 1024, 'worker_heap_size_mb': 8192, }, 'name': cfg['name'],
"""isort:skip_file""" import json import logging from dagster import Field, ModeDefinition, logger, pipeline, solid # start_custom_logger_marker_0 @logger( { "log_level": Field(str, is_required=False, default_value="INFO"), "name": Field(str, is_required=False, default_value="dagster"), }, description="A JSON-formatted console logger", ) def json_console_logger(init_context): level = init_context.logger_config["log_level"] name = init_context.logger_config["name"] klass = logging.getLoggerClass() logger_ = klass(name, level=level) handler = logging.StreamHandler() class JsonFormatter(logging.Formatter): def format(self, record): return json.dumps(record.__dict__) handler.setFormatter(JsonFormatter()) logger_.addHandler(handler)
def csv_hello_world_solids_config_fs_storage(): return { 'solids': { 'sum_solid': { 'inputs': { 'num': file_relative_path(__file__, '../data/num.csv') } } }, 'storage': { 'filesystem': {} }, } @solid(config={'file': Field(Path)}) def loop(context): with open(context.solid_config['file'], 'w') as ff: ff.write('yup') while True: time.sleep(0.1) @pipeline def infinite_loop_pipeline(): loop() @solid def noop_solid(_):
def test_solid_config():
    """A generated solid config type should accept an int under the 'config' key."""
    config_type = define_solid_config_cls(Field(Int), None, None)
    evaluated = throwing_evaluate_config_value(config_type, {'config': 1})
    assert evaluated['config'] == 1
    # The generated type is flagged as system-level config, not user config.
    assert config_type.type_attributes.is_system_config
from dagster import Field, Int, String, composite_solid, pipeline, solid @solid(config={'foo': Field(String)}) def basic(context): return context.solid_config def inner_wrap_fn(_ctx, cfg): return { 'basic': { 'config': { 'foo': 'override here' + cfg['inner_first'] + ' : ' + cfg['inner_second'] } } } @composite_solid(config_fn=inner_wrap_fn, config={ 'inner_first': Field(String), 'inner_second': Field(String) }) def inner_wrap(): return basic() def outer_wrap_fn(_ctx, cfg): return {
if len(insert_tracking_columns) > 0: insert_tracking_columns += ', ' insert_tracking_columns += f"{names[idx]}" yield Output(create_tracking_columns, 'create_tracking_columns') yield Output(insert_tracking_columns, 'insert_tracking_columns') @solid( required_resource_keys={'postgres_warehouse'}, config={ 'fatal': Field( Bool, default_value=True, is_optional=True, description='Controls whether exceptions cause a Failure or not', ) }) def upload_tracking_table(context, results: Dict, insert_columns: String, table_name: String): """ Upload a DataFrame to the Postgres server, creating the table if it doesn't exist :param context: execution context :param results: dict of results dicts with set ids as key { <set_id>: { 'uploaded': True|False, 'value': { 'fileset': <set_id>, 'sj_pk_min': min value of sales journal primary key, 'sj_pk_max': max value of sales journal primary key }}} :param insert_columns: column names for the database table :param table_name: name of database table to upload to
backoff = 0.01 while True: logs = instance.all_logs(run_id) if 'STEP_START' in (log_record.dagster_event.event_type_value for log_record in logs): return else: time.sleep(backoff) total_time += backoff backoff = backoff * 2 if total_time > timeout: raise Exception('Timed out') @solid(config_schema={'length': Field(Int)}, output_defs=[]) def streamer(context): for i in range(context.solid_config['length']): yield Materialization(label=str(i)) time.sleep(0.1) @pipeline def streaming_pipeline(): streamer() @repository def test_repository(): return [streaming_pipeline]
from dagster import resource, Field @resource( { "timezone": Field(str, is_required=True, description="Run timezone"), } ) def timezone_config(context): return context.resource_config @resource( { "endpoints": Field( dict, is_required=True, description="dicts of endpoint and key_column keyed by table_id", ) } ) def endpoints(context): return context.resource_config @resource({"map": Field(dict, is_required=True, description='column map from api response to project structure')}) def mapping(context): return context.resource_config @resource( {
def _define_region_config():
    """Return the config field for a required region string."""
    region_field = Field(String, is_required=True)
    return region_field
import requests from dagster import solid, Output, Field, OutputDefinition, InputDefinition, Nothing from dagster.utils import script_relative_path from pandas import DataFrame import sqlite3 import dagstermill as dm from definitions.exchangerates.dagster_types import ExchangeRateDataFrame @solid( config_schema={ "base_currency": Field(str, is_required=False, default_value="EUR"), "date_from": str, "date_to": str, }) def extract(context): result = requests.get(f"https://api.exchangeratesapi.io/history?" f"&start_at={context.solid_config['date_from']}" f"&end_at={context.solid_config['date_to']}" f"&base={context.solid_config['base_currency']}") if result.status_code != 200: raise ValueError("API didn't return valid result") return result.json() @solid(output_defs=[OutputDefinition(ExchangeRateDataFrame)]) def transform(context, currency_json) -> DataFrame: data = [] for day in currency_json["rates"]: for currency in currency_json["rates"][day]:
href=href) output = DbtCloudOutput(run_details=final_run_details, result=self.get_run_results(run_id)) if output.docs_url: self._log.info( f"Docs for this run can be viewed here: {output.docs_url}") return output @resource( config_schema={ "auth_token": Field( StringSource, is_required=True, description="dbt Cloud API Token. User tokens can be found in the " "[dbt Cloud UI](https://cloud.getdbt.com/#/profile/api/), or see the " "[dbt Cloud Docs](https://docs.getdbt.com/docs/dbt-cloud/dbt-cloud-api/service-tokens) " "for instructions on creating a Service Account token.", ), "account_id": Field( int, is_required=True, description= "dbt Cloud Account ID. This value can be found in the url of a variety of " "views in the dbt Cloud UI, e.g. https://cloud.getdbt.com/#/accounts/{account_id}/settings/.", ), "disable_schedule_on_trigger": Field( bool, default_value=True,
), EventMetadataEntry.text( str(len(value[0].keys()) if len(value) > 0 else 0), 'n_cols', 'Number of columns seen in the data frame', ), EventMetadataEntry.text( str(list(value[0].keys()) if len(value) > 0 else []), 'column_names', 'Keys of columns seen in the data frame', ), ], ) @input_hydration_config(Selector({'csv': Field(String)})) def less_simple_data_frame_input_hydration_config(context, selector): csv_path = os.path.join(os.path.dirname(__file__), selector['csv']) with open(csv_path, 'r') as fd: lines = [row for row in csv.DictReader(fd)] context.log.info('Read {n_lines} lines'.format(n_lines=len(lines))) return lines LessSimpleDataFrame = DagsterType( name='LessSimpleDataFrame', description='A more sophisticated data frame that type checks its structure.', type_check_fn=less_simple_data_frame_type_check, input_hydration_config=less_simple_data_frame_input_hydration_config, )
from dagster import Field, pipeline, solid
from dagster.core.definitions.events import DynamicOutput
from dagster.core.definitions.output import DynamicOutputDefinition


@solid
def multiply_by_two(context, y):
    """Double ``y``, logging the result."""
    doubled = y * 2
    context.log.info("echo_again is returning " + str(doubled))
    return doubled


@solid(config_schema={"fail_on_first_try": Field(bool, default_value=False)})
def multiply_inputs(context, y, ten):
    """Multiply ``y`` by ``ten``; optionally fail once to exercise re-execution."""
    if context.solid_config["fail_on_first_try"]:
        current_run = context.instance.get_run_by_id(context.run_id)
        # Only the original run (no parent run) fails; re-executions succeed.
        if y == 2 and current_run.parent_run_id is None:
            raise Exception()
    product = y * ten
    context.log.info("echo is returning " + str(product))
    return product


@solid
def emit_ten(_):
    """Emit the constant 10."""
    return 10


@solid
def sum_numbers(_, base, nums):
    """Return ``base`` plus the sum of ``nums``."""
    return base + sum(nums)
from dagster import AssetKey, AssetMaterialization, EventMetadata, Field, Output, pipeline, solid


@solid(
    config_schema={
        "bucket": Field(str, is_required=True),
        "s3_key": Field(str, is_required=True),
    }
)
def read_s3_key(context):
    """Materialize an asset for the configured S3 object and emit its path."""
    bucket = context.solid_config["bucket"]
    s3_key = context.solid_config["s3_key"]
    path = f"s3://{bucket}/{s3_key}"
    context.log.info(f"Found file {path}")

    # Record the S3 object as an asset so it shows up in the asset catalog.
    yield AssetMaterialization(
        asset_key=AssetKey(["log_s3", path]),
        metadata={"S3 path": EventMetadata.url(path)},
    )
    yield Output(path)


@pipeline(description="Demo pipeline that spits out some file info, given a path")
def log_s3_pipeline():
    read_s3_key()
from dagster import Field, String, SystemStorageData, system_storage from dagster.core.storage.intermediates_manager import IntermediateStoreIntermediatesManager from dagster.core.storage.system_storage import fs_system_storage, mem_system_storage from .file_manager import S3FileManager from .intermediate_store import S3IntermediateStore @system_storage( name='s3', is_persistent=True, config={ 's3_bucket': Field(String), 's3_prefix': Field(String, is_required=False, default_value='dagster'), }, required_resource_keys={'s3'}, ) def s3_system_storage(init_context): s3_session = init_context.resources.s3.session s3_key = '{prefix}/storage/{run_id}/files'.format( prefix=init_context.system_storage_config['s3_prefix'], run_id=init_context.pipeline_run.run_id, ) return SystemStorageData( file_manager=S3FileManager( s3_session=s3_session, s3_bucket=init_context.system_storage_config['s3_bucket'], s3_base_key=s3_key, ), intermediates_manager=IntermediateStoreIntermediatesManager( S3IntermediateStore(
from random import random

from dagster import Field, pipeline, solid

DEFAULT_EXCEPTION_RATE = 0.3


@solid
def unreliable_start(_):
    """Seed the chain with the value 1."""
    return 1


@solid(config={'rate': Field(float, is_required=False, default_value=DEFAULT_EXCEPTION_RATE)})
def unreliable(context, num):
    """Pass ``num`` through unchanged, failing randomly at the configured rate."""
    if random() < context.solid_config['rate']:
        raise Exception('blah')
    return num


@pipeline
def unreliable_pipeline():
    """Chain seven aliased copies of ``unreliable`` after the start solid."""
    previous = unreliable_start()
    for name in ('one', 'two', 'three', 'four', 'five', 'six', 'seven'):
        previous = unreliable.alias(name)(previous)
try: from dagster_dask import dask_executor return default_executors + [dask_executor] except ImportError: return default_executors @solid( input_defs=[InputDefinition('chase_duration', int)], output_defs=[OutputDefinition(int, 'total')], config={ 'chase_size': Field( int, default_value=100000, is_optional=True, description='How big should the pointer chase array be?', ) }, ) def hammer(context, chase_duration): '''what better way to do a lot of gnarly work than to pointer chase?''' ptr_length = context.solid_config['chase_size'] data = list(range(0, ptr_length)) random.shuffle(data) curr = random.randint(0, ptr_length) # and away we go start_time = time.time() while (time.time() - start_time) < chase_duration:
} } }, ) assert res.output_value() == ["foo", 1, 3.1] def test_dict_return_solid(): res = execute_solid(dict_return_solid) assert res.output_value() == {"foo": "bar"} ###### @solid(config_schema=Field(Any)) def any_config(context): return context.solid_config @solid(config_schema=Field(Bool)) def bool_config(context): return "true" if context.solid_config else "false" @solid(config_schema=Int) def add_n(context, x: Int) -> int: return x + context.solid_config @solid(config_schema=Field(Float))
command, action_on_failure=action_on_failure)) return steps @property def running_on_emr(self): '''Detects whether we are running on the EMR cluster ''' if os.path.exists('/mnt/var/lib/info/job-flow.json'): return True return False @resource({ 'pipeline_file': Field(str, description='Path to the file where the pipeline is defined'), 'pipeline_fn_name': Field(str), 'spark_config': spark_config(), 'cluster_id': Field(str, description='Name of the job flow (cluster) on which to execute'), 'region_name': Field(str), 'action_on_failure': Field(str, is_required=False, default_value='CANCEL_AND_WAIT'), 'staging_bucket': Field( str, is_required=True,
from dagster import Field, List, PermissiveDict, String from dagster.core.definitions.executor import executor from .config import CeleryConfig @executor( name='celery', config={ 'broker': Field( String, is_optional=True, description=( 'The URL of the Celery broker. Default: ' '\'pyamqp://guest@{os.getenv(\'DAGSTER_CELERY_BROKER_HOST\',' '\'localhost\')}//\'.'), ), 'backend': Field( String, is_optional=True, default_value='rpc://', description= 'The URL of the Celery results backend. Default: \'rpc://\'.', ), 'include': Field(List[String], is_optional=True, description='List of modules every worker should import'), 'config_source':
import dagstermill as dm

from docs_snippets.legacy.data_science.download_file import download_file

from dagster import Field, InputDefinition, Int, pipeline
from dagster.utils import script_relative_path

# Notebook-backed solid that clusters the Iris dataset with k-means.
k_means_iris = dm.define_dagstermill_solid(
    'k_means_iris',
    script_relative_path('iris-kmeans_2.ipynb'),
    input_defs=[InputDefinition('path', str, description='Local path to the Iris dataset')],
    # Cluster count is configurable; defaults to 3 (the canonical Iris species count).
    config_schema=Field(
        Int, default_value=3, is_required=False, description='The number of clusters to find'
    ),
)


@pipeline
def iris_pipeline():
    """Download the Iris dataset, then run the k-means notebook over it."""
    k_means_iris(download_file())
def test_custom_dagster_dataframe_parametrizable_input():
    # End-to-end check that a custom pandas DataFrame type can be built with a
    # parametrizable (Selector-based) loader and materializer, and that both
    # fire when the solid is executed with matching run config.

    # Loader: run config picks exactly one "door" key; the chosen key decides
    # which single-row DataFrame is constructed.
    @dagster_type_loader(
        Selector({
            "door_a": Field(str),
            "door_b": Field(str),
            "door_c": Field(str),
        }))
    def silly_loader(_, config):
        which_door = list(config.keys())[0]
        if which_door == "door_a":
            return DataFrame({"foo": ["goat"]})
        elif which_door == "door_b":
            return DataFrame({"foo": ["car"]})
        elif which_door == "door_c":
            return DataFrame({"foo": ["goat"]})
        # Unreachable when config passed Selector validation; defensive guard.
        raise DagsterInvariantViolationError(
            "You did not pick a door. You chose: {which_door}".format(
                which_door=which_door))

    # Materializer: ignores the value entirely and just records a
    # materialization event named "nothing".
    @dagster_type_materializer(
        Selector({
            "devnull": Field(str),
            "nothing": Field(str)
        }))
    def silly_materializer(_, _config, _value):
        return AssetMaterialization(asset_key="nothing",
                                    description="just one of those days")

    TestDataFrame = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[
            PandasColumn.exists("foo"),
        ],
        loader=silly_loader,
        materializer=silly_materializer,
    )

    # Identity solid typed with the custom DataFrame type on both ends, so the
    # loader hydrates its input and the materializer handles its output.
    @solid(
        input_defs=[InputDefinition("df", TestDataFrame)],
        output_defs=[OutputDefinition(TestDataFrame)],
    )
    def did_i_win(_, df):
        return df

    solid_result = execute_solid(
        did_i_win,
        run_config={
            "solids": {
                "did_i_win": {
                    "inputs": {
                        "df": {
                            "door_a": "bar"
                        }
                    },
                    "outputs": [{
                        "result": {
                            "devnull": "baz"
                        }
                    }],
                }
            }
        },
    )
    assert solid_result.success
    output_df = solid_result.output_value()
    assert isinstance(output_df, DataFrame)
    # "door_a" was selected, so the loader built the goat DataFrame.
    assert output_df["foo"].tolist() == ["goat"]
    materialization_events = solid_result.materialization_events_during_compute
    assert len(materialization_events) == 1
    assert materialization_events[
        0].event_specific_data.materialization.label == "nothing"
from dagster import ( Field, InputDefinition, ModeDefinition, OutputDefinition, String, pipeline, repository, solid, ) from dagster.core.storage.asset_store import versioned_filesystem_asset_store @solid( version="create_string_version", config_schema={"input_str": Field(String)}, output_defs=[ OutputDefinition(name="created_string", manager_key="object_manager", asset_metadata={}) ], ) def create_string_1_asset(context): return context.solid_config["input_str"] @solid( input_defs=[InputDefinition("_string_input", String)], version="take_string_version", config_schema={"input_str": Field(String)}, output_defs=[
@solid def read_csv(context, csv_path): lines = [] csv_path = os.path.join(os.path.dirname(__file__), csv_path) with open(csv_path, "r") as fd: for row in csv.DictReader(fd): row["calories"] = int(row["calories"]) lines.append(row) context.log.info("Read {n_lines} lines".format(n_lines=len(lines))) return lines @solid( config_schema={ "process_hot": Field(Bool, is_required=False, default_value=True), "process_cold": Field(Bool, is_required=False, default_value=True), }, output_defs=[ OutputDefinition(name="hot_cereals", dagster_type=DataFrame, is_required=False), OutputDefinition(name="cold_cereals", dagster_type=DataFrame, is_required=False), ], ) def split_cereals(context, cereals): if context.solid_config["process_hot"]: hot_cereals = [cereal for cereal in cereals if cereal["type"] == "H"] yield Output(hot_cereals, "hot_cereals")
'Persisted table {table_name} in database configured in the db_info resource.' ).format(table_name=table_name), metadata_entries=[ EventMetadataEntry.text(label='Host', text=context.resources.db_info.host), EventMetadataEntry.text(label='Db', text=context.resources.db_info.db_name), ], ) yield Output(value=table_name, output_name='table_name') @solid( required_resource_keys={'pyspark_step_launcher'}, description='Subsample a spark dataset via the configuration option.', config={ 'subsample_pct': Field( Int, description='The integer percentage of rows to sample from the input dataset.', ) }, ) def subsample_spark_dataset(context, data_frame: DataFrame) -> DataFrame: return data_frame.sample( withReplacement=False, fraction=context.solid_config['subsample_pct'] / 100.0 ) @composite_solid( description='''Ingest a zipped csv file from s3, stash in a keyed file store (does not download if already present by default), unzip that file, and load it into a Spark Dataframe. See documentation in constituent solids for more detail.'''
from dagster import Field, StringSource, resource from slack import WebClient @resource( { "token": Field( StringSource, description="""To configure access to the Slack API, you'll need an access token provisioned with access to your Slack workspace. Tokens are typically either user tokens or bot tokens. For programmatic posting to Slack from this resource, you probably want to provision and use a bot token. More in the Slack API documentation here: https://api.slack.com/docs/token-types """, ) }, description="This resource is for connecting to Slack", ) def slack_resource(context): """This resource is for connecting to Slack. The resource object is a `slack.WebClient`. By configuring this Slack resource, you can post messages to Slack from any Dagster solid: Examples: .. code-block:: python