def my_directory_sensor():
    for filename in os.listdir(MY_DIRECTORY):
        filepath = os.path.join(MY_DIRECTORY, filename)
        if os.path.isfile(filepath):
            yield RunRequest(
                run_key=filename,
                run_config={
                    "solids": {"process_file": {"config": {"filename": filename}}}
                },
            )
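# A minimal sketch of how a sensor like my_directory_sensor would be attached
# to its target in the legacy Dagster API these snippets use (note the
# "solids" run_config key). The pipeline name is a placeholder assumption.
@sensor(pipeline_name="process_file_pipeline")
def my_directory_sensor_registered(_context):
    yield from my_directory_sensor()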
def isolated_run_request():
    filename = "placeholder"  # start_run_request_marker
    yield RunRequest(
        run_key=filename,
        run_config={
            "solids": {"process_file": {"config": {"filename": filename}}}
        },
    )
def gdelt_enhanced_articles_sensor(context, asset_event):
    # Both the run key and the op config reuse the materialization path stored
    # in the first metadata entry of the AssetMaterialization event.
    materialization_path = (
        asset_event.dagster_event.event_specific_data.materialization
        .metadata_entries[0].entry_data.text
    )
    yield RunRequest(
        run_key=materialization_path,
        run_config={
            "ops": {
                "classify_protest_relevancy": {
                    "config": {
                        "asset_key": asset_event.dagster_event.asset_key.path,
                        "asset_materialization_path": materialization_path,
                    }
                }
            }
        },
    )
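# gdelt_enhanced_articles_sensor takes (context, asset_event), which is the
# signature Dagster's @asset_sensor decorator expects; the body runs once per
# new materialization of the watched asset. A sketch of the registration,
# where the asset key and pipeline name are placeholder assumptions:
@asset_sensor(
    asset_key=AssetKey("gdelt_enhanced_articles"),
    pipeline_name="protest_classification_pipeline",
)
def gdelt_enhanced_articles_asset_sensor(context, asset_event):
    yield from gdelt_enhanced_articles_sensor(context, asset_event)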
def toy_s3_sensor(context):
    # The bucket comes from the environment variable named in the error below.
    bucket = os.environ.get("DAGSTER_TOY_SENSOR_S3_BUCKET")
    if not bucket:
        raise Exception(
            "S3 bucket not specified at environment variable `DAGSTER_TOY_SENSOR_S3_BUCKET`."
        )

    new_s3_keys = get_s3_keys(bucket, since_key=context.last_run_key)
    if not new_s3_keys:
        yield SkipReason(f"No s3 updates found for bucket {bucket}.")
        return

    for s3_key in new_s3_keys:
        yield RunRequest(
            run_key=s3_key,
            run_config={
                "solids": {"read_s3_key": {"config": {"bucket": bucket, "s3_key": s3_key}}}
            },
        )
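# toy_s3_sensor relies on a get_s3_keys helper that is not defined in this
# collection. dagster_aws ships one (dagster_aws.s3.sensor.get_s3_keys); a
# minimal boto3-based sketch with the same shape could look like this, where
# returning keys in listing order after since_key is my assumption:
import boto3

def get_s3_keys(bucket, since_key=None, prefix=""):
    client = boto3.client("s3")
    paginator = client.get_paginator("list_objects_v2")
    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        keys.extend(obj["Key"] for obj in page.get("Contents", []))
    # Only return keys that appear after the last key the sensor handled.
    if since_key and since_key in keys:
        return keys[keys.index(since_key) + 1 :]
    return keys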
def gtfs_sensor(context):
    last_mtime = parse_run_key(context.last_run_key)[1] if context.last_run_key else 0
    for blob in filter_blobs_by_modification_time(
        get_list_of_blobs(GTFS_BLOBS_PREFIX, SENSOR_BUCKET, mode="staging"),
        last_mtime,
        after=True,
    ):
        run_key: str = build_run_key(blob.name, time.mktime(blob.updated.timetuple()))
        # Parse dataset_id and table_id
        path_list: list = [
            n for n in blob.name.split(GTFS_BLOBS_PREFIX)[1].split("/") if n != ""
        ]
        dataset_id: str = path_list[0]
        table_id: str = path_list[1]
        # Set run configs
        config: dict = read_config(
            Path(__file__).parent / f"{dataset_id}/{table_id}.yaml"
        )
        config["solids"]["save_blob_to_tempfile"] = {
            "inputs": {
                "blob_path": {"value": blob.name},
                "bucket_name": {"value": SENSOR_BUCKET},
            }
        }
        config["solids"]["create_gtfs_version_partition"] = {
            "inputs": {
                "original_filepath": {"value": blob.name},
                "bucket_name": {"value": SENSOR_BUCKET},
            }
        }
        config["solids"]["upload_blob_to_storage"] = {
            "inputs": {
                "blob_path": {"value": blob.name},
                "bucket_name": {"value": SENSOR_BUCKET},
            }
        }
        config["resources"]["basedosdados_config"] = {
            "config": {"dataset_id": dataset_id, "table_id": table_id}
        }
        yield RunRequest(run_key=run_key, run_config=config)
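# build_run_key and parse_run_key, used by gtfs_sensor above and other sensors
# below, are helpers external to this collection. A plausible minimal pair,
# assuming a "<name>:<mtime>" encoding (the encoding itself is an assumption,
# though it matches the hand-rolled f"{filename}:{mtime}" key used later):
def build_run_key(name: str, mtime: float) -> str:
    # Including the mtime means a re-uploaded blob with the same name still
    # produces a fresh run key.
    return f"{name}:{mtime}"

def parse_run_key(run_key: str):
    # Split from the right so names containing ":" round-trip correctly.
    name, mtime = run_key.rsplit(":", 1)
    return name, float(mtime)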
def test_sensor_w_no_job():
    @sensor()
    def no_job_sensor():
        pass

    with pytest.raises(
        Exception,
        match=r".* Sensor evaluation function returned a RunRequest for a sensor lacking a "
        r"specified target .*",
    ):
        no_job_sensor.check_valid_run_requests(
            [
                RunRequest(
                    run_key=None,
                    run_config=None,
                    tags=None,
                )
            ]
        )
def my_directory_sensor_cursor(context):
    last_mtime = float(context.cursor) if context.cursor else 0

    max_mtime = last_mtime
    for filename in os.listdir(MY_DIRECTORY):
        filepath = os.path.join(MY_DIRECTORY, filename)
        if os.path.isfile(filepath):
            fstats = os.stat(filepath)
            file_mtime = fstats.st_mtime
            if file_mtime <= last_mtime:
                continue

            # the run key should include mtime if we want to kick off new runs
            # based on file modifications
            run_key = f"{filename}:{str(file_mtime)}"
            run_config = {"solids": {"process_file": {"config": {"filename": filename}}}}
            yield RunRequest(run_key=run_key, run_config=run_config)
            max_mtime = max(max_mtime, file_mtime)

    context.update_cursor(str(max_mtime))
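# The cursor logic in my_directory_sensor_cursor can be exercised without a
# Dagster instance by driving the generator with a stand-in context. This
# FakeContext is purely illustrative, not a Dagster API:
class FakeContext:
    def __init__(self, cursor=None):
        self.cursor = cursor

    def update_cursor(self, cursor):
        self.cursor = cursor

ctx = FakeContext()
first = list(my_directory_sensor_cursor(ctx))   # all files on the first pass
second = list(my_directory_sensor_cursor(ctx))  # only files modified since the stored cursor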
def my_directory_sensor_with_skip_reasons(_context):
    has_files = False
    for filename in os.listdir(MY_DIRECTORY):
        filepath = os.path.join(MY_DIRECTORY, filename)
        if os.path.isfile(filepath):
            yield RunRequest(
                run_key=filename,
                run_config={
                    "solids": {"process_file": {"config": {"filename": filename}}}
                },
            )
            has_files = True
    if not has_files:
        yield SkipReason(f"No files found in {MY_DIRECTORY}.")
def multi_asset_sensor(context):
    cursor_dict = json.loads(context.cursor) if context.cursor else {}
    a_cursor = cursor_dict.get("a")
    b_cursor = cursor_dict.get("b")

    a_event_records = context.instance.get_event_records(
        EventRecordsFilter(
            event_type=DagsterEventType.ASSET_MATERIALIZATION,
            asset_key=AssetKey("table_a"),
            after_cursor=a_cursor,
        ),
        ascending=False,
        limit=1,
    )
    # The second query watches table_b (not table_a) with its own cursor.
    b_event_records = context.instance.get_event_records(
        EventRecordsFilter(
            event_type=DagsterEventType.ASSET_MATERIALIZATION,
            asset_key=AssetKey("table_b"),
            after_cursor=b_cursor,
        ),
        ascending=False,
        limit=1,
    )

    if not a_event_records or not b_event_records:
        return

    # make sure we only generate events if both table_a and table_b have been
    # materialized since the last evaluation.
    yield RunRequest(run_key=None)

    # update the sensor cursor by combining the individual event cursors from
    # the two separate asset event streams
    context.update_cursor(
        json.dumps(
            {
                "a": a_event_records[0].storage_id,
                "b": b_event_records[0].storage_id,
            }
        )
    )
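# Imports assumed by multi_asset_sensor above (and by the similar
# hn_tables_updated_sensor below); all of these names are exported from
# dagster's top-level package in the versions these snippets target:
import json

from dagster import AssetKey, DagsterEventType, EventRecordsFilter, RunRequest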
def toy_asset_sensor(context):
    with context.get_instance() as instance:
        events = instance.events_for_asset_key(
            AssetKey(["model"]), after_cursor=context.last_run_key, ascending=False, limit=1
        )

        if not events:
            return

        record_id, event = events[0]  # take the most recent materialization
        from_pipeline = event.pipeline_name

        yield RunRequest(
            run_key=str(record_id),
            run_config={
                "solids": {
                    "read_materialization": {
                        "config": {"asset_key": ["model"], "pipeline": from_pipeline}
                    }
                }
            },
        )
def my_directory_sensor_cursor(context):
    # Default to 0 (not None) so the mtime comparison below is always valid.
    last_mtime = parse_run_key(context.last_run_key)[1] if context.last_run_key else 0
    for filename in os.listdir(MY_DIRECTORY):
        filepath = os.path.join(MY_DIRECTORY, filename)
        if os.path.isfile(filepath):
            fstats = os.stat(filepath)
            file_mtime = fstats.st_mtime
            if file_mtime > last_mtime:
                # the run key should include mtime if we want to kick off new
                # runs based on file modifications
                run_key = build_run_key(filename, file_mtime)
                run_config = {
                    "solids": {"process_file": {"config": {"filename": filename}}}
                }
                yield RunRequest(run_key=run_key, run_config=run_config)
def hn_tables_updated_sensor(context):
    cursor_dict = json.loads(context.cursor) if context.cursor else {}
    comments_cursor = cursor_dict.get("comments")
    stories_cursor = cursor_dict.get("stories")

    comments_event_records = context.instance.get_event_records(
        EventRecordsFilter(
            event_type=DagsterEventType.ASSET_MATERIALIZATION,
            asset_key=AssetKey(["snowflake", "hackernews", "comments"]),
            after_cursor=comments_cursor,
        ),
        ascending=False,
        limit=1,
    )
    stories_event_records = context.instance.get_event_records(
        EventRecordsFilter(
            event_type=DagsterEventType.ASSET_MATERIALIZATION,
            asset_key=AssetKey(["snowflake", "hackernews", "stories"]),
            after_cursor=stories_cursor,
        ),
        ascending=False,
        limit=1,
    )

    if not comments_event_records or not stories_event_records:
        return

    # make sure we only generate events if both the comments and stories tables
    # have been materialized since the last evaluation.
    yield RunRequest(run_key=None)

    # update the sensor cursor by combining the individual event cursors from
    # the two separate asset event streams
    context.update_cursor(
        json.dumps(
            {
                "comments": comments_event_records[0].storage_id,
                "stories": stories_event_records[0].storage_id,
            }
        )
    )
def sensor_B():
    yield RunRequest(run_key=None, run_config={})
def materialized_views_update_sensor(context: SensorExecutionContext):
    """Sensor for updating materialized views on file changes.

    For every new or modified file, the pipeline `update_managed_materialized_views`
    is triggered. This ensures BQ materialized views are always up-to-date.
    """
    # Store largest mtime
    largest_mtime = 0

    # Store deleted and modified blobs
    deleted_blobs = []
    modified_blobs = []

    # Get connection to Redis
    rp = RedisPal(host=constants.REDIS_HOST.value)

    # Get list of blobs in bucket
    blobs_list = get_list_of_blobs(MATERIALIZED_VIEWS_PREFIX, SENSOR_BUCKET)

    # Get previous set of blobs in Redis
    previous_blobs_set: set = rp.get(constants.REDIS_KEY_MAT_VIEWS_BLOBS_SET.value)

    # If there is no previous set, create it
    if not previous_blobs_set:
        rp.set(
            constants.REDIS_KEY_MAT_VIEWS_BLOBS_SET.value,
            set([b.name for b in blobs_list]),
        )
    # If there is a previous set, compare it to the current set
    else:
        deleted_blobs: set = previous_blobs_set - set([b.name for b in blobs_list])

    # Get previous run mtime
    previous_run_mtime = rp.get(constants.REDIS_KEY_MAT_VIEWS_LAST_RUN_MTIME.value)

    # If there is no previous run mtime, set modified blobs to the current blobs
    if not previous_run_mtime:
        modified_blobs = blobs_list
    # If there is a previous run mtime, compare it to the current list
    # and get modified files
    else:
        modified_blobs = filter_blobs_by_mtime(blobs_list, previous_run_mtime)

    # Update last run time
    largest_mtime = get_largest_blob_mtime(blobs_list)
    rp.set(constants.REDIS_KEY_MAT_VIEWS_LAST_RUN_MTIME.value, largest_mtime)

    # If there are modified or deleted files, trigger pipeline
    if modified_blobs or deleted_blobs:
        # Load run configuration and set inputs
        config: dict = read_config(
            Path(__file__).parent / "materialized_views_update.yaml"
        )
        config["solids"]["delete_managed_views"]["inputs"]["blob_names"]["value"] = list(
            deleted_blobs
        )
        config["solids"]["update_managed_views"]["inputs"]["blob_names"]["value"] = [
            b.name for b in modified_blobs
        ]

        # Set a run key
        run_key: str = build_run_key("update-managed-views", largest_mtime)

        # Yield a run request
        yield RunRequest(run_key=run_key, run_config=config)
    # If there are no modified or deleted files, skip the pipeline
    else:
        yield SkipReason(
            f"Modified files: {len(modified_blobs)}. Deleted files: {len(deleted_blobs)}"
        )
def sensor_to_test():
    yield RunRequest(
        run_key="foo",
        run_config={"solids": {"process_file": {"config": {"filename": "foo"}}}},
    )
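# Because the sensor bodies in this collection are plain generator functions,
# they can be unit-tested directly; the expected values below simply mirror
# the literals in sensor_to_test:
def test_sensor_to_test():
    run_requests = list(sensor_to_test())
    assert len(run_requests) == 1
    assert run_requests[0].run_key == "foo"
    assert run_requests[0].run_config == {
        "solids": {"process_file": {"config": {"filename": "foo"}}}
    }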
def addition_sensor(context):
    should_run = True
    if should_run:
        yield RunRequest(run_key=None, run_config={})
def always_on_sensor(_context):
    return RunRequest(run_key=None, run_config={}, tags={})
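# always_on_sensor requests a run on every evaluation (run_key=None disables
# run-key deduplication), so it is usually paired with a throttle at
# registration time. minimum_interval_seconds is a real @sensor argument;
# the pipeline name here is a placeholder assumption:
@sensor(pipeline_name="my_pipeline", minimum_interval_seconds=30)
def always_on_sensor_throttled(_context):
    return RunRequest(run_key=None, run_config={}, tags={})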
def basic_sensor_with_context(_arbitrary_context):
    return RunRequest(run_key=None, run_config={}, tags={})
def basic_sensor(_):
    return RunRequest(run_key=None, run_config={}, tags={})
def foo_schedule():
    return RunRequest(run_key=None, run_config=FOO_CONFIG, tags={"foo": "FOO"})
def foo_schedule(_context):
    yield RunRequest(run_key=None, run_config=FOO_CONFIG, tags={"foo": "FOO"})
def job2_sensor():
    should_run = True
    if should_run:
        yield RunRequest(run_key=None, run_config={})
def sensor_A(_context):
    yield RunRequest(run_key=None, run_config={})
def foo_schedule():
    yield RunRequest(run_key=None, run_config={}, tags={"foo": "FOO"})
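# Once decorated with @sensor (as in the registration sketches above), these
# definitions only run when loaded from a repository by Dagit / the sensor
# daemon. A minimal sketch, reusing the decorated wrappers defined earlier:
from dagster import repository

@repository
def example_repository():
    return [my_directory_sensor_registered, always_on_sensor_throttled]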