Beispiel #1
0
def my_directory_sensor():
    """Yield one run request for every regular file found in ``MY_DIRECTORY``.

    The bare file name serves as the run key and is also handed to the
    ``process_file`` solid as its ``filename`` config value.
    """
    for entry in os.listdir(MY_DIRECTORY):
        if not os.path.isfile(os.path.join(MY_DIRECTORY, entry)):
            # Anything that is not a regular file produces no run request.
            continue
        solids_config = {"process_file": {"config": {"filename": entry}}}
        yield RunRequest(run_key=entry, run_config={"solids": solids_config})
Beispiel #2
0
def isolated_run_request():
    """Yield a single run request keyed by a placeholder file name."""
    filename = "placeholder"

    # start_run_request_marker

    yield RunRequest(
        run_key=filename,
        run_config={"solids": {"process_file": {"config": {"filename": filename}}}},
    )
Beispiel #3
0
def gdelt_enhanced_articles_sensor(context, asset_event):
    """Request a run configuring the ``classify_protest_relevancy`` op.

    The text of the first metadata entry on the event's materialization is
    used both as the run key and as the op's ``asset_materialization_path``
    config value; ``asset_key`` is the path of the event's asset key.
    """
    dagster_event = asset_event.dagster_event
    materialization = dagster_event.event_specific_data.materialization
    materialization_path = materialization.metadata_entries[0].entry_data.text

    op_config = {
        "asset_key": dagster_event.asset_key.path,
        "asset_materialization_path": materialization_path,
    }
    yield RunRequest(
        run_key=materialization_path,
        run_config={"ops": {"classify_protest_relevancy": {"config": op_config}}},
    )
Beispiel #4
0
    def toy_s3_sensor(context):
        """Request one run per S3 key returned by ``get_s3_keys``.

        Keys are looked up starting from ``context.last_run_key``. When the
        lookup returns nothing a ``SkipReason`` is yielded instead.

        Raises:
            Exception: if the enclosing ``bucket`` value is falsy.
        """
        if not bucket:
            raise Exception(
                "S3 bucket not specified at environment variable `DAGSTER_TOY_SENSOR_S3_BUCKET`."
            )

        fresh_keys = get_s3_keys(bucket, since_key=context.last_run_key)
        if fresh_keys:
            for key in fresh_keys:
                solid_config = {"config": {"bucket": bucket, "s3_key": key}}
                yield RunRequest(
                    run_key=key,
                    run_config={"solids": {"read_s3_key": solid_config}},
                )
        else:
            yield SkipReason(f"No s3 updates found for bucket {bucket}.")
Beispiel #5
0
def gtfs_sensor(context):
    """Request one run per GTFS blob selected by modification time.

    The reference mtime is the second element of the parsed last run key, or
    0 when there is no previous run key. For each blob returned by
    ``filter_blobs_by_modification_time`` the run config is loaded from
    ``<dataset_id>/<table_id>.yaml`` next to this file and extended with the
    blob-specific solid inputs and resource config.
    """
    if context.last_run_key:
        last_mtime = parse_run_key(context.last_run_key)[1]
    else:
        last_mtime = 0

    staged_blobs = get_list_of_blobs(GTFS_BLOBS_PREFIX, SENSOR_BUCKET, mode="staging")
    for blob in filter_blobs_by_modification_time(staged_blobs, last_mtime, after=True):
        run_key = build_run_key(blob.name, time.mktime(blob.updated.timetuple()))

        # The first two non-empty path segments after the prefix are taken as
        # the dataset id and table id respectively.
        relative_path = blob.name.split(GTFS_BLOBS_PREFIX)[1]
        segments = [part for part in relative_path.split("/") if part != ""]
        dataset_id = segments[0]
        table_id = segments[1]

        config = read_config(Path(__file__).parent / f"{dataset_id}/{table_id}.yaml")

        # Each of these solids receives the blob name (under a solid-specific
        # input name) together with the bucket name.
        for solid_name, path_input in (
            ("save_blob_to_tempfile", "blob_path"),
            ("create_gtfs_version_partition", "original_filepath"),
            ("upload_blob_to_storage", "blob_path"),
        ):
            config["solids"][solid_name] = {
                "inputs": {
                    path_input: {"value": blob.name},
                    "bucket_name": {"value": SENSOR_BUCKET},
                }
            }

        config["resources"]["basedosdados_config"] = {
            "config": {"dataset_id": dataset_id, "table_id": table_id}
        }

        yield RunRequest(run_key=run_key, run_config=config)
def test_sensor_w_no_job():
    """Validating a run request on a sensor declared without a target must raise."""

    @sensor()
    def no_job_sensor():
        pass

    expected_message = (
        r".* Sensor evaluation function returned a RunRequest for a sensor lacking a "
        r"specified target .*"
    )
    with pytest.raises(Exception, match=expected_message):
        bare_request = RunRequest(run_key=None, run_config=None, tags=None)
        no_job_sensor.check_valid_run_requests([bare_request])
Beispiel #7
0
def my_directory_sensor_cursor(context):
    """Request a run for each file in ``MY_DIRECTORY`` newer than the cursor.

    The cursor stores a modification time as a string (0 when unset). Files
    whose mtime is not greater than it are skipped. After the scan the cursor
    is updated to the largest mtime seen, or left at its previous value when
    nothing newer was found.
    """
    last_mtime = float(context.cursor) if context.cursor else 0
    newest_mtime = last_mtime

    for filename in os.listdir(MY_DIRECTORY):
        filepath = os.path.join(MY_DIRECTORY, filename)
        if not os.path.isfile(filepath):
            continue

        file_mtime = os.stat(filepath).st_mtime
        if file_mtime <= last_mtime:
            continue

        # Including the mtime in the run key lets a later modification of the
        # same file kick off a new run.
        yield RunRequest(
            run_key=f"{filename}:{str(file_mtime)}",
            run_config={"solids": {"process_file": {"config": {"filename": filename}}}},
        )
        if file_mtime > newest_mtime:
            newest_mtime = file_mtime

    context.update_cursor(str(newest_mtime))
Beispiel #8
0
def my_directory_sensor_with_skip_reasons(_context):
    """Request a run per regular file in ``MY_DIRECTORY``, or explain the skip.

    When the directory holds no regular files a ``SkipReason`` is yielded
    instead of any run request.
    """
    found_any = False
    for entry in os.listdir(MY_DIRECTORY):
        if not os.path.isfile(os.path.join(MY_DIRECTORY, entry)):
            continue
        solids_config = {"process_file": {"config": {"filename": entry}}}
        yield RunRequest(run_key=entry, run_config={"solids": solids_config})
        found_any = True

    if found_any:
        return
    yield SkipReason(f"No files found in {MY_DIRECTORY}.")
Beispiel #9
0
def multi_asset_sensor(context):
    """Request a run once both ``table_a`` and ``table_b`` have new materializations.

    The sensor cursor is a JSON object with keys ``"a"`` and ``"b"`` holding
    the storage id of the last materialization record consumed for each
    asset. The most recent materialization after each cursor is fetched; if
    either asset has none, the evaluation ends without a run request and the
    cursor is left untouched.
    """
    cursor_dict = json.loads(context.cursor) if context.cursor else {}
    a_cursor = cursor_dict.get("a")
    b_cursor = cursor_dict.get("b")

    a_event_records = context.instance.get_event_records(
        EventRecordsFilter(
            event_type=DagsterEventType.ASSET_MATERIALIZATION,
            asset_key=AssetKey("table_a"),
            after_cursor=a_cursor,
        ),
        ascending=False,
        limit=1,
    )
    # This query must target table_b: filtering on table_a a second time would
    # make the sensor fire without table_b ever having been materialized.
    b_event_records = context.instance.get_event_records(
        EventRecordsFilter(
            event_type=DagsterEventType.ASSET_MATERIALIZATION,
            asset_key=AssetKey("table_b"),
            after_cursor=b_cursor,
        ),
        ascending=False,
        limit=1,
    )

    # Only generate a run if both table_a and table_b have been materialized
    # since the last evaluation.
    if not a_event_records or not b_event_records:
        return

    yield RunRequest(run_key=None)

    # Update the sensor cursor by combining the individual event cursors from
    # the two separate asset event streams.
    context.update_cursor(
        json.dumps(
            {
                "a": a_event_records[0].storage_id,
                "b": b_event_records[0].storage_id,
            }
        )
    )
Beispiel #10
0
    def toy_asset_sensor(context):
        """Request a run for the latest event recorded for the ``model`` asset.

        Events are queried after ``context.last_run_key``, newest first and
        limited to one. With no event nothing is yielded. Otherwise the record
        id becomes the run key and the event's pipeline name is passed to the
        ``read_materialization`` solid.
        """
        with context.get_instance() as instance:
            latest_events = instance.events_for_asset_key(
                AssetKey(["model"]), after_cursor=context.last_run_key, ascending=False, limit=1
            )
            if not latest_events:
                return

            # Only one event was requested, in descending order, so the first
            # entry is the most recent one.
            record_id, event = latest_events[0]
            solid_config = {"asset_key": ["model"], "pipeline": event.pipeline_name}

            yield RunRequest(
                run_key=str(record_id),
                run_config={"solids": {"read_materialization": {"config": solid_config}}},
            )
Beispiel #11
0
def my_directory_sensor_cursor(context):
    """Request a run for each file in ``MY_DIRECTORY`` modified since the last run.

    The reference modification time is the second element returned by
    ``parse_run_key`` for ``context.last_run_key``. When there is no previous
    run key it falls back to 0, so any file with a positive mtime qualifies.
    """
    # Fall back to 0 rather than None: comparing a float mtime against None
    # raises TypeError on Python 3, which broke the very first evaluation.
    # NOTE(review): assumes parse_run_key returns the mtime as a number - confirm.
    last_mtime = parse_run_key(context.last_run_key)[1] if context.last_run_key else 0

    for filename in os.listdir(MY_DIRECTORY):
        filepath = os.path.join(MY_DIRECTORY, filename)
        if not os.path.isfile(filepath):
            continue

        file_mtime = os.stat(filepath).st_mtime
        if file_mtime > last_mtime:
            # The run key includes the mtime so that a later modification of
            # the same file kicks off a new run.
            run_key = build_run_key(filename, file_mtime)
            # run_config is a plain dict; the previous trailing comma wrapped
            # it in a one-element tuple.
            run_config = {
                "solids": {
                    "process_file": {
                        "config": {
                            "filename": filename
                        }
                    }
                }
            }
            yield RunRequest(run_key=run_key, run_config=run_config)
Beispiel #12
0
    def hn_tables_updated_sensor(context):
        """Request a run once both the comments and stories assets have new materializations.

        The cursor is a JSON object with ``"comments"`` and ``"stories"`` keys
        holding the storage id of the last record consumed per asset. If
        either asset has no materialization after its cursor, nothing is
        yielded and the cursor is not changed.
        """

        def _latest_materialization(table_name, after_cursor):
            # Fetch at most one record - the newest materialization after the cursor.
            return context.instance.get_event_records(
                EventRecordsFilter(
                    event_type=DagsterEventType.ASSET_MATERIALIZATION,
                    asset_key=AssetKey(["snowflake", "hackernews", table_name]),
                    after_cursor=after_cursor,
                ),
                ascending=False,
                limit=1,
            )

        cursor_dict = json.loads(context.cursor) if context.cursor else {}
        comments_cursor = cursor_dict.get("comments")
        stories_cursor = cursor_dict.get("stories")

        comments_event_records = _latest_materialization("comments", comments_cursor)
        stories_event_records = _latest_materialization("stories", stories_cursor)

        # Only request a run when both tables have been materialized since the
        # last evaluation.
        if not (comments_event_records and stories_event_records):
            return

        yield RunRequest(run_key=None)

        # Combine the individual event cursors from the two asset event
        # streams into the single sensor cursor.
        new_cursor = {
            "comments": comments_event_records[0].storage_id,
            "stories": stories_event_records[0].storage_id,
        }
        context.update_cursor(json.dumps(new_cursor))
Beispiel #13
0
def sensor_B():
    """Yield a single run request with no run key and an empty run config."""
    request = RunRequest(run_key=None, run_config={})
    yield request
Beispiel #14
0
def materialized_views_update_sensor(context: SensorExecutionContext):
    """Trigger a materialized-views update when blobs were modified or deleted.

    Compares the blobs currently listed under ``MATERIALIZED_VIEWS_PREFIX``
    with state kept in Redis (a set of blob names and the last run mtime).
    When any blob counts as modified or deleted, a run request is yielded
    with the run config loaded from ``materialized_views_update.yaml`` and
    the blob names filled in; otherwise a ``SkipReason`` is yielded.
    """
    rp = RedisPal(host=constants.REDIS_HOST.value)
    blobs_list = get_list_of_blobs(MATERIALIZED_VIEWS_PREFIX, SENSOR_BUCKET)

    # Deleted blobs: names present in the stored set but missing from the
    # current listing. On the first evaluation there is no stored set, so the
    # current names are stored and nothing counts as deleted.
    deleted_blobs = []
    previous_blobs_set = rp.get(constants.REDIS_KEY_MAT_VIEWS_BLOBS_SET.value)
    current_names = {blob.name for blob in blobs_list}
    if previous_blobs_set:
        deleted_blobs = previous_blobs_set - current_names
    else:
        rp.set(constants.REDIS_KEY_MAT_VIEWS_BLOBS_SET.value, current_names)

    # Modified blobs: every blob when no previous run mtime is stored,
    # otherwise whatever filter_blobs_by_mtime selects for that mtime.
    previous_run_mtime = rp.get(constants.REDIS_KEY_MAT_VIEWS_LAST_RUN_MTIME.value)
    if previous_run_mtime:
        modified_blobs = filter_blobs_by_mtime(blobs_list, previous_run_mtime)
    else:
        modified_blobs = blobs_list

    # Store the largest mtime of the current listing as the new reference.
    largest_mtime = get_largest_blob_mtime(blobs_list)
    rp.set(constants.REDIS_KEY_MAT_VIEWS_LAST_RUN_MTIME.value, largest_mtime)

    # Nothing changed: skip the pipeline.
    if not (modified_blobs or deleted_blobs):
        yield SkipReason(
            f"Modified files: {len(modified_blobs)}. Deleted files: {len(deleted_blobs)}"
        )
        return

    # Load the run configuration and fill in the solid inputs.
    config = read_config(Path(__file__).parent / "materialized_views_update.yaml")
    solids = config["solids"]
    solids["delete_managed_views"]["inputs"]["blob_names"]["value"] = list(deleted_blobs)
    solids["update_managed_views"]["inputs"]["blob_names"]["value"] = [
        blob.name for blob in modified_blobs
    ]

    yield RunRequest(
        run_key=build_run_key("update-managed-views", largest_mtime),
        run_config=config,
    )
Beispiel #15
0
def sensor_to_test():
    """Yield one run request keyed ``foo`` that configures ``process_file`` with that file name."""
    process_file_config = {"config": {"filename": "foo"}}
    yield RunRequest(
        run_key="foo",
        run_config={"solids": {"process_file": process_file_config}},
    )
Beispiel #16
0
def addition_sensor(context):
    """Yield a run request with an empty run config while ``should_run`` is true."""
    should_run = True
    if not should_run:
        return
    yield RunRequest(run_key=None, run_config={})
Beispiel #17
0
def always_on_sensor(_context):
    """Return a run request with no run key, an empty run config and empty tags."""
    request = RunRequest(run_key=None, run_config={}, tags={})
    return request
Beispiel #18
0
 def basic_sensor_with_context(_arbitrary_context):
     """Return a run request with no run key, an empty run config and empty tags."""
     request = RunRequest(run_key=None, run_config={}, tags={})
     return request
Beispiel #19
0
 def basic_sensor(_):
     """Return a run request with no run key, an empty run config and empty tags."""
     request = RunRequest(run_key=None, run_config={}, tags={})
     return request
Beispiel #20
0
 def foo_schedule():
     """Return a run request using ``FOO_CONFIG`` and tagged ``foo=FOO``."""
     request = RunRequest(
         run_key=None,
         run_config=FOO_CONFIG,
         tags={"foo": "FOO"},
     )
     return request
Beispiel #21
0
 def foo_schedule(_context):
     """Yield a run request using ``FOO_CONFIG`` and tagged ``foo=FOO``."""
     request = RunRequest(
         run_key=None,
         run_config=FOO_CONFIG,
         tags={"foo": "FOO"},
     )
     yield request
def job2_sensor():
    """Yield a run request with an empty run config while ``should_run`` is true."""
    should_run = True
    if not should_run:
        return
    yield RunRequest(run_key=None, run_config={})
Beispiel #23
0
def sensor_A(_context):
    """Yield a single run request with no run key and an empty run config."""
    request = RunRequest(run_key=None, run_config={})
    yield request
Beispiel #24
0
 def foo_schedule():
     """Yield a run request with an empty run config, tagged ``foo=FOO``."""
     request = RunRequest(
         run_key=None,
         run_config={},
         tags={"foo": "FOO"},
     )
     yield request