Example 1
def the_solid(_context):
    yield AssetMaterialization(
        asset_key="foo",
        metadata={
            "text": "FOO",
            "int": 22,
            "url": EventMetadata.url("http://fake.com"),
            "float": 0.1,
            "python": EventMetadata.python_artifact(EventMetadata),
        },
    )
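These fragments appear to be Dagster snippets with their decorators and imports stripped. A minimal sketch of the boilerplate they seem to assume, using the legacy Dagster API that exposed EventMetadata (the decorator name is an assumption inferred from the function names, not something stated in the source):

# Assumed boilerplate for the fragments in this section (legacy Dagster API).
from dagster import AssetMaterialization, EventMetadata, Output, solid

@solid  # assumption: each fragment was originally a decorated solid/op
def example_solid(_context):
    yield AssetMaterialization(
        asset_key="foo",
        metadata={"url": EventMetadata.url("http://fake.com")},
    )
    yield Output(None)  # satisfy the solid's default output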
Example 2
def my_metadata_materialization_solid(context):
    df = read_df()
    remote_storage_path = persist_to_storage(df)
    yield AssetMaterialization(
        asset_key="my_dataset",
        description="Persisted result to storage",
        metadata={
            "text_metadata": "Text-based metadata for this event",
            "path": EventMetadata.path(remote_storage_path),
            "dashboard_url": EventMetadata.url("http://mycoolsite.com/url_for_my_data"),
            "size (bytes)": calculate_bytes(df),
        },
    )
    yield Output(remote_storage_path)
Example 3
def my_failure_metadata_solid():
    path = "/path/to/files"
    my_files = get_files(path)
    if len(my_files) == 0:
        raise Failure(
            description="No files to process",
            metadata={
                "filepath":
                EventMetadata.path(path),
                "dashboard_url":
                EventMetadata.url("http://mycoolsite.com/failures"),
            },
        )
    return some_calculation(my_files)
Example 4
def many_table_materializations(_context):
    with open(file_relative_path(__file__, MARKDOWN_EXAMPLE), "r") as f:
        md_str = f.read()
        for table in raw_tables:
            yield AssetMaterialization(
                asset_key="table_info",
                metadata={
                    "table_name": table,
                    "table_path": EventMetadata.path(f"/path/to/{table}"),
                    "table_data": {"name": table},
                    "table_name_big": EventMetadata.url(f"https://bigty.pe/{table}"),
                    "table_blurb": EventMetadata.md(md_str),
                    "big_int": 29119888133298982934829348,
                    "float_nan": float("nan"),
                },
            )
Example 5
def raw_file_op(_context):
    yield AssetMaterialization(
        asset_key="table_info",
        metadata={"table_path": EventMetadata.path("/path/to/{}.raw".format(name))},
    )
    yield do_expectation(_context, name)
    yield Output(name)
Example 6
def sort_by_calories(context, cereals):
    sorted_cereals = sorted(cereals,
                            key=lambda cereal: int(cereal["calories"]))
    least_caloric = sorted_cereals[0]["name"]
    most_caloric = sorted_cereals[-1]["name"]

    logger = get_dagster_logger()
    logger.info(f"Least caloric cereal: {least_caloric}")
    logger.info(f"Most caloric cereal: {most_caloric}")

    fieldnames = list(sorted_cereals[0].keys())
    sorted_cereals_csv_path = os.path.abspath(
        f"output/calories_sorted_{context.run_id}.csv")
    os.makedirs(os.path.dirname(sorted_cereals_csv_path), exist_ok=True)

    with open(sorted_cereals_csv_path, "w") as fd:
        writer = csv.DictWriter(fd, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_cereals)

    yield AssetMaterialization(
        asset_key="sorted_cereals_csv",
        description="Cereals data frame sorted by caloric content",
        metadata={
            "sorted_cereals_csv_path": EventMetadata.path(sorted_cereals_csv_path),
        },
    )
    yield Output(None)
Example 7
def observes_dataset_op(context):
    df = read_df()
    remote_storage_path = persist_to_storage(df)
    context.log_event(
        AssetObservation(
            asset_key="my_dataset",
            metadata={
                "text_metadata": "Text-based metadata for this event",
                "path": EventMetadata.path(remote_storage_path),
                "dashboard_url": EventMetadata.url("http://mycoolsite.com/url_for_my_data"),
                "size (bytes)": calculate_bytes(df),
            },
        ))
    context.log_event(AssetMaterialization(asset_key="my_dataset"))
    return remote_storage_path
Example 8
def read_s3_key(context):
    s3_key = context.solid_config["s3_key"]
    bucket = context.solid_config["bucket"]
    path = f"s3://{bucket}/{s3_key}"
    context.log.info(f"Found file {path}")
    yield AssetMaterialization(
        asset_key=AssetKey(["log_s3", path]),
        metadata={"S3 path": EventMetadata.url(path)},
    )
    yield Output(path)
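Because this solid reads `s3_key` and `bucket` from `context.solid_config`, a run would have to supply those values through run config. A hedged sketch of what that might look like under the legacy config schema (the solid name and values below are placeholders for illustration):

# Hypothetical run config for the solid above (legacy "solids" schema assumed).
run_config = {
    "solids": {
        "read_s3_key": {
            "config": {"bucket": "my-bucket", "s3_key": "path/to/data.csv"},
        },
    },
}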
Example 9
def my_asset_key_materialization_solid(context):
    df = read_df()
    remote_storage_path = persist_to_storage(df)
    yield AssetMaterialization(
        asset_key=AssetKey(["dashboard", "my_cool_site"]),
        description="Persisted result to storage",
        metadata={
            "dashboard_url": EventMetadata.url("http://mycoolsite.com/dashboard"),
            "size (bytes)": calculate_bytes(df),
        },
    )
    yield Output(remote_storage_path)
Example 10
def asset_metadata(_context, model_info):
    config = dict(SHARED_SNOWFLAKE_CONF)
    config["schema"] = model_info["schema"]
    with connect_snowflake(config=config) as con:
        df = pd.read_sql(f"SELECT * FROM {model_info['name']} LIMIT 5",
                         con=con)
        num_rows = con.execute(
            f"SELECT COUNT(*) FROM {model_info['name']}").fetchone()

    return {
        "Data sample": EventMetadata.md(df.to_markdown()),
        "Rows": num_rows[0]
    }
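This function only builds and returns the metadata dictionary; it does not emit an event itself. A hedged sketch of one way such a dict could be attached to a materialization, mirroring the other examples in this section (the asset key and wrapping function below are assumptions, not from the source):

# Illustrative only: attach the returned dict to a materialization event.
def materialize_model(context, model_info):
    yield AssetMaterialization(
        asset_key=model_info["name"],  # assumed key; not specified in the source
        metadata=asset_metadata(context, model_info),
    )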
Example 11
def my_metadata_output(context):
    df = get_some_data()
    yield Output(
        df,
        metadata={
            "text_metadata": "Text-based metadata for this event",
            "dashboard_url": EventMetadata.url("http://mycoolsite.com/url_for_my_data"),
            "raw_count": len(df),
            "size (bytes)": calculate_bytes(df),
        },
    )
Example 12
def my_metadata_expectation_solid(context, df):
    df = do_some_transform(df)
    yield ExpectationResult(
        success=len(df) > 0,
        description="ensure dataframe has rows",
        metadata={
            "text_metadata": "Text-based metadata for this event",
            "dashboard_url": EventMetadata.url("http://mycoolsite.com/url_for_my_data"),
            "raw_count": len(df),
            "size (bytes)": calculate_bytes(df),
        },
    )
    yield Output(df)
Example 13
def read_file(context):
    relative_filename = context.solid_config["filename"]
    directory = context.solid_config["directory"]
    filename = os.path.join(directory, relative_filename)
    try:
        fstats = os.stat(filename)
        context.log.info("Found file {}".format(relative_filename))
        yield AssetMaterialization(
            asset_key=AssetKey(["log_file", relative_filename]),
            metadata={
                "path": EventMetadata.path(filename),
                "File status": {
                    "size": fstats.st_size,
                    "ctime": fstats.st_ctime,
                    "mtime": fstats.st_mtime,
                },
            },
        )
        yield Output(relative_filename)
    except FileNotFoundError:
        context.log.error("No file found: {}".format(relative_filename))
Example 14
def build_component_top_stories(model: TruncatedSVD,
                                user_story_matrix: IndexedCooMatrix,
                                story_titles: DataFrame):
    """
    For each component in the collaborative filtering model, finds the titles of the top stories
    it's associated with.
    """
    n_stories = 10

    components_column = []
    titles_column = []

    story_titles = story_titles.set_index("id")

    for i in range(model.components_.shape[0]):
        component = model.components_[i]
        top_story_indices = component.argsort()[-n_stories:][::-1]
        top_story_ids = user_story_matrix.col_index[top_story_indices]
        top_story_titles = story_titles.loc[top_story_ids]

        for title in top_story_titles["title"]:
            components_column.append(i)
            titles_column.append(title)

    component_top_stories = DataFrame({
        "component_index": Series(components_column),
        "title": Series(titles_column),
    })

    yield Output(
        component_top_stories,
        metadata={
            "Top component top stories": EventMetadata.md(
                top_components_to_markdown(component_top_stories)
            ),
        },
    )
Example 15
def many_materializations_and_passing_expectations(_context):
    tables = [
        "users",
        "groups",
        "events",
        "friends",
        "pages",
        "fans",
        "event_admins",
        "group_admins",
    ]

    for table in tables:
        yield AssetMaterialization(
            asset_key="table_info",
            metadata={
                "table_path": EventMetadata.path(f"/path/to/{table}.raw"),
            },
        )
        yield ExpectationResult(
            success=True,
            label="{table}.row_count".format(table=table),
            description="Row count passed for {table}".format(table=table),
        )
Example 16
    def poll_run(
        self,
        run_id: int,
        poll_interval: float = DEFAULT_POLL_INTERVAL,
        poll_timeout: float = None,
        href: str = None,
    ) -> Dict[str, Any]:
        """
        Polls a dbt Cloud job run until it completes. Will raise a `dagster.Failure` exception if the
        run does not complete successfully.

        Args:
            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to
                the details page of your run in the dbt Cloud UI. It will be the final number in the
                URL, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``
            poll_interval (float): The time (in seconds) that should be waited between successive
                polls of the dbt Cloud API.
            poll_timeout (float): The maximum time (in seconds) that should be waited for this run
                to complete. If this threshold is exceeded, the run will be cancelled and an
                exception will be thrown. By default, this will poll forever.
            href (str): For internal use, generally should not be set manually.

        Returns:
            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.
                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.
        """

        if not href:
            href = self.get_run(run_id).get("href")

        poll_start = datetime.datetime.now()
        while True:
            run_details = self.get_run(run_id)
            status = run_details["status_humanized"]
            self._log.info(f"Polled run {run_id}. Status: [{status}]")

            # completed successfully
            if status == "Success":
                return self.get_run(run_id, include_related=["job", "trigger"])
            elif status in ["Error", "Cancelled"]:
                break
            elif status not in ["Queued", "Starting", "Running"]:
                check.failed(
                    f"Received unexpected status '{status}'. This should never happen"
                )

            if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(
                seconds=poll_timeout
            ):
                self.cancel_run(run_id)
                raise Failure(
                    f"Run {run_id} timed out after "
                    f"{datetime.datetime.now() - poll_start}. Attempted to cancel.",
                    metadata={"run_page_url": EventMetadata.url(href)},
                )

            # Sleep for the configured time interval before polling again.
            time.sleep(poll_interval)

        run_details = self.get_run(run_id, include_related=["trigger"])
        raise Failure(
            f"Run {run_id} failed. Status Message: {run_details['status_message']}",
            metadata={
                "run_details": EventMetadata.json(run_details),
                "run_page_url": EventMetadata.url(href),
            },
        )
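A hedged usage sketch for `poll_run`; the `client` object, run ID, and timeout values below are hypothetical placeholders rather than values from the source:

# Hypothetical usage: poll a dbt Cloud run every 10 seconds, for at most 10 minutes.
run_details = client.poll_run(run_id=12345, poll_interval=10, poll_timeout=600)
print(run_details["status_humanized"])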
Example 17
def metadata_for_actions(df):
    return {
        "min_score": int(df["score"].min()),
        "max_score": int(df["score"].max()),
        "sample rows": EventMetadata.md(df[:5].to_markdown()),
    }
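This helper only assembles a metadata dict; Examples 18 and 19 show the pattern it plugs into. A brief illustrative sketch of that usage (the wrapping function below is hypothetical):

# Illustrative: pass the helper's dict as Output metadata, as the next two examples do.
def top_action(_, df):
    best = df.nlargest(1, "score")
    return Output(best, metadata=metadata_for_actions(best))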
Example 18
def daily_top_action(_, df1, df2):
    df = pd.concat([df1, df2]).nlargest(1, "score")
    return Output(df, metadata={"data": EventMetadata.md(df.to_markdown())})
Example 19
def _best_n_actions(_, df):
    df = df.nlargest(n, "score")
    return Output(
        df,
        metadata={"data": EventMetadata.md(df.to_markdown())},
    )
Example 20
def handle_output(self, context, obj):
    super().handle_output(context, obj)
    # can pretend this actually came from a library call
    yield EventMetadataEntry(
        label="num rows written to db",
        description=None,
        entry_data=EventMetadata.int(len(obj)),
    )
Example 21
    def poll_sync(
        self,
        connector_id: str,
        initial_last_sync_completion: datetime.datetime,
        poll_interval: float = DEFAULT_POLL_INTERVAL,
        poll_timeout: float = None,
    ) -> Dict[str, Any]:
        """
        Given a Fivetran connector and the timestamp at which the previous sync completed, poll
        until the next sync completes.

        The previous sync completion time is necessary because the only way to tell when a sync
        completes is when this value changes.

        Args:
            connector_id (str): The Fivetran Connector ID. You can retrieve this value from the
                "Setup" tab of a given connector in the Fivetran UI.
            initial_last_sync_completion (datetime.datetime): The timestamp of the last completed sync
                (successful or otherwise) for this connector, prior to running this method.
            poll_interval (float): The time (in seconds) that will be waited between successive polls.
            poll_timeout (float): The maximum time (in seconds) that will be waited before this
                operation times out. By default, this will never time out.

        Returns:
            Dict[str, Any]: Parsed json data representing the API response.
        """
        poll_start = datetime.datetime.now()
        while True:
            (
                curr_last_sync_completion,
                curr_last_sync_succeeded,
                curr_sync_state,
            ) = self.get_connector_sync_status(connector_id)
            self._log.info(f"Polled '{connector_id}'. Status: [{curr_sync_state}]")

            if curr_last_sync_completion > initial_last_sync_completion:
                break

            if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(
                seconds=poll_timeout
            ):
                raise Failure(
                    f"Sync for connector '{connector_id}' timed out after {datetime.datetime.now() - poll_start}."
                )

            # Sleep for the configured time interval before polling again.
            time.sleep(poll_interval)

        connector_details = self.get_connector_details(connector_id)
        if not curr_last_sync_succeeded:
            raise Failure(
                f"Sync for connector '{connector_id}' failed!",
                metadata={
                    "connector_details": EventMetadata.json(connector_details),
                    "log_url": EventMetadata.url(get_fivetran_logs_url(connector_details)),
                },
            )
        return connector_details
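A hedged usage sketch for `poll_sync`; the `client` object and connector ID are hypothetical, and the initial completion timestamp is read via `get_connector_sync_status`, just as the method itself does:

# Hypothetical usage: capture the previous completion time, then wait for the next sync.
last_completion, _, _ = client.get_connector_sync_status("my_connector_id")
details = client.poll_sync("my_connector_id", last_completion, poll_timeout=900)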