def asset_metadata(_context, model_info):
    """Collect a small data sample and the row count for a model's table.

    The connection settings are a copy of ``SHARED_SNOWFLAKE_CONF`` with the
    ``schema`` entry replaced by ``model_info["schema"]``; the shared config
    object itself is not modified.

    Args:
        _context: Unused.
        model_info: Mapping read for its ``"schema"`` and ``"name"`` entries.

    Returns:
        A dict with ``"Data sample"`` (the first five rows rendered as a
        markdown metadata entry) and ``"Rows"`` (first column of the
        ``COUNT(*)`` result row).
    """
    table = model_info["name"]
    snowflake_config = dict(SHARED_SNOWFLAKE_CONF, schema=model_info["schema"])

    with connect_snowflake(config=snowflake_config) as con:
        sample = pd.read_sql(f"SELECT * FROM {table} LIMIT 5", con=con)
        count_row = con.execute(f"SELECT COUNT(*) FROM {table}").fetchone()

    sample_markdown = sample.to_markdown()
    return {
        "Data sample": EventMetadata.md(sample_markdown),
        "Rows": count_row[0],
    }
def many_table_materializations(_context):
    """Yield one ``AssetMaterialization`` per entry in ``raw_tables``.

    Every event uses the asset key ``"table_info"`` and carries the same set
    of metadata entries: the table name, a path, a nested dict, a URL, the
    markdown text read from ``MARKDOWN_EXAMPLE``, a very large integer and a
    NaN float.

    Args:
        _context: Unused.
    """
    markdown_path = file_relative_path(__file__, MARKDOWN_EXAMPLE)
    with open(markdown_path, "r") as markdown_file:
        blurb = markdown_file.read()
        for table_name in raw_tables:
            table_metadata = {
                "table_name": table_name,
                "table_path": EventMetadata.path(f"/path/to/{table_name}"),
                "table_data": {"name": table_name},
                "table_name_big": EventMetadata.url(f"https://bigty.pe/{table_name}"),
                "table_blurb": EventMetadata.md(blurb),
                "big_int": 29119888133298982934829348,
                "float_nan": float("nan"),
            }
            yield AssetMaterialization(
                asset_key="table_info",
                metadata=table_metadata,
            )
def build_component_top_stories(model: TruncatedSVD, user_story_matrix: IndexedCooMatrix, story_titles: DataFrame):
    """
    List the titles of the highest-weighted stories for every model component.

    For each row of ``model.components_`` the ten largest entries are taken in
    descending order, mapped to story ids through
    ``user_story_matrix.col_index``, and looked up in ``story_titles`` (indexed
    by its ``"id"`` column). The result is a two-column frame
    (``component_index``, ``title``) yielded as an ``Output`` whose metadata
    holds a markdown rendering of that frame.
    """
    top_k = 10
    titles_by_id = story_titles.set_index("id")

    component_indices = []
    story_title_values = []
    for component_idx, weights in enumerate(model.components_[: model.components_.shape[0]]):
        # argsort is ascending, so take the tail and reverse it for largest-first.
        ranked = weights.argsort()[-top_k:][::-1]
        story_ids = user_story_matrix.col_index[ranked]
        matched_titles = list(titles_by_id.loc[story_ids]["title"])
        component_indices.extend([component_idx] * len(matched_titles))
        story_title_values.extend(matched_titles)

    component_top_stories = DataFrame(
        {
            "component_index": Series(component_indices),
            "title": Series(story_title_values),
        }
    )
    rendered = top_components_to_markdown(component_top_stories)

    yield Output(
        component_top_stories,
        metadata={"Top component top stories": EventMetadata.md(rendered)},
    )
def metadata_for_actions(df):
    """Summarize an actions frame as a metadata dict.

    Args:
        df: Frame with a ``"score"`` column.

    Returns:
        A dict with the minimum and maximum score (both cast to ``int``) and
        the first five rows rendered as a markdown metadata entry.
    """
    scores = df["score"]
    lowest = int(scores.min())
    highest = int(scores.max())
    preview = df[:5].to_markdown()
    return {
        "min_score": lowest,
        "max_score": highest,
        "sample rows": EventMetadata.md(preview),
    }
def daily_top_action(_, df1, df2):
    """Return the single highest-scoring row across two frames.

    Args:
        _: Unused.
        df1: First frame; must have a ``"score"`` column.
        df2: Second frame; must have a ``"score"`` column.

    Returns:
        An ``Output`` wrapping the one-row frame, with that frame rendered as
        markdown under the ``"data"`` metadata key.
    """
    combined = pd.concat([df1, df2])
    top_row = combined.nlargest(1, "score")
    rendered = EventMetadata.md(top_row.to_markdown())
    return Output(top_row, metadata={"data": rendered})
def _best_n_actions(_, df):
    """Return the ``n`` highest-scoring rows of ``df``.

    ``n`` is not a parameter; it is resolved from the enclosing scope.

    Args:
        _: Unused.
        df: Frame with a ``"score"`` column.

    Returns:
        An ``Output`` wrapping the selected rows, with those rows rendered as
        markdown under the ``"data"`` metadata key.
    """
    best = df.nlargest(n, "score")
    rendered = EventMetadata.md(best.to_markdown())
    return Output(best, metadata={"data": rendered})