# - [Issue reported on Altair's github](https://github.com/altair-viz/altair/issues/588) # - [`gpdvega` github](https://github.com/altair-viz/altair/issues/588) #%% safs_by_dist = combined.groupby('school_dist')[['lat', 'lon', 'saf_s_11']].mean() #%% nysd = gpd.read_file('shapes/nysd/nysd.shp') nysd = nysd.to_crs({'init': 'epsg:4326'}) safs_by_dist.index = pd.to_numeric(safs_by_dist.index) geo_safs_by_dist = nysd.join(safs_by_dist, how='inner') geo_safs_by_dist = geo_safs_by_dist.drop(['lat', 'lon'], axis=1) geo_safs_by_dist = geo_safs_by_dist.to_crs({'init': 'epsg:4326'}) #%% # gpdvega library enables using a GeoDataFrame directly with Altair alt.Chart(geo_safs_by_dist).mark_geoshape().encode( color=alt.Color('saf_s_11:Q', scale=alt.Scale(scheme='blues')), tooltip='SchoolDist:N').properties( width=500, height=500, title='Safety Score by NYC School District') #%% [markdown] # Darker shades show a higher score, while lighter shades show a lower score. We can see some parts of the Bronx and Queens that have higher scores. #%% [markdown] # ## Visualising Relation Between Ethnicity and Score #%% ethnicities = ['white_per', 'asian_per', 'black_per', 'hispanic_per'] eth_corr = pd.DataFrame( combined.corr()['sat_score'][ethnicities]).reset_index() eth_corr = eth_corr.rename(columns={ 'index': 'ethnicity_per', 'sat_score': 'sat_score_corr' }) alt.Chart(eth_corr).mark_bar().encode(
def inputs_graph():
    """Build a linked throttle/brake trace chart (one line per lap) with turn
    markers, and return the Vega-Lite spec as a JSON string.

    Reads lap telemetry and turn positions from the module-level ``ld`` helper.
    """
    alt.renderers.enable('mimetype')
    alt.data_transformers.disable_max_rows()
    width, height = 1400, 250

    laps = ld.getData()[["LapDist", "Throttle", "Brake", "Lap"]]
    turns = ld.GetTurns()

    # Shared interval selection so both stacked charts zoom together on x.
    zoom = alt.selection_interval(bind='scales', encodings=['x'])

    def pedal_trace(channel, scheme):
        # One line per lap for a single pedal channel, y clamped to [-0.2, 1.2]
        # so full-on/full-off traces don't sit on the chart border.
        return alt.Chart(laps).mark_line().encode(
            x='LapDist',
            y=alt.Y(channel, scale=alt.Scale(domain=[-0.2, 1.2])),
            color=alt.Color('Lap', scale=alt.Scale(scheme=scheme)),
        ).properties(width=width, height=height)

    # Vertical rules and rotated labels marking each turn along the lap.
    turn_marks = alt.Chart(turns).mark_rule().encode(
        x="LapDist").properties(width=width, height=height)
    turn_labels = alt.Chart(turns).mark_text(
        align="center", angle=90, dy=-7, dx=-100).encode(
        x="LapDist", text="Turn").properties(width=width, height=height)

    throttle = pedal_trace('Throttle', 'blues') + turn_marks + turn_labels
    brake = pedal_trace('Brake', 'reds') + turn_marks + turn_labels

    chart = throttle.add_selection(zoom) & brake.add_selection(zoom)
    chart = chart.resolve_scale(color='independent')
    return chart.to_json()
if __name__ == "__main__":
    teams = get_teams("https://en.wikipedia.org/wiki/2020_NBA_playoffs")

    # Collect Player objects for every team in the bracket.
    players = []
    for team in teams:
        print(team)
        players += get_players(team[0], team[1], 3)

    # One row per player, built from each Player object's dict form.
    data = pd.DataFrame([player.to_dict() for player in players])

    stats = ["ppg", "bpg", "rpg"]
    title = ["Points Per Game", "Blocks Per Game", "Rebound Per Game"]
    files = [""]

    # One horizontal bar chart per statistic, colored by team.
    for stat_key, stat_title in zip(stats, title):
        chart = alt.Chart(data).mark_bar().encode(
            y=alt.Y("name", sort="color", title="Player Name"),
            x=alt.X(stat_key + ":Q", title=stat_title),
            color=alt.Color("team:N", title="Team Name"))
        chart.save("NBA_player_statistics/players_over_" + stat_key + ".html")
# Simulate firing in 5-round bursts and chart the resulting cone of fire.
# NOTE(review): `fm` (fire mode) and `datapoints` are defined earlier in the
# file, outside this span — presumably `datapoints` already holds samples from
# other control schemes; confirm against the surrounding code.
burst_sim = fm.simulate_shots(
    shots=fm.max_consecutive_shots,
    auto_burst_length=5,
    control_time=5,
)
# Unpack each simulation step; only time and cone-of-fire size are kept,
# the underscore-prefixed fields are deliberately ignored.
for (
    t,
    _cursor_coor,
    _pellets_coors,
    cof,
    _vertical_recoil,
    _horizontal_recoil,
) in burst_sim:
    datapoints.append({
        "time": t,
        # Label encodes the effective delay between bursts for the legend.
        "control": f"5 burst + {fm.fire_timing.refire_time + 5}ms",
        "cof": cof,
    })
dataset = altair.Data(values=datapoints)
# Line chart of cone-of-fire over time, one colored line per control scheme.
chart = (altair.Chart(dataset).mark_line().encode(
    x="time:Q",
    y="cof:Q",
    color=altair.Color("control:O", scale=altair.Scale(scheme="dark2")),
    tooltip=["time:Q", "control:O"],
).properties(title="Cone of Fire", height=900, width=900).interactive())
chart.save("cof_simulation.html")
def show_time_series(title: str, df: pd.DataFrame, par: str, y_lab: str):
    """
    Render a time-series chart of parameter `par` into the Streamlit app,
    as a layered plot of semi-transparent points plus a line (raw or a
    rolling mean, depending on the moving-average setting).

    NOTE(review): this function references `self` (yax_min/yax_max,
    moving_average_days, plot_width/plot_height) without taking it as a
    parameter — it is presumably nested inside a class method or had its
    enclosing class stripped; confirm against the original file.

    Parameters:
    -----------
    :param title: chart title
    :param df: data with at least 'time', 'direction_id', 'site_name' and `par` columns
    :param par: name of the value column to plot
    :param y_lab: y-axis label
    :return: None (draws via st.altair_chart)
    """
    x_lab = ''
    df['time'] = pd.to_datetime(df['time'])
    min_dat = df['time'].min()
    max_dat = df['time'].max()
    # Axis tick format adapts to the span of the data.
    time_format = get_time_format(min_dat, max_dat)
    # Equal min/max means "no explicit y-domain": let Altair autoscale.
    if self.yax_max == self.yax_min:
        scy = alt.Scale()
    else:
        scy = alt.Scale(domain=(self.yax_min, self.yax_max))
    if self.moving_average_days > 0:
        # Rolling-mean line computed in the Vega-Lite spec itself.
        line = alt.Chart(df, title=title).mark_line(
            point=False, clip=True
        ).transform_window(
            rolling_mean='mean({})'.format(par),
            frame=[
                -self.moving_average_days / 2, self.moving_average_days
            ]).encode(
            x=alt.X('time:T', axis=alt.Axis(title=x_lab)),
            # https://github.com/d3/d3-time-format#locale_format
            y=alt.Y('rolling_mean:Q', scale=scy, axis=alt.Axis(title=y_lab)),
            color=alt.Color(
                'direction_id', scale=alt.Scale(scheme=cn.color_schema)),
        )
    else:
        # Raw values, one line per direction.
        line = alt.Chart(df).mark_line(point=True, clip=True).encode(
            x=alt.X(f'time:T', axis=alt.Axis(title=x_lab, labelAngle=30, format=time_format)),
            y=alt.Y('{}:Q'.format(par), scale=scy, axis=alt.Axis(title=y_lab)),
            color=alt.Color('direction_id', scale=alt.Scale(scheme=cn.color_schema)),
        )
    # Faint markers carry the tooltip so individual samples stay inspectable.
    points = alt.Chart(df).mark_point().encode(
        x=alt.X('time:T', axis=alt.Axis(title=x_lab)),
        y=alt.Y('{}:Q'.format(par), scale=scy, axis=alt.Axis(title=y_lab)),
        color=alt.Color('direction_id', scale=alt.Scale(scheme=cn.color_schema)),
        tooltip=['site_name', 'direction_id', 'time', par],
        opacity=alt.value(0.3))
    chart = (points + line).properties(width=self.plot_width,
                                       height=self.plot_height, title=title)
    st.altair_chart(chart)
def result_heatmap(data, result="win", title=None, width=500, height=500):
    """
    Build an Altair heatmap of the probability of `result` for each
    combination of player hand total and dealer up card.

    Parameters
    ----------
    data : pd.DataFrame or list
        A player's hand history (a DataFrame, or a list of hand dicts).
    result : str
        One of "win", "loss", "push", "surrender".
    title : str, optional
        Chart title; defaults to "<Result> Percentage".
    width, height : int
        Chart dimensions in pixels.

    Returns
    -------
    alt.Chart
    """
    possible_results = ["win", "loss", "push", "surrender"]
    # Bug fix: the message previously omitted 'surrender' even though it is
    # an accepted value.
    assert result in possible_results, (
        "'result' must be 'win', 'loss', 'push', or 'surrender'"
    )
    if not title:
        title = f"{result.title()} Percentage"
    # convert data to a DataFrame if it's just a player's history list
    if isinstance(data, list):
        data = pd.DataFrame(data)
    # remove any hands where the dealer had blackjack or the player busted:
    # those outcomes don't depend on the player total / up card combination.
    sub_data = data[(data["dealer_blackjack"] == 0)
                    & (data["total"] <= 21)].copy()
    # result percentages for each (total, dealer up card) combination
    grouped_pct = sub_data.groupby(
        ["total", "dealer_up"]
    ).apply(results_pct, as_series=False)
    # unpack the tuple returned by the groupby function and rename columns
    grouped_pct = grouped_pct.apply(pd.Series)
    grouped_pct.columns = possible_results
    # reset index and sort so higher totals appear at the top of the heatmap
    pct_data = grouped_pct.reset_index().sort_values("total", ascending=False)
    # dynamically determine how the legend should be labeled: one tick per
    # 0.1 step spanning the observed probability range
    min_val = round(min(pct_data[possible_results].min()), 1)
    max_val = round(max(pct_data[possible_results].max()), 1)
    min_int = int(min_val * 10)
    max_int = int(max_val * 10)
    values = [
        round(x * 0.1, 1) for x in range(min_int, max_int + 1)
    ]
    # create altair heatmap
    chart = alt.Chart(
        pct_data, title=title, width=width, height=height
    ).mark_rect(binSpacing=1).encode(
        x=alt.X(
            "dealer_up:O",
            axis=alt.Axis(orient="top", labelAngle=0),
            title="Dealer Up Card"
        ),
        y=alt.Y(
            "total:O",
            title="Player Total",
            sort=alt.EncodingSortField(op="mean", order="descending")
        ),
        color=alt.Color(
            f"{result}:Q",
            legend=alt.Legend(
                title=f"{result.title()} Probability",
                values=values
            )
        ),
        tooltip=[
            alt.Tooltip("dealer_up", title="Dealer Up Card"),
            alt.Tooltip("total", title="Player Total"),
            alt.Tooltip(f"{result}",
                        title=f"{result.title()} Probability")
        ]
    )
    return chart
# In[7]: Name_of_Feat = st.selectbox("Feature", Types_of_Features) chart_df = Final_table_clean[[ 'Song Name', 'Album Name', 'Release Date', 'Popularity', f'{Name_of_Feat}']] import altair as alt feat_header = Name_of_Feat.capitalize() st.header(f'{feat_header}' " vs. Popularity") c = alt.Chart(chart_df).mark_circle().encode( alt.X('Popularity', scale=alt.Scale(zero=False)), y=f'{Name_of_Feat}', color=alt.Color('Popularity', scale=alt.Scale(zero=False)), size=alt.value(200), tooltip=['Popularity', f'{Name_of_Feat}', 'Song Name', 'Album Name']) st.altair_chart(c, use_container_width=True) st.header("Table of Groovy Song Attributes") st.table(chart_df) # In[8]: st.write("acousticness: Confidence measure from 0.0 to 1.0 on if a track is acoustic.") st.write("energy: Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.") st.write("instrumentalness: Predicts whether a track contains no vocals. “Ooh” and “aah” sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly “vocal”. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0.") st.write("liveness: Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live.")
def confirmed():
    """Plot daily and cumulative confirmed COVID-19 cases (FHI data) and
    save the chart to graphs/confirmed.png."""
    data = "data/confirmed.csv"
    filename = "graphs/confirmed.png"
    # Remove any stale render so the save below always produces a fresh file.
    if os.path.exists(filename):
        os.remove(filename)
    df = pd.read_csv(data)
    df["date"] = pd.to_datetime(df["date"])
    df = df.loc[df["source"] == "fhi:git"]
    # 7-day moving average, shifted one day so each point uses prior days only.
    df["new_sma7"] = df.new.rolling(window=7).mean().shift()
    # Long format: one (date, category, value) row per series for Altair.
    df = df.melt(
        id_vars=["date"],
        value_vars=["new", "new_sma7", "total"],
        var_name="category",
        value_name="value",
    ).dropna()
    rename = {"new": "New cases", "new_sma7": "Avg 7 d.", "total": "Cumulative"}
    df["category"] = df["category"].replace(rename)
    # Shared x-axis definition for all three layers.
    base = alt.Chart(
        df,
        title="Number of reported COVID-19 cases by specimen collection date (Source: FHI)",
    ).encode(alt.X("yearmonthdate(date):O", axis=alt.Axis(title=None, labelAngle=-40)))
    bar = (
        base.transform_filter(alt.datum.category == "New cases")
        .mark_bar(color="#FFD1D1")
        .encode(y=alt.Y("value:Q", axis=alt.Axis(title="New per day", grid=True)))
    )
    # Cumulative line carries the color encoding so the legend lists all
    # three series even though each layer draws only one of them.
    line = (
        base.transform_filter(alt.datum.category == "Cumulative")
        .mark_line(color="#2E507B", strokeWidth=3)
        .encode(
            y=alt.Y("value:Q", axis=alt.Axis(title="Cumulative")),
            color=alt.Color(
                "category:N",
                scale=alt.Scale(
                    domain=["New cases", "Avg 7 d.", "Cumulative"],
                    range=["#FFD1D1", "red", "#2E507B"],
                ),
                legend=alt.Legend(title=None),
            ),
        )
    )
    ma7 = (
        base.transform_filter(alt.datum.category == "Avg 7 d.")
        .mark_line(opacity=0.8)
        .encode(y=alt.Y("value:Q"), color=alt.Color("category:N"))
    )
    # Daily series and cumulative line use independent y scales.
    chart = (
        alt.layer(bar + ma7, line)
        .resolve_scale(y="independent")
        .properties(width=1200, height=600)
        .configure_legend(
            strokeColor="gray",
            fillColor="#FFFFFF",
            labelFontSize=12,
            symbolStrokeWidth=2,
            symbolSize=160,
            padding=6,
            cornerRadius=5,
            direction="horizontal",
            orient="none",
            legendX=480,
            legendY=655,
        )
    )
    chart.save(filename)
""" Seattle Weather Heatmap ----------------------- This example shows the 2010 daily high temperature (F) in Seattle, WA. """ # category: case studies import altair as alt from vega_datasets import data # Since the data is more than 5,000 rows we'll import it from a URL source = data.seattle_temps.url alt.Chart( source, title="2010 Daily High Temperature (F) in Seattle, WA").mark_rect().encode( x=alt.X('date:O', timeUnit='date'), y=alt.Y('date:O', timeUnit='month'), color=alt.Color('temp:Q', aggregate='max'), tooltip=[ alt.Tooltip('date:T', timeUnit='monthdate', title='Date'), alt.Tooltip('temp:Q', aggregate='max', title='Max Temp') ]).properties(width=600)
def vaccine_doses():
    """Plot cumulative first/second/third COVID-19 vaccine doses in Norway
    (FHI data) as overlapping areas and save to graphs/vaccine_doses.png."""
    data = "data/vaccine_doses.csv"
    filename = "graphs/vaccine_doses.png"
    # Remove any stale render so the save below always produces a fresh file.
    if os.path.exists(filename):
        os.remove(filename)
    df = pd.read_csv(data)
    df["date"] = pd.to_datetime(df["date"])
    df = df[df["granularity_geo"] == "nation"]
    # (Removed: a 7-day moving average of new_doses was computed here but
    # never plotted — melt() below only keeps the total_dose_* columns.)
    # Long format: one (date, category, value) row per dose series.
    df = df.melt(
        id_vars=["date"],
        value_vars=["total_dose_1", "total_dose_2", "total_dose_3"],
        var_name="category",
        value_name="value",
    ).dropna()
    rename = {
        "total_dose_1": "Dose 1",
        "total_dose_2": "Dose 2",
        "total_dose_3": "Dose 3",
    }
    df["category"] = df["category"].replace(rename)
    chart = (
        alt.Chart(
            df,
            title="Number of people who received their first, second and third dose of a COVID-19 vaccine in Norway (Source: FHI)",
        )
        .mark_area(line={}, opacity=0.3)
        .encode(
            x=alt.X("yearmonthdate(date):O", axis=alt.Axis(title=None, labelAngle=-40)),
            y=alt.Y(
                "value:Q",
                stack=None,  # overlap the areas instead of stacking them
                title="Number of people",
            ),
            color=alt.Color(
                "category:N",
                scale=alt.Scale(
                    domain=[
                        "Dose 1",
                        "Dose 2",
                        "Dose 3",
                    ],
                    # Bug fix: the second color was " #2ecc71" with a stray
                    # leading space, which is not a valid color string.
                    range=["#5dade2", "#2ecc71", "#006600"],
                ),
                legend=alt.Legend(title=None),
            ),
        )
        .properties(width=1200, height=600)
        .configure_legend(
            strokeColor="gray",
            fillColor="#FFFFFF",
            labelFontSize=12,
            symbolStrokeWidth=2,
            symbolSize=160,
            padding=6,
            cornerRadius=5,
            direction="horizontal",
            orient="none",
            legendX=380,
            legendY=660,
        )
    )
    chart.save(filename)
def tested_lab():
    """Plot daily tested persons (negative/positive stacked bars) with the
    share of positive tests as a line, and save to graphs/tested_lab.png."""
    data = "data/tested_lab.csv"
    filename = "graphs/tested_lab.png"
    # Remove any stale render so the save below always produces a fresh file.
    if os.path.exists(filename):
        os.remove(filename)
    df = pd.read_csv(data)
    mapping = {
        "new_neg": "New (Negative)",
        "new_pos": "New (Positive)",
        "new_total": "New",
        "pr100_pos": "Share Positive",
        "total": "Cumulative",
    }
    df = df.rename(columns=mapping)
    df["date"] = pd.to_datetime(df["date"])
    df["Share Negative"] = 100 - df["Share Positive"]
    # Keep "Share Positive" wide (for the line layer) while melting the
    # count columns into long format for the bar layer.
    df = df.melt(
        id_vars=["date", "Share Positive"], var_name="category", value_name="value"
    )
    base = alt.Chart(
        df,
        title="Number of tested persons per specimen collection date and number of positive results (Source: FHI)",
    ).encode(alt.X("yearmonthdate(date):O", axis=alt.Axis(title=None, labelAngle=-40)))
    andel = base.mark_line(color="red", opacity=0.8).encode(
        y=alt.Y("Share Positive:Q", title="% Positive", axis=alt.Axis(grid=True))
    )
    bar = (
        base.transform_filter(
            (alt.datum.category == "New (Negative)")
            | (alt.datum.category == "New (Positive)")
        )
        .mark_bar()
        .encode(
            y=alt.Y("value:Q", title="Number of persons"),
            color=alt.Color(
                "category:N",
                scale=alt.Scale(
                    # "% Positive" never occurs as a category value; it is
                    # included in the domain only to add a legend entry for
                    # the red line layer.
                    domain=["New (Positive)", "New (Negative)", "% Positive"],
                    range=["#FF9622", "#6DA9FF", "red"],
                ),
                legend=alt.Legend(title=None),
            ),
        )
    )
    # Counts and percentage share use independent y scales.
    chart = (
        alt.layer(bar, andel)
        .resolve_scale(y="independent")
        .properties(width=1200, height=600)
        .configure_legend(
            strokeColor="gray",
            fillColor="#FFFFFF",
            labelFontSize=12,
            symbolStrokeWidth=2,
            symbolSize=160,
            padding=6,
            cornerRadius=5,
            direction="horizontal",
            orient="none",
            legendX=480,
            legendY=655,
        )
    )
    chart.save(filename)
def smittestopp():
    """Plot Smittestopp app downloads (area) against infections reported
    through the app (bars) and save to graphs/smittestopp.png."""
    data = "data/smittestopp.csv"
    filename = "graphs/smittestopp.png"
    # Remove any stale render so the save below always produces a fresh file.
    if os.path.exists(filename):
        os.remove(filename)
    df = pd.read_csv(data)
    df["date"] = pd.to_datetime(df["date"])
    # Long format: one (date, category, value) row per series for Altair.
    df = df.melt(
        id_vars=["date"],
        value_vars=["new_reported", "total_downloads"],
        var_name="category",
        value_name="value",
    ).dropna()
    rename = {
        "new_reported": "Number of reported infections",
        "total_downloads": "Number of downloads",
    }
    df["category"] = df["category"].replace(rename)
    base = alt.Chart(
        df,
        # Bug fix: the title mixed in the Norwegian word "og" ("and").
        title="Number of downloads of Smittestopp and number of reported infections through the app (Source: FHI)",
    ).encode(alt.X("yearmonthdate(date):O", axis=alt.Axis(title=None, labelAngle=-40)))
    downloads = (
        base.transform_filter(alt.datum.category == "Number of downloads")
        .mark_area(line={}, color="#5BC1FF", opacity=0.2)
        .encode(
            y=alt.Y(
                "value:Q",
                axis=alt.Axis(title="Number of downloads", grid=True),
            )
        )
    )
    # Bars carry the color encoding so the legend lists both series.
    reported = (
        base.transform_filter(alt.datum.category == "Number of reported infections")
        .mark_bar(color="#FFA57E")
        .encode(
            y=alt.Y("value:Q", axis=alt.Axis(title="Number of reported infections")),
            color=alt.Color(
                "category:N",
                scale=alt.Scale(
                    domain=[
                        "Number of downloads",
                        "Number of reported infections",
                    ],
                    range=["#5BC1FF", "#FFA57E"],
                ),
                legend=alt.Legend(title=None),
            ),
        )
    )
    # Downloads and reported infections use independent y scales.
    chart = (
        alt.layer(reported, downloads)
        .resolve_scale(y="independent")
        .properties(width=1200, height=600)
        .configure_legend(
            strokeColor="gray",
            fillColor="#FFFFFF",
            labelFontSize=12,
            symbolStrokeWidth=2,
            symbolSize=160,
            labelLimit=200,
            padding=6,
            cornerRadius=5,
            direction="horizontal",
            orient="none",
            legendX=390,
            legendY=660,
        )
    )
    chart.save(filename)
def hospitalized():
    """Plot COVID-19 hospital admissions, ICU patients, and patients on a
    respirator (Helsedirektoratet data) and save to graphs/hospitalized.png."""
    data = "data/hospitalized.csv"
    filename = "graphs/hospitalized.png"
    # Remove any stale render so the save below always produces a fresh file.
    if os.path.exists(filename):
        os.remove(filename)
    df = pd.read_csv(data)
    today = date.today()
    # Reindex onto a continuous daily range so reporting gaps become rows,
    # then forward-fill: a missing day carries the last reported count.
    idx = pd.date_range("2020-03-08", today)
    df.index = pd.DatetimeIndex(df["date"])
    df = df.reindex(idx)
    df["date"] = df.index
    df = df.reset_index(drop=True)
    # .ffill() replaces the deprecated fillna(method="ffill").
    df["admissions"] = df["admissions"].ffill().astype(int)
    df["icu"] = df["icu"].ffill().astype(int)
    df["respiratory"] = df["respiratory"].ffill().astype(int)
    df_melt = pd.melt(
        df,
        id_vars=["date"],
        value_vars=["admissions", "icu", "respiratory"],
        value_name="value",
    ).replace(
        {
            "admissions": "Hospitalized",
            # Bug fix: was "Intensive", which did not match the color scale
            # domain below ("Intensive Care"), breaking the color/legend
            # mapping for the ICU series.
            "icu": "Intensive Care",
            "respiratory": "Respirator",
        }
    )
    chart = (
        alt.Chart(
            df_melt,
            title="Number of patients admitted to hospital with COVID-19 (Source: Helsedirektoratet)",
        )
        .mark_area(line={}, opacity=0.3)
        .encode(
            x=alt.X("yearmonthdate(date):O", axis=alt.Axis(title=None, labelAngle=-40)),
            y=alt.Y(
                "value:Q",
                stack=None,  # overlap the areas instead of stacking them
                title="Number of patients",
            ),
            color=alt.Color(
                "variable:N",
                scale=alt.Scale(
                    domain=["Hospitalized", "Intensive Care", "Respirator"],
                    range=["#5A9DFF", "#FF8B1B", "#FF642B"],
                ),
                legend=alt.Legend(title=None),
            ),
        )
        .properties(width=1200, height=600)
        .configure_legend(
            strokeColor="gray",
            fillColor="#FFFFFF",
            labelFontSize=12,
            symbolStrokeWidth=2,
            symbolSize=160,
            padding=6,
            cornerRadius=5,
            direction="horizontal",
            orient="none",
            legendX=480,
            legendY=655,
        )
    )
    chart.save(filename)
def dead():
    """Plot daily and cumulative COVID-19 deaths (FHI data) and save the
    chart to graphs/dead.png."""
    data = "data/dead.csv"
    filename = "graphs/dead.png"
    # Remove any stale render so the save below always produces a fresh file.
    if os.path.exists(filename):
        os.remove(filename)
    df = pd.read_csv(data)
    today = date.today()
    # Reindex onto a continuous daily range so reporting gaps become rows.
    idx = pd.date_range("2020-03-07", df["date"].max())
    df.index = pd.DatetimeIndex(df["date"])
    df = df.reindex(idx)
    df["date"] = df.index
    df = df.reset_index(drop=True)
    df = df[df.date <= str(today)]
    # Days with no report count as zero new deaths; cumulative totals are
    # back-filled. .bfill() replaces the deprecated fillna(method="bfill").
    df["new"] = df["new"].fillna(0).astype(int)
    df["total"] = df["total"].bfill().astype(int)
    df["new_sma7"] = df.new.rolling(window=7).mean()
    # Long format: one (date, category, value) row per series for Altair.
    df = df.melt(
        id_vars=["date"],
        value_vars=["new", "new_sma7", "total"],
        var_name="category",
        value_name="value",
    ).dropna()
    rename = {"new": "New", "new_sma7": "Avg 7 d.", "total": "Cumulative"}
    df["category"] = df["category"].replace(rename)
    base = alt.Chart(df, title="COVID-19 related deaths (Source: FHI)").encode(
        alt.X("yearmonthdate(date):O", axis=alt.Axis(title=None, labelAngle=-40))
    )
    bar = (
        base.transform_filter(alt.datum.category == "New")
        .mark_bar(color="#FFD1D1")
        .encode(y=alt.Y("value:Q", axis=alt.Axis(title="New per day", grid=True)))
    )
    # Cumulative line carries the color encoding so the legend lists all
    # three series even though each layer draws only one of them.
    line = (
        base.transform_filter(alt.datum.category == "Cumulative")
        .mark_line(color="#2E507B", strokeWidth=3)
        .encode(
            y=alt.Y("value:Q", axis=alt.Axis(title="Cumulative")),
            color=alt.Color(
                "category:N",
                scale=alt.Scale(
                    domain=["New", "Avg 7 d.", "Cumulative"],
                    range=["#FFD1D1", "red", "#2E507B"],
                ),
                legend=alt.Legend(title=None),
            ),
        )
    )
    ma7 = (
        base.transform_filter(alt.datum.category == "Avg 7 d.")
        .mark_line(opacity=0.8)
        .encode(y=alt.Y("value:Q"), color=alt.Color("category:N"))
    )
    # Daily series and cumulative line use independent y scales.
    chart = (
        alt.layer(bar + ma7, line)
        .resolve_scale(y="independent")
        .properties(width=1200, height=600)
        .configure_legend(
            strokeColor="gray",
            fillColor="#FFFFFF",
            labelFontSize=12,
            symbolStrokeWidth=2,
            symbolSize=160,
            padding=6,
            cornerRadius=5,
            direction="horizontal",
            orient="none",
            legendX=480,
            legendY=655,
        )
    )
    chart.save(filename)
chart = alt.layer(line, selectors, points, rules, text).properties(width=900, height=300) #Plot Altair 7 geographical analysis; ref : https://github.com/altair-viz/altair/issues/2044 import altair as alt from vega_datasets import data world_source = final_df source = alt.topo_feature(data.world_110m.url, "countries") background = alt.Chart(source).mark_geoshape(fill="white") foreground = (alt.Chart(source).mark_geoshape( stroke="black", strokeWidth=0.15).encode( color=alt.Color( "confirmed:N", scale=alt.Scale(scheme="redpurple"), legend=None, ), tooltip=[ alt.Tooltip("Country/Region:N", title="Country"), alt.Tooltip("confirmed:Q", title="confirmed cases"), ], ).transform_lookup( lookup="id", from_=alt.LookupData(world_source, "id", ["confirmed", "Country/Region"]), )) final_map = ((background + foreground).configure_view( strokeWidth=0).properties(width=700, height=400).project("naturalEarth1")) print(final_map)
def make_category_per_user_plots(infile):
    """Render per-user traffic composition charts (bytes per online day by
    traffic category) and save three PNGs under renders/.

    :param infile: parquet file of flows grouped by user and category, with
        bytes_up/bytes_down columns.
    """
    grouped_flows = infra.pd.read_parquet(infile)
    grouped_flows = grouped_flows.reset_index()
    grouped_flows["bytes_total"] = grouped_flows["bytes_up"] + grouped_flows[
        "bytes_down"]
    # Total bytes per (user, category) pair.
    user_category_total = grouped_flows[["user", "category", "bytes_total"
                                         ]].groupby(["user", "category"
                                                     ]).sum().reset_index()
    # Filter users by time in network to eliminate early incomplete samples
    user_active_ranges = infra.pd.read_parquet(
        "data/clean/user_active_deltas.parquet")[[
            "user", "days_since_first_active", "days_active", "days_online"
        ]]
    # Drop users that joined less than a week ago.
    users_to_analyze = user_active_ranges.loc[
        user_active_ranges["days_since_first_active"] > 7]
    # Drop users active for less than one day
    users_to_analyze = users_to_analyze.loc[
        users_to_analyze["days_active"] > 1, ]
    # Sort categories by total amount of bytes.
    cat_totals = grouped_flows.groupby("category").sum().reset_index()
    cat_sort_order = cat_totals.sort_values(
        "bytes_total", ascending=False).set_index("bytes_total").reset_index()
    # cat_rank 1 = category with the most total bytes.
    cat_sort_order["cat_rank"] = cat_sort_order["bytes_total"].rank(
        method="min", ascending=False)
    cat_sort_list = cat_sort_order["category"].tolist()
    # Rank users by their daily use.
    user_totals = user_category_total.groupby("user").sum().reset_index()
    # Inner merge also drops users excluded by the filters above.
    user_totals = user_totals.merge(users_to_analyze, on="user", how="inner")
    user_totals["user_total_bytes_avg_online_day"] = user_totals[
        "bytes_total"] / user_totals["days_online"]
    user_totals["user_rank"] = user_totals[
        "user_total_bytes_avg_online_day"].rank(method="min")
    user_category_total = user_category_total.merge(user_totals[[
        "user", "user_rank", "days_online", "user_total_bytes_avg_online_day"
    ]], on="user", how="inner")
    user_category_total = user_category_total.merge(
        cat_sort_order[["category", "cat_rank"]], on="category", how="inner")
    print(user_category_total)
    # Per-category average bytes per online day, and its share of the
    # user's overall average.
    user_category_total["bytes_avg_online_day"] = user_category_total[
        "bytes_total"] / user_category_total["days_online"]
    user_category_total["share_of_bytes_avg_online_day"] = \
        user_category_total["bytes_avg_online_day"] / user_category_total["user_total_bytes_avg_online_day"]
    print(user_category_total)
    # This might not be showing exactly what I want to show, since in merging
    # users some users that dominate video could be overrepresented. Maybe
    # want to merge on the fraction of traffic to each part from each user?
    # Are users counted equally or are bytes counted equally...
    # Chart 1: normalized stacked bars — each user's traffic mix by category.
    alt.Chart(user_category_total[[
        "category", "user_rank", "cat_rank", "bytes_avg_online_day"
    ]]).mark_bar().encode(
        x="user_rank:O",
        y=alt.Y(
            "bytes_avg_online_day",
            stack="normalize",
            sort=cat_sort_list,
        ),
        color=alt.Color(
            "category:N",
            scale=alt.Scale(scheme="tableau20"),
            sort=cat_sort_list,
        ),
        order=alt.Order(
            "cat_rank",
            sort="descending",
        ),
    ).properties(width=500, ).save(
        "renders/bytes_per_average_online_day_per_user_bar.png",
        scale_factor=2,
    )
    # Chart 2: absolute bytes per online day, one point per (user, category).
    alt.Chart(user_category_total[[
        "category", "user_rank", "cat_rank", "bytes_avg_online_day"
    ]]).mark_point(
        size=10,
        strokeWidth=2,
    ).encode(
        x="user_rank:O",
        y=alt.Y("bytes_avg_online_day",
                sort=cat_sort_list,
                title="average bytes per online day"),
        color=alt.Color(
            "category:N",
            scale=alt.Scale(scheme="tableau20"),
            sort=cat_sort_list,
        ),
        order=alt.Order(
            "cat_rank",
            sort="descending",
        ),
    ).properties(width=500, ).save(
        "renders/bytes_per_average_online_day_per_user_points.png",
        scale_factor=2,
    )
    # Chart 3: same points, but as a share of each user's total.
    alt.Chart(user_category_total[[
        "category", "user_rank", "cat_rank", "share_of_bytes_avg_online_day"
    ]]).mark_point(
        size=10,
        strokeWidth=2,
    ).encode(
        x="user_rank:O",
        y=alt.Y("share_of_bytes_avg_online_day",
                sort=cat_sort_list,
                title="share of average bytes per online day"),
        color=alt.Color(
            "category:N",
            scale=alt.Scale(scheme="tableau20"),
            sort=cat_sort_list,
        ),
        order=alt.Order(
            "cat_rank",
            sort="descending",
        ),
    ).properties(width=500, ).save(
        "renders/share_of_bytes_per_average_online_day_per_user_points.png",
        scale_factor=2,
    )
def outcome_bars(data, name=None, width=100):
    """
    Create a grouped bar chart showing the percentage of hands won, lost,
    pushed, and surrendered, one group of bars per game.

    Parameters
    ----------
    data : pd.DataFrame or list
        A single game's history (DataFrame, or list of hand dicts), or a
        list of such histories for comparing multiple games.
    name : list of str, optional
        One label per game; defaults to "Game0", "Game1", ...
    width : int
        Width of each result facet in pixels.

    Returns
    -------
    alt.Chart

    Raises
    ------
    TypeError
        If `data` is neither a DataFrame nor a list.
    """
    # if it's a dataframe already, it is a single game's history
    if isinstance(data, pd.DataFrame):
        data_list = [data]
    elif isinstance(data, list):
        # A list of dictionaries is one game's history; a list of
        # lists/DataFrames is one history per game.
        # NOTE(review): the original decided based on the last element's
        # type; `all` is equivalent for homogeneous input.
        if all(isinstance(item, dict) for item in data):
            data_list = [pd.DataFrame(data)]
        else:
            data_list = [pd.DataFrame(item) for item in data]
    else:
        msg = "'data' must be a DataFrame or list"
        raise TypeError(msg)

    # assign default names to games if not provided
    if not name:
        # Bug fix: label count now matches the number of games (was
        # len(data), which is the ROW count when `data` is a DataFrame).
        name = [f"Game{i}" for i in range(len(data_list))]

    plot_data_list = []  # list to hold rows that will be plotted
    for _name, _data in zip(name, data_list):
        win, loss, push, surrender = results_pct(_data, as_series=False)
        plot_data_list.append(
            {"game": _name, "result": "Win", "pct": win, "order": 1},
        )
        plot_data_list.append(
            {"game": _name, "result": "Loss", "pct": loss, "order": 2}
        )
        plot_data_list.append(
            {"game": _name, "result": "Push", "pct": push, "order": 3}
        )
        # Bug fix: Surrender previously duplicated Push's order value (3).
        plot_data_list.append(
            {"game": _name, "result": "Surrender", "pct": surrender, "order": 4}
        )
    plot_data = pd.DataFrame(plot_data_list)
    # create altair chart: one column per result, one bar per game
    chart = alt.Chart(plot_data, width=width).mark_bar().encode(
        x=alt.X(
            "game",
            axis=alt.Axis(labelAngle=-45),
            title=None,
            sort=["Win", "Loss", "Push"]
        ),
        y=alt.Y(
            "pct:Q"
        ),
        color=alt.Color(
            "game:O",
            legend=None
        ),
        column=alt.Column(
            "result:O",
            title="Result"
        ),
        tooltip=[
            alt.Tooltip("pct", title="Pct")
        ]
    )
    return chart
def make_plot():
    """Render bar charts of data-package purchases: how many times each
    package size was bought, and the total GB each size accounts for."""
    transactions = infra.pd.read_parquet(
        "data/clean/transactions_DIV_none_INDEX_timestamp.parquet")

    # Count purchases of each package size and derive MB/GB columns.
    purchase_rows = transactions.loc[transactions["kind"] == "purchase"]
    per_package = purchase_rows.groupby("amount_bytes")["timestamp"].count()
    per_package = per_package.reset_index().rename(
        {"timestamp": "count"}, axis="columns")
    per_package["amount_MB"] = per_package["amount_bytes"] * 1.0 / 1000**2
    per_package["total_GB"] = (
        per_package["amount_MB"] * per_package["count"] * 1.0 / 1000)
    print(per_package)

    # Chart 1: purchase count per package size, with the count written
    # beside each bar.
    count_bars = alt.Chart(per_package).mark_bar().encode(
        x=alt.X('count', title="Count"),
        y=alt.Y('amount_MB', type="ordinal", title="Package Type (MB)"),
        color=alt.Color('amount_MB:N', legend=None))
    count_labels = count_bars.mark_text(
        align="left",
        baseline="middle",
        xOffset=5,
    ).encode(
        text="count:Q",
        color=alt.value("black"),
    )
    (count_labels + count_bars).properties(
        width=500,
        height=75,
    ).save(
        "renders/package_counts.png",
        scale_factor=2,
    )

    # Chart 2: total gigabytes purchased per package size.
    alt.Chart(per_package).mark_bar().encode(
        x=alt.X('total_GB', title="Total GB Purchased"),
        y=alt.Y('amount_MB', type="ordinal", title="Package Type (MB)"),
        color=alt.Color('amount_MB:N', legend=None),
    ).properties(
        width=500,
        height=75,
    ).save(
        "renders/package_bytes.png",
        scale_factor=2,
    )
'col': 2 }, { 'country': 'United States', 'animal': 'sheep', 'col': 1 }]) domains = ['person', 'cattle', 'pigs', 'sheep'] shape_scale = alt.Scale( domain=domains, range=[ 'M1.7 -1.7h-0.8c0.3 -0.2 0.6 -0.5 0.6 -0.9c0 -0.6 -0.4 -1 -1 -1c-0.6 0 -1 0.4 -1 1c0 0.4 0.2 0.7 0.6 0.9h-0.8c-0.4 0 -0.7 0.3 -0.7 0.6v1.9c0 0.3 0.3 0.6 0.6 0.6h0.2c0 0 0 0.1 0 0.1v1.9c0 0.3 0.2 0.6 0.3 0.6h1.3c0.2 0 0.3 -0.3 0.3 -0.6v-1.8c0 0 0 -0.1 0 -0.1h0.2c0.3 0 0.6 -0.3 0.6 -0.6v-2c0.2 -0.3 -0.1 -0.6 -0.4 -0.6z', 'M4 -2c0 0 0.9 -0.7 1.1 -0.8c0.1 -0.1 -0.1 0.5 -0.3 0.7c-0.2 0.2 1.1 1.1 1.1 1.2c0 0.2 -0.2 0.8 -0.4 0.7c-0.1 0 -0.8 -0.3 -1.3 -0.2c-0.5 0.1 -1.3 1.6 -1.5 2c-0.3 0.4 -0.6 0.4 -0.6 0.4c0 0.1 0.3 1.7 0.4 1.8c0.1 0.1 -0.4 0.1 -0.5 0c0 0 -0.6 -1.9 -0.6 -1.9c-0.1 0 -0.3 -0.1 -0.3 -0.1c0 0.1 -0.5 1.4 -0.4 1.6c0.1 0.2 0.1 0.3 0.1 0.3c0 0 -0.4 0 -0.4 0c0 0 -0.2 -0.1 -0.1 -0.3c0 -0.2 0.3 -1.7 0.3 -1.7c0 0 -2.8 -0.9 -2.9 -0.8c-0.2 0.1 -0.4 0.6 -0.4 1c0 0.4 0.5 1.9 0.5 1.9l-0.5 0l-0.6 -2l0 -0.6c0 0 -1 0.8 -1 1c0 0.2 -0.2 1.3 -0.2 1.3c0 0 0.3 0.3 0.2 0.3c0 0 -0.5 0 -0.5 0c0 0 -0.2 -0.2 -0.1 -0.4c0 -0.1 0.2 -1.6 0.2 -1.6c0 0 0.5 -0.4 0.5 -0.5c0 -0.1 0 -2.7 -0.2 -2.7c-0.1 0 -0.4 2 -0.4 2c0 0 0 0.2 -0.2 0.5c-0.1 0.4 -0.2 1.1 -0.2 1.1c0 0 -0.2 -0.1 -0.2 -0.2c0 -0.1 -0.1 -0.7 0 -0.7c0.1 -0.1 0.3 -0.8 0.4 -1.4c0 -0.6 0.2 -1.3 0.4 -1.5c0.1 -0.2 0.6 -0.4 0.6 -0.4z', 'M1.2 -2c0 0 0.7 0 1.2 0.5c0.5 0.5 0.4 0.6 0.5 0.6c0.1 0 0.7 0 0.8 0.1c0.1 0 0.2 0.2 0.2 0.2c0 0 -0.6 0.2 -0.6 0.3c0 0.1 0.4 0.9 0.6 0.9c0.1 0 0.6 0 0.6 0.1c0 0.1 0 0.7 -0.1 0.7c-0.1 0 -1.2 0.4 -1.5 0.5c-0.3 0.1 -1.1 0.5 -1.1 0.7c-0.1 0.2 0.4 1.2 0.4 1.2l-0.4 0c0 0 -0.4 -0.8 -0.4 -0.9c0 -0.1 -0.1 -0.3 -0.1 -0.3l-0.2 0l-0.5 1.3l-0.4 0c0 0 -0.1 -0.4 0 -0.6c0.1 -0.1 0.3 -0.6 0.3 -0.7c0 0 -0.8 0 -1.5 -0.1c-0.7 -0.1 -1.2 -0.3 -1.2 -0.2c0 0.1 -0.4 0.6 -0.5 0.6c0 0 0.3 0.9 0.3 0.9l-0.4 0c0 0 -0.4 -0.5 -0.4 -0.6c0 -0.1 -0.2 -0.6 -0.2 -0.5c0 0 -0.4 0.4 -0.6 0.4c-0.2 0.1 -0.4 0.1 -0.4 0.1c0 0 -0.1 0.6 -0.1 0.6l-0.5 0l0 -1c0 0 0.5 -0.4 0.5 -0.5c0 -0.1 
-0.7 -1.2 -0.6 -1.4c0.1 -0.1 0.1 -1.1 0.1 -1.1c0 0 -0.2 0.1 -0.2 0.1c0 0 0 0.9 0 1c0 0.1 -0.2 0.3 -0.3 0.3c-0.1 0 0 -0.5 0 -0.9c0 -0.4 0 -0.4 0.2 -0.6c0.2 -0.2 0.6 -0.3 0.8 -0.8c0.3 -0.5 1 -0.6 1 -0.6z', 'M-4.1 -0.5c0.2 0 0.2 0.2 0.5 0.2c0.3 0 0.3 -0.2 0.5 -0.2c0.2 0 0.2 0.2 0.4 0.2c0.2 0 0.2 -0.2 0.5 -0.2c0.2 0 0.2 0.2 0.4 0.2c0.2 0 0.2 -0.2 0.4 -0.2c0.1 0 0.2 0.2 0.4 0.1c0.2 0 0.2 -0.2 0.4 -0.3c0.1 0 0.1 -0.1 0.4 0c0.3 0 0.3 -0.4 0.6 -0.4c0.3 0 0.6 -0.3 0.7 -0.2c0.1 0.1 1.4 1 1.3 1.4c-0.1 0.4 -0.3 0.3 -0.4 0.3c-0.1 0 -0.5 -0.4 -0.7 -0.2c-0.3 0.2 -0.1 0.4 -0.2 0.6c-0.1 0.1 -0.2 0.2 -0.3 0.4c0 0.2 0.1 0.3 0 0.5c-0.1 0.2 -0.3 0.2 -0.3 0.5c0 0.3 -0.2 0.3 -0.3 0.6c-0.1 0.2 0 0.3 -0.1 0.5c-0.1 0.2 -0.1 0.2 -0.2 0.3c-0.1 0.1 0.3 1.1 0.3 1.1l-0.3 0c0 0 -0.3 -0.9 -0.3 -1c0 -0.1 -0.1 -0.2 -0.3 -0.2c-0.2 0 -0.3 0.1 -0.4 0.4c0 0.3 -0.2 0.8 -0.2 0.8l-0.3 0l0.3 -1c0 0 0.1 -0.6 -0.2 -0.5c-0.3 0.1 -0.2 -0.1 -0.4 -0.1c-0.2 -0.1 -0.3 0.1 -0.4 0c-0.2 -0.1 -0.3 0.1 -0.5 0c-0.2 -0.1 -0.1 0 -0.3 0.3c-0.2 0.3 -0.4 0.3 -0.4 0.3l0.2 1.1l-0.3 0l-0.2 -1.1c0 0 -0.4 -0.6 -0.5 -0.4c-0.1 0.3 -0.1 0.4 -0.3 0.4c-0.1 -0.1 -0.2 1.1 -0.2 1.1l-0.3 0l0.2 -1.1c0 0 -0.3 -0.1 -0.3 -0.5c0 -0.3 0.1 -0.5 0.1 -0.7c0.1 -0.2 -0.1 -1 -0.2 -1.1c-0.1 -0.2 -0.2 -0.8 -0.2 -0.8c0 0 -0.1 -0.5 0.4 -0.8z' ]) color_scale = alt.Scale(domain=domains, range=[ 'rgb(162,160,152)', 'rgb(194,81,64)', 'rgb(93,93,93)', 'rgb(91,131,149)' ]) alt.Chart(source).mark_point(filled=True).encode( alt.X('col:O', axis=None), alt.Y('animal:O', axis=None), alt.Row('country:N', header=alt.Header(title='')), alt.Shape('animal:N', legend=None, scale=shape_scale), alt.Color('animal:N', legend=None, scale=color_scale), alt.OpacityValue(1), alt.SizeValue(200)).properties(width=800, height=200)
# Historical periods to shade behind the population line.
# NOTE(review): the `source` population list this block plots is defined
# earlier in the file, outside this span.
source2 = [
    {
        "start": "1933",
        "end": "1945",
        "event": "Nazi Rule"
    },
    {
        "start": "1948",
        "end": "1989",
        "event": "GDR (East Germany)"
    },
]
source = alt.pd.DataFrame(source)
source2 = alt.pd.DataFrame(source2)
# Population over time, with point markers layered on the same encoding.
line = alt.Chart(source).mark_line(color="#333").encode(
    x=alt.X("year:T", axis=alt.Axis(format="%Y"), title="Year"),
    y=alt.Y("population", title="Population"),
)
point = line.mark_point(color="#333")
# Shaded bands spanning each historical period, colored by event.
rect = alt.Chart(source2).mark_rect().encode(
    x="start:T", x2="end:T", color=alt.Color("event:N", title="Event"))
# Draw the bands first so the line and points stay on top.
(rect + line + point).properties(
    title="Population of Falkensee from 1875 to 2014", width=500, height=300)
def exercise_ecg_interactive_plot(
    sample_id: Union[int, str],
    folder: Optional[str] = None,
    time_interval_seconds: int = 10,
) -> Union[HTML, alt.Chart]:
  """Wrangle exercise ECG data to tidy and present it as an interactive plot.

  Args:
    sample_id: The id of the ECG sample to retrieve.
    folder: The local or Cloud Storage folder under which the files reside.
    time_interval_seconds: the width of the time interval (in seconds) to display of signal data

  Returns:
    An Altair plot or a notebook-friendly error.
  """
  (exercise_ecg_trend, exercise_ecg_signal) = reshape_exercise_ecg_to_tidy(sample_id=sample_id, folder=folder)
  # Return a styled HTML warning instead of raising, so the message renders
  # inline in the notebook when the sample has no data.
  if(exercise_ecg_trend.shape[0] == 0 or exercise_ecg_signal.shape[0] == 0):
    return HTML(f'''
<div class="alert alert-block alert-danger">
<b>Warning:</b> Exercise ECG not available for sample {sample_id}.<br>
Use the <kbd>folder</kbd> parameter to read HD5s from a different local directory or Cloud Storage bucket.
</div>''')

  # Write the tidy dataframes as JSON files in the working directory; the
  # charts below reference the data by file name (URL) rather than inlining
  # it, keeping the notebook output small.
  trend_data_file = os.path.basename(EXERCISE_ECG_TREND_DATA_FILE.name)
  exercise_ecg_trend.to_json(trend_data_file, orient='records')
  signal_data_file = os.path.basename(EXERCISE_ECG_SIGNAL_DATA_FILE.name)
  exercise_ecg_signal.to_json(signal_data_file, orient='records')

  # Mouseover selection on the trend panel; 'init' seeds it at t = 200 s.
  brush = alt.selection_single(on='mouseover', nearest=True, fields=['time'], init={'time': 200.0})
  # Dropdown bound to the lead field, defaulting to the first lead present.
  lead_dropdown = alt.binding_select(options=list(exercise_ecg_signal.lead.unique()))
  lead_select = alt.selection_single(
      fields=['lead'],
      bind=lead_dropdown,
      name='Choose just one to view',
      init={'lead': exercise_ecg_signal.lead.unique()[0]},
  )
  # Trend panel: one point per time step, colored by exercise phase.
  trend = alt.Chart(trend_data_file).mark_point(opacity=0.8, filled=True, size=100).encode(
      x='time:Q',
      color=alt.Color('phasename:N', legend=alt.Legend(orient='top'), title='Phase names'),
      tooltip=[
          'artifact:Q',
          'grade:Q',
          'heartrate:Q',
          'load:Q',
          'mets:Q',
          'pacecount:Q',
          'phasename:N',
          'phasetime:Q',
          'time:Q',
          'vecount:Q',
      ],
  ).properties(
      width=900,
      height=100,
      title=f'Click on a point to select a {time_interval_seconds} second time interval.',
  ).add_selection(brush)

  # Signal panel: raw millivolts for the dropdown-selected lead, windowed to
  # time_interval_seconds centered on the brushed time.
  signal = alt.Chart(signal_data_file).mark_line().encode(
      alt.X('time:Q', axis=alt.Axis(labelAngle=15)),
      y='raw_mV:Q',
      color=alt.Color('lead:N', legend=alt.Legend(orient='top'), title='Lead names'),
  ).properties(
      width=900,
      height=300,
      title='Exercise ECG signal for {}'.format(sample_id),
  ).add_selection(
      lead_select,
  ).transform_filter(
      lead_select,
  ).transform_filter(
      # Vega expression filtering to the +/- half-window around the brushed
      # time; see https://github.com/altair-viz/altair/issues/1960
      f'''((toNumber({brush.name}.time) - {time_interval_seconds/2.0}) < datum.time) && (datum.time < toNumber({brush.name}.time) + {time_interval_seconds/2.0})''',
  )

  # Stack heart-rate trend, load trend, and the windowed signal vertically.
  return trend.encode(y='heartrate:Q') & trend.encode(y='load:Q') & signal
# Scatter plot of penguin body mass vs. flipper length, with both colour and
# shape encoding species so the groups are distinguishable even without colour.
import altair as alt
import pandas as pd

penguins_df = pd.read_csv('data/penguins.csv')

colour_plot = alt.Chart(penguins_df).mark_point(size=10).encode(
    # Fixed axis domains keep the view stable regardless of data extremes.
    alt.X('flipper_length_mm',
          scale=alt.Scale(domain=[160, 240]),
          title="Flipper length (mm)"),
    alt.Y('body_mass_g',
          scale=alt.Scale(domain=[2500, 6500]),
          title='Mass (grams)'),
    alt.Color('species',
              title='Penguin species',
              scale=alt.Scale(scheme='set2')),
    alt.Shape('species')
).properties(
    # Fix: title typo — "weight the most" -> "weigh the most" (verb).
    title=
    'Gentoo penguins tend to have the longest flippers and weigh the most among the penguin species.'
)
colour_plot
# Section: how often each card shows up across the selected class's decks.
st.header("**Card Frequency over Decks**")
st.image(build_image(top_30_path), width=150)
st.text(f"The 30 most frequent cards used in {class_selectbox} decks")

top_30 = cards_appearance.head(30)

# Rarity palette, kept around for the colour-by-rarity improvement noted below.
rarity_color_list = [
    '#641E16', '#2ECC71', '#D0D3D4', '#3498DB', '#8E44AD', '#F7DC6F'
]

# Horizontal bars sorted by appearance count, tinted with the class colour.
class_colour_scale = alt.Scale(domain=[class_selectbox],
                               range=[class_selected_details["color"]])
freq_bars = alt.Chart(top_30).mark_bar(size=20).encode(
    x=alt.X('numberOfAppearance:Q'),
    y=alt.Y('cardName:N', sort="-x"),
    color=alt.Color('numberOfAppearance:Q',
                    legend=None,
                    scale=class_colour_scale),
    # TODO improvement: colors by rarity with correct sort by Q desc
    tooltip=['cardName', 'cardRarity', 'numberOfAppearance'])

# Numeric count labels placed just past the end of each bar.
freq_labels = freq_bars.mark_text(
    align='left', baseline='middle', dx=3).encode(text='numberOfAppearance:Q')

freq_chart = (freq_bars + freq_labels).configure_axis(
    grid=False).configure_view(strokeWidth=0).properties(width=700, height=700)
st.write(freq_chart)

st.header("**Card Details**")
def render_most_similar(data, title):
    """Bar chart of the words most similar to a query word.

    Bars encode the similarity distance (clamped to [0, 1]) and each bar is
    annotated with its numeric value; rows run from most to least similar.
    """

    def _distance_sort():
        # Fresh sort spec per encoding, ordered most-similar first.
        return alt.EncodingSortField(field='distance', order='descending')

    bars = alt.Chart(data, height=400, title=title).mark_bar().encode(
        alt.X('distance',
              title='',
              scale=alt.Scale(domain=(0, 1.0), clamp=True),
              axis=None),
        alt.Y('word', title='', sort=_distance_sort()),
        color=alt.Color('distance',
                        legend=None,
                        scale=alt.Scale(scheme='blues')),
        tooltip=[
            alt.Tooltip(field='word', type='nominal'),
            alt.Tooltip(field='distance', format='.3f', type='quantitative'),
        ],
    )

    # Value labels drawn at the end of each bar.
    labels = alt.Chart(data).mark_text(
        align='left',
        baseline='middle',
        dx=5,
        font='Roboto',
        size=15,
        color='black',
    ).encode(
        x=alt.X('distance', axis=None),
        y=alt.Y('word', sort=_distance_sort()),
        text=alt.Text("distance", format=".3f"),
    )

    composed = bars + labels
    # Strip the axis chrome and style the title.
    composed = (
        composed
        .configure_axisX(labelFontSize=20, labelFont='Roboto',
                         grid=False, domain=False)
        .configure_axisY(labelFontSize=20, labelFont='Roboto',
                         grid=False, domain=False)
        .configure_view(strokeOpacity=0)
        .configure_title(fontSize=25, font='Roboto', dy=-10)
    )
    return composed
def render_curve(df, ns=None, epsilons=None, save_path=None):
    """Render, and optionally save, a plot of the loss-data curve.

    Optionally takes arguments `ns` and `epsilons` to draw lines on the plot
    illustrating where metrics were calculated.
    Arguments:
    - df: (pd.DataFrame) the dataframe containing a loss-data curve as
        returned by LossDataEstimator.compute_curve or
        LossDataEstimator.to_dataframe.
    - ns: (list<num>) the list of training set sizes to use for computing
        metrics. Defaults to an empty list.
    - epsilons: (list<num>) the settings of epsilon used for computing SDL
        and eSC. Defaults to an empty list.
    - save_path: (str) optional: a path (ending in .pdf or .png) to save the
        chart. saving requires the
        [`altair-saver`](https://github.com/altair-viz/altair_saver/) package
        and its dependencies.
    Returns: an Altair chart. Note that this chart displays well in
        notebooks, so calling `render_curve(df)` without a save path will
        work well with Jupyter.
    """
    import altair as alt
    from . import altair_theme  # noqa: F401
    alt.data_transformers.disable_max_rows()

    # Fix: the previous defaults were mutable lists (`ns=[]`), which Python
    # shares across calls; use None sentinels and normalize here instead.
    ns = [] if ns is None else ns
    epsilons = [] if epsilons is None else epsilons

    if "name" not in df:
        print("Dataframe has no 'name' field. Using 'default'.")
        df['name'] = 'default'
    if len(ns) > 0:
        ns = _closest_valid_ns(df, ns)

    title = 'Loss-data curve'
    color_title = 'Representation'
    # Both axes are log-scaled; y aggregates val_loss across runs.
    xscale = alt.Scale(type='log')
    yscale = alt.Scale(type='log')
    x_axis = alt.X('samples', scale=xscale, title='Dataset size')
    y_axis = alt.Y('mean(val_loss)', scale=yscale, title='Validation loss')

    line = alt.Chart(df, title=title).mark_line()
    line = line.encode(
        x=x_axis,
        y=y_axis,
        color=alt.Color('name:N', title=color_title, legend=None),
    )
    point = alt.Chart(df, title=title).mark_point(size=80, opacity=1)
    point = point.encode(x=x_axis,
                         y=y_axis,
                         color=alt.Color('name:N', title=color_title),
                         shape=alt.Shape('name:N', title=color_title),
                         tooltip=['samples', 'name'])

    # Dashed rules marking the requested ns (vertical) and epsilons
    # (horizontal) on top of the curve.
    rules_df = pd.concat(
        [pd.DataFrame({'x': ns}), pd.DataFrame({'y': epsilons})], sort=False)
    rule_x = alt.Chart(rules_df).mark_rule(strokeDash=[4, 4]).encode(x='x')
    rule_y = alt.Chart(rules_df).mark_rule(strokeDash=[4, 4]).encode(y='y')

    chart = alt.layer(rule_x, rule_y, line, point).resolve_scale(
        color='independent', shape='independent')
    if save_path is not None:
        import altair_saver
        altair_saver.save(chart, save_path)
    return chart
    def build_graph(self):
        """Render quarterly lines-of-code per language as a grouped bar chart.

        Reads per-language colors from colors.json (next to this module),
        keeps the top languages of each quarter in self.yearly_data, reshapes
        everything into a long-form DataFrame and saves the Altair chart as
        'bar_graph.png'.

        Returns:
            The saved image filename ('bar_graph.png').
        """
        with open(os.path.join(os.path.dirname(__file__), 'colors.json')) as f:
            colors = json.load(f)
        allColorsValues = []

        # filter data: keep, per quarter, the `max_languages` biggest
        # languages under a 'top' key injected into the quarter's dict.
        max_languages = 5
        top_languages = {}
        for year in self.yearly_data.keys():
            for quarter in self.yearly_data[year].keys():
                # Largest-first by LOC; slice to the top five.
                for language in sorted(
                        list(self.yearly_data[year][quarter].keys()),
                        key=lambda lang: self.yearly_data[year][quarter][lang],
                        reverse=True)[0:max_languages]:
                    if 'top' not in self.yearly_data[year][quarter]:
                        self.yearly_data[year][quarter]['top'] = {}
                    if self.yearly_data[year][quarter][language] != 0:
                        self.yearly_data[year][quarter]['top'][language] = self.yearly_data[year][quarter][language]
                        # NOTE(review): counts start at 2 for a language's
                        # first appearance (init 1 then += 1); only the keys
                        # of top_languages are used below, so this is benign.
                        if language not in top_languages:
                            top_languages[language] = 1
                        top_languages[language] += 1
        # print(self.yearly_data)
        all_languages = list(top_languages.keys())
        # NOTE(review): a language whose colors.json entry has color=None is
        # skipped here but stays in all_languages, which would misalign the
        # color range with the domain below — confirm colors.json is complete.
        for language in all_languages:
            if colors[language]['color'] is not None:
                allColorsValues.append(colors[language]['color'])

        # Build, per language, a year-by-quarter LOC matrix.
        # Assumes quarter keys are the ints 1..4 (see `quarter - 1` below).
        languages_all_loc = {}
        for language in all_languages:
            language_year = []
            for year in self.yearly_data.keys():
                language_quarter = [0, 0, 0, 0]
                for quarter in self.yearly_data[year].keys():
                    if language in self.yearly_data[year][quarter]['top']:
                        language_quarter[quarter - 1] = self.yearly_data[year][quarter]['top'][language]
                    else:
                        language_quarter[quarter - 1] = 0
                language_year.append(language_quarter)
            languages_all_loc[language] = language_year
        # print(languages_all_loc)

        language_df = {}

        def prep_df(df, name):
            # Wide (years x quarters) -> long rows of (year, quarter, value).
            df = df.stack().reset_index()
            df.columns = ['c1', 'c2', 'values']
            df['Language'] = name
            return df

        for language in languages_all_loc.keys():
            language_df[language] = pd.DataFrame(
                languages_all_loc[language],
                index=list(self.yearly_data.keys()),
                columns=["Q1", "Q2", "Q3", "Q4"])
        for language in language_df.keys():
            language_df[language] = prep_df(language_df[language], language)

        # One long frame with columns c1 (year), c2 (quarter), values, Language.
        df = pd.concat(language_df.values())

        chart = alt.Chart(df).mark_bar().encode(
            # tell Altair which field to group columns on
            x=alt.X('c2:N', title=None),
            # tell Altair which field to use as Y values and how to calculate
            y=alt.Y('sum(values):Q',
                    axis=alt.Axis(grid=False, title='Lines Of Code added')),
            # tell Altair which field to use to use as the set of columns to
            # be represented in each group
            column=alt.Column('c1:N', title=None),
            # tell Altair which field to use for color segmentation
            color=alt.Color('Language:N',
                            scale=alt.Scale(
                                domain=all_languages,
                                # make it look pretty with an enjoyable color
                                # pallet
                                range=allColorsValues,
                            ),
                            )) \
            .configure_view(
                # remove grid lines around column clusters
                strokeOpacity=0
            )
        chart.save('bar_graph.png')
        return 'bar_graph.png'
def chart1():
    """Choropleth of average hate crimes per 100K population by US state.

    Registers and enables a custom Altair theme, then joins per-state rates
    from ../data/crime_state_id_clean.csv onto the us_10m state topology.

    Returns:
        The Altair geoshape chart.
    """
    def mds_special():
        # Theme callable returning the Vega-Lite config dict; registered with
        # alt.themes below.
        font = "Arial"
        axisColor = "#000000"
        gridColor = "#DEDDDD"
        return {
            "config": {
                "title": {
                    "fontSize": 18,
                    "font": font,
                    "anchor": "start",  # equivalent of left-aligned.
                    "fontColor": "#000000"
                },
                'view': {
                    "height": 300,
                    "width": 400
                },
                "axisX": {
                    "domain": True,
                    # "domainColor": axisColor,
                    "gridColor": gridColor,
                    "domainWidth": 1,
                    "grid": False,
                    "labelFont": font,
                    "labelFontSize": 12,
                    "labelAngle": 0,
                    "tickColor": axisColor,
                    "tickSize": 5,  # default, including it just to show you can change it
                    "titleFont": font,
                    "titleFontSize": 16,
                    "titlePadding": 10,  # guessing, not specified in styleguide
                    "title": "X Axis Title (units)",
                },
                "axisY": {
                    "domain": False,
                    "grid": True,
                    "gridColor": gridColor,
                    "gridWidth": 1,
                    "labelFont": font,
                    "labelFontSize": 12,
                    "labelAngle": 0,
                    # "ticks": False,  # even if you don't have a "domain" you need to turn these off.
                    "titleFont": font,
                    "titleFontSize": 16,
                    "titlePadding": 10,  # guessing, not specified in styleguide
                    "title": "Y Axis Title (units)",
                    # titles are by default vertical left of axis so we need
                    # to hack this
                    # "titleAngle": 0,  # horizontal
                    # "titleY": -10,  # move it up
                    # "titleX": 18,  # move it to the right so it aligns with the labels
                },
            }
        }

    # register the custom theme under a chosen name
    alt.themes.register('mds_special', mds_special)

    # enable the newly registered theme
    alt.themes.enable('mds_special')

    from vega_datasets import data
    states = alt.topo_feature(data.us_10m.url, 'states')
    hate_crime = pd.read_csv('../data/crime_state_id_clean.csv')

    # Join the CSV onto the topology by state id, then project for the US.
    p1 = alt.Chart(states).mark_geoshape().encode(
        alt.Color('avg_hatecrimes_per_100k_fbi:Q',
                  title="Average hate crime per 100K"),
        tooltip=[
            alt.Tooltip('avg_hatecrimes_per_100k_fbi:Q',
                        title='Average hate crime per 100K'),
            alt.Tooltip('state:N')
        ]).transform_lookup(
            lookup='id',
            from_=alt.LookupData(hate_crime, 'id', [
                'avg_hatecrimes_per_100k_fbi', 'state'
            ])).project('albersUsa').properties(
                title='Average hate crimes per 100K population in each state',
                width=550,
                height=300)
    return p1
def main(_):
  """Run the rater-agreement and PPCA analysis pipeline over annotation CSVs.

  Loads all CSVs under FLAGS.data, computes leave-one-rater-out
  (partial) correlations, runs PPCA with varimax rotation, and writes
  heatmap/barplot/t-SNE artifacts under FLAGS.plot_dir.
  Relies on module-level helpers (LeaveOut, PPCA, Varimax, PlotCovar,
  Demean) and FLAGS defined elsewhere in this file.
  """
  print("Loading data...")
  dfs = []
  for filename in os.listdir(FLAGS.data):
    if filename.endswith(".csv"):
      dfs.append(
          pd.read_csv(os.path.join(FLAGS.data, filename), encoding="utf-8"))
  data = pd.concat(dfs)
  print("%d Examples" % (len(set(data["id"]))))
  print("%d Annotations" % len(data))
  if not os.path.isdir(FLAGS.plot_dir):
    os.makedirs(FLAGS.plot_dir)

  # One emotion label per line in the flag file; "neutral" is tracked
  # separately and excluded from the rating matrix.
  with open(FLAGS.emotion_file, "r") as f:
    all_emotions = f.read().splitlines()
  all_emotions_neutral = all_emotions + ["neutral"]
  emotion2idx = {e: i for i, e in enumerate(all_emotions)}
  print("%d emotion Categories" % len(all_emotions))

  print("Processing data...")
  # Remove neutral labels
  data = data[data["neutral"] == 0]
  # Remove examples with no ratings (difficult examples)
  data = data[data[all_emotions_neutral].sum(axis=1) != 0]
  # Convert into num_examples x num_raters x num_ratings format
  data = data.groupby("id").filter(lambda x: len(x) >= 3)
  id_groups = data.groupby("id")
  worker2examples = {}  # dict mapping worker ids to (example, rater id) tuples
  max_num_raters = data.groupby("id").size().max()
  ratings = np.zeros(
      (len(id_groups), max_num_raters, len(all_emotions)))  # ignore "neutral"
  rater_msk = np.zeros(
      (len(id_groups), max_num_raters))  # for masking out non-existent raters
  print("Ratings shape", ratings.shape)

  # Get ratings and rater mask
  texts = []
  for ex_idx, (_, g) in enumerate(id_groups):
    texts.append(g.iloc[0]["text"])
    rater_count = 0
    # iterate through workers
    for _, row in g.iterrows():
      for e in all_emotions:
        ratings[ex_idx, rater_count, emotion2idx[e]] = row[e]
      rater_msk[ex_idx, rater_count] = 1
      worker_id = row["rater_id"]
      if worker_id in worker2examples:
        worker2examples[worker_id].append((ex_idx, rater_count))
      else:
        worker2examples[worker_id] = [(ex_idx, rater_count)]
      rater_count += 1

  print("Calculating leave-out (partial) correlations...")
  partial_corr_per_rater = []
  corr_per_rater = []
  for worker_id in worker2examples:
    partial_corrs, corrs = LeaveOut(ratings, rater_msk, worker2examples,
                                    worker_id)
    # Skip raters without a full set of per-emotion correlations.
    if len(partial_corrs) < len(all_emotions):
      continue
    partial_corr_per_rater.append(partial_corrs)
    corr_per_rater.append(corrs)
  corr_per_rater = np.array(corr_per_rater)
  partial_corr_per_rater = np.array(partial_corr_per_rater)

  # Verify that there are no NaN values
  assert np.isnan(corr_per_rater).sum() == 0

  # Apply Wilcoxon signed rank test to test significance of each dimension
  p_vals = np.apply_along_axis(wilcoxon, 0, partial_corr_per_rater)[1]

  # Apply Bonferroni correction
  reject, corr_pvals, _, newalpha = multipletests(
      p_vals, alpha=0.05, method="bonferroni")
  print("Which dimensions to keep?")
  print(reject)
  print(corr_pvals)
  print(newalpha)

  print("Running PPCA on all the data...")
  # Take all raters and split them randomly
  x = []
  y = []
  rater_counts = rater_msk.sum(axis=1).astype(int)
  all_ratings_avg = []
  for i, ex in enumerate(ratings):
    # Get actual raters based on mask
    keep = []
    for worker_rating in ex[:rater_counts[i]]:
      keep.append(list(worker_rating))
    all_ratings_avg.append(list(np.array(keep).mean(axis=0)))

    # Shuffle raters randomly
    random.shuffle(keep)
    num_raters = len(keep)
    # Split each example's raters into two halves and average each half.
    x.append(list(np.array(keep[:int(num_raters / 2)]).mean(axis=0)))
    y.append(list(np.array(keep[int(num_raters / 2):]).mean(axis=0)))

  x = np.array(x)
  y = np.array(y)
  all_ratings_avg = np.array(all_ratings_avg)
  w, v = PPCA(x, y)  # final components (p-values determine which ones to keep)

  print("Plotting percentage of covariance explained...")
  PlotCovar(v)

  # Apply varimax rotation
  w_vari = Varimax(w)

  # Get mapping between ppcs and emotions
  map_df = pd.DataFrame(
      w_vari, index=all_emotions,
      columns=np.arange(len(all_emotions))).round(4)

  # Sort to move values to diagonal
  # NOTE(review): pd.Series.nonzero is deprecated/removed in modern pandas —
  # confirm the pinned pandas version still provides it.
  map_df = map_df[list(
      np.argsort(map_df.apply(lambda x: pd.Series.nonzero(x)[0]).values)[0])]

  f = plt.figure(figsize=(10, 6), dpi=300)
  sns.heatmap(
      map_df,
      center=0,
      cmap=sns.diverging_palette(240, 10, n=50),
      yticklabels=all_emotions)
  plt.xlabel("Component")
  plt.savefig(
      FLAGS.plot_dir + "/component_loadings.pdf",
      dpi=600,
      format="pdf",
      bbox_inches="tight")

  # Each principal preserved component (ppc) is named after the emotion it
  # loads on most strongly.
  ppc2emotion = map_df.abs().idxmax().to_dict()
  emotion2ppc = {e: i for i, e in ppc2emotion.items()}
  print(ppc2emotion)

  print("Plotting frequency and mean left-out rater correlations...")
  corr_mean = corr_per_rater.mean(axis=0)
  corr_mean_ordered = [corr_mean[emotion2ppc[e]] for e in all_emotions]
  df_plot = pd.DataFrame({
      "emotion": all_emotions,
      "agreement": corr_mean_ordered
  })
  df_plot["count"] = df_plot["emotion"].map(
      data[all_emotions].sum(axis=0).to_dict())
  df_plot.sort_values("count", ascending=False, inplace=True)
  df_plot.to_csv(FLAGS.plot_dir + "/emotion_agreements.csv", index=False)

  # Get colors
  norm = plt.Normalize(df_plot["agreement"].min(), df_plot["agreement"].max())
  sm = plt.cm.ScalarMappable(cmap="BuPu", norm=norm)
  sm.set_array([])

  # Generate figure
  fig = plt.figure(dpi=600, figsize=(5, 6))
  ax = sns.barplot(
      data=df_plot,
      y="emotion",
      x="count",
      orient="h",
      hue="agreement",
      palette="BuPu",
      dodge=False,
      edgecolor="black",
      linewidth=1)
  ax.get_legend().remove()
  ax.figure.colorbar(sm)
  plt.text(18000, 31, "Interrater\nCorrelation", ha="center")
  plt.xlabel("Number of Examples")
  plt.ylabel("")
  plt.draw()
  # Relabel x ticks in thousands ("18000" -> "18k").
  labels = [item.get_text() for item in ax.get_xticklabels()]
  ax.set_xticklabels(["%dk" % (int(int(label) / 1000)) for label in labels])
  plt.tight_layout()
  fig.savefig(
      FLAGS.plot_dir + "/label_distr_agreement.pdf",
      dpi=600,
      format="pdf",
      bbox_inches="tight")

  print("Generating t-SNE plot...")
  # Get PPC scores for all examples
  all_ratings_avg = Demean(all_ratings_avg)  # demean all ratings
  ppc_scores = all_ratings_avg.dot(w_vari)  # project onto ppcs
  ppc_scores_abs = np.absolute(ppc_scores)

  # Load maximally distinct colors
  colors = pd.read_csv(
      FLAGS.rgb_colors, sep="\t", header=None, names=np.arange(3))
  # Set colors (todo(ddemszky): add names to colors in file)
  palette_rgb = colors.values
  with open(FLAGS.emotion_color_order) as f:
    color_order = f.read().splitlines()
  ppc2color = {emotion2ppc[e]: i for i, e in enumerate(color_order)}

  # get rgb value for each example based on weighted average of top emotions
  rgb_vals = []
  hex_vals = []
  top_categories = []
  threshold = 0.5  # exclude points not loading on any of the top 10 categories
  counter = 0
  rgb_max = 255
  other_color = palette_rgb[len(all_emotions), :]
  for i, scores in enumerate(ppc_scores_abs):
    # Up to two strongest components above the loading threshold.
    top_ppcs = [
        idx for idx in (-scores).argsort()[:2] if scores[idx] > threshold
    ]
    top_emotions = ",".join([ppc2emotion[idx] for idx in top_ppcs
                            ]) if top_ppcs else "other"
    top_categories.append(top_emotions)
    if len(top_ppcs) < 1:  # doesn't have top emotions from list
      color = other_color  # use grey
      counter += 1
    else:
      # Weighted average of top emotions (square->weighted average->square root)
      color_ids = [ppc2color[idx] for idx in top_ppcs]
      weights = [scores[idx] for idx in top_ppcs]
      # Need to round, otherwise floating point precision issues will result
      # in values slightly above 1
      avg = np.round(
          np.sqrt(
              np.average(
                  np.power(palette_rgb[color_ids] * rgb_max, 2),
                  axis=0,
                  weights=weights)) / rgb_max, 4)
      if (avg > 1).sum() > 0:
        print(avg)
      color = avg
    rgb_vals.append(list(color))
    hex_vals.append("#%02x%02x%02x" %
                    tuple(np.array(color * rgb_max, dtype=int)))
  rgb_vals = np.array(rgb_vals)

  # Create t-SNE model
  tsne_model = TSNE(
      perplexity=30,
      n_components=2,
      n_iter=1000,
      random_state=23,
      learning_rate=500,
      init="pca")
  new_values = tsne_model.fit_transform(ppc_scores)
  x = []
  y = []
  for value in new_values:
    x.append(value[0])
    y.append(value[1])

  # Put data in dataframe
  df = pd.DataFrame({
      "x": x,
      "y": y,
      "color": hex_vals,
      "label(s)": top_categories,
      "text": texts
  })
  df = df[df["label(s)"] != "other"]
  df["top_label"] = df["label(s)"].str.split(",").str[0]

  # Two selections:
  # - a brush that is active on the top panel
  # - a multi-click that is active on the bottom panel
  brush = alt.selection(type="interval")
  click = alt.selection_multi(encodings=["color"])
  sample = df.sample(5000)  # max 5000 examples can be plotted
  points = alt.Chart(sample).mark_point(
      filled=True, size=50).encode(
          x="x:Q",
          y="y:Q",
          color=alt.Color("color", scale=None),
          tooltip=["label(s)", "text"]).properties(
              width=700, height=600).add_selection(brush)

  # Bottom panel is a bar chart
  bars = alt.Chart(sample).mark_bar().encode(
      x="count()",
      y="top_label:N",
      color=alt.condition(click, alt.Color("color:N", scale=None),
                          alt.value("lightgray")),
  ).transform_filter(brush.ref()).properties(
      width=700, selection=click)

  chart = alt.vconcat(
      points, bars, data=sample, title="t-SNE Projection of Examples")
  chart.save(FLAGS.plot_dir + "/tsne.html", format="html")
# Diverging stacked-bar chart of Likert responses per survey question.
# Response categories map onto a red-to-blue palette around a neutral grey.
response_colors = alt.Scale(
    domain=[
        "Very Unlikely",
        "Unlikely",
        "Does Not Matter",
        "Likely",
        "Very Likely",
    ],
    range=["#c30d24", "#f3a583", "#cccccc", "#94c6da", "#1770ab"],
)

# Axis styling for the question labels on the y axis.
question_axis = alt.Axis(
    title='Statement Study',
    offset=5,
    ticks=False,
    minExtent=60,
    domain=False,
)

# Each bar segment spans from its precomputed start to end percentage.
chart = alt.Chart(source).mark_bar().encode(
    x='percentage_start:Q',
    x2='percentage_end:Q',
    y=alt.Y('question:N', axis=question_axis),
    color=alt.Color(
        'type:N',
        legend=alt.Legend(title='Response'),
        scale=response_colors,
    ),
)

chart.save('statement_divergent_chart.html')
def make_org_quantiles_plots(infile):
    """Render per-organization traffic charts grouped by user usage decile.

    Reads pre-aggregated flows from `infile`, drops users with too little
    history, buckets the remaining users into ten bins of average daily
    traffic, and saves three charts under renders/.

    Args:
        infile: Path to a parquet file of flows with bytes_up/bytes_down,
            indexed so reset_index exposes user and org columns.
    """
    grouped_flows = infra.pd.read_parquet(infile)
    grouped_flows = grouped_flows.reset_index()
    grouped_flows["bytes_total"] = grouped_flows["bytes_up"] + grouped_flows[
        "bytes_down"]
    # Total bytes per (user, org) pair.
    user_org_total = grouped_flows[["user", "org", "bytes_total"
                                    ]].groupby(["user", "org"
                                                ]).sum().reset_index()

    # Filter users by time in network to eliminate early incomplete samples
    user_active_ranges = infra.pd.read_parquet(
        "data/clean/user_active_deltas.parquet")[[
            "user", "days_since_first_active", "days_active", "days_online"
        ]]
    # Drop users that joined less than a week ago.
    users_to_analyze = user_active_ranges.loc[
        user_active_ranges["days_since_first_active"] > 7]
    # Drop users active for less than one day
    users_to_analyze = users_to_analyze.loc[
        users_to_analyze["days_active"] > 1,
    ]

    # Sort orgs by total amount of bytes.
    org_totals = grouped_flows.groupby("org").sum().reset_index()
    org_sort_order = org_totals.sort_values(
        "bytes_total", ascending=False).set_index("bytes_total").reset_index()
    org_sort_order["rank"] = org_sort_order["bytes_total"].rank(
        method="min", ascending=False)
    org_sort_list = org_sort_order["org"].tolist()

    # Group users by quantiles of their daily use.
    user_totals = user_org_total.groupby("user").sum().reset_index()
    user_totals = user_totals.merge(users_to_analyze, on="user", how="inner")
    user_totals["avg_daily_bytes"] = user_totals["bytes_total"] / user_totals[
        "days_online"]
    user_totals["rank_total"] = user_totals["bytes_total"].rank(method="min",
                                                                pct=True)
    user_totals["rank_daily"] = user_totals["avg_daily_bytes"].rank(
        method="min")
    # Ten equal-width bins over the daily-use rank, i.e. usage deciles.
    user_totals["quantile"] = pd.cut(user_totals["rank_daily"],
                                     10,
                                     precision=0,
                                     right=False,
                                     include_lowest=True)

    # Compute the share of each user's traffic in each org
    user_shares = user_totals.rename(
        columns={"bytes_total": "user_bytes_total"})
    user_shares = user_org_total.merge(
        user_shares[["user", "user_bytes_total"]], on="user", how="inner")
    user_shares["org_share"] = user_shares["bytes_total"] / user_shares[
        "user_bytes_total"]
    user_shares = user_shares[["user", "org", "org_share"]]

    # Merge the user quantile information back into the flows, and then group by category
    quantile_flows = user_org_total.merge(
        user_totals[["user", "quantile", "days_online"]],
        on="user",
        how="inner")
    quantile_flows["normalized_bytes_total"] = quantile_flows[
        "bytes_total"] / quantile_flows["days_online"]

    # Merge category share information into the plot frame
    quantile_flows = quantile_flows.merge(user_shares,
                                          on=["user", "org"],
                                          how="inner")

    # Compute means for quantiles and quantile labels
    quantile_totals = quantile_flows.groupby(["quantile", "org"]).mean()
    quantile_totals = quantile_totals.reset_index()
    quantile_totals["quantile_str"] = quantile_totals["quantile"].apply(
        lambda x: str(x))

    # Add sort information back to rendered dataframe
    quantile_totals = quantile_totals.merge(org_sort_order[["org", "rank"]],
                                            on="org",
                                            how="inner")

    # This might not be showing exactly what I want to show, since in merging
    # users some users that dominate video could be overrepresented. Maybe
    # want to merge on the fraction of traffic to each part from each user?
    # Are users counted equally or are bytes counted equally...
    # Chart 1: normalized stacked bars of per-org traffic per decile.
    alt.Chart(quantile_totals[[
        "org", "quantile_str", "bytes_total", "rank", "normalized_bytes_total"
    ]]).mark_bar().encode(
        x="quantile_str:O",
        y=alt.Y(
            "normalized_bytes_total",
            stack="normalize",
            sort=org_sort_list,
        ),
        color=alt.Color(
            "org:N",
            scale=alt.Scale(scheme="tableau20"),
            sort=org_sort_list,
        ),
        order=alt.Order(
            "rank",
            sort="descending",
        ),
    ).properties(width=500, ).save(
        "renders/bytes_per_org_per_quantile_bar.png",
        scale_factor=2,
    )

    quantile_totals["normalize_mb_total"] = quantile_totals[
        "normalized_bytes_total"] / 1000.0**2

    # Generate an order based on the intervals, not the strings, to correctly sort the axis.
    quantiles = quantile_totals[["quantile", "quantile_str"
                                 ]].groupby(["quantile"]).first()
    quantiles = quantiles["quantile_str"].to_list()

    # Chart 2: per-org average daily traffic (MB) as lines across deciles.
    alt.Chart(quantile_totals[[
        "org", "quantile_str", "bytes_total", "rank", "normalize_mb_total"
    ]]).mark_line().encode(
        x=alt.X(
            "quantile_str:N",
            title="User by Rank of Average Use Per Online Day (Grouped)",
            sort=quantiles,
        ),
        y=alt.Y("normalize_mb_total",
                sort=org_sort_list,
                title="Average Traffic Per Online Day (MB)"),
        color=alt.Color(
            "org:N",
            scale=alt.Scale(scheme="tableau20"),
            sort=org_sort_list,
            legend=alt.Legend(
                title="Organization",
                orient="none",
                fillColor="white",
                labelLimit=500,
                padding=5,
                strokeColor="black",
                columns=3,
                labelFontSize=8,
                legendX=15,
                legendY=5,
                symbolLimit=20,
            ),
        ),
        order=alt.Order(
            "rank",
            sort="descending",
        ),
    ).configure_axisX(
        labelAngle=0,
        labelFontSize=7,
    ).properties(width=500, ).save(
        "renders/bytes_per_org_per_quantile_line.png",
        scale_factor=2,
    )

    # Chart 3: per-org share of each user's traffic as lines across deciles.
    alt.Chart(quantile_totals[[
        "org", "quantile_str", "org_share", "rank"
    ]]).mark_line().encode(
        x=alt.X(
            "quantile_str:N",
            title="User by Rank of Average Use Per Online Day (Grouped)",
            sort=quantiles,
        ),
        y=alt.Y("org_share",
                sort=org_sort_list,
                title="Average Fraction of Traffic Per User"),
        color=alt.Color(
            "org:N",
            scale=alt.Scale(scheme="tableau20"),
            sort=org_sort_list,
            legend=alt.Legend(
                title="Organization",
                # orient="none",
                # fillColor="white",
                labelLimit=500,
                # padding=5,
                # strokeColor="black",
                # columns=3,
                # labelFontSize=8,
                # legendX=15,
                # legendY=5,
                symbolLimit=20,
            ),
        ),
        order=alt.Order(
            "rank",
            sort="descending",
        ),
    ).configure_axisX(
        labelAngle=0,
        labelFontSize=7,
    ).properties(width=500, ).save(
        "renders/bytes_per_org_share_per_quantile_line.png",
        scale_factor=2,
    )