# selectCountry = alt.selection_single( # name='Select', # name the selection 'Select' # fields=['country'], # limit selection to the country field # init={'country': countries[0]}, # use first country entry as initial value # bind=alt.binding_select(options=countries) # bind to a menu of unique country values # ) # Year selection brush = alt.selection_interval(encodings=['x']) years = alt.Chart(deaths).mark_line().add_selection( brush ).transform_filter( alt.datum.country == selectCountry ).encode( alt.X('year:O', title='Year'), alt.Y('sum(value)', title='Smoking Deaths (all ages)') ).properties( width=400, height=100 ) # Area chart - Smoking deaths by ages base = alt.Chart(deaths).mark_area().transform_filter( alt.datum.country == selectCountry ).transform_filter( brush ).encode( alt.X('year:O', title='Year'), y=alt.Y('value:Q', title='Smoking Deaths by Ages (normalized)', stack="normalize"), color=alt.Color('Age:O', scale=alt.Scale(scheme='lightorange')),
def plot_distance(
    self,
    rank="auto",
    metric="braycurtis",
    title=None,
    xlabel=None,
    ylabel=None,
    tooltip=None,
    return_chart=False,
    linkage="average",
    label=None,
):
    """Plot beta diversity distance matrix as a heatmap and dendrogram.

    Parameters
    ----------
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.
    metric : {'braycurtis', 'manhattan', 'jaccard', 'unifrac', 'unweighted_unifrac'}, optional
        Function to use when calculating the distance between two samples.
    linkage : {'average', 'single', 'complete', 'weighted', 'centroid', 'median'}
        The type of linkage to use when clustering axes.
    title : `string`, optional
        Text label at the top of the plot.
    xlabel : `string`, optional
        Text label along the horizontal axis.
    ylabel : `string`, optional
        Text label along the vertical axis.
    tooltip : `string` or `list`, optional
        A string or list containing strings representing metadata fields. When a point in the
        plot is hovered over, the value of the metadata associated with that sample will be
        displayed in a modal. A list passed here is copied, never modified in place.
    return_chart : `bool`, optional
        When true, return the combined chart instead of displaying it.
    label : `string` or `callable`, optional
        A metadata field (or function) used to label each analysis. If passing a function, a
        dict containing the metadata for each analysis is passed as the first and only
        positional argument. The callable function must return a string.

    Returns
    -------
    The dendrogram chart concatenated horizontally with the heatmap when `return_chart` is
    true; otherwise nothing is returned and the combined chart is displayed.

    Raises
    ------
    OneCodexException
        If fewer than two classification results are available.

    Examples
    --------
    Plot the weighted UniFrac distance between all our samples, using counts at the genus level.

    >>> plot_distance(rank='genus', metric='unifrac')
    """
    import altair as alt
    import numpy as np
    import pandas as pd
    from onecodex.viz import dendrogram

    if len(self._results) < 2:
        raise OneCodexException(
            "`plot_distance` requires 2 or more valid classification results."
        )

    # this will be passed to the heatmap chart as a dataframe eventually
    plot_data = {
        "1) Label": [],
        "2) Label": [],
        "Distance": [],
        "classification_id": [],
    }

    # here we figure out what to put in the tooltips and get the appropriate data.
    # A list argument is copied so that inserting "Label" below does not leak
    # into the caller's list.
    if tooltip:
        tooltip = list(tooltip) if isinstance(tooltip, list) else [tooltip]
    else:
        tooltip = []

    tooltip.insert(0, "Label")

    magic_metadata, magic_fields = self._metadata_fetch(tooltip, label=label)

    # every metadata field gets two columns: one for each sample of a pair
    formatted_fields = []
    for magic_field in magic_fields.values():
        field_group = []
        for i in (1, 2):
            field = "{}) {}".format(i, magic_field)
            plot_data[field] = []
            field_group.append(field)
        formatted_fields.append(field_group)

    clust = self._cluster_by_sample(rank=rank, metric=metric, linkage=linkage)
    dist_matrix = clust["dist_matrix"]

    # must convert to long format for heatmap plotting
    for idx1, id1 in enumerate(dist_matrix.index):
        for idx2, id2 in enumerate(dist_matrix.index):
            if idx1 == idx2:
                # the diagonal is stored as NaN rather than as a distance
                plot_data["Distance"].append(np.nan)
            else:
                plot_data["Distance"].append(dist_matrix.iloc[idx1, idx2])

            plot_data["classification_id"].append(id1)

            for field_group, magic_field in zip(formatted_fields, magic_fields.values()):
                plot_data[field_group[0]].append(magic_metadata[magic_field][id1])
                plot_data[field_group[1]].append(magic_metadata[magic_field][id2])

    plot_data = pd.DataFrame(data=plot_data)

    labels_in_order = magic_metadata["Label"][clust["ids_in_order"]].tolist()

    # it's important to tell altair to order the cells in the heatmap according to the
    # clustering obtained from scipy
    alt_kwargs = dict(
        x=alt.X("1) Label:N", axis=alt.Axis(title=xlabel), sort=labels_in_order),
        y=alt.Y(
            "2) Label:N",
            axis=alt.Axis(title=ylabel, orient="right"),
            sort=labels_in_order,
        ),
        color="Distance:Q",
        tooltip=list(chain.from_iterable(formatted_fields)) + ["Distance:Q"],
        href="url:N",
        url="https://app.onecodex.com/classification/" + alt.datum.classification_id,
    )

    # "url" is not an encoding channel: it is popped and fed to transform_calculate
    # so that the "href" channel can reference the computed field.
    side = 15 * len(dist_matrix.index)
    chart = (
        alt.Chart(plot_data, width=side, height=side)
        .transform_calculate(url=alt_kwargs.pop("url"))
        .mark_rect()
        .encode(**alt_kwargs)
    )

    if title:
        chart = chart.properties(title=title)

    dendro_chart = dendrogram(clust["scipy_tree"])

    if return_chart:
        return dendro_chart | chart
    else:
        (dendro_chart | chart).display()
Sources: [covidtracking.com](https://covidtracking.com/api), [census.gov](https://www.census.gov/data/datasets/time-series/demo/popest/2010s-state-total.html) Hospitalization data is more normalized than case data because testing rates (and thus known cases) varies too much across states. While less of a leading indicator, the 7 day change gives a reasonable early sign of trouble. """) # Magic streamlit function that renders a date picker and assigns the picked value to picked_date picked_date = st.date_input("Date", value=usdata_diff['date'].max()).strftime('%Y-%m-%d') # 7 day change bar chart st.subheader('Change last 7 days') # The reason why picked_date was converted to string above is otherwise the data # selection would not work in this line below. st.write(alt.Chart(usdata_diff[usdata_diff['date'] == picked_date]).mark_bar().encode( y=alt.Y('state', sort='-x'), x=alt.X('hospitalizedPer100k7daychange:Q', axis=alt.Axis(orient='top')), tooltip=[alt.Tooltip("hospitalizedPer100k7daychange:Q", title="7 day change", format=',.0d')] ).properties( width=800 ) ) st.subheader('Total') st.markdown("Red bars are republican states, blue bars are democratic states per 2016 presidential election") # The reason why picked_date was converted to string above is otherwise the data # selection would not work in this line below. st.write(alt.Chart(usdata[usdata['date'] == picked_date]).mark_bar().encode( y=alt.Y('state', sort='-x'), x=alt.X('hospitalizedCurrentlyPer100k', axis=alt.Axis(orient='top')), tooltip=[alt.Tooltip("hospitalizedCurrentlyPer100k:Q", title="hospitalized per 100k", format=',.2f')] ).properties(
def graph_compare_cea2034(df, graph_params, speaker1, speaker2):
    """Build a layered, interactive chart comparing two speakers' CEA2034 curves.

    The first speaker's curves are drawn with solid lines and the second
    speaker's with dashed lines. The DI curves ('Early Reflections DI',
    'Sound Power DI') are layered on an independent y scale.

    :param df: data with at least the 'Freq', 'dB' and 'Measurements' fields
    :param graph_params: accepted for interface compatibility; not read here
    :param speaker1: forwarded to ``build_selections``
    :param speaker2: forwarded to ``build_selections``
    :return: the layered Altair chart

    NOTE(review): ``nearest`` is not defined in this function; it is presumably
    a module-level Altair selection -- confirm.
    """
    selection1, selection2, selectorsMeasurements, scales = build_selections(
        df, speaker1, speaker2)

    # TODO(move to parameters)
    x_axis = alt.X('Freq:Q',
                   scale=alt.Scale(type="log", domain=[20, 20000], nice=False))
    y_axis = alt.Y('dB:Q', scale=alt.Scale(zero=False, domain=[-40, 10]))
    color = alt.Color('Measurements', type='nominal', sort=None)
    opacity = alt.condition(selectorsMeasurements, alt.value(1), alt.value(0.2))

    # Main response curves.
    line = alt.Chart(df).transform_filter(
        alt.FieldOneOfPredicate(
            field='Measurements',
            oneOf=['On Axis', 'Listening Window', 'Early Reflections',
                   'Sound Power'])).encode(x=x_axis, y=y_axis, color=color,
                                           opacity=opacity)
    # Markers are only visible for the point selected by `nearest`.
    points = line.mark_circle(size=100).encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0)),
        tooltip=['Measurements', 'Freq', 'dB'])

    # Directivity-index curves, drawn against their own y domain.
    di_axis = alt.Y('dB:Q',
                    scale=alt.Scale(zero=False, domain=[-10, 40], nice=False))
    di = alt.Chart(df).transform_filter(
        alt.FieldOneOfPredicate(
            field='Measurements',
            oneOf=['Early Reflections DI', 'Sound Power DI'])).encode(
                x=x_axis, y=di_axis, color=color, opacity=opacity)
    points_di = di.mark_circle(size=100).encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0)),
        tooltip=['Measurements', 'Freq', 'dB'])

    spin_full = alt.layer(
        points + line.mark_line(),
        points_di + di.mark_line(clip=True)).resolve_scale(
            y='independent').properties(width=600, height=300)

    spin_dash = alt.layer(
        points + line.mark_line(strokeDash=[4, 2]),
        points_di + di.mark_line(clip=True, strokeDash=[4, 2])).resolve_scale(
            y='independent').properties(width=600, height=300)

    line1 = spin_full.add_selection(selection1).transform_filter(selection1)
    line2 = spin_dash.add_selection(selection2).transform_filter(selection2)

    # Vertical rule at the frequency picked by `nearest`.
    rules = alt.Chart(df).mark_rule(color='gray').encode(
        x='Freq:Q').transform_filter(nearest)

    layers = alt.layer(line2, line1, rules).add_selection(
        selectorsMeasurements).add_selection(scales).add_selection(
            nearest).interactive()
    return layers
def check_column(
    data,
    columns,
    bins=False,
    missing=0.1,
    cardinality=15,
    float_frequency=30,
    category_frequency=100,
    outlier_function=quartile,
):
    """
    Displays, for each requested column of a pandas dataframe, a summary table,
    a bar chart or histogram, and the abnormalities reported by ``check_data``

    :param data: a pandas dataframe
    :param columns: a single column name or a list of column names to analyze
    :param bins: a boolean value or list of boolean values to determine
        whether to bin the histogram for each column
    :param missing: a cutoff point for high percentage of missing / zero
        values, defaults to 10%
    :param cardinality: a cutoff point for high cardinality of a categorical
        column, defaults to 15
    :param float_frequency: a cutoff point for high frequency of floating
        point numbers, defaults to 30
    :param category_frequency: a cutoff point for low frequency of categories
        in categorical columns, defaults to 100
    :param outlier_function: a function of the dataset and column name that
        returns the lower and upper limit for outliers, defaults to 1.5*IQR
        above the 3rd quartile or below the 1st quartile
    """
    # Normalise `columns` and `bins` into two parallel lists.
    if isinstance(columns, str):
        columns, bins = [columns], [bins]
    else:
        if bins == False:
            # several columns, no binning requested
            bins = [False] * len(columns)
        if isinstance(bins, int):
            # several columns sharing one bin specification
            bins = [bins] * len(columns)

    for position, col in enumerate(columns):
        bin_spec = bins[position]
        is_categorical = data[col].dtype == "O"
        x_axis = alt.Axis(title=col.title())

        if is_categorical:
            # object columns are never binned; bars are sorted instead
            x_channel = alt.X(
                col,
                axis=x_axis,
                sort=alt.SortField(
                    field="count()", order="descending", op="values"
                ),
            )
        elif bin_spec == False:
            x_channel = alt.X(col, axis=x_axis)
        else:
            x_channel = alt.X(col, bin=alt.Bin(maxbins=bin_spec), axis=x_axis)

        chart = (
            alt.Chart(data)
            .mark_bar(color="#64b5f6")
            .encode(x_channel, alt.Y("count()"))
        )

        # float64 columns get describe(); everything else gets counts and
        # proportions per distinct value.
        if data[col].dtype == "float64":
            stats = data[col].describe()
        else:
            stats = data.groupby(col)[col].agg(["count"])
            stats["prop"] = stats["count"] / len(data)
        stats = pd.DataFrame(stats).T

        display(Markdown("#### Column Summary: " + col.title()))
        display(stats)
        display(chart)
        check_data(
            data,
            [col],
            missing=missing,
            cardinality=cardinality,
            float_frequency=float_frequency,
            category_frequency=category_frequency,
            outlier_function=outlier_function,
            title=False,
        )
def make_category_plot_separate_top_n(infile, n_to_separate=20):
    """Render a user-by-category traffic heatmap, faceted into top and bottom users.

    Reads the flows from ``infile``, totals GB per (user, category), labels the
    ``n_to_separate`` users with the largest total as "Top N" and the rest as
    "Bottom M", and saves the faceted chart to
    ``renders/users_per_category_split_outliers.png``.
    """
    for option in ('display.max_columns', 'display.max_rows', 'display.width'):
        pd.set_option(option, None)

    flows = infra.pd.read_parquet(infile).reset_index()
    flows["bytes_total"] = flows["bytes_up"] + flows["bytes_down"]

    def _ranked_totals(key):
        """Sum the flows per ``key``, ordered by descending bytes_total."""
        totals = flows.groupby(key).sum().reset_index()
        ordered = totals.sort_values("bytes_total", ascending=False)
        # Round-tripping through the index leaves a fresh 0..n-1 index that
        # follows the sorted order.
        return ordered.set_index("bytes_total").reset_index()

    # Figure out sorting order by total amount.
    cat_sort_list = _ranked_totals("category")["category"].tolist()
    user_sort_order = _ranked_totals("user")
    user_sort_list = user_sort_order["user"].tolist()

    # Tag every user as belonging to the top or the bottom group; the
    # positional index doubles as the rank.
    bottom_n = len(user_sort_order) - n_to_separate
    top_annotation_frame = user_sort_order[["user"]].assign(
        topN="Bottom {}".format(bottom_n))
    is_top = top_annotation_frame.index < n_to_separate
    top_annotation_frame.loc[is_top, "topN"] = "Top {}".format(n_to_separate)

    flows["GB"] = flows["bytes_total"] / (1000**3)
    flows = flows[["category", "user", "GB"]]
    flows = flows.groupby(["user", "category"]).sum().reset_index()
    flows["logGB"] = flows["GB"].transform(np.log10)
    flows = flows.merge(top_annotation_frame, on="user")

    heatmap = alt.Chart(flows).mark_rect().encode(
        x=alt.X(
            "user:N",
            title="User (Sorted by Total GB)",
            axis=alt.Axis(labels=False),
            sort=user_sort_list,
        ),
        y=alt.Y(
            "category:N",
            title="Category (Sorted by Total GB)",
            sort=cat_sort_list,
        ),
        color=alt.Color(
            "GB:Q",
            title="Total GB",
            scale=alt.Scale(scheme="viridis"),
        ),
    )
    faceted = heatmap.facet(
        column=alt.Column("topN:N", sort="descending", title=""),
    )
    # Each facet gets its own x and color scale.
    faceted.resolve_scale(x="independent", color="independent").save(
        "renders/users_per_category_split_outliers.png",
        scale_factor=2,
    )
def make_category_plot(infile):
    """Render three user-by-category traffic heatmaps from the flows in ``infile``.

    The charts are saved, in this order, to:

    * ``renders/users_per_category.png`` -- log10 of total GB
    * ``renders/users_per_category_normalized_user_total.png`` -- GB divided
      by the user's overall total
    * ``renders/users_per_category_normalized_time.png`` -- log10 of MB per
      active day
    """
    pd.set_option('display.max_columns', None)

    def _rank_by_total(frame, key):
        """Return (per-key sums, the same sums ordered by descending bytes_total)."""
        totals = frame.groupby(key).sum().reset_index()
        ordered = totals.sort_values("bytes_total", ascending=False)
        return totals, ordered.set_index("bytes_total").reset_index()

    def _save_heatmap(frame, x_title, y_title, color_field, color_title,
                      out_path):
        """Draw one user/category heatmap coloured by ``color_field`` and save it."""
        alt.Chart(frame).mark_rect().encode(
            x=alt.X(
                "user:N",
                title=x_title,
                axis=alt.Axis(labels=False),
                sort=user_sort_list,
            ),
            y=alt.Y(
                "category:N",
                title=y_title,
                sort=cat_sort_list,
            ),
            color=alt.Color(
                color_field,
                title=color_title,
                scale=alt.Scale(scheme="viridis"),
            ),
        ).properties(width=500).save(out_path, scale_factor=2)

    flows = infra.pd.read_parquet(infile).reset_index()
    flows["bytes_total"] = flows["bytes_up"] + flows["bytes_down"]

    # Figure out sorting order by total amount.
    _, cat_ranked = _rank_by_total(flows, "category")
    cat_sort_list = cat_ranked["category"].tolist()
    user_totals, user_ranked = _rank_by_total(flows, "user")
    user_sort_list = user_ranked["user"].tolist()

    flows["GB"] = flows["bytes_total"] / (1000**3)
    flows = flows[["category", "user", "GB"]]
    flows = flows.groupby(["user", "category"]).sum().reset_index()
    flows["logGB"] = flows["GB"].transform(np.log10)

    # Keep only users first seen at least 7 days ago, to eliminate early
    # incomplete samples. No filter on "days_active" is applied.
    # NOTE(review): the time-normalized chart below divides by "days_active";
    # confirm that column cannot be zero for the users kept here.
    activity = infra.pd.read_parquet(
        "data/clean/user_active_deltas.parquet")[[
            "user", "days_since_first_active", "days_active"
        ]]
    users_to_analyze = activity.loc[activity["days_since_first_active"] >= 7]
    flows = flows.merge(users_to_analyze, on="user", how="inner")

    _save_heatmap(
        flows,
        "User (Sorted by Total GB)",
        "Category (Sorted by Total GB)",
        "logGB:Q",
        "log(Total GB)",
        "renders/users_per_category.png",
    )

    # Normalize by each user's total spend to highlight categories
    per_user_total = user_totals[["user", "bytes_total"]].rename(
        columns={"bytes_total": "user_total_bytes"})
    normalized_user_flows = flows.copy().merge(per_user_total, on="user")
    normalized_user_flows["user_total_bytes"] = (
        normalized_user_flows["user_total_bytes"] / 1000**3)
    normalized_user_flows["normalized_bytes"] = (
        normalized_user_flows["GB"]
        / normalized_user_flows["user_total_bytes"])

    _save_heatmap(
        normalized_user_flows,
        "User (Sorted by Total GB)",
        "Category (Sorted by Total GB)",
        "normalized_bytes:Q",
        "Normalized (Per User) Traffic",
        "renders/users_per_category_normalized_user_total.png",
    )

    # Normalize by each user's time in network to better compare users.
    # The new columns are added to the merged flows frame itself (no copy).
    flows["MB_per_day"] = flows["GB"] * 1000 / flows["days_active"]
    flows["log_MB_per_day"] = flows["MB_per_day"].transform(np.log10)

    _save_heatmap(
        flows,
        "User (Sorted by Total)",
        "Category (Sorted by Total)",
        "log_MB_per_day:Q",
        "MB per Day (Log Transformed)",
        "renders/users_per_category_normalized_time.png",
    )
x, reference_feature, grid_resolution=50) ice_chart_data = pd.DataFrame(ice_values.T) ice_chart_data.columns = [str(c) for c in ice_chart_data.columns] ice_chart_data["index"] = ice_grid # ALE ale_grid, ale_values, feature_type = ale(model_fit, x, reference_feature) ale_chart_data = pd.DataFrame({"x": ale_grid, "ALE": ale_values}) col1, col2 = st.beta_columns(2) with col1: st.write( alt.Chart(pdp_chart_data, title="PDP").mark_line().encode( x=alt.X("x", title=pretty_feature_name), y=alt.Y("PD", title="")).configure_title( fontSize=18).properties(width=450)) with col2: base = alt.Chart(title="ICE").mark_line().encode( x=alt.X("index", title=pretty_feature_name)) st.write( alt.layer(*[ base.encode(y=alt.Y(col, title="")) for col in ice_chart_data.columns if col != "index" ], data=ice_chart_data).configure_title( fontSize=18).properties(width=450)) st.write( alt.Chart(ale_chart_data, title="ALE").mark_line().encode(
"""
Connected Scatterplot (Lines with Custom Paths)
-----------------------------------------------

This example shows how layering can be used to build a plot.
This dataset tracks miles driven per capita along with gas prices annually from 1956 to 2010.
It is based on the May 2, 2010 New York Times article 'Driving Shifts Into Reverse'.
See http://mbostock.github.io/protovis/ex/driving.html .
"""
import altair as alt
from vega_datasets import data

driving = data.driving()

# Both layers share the same data and the same non-zero-based x/y encodings.
base = alt.Chart(driving).encode(
    x=alt.X('miles', scale=alt.Scale(zero=False)),
    y=alt.Y('gas', scale=alt.Scale(zero=False)),
)

# The line is connected in order of 'year' instead of along the x axis.
lines = base.mark_line().encode(order='year')
points = base.mark_circle()

lines + points
def plot(self):
    """Build the CO2-emission chart: a detail view above a brushable overview.

    The upper chart shows the emission lines, a rule at ``self.now`` and five
    coloured bands between consecutive ``self.quintiles`` values. Its x domain
    follows the interval selected in the lower overview chart, which initially
    spans 12 hours either side of ``self.now_utc_int`` (capped at the latest
    ``Minutes5UTC`` value).

    Returns the vertically concatenated, configured Altair chart.
    """
    frame = self.df
    now_ms = self.now_utc_int
    half_window_ms = 3600 * 1000 * 12

    window_start = now_ms - half_window_ms
    latest_ms = self.df.Minutes5UTC.astype(int).max() / 1000000
    window_end = min(now_ms + half_window_ms, latest_ms)

    # Convert np.int64 to int to ensure that result is JSON serializable
    height = max(250, int(frame.CO2Emission.max()) + 25)

    today = pd.DataFrame({
        'x': [self.now, self.now],
        'y': [0, self.quintiles[-1]],
    })

    # One single-row frame per band between consecutive quintile values.
    rects = []
    for i in range(5):
        rects.append(
            pd.DataFrame({
                'x': [self.min_time],
                'y': [self.quintiles[i]],
                'x2': [self.max_time],
                'y2': [self.quintiles[i + 1]],
            }))

    interval = alt.selection_interval(
        encodings=['x'],
        init={'x': [int(window_start), int(window_end)]})

    base = alt.Chart(frame).mark_line(strokeWidth=4).encode(
        alt.X('Minutes5DK:T', title=''),
        alt.Y('CO2Emission:Q',
              title='Udledningsintensitet [g CO2/kWh]',
              scale=alt.Scale(domain=(0, height))),
        alt.Color('Type:N'),
    )

    # "Now" marker: plain for the overview, brush-linked for the detail view.
    today_line = alt.Chart(today).mark_rule(clip=True).encode(x='x:T', y='y:Q')
    today_chart = alt.Chart(today).mark_rule(clip=True).encode(
        x=alt.X('x:T', scale=alt.Scale(domain=interval.ref())),
        y='y:Q')

    opacity = 0.15

    def make_rect(data, color):
        band = alt.Chart(data).mark_rect(color=color, opacity=opacity)
        return band.encode(
            x=alt.X('x:T', scale=alt.Scale(domain=interval.ref())),
            x2='x2:T',
            y='y:Q',
            y2='y2:Q')

    band_colors = ['green', 'lightgreen', 'yellow', 'lightcoral', 'red']
    rect_charts = [
        make_rect(data, color) for data, color in zip(rects, band_colors)
    ]

    # Layer the five bands left to right, as repeated `+` would.
    combined_rect_chart = rect_charts[0]
    for band_chart in rect_charts[1:5]:
        combined_rect_chart = combined_rect_chart + band_chart

    top = base.properties(width='container', height=300).encode(
        x=alt.X('Minutes5DK:T',
                axis=alt.Axis(format='%H'),
                title='',
                scale=alt.Scale(domain=interval.ref())))
    chart = top + today_chart + combined_rect_chart

    # The overview carries the interval selection that drives the x domains.
    view = base.properties(width='container', height=50,
                           selection=interval).encode(
                               y=alt.Y('CO2Emission:Q',
                                       title='',
                                       scale=alt.Scale(domain=(0, height))))

    full_chart = chart & (view + today_line)

    with_axes = full_chart.configure_axis(titleX=-25,
                                          titleY=-20,
                                          titleAlign='left',
                                          titleAngle=0,
                                          titleFont='Inter Regular',
                                          titleFontWeight='normal',
                                          titleFontSize=13)
    return with_axes.configure_legend(title=None,
                                      orient='top-right',
                                      labelFont='Inter Regular',
                                      labelFontSize=12)
def criar_histograma(coluna, df):
    """Return an interactive, 600px-wide histogram of column ``coluna`` of ``df``.

    The x axis bins ``coluna``, the y axis counts records, and the tooltip
    shows both.
    """
    bars = alt.Chart(df, width=600).mark_bar()
    histogram = bars.encode(
        x=alt.X(coluna, bin=True),
        y='count()',
        tooltip=[coluna, 'count()'],
    )
    return histogram.interactive()
def state_length_vs_timestep_sim(self, chart_type='bar', background_color='#abb2bf', lower_bound=25, upper_bound=200):
    '''
    creates interactive charts showing distributions of on and off times for simulated data

    Clicking bars in the left-hand timestep chart selects the timesteps
    shown in the two detail charts (bit == 0 on top, bit == 1 below).
    The combined chart is displayed; nothing is returned.

    chart types:
        bar: bar chart (histogram)
        area: area chart (filled line chart)

    background_color: hex code for chart background color, ex:
        #abb2bf - grey
        #ffffff - white

    lower_bound, upper_bound: only rows with
        lower_bound < duration < upper_bound appear in the detail charts

    raises ValueError if chart_type (after stripping whitespace and
    lower-casing) is neither 'bar' nor 'area'
    '''
    # Validate first: previously an unknown type was only printed and the
    # function then failed on the unbound `detail_chart`.
    chart_type = chart_type.strip().lower()
    if chart_type not in ('bar', 'area'):
        raise ValueError(f'unsupported chart type {chart_type}')

    full_sim_df = self.full_sim_df

    sel_timestep = alt.selection_multi(encodings=['y'])

    bar_chart = alt.Chart(
        full_sim_df,
        height=800,
        width=250
    ).mark_bar(
    ).encode(
        alt.X(
            'max(duration):Q',
            title='max packet duration',
            scale=alt.Scale(type='log')
        ),
        alt.Y(
            'timestep:N',
        ),
        color=alt.condition(
            sel_timestep,
            'timestep:N',
            alt.value('#96989b'),
            legend=None
        ),
        tooltip=[
            alt.Tooltip('timestep:N'),
            alt.Tooltip('duration:Q', aggregate='max')
        ]
    ).add_selection(
        sel_timestep
    )

    #---------- histograms ----------
    # The bar and area variants differ only in the mark, so build one chart.
    detail_base = alt.Chart(
        full_sim_df,
        height=375,
        width=800
    )
    if chart_type == 'bar':
        detail_marked = detail_base.mark_bar(opacity=0.5)
    else:
        detail_marked = detail_base.mark_area(opacity=0.5)

    detail_chart = detail_marked.transform_filter(
        sel_timestep
    ).transform_filter(
        datum.duration > lower_bound
    ).transform_filter(
        datum.duration < upper_bound
    ).encode(
        alt.X('duration:Q'),
        alt.Y('frequency:Q'),
        color=alt.Color('timestep:N', legend=None),
        tooltip=[
            alt.Tooltip('duration:Q'),
            alt.Tooltip('frequency:Q'),
            alt.Tooltip('timestep:N'),
            alt.Tooltip('bit:N'),
        ]
    )

    stacked_bit_details = alt.vconcat(
        detail_chart.transform_filter(datum.bit == 0).properties(title='off time distributions'),
        detail_chart.transform_filter(datum.bit == 1).properties(title='on time distributions'),
    )

    full = alt.hconcat(
        bar_chart,
        stacked_bit_details,
        background=background_color
    )

    display(full)
def state_length_vs_timestep_real(self, chart_type='bar', background_color='#abb2bf', lower_bound=25, upper_bound=2500):
    '''
    creates interactive charts showing distributions of on and off times for real data

    Frequencies in self.full_real_df are summed per (bit, duration) and
    shown as two stacked charts (bit == 0 on top, bit == 1 below).
    The combined chart is displayed; nothing is returned.

    chart types:
        bar: bar chart (histogram)
        area: area chart (filled line chart)

    background_color: hex code for chart background color, ex:
        #abb2bf - grey
        #ffffff - white

    lower_bound, upper_bound: only rows with
        lower_bound < duration < upper_bound are charted

    raises ValueError if chart_type (after stripping whitespace and
    lower-casing) is neither 'bar' nor 'area'
    '''
    # Validate first: previously an unknown type was only printed and the
    # function then failed on the unbound `detail_chart`.
    chart_type = chart_type.strip().lower()
    if chart_type not in ('bar', 'area'):
        raise ValueError(f'unsupported chart type {chart_type}')

    full_real_df = (
        self
        .full_real_df
        .groupby(['bit', 'duration'])
        .frequency
        .sum()
        .reset_index()
    )

    # The bar and area variants differ only in the mark, so build one chart.
    detail_base = alt.Chart(
        full_real_df,
        height=375,
        width=800
    )
    if chart_type == 'bar':
        detail_marked = detail_base.mark_bar(opacity=0.5)
    else:
        detail_marked = detail_base.mark_area(opacity=0.5)

    detail_chart = detail_marked.transform_filter(
        datum.duration > lower_bound
    ).transform_filter(
        datum.duration < upper_bound
    ).encode(
        alt.X('duration:Q'),
        alt.Y('frequency:Q'),
        tooltip=[
            alt.Tooltip('duration:Q'),
            alt.Tooltip('frequency:Q'),
            alt.Tooltip('bit:N'),
        ]
    )

    stacked_bit_details = alt.vconcat(
        detail_chart.transform_filter(datum.bit == 0).properties(title='off time distributions'),
        detail_chart.transform_filter(datum.bit == 1).properties(title='on time distributions'),
        background=background_color
    )

    display(stacked_bit_details)
}, { "x": 15, "y": 17 }, { "x": 16, "y": 27 }, { "x": 17, "y": 68 }, { "x": 18, "y": 16 }, { "x": 19, "y": 49 }, { "x": 20, "y": 15 }]) area1 = alt.Chart(df).mark_area(clip=True, interpolate='monotone').encode( alt.X('x', scale=alt.Scale(zero=False, nice=False)), alt.Y('y', scale=alt.Scale(domain=[0, 50]), axis=alt.Axis(title='y')), opacity=alt.value(0.6)).properties(width=500, height=75) area2 = area1.encode(alt.Y( 'ny:Q', scale=alt.Scale(domain=[0, 50]))).transform_calculate("ny", datum.y - 50) area1 + area2
def create_graph():
    """Build two linked network charts (authors and papers) and return them as JSON.

    Three tables are fetched through ``queryDB`` ('author', 'paper' and
    'author_paper'). The first graph links authors that share a paper, the
    second links papers that share an author. Each graph is laid out with
    ``nx.spring_layout`` and drawn as an Altair layer of edges, grey
    background points and selectable points. Both charts use the same
    single selection on 'ID_author'.

    Returns the horizontally concatenated chart serialised with ``to_json()``.
    """
    # remove Altair's maximum row limit for plotting
    alt.data_transformers.disable_max_rows()

    # query the database
    autores = queryDB('author', ['ID_author','author'])
    artigos = queryDB('paper', ['ID_paper','paper'])
    author_paper = queryDB('author_paper', ['ID_paper','ID_author'])

    # IDs are cast to str before being merged with the node tables below
    autores['ID_author'] = autores['ID_author'].astype(str)
    artigos['ID_paper'] = artigos['ID_paper'].astype(str)

    ### render the charts
    ## Graph 1 - Authors
    print('Preparando grafo dos autores...')
    graph = nx.Graph()
    # dataframe with columns: paper and [list_of_authors]
    group = pd.DataFrame(author_paper.groupby('ID_paper')['ID_author'].apply(list))

    # Adding edges: every pair of authors of the same paper is connected.
    # The inner range starts at i, so each author is also linked to itself.
    for j,row in group.iterrows():
        i=len(row['ID_author'])  # overwritten by the loop variable right below
        for i in range(len(row['ID_author'])):
            for k in range(i,len(row['ID_author'])):
                graph.add_edge(row['ID_author'][i], row['ID_author'][k])

    # NOTE(review): networkx documents `weight` as an edge-attribute name;
    # the numeric 0.1 passed here looks unintended -- confirm.
    pos = nx.spring_layout(graph,k=0.2, iterations=50, weight=0.1, center=(0.5,0.5)) # forces graph layout
    # collecting nodes
    nodes = to_pandas_nodes(graph,pos)
    nodes.reset_index(inplace=True)
    nodes.rename(columns={'index':'ID_author'}, inplace=True)
    nodes = pd.merge(nodes,autores,on='ID_author') # collecting author names
    nodes = pd.merge(nodes,author_paper, on='ID_author') # collecting ID_paper

    # collecting edges
    edges = to_pandas_edges(graph,pos)

    # Chart 1
    print('Criando interatividade com o Altair (autores) ...')
    selector = alt.selection_single(empty='all',fields=['ID_author']) # initialising the selector

    # Selectable author points; only points matching the selection are kept.
    points = alt.Chart(nodes).add_selection(selector).mark_point(filled=True,size=90).encode(
        alt.X('x', axis=alt.Axis(title='')),
        alt.Y('y', axis=alt.Axis(title='')),
        tooltip='author',
        opacity=alt.condition(selector,alt.value(0.95),alt.value(0.4),legend=None),
        color=alt.condition(selector, 'ID_author', alt.value('lightgray'), legend=None)
    ).properties(
        selection=selector
    ).transform_filter(selector)

    # creates a background layer for the selector's transition effects
    bk = alt.Chart(nodes).mark_point(color='lightgray',filled=True,size=90).encode(
        alt.X('x', axis=alt.Axis(title='')),
        alt.Y('y', axis=alt.Axis(title='')),
        tooltip='author',
        opacity=alt.value(0.4),
    )

    lines = alt.Chart(edges).mark_line(color='salmon').encode(
        alt.X('x', axis=alt.Axis(title='')),
        alt.Y('y', axis=alt.Axis(title='')),
        detail='edge',
        opacity=alt.value(0.15)
    )

    chart = alt.LayerChart(layer=(lines,bk+points)).properties(
        height=350,
        width=450
    ).interactive()

    ## Graph 2 - Papers
    print('Preparando grafo dos artigos...')
    graph1 = nx.Graph()
    # dataframe with one row per author holding the list of that author's papers
    group1 = pd.DataFrame(author_paper.groupby('ID_author')['ID_paper'].apply(list))

    # Adding edges: every pair of papers by the same author is connected
    # (again including each paper with itself).
    for j,row in group1.iterrows():
        i=len(row['ID_paper'])  # overwritten by the loop variable right below
        for i in range(len(row['ID_paper'])):
            for k in range(i,len(row['ID_paper'])):
                graph1.add_edge(row['ID_paper'][i], row['ID_paper'][k])

    pos1 = nx.spring_layout(graph1,k=0.2, iterations=50, weight=0.1, center=(0.5,0.5)) # forces graph layout

    # collecting nodes
    nodes1 = to_pandas_nodes(graph1, pos1)
    nodes1.reset_index(inplace=True)
    nodes1.rename(columns={'index':'ID_paper'}, inplace=True)
    nodes1 = pd.merge(nodes1,artigos,on='ID_paper') # collecting paper names
    nodes1 = pd.merge(nodes1,author_paper,on='ID_paper') # collecting ID_author

    # collecting edges
    edges1 = to_pandas_edges(graph1,pos1)

    # Chart 2
    print('Criando interatividade com o Altair (artigos)...')
    # Reuses the author selector, so this chart is filtered and coloured by ID_author.
    points1 = alt.Chart(nodes1).add_selection(selector).mark_point(filled=True,size=90).encode(
        alt.X('x', axis=alt.Axis(title='')),
        alt.Y('y', axis=alt.Axis(title='')),
        tooltip='paper',
        opacity=alt.condition(selector,alt.value(0.95),alt.value(0.4),legend=None),
        color=alt.condition(selector, 'ID_author', alt.value('lightgray'), legend=None)
    ).transform_filter(selector)

    # creates a background layer for the selector's transition effects
    bk1 = alt.Chart(nodes1).mark_point(color='lightgray',filled=True,size=90).encode(
        alt.X('x', axis=alt.Axis(title='')),
        alt.Y('y', axis=alt.Axis(title='')),
        tooltip='paper',
        opacity=alt.value(0.4),
    )

    lines1 = alt.Chart(edges1).mark_line(color='lightblue').encode(
        alt.X('x', axis=alt.Axis(title='')),
        alt.Y('y', axis=alt.Axis(title='')),
        detail='edge',
        opacity=alt.value(0.2)
    )

    chart1 = alt.LayerChart(layer=(lines1,bk1 + points1)).properties(
        height=350,
        width=450
    ).interactive()

    ### Horizontally concatenating charts 1 and 2, with all axis decoration hidden
    horiz_chart = alt.hconcat(chart, chart1
    ).configure_axis(
        ticks=False,
        grid=False,
        domain=False,
        labels=False).configure_view(
            strokeWidth=0
    )

    return horiz_chart.to_json()
'country': 'United States', 'animal': 'sheep' }]) domains = ['person', 'cattle', 'pigs', 'sheep'] shape_scale = alt.Scale( domain=domains, range=[ 'M1.7 -1.7h-0.8c0.3 -0.2 0.6 -0.5 0.6 -0.9c0 -0.6 -0.4 -1 -1 -1c-0.6 0 -1 0.4 -1 1c0 0.4 0.2 0.7 0.6 0.9h-0.8c-0.4 0 -0.7 0.3 -0.7 0.6v1.9c0 0.3 0.3 0.6 0.6 0.6h0.2c0 0 0 0.1 0 0.1v1.9c0 0.3 0.2 0.6 0.3 0.6h1.3c0.2 0 0.3 -0.3 0.3 -0.6v-1.8c0 0 0 -0.1 0 -0.1h0.2c0.3 0 0.6 -0.3 0.6 -0.6v-2c0.2 -0.3 -0.1 -0.6 -0.4 -0.6z', 'M4 -2c0 0 0.9 -0.7 1.1 -0.8c0.1 -0.1 -0.1 0.5 -0.3 0.7c-0.2 0.2 1.1 1.1 1.1 1.2c0 0.2 -0.2 0.8 -0.4 0.7c-0.1 0 -0.8 -0.3 -1.3 -0.2c-0.5 0.1 -1.3 1.6 -1.5 2c-0.3 0.4 -0.6 0.4 -0.6 0.4c0 0.1 0.3 1.7 0.4 1.8c0.1 0.1 -0.4 0.1 -0.5 0c0 0 -0.6 -1.9 -0.6 -1.9c-0.1 0 -0.3 -0.1 -0.3 -0.1c0 0.1 -0.5 1.4 -0.4 1.6c0.1 0.2 0.1 0.3 0.1 0.3c0 0 -0.4 0 -0.4 0c0 0 -0.2 -0.1 -0.1 -0.3c0 -0.2 0.3 -1.7 0.3 -1.7c0 0 -2.8 -0.9 -2.9 -0.8c-0.2 0.1 -0.4 0.6 -0.4 1c0 0.4 0.5 1.9 0.5 1.9l-0.5 0l-0.6 -2l0 -0.6c0 0 -1 0.8 -1 1c0 0.2 -0.2 1.3 -0.2 1.3c0 0 0.3 0.3 0.2 0.3c0 0 -0.5 0 -0.5 0c0 0 -0.2 -0.2 -0.1 -0.4c0 -0.1 0.2 -1.6 0.2 -1.6c0 0 0.5 -0.4 0.5 -0.5c0 -0.1 0 -2.7 -0.2 -2.7c-0.1 0 -0.4 2 -0.4 2c0 0 0 0.2 -0.2 0.5c-0.1 0.4 -0.2 1.1 -0.2 1.1c0 0 -0.2 -0.1 -0.2 -0.2c0 -0.1 -0.1 -0.7 0 -0.7c0.1 -0.1 0.3 -0.8 0.4 -1.4c0 -0.6 0.2 -1.3 0.4 -1.5c0.1 -0.2 0.6 -0.4 0.6 -0.4z', 'M1.2 -2c0 0 0.7 0 1.2 0.5c0.5 0.5 0.4 0.6 0.5 0.6c0.1 0 0.7 0 0.8 0.1c0.1 0 0.2 0.2 0.2 0.2c0 0 -0.6 0.2 -0.6 0.3c0 0.1 0.4 0.9 0.6 0.9c0.1 0 0.6 0 0.6 0.1c0 0.1 0 0.7 -0.1 0.7c-0.1 0 -1.2 0.4 -1.5 0.5c-0.3 0.1 -1.1 0.5 -1.1 0.7c-0.1 0.2 0.4 1.2 0.4 1.2l-0.4 0c0 0 -0.4 -0.8 -0.4 -0.9c0 -0.1 -0.1 -0.3 -0.1 -0.3l-0.2 0l-0.5 1.3l-0.4 0c0 0 -0.1 -0.4 0 -0.6c0.1 -0.1 0.3 -0.6 0.3 -0.7c0 0 -0.8 0 -1.5 -0.1c-0.7 -0.1 -1.2 -0.3 -1.2 -0.2c0 0.1 -0.4 0.6 -0.5 0.6c0 0 0.3 0.9 0.3 0.9l-0.4 0c0 0 -0.4 -0.5 -0.4 -0.6c0 -0.1 -0.2 -0.6 -0.2 -0.5c0 0 -0.4 0.4 -0.6 0.4c-0.2 0.1 -0.4 0.1 -0.4 0.1c0 0 -0.1 0.6 -0.1 0.6l-0.5 0l0 -1c0 0 0.5 -0.4 0.5 -0.5c0 -0.1 -0.7 -1.2 -0.6 -1.4c0.1 
-0.1 0.1 -1.1 0.1 -1.1c0 0 -0.2 0.1 -0.2 0.1c0 0 0 0.9 0 1c0 0.1 -0.2 0.3 -0.3 0.3c-0.1 0 0 -0.5 0 -0.9c0 -0.4 0 -0.4 0.2 -0.6c0.2 -0.2 0.6 -0.3 0.8 -0.8c0.3 -0.5 1 -0.6 1 -0.6z', 'M-4.1 -0.5c0.2 0 0.2 0.2 0.5 0.2c0.3 0 0.3 -0.2 0.5 -0.2c0.2 0 0.2 0.2 0.4 0.2c0.2 0 0.2 -0.2 0.5 -0.2c0.2 0 0.2 0.2 0.4 0.2c0.2 0 0.2 -0.2 0.4 -0.2c0.1 0 0.2 0.2 0.4 0.1c0.2 0 0.2 -0.2 0.4 -0.3c0.1 0 0.1 -0.1 0.4 0c0.3 0 0.3 -0.4 0.6 -0.4c0.3 0 0.6 -0.3 0.7 -0.2c0.1 0.1 1.4 1 1.3 1.4c-0.1 0.4 -0.3 0.3 -0.4 0.3c-0.1 0 -0.5 -0.4 -0.7 -0.2c-0.3 0.2 -0.1 0.4 -0.2 0.6c-0.1 0.1 -0.2 0.2 -0.3 0.4c0 0.2 0.1 0.3 0 0.5c-0.1 0.2 -0.3 0.2 -0.3 0.5c0 0.3 -0.2 0.3 -0.3 0.6c-0.1 0.2 0 0.3 -0.1 0.5c-0.1 0.2 -0.1 0.2 -0.2 0.3c-0.1 0.1 0.3 1.1 0.3 1.1l-0.3 0c0 0 -0.3 -0.9 -0.3 -1c0 -0.1 -0.1 -0.2 -0.3 -0.2c-0.2 0 -0.3 0.1 -0.4 0.4c0 0.3 -0.2 0.8 -0.2 0.8l-0.3 0l0.3 -1c0 0 0.1 -0.6 -0.2 -0.5c-0.3 0.1 -0.2 -0.1 -0.4 -0.1c-0.2 -0.1 -0.3 0.1 -0.4 0c-0.2 -0.1 -0.3 0.1 -0.5 0c-0.2 -0.1 -0.1 0 -0.3 0.3c-0.2 0.3 -0.4 0.3 -0.4 0.3l0.2 1.1l-0.3 0l-0.2 -1.1c0 0 -0.4 -0.6 -0.5 -0.4c-0.1 0.3 -0.1 0.4 -0.3 0.4c-0.1 -0.1 -0.2 1.1 -0.2 1.1l-0.3 0l0.2 -1.1c0 0 -0.3 -0.1 -0.3 -0.5c0 -0.3 0.1 -0.5 0.1 -0.7c0.1 -0.2 -0.1 -1 -0.2 -1.1c-0.1 -0.2 -0.2 -0.8 -0.2 -0.8c0 0 -0.1 -0.5 0.4 -0.8z' ]) color_scale = alt.Scale(domain=domains, range=[ 'rgb(162,160,152)', 'rgb(194,81,64)', 'rgb(93,93,93)', 'rgb(91,131,149)' ]) alt.Chart(source).mark_point(filled=True, opacity=1, size=100).encode( alt.X('x:O', axis=None), alt.Y('animal:O', axis=None), alt.Row('country:N', header=alt.Header(title='')), alt.Shape('animal:N', legend=None, scale=shape_scale), alt.Color('animal:N', legend=None, scale=color_scale), ).transform_window(x='rank()', groupby=['country', 'animal']).properties(width=550, height=140)
def Plots(result, nth, w, h, start_temp=323, ramp_rate=10):
    """Build the five Altair charts for one processed TPD run.

    Parameters
    ----------
    result : indexable
        Container produced upstream; only positions 4-18 are read here:
        4 = ``x`` (plotted as "Time (min)"), 5/6/7 = y17 raw / water
        corrected / baseline corrected, 8 = y18, 9/10 = y40 raw / baseline
        corrected, 11/12 = start/stop index of the water window, 13 = the
        y17/y18 ratio plotted over that window, 14/15 = start/stop index of
        the TPD window, 16/17 = start/stop index of the Ar window,
        18 = flag (``1`` means "draw the water-correction chart").
    nth : int
        Keep every ``nth`` sample of each plotted trace.
    w, h : int
        Width and height passed to every chart.
    start_temp : number, optional
        Temperature assigned to the first sample of the TPD window
        (default 323, the value that used to be hard-coded).
    ramp_rate : number, optional
        Temperature increase per unit of ``x`` (default 10, the value that
        used to be hard-coded).

    Returns
    -------
    tuple
        ``(raw_chart, TPD_chart, Ar_chart, T_chart, W_chart)``. ``W_chart``
        is an empty ``alt.LayerChart`` when the water flag is not ``1``.
    """
    # This allows for data greater than 5000 rows to be plotted
    alt.data_transformers.disable_max_rows()

    x, y17, y17w, y17b, y18, y40, y40b = (result[i] for i in range(4, 11))
    watstart, watend, ratio1718 = result[11], result[12], result[13]
    TPDsi, TPDei, Arsi, Arei = (result[i] for i in range(14, 18))
    wf = result[18]

    def _thin(values, start=None, stop=None):
        """Cut ``values[start:stop]`` and keep every nth point."""
        return list(values[start:stop])[0::nth]

    def _long(frame):
        """Melt every non-x column into (legend, y) pairs for colouring."""
        traces = [name for name in frame.columns if name != 'x']
        return pd.melt(frame, id_vars=['x'], value_vars=traces,
                       var_name='legend', value_name='y')

    def _chart(frame, title, x_field, x_axis, y_axis, legend_orient=None):
        """Line chart shared by all panels; colour by trace when asked."""
        channels = [alt.X(x_field, axis=x_axis), alt.Y('y', axis=y_axis)]
        if legend_orient is not None:
            channels.append(
                alt.Color('legend', legend=alt.Legend(orient=legend_orient)))
        return alt.Chart(frame).mark_line(size=3).encode(
            *channels).configure_axis(grid=False).properties(
                width=w, height=h, title=title)

    # Temperature axis for the TPD window. The arithmetic is done element by
    # element: the previous `list - scalar` form only worked when the list
    # happened to hold NumPy scalars, and indexed [0] on an empty window.
    tpd_times = list(x[TPDsi:TPDei])
    if tpd_times:
        t0 = tpd_times[0]
        temp = [start_temp + (t - t0) * ramp_rate for t in tpd_times][0::nth]
    else:
        temp = []

    time_axis = alt.Axis(title='Time (min)')
    counts_axis = alt.Axis(title='Intensity (counts)')

    # Raw data: the complete traces, thinned.
    raw_data = pd.DataFrame({
        'x': _thin(x),
        'y17': _thin(y17),
        'y18': _thin(y18),
        'y40': _thin(y40),
    })
    raw_chart = _chart(
        _long(raw_data), 'Raw Data from CSV', 'x',
        alt.Axis(tickCount=7, title='Time (min)'),
        alt.Axis(tickCount=7, title='Intensity (counts)'),
        legend_orient='top-left').interactive()

    # TPD window: raw y17 next to its two corrections, plus y18.
    y17bt = _thin(y17b, TPDsi, TPDei)
    TPD_data = pd.DataFrame({
        'x': _thin(x, TPDsi, TPDei),
        'y17': _thin(y17, TPDsi, TPDei),
        'y17 water corrected': _thin(y17w, TPDsi, TPDei),
        'y17 baseline corrected': y17bt,
        'y18': _thin(y18, TPDsi, TPDei),
    })
    TPD_chart = _chart(_long(TPD_data),
                       'TPD: y17 Water and Baseline Correction ', 'x',
                       time_axis, counts_axis, legend_orient='top-right')

    # Ar window: raw and baseline-corrected y40.
    Ar_data = pd.DataFrame({
        'x': _thin(x, Arsi, Arei),
        'y40': _thin(y40, Arsi, Arei),
        'y40 baseline corrected': _thin(y40b, Arsi, Arei),
    })
    Ar_chart = _chart(_long(Ar_data), 'Ar Pulse', 'x', time_axis,
                      counts_axis, legend_orient='top-right')

    # Baseline-corrected y17 against the derived temperature axis.
    T_data = pd.DataFrame({'T': temp, 'y': y17bt})
    T_chart = _chart(T_data, 'TPD as a Function of Temperature', 'T',
                     alt.Axis(title='Temp (K)'), counts_axis)

    if wf == 1:
        # The water window is plotted at full resolution (not thinned).
        W_data = pd.DataFrame({'x': x[watstart:watend], 'y': ratio1718})
        W_chart = _chart(W_data, 'Water Correction Factor', 'x', time_axis,
                         alt.Axis(title='Value of y17/y18 Preceeding TPD'))
    else:
        W_chart = alt.LayerChart()

    return (raw_chart, TPD_chart, Ar_chart, T_chart, W_chart)
"""
Normalized Stacked Bar Chart
----------------------------
This example shows how to make a normalized stacked bar chart.
"""
import altair as alt
from altair.expr import datum, if_
from vega_datasets import data

source = data.population.url

# One bar per age group, each 17px wide.
age_channel = alt.X('age:O', scale=alt.Scale(rangeStep=17))

# Stack the head-counts and rescale every bar to the same total height.
share_channel = alt.Y(
    'sum(people):Q',
    axis=alt.Axis(title='population'),
    stack='normalize',
)

# 'gender' does not exist in the data; it is derived from 'sex' below.
gender_channel = alt.Color(
    'gender:N',
    scale=alt.Scale(range=["#EA98D2", "#659CCA"]),
)

only_2000 = datum.year == 2000
gender_label = if_(datum.sex == 2, 'Female', 'Male')

# Kept as a bare expression so the chart is the value of the script.
alt.Chart(source).mark_bar().encode(
    age_channel,
    share_channel,
    gender_channel,
).transform_filter(
    only_2000
).transform_calculate(
    "gender", gender_label
)
def make_org_plot(infile):
    """Render user-by-organization traffic heatmaps to ``renders/``.

    Reads the flow table at ``infile`` (parquet; must provide ``org``,
    ``user``, ``bytes_up`` and ``bytes_down`` after ``reset_index``), folds
    organizations reached by fewer than five users into one aggregate
    bucket, keeps only users first seen at least seven days ago, and saves
    two PNG heatmaps:

    * ``renders/users_per_category_org.png`` - log10 of GB per (user, org)
    * ``renders/users_per_category_org_normalized.png`` - each user's GB
      per org divided by that user's total GB

    Parameters
    ----------
    infile : str
        Path of the grouped-flows parquet file.
    """
    aggregated_label = "Aggregated (Users < 5)"

    def _save_heatmap(frame, color_field, color_title, outfile):
        """Save one user x org rect heatmap coloured by ``color_field``."""
        alt.Chart(frame).mark_rect().encode(
            x=alt.X(
                "user:N",
                title="User (Sorted by Total GB)",
                axis=alt.Axis(labels=False),
                sort=user_sort_list,
            ),
            y=alt.Y(
                "org:N",
                title="Organization (Sorted by Total GB)",
                sort=cat_sort_list,
            ),
            color=alt.Color(
                color_field + ":Q",
                title=color_title,
                scale=alt.Scale(scheme="viridis"),
            ),
        ).properties(width=500).save(outfile, scale_factor=2)

    pd.set_option('display.max_columns', None)
    grouped_flows = infra.pd.read_parquet(infile)
    grouped_flows = grouped_flows.reset_index()
    grouped_flows["bytes_total"] = (
        grouped_flows["bytes_up"] + grouped_flows["bytes_down"])

    # If any orgs are visited by fewer than 5 participants, need to be
    # "other" per IRB. Zero-byte rows do not count as a visit. A boolean
    # mask is used because `set_index(...).drop(0)` raises KeyError when no
    # row has exactly zero bytes.
    visited = grouped_flows.loc[grouped_flows["bytes_total"] != 0]
    users_per_org = visited.groupby("org")["user"].nunique()
    small_orgs = users_per_org.index[users_per_org < 5]
    if not small_orgs.empty:
        # Relabel in the org column only, so a value in another column that
        # happens to equal an org name is left untouched.
        grouped_flows["org"] = grouped_flows["org"].replace(
            list(small_orgs), aggregated_label)

    # Filter users by time in network to eliminate early incomplete samples
    user_active_ranges = infra.pd.read_parquet(
        "data/clean/user_active_deltas.parquet")[[
            "user", "days_since_first_active", "days_active"
        ]]
    # Drop users that joined less than a week ago.
    # NOTE(review): an earlier comment also promised a filter on
    # `days_active`, but only `days_since_first_active` has ever been
    # checked here - confirm which one is intended.
    users_to_analyze = user_active_ranges.loc[
        user_active_ranges["days_since_first_active"] >= 7]
    grouped_flows = grouped_flows.merge(users_to_analyze, on="user",
                                        how="inner")
    print(user_active_ranges.head(10))

    # Figure out sorting order by total amount. Only bytes_total is summed
    # so the string and day-count columns are never aggregated.
    org_totals = grouped_flows.groupby("org")["bytes_total"].sum()
    cat_sort_list = org_totals.sort_values(ascending=False).index.tolist()

    user_totals = grouped_flows.groupby("user")["bytes_total"].sum()
    user_sort_list = user_totals.sort_values(ascending=False).index.tolist()

    grouped_flows["GB"] = grouped_flows["bytes_total"] / (1000**3)
    grouped_flows = grouped_flows[["org", "user", "GB"]].groupby(
        ["user", "org"]).sum()
    grouped_flows = grouped_flows.reset_index()
    grouped_flows["logGB"] = grouped_flows["GB"].transform(np.log10)

    _save_heatmap(grouped_flows, "logGB", "log(Total GB)",
                  "renders/users_per_category_org.png")

    # Normalize by each user's total spend to highlight categories
    user_total_to_merge = user_totals.rename("user_total_bytes").reset_index()
    normalized_user_flows = grouped_flows.copy()
    normalized_user_flows = normalized_user_flows.merge(user_total_to_merge,
                                                        on="user")
    # Convert the per-user total to GB so the ratio below is unit-free.
    normalized_user_flows["user_total_bytes"] = (
        normalized_user_flows["user_total_bytes"] / 1000**3)
    normalized_user_flows["normalized_bytes"] = (
        normalized_user_flows["GB"] /
        normalized_user_flows["user_total_bytes"])

    _save_heatmap(normalized_user_flows, "normalized_bytes",
                  "Normalized (Per User) Traffic",
                  "renders/users_per_category_org_normalized.png")
st.write( """Customize the x and y axis through the sidebar visualization settings. You can also select binary features as labels which will be in the form of a color.""") select_graph = st.sidebar.radio('Select Graph', ('point', 'bar', 'area', 'line')) col1, col2, col3 = st.beta_columns([.5, .5, 1]) graph_hgt = col1.slider('Height', 200, 600, 400, step=10) graph_wgt = col2.slider('Width', 400, 800, 600, step=10) df = df.loc[(df.creatinine_phosphokinase < 800) & (df.platelets < 500000) & (df.serum_creatinine < 2.2) & (df.age >= 40)] chart = alt.Chart(data=df, mark=select_graph).encode( alt.X(x_axis, scale=alt.Scale(zero=False)), alt.Y(y_axis, scale=alt.Scale(zero=False)), color=label).properties(height=graph_hgt, width=graph_wgt) st.write(chart) if y_axis == 'age' and x_axis == 'platelets' and label == 'DEATH_EVENT': st.write( 'Majority of deceased patients had platelet count ranging from 150,000 - 300,000 and aged 58 - 75' ) elif y_axis == 'age' and x_axis == 'creatinine_phosphokinase' and label == 'DEATH_EVENT': st.write( 'Majority of deceased patients had creatinine phosphokinase count ranging from 100 - 250 and aged 55 - 70' ) elif y_axis == 'age' and x_axis == 'serum_creatinine' and label == 'DEATH_EVENT': st.write( 'Majority of deceased patients had serum creatinine count ranging from 1.2 - 1.9 and aged 50 - 75'
def graph_spinorama(dfu, graph_params):
    """Build the layered spinorama chart (SPL curves plus DI curves).

    Parameters
    ----------
    dfu : data accepted by ``alt.Chart``
        Long-form data; the encodings below read the fields
        ``Measurements``, ``Freq`` and ``dB``.
    graph_params : dict
        Must contain ``xmin``, ``xmax``, ``ymin``, ``ymax``, ``width`` and
        ``height``.

    Returns
    -------
    The layered chart, with the legend selection, the zoom/pan selection and
    the ``nearest`` selection attached.

    Notes
    -----
    ``nearest`` is not created in this function; it is read from the
    enclosing scope (presumably a module-level hover selection - verify).
    A degenerate axis range is only logged, the chart is still built.
    """
    xmin = graph_params['xmin']
    xmax = graph_params['xmax']
    ymin = graph_params['ymin']
    ymax = graph_params['ymax']
    if xmax == xmin:
        logging.error('Graph configuration is incorrect: xmin==xmax')
    if ymax == ymin:
        logging.error('Graph configuration is incorrect: ymin==ymax')

    # add selectors
    selectorsMeasurements = alt.selection_multi(fields=['Measurements'],
                                                bind='legend')
    scales = alt.selection_interval(bind='scales')

    # main charts
    xaxis = alt.X('Freq:Q',
                  title='Frequency (Hz)',
                  scale=alt.Scale(type='log',
                                  base=10,
                                  nice=False,
                                  domain=[xmin, xmax]),
                  axis=alt.Axis(format='s'))
    yaxis = alt.Y('dB:Q',
                  title='Sound Pressure (dB)',
                  scale=alt.Scale(zero=False, domain=[ymin, ymax]))
    # The DI axis covers the same number of dB as the SPL axis
    # (ymax - ymin), shifted so that it starts at -5.
    di_yaxis = alt.Y('dB:Q',
                     title='Sound Pressure DI (dB)',
                     scale=alt.Scale(zero=False,
                                     domain=[-5, ymax - ymin - 5]))
    color = alt.Color('Measurements', type='nominal', sort=None)
    opacity = alt.condition(selectorsMeasurements, alt.value(1),
                            alt.value(0.2))

    spl_curves = [
        'On Axis', 'Listening Window', 'Early Reflections', 'Sound Power'
    ]
    di_curves = ['Early Reflections DI', 'Sound Power DI']

    def _curves(names, y):
        """Lines for the named measurements, dimmed by the legend."""
        return alt.Chart(dfu).mark_line().transform_filter(
            alt.FieldOneOfPredicate(field='Measurements',
                                    oneOf=names)).encode(x=xaxis,
                                                         y=y,
                                                         color=color,
                                                         opacity=opacity)

    def _hover_points(names, y):
        """Markers that are visible only for the `nearest` selection."""
        return alt.Chart(dfu).mark_circle(size=100).transform_filter(
            alt.FieldOneOfPredicate(field='Measurements',
                                    oneOf=names)).encode(
                                        x=xaxis,
                                        y=y,
                                        color=color,
                                        opacity=alt.condition(
                                            nearest, alt.value(1),
                                            alt.value(0)),
                                        tooltip=['Measurements', 'Freq',
                                                 'dB'])

    line = _curves(spl_curves, yaxis)
    circle = _hover_points(spl_curves, yaxis)
    di = _curves(di_curves, di_yaxis)
    circle_di = _hover_points(di_curves, di_yaxis)

    # assemble elements together; the two groups keep independent y scales
    spin = alt.layer(circle + line, circle_di + di).resolve_scale(
        y='independent').add_selection(selectorsMeasurements).add_selection(
            scales).add_selection(nearest).properties(
                width=graph_params['width'], height=graph_params['height'])
    return spin
def main():
    """Run the "ACL Publications Explorer" Streamlit page.

    Loads the paper table, the sentence-embedding model, the Faiss index and
    the author table, draws the sidebar filters, narrows the papers to the
    active filters, and renders a scatter plot over a bar chart followed by
    the static help / about / methods text.

    Filters combine with logical AND:

    * publication year range (always applied; ``data.year`` is compared
      against the slider bounds converted to ``str``),
    * selected Fields of Study (``data.name``), which also switches on
      colouring by field,
    * title search: ids returned by ``vector_search``,
    * author search: exact match on ``author_data.name``.
    """
    data = read_data()
    fos_level = unique_fos_level(data)
    model = load_bert_model()
    faiss_index = faiss.deserialize_index(load_faiss_index())
    author_data = read_author_data()

    st.title("ACL Publications Explorer")

    filter_year = st.sidebar.slider("Filter by year", 2000, 2020,
                                    (2000, 2020), 1)
    filter_fos_level = st.sidebar.selectbox("Choose Field of Study level",
                                            fos_level)
    fields_of_study = unique_fos(data, filter_fos_level, 25)
    filter_fos = st.sidebar.multiselect("Choose Fields of Study",
                                        fields_of_study)
    author_input = st.sidebar.text_input("Search by author name")

    # User search
    user_input = st.sidebar.text_area("Search by paper title")
    num_results = st.sidebar.slider("Number of search results", 10, 150, 10)

    # Start from the year range and AND in one mask per active filter.
    keep = ((data.year >= str(filter_year[0])) &
            (data.year <= str(filter_year[1])))
    if filter_fos:
        keep = keep & data.name.isin(filter_fos)
    if user_input:
        # NOTE(review): with an author but no Field of Study the search has
        # always asked for 150 hits and ignored the slider; every other
        # combination honours the slider. Preserved as-is - confirm whether
        # the asymmetry is intended.
        if author_input and not filter_fos:
            search_size = 150
        else:
            search_size = num_results
        encoded_user_input = vector_search([user_input], model, faiss_index,
                                           search_size)
        keep = keep & data.id.isin(encoded_user_input)
    if author_input:
        ids = author_data[author_data.name == author_input]['paper_id']
        keep = keep & data.id.isin(ids)
    frame = data[keep]
    color_on_fos = bool(filter_fos)

    # The scatter is identical in both modes except for the colour channel.
    channels = [
        alt.X('Component 1', scale=alt.Scale(domain=(1, 16))),
        alt.Y('Component 2', scale=alt.Scale(domain=(0, 18))),
    ]
    if color_on_fos:
        channels.append(alt.Color('name', title='Field of Study'))
    channels.append(
        alt.Size('citations',
                 scale=alt.Scale(range=[10, 500]),
                 title='Citations'))
    chart = alt.Chart(frame.drop_duplicates('id')).mark_point().encode(
        *channels, href='source:N',
        tooltip=['title', 'year']).interactive().properties(width=650,
                                                            height=500)

    # Thirty most frequent fields (at the chosen level) among shown papers.
    bar_data = pd.DataFrame(frame[frame.level == filter_fos_level].groupby(
        'name')['id'].count()).reset_index().sort_values(
            'id', ascending=False)[:30]
    barchart = alt.Chart(bar_data).mark_bar().encode(
        alt.X('name', sort='-y', title='Fields of Study'),
        alt.Y('id', title='Count')).properties(width=650, height=150)

    c = (chart & barchart)
    st.altair_chart(c, use_container_width=True)

    # The three texts below are emitted exactly as stored in the file.
    st.subheader("How to use this app")
    st.write(""" This application is intended for the visual exploration and discovery of research publications that have been presented at the ACL (Annual Meeting of the Association for Computational Linguistics). Every particle in the scatterplot is an academic publication. The particles are positioned in space based on the semantic similarity of the paper titles; the closer two points are, the more semantically similar their titles. You can hover over the particles to read their titles and you can click them to be redirected to the original source. You can zoom in the visualisation by scrolling and you can reset the view by double clicking the white space within the figure. Regarding the bar chart, it shows the most used Fields of Study for the papers shown in the scatterplot. You can also **search** for publications by paper titles (more information below). #### Filters You can refine your query based on the publication year, paper content, field of study and author. You can also combine any of the filter for more granular searches. - **Filter by year**: Select a time range for the papers. For example, drag both sliders to 2020 to find out the papers that will be presented at ACL 2020. - **Field of Study level**: Microsoft Academic Graph uses a 6-level hierarchy where level 0 contains high level disciplines such as Computer science and level 5 contains the most granular paper keywords. This filter will change what's shown in the bar chart as well as the available options in the filter below. - ** Fields of Study**: Select the Fields of Study to be displayed in the visualisations. The available options are affected by your selection in the above filter. - **Search by author name**: Find an author's publications. **Note**: You need to type in the exact name. - **Search by paper title**: Type in a paper title and find its most relevant relevant publications. 
You should use at least a sentence to receive meaningful results. - **Number of search results**: Specify the number of papers to be returned when you search by paper title. """)

    st.subheader("About")
    st.write(""" I am [Kostas](http://kstathou.github.io/) and I work at the intersection of knowledge discovery, data engineering and scientometrics. I am a Mozilla Open Science Fellow and a Principal Data Science Researcher at Nesta. I am currently working on [Orion](https://orion-search.org/) (work in progress), an open-source knowledge discovery and research measurement tool. If you have any questions or would like to learn more about it, you can find me on [twitter](https://twitter.com/kstathou) or send me an email at [email protected] """)

    st.subheader("Appendix: Data & methods")
    st.write(""" I collected all of the publications from [Microsoft Academic Graph](https://www.microsoft.com/en-us/research/project/academic-knowledge/) that were published between 2000 and 2020 and were presented at the ACL. I fetched 8,724 publications. To create the 2D visualisation, I encoded the paper titles to dense vectors using a [sentence-DistilBERT](https://github.com/UKPLab/sentence-transformers) model. That produced a 768-dimensional vector for each paper which I projected to a 2D space with [UMAP](https://umap-learn.readthedocs.io/en/latest/). For the paper title search engine, I indexed the vectors with [Faiss](https://github.com/facebookresearch/faiss/tree/master/python). """)
# Finish and show the Plotly figure `his`, which is created before this
# fragment.
his.update_yaxes(title="Listings")
st.plotly_chart(his)

# --- Scatter plot of price against a user-chosen column --------------------
st.header("Scatter Visualization of Price")
st.markdown('In addition, we want to learn how the number of bedrooms, ratings and capacity affect the price.')
# Columns offered for the x axis.
column = ["neighbourhood_cleansed","bedrooms","beds","review_scores_rating", "accommodates"]
x_axis = st.selectbox('X Axis',column)
# Columns offered for the colour channel.
color_list = ["room_type", "neighbourhood_cleansed", "bedrooms", "review_scores_rating"]
scatter_color = st.selectbox('Color', color_list)
# `filtered_listing` is defined outside this fragment; price is always on y.
scatter = alt.Chart(filtered_listing).mark_point().encode(
    alt.X(x_axis),
    alt.Y("price"),
    alt.Color(scatter_color)
)
st.write(scatter)

# --- Inputs for the price estimate ------------------------------------------
st.header("Price Estimation")
st.markdown('Finally, before we actually book an order, let\'s estimate how much we are going to pay.')
st.write("Please provide the room type, location and the number of bedrooms")
est_list = ["room_type", "neighbourhood_cleansed", "bedrooms"]
# Each select box lists the distinct values found in `df_listing`.
est_room_type = st.selectbox('Room Type', df_listing['room_type'].unique())
est_neigh = st.selectbox('Location', df_listing['neighbourhood_cleansed'].unique())
est_bed = st.selectbox('Number of Bedroom', df_listing['bedrooms'].unique())
# kNN algorithm, take average for tie-break
# Sidebar Controls st.sidebar.header('Filter Data:') year = st.sidebar.slider('Year', 1970, 1980, (1970, 1980)) origin = st.sidebar.multiselect('Origin', ['Europe', 'Japan', 'USA'], ['Europe', 'Japan', 'USA']) # Filter data by sidebar inputs: cars = df[(df['Year'].dt.year.between(year[0], year[1])) & (df['Origin'].isin(origin))] cars # Summary of selected data chart = alt.Chart(cars).mark_bar().encode( x='count()', y='Origin', color='Origin').properties( width=300, height=200) | alt.Chart(cars).mark_bar().encode( alt.X("year(Year):N"), y='count()', color='Origin').properties( width=300, height=200) chart st.markdown('## Projection of cars') projcars = cars.dropna().reset_index(drop=True) features = st.multiselect('Features to project:', [ 'Weight_in_lbs', 'Horsepower', 'Miles_per_Gallon', 'Displacement', 'Cylinders', 'Acceleration' ], ['Weight_in_lbs', 'Horsepower', 'Miles_per_Gallon']) method_name = st.selectbox('Projection method:', ('PCA', 'MDS', 'TSNE')) projData = projcars.drop(projcars.columns.difference(features), axis=1)
def plot_mds(
    self,
    rank="auto",
    metric="braycurtis",
    method="pcoa",
    title=None,
    xlabel=None,
    ylabel=None,
    color=None,
    size=None,
    tooltip=None,
    return_chart=False,
    label=None,
):
    """Plot beta diversity distance matrix using multidimensional scaling (MDS).

    Parameters
    ----------
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.
    metric : {'braycurtis', 'manhattan', 'jaccard', 'unifrac', 'unweighted_unifrac'}, optional
        Function to use when calculating the distance between two samples.
    method : {'pcoa', 'smacof'}
        Algorithm to use for ordination. PCoA uses eigenvalue decomposition and is not well
        suited to non-euclidean distance functions. SMACOF is an iterative optimization strategy
        that can be used as an alternative.
    title : `string`, optional
        Text label at the top of the plot.
    xlabel : `string`, optional
        Text label along the horizontal axis.
    ylabel : `string`, optional
        Text label along the vertical axis.
    size : `string` or `tuple`, optional
        A string or a tuple containing strings representing metadata fields. The size of points
        in the resulting plot will change based on the metadata associated with each sample.
    color : `string` or `tuple`, optional
        A string or a tuple containing strings representing metadata fields. The color of points
        in the resulting plot will change based on the metadata associated with each sample.
    tooltip : `string` or `list`, optional
        A string or list containing strings representing metadata fields. When a point in the
        plot is hovered over, the value of the metadata associated with that sample will be
        displayed in a modal. A list passed here is copied, never modified.
    return_chart : `bool`, optional
        When True, return the chart instead of displaying it. The returned chart has not had
        ``.interactive()`` applied; the displayed one has.
    label : `string` or `callable`, optional
        A metadata field (or function) used to label each analysis. If passing a function, a
        dict containing the metadata for each analysis is passed as the first and only
        positional argument. The callable function must return a string.

    Returns
    -------
    The Altair chart when ``return_chart`` is True, otherwise ``None``.

    Raises
    ------
    OneCodexException
        If fewer than two results are available, or ``method`` is not 'pcoa' or 'smacof'.

    Examples
    --------
    Scatter plot of weighted UniFrac distance between all our samples, using counts at the genus
    level.

    >>> plot_mds(rank='genus', metric='unifrac')

    Notes
    -----
    **For `smacof`**: The values reported on the axis labels are Pearson's correlations between
    the distances between points on each axis alone, and the corresponding distances in the
    distance matrix calculated using the user-specified metric. These values are related to the
    effectiveness of the MDS algorithm in placing points on the scatter plot in such a way that
    they truly represent the calculated distances. They do not reflect how well the distance
    metric captures similarities between the underlying data (in this case, an OTU table).
    """
    import altair as alt
    import numpy as np
    import pandas as pd
    from scipy.spatial.distance import squareform
    from scipy.stats import pearsonr
    from skbio.stats import ordination
    from sklearn import manifold
    from sklearn.metrics.pairwise import euclidean_distances

    if len(self._results) < 2:
        raise OneCodexException(
            "`plot_mds` requires 2 or more valid classification results.")

    dists = self._compute_distance(rank, metric).to_data_frame()

    # here we figure out what to put in the tooltips and get the appropriate data.
    # a list supplied by the caller is copied first: the inserts below used to
    # modify the caller's own list on every call.
    if tooltip:
        tooltip = list(tooltip) if isinstance(tooltip, list) else [tooltip]
    else:
        tooltip = []

    tooltip.insert(0, "Label")

    if color and color not in tooltip:
        tooltip.insert(1, color)

    if size and size not in tooltip:
        tooltip.insert(2, size)

    magic_metadata, magic_fields = self._metadata_fetch(tooltip, label=label)

    if method == "smacof":
        # adapted from https://scikit-learn.org/stable/auto_examples/manifold/plot_mds.html
        x_field = "MDS1"
        y_field = "MDS2"

        # fixed seed so the same distances always give the same layout
        seed = np.random.RandomState(seed=3)
        mds = manifold.MDS(max_iter=3000,
                           eps=1e-12,
                           random_state=seed,
                           dissimilarity="precomputed",
                           n_jobs=1)
        pos = mds.fit(dists).embedding_
        plot_data = pd.DataFrame(pos,
                                 columns=[x_field, y_field],
                                 index=dists.index)
        # scale each axis by its largest absolute value, i.e. into [-1, 1]
        plot_data = plot_data.div(plot_data.abs().max(axis=0), axis=1)

        # determine how much of the original distance is captured by each of the axes after MDS.
        # this implementation of MDS does not use eigen decomposition and so there's no simple
        # way of returning a 'percent of variance explained' value
        # NOTE(review): each pass zeroes column `axis`, so r_squared[0] is computed from the
        # *other* column's distances, yet it labels the x axis - confirm this pairing is intended.
        r_squared = []

        for axis in [0, 1]:
            mds_dist = pos.copy()
            mds_dist[::, axis] = 0
            mds_dist = squareform(euclidean_distances(mds_dist).round(6))
            r_squared.append(pearsonr(mds_dist, squareform(dists))[0])

        # label the axes
        x_extra_label = "r² = %.02f" % (r_squared[0], )
        y_extra_label = "r² = %.02f" % (r_squared[1], )
    elif method == "pcoa":
        # suppress eigenvalue warning from skbio--not because it's an invalid warning, but
        # because lots of folks in the field run pcoa on these distances functions, even if
        # statistically inappropriate. perhaps this will change if we ever become more
        # opinionated about the analyses that we allow our users to do (roo)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # round to avoid float precision errors
            ord_result = ordination.pcoa(dists.round(6))

        # get first two components
        plot_data = ord_result.samples.iloc[:, [0, 1]]
        # scale each axis by its largest absolute value, i.e. into [-1, 1]
        plot_data = plot_data.div(plot_data.abs().max(axis=0), axis=1)
        plot_data.index = dists.index

        # name of first two components
        x_field, y_field = plot_data.columns.tolist()

        # positional access: the values are indexed by component name, so a
        # plain [0] relies on pandas' deprecated integer-position fallback
        explained = ord_result.proportion_explained
        x_extra_label = "%0.02f%%" % (explained.iloc[0] * 100, )
        y_extra_label = "%0.02f%%" % (explained.iloc[1] * 100, )
    else:
        raise OneCodexException("MDS method must be one of: smacof, pcoa")

    # label the axes
    if xlabel is None:
        xlabel = "{} ({})".format(x_field, x_extra_label)

    if ylabel is None:
        ylabel = "{} ({})".format(y_field, y_extra_label)

    plot_data = pd.concat([plot_data, magic_metadata], axis=1).reset_index()

    alt_kwargs = dict(
        x=alt.X(x_field, axis=alt.Axis(title=xlabel)),
        y=alt.Y(y_field, axis=alt.Axis(title=ylabel)),
        tooltip=[magic_fields[t] for t in tooltip],
        href="url:N",
        url="https://app.onecodex.com/classification/" +
        alt.datum.classification_id,
    )

    # only add these parameters if they are in use
    if color:
        alt_kwargs["color"] = magic_fields[color]

    if size:
        alt_kwargs["size"] = magic_fields[size]

    # "url" is a calculated field, not an encoding channel, so it is popped
    # out of the kwargs before they are passed to encode()
    chart = (alt.Chart(plot_data).transform_calculate(
        url=alt_kwargs.pop("url")).mark_circle().encode(**alt_kwargs))

    if title:
        chart = chart.properties(title=title)

    if return_chart:
        return chart
    else:
        chart.interactive().display()
# Environment ------------------------------------------------------------------
from pathlib import Path
import altair as alt
import pandas as pd

# Data I/O
data = pd.read_csv('summary.csv')

# define selection: clicking picks marks by their colour channel
click = alt.selection_multi(encodings=['color'])

# Hover read-out: the subject number first, then one formatted entry per
# framewise-displacement statistic.
fd_tooltips = [alt.Tooltip('sub:N', title='Subject No.')]
for fd_field, fd_format, fd_title in (
        ('fd-per:Q', '.2f', '>0.2 (%)'),
        ('fd-mean:Q', '.4f', 'Mean'),
        ('fd-std:Q', '.4f', 'SD'),
        ('fd-max:Q', '.4f', 'Max'),
        ('fd-min:Q', '.4f', 'Min'),
):
    fd_tooltips.append(alt.Tooltip(fd_field, format=fd_format, title=fd_title))

# One colour per task; the legend is hidden.
task_color = alt.Color(
    'task:N',
    scale=alt.Scale(range=['#E7DECD', '#B9314F', '#312F2F']),
    legend=None,
)

# Mean framewise displacement per subject, restricted to the clicked task(s).
scatter_fd = alt.Chart(data).mark_circle(size=250).encode(
    x=alt.X('sub:Q', title=''),
    y=alt.Y('fd-mean:Q', title='Mean Framewise Displacement'),
    color=task_color,
    tooltip=fd_tooltips,
).properties(
    width=700,
    height=400,
).transform_filter(
    click
).interactive()
def plot_weather_data(obs_df, col_name, time_basis):
    """
    Visualizes the weather station observations including air temperature,
    atmospheric pressure, wind speed, and wind direction changing over time.

    Parameters
    ----------
    obs_df : pandas.DataFrame
        A dataframe that contains a time series of weather station
        observations. It must have a ``datetime`` column and the column
        named by ``col_name``. Rows containing any missing value are
        ignored.
    col_name : str
        Variable to plot: one of 'air_temp', 'atm_press', 'wind_spd',
        'wind_dir'.
    time_basis : str
        'monthly' to plot monthly means or 'daily' to plot daily means.

    Returns
    -------
    altair.Chart
        A line chart of the chosen variable, averaged on the chosen basis.

    Raises
    ------
    AssertionError
        If an argument has the wrong type or value, or if fewer than three
        rows remain after dropping missing values or after resampling.

    Examples
    --------
    >>> plot_weather_data(obs_df, col_name="air_temp", time_basis="monthly")
    """
    # Human-readable name per supported variable; used for both the chart
    # title and the y-axis title.
    labels = {
        "air_temp": "Air Temperature",
        "atm_press": "Atmospheric Pressure",
        "wind_spd": "Wind Speed",
        "wind_dir": "Wind Direction",
    }

    # Test input types (asserts are kept so callers that expect
    # AssertionError keep working)
    assert isinstance(
        obs_df, pd.DataFrame
    ), "Weather data should be a Pandas DataFrame."
    assert isinstance(col_name, str), "Variable name must be entered as a string"
    assert isinstance(time_basis, str), "Time basis must be entered as a string"

    # Test edge cases
    assert (
        col_name in labels
    ), "Variable can only be one of air_temp, atm_press, wind_spd or wind_dir"
    assert time_basis in (
        "monthly",
        "daily",
    ), "Time basis can only be monthly or daily"

    df = obs_df.dropna()
    assert (
        len(df.index) > 2
    ), "Dataset is not sufficient to visualize"  # Test edge cases

    # Positional access: after dropna() the row labelled 0 may be gone, so a
    # plain [0] label lookup would raise KeyError.
    year = df.datetime.dt.year.iloc[0]

    monthly = time_basis == "monthly"
    df = (
        df.set_index("datetime")
        .resample("M" if monthly else "D")
        .mean()
        .reset_index()
    )
    assert (
        len(df.index) > 2
    ), "Dataset is not sufficient to visualize"  # Test edge cases

    # The two time bases differ only in how the x axis is encoded.
    if monthly:
        x_channel = alt.X(
            "month(datetime)",
            title="Month",
            axis=alt.Axis(labelAngle=-30),
        )
    else:
        x_channel = alt.X(
            "datetime", title="Date", axis=alt.Axis(labelAngle=-30)
        )

    line = (
        alt.Chart(df, title=labels[col_name] + " for " + str(year))
        .mark_line(color="orange")
        .encode(
            x_channel,
            alt.Y(
                col_name,
                title=labels[col_name],
                scale=alt.Scale(zero=False),
            ),
            alt.Tooltip(col_name),
        )
    )

    chart = (
        line.properties(width=500, height=350)
        .configure_axis(labelFontSize=15, titleFontSize=20, grid=False)
        .configure_title(fontSize=25)
    )
    return chart
def main():
    """Render the COVID-19 hackathon dashboard.

    A sidebar select box chooses one of four views ('Dados Gerais', 'Idade',
    'Sexo', 'Sintomas') over the module-level ``dados_covid`` frame. The
    tables and charts of the chosen view are drawn, and the team credits are
    then written to the sidebar.
    """
    st.sidebar.title('HACKATHON 1')
    st.sidebar.subheader('Análise dados COVID-19')

    view_names = ('Dados Gerais', 'Idade', 'Sexo', 'Sintomas')
    filtro_coluna = st.sidebar.selectbox('Selecione o filtro', view_names)

    # The view names are mutually exclusive, so a single chain is enough;
    # an empty/None selection matches no branch.
    if filtro_coluna == 'Dados Gerais':
        st.write(dados_covid.head(1000))
        show_counts = st.checkbox("Mostrar colunas")
        if show_counts:
            st.write(dados_covid.count())

    elif filtro_coluna == 'Sexo':
        st.subheader('Sexo por resultado do teste')
        sex_palette = alt.Scale(range=["#EA98D2", "#659CCA"])
        sex_bars = alt.Chart(dados_covid, width=200).mark_bar()
        sex_bars = sex_bars.encode(
            x=alt.X('sexo:O', axis=alt.Axis(title='')),
            y=alt.Y('count():Q'),
            column=alt.Column('resultadoTeste:O'),
            color=alt.Color('sexo:N', scale=sex_palette),
            tooltip='count()',
        )
        st.altair_chart(sex_bars.interactive())

    elif filtro_coluna == 'Idade':
        st.markdown('Describe da coluna Idade')
        st.write(dados_covid.idade.describe())
        st.subheader('Idade por resultado teste')

        # Histogram of ages, one column per test result, ages 0-120 only.
        age_bars = alt.Chart(dados_covid, width=200).mark_bar()
        age_bars = age_bars.encode(
            x=alt.X('idade', bin=alt.Bin(maxbins=20)),
            y=alt.Y('count():Q'),
            column=alt.Column('resultadoTeste:O'),
            color='resultadoTeste',
            tooltip=['idade', 'count()'],
        )
        age_bars = age_bars.transform_filter(
            alt.FieldRangePredicate(field='idade', range=[0, 120])
        )
        st.altair_chart(age_bars.interactive(), use_container_width=True)

        # Line chart of counts per age, ages 0-100 only.
        age_line = alt.Chart(dados_covid, width=600).mark_line()
        age_line = age_line.encode(
            x='idade',
            y='count():Q',
            color='resultadoTeste:O',
        )
        age_line = age_line.transform_filter(
            alt.FieldRangePredicate(field='idade', range=[0, 100])
        )
        st.altair_chart(age_line)

    elif filtro_coluna == 'Sintomas':
        st.subheader('Sintomas relatados')
        # Every column from the fifth onwards is used as a symptom column.
        symptom_cols = dados_covid.columns[4:]

        totals = dados_covid[symptom_cols].sum().reset_index()
        totals.columns = ['sintoma', 'count']
        st.write(totals)

        totals_chart = alt.Chart(totals, width=700).mark_bar()
        totals_chart = totals_chart.encode(
            x='sintoma',
            y='count',
            tooltip=['count'],
        )
        totals_chart = totals_chart.interactive()
        st.write("\n\n")
        st.altair_chart(totals_chart)

        st.subheader('Sintomas por resultado do teste')
        by_result = dados_covid.groupby(['resultadoTeste'])[symptom_cols].sum()
        st.write(by_result)
        st.write("\n\n\n")
        st.bar_chart(by_result)

        chosen = st.multiselect(
            "Selecione combinação de sintomas apresentados",
            symptom_cols.tolist(),
            default=["Febre"],
        )
        df_select = countsintomas(chosen, symptom_cols)
        # Bare expression kept on purpose: presumably rendered through
        # Streamlit's "magic" display of stand-alone values (TODO confirm
        # magic is enabled for this app).
        df_select
        st.write(df_select.groupby(['resultadoTeste'])[symptom_cols].sum())
        st.markdown(
            '->Número de pessoas que aprensentaram apenas os sintomas selecionados'
        )

    # Team credits, one sidebar entry per member.
    st.sidebar.subheader('Grupo 1')
    team_members = (
        'Daniel Santos Pereira',
        'Fernando Henrique De Brito Borges',
        'Gláucio Ribeiro Santos',
        'Rafael Rodrigues dos Santos',
    )
    for member in team_members:
        st.sidebar.markdown(member)
def horizon_selector(
    base: alt.Chart,
    horizon_selection_brush: alt.MultiSelection,
    belief_horizon_unit: str,
    intuitive_forecast_horizon: bool,
    unique_belief_horizons,
) -> alt.LayerChart:
    """Build the strip chart used to pick a belief (or forecast) horizon.

    Two layers are derived from ``base``: vertical rules placed at each
    ``belief_horizon`` value, and circles positioned at half of the rules'
    ``constant`` y value. ``horizon_selection_brush`` and the module-level
    ``horizon_hover_brush`` are attached to the rule layer and drive the
    colour condition.

    :param base: chart both layers are derived from.
    :param horizon_selection_brush: selection added to the rule layer and
        used (together with the hover brush) in the colour condition.
    :param belief_horizon_unit: text appended to the horizon value in the
        tooltip string.
    :param intuitive_forecast_horizon: if true, title and tooltip say
        "forecast horizon"; otherwise "belief horizon".
    :param unique_belief_horizons: indexable sequence whose first and last
        items become the x-axis domain (presumably sorted ascending; confirm
        against the caller).
    :returns: the rule layer (with both selections added) layered with the
        circle layer.
    """
    horizon_kind = (
        "forecast horizon" if intuitive_forecast_horizon else "belief horizon"
    )
    domain_start = unique_belief_horizons[0]
    domain_end = unique_belief_horizons[-1]

    # Apply brush before calculating accuracy metrics for the selected events
    # on the fly.
    rules = base.mark_rule(orient="vertical")
    rules = rules.transform_filter(time_selection_brush)
    rules = rules.transform_calculate(
        constant=1 + alt.datum.event_start - alt.datum.event_start
    )
    rules = rules.transform_calculate(
        belief_horizon_str='datum.belief_horizon + " %s"' % belief_horizon_unit
    )

    # Trick to be able to apply the selection filter for event_start
    # (event_start must be a field in one of the encoding channels).
    opacity_channel = alt.condition(
        time_selection_brush,
        alt.Opacity(
            "event_start:T",
            scale=alt.Scale(domain=(0.9999, 1)),
            legend=None,
        ),
        alt.value(0),
    )
    x_channel = alt.X(
        "belief_horizon:Q",
        axis=alt.Axis(labelFlush=False),
        scale=alt.Scale(zero=False, domain=(domain_start, domain_end)),
        title="",
    )
    y_channel = alt.Y(
        "constant:Q",
        title=" ",
        axis=alt.Axis(values=[], domain=False, ticks=False),
    )
    color_channel = alt.condition(
        horizon_selection_brush | horizon_hover_brush,
        alt.ColorValue("#c21431"),
        alt.ColorValue(idle_color),
    )
    hover_tooltip = alt.Tooltip(
        "belief_horizon_str:N",
        title="Click to select %s" % horizon_kind,
    )

    rule_layer = rules.encode(
        opacity=opacity_channel,
        x=x_channel,
        y=y_channel,
        color=color_channel,
        size=alt.value(1),
        tooltip=[hover_tooltip],
    )
    rule_layer = rule_layer.properties(
        height=30,
        title="Select %s" % horizon_kind,
    )
    rule_layer = rule_layer.transform_filter(time_selection_brush)

    # The circle layer reuses everything from the rule layer and only moves
    # the mark to half of the constant y value and enlarges it.
    circle_layer = rule_layer.mark_circle()
    circle_layer = circle_layer.transform_calculate(
        half_constant=alt.datum.constant / 2
    )
    circle_layer = circle_layer.encode(
        y=alt.Y("half_constant:Q", title="", axis=alt.Axis(values=[])),
        size=alt.value(100),
    )

    selectable_rules = rule_layer.add_selection(
        horizon_selection_brush, horizon_hover_brush
    )
    return selectable_rules + circle_layer
"""
Error Bars showing Confidence Interval
======================================
This example shows how to draw error bars using confidence intervals.
The confidence intervals are computed internally in vega by a non-parametric
[bootstrap of the mean](https://github.com/vega/vega-statistics/blob/master/src/bootstrapCI.js).
"""
import altair as alt
from vega_datasets import data

barley = data.barley()

# Both layers are drawn from the same data, with one row per barley variety.
base = alt.Chart(barley)

# Black filled point at the mean yield of each variety.
points = base.mark_point(filled=True).encode(
    x=alt.X(
        'mean(yield)',
        scale=alt.Scale(zero=False),
        axis=alt.Axis(title='Barley Yield'),
    ),
    y=alt.Y('variety'),
    color=alt.value('black'),
)

# Horizontal rule spanning the ci0..ci1 aggregates of yield for each variety.
error_bars = base.mark_rule().encode(
    alt.X('ci0(yield)'),
    alt.X2('ci1(yield)'),
    alt.Y('variety'),
)

points + error_bars