def hist(self, bins=None, stacked=None, orientation="vertical", **kwargs):
    data = self._preprocess_data(with_index=False)
    if isinstance(bins, int):
        bins = alt.Bin(maxbins=bins)
    elif bins is None:
        bins = True
    if orientation == "vertical":
        Indep, Dep = alt.X, alt.Y
    elif orientation == "horizontal":
        Indep, Dep = alt.Y, alt.X
    else:
        raise ValueError("orientation must be 'horizontal' or 'vertical'.")
    mark = self._get_mark_def({"type": "bar", "orient": orientation}, kwargs)
    chart = (
        alt.Chart(data, mark=mark)
        .transform_fold(list(data.columns), as_=["column", "value"])
        .encode(
            Indep("value:Q", title=None, bin=bins),
            Dep("count()", title="Frequency", stack=stacked),
            color="column:N",
        )
    )
    if kwargs.get("subplots"):
        nrows, ncols = _get_layout(data.shape[1], kwargs.get("layout", (-1, 1)))
        chart = chart.encode(
            facet=alt.Facet("column:N", title=None)).properties(columns=ncols)
    return chart
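# For reference, a minimal standalone sketch of the fold-and-facet histogram
# pattern the method above builds. The DataFrame and bin settings here are
# illustrative assumptions, not part of the backend.
import altair as alt
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(100, 2), columns=["a", "b"])
hist_sketch = (
    alt.Chart(df, mark=alt.MarkDef(type="bar", orient="vertical"))
    .transform_fold(list(df.columns), as_=["column", "value"])
    .encode(
        alt.X("value:Q", title=None, bin=alt.Bin(maxbins=20)),
        alt.Y("count()", title="Frequency"),
        color="column:N",
        facet=alt.Facet("column:N", title=None),
    )
    .properties(columns=1)
)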
def altPlotNewDeaths(df: pd.DataFrame, save_chart=False):
    dChart = alt.Chart(
        df[df['denominazione_regione'] != 'Molise']).mark_line().encode(
            alt.X('data:T', title=None),
            alt.Y('rolling_mean:Q', title=None),
            color=alt.Color('denominazione_regione:N',
                            legend=None,
                            scale=alt.Scale(scheme='dark2')),
            facet=alt.Facet('denominazione_regione:N', columns=4, title=None),
            tooltip=[alt.Tooltip('nuovi_decessi:Q', title='Nuovi decessi')]
        ).properties(
            width=160, height=90, title='Nuovi decessi'
        ).configure_view(strokeWidth=0).configure_axis(grid=False).configure_title(
            color='gray',
            fontSize=24,
        ).configure_line(size=4).transform_window(
            rolling_mean='mean(nuovi_decessi)',
            frame=[-1, 1],
            groupby=['denominazione_regione'])
    if save_chart:
        dChart.save('newDeaths.png', scale_factor=2.0)
    return dChart
def visualise_tsne(tsne_df, save=True, fig_num=15):
    """Visualise tsne plot"""
    tsne_base = alt.Chart(tsne_df).encode(
        x=alt.X("x:Q", title="", axis=alt.Axis(ticks=False, labels=False)),
        y=alt.Y("y:Q", title="", axis=alt.Axis(ticks=False, labels=False)),
    )
    tsne_points = (
        tsne_base.mark_point(
            filled=True, opacity=0.5, stroke="black", strokeOpacity=0.5
        ).encode(
            color=alt.Color("org_type", title="Organisation type"),
            # Use the StrokeWidth channel class; alt.Stroke here would map
            # the field onto the stroke-colour channel instead.
            strokeWidth=alt.StrokeWidth("top",
                                        scale=alt.Scale(range=[0, 1]),
                                        legend=None),
            size=alt.Size("activity:Q", title="Number of papers"),
            facet=alt.Facet("size", columns=2,
                            title="Number of organisations in plot"),
            tooltip=["index"],
        )
        .interactive()
        .resolve_scale(y="independent", x="independent")
        .properties(width=250, height=250)
    )
    if save is True:
        save_altair(tsne_points, "fig_15_tsne", driv)
    return tsne_points
def altPlotNewICU(df: pd.DataFrame, save_chart=False):
    tiChart = alt.Chart(df).mark_line().encode(
        alt.X('data:T', title=None),
        alt.Y('rolling_mean:Q', title=None),
        color=alt.Color('denominazione_regione:N',
                        legend=None,
                        scale=alt.Scale(scheme='dark2')),
        facet=alt.Facet('denominazione_regione:N', columns=4, title=None),
        tooltip=[alt.Tooltip('ingressi_terapia_intensiva:Q', title='Ingressi TI')]
    ).properties(
        width=160, height=90, title='Terapie intensive: nuovi ingressi'
    ).configure_view(strokeWidth=0).configure_axis(grid=False).configure_title(
        color='gray',
        fontSize=24,
    ).configure_line(size=4).transform_window(
        rolling_mean='mean(ingressi_terapia_intensiva)',
        frame=[-1, 1],
        groupby=['denominazione_regione'])
    if save_chart:
        tiChart.save('newTI.png', scale_factor=2.0)
    return tiChart
def plot_covariate_effects(self):
    """Plot covariate effects"""
    ce = (self.covariate_effects - 1) * 100
    cov_stats = pd.melt(self.covariate_statistics.reset_index(),
                        var_name='condition',
                        id_vars=['covariate'],
                        value_vars=['p5', 'p95', 'other'])
    cov_stats = cov_stats.replace({
        'p5': '5th', 'p95': '95th'
    }).set_index(['covariate', 'condition'])
    ce = ce.join(cov_stats, how='inner')
    # The left join reorders the index, pandas bug #34133
    ce = ce.reorder_levels(['parameter', 'covariate', 'condition'])
    param_names = list(ce.index.get_level_values('parameter').unique())
    plots = []
    for parameter in param_names:
        df = ce.xs(parameter, level=0)
        df = df.reset_index()
        error_bars = alt.Chart(df).mark_errorbar(ticks=True).encode(
            x=alt.X('p5:Q', title='Effect size in percent',
                    scale=alt.Scale(zero=False)),
            x2=alt.X2('p95:Q'),
            y=alt.Y('condition:N', title=None),
        )
        rule = alt.Chart(df).mark_rule(strokeDash=[10, 4], color='gray').encode(
            x=alt.X('xzero:Q')).transform_calculate(xzero="0")
        points = alt.Chart(df).mark_point(filled=True, color='black').encode(
            x=alt.X('mean:Q'),
            y=alt.Y('condition:N'),
        )
        text = alt.Chart(df).mark_text(dy=-15, color="red").encode(
            x=alt.X("mean:Q"),
            y=alt.Y("condition:N"),
            text=alt.Text("value:Q"))
        plot = alt.layer(
            error_bars, rule, points, text,
            data=df, width=800, height=100
        ).facet(
            columns=1,  # was 1.0; Vega-Lite expects an integer column count
            row=alt.Facet('covariate:N', title=None),
            title=f'{parameter}'
        ).resolve_scale(y='independent')
        plots.append(plot)
    v = alt.vconcat(*plots).resolve_scale(x='shared')
    return v
def display_the_plot():
    # Successively filter by the city types, councils and years selected in
    # the session (debug prints removed).
    df = pd.concat(esb_data[esb_data["Type"] == var1]
                   for var1 in session["selection_city"])
    df1 = pd.concat(df[df["Council"] == var2]
                    for var2 in session["selection_council"])
    df2 = pd.concat(df1[df1["Year"] == int(var3)]
                    for var3 in session["selection_year"])
    plot = alt.Chart(df2).mark_bar().encode(
        alt.X(
            'No_Of_Connections:Q',
            sort=alt.SortField(field='No_Of_Connections', order='ascending'),
            scale=alt.Scale(domain=(0, 1000)),
            axis=alt.Axis(title="Connection Count", tickCount=20),
        ),
        alt.Y('Month:O'),
        alt.Color('Month:N'),
        alt.Facet('Year:O'),
    ).properties(width=200)
    plot.save("templates/plot.html")
    return render_template("plot.html")
def make_chart_topic_spec(
    topic_rca,
    topic_mix,
    arxiv_cat_lookup,
    topic_thres=0.05,
    topic_n=150,
    save=False,
    fig_n="extra_1",
):
    """Visualises prevalence of topics in a category

    Args:
        topic_rca: relative specialisation of topics in categories
        topic_mix: topic mix per paper
        arxiv_cat_lookup: lookup between category ids and names
        topic_thres: threshold for topic
        topic_n: number of topics to consider
        save: if we want to save the figure
        fig_n: figure id
    """
    logging.info("Extracting topic counts")
    # Visualise topic distributions
    topic_counts_long = topic_rca.reset_index(drop=False).melt(id_vars="index")
    # Extract top topics
    top_topics = list(
        topic_mix.iloc[:, 1:]
        .applymap(lambda x: x > topic_thres)
        .sum(axis=0)
        .sort_values(ascending=False)[:topic_n]
        .index
    )
    # Focus on those for the long topic counts
    topic_counts_long_ = topic_counts_long.loc[
        topic_counts_long["variable"].isin(top_topics)
    ]
    # Add nice names for each category
    topic_counts_long_["arx_cat"] = [
        x.split(" ") for x in topic_counts_long_["index"].map(arxiv_cat_lookup)
    ]
    topic_spec = (
        alt.Chart(topic_counts_long_)
        .mark_bar(color="red")
        .encode(
            y=alt.Y("variable", sort=top_topics,
                    axis=alt.Axis(labels=False, ticks=False)),
            x="value",
            facet=alt.Facet("arx_cat", columns=5),
            tooltip=["variable", "value"],
        )
    ).properties(width=100, height=100)

    if save is True:
        save_altair(topic_spec, f"fig_{fig_n}_topic_specialisations", driv)

    return topic_spec
def visualize_line_facet(df):
    graph = alt.Chart(df).mark_line().encode(
        x='Year:T',
        y=alt.Y('£mn:Q', scale=alt.Scale(type='log', clamp=True)),
        color='Type',
        facet=alt.Facet('Region:O', columns=3),
    ).properties(width=175, height=150)
    st.write(graph)
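# A minimal sketch of driving visualize_line_facet from a Streamlit script;
# the long-form DataFrame below is an illustrative stand-in for the real input
# (one row per Year/Region/Type with a '£mn' value).
import pandas as pd

df = pd.DataFrame({
    'Year': pd.to_datetime(['2019', '2020', '2019', '2020']),
    '£mn': [10.0, 12.5, 8.0, 9.5],
    'Type': ['Exports', 'Exports', 'Imports', 'Imports'],
    'Region': ['London'] * 4,
})
visualize_line_facet(df)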
def plot_median_ci_width_static():
    data2 = pd.DataFrame(
        results.groupby(['methodName', 'timeSeriesLength',
                         'trueRho'])['ci_size'].median()).reset_index()
    # the base chart
    base = alt.Chart(data2).transform_calculate(
        x_jittered='0.05*random()*datum.timeSeriesLength+datum.timeSeriesLength',
        ymin="datum.confIntLow",
        ymax="datum.confIntHigh",
    )
    # A single legend-bound selection shared by both layers; the original
    # defined a second identical selection for the line layer, which created
    # two independent legend bindings.
    selector = alt.selection_single(fields=['methodName'], empty='all',
                                    bind='legend')
    opacity = alt.condition(selector, alt.value(1.0), alt.value(0.5))
    # generate the scatter points:
    points = base.mark_point(filled=True).add_selection(selector).encode(
        x=alt.X('x_jittered:Q', scale=alt.Scale(type='log'),
                title='Length of Timeseries'),
        y=alt.Y('ci_size:Q', scale=alt.Scale(type='log'),
                title='Median size of the CI'),
        size=alt.value(80),
        color=alt.condition(selector, col, alt.value('lightgrey')),
        opacity=opacity)
    # generate the connecting line, reusing the same selection:
    line = base.mark_line().encode(
        x=alt.X('x_jittered:Q'),
        y=alt.Y('ci_size:Q'),
        color=alt.condition(selector, col, alt.value('lightgrey')),
        opacity=opacity)
    chart = alt.layer(points, line).properties(
        width=250, height=200
    ).facet(
        facet=alt.Facet('trueRho:N', title='Autocorrelation parameter (ρ)'),
        columns=3)
    chart = chart.configure_header(titleColor='darkred', titleFontSize=16,
                                   labelColor='darkred', labelFontSize=14)
    chart = chart.configure_legend(
        strokeColor='gray',
        fillColor='#EEEEEE',
        padding=10,
        cornerRadius=10,
        orient='top')
    return chart
def scoring_confusion_matrix(
    data: pd.DataFrame,
    xvar: str,
    target_var: str,
    threshold: float = 0.5,
    bin_width: float = 0.1,
    width: int = 200,
    height: int = 200,
) -> alt.Chart:
    data = compute_confusion_categories(data, xvar, target_var, threshold)
    confusion_categories_with_counts = data[
        CONFUSION_CATEGORIES_COL_NAME].unique()
    binning = alt.Bin(step=bin_width)
    base = alt.Chart(
        data,
        width=width,
        height=height,
        usermeta={
            "embedOptions": {
                "scaleFactor": 5,
                "downloadFileName": "scoring_confusion_matrix",
            }
        },
    )
    # It is necessary to use transforms so that the faceted chart is sorted
    # as intended. More info: https://github.com/altair-viz/altair/issues/2303.
    hist = (base.mark_bar(tooltip=True).encode(
        x=alt.X(
            f"binned_{xvar}:Q",
            bin="binned",
            axis=alt.Axis(format="~", title="Score"),
        ),
        x2=f"binned_{xvar}_end:Q",
        y=alt.Y("y_count:Q", axis=alt.Axis(title="Count")),
        facet=alt.Facet(
            f"{CONFUSION_CATEGORIES_COL_NAME}:O",
            sort=confusion_categories_with_counts,
            title=None,
            columns=2,
        ),
    ).transform_bin(
        f"binned_{xvar}", xvar, bin=binning
    ).transform_joinaggregate(
        y_count="count()",
        groupby=[
            f"binned_{xvar}",
            f"binned_{xvar}_end",
            CONFUSION_CATEGORIES_COL_NAME,
        ],
    ))
    return hist.properties(
        title={
            "text": "Scoring confusion matrix",
            "subtitle": f"Threshold: {threshold}",
        })
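# Hypothetical usage, assuming `scores_df` holds a model-score column plus the
# binary target that compute_confusion_categories expects; the column names
# "score" and "label" are illustrative.
chart = scoring_confusion_matrix(
    scores_df, xvar="score", target_var="label", threshold=0.5, bin_width=0.05
)
chart.save("scoring_confusion_matrix.html")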
def boxplot(df: pd.DataFrame, xaxis: str, x_title: str, yaxis: str,
            y_title: str, color_col: str, color_title: str,
            facet_column: str, facet_title: str,
            title: str) -> alt.vegalite.v4.api.Chart:
    """Creates a boxplot based on the df, yaxis and title"""
    # Note: the annotation must be the class pd.DataFrame, not the call
    # pd.DataFrame(), which the original used.
    return alt.Chart(df).mark_boxplot().encode(
        x=alt.X(xaxis + ':O', title=x_title),
        y=alt.Y(yaxis, scale=alt.Scale(type="log"), title=y_title),
        color=alt.Color(color_col + ':O', title=color_title),
    ).facet(alt.Facet(facet_column, title=facet_title)).properties(
        title=title,
    ).interactive()
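# A minimal illustrative call; every column name and value below is made up.
import pandas as pd

df = pd.DataFrame({
    'batch': ['A', 'A', 'B', 'B'] * 5,
    'runtime': [1.2, 3.4, 0.8, 2.1] * 5,
    'variant': ['x', 'y'] * 10,
    'site': ['north', 'south'] * 10,
})
chart = boxplot(df, 'batch', 'Batch', 'runtime', 'Runtime (s, log scale)',
                'variant', 'Variant', 'site', 'Site', 'Runtime by batch')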
def non_ducs_per_formation(df, step, facet):
    df = sanitize_dataframe(df)
    chart = alt.Chart(df).mark_bar().encode(
        alt.Y('Days of Uncompleted Status',
              bin=alt.Bin(extent=[0, 500], step=step)),
        alt.X('count()', title='Number Wells'),
        facet=alt.Facet(facet, columns=3),
        color=alt.value('#e68805'),
        opacity=alt.value(0.7)).properties(width=200).interactive()
    return st.altair_chart(chart)
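# Sketch of wiring this into a Streamlit page; the data and the facet column
# are illustrative assumptions.
import pandas as pd
import streamlit as st

wells_df = pd.DataFrame({
    'Days of Uncompleted Status': [30, 120, 250, 400, 75, 180],
    'Formation': ['Eagle Ford', 'Eagle Ford', 'Bakken',
                  'Bakken', 'Permian', 'Permian'],
})
step = st.slider('Bin width (days)', min_value=10, max_value=100, value=50)
non_ducs_per_formation(wells_df, step, 'Formation')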
def plot_results_static():
    # the base chart
    base = alt.Chart(data).transform_calculate(
        x_jittered='0.15*(random()-0.5)*datum.timeSeriesLength+datum.timeSeriesLength',
        ymin="datum.confIntLow",
        ymax="datum.confIntHigh",
        goal='0.95')
    # generate the scatter points:
    points = base.mark_point(filled=True).encode(
        x=alt.X('x_jittered:Q', scale=alt.Scale(type='log'),
                title='Length of Timeseries'),
        y=alt.Y('rate:Q', scale=alt.Scale(domain=[0, 1.04]),
                title='Rate of correct SEM'),
        size=alt.value(80),
        color=col)
    # generate the connecting line:
    line = base.mark_line().encode(
        x=alt.X('x_jittered:Q'),
        y=alt.Y('rate:Q'),
        color=col)
    # generate the 95% mark:
    rule = base.mark_rule(color='black').encode(alt.Y('goal:Q'))
    errorbars = base.mark_rule(strokeWidth=3).encode(
        alt.X("x_jittered:Q"),
        alt.Y("ymin:Q", title=''),
        alt.Y2("ymax:Q"),
        color=col)
    chart = alt.layer(errorbars, points, line, rule).properties(
        width=250, height=200
    ).facet(
        facet=alt.Facet('trueRho:N', title='Autocorrelation parameter (ρ)'),
        columns=3)
    chart = chart.configure_header(titleColor='darkred', titleFontSize=16,
                                   labelColor='darkred', labelFontSize=14)
    chart = chart.configure_legend(
        strokeColor='gray',
        fillColor='#EEEEEE',
        padding=10,
        cornerRadius=10,
        orient='top')
    return chart
def make_chart(self, df):
    return alt.Chart(df).mark_area().encode(
        x=alt.X('yearmonthdate(bulletin_date):T',
                title=None,
                axis=alt.Axis(format='%d/%m')),
        y=alt.Y('new_confirmed_cases',
                title=None,
                scale=alt.Scale(type='linear'))
    ).properties(
        width=175, height=75
    ).facet(
        columns=3,
        facet=alt.Facet('Municipio', title=None)
    ).resolve_axis(x='independent')
def display_the_facetplot():
    facetplot = alt.Chart(tidy_df).mark_area().encode(
        x='Year:O',
        y=alt.Y('sum(esb):Q',
                title='ESB Connections',
                axis=alt.Axis(format='~s')),
        facet=alt.Facet('County Councils:O', columns=4),
        color='County Councils').properties(
            title='ESB Connections trend for each County',
        )
    facetplot.save("templates/facetplot.html")
    return render_template("facetplot.html")
def daily_reported_deaths(df, labels):
    hist = (alt.Chart(df, height=100, width=100).mark_bar().encode(
        x=alt.X("lag:O", title="Reporting Lag", sort=labels),
        y=alt.Y("sum(n_diff):Q", title="Reported Deaths"),
        color=alt.Color("day(publication_date):N",
                        title="Publication Day",
                        sort=["Mon"]),
    ))
    text = (alt.Chart(df).mark_text(align="right", x=95, y=28,
                                    fontSize=20).encode(
        alt.Text("sum(n_diff)"),
    ))
    chart = (hist + text).facet(
        facet=alt.Facet("publication_date:T", title="Reported Deaths per Day"),
        columns=7,
    )
    return chart
def get_peak_perf_bar_chart(csv_file) -> alt.vegalite.v4.api.Chart:
    """
    Method that creates a simple grouped bar chart from the csv file.

    Parameters
    ----------
    csv_file: str
        csv file from which the bar chart will be created.

    Returns
    -------
    alt.vegalite.v4.api.Chart
        Simple, grouped bar chart.
    """
    df = pd.read_csv(csv_file)
    df = df.drop(0)
    df = pd.melt(df,
                 id_vars=['Hardware Platforms'],
                 value_vars=[
                     'INT2', 'INT4', 'INT8', 'FP16', 'FP32', 'Memory Bandwidth'
                 ],
                 var_name='Datatypes and MB')
    df = df.dropna()
    bars = alt.Chart().mark_bar().encode(
        x=alt.X('Datatypes and MB:O', title=''),
        y=alt.Y('value:Q',
                scale=alt.Scale(type='log'),
                title='Peak Performance [TOP/sec] and MB [GBps]'),
        color='Datatypes and MB:N',
    )
    text = bars.mark_text(
        dy=-5  # Nudges text upwards so it doesn't appear on top of the bar
    ).encode(text='value:Q')
    return alt.layer(bars, text, data=df).facet(
        columns=5,
        facet=alt.Facet('Hardware Platforms:N', title='Hardware Platforms')
    ).properties(
        title='Peak Performance and Memory Bandwidth for All Hardware Platforms'
    )
def _xy(self, mark, x=None, y=None, stacked=False, subplots=False, **kwargs):
    data = self._preprocess_data(with_index=True)
    if x is None:
        x = data.columns[0]
    else:
        x = _valid_column(x)
        assert x in data.columns
    if y is None:
        y_values = list(data.columns[1:])
    else:
        y = _valid_column(y)
        assert y in data.columns
        y_values = [y]
    chart = (
        alt.Chart(data, mark=self._get_mark_def(mark, kwargs))
        .transform_fold(y_values, as_=["column", "value"])
        .encode(
            x=x,
            y=alt.Y("value:Q", title=None, stack=stacked),
            color=alt.Color("column:N", title=None),
            tooltip=[x] + y_values,
        )
        .interactive()
    )
    if subplots:
        nrows, ncols = _get_layout(len(y_values), kwargs.get("layout", (-1, 1)))
        chart = chart.encode(
            facet=alt.Facet("column:N", title=None)).properties(columns=ncols)
    return chart
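# Public plot kinds are usually thin wrappers over a generic helper like _xy;
# a minimal sketch under that assumption (these wrapper methods are
# illustrative, not the library's confirmed API).
def line(self, x=None, y=None, **kwargs):
    # Delegate to the generic x/y builder with a line mark.
    return self._xy("line", x=x, y=y, **kwargs)

def area(self, x=None, y=None, stacked=True, **kwargs):
    # Areas default to stacked, mirroring pandas' area plots.
    return self._xy("area", x=x, y=y, stacked=stacked, **kwargs)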
def html_detail_company():
    df = create_df_service_amount()
    df.loc[df.ttc_amount.isnull(), 'payment_status'] = 'Refused'
    df.loc[~df.ttc_amount.isnull(), 'payment_status'] = 'Accepted'
    df.label_services = df.label_services.str.title()
    df.compagnie = df.compagnie.str.title()
    chart = alt.Chart(df).mark_bar().encode(
        y=alt.Y('label_services', title='Types prestations'),
        x=alt.X('count(label_services)', title='Nombre de contrats'),
        color=alt.Color('payment_status', title='Status paiement'),
        tooltip=[
            alt.Tooltip('label_services', title='Prestation'),
            alt.Tooltip('max(ht_amount)', title='Max amount'),
            alt.Tooltip('min(ht_amount)', title='Min amount'),
            alt.Tooltip('count()', title='Devis count')
        ]).facet(
            facet=alt.Facet('compagnie:N', title='Compagnies'),
            columns=2,
        )
    chart.save("app/templates/plot/detail_company.html")
def make_cat_trend(linech, save=True, fig_n=2):
    """Makes chart 2"""
    ai_subtrends_chart = (
        alt.Chart(linech)
        .transform_window(
            roll="mean(n)", frame=[-10, 0], groupby=["category_clean", "type"]
        )
        .mark_line()
        .encode(
            x=alt.X("index:T", title=""),
            # was y=alt.X(...), which would have clobbered the x channel
            y=alt.Y("roll:Q", title="Number of papers"),
            color=alt.Color("type", title="Source"),
        )
        .properties(width=200, height=100)
    ).facet(alt.Facet("category_clean", title="Category"), columns=2)

    if save is True:
        save_altair(ai_subtrends_chart, f"fig_{fig_n}_ai_subtrends", driver=driv)

    return ai_subtrends_chart
def make_chart(self, df):
    sort_order = ['Confirmados', 'Probables', 'Muertes']
    bars = alt.Chart(df).mark_bar().encode(
        x=alt.X('value', title="Rezago estimado (días)"),
        y=alt.Y('variable', title=None, sort=sort_order, axis=None),
        color=alt.Color('variable', sort=sort_order,
                        legend=alt.Legend(orient='bottom', title=None)),
        tooltip=[
            'variable', 'bulletin_date',
            alt.Tooltip(field='value', type='quantitative', format=".1f")
        ])
    text = bars.mark_text(align='right', baseline='middle', size=12,
                          dx=-5).encode(
        text=alt.Text('value:Q', format='.1f'),
        color=alt.value('white'))
    return (bars + text).properties(width=300).facet(
        columns=2,
        facet=alt.Facet("bulletin_date", sort="descending",
                        title="Fecha del boletín"))
def altPosRate(df: pd.DataFrame):
    prChart = alt.Chart(df).mark_line().encode(
        alt.X('data:T', title=None),
        alt.Y('positivity_rate:Q', axis=alt.Axis(format='%'), title=None),
        color=alt.Color('denominazione_regione:N',
                        legend=None,
                        scale=alt.Scale(scheme='dark2')),
        facet=alt.Facet('denominazione_regione:N', columns=4, title=None),
        tooltip=[
            alt.Tooltip('positivity_rate:Q', title='Tasso positivi al tampone')
        ]).properties(
            width=160, height=90, title='Tasso di positivi al tampone'
        ).configure_view(strokeWidth=0).configure_axis(grid=False).configure_title(
            color='gray',
            fontSize=24,
        ).configure_line(size=4)
    # .transform_window(
    #     rolling_mean='mean(positivity_rate)',
    #     frame=[-1, 1],
    #     groupby=['denominazione_regione']
    # )
    return prChart
embeddings_to_compare = compute_tsne_2d_components(embedding_dict)
t2dm_embeddings = concat_embeddings(embeddings_to_compare, t2dm_concept_list, 't2dm')
breast_cancer_embeddings = concat_embeddings(embeddings_to_compare, breast_cancer,
                                             'breast_cancer')
union_embeddings = pd.concat([t2dm_embeddings, breast_cancer_embeddings], axis=0)
columns = union_embeddings.columns.to_list() + ['concept_name', 'domain_id']
union_embeddings = union_embeddings.merge(concept,
                                          left_on='standard_concept_id',
                                          right_on='concept_id')[columns]

alt.data_transformers.disable_max_rows()

alt.Chart(union_embeddings, title='embeddings').mark_point().encode(
    x='tsne-2d-one:Q',
    y='tsne-2d-two:Q',
    color='phenotype',
    facet=alt.Facet('name:O', columns=2),
    tooltip=['concept_name']
).interactive()

# ### Measure the average cosine distances for breast cancer
cumc_visit_pairwise_dist = EuclideanDistance(
    name='cumc_visit',
    path=get_pairwise_euclidean_distance_output(
        create_file_path(cumc_embeddings_folder, 'visit')))
ccae_visit_pairwise_dist = EuclideanDistance(
    name='ccae_visit',
    path=get_pairwise_euclidean_distance_output(
        create_file_path(ccae_embeddings_folder, 'visit')))
pd.concat([cumc_visit_pairwise_dist.compute_average_dist(breast_cancer),
           cumc_visit_pairwise_dist.compute_random_average_dist(205),
           ccae_visit_pairwise_dist.compute_average_dist(breast_cancer),
           ccae_visit_pairwise_dist.compute_random_average_dist(392)],
          axis=1)

cumc_visit_pairwise_dist = EuclideanDistance(
    name='cumc_visit',
    path=get_pairwise_cosine_similarity_output(
        create_file_path(cumc_embeddings_folder, 'visit')))
""" Anscombe's Quartet ------------------ This example shows how to use the column channel to make a trellis plot. Anscombe's Quartet is a famous dataset constructed by Francis Anscombe. Common summary statistics are identical for each subset of the data, despite the subsets having vastly different characteristics. """ # category: case studies import altair as alt from vega_datasets import data source = data.anscombe() alt.Chart(source).mark_circle().encode( alt.X('X', scale=alt.Scale(zero=False)), alt.Y('Y', scale=alt.Scale(zero=False)), alt.Facet('Series', columns=2), ).properties( width=180, height=180, )
ac_layer = (ac_uncertainty + ac_best + ac_points).properties(
    title=f"acetate secretion = {yield_params['acetate']['slope']:0.2f} "
          f"± {yield_params['acetate']['err']:0.2f} mM / OD"
)
save(glu_layer | ac_layer, './output/2021-04-04_REL606_glucose_turnover.pdf')
save(glu_layer | ac_layer, './output/2021-04-04_REL606_glucose_turnover.png')

# %%
points = alt.Chart(samp_data, width=350, height=300).mark_point(size=80).encode(
    x=alt.X('od_600nm:Q', title='optical density [a.u.]'),
    y=alt.Y('conc_mM:Q', title='concentration [mM]'),
    color=alt.Color('replicate:N', title='biological replicate'),
    facet=alt.Facet('compound:N', header=alt.Header(labelFontSize=15)))

fit = alt.Chart(fit_df, width=350, height=300).mark_line(
    color=colors['black']).encode(
    x=alt.X('od_600nm:Q', title='optical density [a.u.]'),
    y=alt.Y('conc_mM:Q', title='concentration [mM]'),
    facet=alt.Facet('compound:N', header=alt.Header(labelFontSize=15)))

points + fit

# %%
# Load the calibration data
def make_chart_topic_comparison(
    topic_mix,
    arxiv_cat_lookup,
    comparison_ids,
    selected_categories,
    comparison_names,
    topic_list,
    topic_category_map,
    highlights=False,
    highlight_topics=None,
    highlight_class_table="Company",
    save=True,
    fig_num=15,
):
    """Creates a chart that compares the topic specialisations of different
    groups of organisations

    Args:
        topic_mix: topic mix
        arxiv_cat_lookup: lookup between category ids and names
        comparison_ids: ids we want to compare
        selected_categories: arXiv categories to focus on
        comparison_names: names for the categories we are comparing
        topic_list: list of topics to consider
        topic_category_map: lookup from topic to category
        highlights: if we want to highlight particular topics
        highlight_topics: which ones
        highlight_class_table: topics to highlight in the table
        save: if we want to save the figure
        fig_num: figure id
    """
    # Extract the representations of categories
    comp_topic_rel = pd.DataFrame([
        topic_rep(
            ids,
            topic_mix,
            selected_categories,
            topic_list=topic_list,
            topic_category_map=topic_category_map,
        )[1].loc[True] for ids in comparison_ids
    ])
    comparison_df = comp_topic_rel.T
    comparison_df.columns = comparison_names
    comparison_df_long = comparison_df.reset_index(drop=False).melt(
        id_vars="index")
    comparison_df_long["cat"] = comparison_df_long["index"].map(
        topic_category_map)
    order = (comparison_df_long.groupby(
        ["index", "cat"])["value"].sum().reset_index(drop=False).sort_values(
            by=["cat", "value"], ascending=[True, False])["index"].tolist())
    comparison_df_filter = comparison_df_long.loc[
        comparison_df_long["cat"].isin(selected_categories)]
    comparison_df_filter["cat_clean"] = [
        arxiv_cat_lookup[x][:35] + "..." for x in comparison_df_filter["cat"]
    ]
    # Sort categories by biggest differences
    diff_comp = (comparison_df_filter.pivot_table(
        index=["index", "cat_clean"], columns="variable",
        values="value").assign(
            diff=lambda x: x["company"] - x["academia"]).reset_index(
                drop=False).groupby("cat_clean")["diff"].mean().sort_values(
                    ascending=False).index.tolist())
    # Plot
    comp_ch = (alt.Chart(comparison_df_filter).mark_point(
        filled=True, opacity=0.5, stroke="black", strokeWidth=0.5).encode(
            x=alt.X("index",
                    title="",
                    sort=order,
                    axis=alt.Axis(labels=False, ticks=False)),
            y=alt.Y("value", title=["Share of papers", "with topic"]),
            color=alt.Color("variable", title="Organisation type"),
            tooltip=["index"],
        ))
    comp_lines = (alt.Chart(comparison_df_filter).mark_line(
        strokeWidth=1, strokeDash=[1, 1], stroke="grey").encode(
            x=alt.X("index", sort=order,
                    axis=alt.Axis(labels=False, ticks=False)),
            y="value",
            detail="index",
        ))
    if highlights is False:
        topic_comp_type = ((comp_ch + comp_lines).properties(
            width=200, height=150).facet(
                alt.Facet("cat_clean", sort=diff_comp, title="arXiv category"),
                columns=3,
            ).resolve_scale(x="independent"))
        if save is True:
            save_altair(topic_comp_type, f"fig_{fig_num}_topic_comp", driv)
        return topic_comp_type
    else:
        # Lookup for the selected topics
        code_topic_lookup = {
            v: str(n + 1) for n, v in enumerate(highlight_topics)
        }
        # Add a label per topic for the selected topics
        comparison_df_filter["code"] = [
            code_topic_lookup[x] if x in code_topic_lookup.keys() else "no_label"
            for x in comparison_df_filter["index"]
        ]
        # Place each label at the topic's maximum value
        # (TODO: need to find a way to remove the bottom one)
        max_val = comparison_df_filter.groupby("index")["value"].max().to_dict()
        comparison_df_filter["max"] = comparison_df_filter["index"].map(max_val)
        comp_text = (alt.Chart(comparison_df_filter).transform_filter(
            alt.datum.code != "no_label").mark_text(
                yOffset=-10, color="red").encode(
                    text=alt.Text("code"),
                    x=alt.X("index", sort=order,
                            axis=alt.Axis(labels=False, ticks=False)),
                    y=alt.Y("max", title=""),
                    detail="index",
                ))
        topic_comp_type = ((comp_ch + comp_lines + comp_text).properties(
            width=200, height=150).facet(
                alt.Facet("cat_clean", sort=diff_comp, title="arXiv category"),
                columns=3,
            ).resolve_scale(x="independent"))
        if save is True:
            save_altair(topic_comp_type, "fig_9_topic_comp", driv)
            save_highlights_table(
                comparison_df_filter,
                highlight_topics,
                highlight_class_table,
                topic_category_map,
            )
        return topic_comp_type, comparison_df_filter
""" US Population: Wrapped Facet ============================ This chart visualizes the age distribution of the US population over time, using a wrapped faceting of the data by decade. """ # category: case studies import altair as alt from vega_datasets import data source = data.population.url alt.Chart(source).mark_area().encode( x='age:O', y=alt.Y('sum(people):Q', title='Population', axis=alt.Axis(format='~s')), facet=alt.Facet('year:O', columns=5), ).properties(title='US Age Distribution By Year', width=90, height=80)
)
hline = alt.Chart().mark_rule(size=1, strokeDash=[10, 10]).encode(y=alt.Y('a:Q'))
ch = alt.layer(
    hline,
    errorbars,
    lines,
    points,
    data=results_df
).transform_calculate(a="0.003").properties(
    width=350,
    height=250,
).facet(
    facet=alt.Facet(
        'Fingerprint',
        header=alt.Header(labelFontSize=15),
    ),
    columns=2
).configure_header(
    titleFontSize=15,
)
ch.save('../../figures/trainingSetSize.html')
def lambda_handler(event, context):
    # Get the secret
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])

    s3 = boto3.client('s3')

    messages = []
    # Download the most recently updated PDF file
    for change in event:
        tmp = tempfile.NamedTemporaryFile(suffix='.pdf')
        with open(tmp.name, 'wb') as fp:
            s3.download_fileobj(secret['bucketname'], change['keyname'], fp)
        # Get the date range covered by the report
        text = textract.process(tmp.name, method='pdfminer').decode('utf-8')
        regex = re.compile(r'(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})\s+\–+\s+(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})')
        start_date = None
        end_date = None
        for line in text.split('\n'):
            m = regex.search(line)
            if m:
                start_date = pandas.to_datetime(
                    '%s %s %s' % (m.group(1), m.group(2), m.group(3)),
                    format='%d %B %Y').date()
                end_date = pandas.to_datetime(
                    '%s %s %s' % (m.group(4), m.group(5), m.group(6)),
                    format='%d %B %Y').date()
                break
        if start_date is None:
            logging.error('Unable to find start date in report')
            return {
                "statusCode": 404,
                "body": 'Unable to find start date in report %s' % change['url'],
            }
        # Get the tables from the report - note that it was not possible to get
        # data from 4th April or earlier due to tables that will not parse
        # properly in the PDF
        tables = tabula.read_pdf(tmp.name, pages="all", multiple_tables=True)
        tablecount = 0
        dataset = pandas.DataFrame()
        for df in tables:
            if 'Total' not in df.columns:
                # Promote the first data row to the header where tabula
                # failed to detect it
                firstrow = df.iloc[0]
                newcols = []
                for i in range(len(firstrow)):
                    if isinstance(firstrow[i], float) and math.isnan(firstrow[i]):
                        newcols.append(df.columns[i])
                    else:
                        newcols.append(firstrow[i])
                df.columns = newcols
                df = df[1:]
            df['Setting'] = df['Setting'].str.strip()
            df.dropna(axis='index', subset=['Total', 'Open', 'Closed'], inplace=True)
            df['Total'] = df['Total'].astype(int)
            df['Open'] = df['Open'].astype(int)
            df['Closed'] = df['Closed'].astype(int)
            df = df[df['Setting'] != 'Total']
            if tablecount == 0:
                df['Type'] = 'Probable Outbreak'
            elif tablecount == 1:
                df['Type'] = 'Cluster'
            else:
                logging.warning('Unexpected table: %s' % df)
            tablecount += 1
            dataset = pandas.concat([dataset, df])
        dataset['Start Date'] = pandas.to_datetime(start_date)
        dataset['End Date'] = pandas.to_datetime(end_date)
        week = int((end_date - pandas.to_datetime('1 January 2020', format='%d %B %Y').date()).days / 7)
        dataset['Week'] = week
        # Create a simple summary and the tweet text
        summary = dataset.groupby('Type').sum()
        tweet = 'NI Contact Tracing reports from %s to %s:\n' % (
            start_date.strftime('%-d %B %Y'), end_date.strftime('%-d %B %Y'))
        for Type, data in summary.to_dict('index').items():
            tweet += '\u2022 %d %ss (%d open, %d closed)\n' % (
                data['Total'], Type.lower(), data['Open'], data['Closed'])
        tweet += '\n%s' % change['url']
        # Pull current data from s3
        try:
            obj = s3.get_object(Bucket=secret['bucketname'],
                                Key=secret['pha-clusters-datastore'])['Body']
        except s3.exceptions.NoSuchKey:
            print("The object %s does not exist in bucket %s." % (
                secret['pha-clusters-datastore'], secret['bucketname']))
            datastore = pandas.DataFrame(columns=['Week'])
        else:
            stream = io.BytesIO(obj.read())
            datastore = pandas.read_csv(stream)
        # Clean out any data with matching dates
        datastore = datastore[datastore['Week'] != week]
        # Append the new data
        datastore = pandas.concat([datastore, dataset])
        datastore['Start Date'] = pandas.to_datetime(datastore['Start Date'])
        datastore['End Date'] = pandas.to_datetime(datastore['End Date'])
        # Replace any known duplicates
        datastore['Setting'] = datastore['Setting'].replace({
            'Cinema/ Theatre / Entertainment': 'Cinema / Theatre / Entertainment Venue',
            'Cinema/ Theatre / Entertainment Venue': 'Cinema / Theatre / Entertainment Venue',
            'Funeral / Wakes': 'Funeral / Wake',
            'Restaurant / Cafe': 'Restaurant / Café'
        })
        # Push the data to s3
        stream = io.BytesIO()
        datastore.to_csv(stream, index=False)
        stream.seek(0)
        s3.upload_fileobj(stream, secret['bucketname'], secret['pha-clusters-datastore'])
        # Set up chromedriver so we can save altair plots
        driver = get_chrome_driver()
        plots = []
        if driver is None:
            logging.error('Failed to start chrome')
        else:
            p = altair.vconcat(
                altair.Chart(
                    dataset
                ).mark_bar().encode(
                    x=altair.X('Total:Q', axis=altair.Axis(title='Total reported')),
                    y=altair.Y('Setting:O'),
                    color='Type',
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 Contact Tracing reports from %s to %s' % (
                        start_date.strftime('%-d %B %Y'),
                        end_date.strftime('%-d %B %Y'))
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                     'Covers the preceding four weeks',
                     'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-week-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
            p = altair.vconcat(
                altair.Chart(
                    datastore.groupby(['End Date', 'Type'])['Total'].sum().reset_index()
                ).mark_area().encode(
                    x=altair.X('End Date:T', axis=altair.Axis(title='Date reported (for preceding four weeks)')),
                    y=altair.Y('Total:Q', axis=altair.Axis(title='Total reported', orient="right")),
                    color='Type',
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 Contact Tracing reports from %s to %s' % (
                        datastore['Start Date'].min().strftime('%-d %B %Y'),
                        datastore['End Date'].max().strftime('%-d %B %Y'))
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                     'Reported weekly for the preceding four weeks',
                     'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-time-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
            p = altair.vconcat(
                altair.Chart(
                    datastore.groupby(['End Date', 'Setting', 'Type'])['Total'].sum().reset_index()
                ).mark_area().encode(
                    x=altair.X('End Date:T', axis=altair.Axis(title='')),
                    y=altair.Y('Total:Q', axis=altair.Axis(title='', orient="right")),
                    color='Type',
                    facet=altair.Facet('Setting:O', columns=5, title=None, spacing=0),
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=90,
                    width=160,
                    title=altair.TitleParams(
                        'NI COVID-19 Contact Tracing reports by setting from %s to %s' % (
                            datastore['Start Date'].min().strftime('%-d %B %Y'),
                            datastore['End Date'].max().strftime('%-d %B %Y')),
                        anchor='middle',
                    ),
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                     'Reported weekly for the preceding four weeks',
                     'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-small-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
        # Convert to dates to ensure correct output to CSV
        datastore['Start Date'] = datastore['Start Date'].dt.date
        datastore['End Date'] = datastore['End Date'].dt.date
        # Tweet out the text and images
        if change.get('notweet') is not True:
            api = TwitterAPI(secret['twitter_apikey'],
                             secret['twitter_apisecretkey'],
                             secret['twitter_accesstoken'],
                             secret['twitter_accesstokensecret'])
            upload_ids = api.upload_multiple(plots)
            if change.get('testtweet') is True:
                if len(upload_ids) > 0:
                    resp = api.dm(secret['twitter_dmaccount'], tweet, upload_ids[0])
                    if len(upload_ids) > 1:
                        resp = api.dm(secret['twitter_dmaccount'], 'Test 1', upload_ids[1])
                    if len(upload_ids) > 2:
                        resp = api.dm(secret['twitter_dmaccount'], 'Test 2', upload_ids[2])
                else:
                    resp = api.dm(secret['twitter_dmaccount'], tweet)
                messages.append('Tweeted DM ID %s' % (resp.id))
            else:
                if len(upload_ids) > 0:
                    resp = api.tweet(tweet, media_ids=upload_ids)
                else:
                    resp = api.tweet(tweet)
                # Download and update the index
                status = S3_scraper_index(s3, secret['bucketname'],
                                          secret['pha-clusters-index'])
                index = status.get_dict()
                for i in range(len(index)):
                    if index[i]['filedate'] == change['filedate']:
                        index[i]['tweet'] = resp.id
                        break
                status.put_dict(index)
                messages.append('Tweeted ID %s and updated %s' % (
                    resp.id, secret['pha-clusters-index']))
        else:
            print(tweet)
            messages.append('Did not tweet')

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
""" US Income by State: Wrapped Facet --------------------------------- This example shows how to create a map of income in the US by state, faceted over income brackets """ # category: maps import altair as alt from vega_datasets import data states = alt.topo_feature(data.us_10m.url, 'states') source = data.income.url alt.Chart(source).mark_geoshape().encode( shape='geo:G', color='pct:Q', tooltip=['name:N', 'pct:Q'], facet=alt.Facet('group:N', columns=2), ).transform_lookup(lookup='id', from_=alt.LookupData(data=states, key='id'), as_='geo').properties( width=300, height=175, ).project(type='albersUsa')