def determine_plot_title(
    metric_plot_component: MetricPlotComponent,
    domain_plot_component: DomainPlotComponent,
) -> alt.TitleParams:
    """Determines the appropriate title for a chart based on input components.

    Conditionally renders a subtitle if relevant (specifically with column domain).

    Args:
        metric_plot_component: Plot utility corresponding to a given metric.
        domain_plot_component: Plot utility corresponding to a given domain.

    Returns:
        An Altair TitleParams object.
    """
    contents: str = f"{metric_plot_component.title} per {domain_plot_component.title}"
    subtitle: Optional[str] = domain_plot_component.subtitle

    title: alt.TitleParams
    if subtitle:
        title = alt.TitleParams(contents, subtitle=[subtitle])
    else:
        title = alt.TitleParams(contents)

    return title
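# A hedged usage sketch for determine_plot_title: the real MetricPlotComponent
# and DomainPlotComponent classes are not shown in this snippet, so
# SimpleNamespace objects stand in for anything exposing .title (and, for the
# domain component, .subtitle).
from types import SimpleNamespace

import altair as alt
import pandas as pd

metric_component = SimpleNamespace(title="table_row_count")
domain_component = SimpleNamespace(title="column", subtitle="my_column")

chart_title = determine_plot_title(metric_component, domain_component)
chart = alt.Chart(pd.DataFrame({"batch": [1, 2], "value": [3, 4]})).mark_line().encode(
    x="batch", y="value").properties(title=chart_title)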
def plot_points_average_and_trend(configs, title, footer): return altair.concat( altair.vconcat(*[points_average_and_trend( **c) for c in configs]).resolve_scale(x='shared').properties( title=altair.TitleParams( footer, baseline='bottom', orient='bottom', anchor='end', fontWeight='normal', fontSize=10, dy=10), )).properties(title=altair.TitleParams( title, anchor='middle', ))
def make_etc_coverage_heatmap(etc_coverage, mag_order=None, module_order=None):
    num_mags_in_frame = len(set(etc_coverage['genome']))
    charts = list()
    for etc_complex, frame in etc_coverage.groupby('complex'):
        # y-axis labels and ticks are hidden on every panel; genomes are
        # identified through the tooltip instead
        c = alt.Chart(frame, title=etc_complex).encode(
            x=alt.X('module_name', title=None,
                    axis=alt.Axis(labelLimit=0, labelAngle=90),
                    sort=module_order),
            y=alt.Y('genome', axis=alt.Axis(title=None, labels=False, ticks=False),
                    sort=mag_order),
            tooltip=[
                alt.Tooltip('genome', title='Genome'),
                alt.Tooltip('module_name', title='Module Name'),
                alt.Tooltip('path_length', title='Module Subunits'),
                alt.Tooltip('path_length_coverage', title='Subunits present'),
                alt.Tooltip('genes', title='Genes present'),
                alt.Tooltip('missing_genes', title='Genes missing')
            ]).mark_rect().encode(
                color=alt.Color('percent_coverage',
                                legend=alt.Legend(title='% Complete'),
                                scale=alt.Scale(domain=(0, 1)))).properties(
                    width=HEATMAP_CELL_WIDTH * len(set(frame['module_name'])),
                    height=HEATMAP_CELL_HEIGHT * num_mags_in_frame)
        charts.append(c)
    concat_title = alt.TitleParams('ETC Complexes', anchor='middle')
    return alt.hconcat(*charts, spacing=5, title=concat_title)
def plot_heatmap(df, ptitle, x, y, xtitle="", ytitle="", annot_fmt=".3f", ptitle_offset=-5): if ptitle != "": ptitle = alt.TitleParams(ptitle, offset=ptitle_offset) base = (alt.Chart(df, title=ptitle).mark_rect().encode( x=alt.X(x, title=xtitle), y=alt.Y(y, title=ytitle), )) heatmap = base.mark_rect(stroke="white", strokeWidth=2).encode(color=alt.Color( "value:Q", scale=alt.Scale(type="log", scheme="yelloworangered"), legend=None, ), ) text = base.mark_text(baseline="middle").encode( text=alt.Text("value:Q", format=annot_fmt), color=alt.condition( alt.datum.value > df["value"].mean(), alt.value("white"), alt.value("black"), ), ) crchart = heatmap + text return crchart
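# A hedged usage sketch for plot_heatmap: it expects a long-format frame with
# the two axis columns passed as x/y plus a numeric "value" column, which must
# be strictly positive because the colour scale is logarithmic.
import altair as alt
import pandas as pd

toy = pd.DataFrame({
    "row": ["a", "a", "b", "b"],
    "col": ["x", "y", "x", "y"],
    "value": [0.1, 1.0, 10.0, 100.0],
})
heat = plot_heatmap(toy, ptitle="Example heatmap", x="row", y="col")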
def plot_density(qual, winetype, xcol): if qual in [0, 1, 2]: subset = wine.loc[(wine["Quality Factor Numeric"] == qual) & (wine["Wine"].isin(winetype))] else: subset = wine.loc[wine["Wine"].isin(winetype)] chart = alt.Chart(subset).transform_density( density=xcol, groupby=['Wine', 'Quality Factor'], as_=['value', 'density'], steps=200, # bandwidth=5 ).mark_area(opacity=0.5).encode( alt.X('value:Q', title=xcol, axis=alt.Axis(labels=True, grid=True)), alt.Y('density:Q', title=None, axis=alt.Axis(labels=False, grid=False, ticks=False)), alt.Color( "Wine", scale=alt.Scale(domain=['red', 'white'], range=['darkred', 'blue']))).properties( height=300, width=1000, title=alt.TitleParams( text='Wine Quality Factor Distributions', align='left', fontSize=14)).configure_view(stroke=None) return chart.to_html()
def plot_rate_bw(rate, bw, yinnerdomain, yclientdomain, use_legend,
                 xdomain=None, rate_height=60, bw_height=40):
    rate_chart = alt.Chart(rate).mark_line(color='#5e6472', clip=True).encode(
        x=alt.X('time', axis=alt.Axis(title='', labels=False),
                scale=alt.Scale(nice=False, domain=xdomain)),
        y=alt.Y('success_rate:Q', axis=alt.Axis(title=['Success', 'rate (s⁻¹)'])),
    ).properties(height=rate_height)
    client_comm = bw.apply(lambda row: 'Client' in row['variable'], axis=1)
    bw_inner_chart = alt.Chart(bw[~client_comm]).mark_line(clip=True).encode(
        x=alt.X("time", axis=alt.Axis(title='', labels=False),
                scale=alt.Scale(nice=False,
                                domain=([0, bw['time'].max()] if xdomain is None else xdomain))),
        y=alt.Y("value", axis=alt.Axis(title=''),
                scale=alt.Scale(domain=yinnerdomain)),
        color=alt.Color('variable',
                        legend=(alt.Legend(title=['Traffic', 'Direction'])
                                if use_legend else None)),
        strokeDash=alt.StrokeDash('variable', legend=None)).properties(height=bw_height)
    bw_client_chart = alt.Chart(bw[client_comm]).mark_line(clip=True).encode(
        x=alt.X("time", axis=alt.Axis(title='Timestamp (s)'),
                scale=alt.Scale(nice=False,
                                domain=([0, bw['time'].max()] if xdomain is None else xdomain))),
        y=alt.Y("value", axis=alt.Axis(title=''),
                scale=alt.Scale(domain=yclientdomain)),
        color=alt.Color('variable',
                        legend=(alt.Legend(title='') if use_legend else None)),
        strokeDash=alt.StrokeDash('variable', legend=None)).properties(height=bw_height)
    upper = rate_chart
    lower = alt.vconcat(bw_inner_chart, bw_client_chart,
                        title=alt.TitleParams('Bandwidth (MB/s)', orient='left',
                                              anchor='middle', dx=15)).resolve_scale(
        color='shared', strokeDash='independent')
    return alt.vconcat(upper, lower).resolve_scale(x=alt.ResolveMode('shared'))
def plot_factor(regressor, grouper, query_string): """Function to create a scatter plot showing the association of obesity rate vs other factors Function to create a scatter plot showing the association of obesity rate vs other factors and grouped by different aggregators as selected by the user through the different dropdown filters. Args: regressor ([str]): the regressor to be used in the scatter plot grouper ([str]): the attribute to be used for grouping the data in the scatter plot query_string ([str]): string containing the attributes to be used in a pandas query for filtering the data for the bar plot Returns: [altair chart]: An altair scatter plot showing the association of obesity rate vs other factors """ label_dict = { "primedu": "Primary Education Completion Rate", "smoke": "Smoking Rate", "unemployed": "Unemployment Rate", "income": "Income Group", "sex": "Sex", "region": "Region", } title_label = "Obesity Rate vs " + label_dict[regressor] sub_label = "" if grouper == "none" else "by " + label_dict[grouper] temp = he.make_rate_data(["country", grouper], ["primedu", "smoke", "unemployed", "obese"], query_string) chart = (alt.Chart( temp, title=alt.TitleParams( text=title_label, subtitle=sub_label)).mark_circle(opacity=0.25).encode( x=alt.X( regressor, type="quantitative", title=label_dict[regressor], axis=alt.Axis(format="%", grid=False), ), y=alt.Y("obese", title="Obesity Rate", axis=alt.Axis(format="%", grid=False)), color=alt.Color(grouper, type="nominal", title="Legend"), tooltip=[ alt.Tooltip("country:N", title="Country"), alt.Tooltip(grouper, title="Grouping Variable"), alt.Tooltip("obese:Q", format=".1%", title="Obesity Rate"), ], ).properties(width=450, height=150).interactive()) factor_chart = chart return factor_chart
def source_selector(source) -> alt.Chart: return (alt.Chart(source).mark_square(size=50, opacity=0.3).encode( y=alt.Y( "source:N", axis=alt.Axis(orient="right", domain=False, ticks=False), title=None, ), color=source_color_or(idle_color), ).add_selection(source_selection_brush).properties( title=alt.TitleParams("Select model", anchor="start")))
def source_vs_hour_chart( base: alt.Chart, sensor_unit: str, max_absolute_error: float, faceted: bool = False ) -> Union[alt.Chart, alt.FacetChart]: hd_chart = ( base.mark_rect() .transform_joinaggregate( on_the_fly_mae="mean(mae)", on_the_fly_reference="mean(reference_value)", groupby=["event_start", "source"], ) .transform_calculate(accuracy=alt.datum.on_the_fly_mae) .encode( x=alt.X( "event_start:O", timeUnit="hours", axis=alt.Axis(domain=False, ticks=False, labelAngle=0), scale=alt.Scale(domain=list(range(24))), title="Hour of day", # "UTC hour of day" ), color=alt.condition( selectors.time_selection_brush, alt.Color( "accuracy:Q", scale=alt.Scale( domain=(max_absolute_error, 0), scheme="redyellowgreen" ), title="Error", ), alt.value(selectors.idle_color), ), tooltip=[ alt.Tooltip("event_start:T", timeUnit="hours", title="Hour of day"), alt.Tooltip( "accuracy:Q", title="Mean absolute error (%s)" % sensor_unit, format=".2f", ), ], ) ) if faceted: hd_chart = hd_chart.facet( row=alt.Row("source:O", title=None, header=alt.Header(labelAngle=0)) ) else: hd_chart = hd_chart.encode( y=alt.Y( "source:O", axis=alt.Axis(domain=False, ticks=False, labelAngle=0, labelPadding=5), title=None, ) ) return hd_chart.properties( title=alt.TitleParams("Model performance given a time of day", anchor="middle") )
def __create_ranking_bar_chart(self, data, type): chart = (alt.Chart( data, title=alt.TitleParams(text="Top 30 Countries", subtitle="By " + type + " Cases"), width=130).mark_bar().encode( x=alt.X("Cases", title=" ", axis=alt.Axis(labels=False)), y=alt.Y("Country_Region", sort="-x", title=" "), color=alt.Color("Cases", scale=alt.Scale(scheme="orangered")), tooltip=alt.Tooltip(["Cases:Q"], format=",.0f"), ).configure_axis(grid=False).configure_title( anchor="start").configure_legend(orient="bottom")) return chart.to_html()
def get_group_chart(grid_df, min_value: float, max_value: float, title: str = ''): rcharts = list() for gv in group_values: bar_df = grid_df[grid_df[group_var] == gv] rcharts.append( get_bar_chart(bar_df, min_value, max_value)) return alt.hconcat(*rcharts, title=alt.TitleParams(title, anchor='middle', align='center', orient='top'))
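# get_group_chart leans on module-level group_var / group_values and a
# get_bar_chart helper that are defined elsewhere; the stand-ins below are
# assumptions, sketched only to show the shapes the function expects.
import altair as alt
import pandas as pd

group_var = "group"
group_values = ["a", "b"]

def get_bar_chart(bar_df, min_value, max_value):
    # Hypothetical helper: one bar panel with a shared y-domain.
    return alt.Chart(bar_df).mark_bar().encode(
        x=alt.X("x:N", title=None),
        y=alt.Y("y:Q", scale=alt.Scale(domain=[min_value, max_value])),
    )

grid_df = pd.DataFrame({
    "group": ["a", "a", "b", "b"],
    "x": ["u", "v", "u", "v"],
    "y": [1.0, 2.0, 3.0, 4.0],
})
grid_chart = get_group_chart(grid_df, 0.0, 5.0, title="Example grid")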
def make_functional_heatmap(functional_df, mag_order=None): # build heatmaps charts = list() for i, (group, frame) in enumerate(functional_df.groupby('category', sort=False)): # set variables for chart function_order = get_ordered_uniques(list(frame.function_name)) num_mags_in_frame = len(set(frame['genome'])) chart_width = HEATMAP_CELL_WIDTH * len(function_order) chart_height = HEATMAP_CELL_HEIGHT * num_mags_in_frame # if this is the first chart then make y-ticks otherwise none if i == 0: y = alt.Y('genome', title=None, sort=mag_order, axis=alt.Axis( labelLimit=0, labelExpr="replace(datum.label, /_\d*$/gi, '')")) else: y = alt.Y('genome', axis=alt.Axis(title=None, labels=False, ticks=False), sort=mag_order) # set up colors for chart rect_colors = alt.Color('present', legend=alt.Legend(title="Function is Present", symbolType='square', values=[True, False]), sort=[True, False], scale=alt.Scale(range=['#2ca25f', '#e5f5f9'])) # define chart # TODO: Figure out how to angle title to take up less space c = alt.Chart(frame, title=alt.TitleParams(group)).encode( x=alt.X('function_name', title=None, axis=alt.Axis(labelLimit=0, labelAngle=90), sort=function_order), tooltip=[ alt.Tooltip('genome', title='Genome'), alt.Tooltip('category', title='Category'), alt.Tooltip('subcategory', title='Subcategory'), alt.Tooltip('function_ids', title='Function IDs'), alt.Tooltip('function_name', title='Function'), alt.Tooltip('long_function_name', title='Description'), alt.Tooltip('gene_symbol', title='Gene Symbol') ]).mark_rect().encode(y=y, color=rect_colors).properties( width=chart_width, height=chart_height) charts.append(c) # merge and return function_heatmap = alt.hconcat(*charts, spacing=5) return function_heatmap
def bar_chart( data: pd.DataFrame, xvar: str = "category", yvar: str = "value", yscale: str = "linear", w: int = 400, h: int = 400, bar_color: str = "#F3852A", title: str = "Bar Chart", partial_title: bool = True, xvar_complement: Optional[str] = None, ) -> alt.Chart: base = alt.Chart( data, width=w, height=h, title=alt.TitleParams( f"{yscale.capitalize()} {title}" if partial_title else title, anchor="start"), ) tooltip_list = [ alt.Tooltip(f"{xvar}:N", title=xvar.capitalize()), alt.Tooltip(f"{yvar}:Q", format=",", title=yvar.capitalize()), ] if xvar_complement: tooltip_list.append( alt.Tooltip(f"{xvar_complement}:N", title=xvar_complement.capitalize())) bar = base.mark_bar(color=bar_color).encode( x=alt.X(f"{xvar}:N", axis=alt.Axis(title=xvar.capitalize())), y=alt.Y( f"{yvar}:Q", scale=alt.Scale(type=yscale) if yscale == "log" else alt.Scale( type=yscale, domain=[0, log10_ceiling(data[yvar].max())]), axis=alt.Axis( title=yvar.capitalize(), titleAngle=0, titleAlign="left", titleY=-5, titleX=0, ), ), tooltip=tooltip_list, ) return bar
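# A hedged usage sketch for bar_chart. The linear-scale branch calls an
# external log10_ceiling helper that is not shown here, so this example
# sticks to yscale="log" (which avoids that helper entirely).
import pandas as pd

toy_counts = pd.DataFrame({
    "category": ["a", "b", "c"],
    "value": [10, 100, 1000],
})
log_bar = bar_chart(toy_counts, yscale="log", title="Counts by Category")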
def scoring_quadrant( data: pd.DataFrame, xvar: str, bin_width: float, width: int, height: int, title: Optional[str] = None, xtitle: Optional[str] = None, ytitle: Optional[str] = None, ) -> alt.Chart: binning = alt.Bin(step=bin_width) base = alt.Chart( data, width=width, height=height, title=alt.TitleParams(title, style="guide-label", dy=-5 if ytitle is None else 0), ) hist = (base.mark_bar(tooltip=True).encode( x=alt.X( f"binned_{xvar}:Q", bin="binned", axis=alt.Axis(format="~", title=["Score", xtitle]) if xtitle is not None else no_axis(), ), x2=f"binned_{xvar}_end:Q", y=alt.Y( "y_count:Q", axis=alt.Axis( title=["Count", ytitle]) if ytitle is not None else no_axis(), ), ).transform_bin(f"binned_{xvar}", xvar, bin=binning).transform_joinaggregate( y_count=f"count()", groupby=[ f"binned_{xvar}", f"binned_{xvar}_end", CONFUSION_CATEGORIES_COL_NAME, ], )) return hist
def plot_bar(query_string, year):
    """Function to create an altair bar plot of the top 10 countries

    Function to create an altair chart of the top 10 countries ordered based
    on obesity rate and disaggregated as per the user inputs received through
    the app dropdown filters.

    Args:
        query_string ([str]): string containing the attributes to be used
        in a pandas query for filtering the data for the bar plot
        year ([float]): year

    Returns:
        [altair chart]: An altair bar plot of the top 10 countries
    """
    n = 10
    title_label = "Top " + str(n) + " Countries"
    sub_label = str(year)
    temp = he.make_rate_data(["country"], ["obese"], query_string)
    ob_sorted = temp.sort_values("obese", ascending=False).head(n).reset_index()
    chart = (alt.Chart(ob_sorted,
                       title=alt.TitleParams(text=title_label,
                                             subtitle=sub_label)).mark_bar().encode(
        x=alt.X(
            "obese",
            type="quantitative",
            title="Obesity Rate",
            scale=alt.Scale(domain=[0.1, 0.8]),
            axis=alt.Axis(format="%", grid=False),
        ),
        y=alt.Y("country", sort="-x", title=""),
        color="obese",
        tooltip=alt.Tooltip("obese:Q", format=".1%", title="Obesity Rate"),
    ).properties(width=450, height=150).interactive())
    return chart
def __create_world_timeseries_chart(self, case_type, ntype="New"):
    """Create a trend chart for global numbers

    Args:
        case_type (string): "confirmed", "recovered", "death"
        ntype (string): "Total" or "New"

    Returns:
        str: the global trend chart rendered as HTML
    """
    if case_type == "confirmed":
        chart_title = "Global Confirmed Cases"
        case_type = 1
    elif case_type == "death":
        chart_title = "Global Deaths"
        case_type = 2
    elif case_type == "recovered":
        chart_title = "Global Recovered Cases"
        case_type = 3
    else:
        raise ValueError('case_type must be "confirmed", "death" or "recovered"')
    if ntype == "Total":
        chart_title = chart_title + " Over Time"
    else:
        chart_title = "New " + chart_title + " Per Day"
    data = self.data_reader.get_timeserie_data_by_country("all", case_type)
    chart = (
        alt.Chart(
            data,
            title=alt.TitleParams(text=chart_title),
            height=200,
        )
        .mark_line()
        .transform_filter(alt.FieldEqualPredicate(field="type", equal=ntype))
        .encode(
            x=alt.X("date:T", title="", axis=alt.Axis(format=("%b %Y"))),
            y=alt.Y("count:Q", title=""),
        )
        .configure_axis(grid=False)
        .configure_title(anchor="start")
        .properties(width=735)
    )
    return chart.to_html()
def __create_timeserie_chart(self, country, case_type=1, ntype="Total"):
    data = self.data_reader.get_timeserie_data_by_country(country, case_type)
    if case_type == 1:
        chart_title = "Cases over time"
    elif case_type == 2:
        chart_title = "Deaths over time"
    else:
        # case_type 3 is "recovered" in the data reader's convention
        chart_title = "Recovered cases over time"
    chart = (
        alt.Chart(
            data,
            title=alt.TitleParams(text=chart_title, subtitle=country),
        ).mark_line().transform_filter(
            alt.FieldEqualPredicate(field="type", equal=ntype)).encode(
                x=alt.X("date:T", title=""),
                y=alt.Y("count:Q", title="")).configure_axis(
                    grid=False).configure_title(anchor="start").properties(
                        width=350, height=200)
        # .properties(width="container", height="container")
    )
    return chart.to_html()
def alt_plot_metric_based_threshold_tuning_plots(df, ptitle_offset=-5, legend_offset=5, figsize=(450, 300)): highlight = alt.selection(type="single", on="mouseover", fields=["metric"], nearest=True) base = alt.Chart( df, title=alt.TitleParams("Scoring Metrics, as threshold is changed", offset=ptitle_offset), ).encode( x=alt.X("threshold:Q", title="threshold"), y=alt.Y("value:Q", title=""), color=alt.Color("metric:N", legend=alt.Legend(offset=legend_offset, title="")), tooltip=[ alt.Tooltip("metric", title="Metric"), alt.Tooltip("threshold", title="Threshold", format=".2f"), alt.Tooltip("value", title="Value", format=".2f"), ], ) overlay = pd.DataFrame({"default": [0.5]}) rules = (alt.Chart(overlay).mark_rule().encode( x=alt.X("default:Q", title=""))) points_opacity = alt.value(0) points = (base.mark_circle().encode( opacity=points_opacity).add_selection(highlight)) lines = base.mark_line().encode( size=alt.condition(~highlight, alt.value(1.5), alt.value(3))) combo = (points + lines + rules).properties(width=figsize[0], height=figsize[1]) return combo
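# A hedged usage sketch: the function expects a long-format frame with one row
# per (metric, threshold) pair; the toy precision/recall curves below are
# illustrative only.
import numpy as np
import pandas as pd

thresholds = np.linspace(0.0, 1.0, 21)
toy_metrics = pd.DataFrame({
    "threshold": np.tile(thresholds, 2),
    "metric": ["precision"] * 21 + ["recall"] * 21,
    "value": np.concatenate([thresholds, 1.0 - thresholds]),
})
tuning_chart = alt_plot_metric_based_threshold_tuning_plots(toy_metrics)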
def make_viral_functional_heatmap(functional_df, vgf_order=None): # build heatmaps charts = list() for i, (group, frame) in enumerate(functional_df.groupby('Category', sort=False)): # set variables for chart function_order = get_ordered_uniques(list(frame['Function'])) num_vgfs_in_frame = len(set(frame['Contig Name'])) chart_width = HEATMAP_CELL_WIDTH * len(function_order) chart_height = HEATMAP_CELL_HEIGHT * num_vgfs_in_frame # set up colors for chart rect_colors = alt.Color('Present in Contig', legend=alt.Legend(symbolType='square', values=[True, False]), sort=[True, False], scale=alt.Scale(range=['#e5f5f9', '#2ca25f'])) # define chart # TODO: Figure out how to angle title to take up less space c = alt.Chart(frame, title=alt.TitleParams(group)).encode( x=alt.X('Function', title=None, axis=alt.Axis(labelLimit=0, labelAngle=90), sort=function_order), y=alt.Y('Contig Name', axis=alt.Axis(title=None, labels=False, ticks=False), sort=vgf_order), tooltip=[ alt.Tooltip('Contig Name'), alt.Tooltip('Category'), alt.Tooltip('Function'), alt.Tooltip('AMG Genes'), alt.Tooltip('Genes Present') ]).mark_rect().encode(color=rect_colors).properties( width=chart_width, height=chart_height) charts.append(c) # merge and return function_heatmap = alt.hconcat(*charts, spacing=5) return function_heatmap
def plot_map(query_string, year):
    """Function to create an altair choropleth world map plot showing the global obesity rates

    Args:
        query_string ([str]): string containing the attributes to be used
        in a pandas query for filtering the data for the bar plot
        year ([float]): year

    Returns:
        [altair chart]: An altair choropleth world map plot showing the global obesity rates
    """
    title_label = "Obesity Rates"
    sub_label = str(year)
    df = (he.make_rate_data(["country"], ["obese"], query_string).merge(
        cy_ids, "right", on="country").sort_values("obese", ascending=False))
    world = ((alt.Chart(
        geojson,
        title=alt.TitleParams(
            text=title_label, subtitle=sub_label)).mark_geoshape().transform_lookup(
                lookup="id",
                from_=alt.LookupData(df, "id", ["country", "obese"]),
            ).encode(
                color=alt.Color(
                    "obese:Q",
                    scale=alt.Scale(scheme="viridis"),
                    title="Obesity",
                    legend=alt.Legend(format=".0%"),
                ),
                stroke=alt.value("black"),
                tooltip=[
                    alt.Tooltip("country:N", title="Country"),
                    alt.Tooltip("obese:Q", format=".1%", title="Obesity Rate"),
                ],
            )).project("naturalEarth1").properties(width=450, height=300))
    return world
title='NI COVID-19 Positive Tests by Age Band from %s to %s' % (toplot['Date'].min().strftime('%-d %B %Y'), toplot['Date'].max().strftime('%-d %B %Y'))) plt = altair.vconcat( altair.layer( heatmap, heatmap.mark_text(align='right', baseline='middle', dx=43).encode( text=altair.Text('Most Recent Positive Tests'), color=altair.value('black')))).properties(title=altair.TitleParams( [ 'Data from DoH daily downloads', 'Numbers to right of chart show most recent 7 day total', 'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().strftime('%A %-d %B %Y') ], baseline='bottom', orient='bottom', anchor='end', fontWeight='normal', fontSize=10, dy=10), ) plt # %% bands = datastore.groupby(['Age_Band_5yr', 'Band Start', 'Band End'], dropna=False).size().reset_index()[[ 'Age_Band_5yr', 'Band Start', 'Band End' ]] bands = bands[bands['Age_Band_5yr'] != 'Not Known'] bands.fillna(90, inplace=True)
def lambda_handler(event, context):
    # Get the secret
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])
    tweets = []
    # Download the most recently updated Excel file
    s3 = boto3.client('s3')
    for change in event:
        obj = s3.get_object(Bucket=secret['bucketname'], Key=change['keyname'])['Body']
        stream = io.BytesIO(obj.read())
        # Load test data and add extra fields
        df = pandas.read_excel(stream, engine='openpyxl', sheet_name='Table 7', header=3)
        df.dropna(axis='columns', how='all', inplace=True)
        df.rename(columns=colclean, inplace=True)
        df.dropna(axis='index', subset=['Total'], inplace=True)
        # Get the latest dates with values for tests and rolling
        df['date'] = pandas.to_datetime(df['Week Ending'], format='%d/%m/%Y')
        df.sort_values('date', inplace=True)
        latest = df.iloc[-1]
        # Check against previous day's reports
        status = S3_scraper_index(s3, secret['bucketname'], secret['nisra-deaths-index'])
        index = status.get_dict()
        plots = []
        if latest['Total'] == 0:
            tweet = '''No deaths registered in Northern Ireland, week ended {date}
'''.format(
                date=latest['date'].strftime('%A %-d %B %Y'),
            )
        else:
            if latest['Total'] == 1:
                tweet = '''One death registered in Northern Ireland, week ended {date}, in:
'''.format(
                    date=latest['date'].strftime('%A %-d %B %Y')
                )
            else:
                tweet = '''{deaths:,} deaths registered in Northern Ireland, week ended {date}, in:
'''.format(
                    date=latest['date'].strftime('%A %-d %B %Y'),
                    deaths=int(latest['Total'])
                )
            for name in ['Hospital', 'Care Home', 'Hospice', 'Home', 'Other']:
                if latest[name] > 0:
                    tweet += '\u2022 %s: %s\n' % (name, int(latest[name]))
            tweet += '\n'
        if len(df) > 1:
            prev = df.iloc[-2]
            diff = latest['Total'] - prev['Total']
            tweet += '''{symb} {diff} {comp} than previous week
'''.format(
                symb=good_symb if diff < 0 else bad_symb,
                diff=abs(int(diff)),
                comp='fewer' if diff < 0 else 'more'
            )
        try:
            driver = get_chrome_driver()
            plots = []
            if driver is None:
                logging.error('Failed to start chrome')
            else:
                toplot = df[(df['Week Ending'] >
                             df['Week Ending'].max() - pandas.to_timedelta(84, unit='d'))]
                toplot = toplot.drop(columns=['Week of Death', 'date', 'Total']).melt(
                    id_vars='Week Ending', var_name='Location', value_name='Deaths')
                print(toplot)
                p = altair.vconcat(
                    altair.Chart(
                        toplot
                    ).mark_area().encode(
                        x=altair.X('Week Ending:T', axis=altair.Axis(title='Week of death')),
                        y=altair.Y('sum(Deaths):Q',
                                   axis=altair.Axis(title='Deaths', orient="right", tickMinStep=1)),
                        color=altair.Color('Location',
                                           sort=altair.SortField('order', order='descending')),
                    ).properties(
                        height=450,
                        width=800,
                        title='NI COVID-19 Deaths reported by NISRA from %s to %s' % (
                            toplot['Week Ending'].min().strftime('%-d %B %Y'),
                            toplot['Week Ending'].max().strftime('%-d %B %Y'))
                    ),
                ).properties(
                    title=altair.TitleParams(
                        ['Data from NISRA',
                         'https://twitter.com/ni_covid19_data on %s' %
                         datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                        baseline='bottom',
                        orient='bottom',
                        anchor='end',
                        fontWeight='normal',
                        fontSize=10,
                        dy=10
                    ),
                )
                plotname = 'nisra-deaths-time-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
                plotstore = io.BytesIO()
                p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
                plotstore.seek(0)
                plots.append({'name': plotname, 'store': plotstore})
        except Exception:
            logging.exception('Error creating plot')
        tweets.append({
            'text': tweet,
            'url': change['url'],
            'notweet': change.get('notweet'),
            'filedate': change['filedate'],
            'plots': plots
        })
    donottweet = []
    if len(tweets) > 1:
        for i in range(1, len(tweets)):
            for j in range(0, i):
                if tweets[i]['text'] == tweets[j]['text']:
                    donottweet.append(i)
    messages = []
    for idx in range(len(tweets)):
        tweet = tweets[idx]['text'] + tweets[idx]['url']
        if idx not in donottweet:
            if tweets[idx].get('notweet') is not True:
                api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'],
                                 secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
                upload_ids = api.upload_multiple(tweets[idx]['plots'])
                if change.get('testtweet') is True:
                    if len(upload_ids) > 0:
                        resp = api.dm(secret['twitter_dmaccount'], tweet, upload_ids[0])
                    else:
                        resp = api.dm(secret['twitter_dmaccount'], tweet)
                    messages.append('Tweeted DM ID %s' % (resp.id))
                else:
                    if len(upload_ids) > 0:
                        resp = api.tweet(tweet, media_ids=upload_ids)
                    else:
                        resp = api.tweet(tweet)
                    messages.append('Tweeted ID %s, ' % resp.id)
                    # Update the file index
                    for i in range(len(index)):
                        if index[i]['filedate'] == tweets[idx]['filedate']:
                            index[i]['tweet'] = resp.id
                            break
                    status.put_dict(index)
                    messages[-1] += ('updated %s' % secret['nisra-deaths-index'])
            else:
                messages.append('Did not tweet')
                print(tweet)
        else:
            messages.append('Duplicate found %s, did not tweet, ' % tweets[idx]['filedate'])
    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
def data_properties(y, selected_examples, num_tails: int = 2, dataset_name="test",
                    model_name: str = "model", rspath: str = ".",
                    display_dataframe: bool = False, display_figure: bool = False):
    save_name = model_name.lower() + "_" + dataset_name.lower()
    args_list = []
    hold_list = [[
        'Number of examples', 'Number of labels', 'Label cardinality',
        'Label density', 'Distinct labels', 'Distinct label sets',
        'Frequency of distinct label sets',
        'Mean imbalance ratio intra-class for all labels',
        'Mean imbalance ratio inter-class for all labels',
        'Mean imbalance ratio labelsets for all labels',
        'Labels having less than or equal to {0} examples'.format(num_tails),
        'Labels having more than {0} examples'.format(num_tails + 1),
        'KL difference between complete and data partition'
    ]]

    # 1. Compute properties of the complete data
    L_S = total_labels(y)
    LCard_S = cardinality(y)
    LDen_S = density(y)
    DL_S = distinct_labels(y)
    DLS_S = distinct_labelsets(y)
    PDL_S = propportion_distinct_labelsets(y)
    IR_intra = mean_ir_intra_class(y)
    IR_inter = mean_ir_inter_class(y)
    IR_labelset = mean_ir_labelset(y)

    # 1.1. Compute tail label properties for the complete data
    tail = np.sum(y.toarray(), axis=0)
    tail = tail[np.nonzero(tail)[0]]
    tail[tail <= num_tails] = 1
    tail[tail > num_tails] = 0
    tail_sum = int(tail.sum())
    tail[tail == 0] = -1
    tail[tail == 1] = 0
    tail_count = int(np.count_nonzero(tail))

    args_list.append('## PROPERTIES for {0}...'.format(dataset_name))
    args_list.append('\t>> Number of examples: {0}'.format(y.shape[0]))
    args_list.append('\t>> Number of labels: {0}'.format(L_S))
    args_list.append('\t>> Label cardinality: {0:.6f}'.format(LCard_S))
    args_list.append('\t>> Label density: {0:.6f}'.format(LDen_S))
    args_list.append('\t>> Distinct labels: {0}'.format(DL_S))
    args_list.append('\t>> Distinct label sets: {0}'.format(DLS_S))
    args_list.append(
        '\t>> Frequency of distinct label sets: {0:.6f}'.format(PDL_S))
    args_list.append(
        '\t>> Mean imbalance ratio intra-class for all labels: {0:.6f}'.format(IR_intra))
    args_list.append(
        '\t>> Mean imbalance ratio inter-class for all labels: {0:.6f}'.format(IR_inter))
    args_list.append(
        '\t>> Mean imbalance ratio labelsets for all labels: {0:.6f}'.format(IR_labelset))
    args_list.append(
        '\t>> Labels having less than or equal to {0} examples: {1}'.format(num_tails, tail_sum))
    args_list.append(
        '\t>> Labels having more than {0} examples: {1}'.format(num_tails + 1, tail_count))
    hold_list.append([
        y.shape[0], L_S, LCard_S, LDen_S, DL_S, DLS_S, PDL_S, IR_intra,
        IR_inter, IR_labelset, tail_sum, tail_count, 0
    ])

    # 2. Compute the label distribution of the complete data
    distr_y = np.sum(y.toarray(), axis=0)
    ntail_idx = np.nonzero(distr_y)[0]
    tail = distr_y[ntail_idx]
    tail_idx = np.argsort(tail)
    tail = tail[tail_idx]
    distr_y = distr_y / np.sum(y.toarray())

    # 3. Iteratively calculate properties of the training and test data, respectively
    split_set_name = ["training set", "test set"]
    tail_selected_list = []
    for idx in range(len(selected_examples)):
        y_tmp = y[selected_examples[idx]]
        distr_y_selected = np.sum(y_tmp.toarray(), axis=0)
        tail_selected = distr_y_selected[ntail_idx]
        tail_selected = tail_selected[tail_idx]
        distr_y_selected = distr_y_selected / np.sum(y.toarray())
        tail_selected_list.append(tail_selected)
        L_S_selected = total_labels(y_tmp)
        LCard_S_selected = cardinality(y_tmp)
        LDen_S_selected = density(y_tmp)
        DL_S_selected = distinct_labels(y_tmp)
        DLS_S_selected = distinct_labelsets(y_tmp)
        PDL_S_selected = propportion_distinct_labelsets(y_tmp)
        IR_intra_selected = mean_ir_intra_class(y_tmp)
        IR_inter_selected = mean_ir_inter_class(y_tmp)
        IR_labelset_selected = mean_ir_labelset(y_tmp)
        kl = entropy(pk=distr_y_selected, qk=distr_y)

        # 3.1. Compute tail label properties for the data partition
        temp = np.sum(y_tmp.toarray(), axis=0)
        temp = temp[np.nonzero(temp)[0]]
        temp[temp <= num_tails] = 1
        temp[temp > num_tails] = 0
        temp_sum = int(temp.sum())
        temp[temp == 0] = -1
        temp[temp == 1] = 0
        temp_count = int(np.count_nonzero(temp))

        args_list.append('## PROPERTIES for {0} ({1})...'.format(
            dataset_name, split_set_name[idx]))
        args_list.append('\t>> Number of examples: {0}'.format(y_tmp.shape[0]))
        args_list.append('\t>> Number of labels: {0}'.format(L_S_selected))
        args_list.append(
            '\t>> Label cardinality: {0:.6f}'.format(LCard_S_selected))
        args_list.append('\t>> Label density: {0:.6f}'.format(LDen_S_selected))
        args_list.append('\t>> Distinct labels: {0}'.format(DL_S_selected))
        args_list.append(
            '\t>> Distinct label sets: {0}'.format(DLS_S_selected))
        args_list.append(
            '\t>> Frequency of distinct label set: {0:.6f}'.format(PDL_S_selected))
        args_list.append(
            '\t>> Mean imbalance ratio intra-class for all labels: {0:.6f}'.format(
                IR_intra_selected))
        args_list.append(
            '\t>> Mean imbalance ratio inter-class for all labels: {0:.6f}'.format(
                IR_inter_selected))
        args_list.append(
            '\t>> Mean imbalance ratio labelsets for all labels: {0:.6f}'.format(
                IR_labelset_selected))
        args_list.append(
            '\t>> Labels having less than or equal to {0} examples: {1}'.format(
                num_tails, temp_sum))
        args_list.append(
            '\t>> Labels having more than {0} examples: {1}'.format(
                num_tails + 1, temp_count))
        args_list.append('\t>> KL difference between complete '
                         'and data partition: {0:.6f}'.format(kl))
        hold_list.append([
            y_tmp.shape[0], L_S_selected, LCard_S_selected, LDen_S_selected,
            DL_S_selected, DLS_S_selected, PDL_S_selected, IR_intra_selected,
            IR_inter_selected, IR_labelset_selected, temp_sum, temp_count, kl
        ])

    if not display_dataframe:
        for args in args_list:
            print(args)

    # Plotting utilities
    df_comp = pd.DataFrame({
        "Label": np.arange(1, 1 + tail.shape[0]),
        "Complete": tail,
        "Train": tail_selected_list[0],
        "Test": tail_selected_list[1]
    })
    df_comp = df_comp.melt(['Label'], var_name='Dataset', value_name='Sum')
    temp_text = "Number of examples for each label given {0} data.".format(dataset_name)
    plot_title = alt.TitleParams(
        temp_text,
        subtitle=[
            "The horizontal axis indicates the "
            "indices of labels while the vertical "
            "axis represents the number of associated "
            "examples"
        ])

    # Bar plot
    alt.themes.enable('none')
    chart = alt.Chart(df_comp, title=plot_title).properties(
        width=600, height=350).mark_bar(color="grey").encode(
            x=alt.X('Label:O', title="Label ID", sort='ascending'),
            y=alt.Y('Sum:Q', title="Number of Examples", stack=None),
            color=alt.Color('Dataset:N',
                            scale=alt.Scale(range=['red', 'black', 'blue'])),
        ).configure_header(titleFontSize=20, labelFontSize=15).configure_axis(
            labelLimit=500,
            titleFontSize=20,
            labelFontSize=12,
            labelPadding=5,
        ).configure_axisY(grid=False).configure_legend(
            strokeColor='gray',
            fillColor='white',
            padding=10,
            cornerRadius=10).resolve_scale(x='independent')

    # Save the chart and the properties table
    chart.save(os.path.join(rspath, save_name + '.html'))
    df = pd.DataFrame(hold_list).T
    df.columns = [
        'Properties for {0}'.format(dataset_name), 'Complete set',
        'Training set', 'Test set'
    ]
    df.to_csv(path_or_buf=os.path.join(rspath, save_name + ".tsv"),
              sep='\t', index=False)

    if display_dataframe and display_figure:
        return df, chart
    elif display_dataframe and not display_figure:
        return df
    elif not display_dataframe and display_figure:
        return chart
def lambda_handler(event, context):
    messages = []
    try:
        # Get the secret
        sm = boto3.client('secretsmanager')
        secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
        secret = json.loads(secretobj['SecretString'])
        # Get the index
        s3 = boto3.client('s3')
        status = S3_scraper_index(s3, secret['bucketname'], secret['cog-variants-index'])
        index = status.get_dict()
        # Create a copy of the file in s3
        if 'keyname' not in event:
            keyname = "COG-variants/%s/%s-%s.csv" % (
                event['filedate'],
                event['modified'].replace(':', '_'),
                event['length'])
            print('getting URL')
            with requests.get(event['url'], stream=True) as stream:
                stream.raise_for_status()
                stream.raw.decode_content = True
                s3.upload_fileobj(
                    stream.raw, secret['bucketname'], keyname,
                    Config=boto3.s3.transfer.TransferConfig(use_threads=False))
            print('done')
        else:
            keyname = event['keyname']
        # Download the most recently updated CSV file
        obj = s3.get_object(Bucket=secret['bucketname'], Key=keyname)['Body']
        stream = io.BytesIO(obj.read())
        # Dataframe for converting between pango lineage and WHO labels
        # Get the mapping from the raw Github URL
        resp = requests.get(
            'https://github.com/pbarber/covid19-pango-lineage-to-who-label/raw/main/mapping.json'
        )
        # Make sure that the request was successful
        resp.raise_for_status()
        # Convert the request data to a Python dictionary
        mapping = resp.json()
        # Expand the Pango column
        mapping = pandas.DataFrame(mapping).explode(
            'Pango lineages').reset_index(drop=True)
        # Filter out old designations
        mapping_current = mapping[
            mapping['Designation'] != 'Former Variant of Interest']
        # Load variant data, aggregate and push back to S3
        df = pandas.read_csv(stream)
        df = df[df['adm1'] == 'UK-NIR']
        df['Sample Date'] = pandas.to_datetime(df['sample_date'])
        df['Week of sample'] = df['Sample Date'] - pandas.to_timedelta(
            df['Sample Date'].dt.dayofweek, unit='d')
        # Join the lineage data
        matches = mapping['Pango lineages'].apply(match, col=df['lineage'])
        match_idx = matches.idxmax()
        # Filter out indexes where there is no match
        match_idx[match_idx == matches.idxmin()] = pandas.NA
        df['idx'] = match_idx
        # Join to the mapping based on indexes
        df = df.merge(mapping, how='left', left_on='idx',
                      right_index=True).drop(columns=['idx', 'Pango lineages'])
        df['WHO label'] = df['WHO label'].fillna('Other')
        lin_by_week = df.groupby(['Week of sample', 'WHO label']).size().rename('count')
        lin_pc_by_week = lin_by_week / lin_by_week.groupby(level=0).sum()
        lin_by_week = pandas.DataFrame(lin_by_week).reset_index()
        lin_pc_by_week = pandas.DataFrame(lin_pc_by_week).reset_index()
        stream = io.BytesIO()
        lin_by_week.to_csv(stream, index=False)
        stream.seek(0)
        lineage_key = '%s_lineage.csv' % keyname.rsplit('.', maxsplit=1)[0]
        s3.upload_fileobj(stream, secret['bucketname'], lineage_key)
        messages.append('Wrote lineage summary to s3')
        # Update the S3 index and find the previous date
        previous = '1970-01-01'
        prev_lineagekey = None
        thisindex = None
        for i in range(len(index)):
            if index[i]['modified'] == event['modified']:
                index[i]['lineage'] = lineage_key
                index[i]['keyname'] = keyname
                thisindex = i
            elif index[i]['filedate'] != event['filedate']:
                if (index[i]['filedate'] > previous) and (index[i]['filedate'] < event['filedate']):
                    previous = index[i]['filedate']
                    prev_lineagekey = index[i].get('lineage')
        status.put_dict(index)
        # If there is a previous file, then load it and work out the differences
        if prev_lineagekey is not None:
            obj = s3.get_object(Bucket=secret['bucketname'], Key=prev_lineagekey)['Body']
            stream = io.BytesIO(obj.read())
            prev_lineage = pandas.read_csv(stream)
            if 'WHO label' not in prev_lineage.columns:
                prev_lineage['WHO label'] = 'Other'
            prev_lineage = prev_lineage.groupby('WHO label')['count'].sum()
            lineage = lin_by_week.groupby('WHO label')['count'].sum().reset_index()
            lineage = lineage.merge(prev_lineage, how='left', on='WHO label')
            lineage = lineage.groupby('WHO label').sum()[['count_x', 'count_y']]
            lineage['count_y'] = lineage['count_y'].fillna(0)
            lineage['diff'] = (lineage['count_x'] - lineage['count_y']).fillna(0).astype(int)
            top5 = lineage.nlargest(5, 'diff')
            tweet = """{total:,d} new variant analyses reported for NI on {currdate} since {prevdate} ({altogether:,d} total):
""".format(
                total=lineage['diff'].sum(),
                prevdate=datetime.datetime.strptime(
                    previous, '%Y-%m-%d').date().strftime('%A %-d %B %Y'),
                currdate=datetime.datetime.strptime(
                    event['filedate'], '%Y-%m-%d').date().strftime('%A %-d %B %Y'),
                altogether=lineage['count_x'].sum())
            for variant, data in top5.to_dict('index').items():
                if data['diff'] > 0:
                    tweet += f"\u2022 {variant}: {data['diff']:,d} (of {data['count_x']:,d})\n"
            others = int(lineage['diff'].sum() - top5['diff'].sum())
            if others != 0:
                tweet += f"\u2022 Others: {others:,d}\n"
            tweet += '\nSource: https://beta.microreact.org/'
            driver = get_chrome_driver()
            if driver is None:
                raise Exception('Failed to start chrome')
            p = altair.vconcat(
                altair.Chart(
                    lin_by_week[lin_by_week['Week of sample'] >
                                lin_by_week['Week of sample'].max() -
                                pandas.to_timedelta(84, unit='d')]
                ).mark_line().encode(
                    x=altair.X('Week of sample:T',
                               axis=altair.Axis(title='', labels=False, ticks=False)),
                    y=altair.Y('count:Q', axis=altair.Axis(title='Samples')),
                    color='WHO label'
                ).properties(
                    height=225,
                    width=800,
                    title='NI COVID-19 variants identified by COG-UK over the most recent 12 weeks'
                ),
                altair.Chart(
                    lin_pc_by_week[lin_pc_by_week['Week of sample'] >
                                   lin_pc_by_week['Week of sample'].max() -
                                   pandas.to_timedelta(84, unit='d')]
                ).mark_area().encode(
                    x='Week of sample:T',
                    y=altair.Y('sum(count):Q',
                               axis=altair.Axis(format='%', title='% of samples', orient="right")),
                    color='WHO label'
                ).properties(
                    height=225,
                    width=800,
                )
            ).properties(
                title=altair.TitleParams(
                    ['Variant identification can take up to 3 weeks, so recent totals are likely to be revised upwards',
                     'https://twitter.com/ni_covid19_data on %s' %
                     datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10),
            )
            plotname = 'ni-variants-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            if event.get('notweet') is not True:
                api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'],
                                 secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
                resp = api.upload(plotstore, plotname)
                if event.get('testtweet') is True:
                    resp = api.dm(secret['twitter_dmaccount'], tweet, resp.media_id)
                    messages.append('Tweeted DM ID %s, ' % resp.id)
                else:
                    resp = api.tweet(tweet, media_ids=[resp.media_id])
                    messages.append('Tweeted ID %s, ' % resp.id)
                    # Update the file index
                    index[thisindex]['tweet'] = resp.id
                    status.put_dict(index)
            else:
                messages.append('Did not tweet')
                print(tweet)
        else:
            messages.append('Did not find previous lineage data')
    except Exception:
        logging.exception('Caught exception in COG variants tweeter')
    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
def multi_emissions_chart(council: Council): # get just the total emissions data totals = [ "Industry Total", "Commercial Total", "Public Sector Total", "Transport Total", "Domestic Total", ] df = DataPoint.objects.filter( council=council, data_type__name_in_source__in=totals).to_dataframe( "year", ("data_type__name_in_source", "emissions_type"), "value") # get row percentages pdf = df.pivot_table("value", index="year", columns="emissions_type", aggfunc="sum") pdf = (pdf.div(pdf.sum(axis=1), axis=0).reset_index().melt( id_vars=["year"]).rename(columns={"value": "row_percentage"})) df = df.merge(pdf) # Tidy Emissions type label df["emissions_type"] = df["emissions_type"].str.replace(" Total", "") # altair doesn't pick up years right unless in a fake date format df["Year"] = df["year"] df["year"] = df["year"].astype(int).apply(lambda x: f"{x}-01-01") chart = ( alt.Chart(df).mark_area().encode( x=alt.X("year:T", title="", axis=alt.Axis(labelAlign="center")), y=alt.Y( "value", title="", ), color=alt.Color( "emissions_type", scale=alt.Scale( domain=[ "Commercial", "Domestic", "Industry", "Public Sector", "Transport", ], range=[ "#00aeee", # $color-ceuk-blue "#005cab", # $color-ceuk-navy "#e11d21", # $color-ceuk-red "#f29e1a", # $color-ceuk-orange "#ffd80b", # $color-ceuk-yellow ], ), ), tooltip=[ "Year", alt.Tooltip("emissions_type", title="Type"), alt.Tooltip("value", title="Emissions in ktCO2e", format=",.2f"), alt.Tooltip("row_percentage", title="% of Emissions in year", format=".1%"), ], ).properties( title=alt.TitleParams( "Historic emissions by sector, 2005–2019", subtitle=[f"{council.name}, ktCO2e"], ), width="container", height=300, )) # wide format table for longdesc wide_table = (df.rename(columns={ "emissions_type": "Emissions Type" }).pivot_table( ["value"], index="Year", columns="Emissions Type", aggfunc="sum", ).style.format("{:.2f}".format)) alt_title = f"Chart showing emissions by sector for {council.name}" data_source = "Data source: 2020 BEIS Emissions data" return ChartBundle( label="multi_emissions", df=wide_table, chart=chart, alt_title=alt_title, data_source=data_source, )
def lambda_handler(event, context):
    # Get the secret
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])
    s3 = boto3.client('s3')
    messages = []
    # Download the most recently updated PDF file
    for change in event:
        tmp = tempfile.NamedTemporaryFile(suffix='.pdf')
        with open(tmp.name, 'wb') as fp:
            s3.download_fileobj(secret['bucketname'], change['keyname'], fp)
        # Get the date range covered by the report
        text = textract.process(tmp.name, method='pdfminer').decode('utf-8')
        regex = re.compile(r'(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})\s+\–+\s+(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})')
        start_date = None
        end_date = None
        for line in text.split('\n'):
            m = regex.search(line)
            if m:
                start_date = pandas.to_datetime(
                    '%s %s %s' % (m.group(1), m.group(2), m.group(3)),
                    format='%d %B %Y').date()
                end_date = pandas.to_datetime(
                    '%s %s %s' % (m.group(4), m.group(5), m.group(6)),
                    format='%d %B %Y').date()
                break
        if start_date is None:
            logging.error('Unable to find start date in report')
            return {
                "statusCode": 404,
                "body": 'Unable to find start date in report %s' % change['url'],
            }
        # Get the tables from the report - note that it was not possible to get
        # data from 4th April or earlier due to tables that will not parse
        # properly in the PDF
        tables = tabula.read_pdf(tmp.name, pages="all", multiple_tables=True)
        tablecount = 0
        dataset = pandas.DataFrame()
        for df in tables:
            if 'Total' not in df.columns:
                firstrow = df.iloc[0]
                newcols = []
                for i in range(len(firstrow)):
                    if isinstance(firstrow[i], float) and math.isnan(firstrow[i]):
                        newcols.append(df.columns[i])
                    else:
                        newcols.append(firstrow[i])
                df.columns = newcols
                df = df[1:]
            df['Setting'] = df['Setting'].str.strip()
            df.dropna(axis='index', subset=['Total', 'Open', 'Closed'], inplace=True)
            df['Total'] = df['Total'].astype(int)
            df['Open'] = df['Open'].astype(int)
            df['Closed'] = df['Closed'].astype(int)
            df = df[df['Setting'] != 'Total']
            if tablecount == 0:
                df['Type'] = 'Probable Outbreak'
            elif tablecount == 1:
                df['Type'] = 'Cluster'
            else:
                logging.warning('Unexpected table: %s' % df)
            tablecount += 1
            dataset = pandas.concat([dataset, df])
        dataset['Start Date'] = pandas.to_datetime(start_date)
        dataset['End Date'] = pandas.to_datetime(end_date)
        week = int((end_date - pandas.to_datetime('1 January 2020', format='%d %B %Y').date()).days / 7)
        dataset['Week'] = week
        # Create a simple summary and the tweet text
        summary = dataset.groupby('Type').sum()
        tweet = 'NI Contact Tracing reports from %s to %s:\n' % (
            start_date.strftime('%-d %B %Y'), end_date.strftime('%-d %B %Y'))
        for Type, data in summary.to_dict('index').items():
            tweet += '\u2022 %d %ss (%d open, %d closed)\n' % (
                data['Total'], Type.lower(), data['Open'], data['Closed'])
        tweet += '\n%s' % change['url']
        # Pull current data from s3
        try:
            obj = s3.get_object(Bucket=secret['bucketname'],
                                Key=secret['pha-clusters-datastore'])['Body']
        except s3.exceptions.NoSuchKey:
            print("The object %s does not exist in bucket %s." % (
                secret['pha-clusters-datastore'], secret['bucketname']))
            datastore = pandas.DataFrame(columns=['Week'])
        else:
            stream = io.BytesIO(obj.read())
            datastore = pandas.read_csv(stream)
            # Clean out any data with matching dates
            datastore = datastore[datastore['Week'] != week]
        # Append the new data
        datastore = pandas.concat([datastore, dataset])
        datastore['Start Date'] = pandas.to_datetime(datastore['Start Date'])
        datastore['End Date'] = pandas.to_datetime(datastore['End Date'])
        # Replace any known duplicates
        datastore['Setting'] = datastore['Setting'].replace({
            'Cinema/ Theatre / Entertainment': 'Cinema / Theatre / Entertainment Venue',
            'Cinema/ Theatre / Entertainment Venue': 'Cinema / Theatre / Entertainment Venue',
            'Funeral / Wakes': 'Funeral / Wake',
            'Restaurant / Cafe': 'Restaurant / Café'
        })
        # Push the data to s3
        stream = io.BytesIO()
        datastore.to_csv(stream, index=False)
        stream.seek(0)
        s3.upload_fileobj(stream, secret['bucketname'], secret['pha-clusters-datastore'])
        # Set up chromedriver so we can save altair plots
        driver = get_chrome_driver()
        plots = []
        if driver is None:
            logging.error('Failed to start chrome')
        else:
            p = altair.vconcat(
                altair.Chart(
                    dataset
                ).mark_bar().encode(
                    x=altair.X('Total:Q', axis=altair.Axis(title='Total reported')),
                    y=altair.Y('Setting:O'),
                    color='Type',
                    order=altair.Order('Type', sort='ascending'),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 Contact Tracing reports from %s to %s' % (
                        start_date.strftime('%-d %B %Y'), end_date.strftime('%-d %B %Y'))
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                     'Covers the preceding four weeks',
                     'https://twitter.com/ni_covid19_data on %s' %
                     datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-week-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
            p = altair.vconcat(
                altair.Chart(
                    datastore.groupby(['End Date', 'Type'])['Total'].sum().reset_index()
                ).mark_area().encode(
                    x=altair.X('End Date:T',
                               axis=altair.Axis(title='Date reported (for preceding four weeks)')),
                    y=altair.Y('Total:Q',
                               axis=altair.Axis(title='Total reported', orient="right")),
                    color='Type',
                    order=altair.Order('Type', sort='ascending'),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 Contact Tracing reports from %s to %s' % (
                        datastore['Start Date'].min().strftime('%-d %B %Y'),
                        datastore['End Date'].max().strftime('%-d %B %Y'))
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                     'Reported weekly for the preceding four weeks',
                     'https://twitter.com/ni_covid19_data on %s' %
                     datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-time-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
            p = altair.vconcat(
                altair.Chart(
                    datastore.groupby(['End Date', 'Setting', 'Type'])['Total'].sum().reset_index()
                ).mark_area().encode(
                    x=altair.X('End Date:T', axis=altair.Axis(title='')),
                    y=altair.Y('Total:Q', axis=altair.Axis(title='', orient="right")),
                    color='Type',
                    facet=altair.Facet('Setting:O', columns=5, title=None, spacing=0),
                    order=altair.Order('Type', sort='ascending'),
                ).properties(
                    height=90,
                    width=160,
                    title=altair.TitleParams(
                        'NI COVID-19 Contact Tracing reports by setting from %s to %s' % (
                            datastore['Start Date'].min().strftime('%-d %B %Y'),
                            datastore['End Date'].max().strftime('%-d %B %Y')),
                        anchor='middle',
                    ),
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                     'Reported weekly for the preceding four weeks',
                     'https://twitter.com/ni_covid19_data on %s' %
                     datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-small-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
        # Convert to dates to ensure correct output to CSV
        datastore['Start Date'] = datastore['Start Date'].dt.date
        datastore['End Date'] = datastore['End Date'].dt.date
        # Tweet out the text and images
        if change.get('notweet') is not True:
            api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'],
                             secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
            upload_ids = api.upload_multiple(plots)
            if change.get('testtweet') is True:
                if len(upload_ids) > 0:
                    resp = api.dm(secret['twitter_dmaccount'], tweet, upload_ids[0])
                    if len(upload_ids) > 1:
                        resp = api.dm(secret['twitter_dmaccount'], 'Test 1', upload_ids[1])
                    if len(upload_ids) > 2:
                        resp = api.dm(secret['twitter_dmaccount'], 'Test 2', upload_ids[2])
                else:
                    resp = api.dm(secret['twitter_dmaccount'], tweet)
                messages.append('Tweeted DM ID %s' % (resp.id))
            else:
                if len(upload_ids) > 0:
                    resp = api.tweet(tweet, media_ids=upload_ids)
                else:
                    resp = api.tweet(tweet)
                # Download and update the index
                status = S3_scraper_index(s3, secret['bucketname'], secret['pha-clusters-index'])
                index = status.get_dict()
                for i in range(len(index)):
                    if index[i]['filedate'] == change['filedate']:
                        index[i]['tweet'] = resp.id
                        break
                status.put_dict(index)
                messages.append('Tweeted ID %s and updated %s' % (resp.id, secret['pha-clusters-index']))
        else:
            print(tweet)
            messages.append('Did not tweet')
    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
import altair as alt
import pandas as pd

penguins_df = pd.read_csv('data/penguins.csv')

# The base plot
base = alt.Chart(penguins_df).mark_bar().encode(
    alt.Y('species', title=None),
    alt.X('count()', title='Number of penguins'))
base

# Add count labels next to the bars
text = alt.Chart(penguins_df).mark_text(align='center', dx=10).encode(
    alt.Y('species'),
    alt.X('count()'),
    alt.Text('count()'))

# Set up the title and subtitle formatting
penguin_title = alt.TitleParams(
    "Adelie penguins are the most abundant species in the Antarctic",
    subtitle="The Chinstrap species appears to have the lowest penguin population.",
    fontSize=18,
    subtitleColor='firebrick')

formatted_plot = (base + text).configure_view(strokeWidth=0).properties(
    height=200, width=300, title=penguin_title)
formatted_plot
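# The pattern above generalises: build the layers, then attach a single
# alt.TitleParams object to the combined chart. If a standalone file is
# wanted, Altair can write HTML directly; the filename here is just an example.
formatted_plot.save('penguins_bar_chart.html')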
def render_object(self): df = self.fix_df() obj = alt.Chart(df) if self.chart_type == "line": obj = obj.mark_line(point={"size": 100}) if self.chart_type == "bar": obj = obj.mark_bar() if self.chart_type == "step": obj = obj.mark_line(interpolate='step-after', point=True) options = self.safe_options() x_axis = options['x'] y_axis = options['y'] # hack to push the y-axis to the rough position of the left most label # on the y axis axis_name = "" if not isinstance(y_axis.shorthand, UndefinedType): axis_name = y_axis.shorthand if not isinstance(y_axis.field, UndefinedType): axis_name = y_axis.field if isinstance(y_axis.axis, UndefinedType): y_axis.axis = alt.Axis() # if any kind of formatting of number, assume the default is fine if isinstance(y_axis.axis.format, UndefinedType): format_str = "" else: format_str = y_axis.axis.format if axis_name and not format_str: col = df[axis_name] try: col = col.astype(int) except ValueError: pass try: col = col.map('{:,d}'.format) except ValueError: pass max_len = col.astype(str).str.len().max() if max_len > 5: y_axis.axis.titleX = 0 - (int(max_len * 6.5) + 10) # add spacing to x axis to match ggplot approach values = None try: values = x_axis["axis"]["values"] except Exception: pass if isinstance(values, pd.Series) is False: values = None try: values = df[x_axis.shorthand] except Exception as e: pass if values is not None and values.dtype in [np.int64, np.int32]: maxv = values.max() + 0.5 minv = values.min() - 0.5 options["x"].scale = alt.Scale(domain=[minv, maxv]) obj = obj.encode(**options) if self.interactive: obj = obj.interactive() # process any label functions if self.text_options: text_opts = dict(self.text_options) text_option = text_opts["text"] del text_opts["text"] text_obj = obj.mark_text(**text_opts) text_obj = text_obj.encode(text=text_option) obj = (obj + text_obj) properties = {} if self.default_width: properties["width"] = self.default_width if self.ratio: properties["height"] = "container" if self.title and self.html_chart_titles is False: properties["title"] = self.title obj = obj.properties(**properties) if self.footer: obj = obj.properties(title=alt.TitleParams(self.footer, baseline='bottom', orient='bottom', anchor='end', fontWeight='normal', fontSize=10 )) obj = self.custom_settings(obj) return obj
alt.Color('count()', title='Quantity',
          scale=alt.Scale(scheme='blues'))).properties(width=150, height=100)

# Histograms
culmen_facet_plot = alt.Chart(
    penguins_df.dropna(subset=['sex', 'species'])).mark_bar(opacity=0.5).encode(
        alt.X('culmen_depth_mm', bin=alt.Bin(maxbins=40), title='Culmen depth (mm)'),
        alt.Y('count()', stack=None, title='Number of penguins'),
        alt.Color('species', title='Species')).properties(
            width=180, height=100).facet('sex', title='').resolve_scale(y='independent')

# Titles for the full visualization
titles = alt.TitleParams(
    "We've discovered many insights from the Penguins dataset",
    subtitle=[
        "We've learned that the Adelie and Chinstrap penguin",
        "species have similar culmen depth and body mass, but quite different culmen lengths"
    ],
    fontSize=18,
    align='center',
    anchor='middle')

# Organize the plots above into the final layout
combined_plot = (mass_density_plot & (mass_boxplot | penguin_heatmap)
                 & culmen_facet_plot).properties(title=titles)
combined_plot
def plot_time(query_string, highlight_country, year_range): """Function to create a time series plot showing the country-wise global obesity rates Function to create a time series(spaghetti) plot showing the global obesity rates for all the countries for a range of years as selected by the user Args: query_string ([str]): string containing the attributes to be used in a pandas query for filtering the data for the bar plot highlight_country ([str]): name of the country to be highlighted in the time series plot year_range ([float]): range of years to be selected for the time series plot Returns: [altair chart]: An altair time series plot showing the country-wise global obesity rates """ # Filter data ob_yr = he.make_rate_data(["country", "year"], ["obese"], query_string) # Create labels title_label = "World Obesity" sub_label = str(year_range[0]) + "-" + str(year_range[1]) # Format country highlight_country = ([highlight_country] if type(highlight_country) == str else highlight_country) # Get data for highlighted countries highlighted_data = ob_yr[ob_yr["country"].isin(highlight_country)] highlighted_data.loc[:, "highlighted"] = [ country if country in highlight_country else "other" for country in highlighted_data["country"] ] # Create chart country_time_chart = ( alt.Chart( ob_yr, title=alt.TitleParams( text=title_label, subtitle=sub_label)).mark_line().encode( x=alt.X( "year:O", scale=alt.Scale(zero=False), title="Years", axis=alt.Axis(grid=False), ), y=alt.Y( "obese:Q", title="Obesity Rate", axis=alt.Axis(format="%"), ), color=alt.condition( alt.Predicate( alt.FieldOneOfPredicate(field="country", oneOf=highlight_country)), "country", alt.value("lightgray"), # legend=None, ), opacity=alt.condition( alt.Predicate( alt.FieldOneOfPredicate(field="country", oneOf=highlight_country)), alt.value(1), alt.value(0.2), ), tooltip="country", ).properties(width=450, height=300).interactive()) highlighted_time_chart = (alt.Chart(highlighted_data).mark_line().encode( x=alt.X( "year:O", scale=alt.Scale(zero=False), title="Years", axis=alt.Axis(grid=False), ), y=alt.Y( "obese:Q", title="Obesity Rate", axis=alt.Axis(format="%"), ), color=alt.Color( "highlighted", legend=alt.Legend(title="Countries", values=highlight_country), ), tooltip="country", )) # return country_time_chart return country_time_chart + highlighted_time_chart