예제 #1
0
def determine_plot_title(
    metric_plot_component: MetricPlotComponent,
    domain_plot_component: DomainPlotComponent,
) -> alt.TitleParams:
    """Determines the appropriate title for a chart based on input components.

    Conditionally renders a subtitle if relevant (specifically with column domain)

    Args:
        metric_plot_component: Plot utility corresponding to a given metric.
        domain_plot_component: Plot utility corresponding to a given domain.

    Returns:
        An Altair TitleParams object

    """
    contents: str = f"{metric_plot_component.title} per {domain_plot_component.title}"
    subtitle: Optional[str] = domain_plot_component.subtitle

    # Altair expects the subtitle as a list of lines; only attach it when present.
    title: alt.TitleParams
    if subtitle:
        title = alt.TitleParams(contents, subtitle=[subtitle])
    else:
        title = alt.TitleParams(contents)

    return title
예제 #2
0
def plot_points_average_and_trend(configs, title, footer):
    """Stack one points/average/trend panel per config with a shared x scale.

    The footer is rendered as a bottom-anchored note on the stacked panels,
    and the main title is centred above the whole concatenation.
    """
    panels = [points_average_and_trend(**config) for config in configs]
    footer_title = altair.TitleParams(
        footer,
        baseline='bottom',
        orient='bottom',
        anchor='end',
        fontWeight='normal',
        fontSize=10,
        dy=10)
    stacked = altair.vconcat(*panels).resolve_scale(x='shared').properties(
        title=footer_title)
    main_title = altair.TitleParams(title, anchor='middle')
    return altair.concat(stacked).properties(title=main_title)
예제 #3
0
def make_etc_coverage_heatmap(etc_coverage, mag_order=None, module_order=None):
    """Draw one module-coverage heatmap per ETC complex and concatenate them.

    Args:
        etc_coverage: Frame with columns 'genome', 'complex', 'module_name',
            'path_length', 'path_length_coverage', 'genes', 'missing_genes'
            and 'percent_coverage'.
        mag_order: Optional explicit row (genome) sort order shared by panels.
        module_order: Optional explicit column (module) sort order.

    Returns:
        An Altair hconcat chart titled 'ETC Complexes'.
    """
    num_mags_in_frame = len(set(etc_coverage['genome']))
    charts = list()
    # One panel per ETC complex. Genome labels are hidden on every panel;
    # the shared sort order keeps rows aligned across panels.
    # (Was `for i, (...) in enumerate(...)` — the index was never used.)
    for etc_complex, frame in etc_coverage.groupby('complex'):
        c = alt.Chart(frame, title=etc_complex).encode(
            x=alt.X('module_name',
                    title=None,
                    axis=alt.Axis(labelLimit=0, labelAngle=90),
                    sort=module_order),
            y=alt.Y('genome',
                    axis=alt.Axis(title=None, labels=False, ticks=False),
                    sort=mag_order),
            tooltip=[
                alt.Tooltip('genome', title='Genome'),
                alt.Tooltip('module_name', title='Module Name'),
                alt.Tooltip('path_length', title='Module Subunits'),
                alt.Tooltip('path_length_coverage', title='Subunits present'),
                alt.Tooltip('genes', title='Genes present'),
                alt.Tooltip('missing_genes', title='Genes missing')
            ]).mark_rect().encode(
                color=alt.Color('percent_coverage',
                                legend=alt.Legend(title='% Complete'),
                                scale=alt.Scale(domain=(0, 1)))).properties(
                                    # Size panels proportionally to their cell counts.
                                    width=HEATMAP_CELL_WIDTH *
                                    len(set(frame['module_name'])),
                                    height=HEATMAP_CELL_HEIGHT *
                                    num_mags_in_frame)
        charts.append(c)
    concat_title = alt.TitleParams('ETC Complexes', anchor='middle')
    return alt.hconcat(*charts, spacing=5, title=concat_title)
예제 #4
0
def plot_heatmap(df,
                 ptitle,
                 x,
                 y,
                 xtitle="",
                 ytitle="",
                 annot_fmt=".3f",
                 ptitle_offset=-5):
    """Log-scaled heatmap of df['value'] with a text annotation in each cell."""
    # An empty title string is passed through unchanged; otherwise wrap it so
    # the vertical offset can be applied.
    if ptitle != "":
        ptitle = alt.TitleParams(ptitle, offset=ptitle_offset)

    base = alt.Chart(df, title=ptitle).mark_rect().encode(
        x=alt.X(x, title=xtitle),
        y=alt.Y(y, title=ytitle),
    )

    cell_color = alt.Color(
        "value:Q",
        scale=alt.Scale(type="log", scheme="yelloworangered"),
        legend=None,
    )
    heatmap = base.mark_rect(stroke="white", strokeWidth=2).encode(color=cell_color)

    # White text on dark cells, black on light ones, split at the mean value.
    label_color = alt.condition(
        alt.datum.value > df["value"].mean(),
        alt.value("white"),
        alt.value("black"),
    )
    text = base.mark_text(baseline="middle").encode(
        text=alt.Text("value:Q", format=annot_fmt),
        color=label_color,
    )

    return heatmap + text
예제 #5
0
def plot_density(qual, winetype, xcol):
    """Density plot of `xcol` for the selected wine types, rendered to HTML.

    Quality factors 0-2 narrow the data to that quality level; any other
    value keeps all qualities for the chosen wine types.
    """
    type_mask = wine["Wine"].isin(winetype)
    if qual in [0, 1, 2]:
        subset = wine.loc[(wine["Quality Factor Numeric"] == qual) & type_mask]
    else:
        subset = wine.loc[type_mask]

    wine_colors = alt.Scale(domain=['red', 'white'], range=['darkred', 'blue'])
    chart_title = alt.TitleParams(
        text='Wine Quality Factor Distributions',
        align='left',
        fontSize=14)

    chart = alt.Chart(subset).transform_density(
        density=xcol,
        groupby=['Wine', 'Quality Factor'],
        as_=['value', 'density'],
        steps=200,  # bandwidth=5
    ).mark_area(opacity=0.5).encode(
        alt.X('value:Q', title=xcol, axis=alt.Axis(labels=True, grid=True)),
        alt.Y('density:Q',
              title=None,
              axis=alt.Axis(labels=False, grid=False, ticks=False)),
        alt.Color("Wine", scale=wine_colors),
    ).properties(
        height=300,
        width=1000,
        title=chart_title,
    ).configure_view(stroke=None)

    return chart.to_html()
예제 #6
0
def plot_rate_bw(rate,
                 bw,
                 yinnerdomain,
                 yclientdomain,
                 use_legend,
                 xdomain=None,
                 rate_height=60,
                 bw_height=40):
    """Stack a success-rate line over two bandwidth line charts on a shared x axis.

    Args:
        rate: Frame with 'time' and 'success_rate' columns.
        bw: Long-format frame with 'time', 'variable' and 'value' columns;
            rows whose 'variable' contains 'Client' go to the lower panel.
        yinnerdomain: y-scale domain for the non-client bandwidth panel.
        yclientdomain: y-scale domain for the client bandwidth panel.
        use_legend: Whether to render color legends on the bandwidth panels.
        xdomain: Optional shared x domain; the bandwidth panels default to
            [0, bw['time'].max()] when it is None.
        rate_height: Height of the success-rate panel.
        bw_height: Height of each bandwidth panel.

    Returns:
        An Altair vconcat of the rate chart over the two bandwidth charts.
    """
    # Fixed: `xdomain == None` -> `xdomain is None`; also hoisted the
    # duplicated fallback-domain expression shared by both bandwidth panels.
    bw_xdomain = [0, bw['time'].max()] if xdomain is None else xdomain

    rate_chart = alt.Chart(rate).mark_line(color='#5e6472', clip=True).encode(
        x=alt.X('time',
                axis=alt.Axis(title='', labels=False),
                scale=alt.Scale(nice=False, domain=xdomain)),
        y=alt.Y('success_rate:Q',
                axis=alt.Axis(title=['Success', 'rate (s⁻¹)'])),
    ).properties(height=rate_height)

    # Boolean mask splitting client traffic from inner traffic.
    client_comm = bw.apply(lambda row: 'Client' in row['variable'], axis=1)

    bw_inner_chart = alt.Chart(bw[~client_comm]).mark_line(clip=True).encode(
        x=alt.X("time",
                axis=alt.Axis(title='', labels=False),
                scale=alt.Scale(nice=False, domain=bw_xdomain)),
        y=alt.Y("value",
                axis=alt.Axis(title=''),
                scale=alt.Scale(domain=yinnerdomain)),
        color=alt.Color(
            'variable',
            legend=(alt.Legend(
                title=['Traffic', 'Direction']) if use_legend else None)),
        strokeDash=alt.StrokeDash('variable',
                                  legend=None)).properties(height=bw_height)

    bw_client_chart = alt.Chart(bw[client_comm]).mark_line(clip=True).encode(
        x=alt.X("time",
                axis=alt.Axis(title='Timestamp (s)'),
                scale=alt.Scale(nice=False, domain=bw_xdomain)),
        y=alt.Y("value",
                axis=alt.Axis(title=''),
                scale=alt.Scale(domain=yclientdomain)),
        color=alt.Color('variable',
                        legend=(alt.Legend(title='') if use_legend else None)),
        strokeDash=alt.StrokeDash('variable',
                                  legend=None)).properties(height=bw_height)

    upper = rate_chart
    # The left-oriented title acts as a shared y-axis label for both panels.
    lower = alt.vconcat(bw_inner_chart,
                        bw_client_chart,
                        title=alt.TitleParams('Bandwidth (MB/s)',
                                              orient='left',
                                              anchor='middle',
                                              dx=15)).resolve_scale(
                                                  color='shared',
                                                  strokeDash='independent')

    return alt.vconcat(upper, lower).resolve_scale(x=alt.ResolveMode('shared'))
예제 #7
0
def plot_factor(regressor, grouper, query_string):
    """Scatter plot of obesity rate against a chosen factor, grouped by an aggregator.

    Args:
        regressor ([str]): the regressor to be used in the scatter plot

        grouper ([str]): the attribute to be used for grouping the data in the scatter plot

        query_string ([str]): string containing the attributes to be used in a pandas query
                               for filtering the data for the bar plot

    Returns:
        [altair chart]: An altair scatter plot showing the association of obesity rate vs other factors
    """

    label_dict = {
        "primedu": "Primary Education Completion Rate",
        "smoke": "Smoking Rate",
        "unemployed": "Unemployment Rate",
        "income": "Income Group",
        "sex": "Sex",
        "region": "Region",
    }

    title_label = "Obesity Rate vs " + label_dict[regressor]
    sub_label = "" if grouper == "none" else "by " + label_dict[grouper]

    temp = he.make_rate_data(["country", grouper],
                             ["primedu", "smoke", "unemployed", "obese"],
                             query_string)

    chart_title = alt.TitleParams(text=title_label, subtitle=sub_label)
    x_enc = alt.X(
        regressor,
        type="quantitative",
        title=label_dict[regressor],
        axis=alt.Axis(format="%", grid=False),
    )
    y_enc = alt.Y("obese",
                  title="Obesity Rate",
                  axis=alt.Axis(format="%", grid=False))
    tooltips = [
        alt.Tooltip("country:N", title="Country"),
        alt.Tooltip(grouper, title="Grouping Variable"),
        alt.Tooltip("obese:Q", format=".1%", title="Obesity Rate"),
    ]

    factor_chart = alt.Chart(temp, title=chart_title).mark_circle(
        opacity=0.25).encode(
            x=x_enc,
            y=y_enc,
            color=alt.Color(grouper, type="nominal", title="Legend"),
            tooltip=tooltips,
        ).properties(width=450, height=150).interactive()

    return factor_chart
예제 #8
0
def source_selector(source) -> alt.Chart:
    """Square-mark selector listing each source, wired to the source brush."""
    y_enc = alt.Y(
        "source:N",
        axis=alt.Axis(orient="right", domain=False, ticks=False),
        title=None,
    )
    chart = alt.Chart(source).mark_square(size=50, opacity=0.3).encode(
        y=y_enc,
        color=source_color_or(idle_color),
    )
    chart = chart.add_selection(source_selection_brush)
    return chart.properties(title=alt.TitleParams("Select model", anchor="start"))
예제 #9
0
def source_vs_hour_chart(
    base: alt.Chart, sensor_unit: str, max_absolute_error: float, faceted: bool = False
) -> Union[alt.Chart, alt.FacetChart]:
    """Build a rect heatmap of mean absolute error per source and hour of day.

    Args:
        base: Pre-configured base chart carrying the data and any prior encodings.
        sensor_unit: Unit label interpolated into the error tooltip title.
        max_absolute_error: Upper bound of the color scale; the domain is
            inverted (max..0) so low error maps to green on "redyellowgreen".
        faceted: If True, render one facet row per source; otherwise encode
            source on a shared y axis.

    Returns:
        The rect chart (a FacetChart when faceted) with a centred title.
    """
    hd_chart = (
        base.mark_rect()
        # Aggregate to one MAE/reference value per (hour, source) cell.
        .transform_joinaggregate(
            on_the_fly_mae="mean(mae)",
            on_the_fly_reference="mean(reference_value)",
            groupby=["event_start", "source"],
        )
        .transform_calculate(accuracy=alt.datum.on_the_fly_mae)
        .encode(
            x=alt.X(
                "event_start:O",
                timeUnit="hours",
                axis=alt.Axis(domain=False, ticks=False, labelAngle=0),
                # Fixed 0-23 domain so hours without data still get a column.
                scale=alt.Scale(domain=list(range(24))),
                title="Hour of day",  # "UTC hour of day"
            ),
            # Cells outside the time selection are greyed out with idle_color.
            color=alt.condition(
                selectors.time_selection_brush,
                alt.Color(
                    "accuracy:Q",
                    scale=alt.Scale(
                        domain=(max_absolute_error, 0), scheme="redyellowgreen"
                    ),
                    title="Error",
                ),
                alt.value(selectors.idle_color),
            ),
            tooltip=[
                alt.Tooltip("event_start:T", timeUnit="hours", title="Hour of day"),
                alt.Tooltip(
                    "accuracy:Q",
                    title="Mean absolute error (%s)" % sensor_unit,
                    format=".2f",
                ),
            ],
        )
    )
    if faceted:
        hd_chart = hd_chart.facet(
            row=alt.Row("source:O", title=None, header=alt.Header(labelAngle=0))
        )
    else:
        hd_chart = hd_chart.encode(
            y=alt.Y(
                "source:O",
                axis=alt.Axis(domain=False, ticks=False, labelAngle=0, labelPadding=5),
                title=None,
            )
        )
    return hd_chart.properties(
        title=alt.TitleParams("Model performance given a time of day", anchor="middle")
    )
예제 #10
0
 def __create_ranking_bar_chart(self, data, type):
     """Horizontal bar chart ranking the top 30 countries by the given case type."""
     ranking_title = alt.TitleParams(text="Top 30 Countries",
                                     subtitle="By " + type + " Cases")
     chart = alt.Chart(data, title=ranking_title, width=130).mark_bar().encode(
         x=alt.X("Cases", title=" ", axis=alt.Axis(labels=False)),
         y=alt.Y("Country_Region", sort="-x", title=" "),
         color=alt.Color("Cases", scale=alt.Scale(scheme="orangered")),
         tooltip=alt.Tooltip(["Cases:Q"], format=",.0f"),
     )
     chart = chart.configure_axis(grid=False).configure_title(
         anchor="start").configure_legend(orient="bottom")
     return chart.to_html()
예제 #11
0
 def get_group_chart(grid_df,
                     min_value: float,
                     max_value: float,
                     title: str = ''):
     """One bar chart per group value, concatenated under a centred top title."""
     group_title = alt.TitleParams(title,
                                   anchor='middle',
                                   align='center',
                                   orient='top')
     rcharts = [
         get_bar_chart(grid_df[grid_df[group_var] == gv], min_value, max_value)
         for gv in group_values
     ]
     return alt.hconcat(*rcharts, title=group_title)
예제 #12
0
def make_functional_heatmap(functional_df, mag_order=None):
    """Build one presence/absence heatmap per function category, concatenated horizontally.

    Args:
        functional_df: Frame with columns 'category', 'function_name', 'genome',
            'present', plus tooltip columns ('subcategory', 'function_ids',
            'long_function_name', 'gene_symbol').
        mag_order: Optional explicit row (genome) sort order shared by all panels.

    Returns:
        An Altair hconcat of the per-category heatmaps.
    """
    # build heatmaps
    charts = list()
    for i, (group,
            frame) in enumerate(functional_df.groupby('category', sort=False)):
        # set variables for chart
        function_order = get_ordered_uniques(list(frame.function_name))
        num_mags_in_frame = len(set(frame['genome']))
        chart_width = HEATMAP_CELL_WIDTH * len(function_order)
        chart_height = HEATMAP_CELL_HEIGHT * num_mags_in_frame
        # if this is the first chart then make y-ticks otherwise none
        if i == 0:
            # labelExpr strips a trailing "_<digits>" suffix from genome labels.
            y = alt.Y('genome',
                      title=None,
                      sort=mag_order,
                      axis=alt.Axis(
                          labelLimit=0,
                          labelExpr="replace(datum.label, /_\d*$/gi, '')"))
        else:
            y = alt.Y('genome',
                      axis=alt.Axis(title=None, labels=False, ticks=False),
                      sort=mag_order)
        # set up colors for chart
        # True (present) renders green '#2ca25f', False pale '#e5f5f9'.
        rect_colors = alt.Color('present',
                                legend=alt.Legend(title="Function is Present",
                                                  symbolType='square',
                                                  values=[True, False]),
                                sort=[True, False],
                                scale=alt.Scale(range=['#2ca25f', '#e5f5f9']))
        # define chart
        # TODO: Figure out how to angle title to take up less space
        c = alt.Chart(frame, title=alt.TitleParams(group)).encode(
            x=alt.X('function_name',
                    title=None,
                    axis=alt.Axis(labelLimit=0, labelAngle=90),
                    sort=function_order),
            tooltip=[
                alt.Tooltip('genome', title='Genome'),
                alt.Tooltip('category', title='Category'),
                alt.Tooltip('subcategory', title='Subcategory'),
                alt.Tooltip('function_ids', title='Function IDs'),
                alt.Tooltip('function_name', title='Function'),
                alt.Tooltip('long_function_name', title='Description'),
                alt.Tooltip('gene_symbol', title='Gene Symbol')
            ]).mark_rect().encode(y=y, color=rect_colors).properties(
                width=chart_width, height=chart_height)
        charts.append(c)
    # merge and return
    function_heatmap = alt.hconcat(*charts, spacing=5)
    return function_heatmap
예제 #13
0
def bar_chart(
    data: pd.DataFrame,
    xvar: str = "category",
    yvar: str = "value",
    yscale: str = "linear",
    w: int = 400,
    h: int = 400,
    bar_color: str = "#F3852A",
    title: str = "Bar Chart",
    partial_title: bool = True,
    xvar_complement: Optional[str] = None,
) -> alt.Chart:
    """Bar chart of `yvar` by `xvar` with a linear or log y scale.

    When `partial_title` is set, the scale name is prefixed to the title.
    A log scale uses Altair's default domain; a linear scale is pinned to
    [0, log10_ceiling(max)]. An optional complement column is added to the
    tooltip.
    """
    chart_title = f"{yscale.capitalize()} {title}" if partial_title else title
    base = alt.Chart(
        data,
        width=w,
        height=h,
        title=alt.TitleParams(chart_title, anchor="start"),
    )

    tooltip_list = [
        alt.Tooltip(f"{xvar}:N", title=xvar.capitalize()),
        alt.Tooltip(f"{yvar}:Q", format=",", title=yvar.capitalize()),
    ]
    if xvar_complement:
        tooltip_list.append(
            alt.Tooltip(f"{xvar_complement}:N",
                        title=xvar_complement.capitalize()))

    if yscale == "log":
        y_scale = alt.Scale(type=yscale)
    else:
        y_scale = alt.Scale(type=yscale,
                            domain=[0, log10_ceiling(data[yvar].max())])
    y_axis = alt.Axis(
        title=yvar.capitalize(),
        titleAngle=0,
        titleAlign="left",
        titleY=-5,
        titleX=0,
    )

    return base.mark_bar(color=bar_color).encode(
        x=alt.X(f"{xvar}:N", axis=alt.Axis(title=xvar.capitalize())),
        y=alt.Y(f"{yvar}:Q", scale=y_scale, axis=y_axis),
        tooltip=tooltip_list,
    )
예제 #14
0
def scoring_quadrant(
    data: pd.DataFrame,
    xvar: str,
    bin_width: float,
    width: int,
    height: int,
    title: Optional[str] = None,
    xtitle: Optional[str] = None,
    ytitle: Optional[str] = None,
) -> alt.Chart:
    """Histogram of a score column, binned at `bin_width`, split by confusion category.

    Args:
        data: Frame containing the score column `xvar` and the confusion
            category column.
        xvar: Name of the score column to bin.
        bin_width: Bin step for the histogram.
        width: Chart width in pixels.
        height: Chart height in pixels.
        title: Optional chart title, rendered in "guide-label" style.
        xtitle: Optional second line of the x-axis title; axis is hidden when None.
        ytitle: Optional second line of the y-axis title; axis is hidden when None.

    Returns:
        The histogram chart.
    """
    binning = alt.Bin(step=bin_width)
    base = alt.Chart(
        data,
        width=width,
        height=height,
        # Nudge the title up when no y-axis title reserves the space.
        title=alt.TitleParams(title,
                              style="guide-label",
                              dy=-5 if ytitle is None else 0),
    )

    hist = (base.mark_bar(tooltip=True).encode(
        x=alt.X(
            f"binned_{xvar}:Q",
            bin="binned",
            axis=alt.Axis(format="~", title=["Score", xtitle])
            if xtitle is not None else no_axis(),
        ),
        x2=f"binned_{xvar}_end:Q",
        y=alt.Y(
            "y_count:Q",
            axis=alt.Axis(
                title=["Count", ytitle]) if ytitle is not None else no_axis(),
        ),
    ).transform_bin(f"binned_{xvar}", xvar,
                    bin=binning).transform_joinaggregate(
                        # Fixed: was f"count()" — an f-string with no placeholders.
                        y_count="count()",
                        groupby=[
                            f"binned_{xvar}",
                            f"binned_{xvar}_end",
                            CONFUSION_CATEGORIES_COL_NAME,
                        ],
                    ))

    return hist
예제 #15
0
def plot_bar(query_string, year):
    """Function to create an altair bar plot of the top 10 countries

    Function to create an altair chart of the top 10 countries
    ordered based on obesity rate and disaggregated as per the
    user inputs received through the app dropdown filters.

    Args:
        query_string ([str]): string containing the attributes to be used in a pandas query
                               for filtering the data for the bar plot

        year ([float]): year

    Returns:
        [altair chart]: An altair bar plot of the top 10 countries
    """

    # Fixed: `n = 10` was assigned twice; keep a single assignment.
    n = 10

    title_label = "Top " + str(n) + " Countries"
    sub_label = str(year)

    temp = he.make_rate_data(["country"], ["obese"], query_string)
    ob_sorted = temp.sort_values("obese",
                                 ascending=False).head(n).reset_index()
    chart = (alt.Chart(ob_sorted,
                       title=alt.TitleParams(
                           text=title_label,
                           subtitle=sub_label)).mark_bar().encode(
                               x=alt.X(
                                   "obese",
                                   type="quantitative",
                                   title="Obesity Rate",
                                   scale=alt.Scale(domain=[0.1, 0.8]),
                                   axis=alt.Axis(format="%", grid=False),
                               ),
                               y=alt.Y("country", sort="-x", title=""),
                               color="obese",
                               tooltip=alt.Tooltip("obese:Q",
                                                   format=".1%",
                                                   title="Obesity Rate"),
                           ).properties(width=450, height=150).interactive())
    return chart
예제 #16
0
    def __create_world_timeseries_chart(self, case_type, ntype="New"):
        """create trend chart for global numbers

        Args:
            case_type (string): "confirmed", "recovered", "death"
            ntype (string): "Total" or "New"

        Returns:
            str: the chart rendered as an HTML fragment

        Raises:
            ValueError: if case_type is not one of the documented values.
        """

        if case_type == "confirmed":
            chart_title = "Global Confirmed Cases"
            case_type = 1
        elif case_type == "death":
            chart_title = "Global Deaths"
            case_type = 2
        elif case_type == "recovered":
            chart_title = "Global Recovered Cases"
            case_type = 3
        else:
            # Fail fast: previously an unknown case_type fell through and
            # raised UnboundLocalError on chart_title further down.
            raise ValueError(f"unknown case_type: {case_type!r}")
        if ntype == "Total":
            chart_title = chart_title + " Over Time"
        else:
            chart_title = "New " + chart_title + " Per Day"
        data = self.data_reader.get_timeserie_data_by_country("all", case_type)

        chart = (
            alt.Chart(
                data,
                title=alt.TitleParams(text=chart_title),
                height=200,
            )
            .mark_line()
            .transform_filter(alt.FieldEqualPredicate(field="type", equal=ntype))
            .encode(
                x=alt.X("date:T", title="", axis=alt.Axis(format=("%b %Y"))),
                y=alt.Y("count:Q", title=""),
            )
            .configure_axis(grid=False)
            .configure_title(anchor="start")
            .properties(width=735)
        )
        return chart.to_html()
예제 #17
0
    def __create_timeserie_chart(self, country, case_type=1, ntype="Total"):
        """Create a per-country time-series line chart and return it as HTML.

        Args:
            country: Country name forwarded to the data reader.
            case_type: 1 for cases, 2 for deaths.
            ntype: Series type to keep ("Total" or "New").

        Returns:
            str: the chart rendered as an HTML fragment.
        """
        # NOTE(review): chart_title is only assigned for case_type 1 or 2;
        # any other value raises UnboundLocalError below — confirm callers
        # never pass other values (a sibling chart also uses 3/"recovered").
        data = self.data_reader.get_timeserie_data_by_country(
            country, case_type)
        if case_type == 1:
            chart_title = "Cases over time"
        elif case_type == 2:
            chart_title = "Deaths over time"

        chart = (
            alt.Chart(
                data,
                title=alt.TitleParams(text=chart_title, subtitle=country),
            ).mark_line().transform_filter(
                alt.FieldEqualPredicate(field="type", equal=ntype)).encode(
                    x=alt.X("date:T", title=""),
                    y=alt.Y("count:Q", title="")).configure_axis(
                        grid=False).configure_title(anchor="start").properties(
                            width=350, height=200)
            # .properties(width="container", height="container")
        )
        return chart.to_html()
예제 #18
0
def alt_plot_metric_based_threshold_tuning_plots(df,
                                                 ptitle_offset=-5,
                                                 legend_offset=5,
                                                 figsize=(450, 300)):
    """Interactive line plot of scoring metrics versus decision threshold.

    Hovering near a line highlights that metric; a vertical rule marks the
    conventional 0.5 default threshold.
    """
    highlight = alt.selection(type="single",
                              on="mouseover",
                              fields=["metric"],
                              nearest=True)

    plot_title = alt.TitleParams("Scoring Metrics, as threshold is changed",
                                 offset=ptitle_offset)
    tooltips = [
        alt.Tooltip("metric", title="Metric"),
        alt.Tooltip("threshold", title="Threshold", format=".2f"),
        alt.Tooltip("value", title="Value", format=".2f"),
    ]
    base = alt.Chart(df, title=plot_title).encode(
        x=alt.X("threshold:Q", title="threshold"),
        y=alt.Y("value:Q", title=""),
        color=alt.Color("metric:N",
                        legend=alt.Legend(offset=legend_offset, title="")),
        tooltip=tooltips,
    )

    # Rule at the default 0.5 threshold.
    overlay = pd.DataFrame({"default": [0.5]})
    rules = alt.Chart(overlay).mark_rule().encode(
        x=alt.X("default:Q", title=""))

    # Invisible points carry the hover selection.
    points = base.mark_circle().encode(
        opacity=alt.value(0)).add_selection(highlight)
    lines = base.mark_line().encode(
        size=alt.condition(~highlight, alt.value(1.5), alt.value(3)))

    return (points + lines + rules).properties(width=figsize[0],
                                               height=figsize[1])
예제 #19
0
def make_viral_functional_heatmap(functional_df, vgf_order=None):
    """Build one presence/absence heatmap per viral function category, concatenated.

    Args:
        functional_df: Frame with columns 'Category', 'Function', 'Contig Name',
            'Present in Contig', plus tooltip columns ('AMG Genes',
            'Genes Present').
        vgf_order: Optional explicit row (contig) sort order shared by panels.

    Returns:
        An Altair hconcat of the per-category heatmaps.
    """
    # build heatmaps
    charts = list()
    # Was `for i, (...) in enumerate(...)` — the index was never used.
    for group, frame in functional_df.groupby('Category', sort=False):
        # set variables for chart
        function_order = get_ordered_uniques(list(frame['Function']))
        num_vgfs_in_frame = len(set(frame['Contig Name']))
        chart_width = HEATMAP_CELL_WIDTH * len(function_order)
        chart_height = HEATMAP_CELL_HEIGHT * num_vgfs_in_frame
        # set up colors for chart
        # NOTE(review): with sort=[True, False], this range maps True to the
        # pale color — the opposite of make_functional_heatmap; confirm
        # this inversion is intended.
        rect_colors = alt.Color('Present in Contig',
                                legend=alt.Legend(symbolType='square',
                                                  values=[True, False]),
                                sort=[True, False],
                                scale=alt.Scale(range=['#e5f5f9', '#2ca25f']))
        # define chart
        # TODO: Figure out how to angle title to take up less space
        c = alt.Chart(frame, title=alt.TitleParams(group)).encode(
            x=alt.X('Function',
                    title=None,
                    axis=alt.Axis(labelLimit=0, labelAngle=90),
                    sort=function_order),
            y=alt.Y('Contig Name',
                    axis=alt.Axis(title=None, labels=False, ticks=False),
                    sort=vgf_order),
            tooltip=[
                alt.Tooltip('Contig Name'),
                alt.Tooltip('Category'),
                alt.Tooltip('Function'),
                alt.Tooltip('AMG Genes'),
                alt.Tooltip('Genes Present')
            ]).mark_rect().encode(color=rect_colors).properties(
                width=chart_width, height=chart_height)
        charts.append(c)
    # merge and return
    function_heatmap = alt.hconcat(*charts, spacing=5)
    return function_heatmap
예제 #20
0
def plot_map(query_string, year):
    """Function to create an altair choropleth world map plot showing the global obesity rates

    Args:
        query_string ([str]): string containing the attributes to be used in a pandas query
                               for filtering the data for the bar plot

        year ([float]): year

    Returns:
        [altair chart]: An altair choropleth world map plot showing the global obesity rates
    """

    title_label = "Obesity Rates"
    sub_label = str(year)

    # Right-merge onto the country/id lookup so every mappable country is kept.
    df = (he.make_rate_data(["country"], ["obese"], query_string).merge(
        cy_ids, "right", on="country").sort_values("obese", ascending=False))
    world = ((alt.Chart(
        geojson, title=alt.TitleParams(
            text=title_label,
            subtitle=sub_label)).mark_geoshape().transform_lookup(
                # Join the rate data to the geoshapes on the numeric id.
                lookup="id",
                from_=alt.LookupData(df, "id", ["country", "obese"]),
            ).encode(
                color=alt.Color(
                    "obese:Q",
                    scale=alt.Scale(scheme="viridis"),
                    title="Obesity",
                    legend=alt.Legend(format=".0%"),
                ),
                stroke=alt.value("black"),
                tooltip=[
                    alt.Tooltip("country:N", title="Country"),
                    alt.Tooltip("obese:Q", format=".1%", title="Obesity Rate"),
                ],
            )).project("naturalEarth1").properties(width=450, height=300))
    return world
예제 #21
0
        title='NI COVID-19 Positive Tests by Age Band from %s to %s' %
        (toplot['Date'].min().strftime('%-d %B %Y'),
         toplot['Date'].max().strftime('%-d %B %Y')))

# Layer the heatmap with right-aligned text labels showing the most recent
# 7-day totals, and attach a data-source footer as a bottom-anchored title.
plt = altair.vconcat(
    altair.layer(
        heatmap,
        heatmap.mark_text(align='right', baseline='middle', dx=43).encode(
            text=altair.Text('Most Recent Positive Tests'),
            color=altair.value('black')))).properties(title=altair.TitleParams(
                [
                    'Data from DoH daily downloads',
                    'Numbers to right of chart show most recent 7 day total',
                    'https://twitter.com/ni_covid19_data on %s' %
                    datetime.datetime.now().strftime('%A %-d %B %Y')
                ],
                baseline='bottom',
                orient='bottom',
                anchor='end',
                fontWeight='normal',
                fontSize=10,
                dy=10), )
# Display the chart (notebook-style cell output).
plt

# %%
# Reduce the datastore to the unique 5-year age bands with their start/end
# values; dropna=False keeps bands whose bounds are missing.
bands = datastore.groupby(['Age_Band_5yr', 'Band Start', 'Band End'],
                          dropna=False).size().reset_index()[[
                              'Age_Band_5yr', 'Band Start', 'Band End'
                          ]]
bands = bands[bands['Age_Band_5yr'] != 'Not Known']
# Open-ended bands (NaN bounds) are treated as ending at age 90.
bands.fillna(90, inplace=True)
예제 #22
0
def lambda_handler(event, context):
    """Tweet a summary of the latest NISRA weekly COVID-19 deaths report.

    For each changed file in ``event``, downloads the Excel report from S3,
    extracts the most recent week's registered deaths, builds a tweet text
    (including a comparison with the previous week and, when a headless
    Chrome driver is available, a 12-week trend plot) and either tweets it,
    sends it as a test DM, or skips it. The S3 index file is updated with
    the tweet ID for successfully tweeted reports.

    Args:
        event: list of change records, each containing at least ``keyname``,
            ``url`` and ``filedate``, plus optional ``notweet``/``testtweet``
            flags.
        context: AWS Lambda context object (unused).

    Returns:
        dict: Lambda-style response with a list of status messages.
    """
    # Get the secret
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])

    tweets = []
    # Download the most recently updated Excel file
    s3 = boto3.client('s3')
    for change in event:
        obj = s3.get_object(Bucket=secret['bucketname'],Key=change['keyname'])['Body']
        stream = io.BytesIO(obj.read())

        # Load test data and add extra fields.
        # Use keyword axis arguments: the positional `axis` form of
        # DataFrame.dropna was deprecated and removed in pandas 2.0.
        df = pandas.read_excel(stream,engine='openpyxl',sheet_name='Table 7', header=3)
        df.dropna(axis='columns', how='all', inplace=True)
        df.rename(columns=colclean,inplace=True)
        df.dropna(axis='index', subset=['Total'], inplace=True)

        # Get the latest dates with values for tests and rolling
        df['date'] = pandas.to_datetime(df['Week Ending'], format='%d/%m/%Y')
        df.sort_values('date', inplace=True)
        latest = df.iloc[-1]

        # Check against previous day's reports
        status = S3_scraper_index(s3, secret['bucketname'], secret['nisra-deaths-index'])
        index = status.get_dict()
        plots = []
        if latest['Total'] == 0:
            tweet = '''No deaths registered in Northern Ireland, week ended {date}

'''.format(
                date=latest['date'].strftime('%A %-d %B %Y'),
            )
        else:
            if latest['Total'] == 1:
                tweet = '''One death registered in Northern Ireland, week ended {date}, in:
'''.format(
                    date=latest['date'].strftime('%A %-d %B %Y')
                )
            else:
                tweet = '''{deaths:,} deaths registered in Northern Ireland, week ended {date}, in:
'''.format(
                    date=latest['date'].strftime('%A %-d %B %Y'),
                    deaths=int(latest['Total'])
                )
            # Only list locations that actually recorded deaths this week
            for name in ['Hospital', 'Care Home', 'Hospice', 'Home', 'Other']:
                if latest[name] > 0:
                    tweet += '\u2022 %s: %s\n' %(name, int(latest[name]))
            tweet += '\n'
        if len(df) > 1:
            prev = df.iloc[-2]
            diff = latest['Total'] - prev['Total']
            tweet += '''{symb} {diff} {comp} than previous week

'''.format(
                symb=good_symb if diff < 0 else bad_symb,
                diff=abs(int(diff)),
                comp='fewer' if diff < 0 else 'more'
            )
            try:
                driver = get_chrome_driver()
                plots = []
                if driver is None:
                    logging.error('Failed to start chrome')
                else:
                    # Last 12 weeks, stacked by place of death
                    toplot = df[(df['Week Ending'] > df['Week Ending'].max()-pandas.to_timedelta(84, unit='d'))]
                    toplot = toplot.drop(columns=['Week of Death','date','Total']).melt(id_vars='Week Ending', var_name='Location', value_name='Deaths')
                    print(toplot)
                    p = altair.vconcat(
                        altair.Chart(
                            toplot
                        ).mark_area().encode(
                            x = altair.X('Week Ending:T', axis=altair.Axis(title='Week of death')),
                            y = altair.Y('sum(Deaths):Q', axis=altair.Axis(title='Deaths', orient="right", tickMinStep=1)),
                            color=altair.Color('Location', sort=altair.SortField('order',order='descending')),
                        ).properties(
                            height=450,
                            width=800,
                            title='NI COVID-19 Deaths reported by NISRA from %s to %s' %(toplot['Week Ending'].min().strftime('%-d %B %Y'), toplot['Week Ending'].max().strftime('%-d %B %Y'))
                        ),
                    ).properties(
                        title=altair.TitleParams(
                            ['Data from NISRA',
                            'https://twitter.com/ni_covid19_data on %s'  %datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                            baseline='bottom',
                            orient='bottom',
                            anchor='end',
                            fontWeight='normal',
                            fontSize=10,
                            dy=10
                        ),
                    )
                    plotname = 'nisra-deaths-time-%s.png'%datetime.datetime.now().date().strftime('%Y-%d-%m')
                    plotstore = io.BytesIO()
                    p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
                    plotstore.seek(0)
                    plots.append({'name': plotname, 'store': plotstore})
            except Exception:
                # Plotting is best-effort: tweet the text even if the chart fails
                logging.exception('Error creating plot')

        tweets.append({
            'text': tweet,
            'url': change['url'],
            'notweet': change.get('notweet'),
            'filedate': change['filedate'],
            'plots': plots
        })

    # Suppress tweets whose text duplicates an earlier one in this run
    donottweet = []
    if len(tweets) > 1:
        for i in range(1,len(tweets)):
            for j in range(0, i):
                if (tweets[i]['text'] == tweets[j]['text']):
                    donottweet.append(i)

    messages = []
    for idx in range(len(tweets)):
        tweet = tweets[idx]['text'] + tweets[idx]['url']
        if (idx not in donottweet):
            if tweets[idx].get('notweet') is not True:
                api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'], secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
                upload_ids = api.upload_multiple(tweets[idx]['plots'])
                # NOTE(review): `change` here is left over from the download
                # loop above, so this checks the *last* file's testtweet flag
                # for every tweet — confirm whether tweets[idx] was intended.
                if change.get('testtweet') is True:
                    if len(upload_ids) > 0:
                        resp = api.dm(secret['twitter_dmaccount'], tweet, upload_ids[0])
                    else:
                        resp = api.dm(secret['twitter_dmaccount'], tweet)
                    messages.append('Tweeted DM ID %s' %(resp.id))
                else:
                    if len(upload_ids) > 0:
                        resp = api.tweet(tweet, media_ids=upload_ids)
                    else:
                        resp = api.tweet(tweet)
                    messages.append('Tweeted ID %s, ' %resp.id)

                    # Update the file index
                    for i in range(len(index)):
                        if index[i]['filedate'] == tweets[idx]['filedate']:
                            index[i]['tweet'] = resp.id
                            break
                    status.put_dict(index)

                    messages[-1] += ('updated %s' %secret['nisra-deaths-index'])
            else:
                messages.append('Did not tweet')
                print(tweet)
        else:
            messages.append('Duplicate found %s, did not tweet, ' %tweets[idx]['filedate'])

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message:": messages,
        }),
    }
예제 #23
0
파일: utils.py 프로젝트: arbasher/straSplit
def data_properties(y,
                    selected_examples,
                    num_tails: int = 2,
                    dataset_name="test",
                    model_name: str = "model",
                    rspath: str = ".",
                    display_dataframe: bool = False,
                    display_figure: bool = False):
    """Compute and report multi-label properties of a dataset and its splits.

    Calculates label statistics (cardinality, density, distinct labels and
    label sets, imbalance ratios, tail-label counts, KL divergence) for the
    complete label matrix ``y`` and for each partition named by
    ``selected_examples`` (expected order: training set, test set). Results
    are printed (unless ``display_dataframe`` is set), saved as a TSV table
    and an Altair bar chart of per-label example counts under ``rspath``.

    Args:
        y: sparse label matrix (examples x labels).
        selected_examples: list of index arrays, one per data partition.
        num_tails: threshold (inclusive) defining a tail label.
        dataset_name: dataset name used in report text and file names.
        model_name: model name used in output file names.
        rspath: directory in which the .html chart and .tsv table are saved.
        display_dataframe: if True, return the summary DataFrame instead of
            printing the report.
        display_figure: if True, return the Altair chart.

    Returns:
        Depending on the display flags: (df, chart), df, chart, or None.
    """
    save_name = model_name.lower() + "_" + dataset_name.lower()
    args_list = []
    hold_list = [[
        'Number of examples', 'Number of labels', 'Label cardinality',
        'Label density', 'Distinct labels', 'Distinct label sets',
        'Frequency of distinct label sets',
        'Mean imbalance ratio intra-class for all labels',
        'Mean imbalance ratio inter-class for all labels',
        'Mean imbalance ratio labelsets for all labels',
        'Labels having less than or equal to {0} examples'.format(num_tails),
        'Labels having more than {0} examples'.format(num_tails + 1),
        'KL difference between complete and data partition'
    ]]

    # 1. Compute properties of complete data
    L_S = total_labels(y)
    LCard_S = cardinality(y)
    LDen_S = density(y)
    DL_S = distinct_labels(y)
    DLS_S = distinct_labelsets(y)
    PDL_S = propportion_distinct_labelsets(y)
    IR_intra = mean_ir_intra_class(y)
    IR_inter = mean_ir_inter_class(y)
    IR_labelset = mean_ir_labelset(y)

    # 1.1. Compute tail labels properties for the complete data
    tail = np.sum(y.toarray(), axis=0)
    tail = tail[np.nonzero(tail)[0]]
    tail[tail <= num_tails] = 1
    tail[tail > num_tails] = 0
    tail_sum = int(tail.sum())
    tail[tail == 0] = -1
    tail[tail == 1] = 0
    tail_count = int(np.count_nonzero(tail))

    args_list.append('## PROPERTIES for {0}...'.format(dataset_name))
    args_list.append('\t>> Number of examples: {0}'.format(y.shape[0]))
    args_list.append('\t>> Number of labels: {0}'.format(L_S))
    args_list.append('\t>> Label cardinality: {0:.6f}'.format(LCard_S))
    args_list.append('\t>> Label density: {0:.6f}'.format(LDen_S))
    args_list.append('\t>> Distinct labels: {0}'.format(DL_S))
    args_list.append('\t>> Distinct label sets: {0}'.format(DLS_S))
    args_list.append(
        '\t>> Frequency of distinct label sets: {0:.6f}'.format(PDL_S))
    args_list.append(
        '\t>> Mean imbalance ratio intra-class for all labels: {0:.6f}'.format(
            IR_intra))
    args_list.append(
        '\t>> Mean imbalance ratio inter-class for all labels: {0:.6f}'.format(
            IR_inter))
    args_list.append(
        '\t>> Mean imbalance ratio labelsets for all labels: {0:.6f}'.format(
            IR_labelset))
    args_list.append(
        '\t>> Labels having less than or equal to {0} examples: {1}'.format(
            num_tails, tail_sum))
    args_list.append('\t>> Labels having more than {0} examples: {1}'.format(
        num_tails + 1, tail_count))

    hold_list.append([
        y.shape[0], L_S, LCard_S, LDen_S, DL_S, DLS_S, PDL_S, IR_intra,
        IR_inter, IR_labelset, tail_sum, tail_count, 0
    ])

    # 2. Per-label distribution of the complete data (reference for KL and plots)
    distr_y = np.sum(y.toarray(), axis=0)
    ntail_idx = np.nonzero(distr_y)[0]
    tail = distr_y[ntail_idx]
    tail_idx = np.argsort(tail)
    tail = tail[tail_idx]
    distr_y = distr_y / np.sum(y.toarray())

    # 3. Iteratively calculate properties of training and test data, respectively
    split_set_name = ["training set", "test set"]
    tail_selected_list = []
    for idx in range(len(selected_examples)):
        y_tmp = y[selected_examples[idx]]
        distr_y_selected = np.sum(y_tmp.toarray(), axis=0)
        tail_selected = distr_y_selected[ntail_idx]
        tail_selected = tail_selected[tail_idx]
        distr_y_selected = distr_y_selected / np.sum(y.toarray())
        tail_selected_list.append(tail_selected)

        L_S_selected = total_labels(y_tmp)
        LCard_S_selected = cardinality(y_tmp)
        LDen_S_selected = density(y_tmp)
        DL_S_selected = distinct_labels(y_tmp)
        DLS_S_selected = distinct_labelsets(y_tmp)
        # Fixed: previously computed on the complete data (y) instead of the
        # current partition, so both splits silently reported the global value
        PDL_S_selected = propportion_distinct_labelsets(y_tmp)
        IR_intra_selected = mean_ir_intra_class(y_tmp)
        IR_inter_selected = mean_ir_inter_class(y_tmp)
        IR_labelset_selected = mean_ir_labelset(y_tmp)
        kl = entropy(pk=distr_y_selected, qk=distr_y)

        # 3.1. Compute tail labels properties for the partition
        temp = np.sum(y_tmp.toarray(), axis=0)
        temp = temp[np.nonzero(temp)[0]]
        temp[temp <= num_tails] = 1
        temp[temp > num_tails] = 0
        temp_sum = int(temp.sum())
        temp[temp == 0] = -1
        temp[temp == 1] = 0
        temp_count = int(np.count_nonzero(temp))

        args_list.append('## PROPERTIES for {0} ({1})...'.format(
            dataset_name, split_set_name[idx]))
        args_list.append('\t>> Number of examples: {0}'.format(y_tmp.shape[0]))
        args_list.append('\t>> Number of labels: {0}'.format(L_S_selected))
        args_list.append(
            '\t>> Label cardinality: {0:.6f}'.format(LCard_S_selected))
        args_list.append('\t>> Label density: {0:.6f}'.format(LDen_S_selected))
        args_list.append('\t>> Distinct labels: {0}'.format(DL_S_selected))
        args_list.append(
            '\t>> Distinct label sets: {0}'.format(DLS_S_selected))
        args_list.append(
            '\t>> Frequency of distinct label sets: {0:.6f}'.format(
                PDL_S_selected))
        args_list.append(
            '\t>> Mean imbalance ratio intra-class for all labels: {0:.6f}'.
            format(IR_intra_selected))
        args_list.append(
            '\t>> Mean imbalance ratio inter-class for all labels: {0:.6f}'.
            format(IR_inter_selected))
        args_list.append(
            '\t>> Mean imbalance ratio labelsets for all labels: {0:.6f}'.
            format(IR_labelset_selected))
        args_list.append(
            '\t>> Labels having less than or equal to {0} examples: {1}'.
            format(num_tails, temp_sum))
        args_list.append(
            '\t>> Labels having more than {0} examples: {1}'.format(
                num_tails + 1, temp_count))
        args_list.append('\t>> KL difference between complete '
                         'and data partition: {0:.6f}'.format(kl))
        hold_list.append([
            y_tmp.shape[0], L_S_selected, LCard_S_selected, LDen_S_selected,
            DL_S_selected, DLS_S_selected, PDL_S_selected, IR_intra_selected,
            IR_inter_selected, IR_labelset_selected, temp_sum, temp_count, kl
        ])

    if not display_dataframe:
        for args in args_list:
            print(args)

    # Plotting utilities
    df_comp = pd.DataFrame({
        "Label": np.arange(1, 1 + tail.shape[0]),
        "Complete": tail,
        "Train": tail_selected_list[0],
        "Test": tail_selected_list[1]
    })
    df_comp = df_comp.melt(['Label'], var_name='Dataset', value_name='Sum')

    temp_text = "Number of examples for each label given {0} data.".format(
        dataset_name)
    plot_title = alt.TitleParams(
        temp_text,
        subtitle=[
            "The horizontal axis indicates the "
            "indices of labels while the vertical "
            "axis represents the number of associated "
            "examples"
        ])

    # Bar plot
    alt.themes.enable('none')
    chart = alt.Chart(df_comp, title=plot_title).properties(
        width=600, height=350).mark_bar(color="grey").encode(
            x=alt.X('Label:O', title="Label ID", sort='ascending'),
            y=alt.Y('Sum:Q', title="Number of Examples", stack=None),
            color=alt.Color('Dataset:N',
                            scale=alt.Scale(range=['red', 'black', 'blue'])),
        ).configure_header(titleFontSize=20,
                           labelFontSize=15).configure_axis(
                               labelLimit=500,
                               titleFontSize=20,
                               labelFontSize=12,
                               labelPadding=5,
                           ).configure_axisY(grid=False).configure_legend(
                               strokeColor='gray',
                               fillColor='white',
                               padding=10,
                               cornerRadius=10).resolve_scale(x='independent')

    # save
    chart.save(os.path.join(rspath, save_name + '.html'))

    df = pd.DataFrame(hold_list).T
    df.columns = [
        'Properties for {0}'.format(dataset_name), 'Complete set',
        'Training set', 'Test set'
    ]
    df.to_csv(path_or_buf=os.path.join(rspath, save_name + ".tsv"),
              sep='\t',
              index=False)
    if display_dataframe and display_figure:
        return df, chart
    elif display_dataframe and not display_figure:
        return df
    elif not display_dataframe and display_figure:
        return chart
예제 #24
0
def lambda_handler(event, context):
    """Tweet a summary of newly reported COG-UK variant analyses for NI.

    Downloads (or reuses) the latest COG-UK variants CSV, maps Pango
    lineages to WHO labels, aggregates weekly counts, writes a lineage
    summary back to S3, and — if a previous summary exists — tweets the
    change since that file along with a 12-week trend chart.

    Args:
        event: change record with ``filedate``, ``modified`` and either
            ``url``/``length`` or ``keyname``, plus optional
            ``notweet``/``testtweet`` flags.
        context: AWS Lambda context object (unused).

    Returns:
        dict: Lambda-style response with a list of status messages.
    """
    messages = []

    try:
        # Get the secret
        sm = boto3.client('secretsmanager')
        secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
        secret = json.loads(secretobj['SecretString'])

        # Get the index
        s3 = boto3.client('s3')
        status = S3_scraper_index(s3, secret['bucketname'],
                                  secret['cog-variants-index'])
        index = status.get_dict()

        # Create a copy of the file in s3
        if 'keyname' not in event:
            keyname = "COG-variants/%s/%s-%s.csv" % (
                event['filedate'], event['modified'].replace(
                    ':', '_'), event['length'])
            print('getting URL')
            with requests.get(event['url'], stream=True) as stream:
                stream.raise_for_status()
                stream.raw.decode_content = True
                s3.upload_fileobj(
                    stream.raw,
                    secret['bucketname'],
                    keyname,
                    Config=boto3.s3.transfer.TransferConfig(use_threads=False))
            print('done')
        else:
            keyname = event['keyname']

        # Download the most recently updated CSV file
        obj = s3.get_object(Bucket=secret['bucketname'], Key=keyname)['Body']
        stream = io.BytesIO(obj.read())

        # Dataframe for converting between pango lineage and WHO labels
        # Get the mapping from the raw Github URL
        resp = requests.get(
            'https://github.com/pbarber/covid19-pango-lineage-to-who-label/raw/main/mapping.json'
        )
        # Make sure that the request was successful
        resp.raise_for_status()
        # Convert the request data to a Python dictionary
        mapping = resp.json()
        # Expand the Pango column
        mapping = pandas.DataFrame(mapping).explode(
            'Pango lineages').reset_index(drop=True)
        # Filter out old designations
        mapping_current = mapping[
            mapping['Designation'] != 'Former Variant of Interest']

        # Load variant data, aggregate and push back to S3
        df = pandas.read_csv(stream)
        df = df[df['adm1'] == 'UK-NIR']
        df['Sample Date'] = pandas.to_datetime(df['sample_date'])
        # Normalise each sample date to the Monday of its week
        df['Week of sample'] = df['Sample Date'] - pandas.to_timedelta(
            df['Sample Date'].dt.dayofweek, unit='d')
        # Join the lineage data
        matches = mapping['Pango lineages'].apply(match, col=df['lineage'])
        match_idx = matches.idxmax()
        # Filter out indexes where there is no match
        match_idx[match_idx == matches.idxmin()] = pandas.NA
        df['idx'] = match_idx
        # Join to the mapping based on indexes
        df = df.merge(mapping, how='left', left_on='idx',
                      right_index=True).drop(columns=['idx', 'Pango lineages'])
        df['WHO label'] = df['WHO label'].fillna('Other')
        lin_by_week = df.groupby(['Week of sample',
                                  'WHO label']).size().rename('count')
        lin_pc_by_week = lin_by_week / lin_by_week.groupby(level=0).sum()
        lin_by_week = pandas.DataFrame(lin_by_week).reset_index()
        lin_pc_by_week = pandas.DataFrame(lin_pc_by_week).reset_index()
        stream = io.BytesIO()
        lin_by_week.to_csv(stream, index=False)
        stream.seek(0)
        lineage_key = '%s_lineage.csv' % keyname.rsplit('.', maxsplit=1)[0]
        s3.upload_fileobj(stream, secret['bucketname'], lineage_key)
        messages.append('Wrote lineage summary to s3')

        # Update the S3 index and find the previous date
        # (ISO date strings compare correctly as plain strings)
        previous = '1970-01-01'
        prev_lineagekey = None
        thisindex = None
        for i in range(len(index)):
            if index[i]['modified'] == event['modified']:
                index[i]['lineage'] = lineage_key
                index[i]['keyname'] = keyname
                thisindex = i
            elif index[i]['filedate'] != event['filedate']:
                if (index[i]['filedate'] > previous) and (index[i]['filedate']
                                                          < event['filedate']):
                    previous = index[i]['filedate']
                    prev_lineagekey = index[i].get('lineage')
        status.put_dict(index)

        # If there is a previous file, then load it and work out the differences
        if prev_lineagekey is not None:
            obj = s3.get_object(Bucket=secret['bucketname'],
                                Key=prev_lineagekey)['Body']
            stream = io.BytesIO(obj.read())
            prev_lineage = pandas.read_csv(stream)
            if 'WHO label' not in prev_lineage.columns:
                prev_lineage['WHO label'] = 'Other'
            prev_lineage = prev_lineage.groupby('WHO label')['count'].sum()
            lineage = lin_by_week.groupby(
                'WHO label')['count'].sum().reset_index()
            lineage = lineage.merge(prev_lineage, how='left', on='WHO label')
            lineage = lineage.groupby('WHO label').sum()[[
                'count_x', 'count_y'
            ]]
            lineage['count_y'] = lineage['count_y'].fillna(0)
            lineage['diff'] = (lineage['count_x'] -
                               lineage['count_y']).fillna(0).astype(int)
            top5 = lineage.nlargest(5, 'diff')
            tweet = """{total:,d} new variant analyses reported for NI on {currdate} since {prevdate} ({altogether:,d} total):
""".format(total=lineage['diff'].sum(),
            prevdate=datetime.datetime.strptime(
               previous, '%Y-%m-%d').date().strftime('%A %-d %B %Y'),
            currdate=datetime.datetime.strptime(
               event['filedate'], '%Y-%m-%d').date().strftime('%A %-d %B %Y'),
            altogether=lineage['count_x'].sum())
            for variant, data in top5.to_dict('index').items():
                if data['diff'] > 0:
                    tweet += f"\u2022 {variant}: {data['diff']:,d} (of {data['count_x']:,d})\n"
            others = int(lineage['diff'].sum() - top5['diff'].sum())
            if others != 0:
                tweet += f"\u2022 Others: {others:,d}\n"
            tweet += '\nSource: https://beta.microreact.org/'

            driver = get_chrome_driver()
            if driver is None:
                raise Exception('Failed to start chrome')

            p = altair.vconcat(
                altair.Chart(lin_by_week[
                    lin_by_week['Week of sample'] >
                    lin_by_week['Week of sample'].max() -
                    pandas.to_timedelta(84, unit='d')]).mark_line().encode(
                        x=altair.X('Week of sample:T',
                                   axis=altair.Axis(title='',
                                                    labels=False,
                                                    ticks=False)),
                        y=altair.Y('count:Q',
                                   axis=altair.Axis(title='Samples')),
                        color='WHO label').
                properties(
                    height=225,
                    width=800,
                    title=
                    'NI COVID-19 variants identified by COG-UK over the most recent 12 weeks'
                ),
                altair.Chart(lin_pc_by_week[
                    lin_pc_by_week['Week of sample'] >
                    lin_pc_by_week['Week of sample'].max() -
                    pandas.to_timedelta(84, unit='d')]).mark_area().encode(
                        x='Week of sample:T',
                        y=altair.Y('sum(count):Q',
                                   axis=altair.Axis(format='%',
                                                    title='% of samples',
                                                    orient="right")),
                        color='WHO label').properties(
                            height=225,
                            width=800,
                        )
            ).properties(title=altair.TitleParams([
                'Variant identification can take up to 3 weeks, so recent totals are likely to be revised upwards',
                'https://twitter.com/ni_covid19_data on %s' %
                datetime.datetime.now().date().strftime('%A %-d %B %Y')
            ],
                                                  baseline='bottom',
                                                  orient='bottom',
                                                  anchor='end',
                                                  fontWeight='normal',
                                                  fontSize=10,
                                                  dy=10), )
            plotname = 'ni-variants-%s.png' % datetime.datetime.now().date(
            ).strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore,
                   format='png',
                   method='selenium',
                   webdriver=driver)
            plotstore.seek(0)

            if event.get('notweet') is not True:
                api = TwitterAPI(secret['twitter_apikey'],
                                 secret['twitter_apisecretkey'],
                                 secret['twitter_accesstoken'],
                                 secret['twitter_accesstokensecret'])
                resp = api.upload(plotstore, plotname)
                if event.get('testtweet') is True:
                    resp = api.dm(secret['twitter_dmaccount'], tweet,
                                  resp.media_id)
                    messages.append('Tweeted DM ID %s, ' % resp.id)
                else:
                    resp = api.tweet(tweet, media_ids=[resp.media_id])
                    messages.append('Tweeted ID %s, ' % resp.id)
                    # Update the file index
                    index[thisindex]['tweet'] = resp.id
                    status.put_dict(index)
            else:
                messages.append('Did not tweet')
                print(tweet)
        else:
            messages.append('Did not find previous lineage data')
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; everything else is logged and a 200 returned
        logging.exception('Caught exception in COG variants tweeter')

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message:": messages,
        }),
    }
예제 #25
0
def multi_emissions_chart(council: Council):
    """Build a ChartBundle showing a council's historic emissions by sector.

    Produces a stacked area chart of the five sector totals over time,
    with per-year percentage shares in the tooltip, plus a wide-format
    table used as the chart's long description.
    """
    # The source data mixes sector totals with sub-categories; we only
    # want the five top-level totals.
    sector_totals = [
        "Industry Total",
        "Commercial Total",
        "Public Sector Total",
        "Transport Total",
        "Domestic Total",
    ]

    df = DataPoint.objects.filter(
        council=council, data_type__name_in_source__in=sector_totals).to_dataframe(
            "year", ("data_type__name_in_source", "emissions_type"), "value")

    # Each sector's share of that year's total emissions (row percentages)
    shares = df.pivot_table("value",
                            index="year",
                            columns="emissions_type",
                            aggfunc="sum")
    shares = (shares
              .div(shares.sum(axis=1), axis=0)
              .reset_index()
              .melt(id_vars=["year"])
              .rename(columns={"value": "row_percentage"}))
    df = df.merge(shares)

    # Drop the redundant " Total" suffix from the legend labels
    df["emissions_type"] = df["emissions_type"].str.replace(" Total", "")

    # altair doesn't pick up years right unless in a fake date format
    df["Year"] = df["year"]
    df["year"] = df["year"].astype(int).apply(lambda x: f"{x}-01-01")

    # CEUK brand colours, keyed by sector in alphabetical order
    sector_colours = alt.Scale(
        domain=[
            "Commercial",
            "Domestic",
            "Industry",
            "Public Sector",
            "Transport",
        ],
        range=[
            "#00aeee",  # $color-ceuk-blue
            "#005cab",  # $color-ceuk-navy
            "#e11d21",  # $color-ceuk-red
            "#f29e1a",  # $color-ceuk-orange
            "#ffd80b",  # $color-ceuk-yellow
        ],
    )

    chart_title = alt.TitleParams(
        "Historic emissions by sector, 2005–2019",
        subtitle=[f"{council.name}, ktCO2e"],
    )

    chart = (
        alt.Chart(df).mark_area().encode(
            x=alt.X("year:T", title="", axis=alt.Axis(labelAlign="center")),
            y=alt.Y(
                "value",
                title="",
            ),
            color=alt.Color("emissions_type", scale=sector_colours),
            tooltip=[
                "Year",
                alt.Tooltip("emissions_type", title="Type"),
                alt.Tooltip("value",
                            title="Emissions in ktCO2e",
                            format=",.2f"),
                alt.Tooltip("row_percentage",
                            title="% of Emissions in year",
                            format=".1%"),
            ],
        ).properties(
            title=chart_title,
            width="container",
            height=300,
        ))

    # Wide-format table (one column per sector) for the long description
    wide_table = (df.rename(columns={
        "emissions_type": "Emissions Type"
    }).pivot_table(
        ["value"],
        index="Year",
        columns="Emissions Type",
        aggfunc="sum",
    ).style.format("{:.2f}".format))

    alt_title = f"Chart showing emissions by sector for {council.name}"
    data_source = "Data source: 2020 BEIS Emissions data"

    return ChartBundle(
        label="multi_emissions",
        df=wide_table,
        chart=chart,
        alt_title=alt_title,
        data_source=data_source,
    )
예제 #26
0
def lambda_handler(event, context):
    """AWS Lambda entry point: process newly uploaded NI PHA contact-tracing PDFs.

    For each change record in ``event``: downloads the PDF from S3, extracts the
    reporting date range and the outbreak/cluster tables, merges them into the
    historical datastore CSV on S3, renders summary charts with Altair (via a
    headless Chrome driver), and tweets (or DMs, for tests) the summary with
    the charts attached.

    Args:
        event: iterable of change dicts, each with at least ``keyname`` and
            ``url`` (optionally ``notweet``, ``testtweet``, ``filedate``).
        context: AWS Lambda context object (unused).

    Returns:
        dict: API-Gateway-style response with ``statusCode`` and a JSON ``body``
        listing per-change status messages.
    """
    # Get the secret bundle (bucket names, S3 keys, Twitter credentials)
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])

    s3 = boto3.client('s3')

    messages = []
    # Download the most recently updated PDF file
    for change in event:
        tmp = tempfile.NamedTemporaryFile(suffix='.pdf')
        with open(tmp.name, 'wb') as fp:
            s3.download_fileobj(secret['bucketname'],change['keyname'],fp)
        # Get the date range covered by the report
        text = textract.process(tmp.name, method='pdfminer').decode('utf-8')
        # Matches e.g. "1st January 2021 – 28th January 2021" (en-dash separator)
        regex = re.compile(r'(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})\s+\–+\s+(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})')
        start_date = None
        end_date = None
        for line in text.split('\n'):
            m = regex.search(line)
            if m:
                start_date = pandas.to_datetime('%s %s %s' %(m.group(1),m.group(2),m.group(3)), format='%d %B %Y').date()
                end_date = pandas.to_datetime('%s %s %s' %(m.group(4),m.group(5),m.group(6)), format='%d %B %Y').date()
                break
        if start_date is None:
            logging.error('Unable to find start date in report')
            return {
                "statusCode": 404,
                "body": 'Unable to find start date in report %s' %change['url'],
            }
        # Get the tables from the report - note that it was not possible to get data from 4th April or earlier due to
        # tables that will not parse properly in the PDF
        tables = tabula.read_pdf(tmp.name, pages = "all", multiple_tables = True)
        tablecount = 0
        dataset = pandas.DataFrame()
        for df in tables:
            # If tabula failed to detect the header row, promote the first data
            # row to column names, keeping the existing name wherever the
            # first-row cell is NaN.
            if 'Total' not in df.columns:
                firstrow = df.iloc[0]
                newcols = []
                for i in range(len(firstrow)):
                    if isinstance(firstrow[i], float) and math.isnan(firstrow[i]):
                        newcols.append(df.columns[i])
                    else:
                        newcols.append(firstrow[i])
                df.columns = newcols
                df = df[1:]
            df['Setting'] = df['Setting'].str.strip()
            df.dropna(axis='index',subset=['Total','Open','Closed'],inplace=True)
            df['Total'] = df['Total'].astype(int)
            df['Open'] = df['Open'].astype(int)
            df['Closed'] = df['Closed'].astype(int)
            # Drop the report's own totals row; we aggregate ourselves below
            df = df[df['Setting']!='Total']
            # Report layout: first table is probable outbreaks, second is clusters
            if tablecount==0:
                df['Type'] = 'Probable Outbreak'
            elif tablecount==1:
                df['Type'] = 'Cluster'
            else:
                logging.warning('Unexpected table: %s' %df)
            tablecount += 1
            dataset = pandas.concat([dataset, df])
        dataset['Start Date'] = pandas.to_datetime(start_date)
        dataset['End Date'] = pandas.to_datetime(end_date)
        # Week number relative to 1 January 2020; used as the dedup key below
        week = int((end_date - pandas.to_datetime('1 January 2020', format='%d %B %Y').date()).days / 7)
        dataset['Week'] = week
        # Create a simple summary and the tweet text
        summary = dataset.groupby('Type').sum()
        tweet = 'NI Contact Tracing reports from %s to %s:\n' %(start_date.strftime('%-d %B %Y'), end_date.strftime('%-d %B %Y'))
        for Type,data in summary.to_dict('index').items():
            tweet += '\u2022 %d %ss (%d open, %d closed)\n' %(data['Total'], Type.lower(), data['Open'], data['Closed'])
        tweet += '\n%s' %change['url']
        # Pull current data from s3; fall back to an empty frame on first run
        try:
            obj = s3.get_object(Bucket=secret['bucketname'],Key=secret['pha-clusters-datastore'])['Body']
        except s3.exceptions.NoSuchKey:
            print("The object %s does not exist in bucket %s." %(secret['pha-clusters-datastore'], secret['bucketname']))
            datastore = pandas.DataFrame(columns=['Week'])
        else:
            stream = io.BytesIO(obj.read())
            datastore = pandas.read_csv(stream)
        # Clean out any data with matching dates (makes reprocessing idempotent)
        datastore = datastore[datastore['Week'] != week]
        # Append the new data
        datastore = pandas.concat([datastore, dataset])
        datastore['Start Date'] = pandas.to_datetime(datastore['Start Date'])
        datastore['End Date'] = pandas.to_datetime(datastore['End Date'])
        # Replace any known duplicates (setting names vary between reports)
        datastore['Setting'] = datastore['Setting'].replace({
            'Cinema/ Theatre / Entertainment': 'Cinema / Theatre / Entertainment Venue',
            'Cinema/ Theatre / Entertainment Venue': 'Cinema / Theatre / Entertainment Venue',
            'Funeral / Wakes': 'Funeral / Wake',
            'Restaurant / Cafe': 'Restaurant / Café'
        })
        # Push the data to s3
        stream = io.BytesIO()
        datastore.to_csv(stream, index=False)
        stream.seek(0)
        s3.upload_fileobj(stream, secret['bucketname'], secret['pha-clusters-datastore'])
        # Set up chromedriver so we can save altair plots
        driver = get_chrome_driver()
        plots = []
        if driver is None:
            # Best-effort: tweet without images rather than failing the run
            logging.error('Failed to start chrome')
        else:
            # Plot 1: bar chart of this week's reports by setting
            p = altair.vconcat(
                altair.Chart(
                    dataset
                ).mark_bar().encode(
                    x = altair.X('Total:Q', axis=altair.Axis(title='Total reported')),
                    y = altair.Y('Setting:O'),
                    color='Type',
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 Contact Tracing reports from %s to %s' %(start_date.strftime('%-d %B %Y'), end_date.strftime('%-d %B %Y'))
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                    'Covers the preceding four weeks',
                    'https://twitter.com/ni_covid19_data on %s'  %datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            # NOTE(review): '%Y-%d-%m' puts day before month in the filename —
            # looks like a typo for '%Y-%m-%d'; confirm before changing, since
            # existing S3 objects use this format (same for the two plots below)
            plotname = 'pha-outbreaks-week-%s.png'%datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
            # Plot 2: stacked area of totals over time across all reports
            p = altair.vconcat(
                altair.Chart(
                    datastore.groupby(['End Date','Type'])['Total'].sum().reset_index()
                ).mark_area().encode(
                    x = altair.X('End Date:T', axis=altair.Axis(title='Date reported (for preceding four weeks)')),
                    y = altair.Y('Total:Q', axis=altair.Axis(title='Total reported', orient="right")),
                    color='Type',
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 Contact Tracing reports from %s to %s' %(datastore['Start Date'].min().strftime('%-d %B %Y'), datastore['End Date'].max().strftime('%-d %B %Y'))
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                    'Reported weekly for the preceding four weeks',
                    'https://twitter.com/ni_covid19_data on %s'  %datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-time-%s.png'%datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
            # Plot 3: small-multiples area charts, one facet per setting
            p = altair.vconcat(
                altair.Chart(
                    datastore.groupby(['End Date','Setting','Type'])['Total'].sum().reset_index()
                ).mark_area().encode(
                    x = altair.X('End Date:T', axis=altair.Axis(title='')),
                    y = altair.Y('Total:Q', axis=altair.Axis(title='', orient="right")),
                    color='Type',
                    facet=altair.Facet('Setting:O', columns=5, title=None, spacing=0),
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=90,
                    width=160,
                    title=altair.TitleParams(
                        'NI COVID-19 Contact Tracing reports by setting from %s to %s' %(datastore['Start Date'].min().strftime('%-d %B %Y'), datastore['End Date'].max().strftime('%-d %B %Y')),
                        anchor='middle',
                    ),
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                    'Reported weekly for the preceding four weeks',
                    'https://twitter.com/ni_covid19_data on %s'  %datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-small-%s.png'%datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})

        # Convert to dates to ensure correct output to CSV
        datastore['Start Date'] = datastore['Start Date'].dt.date
        datastore['End Date'] = datastore['End Date'].dt.date

        # Tweet out the text and images
        if change.get('notweet') is not True:
            api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'], secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
            upload_ids = api.upload_multiple(plots)
            if change.get('testtweet') is True:
                # Test mode: DM the plots to the maintainer instead of tweeting
                if len(upload_ids) > 0:
                    resp = api.dm(secret['twitter_dmaccount'], tweet, upload_ids[0])
                    if len(upload_ids) > 1:
                        resp = api.dm(secret['twitter_dmaccount'], 'Test 1', upload_ids[1])
                        if len(upload_ids) > 2:
                            resp = api.dm(secret['twitter_dmaccount'], 'Test 2', upload_ids[2])
                else:
                    resp = api.dm(secret['twitter_dmaccount'], tweet)
                messages.append('Tweeted DM ID %s' %(resp.id))
            else:
                if len(upload_ids) > 0:
                    resp = api.tweet(tweet, media_ids=upload_ids)
                else:
                    resp = api.tweet(tweet)
                # Download and update the index, recording the tweet ID against
                # the file that triggered this run
                status = S3_scraper_index(s3, secret['bucketname'], secret['pha-clusters-index'])
                index = status.get_dict()
                for i in range(len(index)):
                    if index[i]['filedate'] == change['filedate']:
                        index[i]['tweet'] = resp.id
                        break
                status.put_dict(index)
                messages.append('Tweeted ID %s and updated %s' %(resp.id, secret['pha-clusters-index']))
        else:
            print(tweet)
            messages.append('Did not tweet')

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
예제 #27
0
import altair as alt
import pandas as pd

penguins_df = pd.read_csv('data/penguins.csv')

# Horizontal bar chart: one bar per species, length = number of records.
base = (
    alt.Chart(penguins_df)
    .mark_bar()
    .encode(
        alt.Y('species', title=None),
        alt.X('count()', title='Number of penguins'),
    )
)
base

# Numeric count labels drawn just past the end of each bar.
text = (
    alt.Chart(penguins_df)
    .mark_text(align='center', dx=10)
    .encode(
        alt.Y('species'),
        alt.X('count()'),
        alt.Text('count()'),
    )
)

# Title and subtitle styling for the combined chart.
penguin_title = alt.TitleParams(
    "Adelie Penguins species most abundant in the Antarctic",
    subtitle="The Chinstrap species appears to have the lowest penguin population.",
    fontSize=18,
    subtitleColor='firebrick',
)

# Layer the labels over the bars, drop the default view outline, set sizing.
formatted_plot = (
    (base + text)
    .configure_view(strokeWidth=0)
    .properties(height=200, width=300, title=penguin_title)
)

formatted_plot
예제 #28
0
    def render_object(self):
        """Build and return the Altair chart object for this chart definition.

        Selects the mark from ``self.chart_type``, applies a y-axis title
        offset sized to the widest tick label, pads integer x-axes to mimic
        ggplot spacing, optionally layers text labels, applies sizing/title
        properties, and finally defers to ``custom_settings`` for any
        subclass-specific tweaks.
        """
        df = self.fix_df()
        obj = alt.Chart(df)
        # Pick the mark for the configured chart type.
        if self.chart_type == "line":
            obj = obj.mark_line(point={"size": 100})
        if self.chart_type == "bar":
            obj = obj.mark_bar()
        if self.chart_type == "step":
            obj = obj.mark_line(interpolate='step-after', point=True)
        options = self.safe_options()
        x_axis = options['x']
        y_axis = options['y']

        # hack to push the y-axis to the rough position of the left most label
        # on the y axis
        axis_name = ""
        # Resolve the column backing the y encoding: explicit field wins over
        # the shorthand if both are set.
        if not isinstance(y_axis.shorthand, UndefinedType):
            axis_name = y_axis.shorthand
        if not isinstance(y_axis.field, UndefinedType):
            axis_name = y_axis.field
        if isinstance(y_axis.axis, UndefinedType):
            y_axis.axis = alt.Axis()
        # if any kind of formatting of number, assume the default is fine
        if isinstance(y_axis.axis.format, UndefinedType):
            format_str = ""
        else:
            format_str = y_axis.axis.format
        if axis_name and not format_str:
            col = df[axis_name]
            # Best-effort: render values as thousands-separated ints to
            # estimate label width; non-numeric columns fall through unchanged.
            try:
                col = col.astype(int)
            except ValueError:
                pass
            try:
                col = col.map('{:,d}'.format)
            except ValueError:
                pass
            max_len = col.astype(str).str.len().max()
            if max_len > 5:
                # ~6.5px per character plus padding — rough label-width
                # estimate; presumably tuned by eye, confirm before changing.
                y_axis.axis.titleX = 0 - (int(max_len * 6.5) + 10)

        # add spacing to x axis to match ggplot approach
        values = None
        # Deliberate best-effort lookups: either source of x values may be
        # absent, in which case we simply skip the padding below.
        try:
            values = x_axis["axis"]["values"]
        except Exception:
            pass
        if isinstance(values, pd.Series) is False:
            values = None
            try:
                values = df[x_axis.shorthand]
            except Exception as e:
                pass

        # Only pad integer axes; half a unit each side mirrors ggplot.
        if values is not None and values.dtype in [np.int64, np.int32]:
            maxv = values.max() + 0.5
            minv = values.min() - 0.5
            options["x"].scale = alt.Scale(domain=[minv, maxv])
        obj = obj.encode(**options)
        if self.interactive:
            obj = obj.interactive()

        # process any label functions
        if self.text_options:
            # 'text' is the encoding; the remaining options style the mark.
            text_opts = dict(self.text_options)
            text_option = text_opts["text"]
            del text_opts["text"]
            text_obj = obj.mark_text(**text_opts)
            text_obj = text_obj.encode(text=text_option)
            obj = (obj + text_obj)

        properties = {}

        if self.default_width:
            properties["width"] = self.default_width

        if self.ratio:
            properties["height"] = "container"

        # Titles may be rendered in HTML instead of on the chart itself.
        if self.title and self.html_chart_titles is False:
            properties["title"] = self.title

        obj = obj.properties(**properties)

        # Footer is implemented as a bottom-anchored Altair title.
        if self.footer:
            obj = obj.properties(title=alt.TitleParams(self.footer,
                                                       baseline='bottom',
                                                       orient='bottom',
                                                       anchor='end',
                                                       fontWeight='normal',
                                                       fontSize=10
                                                       ))

        obj = self.custom_settings(obj)

        return obj
예제 #29
0
    alt.Color('count()', title='Quantity',
              scale=alt.Scale(scheme='blues'))).properties(width=150,
                                                           height=100)

# Overlaid histograms of culmen depth, coloured by species, faceted by sex.
clean_penguins = penguins_df.dropna(subset=['sex', 'species'])
culmen_facet_plot = (
    alt.Chart(clean_penguins)
    .mark_bar(opacity=0.5)
    .encode(
        alt.X('culmen_depth_mm',
              bin=alt.Bin(maxbins=40),
              title='Culmen depth (mm)'),
        alt.Y('count()', stack=None, title='Number of penguins'),
        alt.Color('species', title='Species'),
    )
    .properties(width=180, height=100)
    .facet('sex', title='')
    .resolve_scale(y='independent')
)

# Shared title/subtitle for the assembled dashboard.
titles = alt.TitleParams(
    "We've discovered many insights from the Penguins dataset",
    subtitle=[
        "We've learned that the Adelie and Chinstrap penguin",
        " species are have similar culmen depth and body mass, however quite different culmen length"
    ],
    fontSize=18,
    align='center',
    anchor='middle',
)

# Assemble: density plot on top, boxplot beside heatmap, histograms below.
combined_plot = (
    (mass_density_plot & (mass_boxplot | penguin_heatmap) & culmen_facet_plot)
    .properties(title=titles)
)

combined_plot
예제 #30
0
def plot_time(query_string, highlight_country, year_range):
    """Function to create a time series plot showing the country-wise global obesity rates

    Function to create a time series(spaghetti) plot showing the global obesity rates
    for all the countries for a range of years as selected by the user. All
    countries are drawn in light gray with the selected countries layered on
    top in colour.

    Args:
        query_string ([str]): string containing the attributes to be used in a pandas query
                               for filtering the data for the bar plot

        highlight_country ([str]): name of the country (or list of countries) to be
                                   highlighted in the time series plot

        year_range ([float]): range of years to be selected for the time series plot

    Returns:
        [altair chart]: An altair time series plot showing the country-wise global obesity rates
    """

    # Filter data down to per-country, per-year obesity rates
    ob_yr = he.make_rate_data(["country", "year"], ["obese"], query_string)

    # Create labels
    title_label = "World Obesity"
    sub_label = str(year_range[0]) + "-" + str(year_range[1])

    # Normalise a single country name to a list so it behaves like a selection
    # (isinstance rather than type comparison also accepts str subclasses)
    if isinstance(highlight_country, str):
        highlight_country = [highlight_country]

    # Get data for highlighted countries; copy() detaches from ob_yr so the
    # column assignment below cannot trigger a SettingWithCopyWarning or
    # silently fail on a view
    highlighted_data = ob_yr[ob_yr["country"].isin(highlight_country)].copy()
    highlighted_data.loc[:, "highlighted"] = [
        country if country in highlight_country else "other"
        for country in highlighted_data["country"]
    ]

    # Background chart: every country, greyed out unless highlighted
    country_time_chart = (
        alt.Chart(
            ob_yr, title=alt.TitleParams(
                text=title_label, subtitle=sub_label)).mark_line().encode(
                    x=alt.X(
                        "year:O",
                        scale=alt.Scale(zero=False),
                        title="Years",
                        axis=alt.Axis(grid=False),
                    ),
                    y=alt.Y(
                        "obese:Q",
                        title="Obesity Rate",
                        axis=alt.Axis(format="%"),
                    ),
                    color=alt.condition(
                        alt.Predicate(
                            alt.FieldOneOfPredicate(field="country",
                                                    oneOf=highlight_country)),
                        "country",
                        alt.value("lightgray"),
                        # legend=None,
                    ),
                    opacity=alt.condition(
                        alt.Predicate(
                            alt.FieldOneOfPredicate(field="country",
                                                    oneOf=highlight_country)),
                        alt.value(1),
                        alt.value(0.2),
                    ),
                    tooltip="country",
                ).properties(width=450, height=300).interactive())

    # Foreground chart: only the highlighted countries, with a legend
    highlighted_time_chart = (alt.Chart(highlighted_data).mark_line().encode(
        x=alt.X(
            "year:O",
            scale=alt.Scale(zero=False),
            title="Years",
            axis=alt.Axis(grid=False),
        ),
        y=alt.Y(
            "obese:Q",
            title="Obesity Rate",
            axis=alt.Axis(format="%"),
        ),
        color=alt.Color(
            "highlighted",
            legend=alt.Legend(title="Countries", values=highlight_country),
        ),
        tooltip="country",
    ))

    # Layer the highlighted lines on top of the background
    return country_time_chart + highlighted_time_chart