Example #1
0
 def _init_glyph(self, plot, mapping, properties):
     """
     Create a Slope annotation from ``mapping`` and attach it to ``plot``.

     The render level is taken from ``properties['level']`` and falls back
     to ``'glyph'`` when that key is missing. The result is the pair
     ``(None, slope)``.
     """
     render_level = properties.get('level', 'glyph')
     annotation = Slope(level=render_level, **mapping)
     plot.add_layout(annotation)
     return None, annotation
Example #2
0
    def plot_observed_vs_expected_correlations(self, value, counts):
        """Plot log10 observed counts against log10 expected counts.

        Each entry of the ``expected_count`` column of ``counts`` becomes
        one point; its observed count is looked up in
        ``value['observed_references']`` and defaults to 0. Both counts are
        shifted by one before the base-10 logarithm is taken. A linear
        regression fitted on the log values is overlaid as a line, and the
        correlation statistics stored in ``value`` are handed to
        ``add_plot_title`` as a text line.
        """
        def log10_plus_one(series):
            # The +1 shift keeps zero counts finite on the log scale.
            return [math.log(count + 1, 10) for count in series]

        expected_by_name = counts.to_dict()['expected_count']
        rows = [
            (ref, value['observed_references'].get(ref, 0), expected)
            for ref, expected in expected_by_name.items()
        ]
        data = pd.DataFrame(rows, columns=['Name', 'Observed', 'Expected'])
        data['log_obs'] = log10_plus_one(data['Observed'])
        data['log_exp'] = log10_plus_one(data['Expected'])

        # The regressor wants a 2-D (n_samples, 1) predictor matrix.
        predictor = np.array(data['log_exp'].values).reshape(-1, 1)
        model = LinearRegression().fit(predictor, data['log_obs'])

        plot = points.points(
            [data['log_exp'].tolist()],
            [data['log_obs'].tolist()],
            height=350,
            tools="",
            toolbar_location=None,
            output_backend="webgl",
            colors=[Colors.light_cornflower_blue],
            x_axis_label='log10(Expected Count)',
            y_axis_label='log10(Observed Count)',
            title='Expected vs Observed'
        )

        regression_line = Slope(
            gradient=model.coef_[0],
            y_intercept=model.intercept_,
            line_color=Colors.light_cornflower_blue
        )

        stats_template = (
            "Spearmans: {:.2f}, (p = {:.2f})  "
            "Pearsons: {:.2f}, (p = {:.2f})"
        )
        corrs = stats_template.format(
            value['spearmans_rho'],
            value['spearmans_rho_pval'],
            value['pearson'],
            value['pearson_pval']
        )

        plot.add_layout(regression_line)
        self.add_plot_title(plot, {}, corrs)
        self.style_plot(plot)
        return plot
Example #3
0
def test_Slope() -> None:
    """Check the defaults and the property set of a bare ``Slope()``."""
    annotation = Slope()

    # The line geometry is unset until the caller provides it.
    for unset_attr in ("gradient", "y_intercept"):
        assert getattr(annotation, unset_attr) is None

    # Range names and render level carry fixed defaults.
    expected_defaults = {
        "x_range_name": 'default',
        "y_range_name": 'default',
        "level": 'annotation',
    }
    for attr, expected in expected_defaults.items():
        assert getattr(annotation, attr) == expected

    check_line_properties(annotation, "", 'black', 1.0)

    own_properties = ["gradient", "y_intercept"]
    check_properties_existence(annotation, ANNOTATION + own_properties, LINE)
Example #4
0
def joins_sides(stats, colorblind=False):
    """
    Shows the join distribution using a scatter plot.
    Each join operator is shown by a dot. The x coordinate is the data read from the
    right-side table, and the y coordinate is the data read from the left-side table.
    Replicated joins are shown in a different color than Partitioned joins.
    For optimal performance, keep the right-side smaller than the left-side (the points should be above the y=x line). Replicated joins should be used as long as the right-side table is not too large, to prevent out-of-memory errors.
    If you are using CBO, ensure the correct statistics are estimated for all tables being joined using ANALYZE command.

    Optimization Tips -
    1. Queries to the left of the black dashed line and above the orange dashed line should all use the REPLICATED join distribution type.
    2. Queries to the right of the orange dashed line perform joins with an incorrect table order. Ensure statistics are used, or rewrite the queries to flip the table sides, to boost performance and save cluster resources.

    """
    # Returns None when `stats` contains no joins, otherwise the figure.
    joins = list(iter_joins(stats))
    if not joins:
        return None

    p = figure(
        title="Joins distribution",
        x_axis_label="Right-side data read [bytes]",
        x_axis_type="log",
        y_axis_label="Left-side data read [bytes]",
        y_axis_type="log",
        sizing_mode="scale_width",
        tools=TOOLS,
    )

    # One row per join: (right-side bytes, left-side bytes, distribution, query id).
    rows = [
        (build["input_size"], probe["input_size"],
         node["distributionType"], stat["query_id"])
        for stat, node, probe, build in joins
    ]
    right_sizes, left_sizes, distributions, query_ids = map(list, zip(*rows))

    shape_size = _get_size(colorblind)
    palette = {"PARTITIONED": "red", "REPLICATED": "blue"}
    markers = {"PARTITIONED": "circle", "REPLICATED": "square"}

    source = ColumnDataSource({
        "x": right_sizes,
        "y": left_sizes,
        "dist": distributions,
        "copy_on_tap": query_ids,
        "color": [palette[kind] for kind in distributions],
        "marker": [markers[kind] for kind in distributions],
    })
    p.scatter(
        "x",
        "y",
        marker="marker",
        color="color",
        legend_group="dist",
        alpha=0.5,
        size=shape_size,
        source=source,
    )
    # Tapping a point runs COPY_JS with access to the data source.
    tap_callback = CustomJS(args=dict(source=source), code=COPY_JS)
    p.select(type=TapTool).callback = tap_callback
    p.legend.title = "Join distribution"

    decade_ticks = [1, 1e3, 1e6, 1e9, 1e12]
    p.xaxis.ticker = list(decade_ticks)
    p.yaxis.ticker = list(decade_ticks)

    add_constant_line(p, 'height', 1e6)
    # y = x reference line: points above it read less data on the right side.
    identity = Slope(
        gradient=1,
        y_intercept=0,
        line_color='orange',
        line_dash='dashed',
        line_width=2,
    )
    p.add_layout(identity)
    return p
Example #5
0
def plot_games(data, model, features, **plot_kwargs):
    """Scatter two features of ``data`` and overlay the model's boundary line.

    ``features[0]`` is plotted on the x axis and a jittered ``features[1]``
    on the y axis. Points are coloured by ``data.ksdj`` and drawn as circles
    where ``model.predict`` agrees with ``data.ksdj``, squares otherwise.
    The dashed line is ``w1 * x + w2 * y + b = 0`` built from the model's
    first row of coefficients and first intercept.

    ``data`` gains ``color`` and ``marker`` columns as a side effect.
    Entries in ``plot_kwargs`` override the axis-label, tools and tooltips
    defaults and are forwarded to ``figure``. Returns the figure.
    """
    default_tooltips = [
        ("name", "@name"),
        ("year", "@year"),
        ("complexity", "@complexity"),
        ("time", "@min_time–@max_time minutes"),
        ("age", "@min_age+"),
    ]
    x_feature, y_feature = features[0], features[1]
    for option, fallback in (
        ("x_axis_label", x_feature),
        ("y_axis_label", y_feature),
        ("tools", TOOLS),
        ("tooltips", default_tooltips),
    ):
        plot_kwargs.setdefault(option, fallback)

    plot = figure(**plot_kwargs)

    data["color"] = [
        "#E30613" if not is_kennerspiel else "#193F4A"
        for is_kennerspiel in data.ksdj
    ]
    predicted_correctly = model.predict(data[features]) == data.ksdj
    data["marker"] = np.where(predicted_correctly, "circle", "square")

    jittered_y = jitter(y_feature, width=0.25, distribution="normal")
    plot.scatter(
        x=x_feature,
        y=jittered_y,
        source=data,
        color="color",
        marker="marker",
        size=8,
    )

    # Rearranging w1*x + w2*y + b = 0 for y gives the gradient and intercept.
    weights = model.coef_
    w1, w2 = weights[0, 0], weights[0, 1]
    b = model.intercept_[0]
    boundary = Slope(
        gradient=-w1 / w2,
        y_intercept=-b / w2,
        line_color="black",
        line_dash="dashed",
        line_width=2,
    )
    plot.add_layout(boundary)

    return plot
def figures_chisq_detailed(init_group, df_chisq):
    """Build the detailed chi-squared dashboard as a grid of Bokeh figures.

    Six time-series figures (confirmed cases with thresholds, total tests,
    detrended cases, detrended percentage, and two case/total ratios) are
    drawn from the full history, filtered to a single ``CountryProv`` group.
    Below them, one scatter plot per ``Continent`` shows the latest row of
    every ``CountryProv``, arranged three per grid row.

    Args:
        init_group: ``CountryProv`` value the group filter starts with.
        df_chisq: DataFrame holding the history. It is modified in place:
            a ``dt_str`` column (``Date`` formatted as ``%Y-%m-%d``) is added.

    Returns:
        Tuple ``(source_hist, c_a1a, g)``: the ColumnDataSource wrapping
        ``df_chisq``, the renderer of the daily confirmed circles on the
        first figure, and the gridplot containing every figure.
    """
    from itertools import zip_longest

    from bokeh.models import LabelSet

    df_chisq["dt_str"] = df_chisq.Date.dt.strftime("%Y-%m-%d")
    df_latest = df_chisq.groupby("CountryProv").apply(lambda g: g.tail(1)).reset_index(drop=True)
    df_latest["color"] = "#73b2ff"

    source_hist = ColumnDataSource(df_chisq)

    # A View cannot be used with LabelSet, so build a separate source per continent.
    srcLatest_continent = df_latest.groupby("Continent").apply(lambda g: ColumnDataSource(g))
    srcLatest_continent = srcLatest_continent.reset_index().rename(columns={0: "src"})

    gf = GroupFilter(column_name='CountryProv', group=init_group)
    view1 = CDSView(source=source_hist, filters=[gf])

    plot_size_and_tools = {
        'plot_height': 300,
        'plot_width': 600,
        'tools': ['box_select', 'reset', 'help', 'box_zoom'],
        'x_axis_type': 'datetime',
        'tooltips': [
            ("Date", "@dt_str"),
        ],
    }

    # FIXME: p_a1.line could not be used here, so thin varea bands stand in for lines.
    # (Band does not support view= either, hence varea for the bands as well.)
    p_a1 = figure(title="Confirmed and thresholds. Below threshold: good, above: bad, within: ok", **plot_size_and_tools)
    p_a1.varea(x='Date', y1='case_ma07_lower', y2='case_ma07_upper', source=source_hist, color='pink', view=view1, fill_alpha=.7, legend_label="mean +/- std band")
    p_a1.varea(x='Date', y1='case_ma07', y2='case_ma07_eps', source=source_hist, color='red', view=view1, legend_label="7-day moving avg")
    p_a1.varea(x='Date', y1='threshold_min_eps', y2='threshold_max_eps', source=source_hist, color='green', view=view1, fill_alpha=.7, legend_label="chi-squared thresholds band")
    c_a1a = p_a1.circle(x='Date', y='daily_conf', source=source_hist, color='black', view=view1)

    # https://stackoverflow.com/a/51540955/4126114
    # https://docs.bokeh.org/en/latest/docs/user_guide/styling.html#inside-the-plot-area
    p_a1.legend.label_text_font_size = '6pt'
    p_a1.legend.location = "top_left"

    p_a2 = figure(title="Total tests (daily vs 7-day moving avg)", **plot_size_and_tools)
    p_a2.varea(x='Date', y1='tests_ma07_lower', y2='tests_ma07_upper', source=source_hist, color='pink', view=view1)
    p_a2.varea(x='Date', y1='tests_ma07', y2="tests_ma07_eps", source=source_hist, color='red', view=view1)
    p_a2.circle(x='Date', y='daily_tests', source=source_hist, color='black', view=view1)
    # Share the x range so zooming one figure zooms the linked ones too.
    p_a2.x_range = p_a1.x_range

    p_b1 = figure(title="Detrended cases. Negative: good, positive: bad", **plot_size_and_tools)
    p_b1.varea(x='Date', y1='thresMinMinusMid', y2='thresMaxMinusMid', source=source_hist, color='green', view=view1, legend_label="thresholds band", fill_alpha=0.7)
    p_b1.varea(x='Date', y1='caseMa07Lower_minusMid', y2='caseMa07Upper_minusMid', source=source_hist, color='pink', view=view1, legend_label="cases ma7 - threshold mid +/- std", fill_alpha=0.7)
    p_b1.varea(x='Date', y1='case_detrended', y2='caseDet_eps', source=source_hist, color='red', view=view1, legend_label="cases detrended")
    p_b1.circle(x='Date', y='case_detrended', source=source_hist, color='red', view=view1)
    p_b1.x_range = p_a1.x_range
    p_b1.legend.label_text_font_size = '6pt'
    p_b1.legend.location = "top_left"

    p_b2 = figure(title="Detrended cases percentage of raw cases", **plot_size_and_tools)
    p_b2.circle(x='Date', y='caseDet_pct', source=source_hist, color='red', view=view1)
    p_b2.x_range = p_a1.x_range

    # NOTE(review): the two ratio figures do not share p_a1's x range — confirm this is intended.
    p_c1 = figure(title="Ratio case/total (daily)", **plot_size_and_tools)
    p_c1.circle(x='Date', y='ratio_daily', source=source_hist, color='blue', view=view1)

    p_c2 = figure(title="Ratio case/total (7-day ma)", **plot_size_and_tools)
    p_c2.circle(x='Date', y='ratio_ma07', source=source_hist, color='blue', view=view1)

    # General-use axis lines through the origin, reused by every scatter plot.
    # np.inf (not the alias np.Inf, which NumPy 2.0 removed) gives the vertical one.
    slope_y0 = Slope(gradient=0, y_intercept=0, line_color='orange', line_width=50)
    slope_x0 = Slope(gradient=np.inf, y_intercept=0, line_color='orange', line_width=50)

    # Scatter plots: one per continent, latest row of each CountryProv.
    TOOLTIPS = [
        ("Country/Region", "@CountryProv"),
    ]
    p_cont = []
    for _, srcCont_i in srcLatest_continent.iterrows():
        p_d1 = figure(plot_width=600, plot_height=400, tooltips=TOOLTIPS, title=srcCont_i.Continent)
        p_d1.scatter('case_detrended', 'case_det_diff07', source=srcCont_i.src, size=12, color='color')
        p_d1.xaxis.axis_label = 'Cases detrended: values'
        p_d1.yaxis.axis_label = 'Cases detrended: diff07'
        labels = LabelSet(x='case_detrended', y='case_det_diff07', text='cp_code', level='glyph',
                          x_offset=5, y_offset=5, source=srcCont_i.src, render_mode='canvas')
        p_d1.add_layout(labels)
        p_d1.add_layout(slope_y0)
        p_d1.add_layout(slope_x0)
        p_cont.append(p_d1)

    # Group plots into rows of 3; zip_longest pads the last row with None,
    # which is filtered out again (https://stackoverflow.com/a/1625013/4126114).
    rows_of_three = zip_longest(*(iter(p_cont),) * 3)
    p_cont = [[fig for fig in row if fig is not None] for row in rows_of_three]

    g = gridplot([[p_a1, p_a2], [p_b1, p_b2], [p_c1, p_c2]] + p_cont)

    return source_hist, c_a1a, g
Example #7
0
                    y_axis_label='Winning Percentage',
                    y_range=(0.25, 0.725),
                    title='Winning Percentage vs. Run Differential',
                    tools='hover',
                    tooltips=tooltip,
                    toolbar_location=None)

# Plot WinPCT against RunDiff from rt_cds as semi-transparent circles.
winrun_fig.circle(x='RunDiff',
                  y='WinPCT',
                  radius=2,
                  alpha=0.5,
                  color='#d71d1d',
                  source=rt_cds)

# Overlay a straight line whose gradient and intercept are the last two
# entries of coeffs.
# NOTE(review): presumably coeffs comes from a linear fit with the highest
# degree first — confirm where coeffs is computed.
bf_line = Slope(gradient=coeffs[-2],
                y_intercept=coeffs[-1],
                line_color='#400987',
                line_width=2)
winrun_fig.add_layout(bf_line)

# Second figure for the regression residuals; it reuses the same `tooltip`
# definition and has fixed axis ranges and no toolbar.
error_fig = figure(x_axis_label='Run Differential',
                   x_range=(-350, 310),
                   y_axis_label='Residual',
                   y_range=(-0.075, 0.09),
                   title='Residual Errors in Linear Regression',
                   tools='hover',
                   tooltips=tooltip,
                   toolbar_location=None)

error_fig.circle(x='RunDiff',
                 y='Error',
                 radius=2,
Example #8
0
def scatter(x,
            y,
            label=None,
            group=None,
            title="Scatter Plot",
            xlabel="x",
            ylabel="y",
            width=600,
            height=600,
            legend=True,
            size=4,
            shape="circle",
            font_size="16pt",
            label_font_size="13pt",
            col_palette=None,
            hover_xy=True,
            gradient=False,
            hline=False,
            vline=False,
            xrange=None,
            yrange=None):
    """Creates a scatterplot using Bokeh.

    Required Parameters
    -------------------
    x : array-like, shape = [n_samples]
        Input data for x-axis.

    y : array-like, shape = [n_samples]
        Input data for y-axis.

    Other Parameters
    ----------------
    label : object with ``copy()`` and either ``to_dict("series")`` or
        ``name``/``values`` (e.g. a pandas DataFrame or Series), optional
        Extra columns shown in the hover tooltip. When None, an ``Idx``
        column holding the input order is used instead.

    group : array-like, optional
        Class of each point. Each sorted unique value is assigned the
        palette colour at the same position. When None, every point gets
        the third palette colour.

    title, xlabel, ylabel : str
        Figure title and axis labels.

    width, height : int
        Figure size passed as ``plot_width`` / ``plot_height``.

    legend : bool
        The legend is hidden only when this is exactly ``False``.

    size : number
        Marker size.

    shape : {"circle", "triangle"}
        Marker type.

    font_size, label_font_size : str
        Font sizes of the title and of the axis labels.

    col_palette : list of colours, optional
        Defaults to a built-in list of twelve named colours.

    hover_xy : bool
        Include the x and y values in the tooltip when exactly ``True``.

    gradient : number or False
        When not ``False``, draw a line through the origin with this
        gradient plus a dashed line perpendicular to it.

    hline, vline : bool
        When not ``False``, draw a horizontal / vertical line through 0.

    xrange, yrange : optional
        Passed to the figure as ``x_range`` / ``y_range``.

    Returns
    -------
    The Bokeh figure.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length, or ``shape`` is not one of the
        supported marker names.
    """

    # Validate cheap inputs before any plotting work is done.
    if len(x) != len(y):
        raise ValueError("length of X does not match length of Y.")
    # Compare by value: identity checks on strings only work by accident
    # (interning) and fail for equal strings built at runtime.
    if shape not in ("circle", "triangle"):
        raise ValueError("shape has to be either 'circle' or 'triangle'.")

    # If label is None, give an index based on input order
    if label is None:
        label_copy = {"Idx": list(range(len(x)))}
    else:
        # Work on a copy so the caller's label object is never overwritten.
        label2 = label.copy()
        try:
            label_copy = label2.to_dict("series")
        except TypeError:
            # Single named column: to_dict("series") is not supported.
            label_copy = {label2.name: label2.values.tolist()}

    # If colour palette is None (default):
    if col_palette is None:
        col_palette = [
            "red", "blue", "green", "orange", "blueviolet", "gold", "peru",
            "pink", "darkblue", "olive", "teal", "slategray"
        ]

    # Group is None or allow for multiple classes
    if group is None:
        group_copy = [None] * len(x)
        col = [col_palette[2]] * len(x)
    else:
        group_copy = group.copy()
        group_unique = np.sort(np.unique(group_copy))
        col = []
        for value in np.asarray(group_copy):
            for j, candidate in enumerate(group_unique):
                if value == candidate:
                    col.append(col_palette[j])

    # Bokeh data source with data labels
    data = {"x": x, "y": y, "group": group_copy, "col": col}
    data.update(label_copy)
    source = ColumnDataSource(data=data)

    # Tool-tip (add everything in label_copy)
    TOOLTIPS = []
    if hover_xy is True:
        TOOLTIPS = [("x", "@x{1.111}"), ("y", "@y{1.111}")]
    for name in label_copy:
        TOOLTIPS.append((str(name), "@" + str(name)))

    # Base figure
    fig = figure(title=title,
                 x_axis_label=xlabel,
                 y_axis_label=ylabel,
                 plot_width=width,
                 plot_height=height,
                 x_range=xrange,
                 y_range=yrange)

    # Add to plot (shape was validated above, so only two cases remain)
    draw_glyph = fig.circle if shape == "circle" else fig.triangle
    renderer = draw_glyph("x",
                          "y",
                          size=size,
                          alpha=0.6,
                          color="col",
                          legend="group",
                          source=source)

    shape_hover = HoverTool(renderers=[renderer], tooltips=TOOLTIPS)
    fig.add_tools(shape_hover)

    if gradient is not False:
        slope = Slope(gradient=gradient,
                      y_intercept=0,
                      line_color="black",
                      line_width=2,
                      line_alpha=0.3)
        fig.add_layout(slope)
        if gradient == 0:
            # The perpendicular of a horizontal line is vertical, which has
            # no finite gradient; draw it as a vertical Span through x = 0.
            perpendicular = Span(location=0,
                                 dimension="height",
                                 line_color="black",
                                 line_dash="dashed",
                                 line_width=2,
                                 line_alpha=0.10)
        else:
            perpendicular = Slope(gradient=-(1 / gradient),
                                  y_intercept=0,
                                  line_color="black",
                                  line_dash="dashed",
                                  line_width=2,
                                  line_alpha=0.10)
        fig.add_layout(perpendicular)

    if hline is not False:
        h = Span(location=0,
                 dimension="width",
                 line_color="black",
                 line_width=3,
                 line_alpha=0.15)
        fig.add_layout(h)

    if vline is not False:
        v = Span(location=0,
                 dimension="height",
                 line_color="black",
                 line_width=3,
                 line_alpha=0.15)
        fig.add_layout(v)

    # Font-sizes
    fig.title.text_font_size = font_size
    fig.xaxis.axis_label_text_font_size = label_font_size
    fig.yaxis.axis_label_text_font_size = label_font_size

    # Extra padding
    fig.min_border_left = 20
    fig.min_border_right = 20
    fig.min_border_top = 20
    fig.min_border_bottom = 20

    # Remove legend
    if legend is False:
        fig.legend.visible = False

    return fig
Example #9
0
def scatterplot_comparison(
    controls_df: pd.DataFrame,
    result_df: pd.DataFrame,
    data_label: str,
    *,
    ref_label: Union[str, List[str]] = None,
    category_labels: Dict = None,
    controls_name: str = 'controls',
    result_name: str = 'model',
    size: float = 7.5,
    fill_alpha: float = 0.2,
    facet_col: str = None,
    facet_col_wrap: int = 2,
    facet_sort_order: bool = True,
    facet_sync_axes: str = 'both',
    hover_col: Union[str, List[str]] = None,
    glyph_col: str = None,
    glyph_legend: bool = True,
    glyph_legend_location: str = 'bottom_right',
    glyph_legend_label_text_font_size: str = '11px',
    figure_title: str = None,
    plot_height: int = None,
    identity_line: bool = True,
    identity_colour: str = 'red',
    identity_width: int = 2,
    color_palette: Dict[int, Any] = Category20,
    calc_pct_diff: bool = True,
    totals_in_titles: bool = True,
    filter_zero_rows: bool = True
) -> Tuple[pd.DataFrame, Union[Column, Figure, GridBox]]:
    """Creates an interactive Bokeh-based scatter plot to compare data.

    Args:
        controls_df (pd.DataFrame): A DataFrame containing control values. Must be in wide-format where rows represent
            a reference (e.g. count station, TAZ, geography, etc.) and columns represent the data categories.
        result_df (pd.DataFrame): A DataFrame containing modelled values. Uses the same format as `controls_df`.
        data_label (str): The name to use for the data represented by the `controls_df` and `result_df` columns.
        ref_label (Union[str, List[str]], optional): Defaults to ``None``. The name(s) corresponding to the
            ``controls_df`` and ``result_df`` indices. The function will try to infer the name(s) from indices of the
            source DataFrames. If the indices of the DataFrames are not set, then values must be set for this
            parameter, otherwise an error will be raised. If providing a value to this parameter and the indices of the
            source DataFrames are MultiIndex objects, then the provided value must be a list of strings.
        category_labels (Dict, optional): Defaults to ``None``. Category labels used to rename the `controls_df` and
            `result_df` columns.
        controls_name (str, optional): Defaults to ``'controls'``. The name for the controls.
        result_name (str, optional): Defaults to ``'model'``. The name for the results.
        size (float, optional): Defaults to ``7.5``. The size of the scatter plot points.
        fill_alpha (float, optional): Defaults to ``0.2``. The opacity of the point fill.
        facet_col (str, optional): Defaults to ``None``. The name of the column to use for creating a facet plot.
        facet_col_wrap (int, optional): Defaults to ``2``. The number of columns to wrap subplots in the facet plot.
        facet_sort_order (bool, optional): Defaults to ``True``. A flag to render facet subplots in ascending order
            sorted by unique ``facet_col`` values.
        facet_sync_axes (str, optional): Defaults to ``'both'``. Option to sync/link facet axes. Accepts one of
            ``['both', 'x', 'y']``. Set to None to disable linked facet plot axes.
        hover_col (Union[str, List[str]], optional): Defaults to ``None``. The column names to display in the plot
            tooltips.
        glyph_col (str, optional): Defaults to ``None``. The name of the column to use for glyph coloring. A standard
            color palette will be mapped to unique ``glyph_col`` values.
        glyph_legend (bool, optional): Defaults to ``True``. A flag to enable/disable the legend if ``glyph_col`` is
            set. The legend will be included in each plot/facet subplot.
        glyph_legend_location (str, optional): Defaults to ``'bottom_right'``. The location of the glyph legend in each
            plot/facet subplot. Please refer to the Bokeh ``Legend`` documentation for acceptable values.
        glyph_legend_label_text_font_size (str, optional): Defaults to ``'11px'``. The text size of the legend labels.
        figure_title (str, optional): Defaults to ``None``. The chart title to use.
        plot_height (int, optional): Defaults to ``None``. The desired plot height. For facet plots, this value will be
            set for each subplot.
        identity_line (bool, optional): Defaults to ``True``. A flag to include an identity (1:1) line in the
            scatter plot.
        identity_colour (str, optional): Defaults to ``'red'``. The colour to use for the identity line. Accepts html
            colour names.
        identity_width (int, optional): Defaults to ``2``. The line width to use for the identity line.
        color_palette (Dict[int, Any], optional): Defaults to ``Category20``. The Bokeh color palette to use. It is
            indexed by the number of unique ``glyph_col`` values (at least 3).
        calc_pct_diff (bool, optional): Defaults to ``True``. Include percent difference calculation in DataFrame output
        totals_in_titles (bool, optional): Defaults to ``True``. Include the control and result totals in plot title.
        filter_zero_rows (bool, optional): Defaults to ``True``. Filter out comparisons where controls and results are
            both zeros.

    Returns:
        Tuple[pd.DataFrame, Union[Column, Figure, GridBox]]: The long-format comparison table (with a ``pct_diff``
        column when ``calc_pct_diff`` is set) and the Bokeh figure or layout.

    Raises:
        RuntimeError: If ``ref_label`` or ``hover_col`` is neither hashable nor a list.
        AssertionError: If ``ref_label`` is ``None`` and the index names cannot be inferred.
    """

    if not controls_df.index.equals(result_df.index):
        warnings.warn(
            'Indices for `controls_df` and `result_df` are not identical; function may not produce desired '
            'results')
    if not controls_df.columns.equals(result_df.columns):
        warnings.warn(
            'Columns for `controls_df` and `result_df` are not identical; function may not produce desired '
            'results')

    # Normalise `ref_label` to a list of index level names.
    if ref_label is None:
        assert np.all(controls_df.index.names == result_df.index.names), (
            'Unable to resolve different index names, '
            'please specify values for `ref_label` instead')
        assert None not in controls_df.index.names, 'Some index levels in `controls_df` do not have names'
        assert None not in result_df.index.names, 'Some index levels in `result_df` do not have names'
        ref_label = list(controls_df.index.names)
    elif isinstance(ref_label, Hashable):
        ref_label = [ref_label]
    elif not isinstance(ref_label, List):
        raise RuntimeError('Invalid data type provided for `ref_label`')

    # Normalise `hover_col` to a list of column names.
    if hover_col is None:
        hover_col = []
    elif isinstance(hover_col, Hashable):
        hover_col = [hover_col]
    elif not isinstance(hover_col, List):
        raise RuntimeError('Invalid data type provided for `hover_col`')

    # Prepare data for plotting: stack both tables to long format, aligned on the controls index.
    df = controls_df.stack()
    df.index.names = [*ref_label, data_label]
    df = df.to_frame(name=controls_name)

    df[result_name] = result_df.stack()
    # Assign the filled column back instead of filling in place on a column selection; the in-place form acts on a
    # temporary and leaves `df` untouched when pandas Copy-on-Write is active.
    df[result_name] = df[result_name].fillna(0)

    if filter_zero_rows:
        df = df[df.sum(axis=1) > 0].copy()

    df.reset_index(inplace=True)

    if category_labels is not None:
        df[data_label] = df[data_label].map(category_labels)

    # `fig_df` carries display-only changes so the returned `df` keeps the plain category labels.
    fig_df = df.copy()
    if totals_in_titles:
        label_totals = fig_df.groupby(data_label)[[controls_name, result_name]].sum()
        controls_total_text = label_totals[controls_name].map('{:,.0f}'.format)
        result_total_text = label_totals[result_name].map('{:,.0f}'.format)
        label_totals['label'] = (label_totals.index + f' ({controls_name}=' + controls_total_text
                                 + f', {result_name}=' + result_total_text + ')')
        fig_df[data_label] = fig_df[data_label].map(label_totals['label'])

    glyph_values = []
    if glyph_col is not None:
        # Sorted once here; the same ordering drives colours and legend entries in every (sub)plot.
        glyph_values = sorted(fig_df[glyph_col].unique())
        n_colors = max(len(glyph_values), 3)
        color_palette = color_palette[n_colors]

    # Prepare figure formatting values
    source = ColumnDataSource(fig_df)
    tooltips = [(c, '@{%s}' % c) for c in hover_col]
    tooltips += [(controls_name, '@{%s}{0,0.0}' % controls_name),
                 (result_name, '@{%s}{0,0.0}' % result_name)]
    figure_params = _prep_figure_params(controls_name, result_name, tooltips,
                                        plot_height)
    glyph_params = {
        'source': source,
        'x': controls_name,
        'y': result_name,
        'size': size,
        'fill_alpha': fill_alpha,
        'hover_color': 'red'
    }

    # Identity (1:1) line, added to every (sub)plot when `identity_line` is set.
    slope = Slope(gradient=1,
                  y_intercept=0,
                  line_color=identity_colour,
                  line_dash='dashed',
                  line_width=identity_width)

    def apply_legend_settings(p_: Figure):
        p_.legend.visible = glyph_legend
        p_.legend.title = glyph_col
        p_.legend.location = glyph_legend_location
        p_.legend.label_text_font_size = glyph_legend_label_text_font_size
        p_.legend.click_policy = 'hide'

    # Plot figure
    if facet_col is None:  # Basic plot
        p = figure(sizing_mode='stretch_both', **figure_params)
        if glyph_col is None:  # Single glyphs
            p.circle(**glyph_params)
        else:  # One renderer per `glyph_col` value to use the interactive legend feature
            for i, gc in enumerate(glyph_values):
                source_view = CDSView(
                    source=source,
                    filters=[GroupFilter(column_name=glyph_col, group=gc)])
                p.circle(view=source_view,
                         legend_label=gc,
                         color=color_palette[i],
                         **glyph_params)
            apply_legend_settings(p)
        if identity_line:
            p.add_layout(slope)
        fig = p
    else:  # Facet plot
        fig = []
        facet_column_items = fig_df[facet_col].unique().tolist()
        if facet_sort_order:
            facet_column_items = sorted(facet_column_items)
        linked_axes = {}
        for i, fc in enumerate(facet_column_items):
            p = figure(title=fc, **figure_params, **linked_axes)
            filters = [GroupFilter(column_name=facet_col, group=fc)]
            if glyph_col is None:  # Single glyphs
                source_view = CDSView(source=source, filters=filters)
                p.circle(view=source_view, **glyph_params)
            else:  # One renderer per `glyph_col` value to use the interactive legend feature
                for j, gc in enumerate(glyph_values):
                    filters_ = filters + [
                        GroupFilter(column_name=glyph_col, group=gc)
                    ]
                    source_view = CDSView(source=source, filters=filters_)
                    p.circle(view=source_view,
                             legend_label=gc,
                             color=color_palette[j],
                             **glyph_params)
                apply_legend_settings(p)

            # The first subplot's ranges are passed to every later subplot to link their axes.
            if (i == 0) and (facet_sync_axes is not None):
                if facet_sync_axes.lower() in ['x', 'both']:
                    linked_axes['x_range'] = p.x_range
                if facet_sync_axes.lower() in ['y', 'both']:
                    linked_axes['y_range'] = p.y_range

            if identity_line:
                p.add_layout(slope)

            fig.append(p)
        fig = gridplot(fig,
                       ncols=facet_col_wrap,
                       sizing_mode='stretch_both',
                       merge_tools=True)

    if figure_title is not None:
        fig = _wrap_figure_title(fig, figure_title)

    if calc_pct_diff:
        df['pct_diff'] = (df[result_name] -
                          df[controls_name]) / df[controls_name] * 100
        # Division by a zero control yields +/-inf; report those as missing instead.
        df['pct_diff'] = df['pct_diff'].replace([np.inf, -np.inf], np.nan)

    return df, fig
Example #10
0
            source=source,
            selection_color="red",
            alpha=0.6,
            nonselection_alpha=0.1,
            selection_alpha=0.4)
# Two line renderers on `corr`, both plotted against StockFwdRets from the
# shared `source`: LineRegressTotal in purple and FilteredLineRegressTotal
# in orange.
orig_data_linreg = corr.line('StockFwdRets',
                             'LineRegressTotal',
                             source=source,
                             color='purple')
filtered_data_linreg = corr.line('StockFwdRets',
                                 'FilteredLineRegressTotal',
                                 source=source,
                                 color='orange')
# Dashed line built from the first row of linreg_data_source
# (modified_total_gradient / modified_total_yint).
# NOTE: the Slope is created but not attached to `corr`; the add_layout call
# below is commented out.
modified_slope_obj = Slope(
    gradient=linreg_data_source.to_df().iloc[0]['modified_total_gradient'],
    y_intercept=linreg_data_source.to_df().iloc[0]['modified_total_yint'],
    line_color='orange',
    line_dash='dashed',
    line_width=3.5)
# corr.add_layout(modified_slope_obj)

# First time series chart: datetime x axis, with xbox_select as the active
# drag tool. The line is drawn from `source_static`.
ts1 = figure(plot_width=500,
             plot_height=400,
             tools=tools,
             x_axis_type='datetime',
             active_drag="xbox_select")
ts1.line('date', 'StockAdjClose', source=source_static)
ts1.circle('date',
           'StockAdjClose',
           size=2,
           source=source,
Example #11
0
    def plot(self,
             metric="r2q2",
             hide_pval=True,
             grid_line=False,
             legend=True):
        """Build the two-panel permutation-test figure for one metric.

        The left panel plots the full-data and cross-validated scores against
        the third value of every stats entry (axis label "Correlation"). Each
        score also gets a straight line through the original entry (at
        x = 1) and the mean of the permuted entries. The right panel shows a
        Gaussian kernel density estimate of the permuted scores (the
        cross-validated one mirrored below zero) plus a "lollipop" marking the
        original score of each series.

        Side effect: ``self.stats`` is overwritten with the list of
        ``[full, cv, corr]`` triples that were plotted.

        Args:
            metric: Key from ``metric_list`` below; selects which score is
                read from ``self.stats_original`` and ``self.stats_perm``.
                An unknown key raises ``IndexError``.
            hide_pval: When ``False`` the p-value text is written next to the
                lollipops and the right panel's ranges are widened for it.
            grid_line: When ``False`` the grid lines of both panels are hidden.
            legend: When ``False`` the legends of both panels are hidden.

        Returns:
            The ``gridplot`` holding both panels side by side.
        """
        # Metric keys and the titles under which the stats dicts store them;
        # the two arrays are index-aligned.
        metric_title = np.array([
            "ACCURACY", "AIC", "AUC", "BIC", "F1-SCORE", "PRECISION", "R²",
            "SENSITIVITY", "SPECIFICITY", "SSE"
        ])
        metric_list = np.array([
            "acc", "aic", "auc", "bic", "f1score", "prec", "r2q2", "sens",
            "spec", "sse"
        ])
        mname = metric_title[np.where(metric_list == metric)[0][0]]

        # Row 0 is the original model (third value fixed at 1); every
        # following row is one permutation.
        stats = [[
            self.stats_original[0][mname], self.stats_original[1][mname], 1
        ]]
        stats.extend([perm[0][mname], perm[1][mname], perm[2]]
                     for perm in self.stats_perm)
        self.stats = stats

        if metric == "r2q2":
            full_text, cv_text = "R²", "Q²"
        else:
            full_text, cv_text = mname + "full", mname + "cv"

        # Column-wise view of the rows (full score, cv score, correlation).
        stats_r2 = [row[0] for row in stats]
        stats_q2 = [row[1] for row in stats]
        stats_corr = [row[2] for row in stats]

        # Lines for panel 1: through (1, original) and
        # (mean permuted correlation, mean permuted score).
        corr_gap = 1 - np.mean(stats_corr[1:])
        r2gradient = (stats_r2[0] - np.mean(stats_r2[1:])) / corr_gap
        q2gradient = (stats_q2[0] - np.mean(stats_q2[1:])) / corr_gap
        line_params = ((r2gradient, stats_r2[0] - r2gradient),
                       (q2gradient, stats_q2[0] - q2gradient))

        max_vals = max(np.max(stats_r2), np.max(stats_q2))
        min_vals = min(np.min(stats_r2), np.min(stats_q2))
        range_low = min_vals - abs(0.2 * min_vals)
        # NOTE(review): the top padding is derived from min_vals here, while
        # the hide_pval range further down uses max_vals -- confirm intent.
        y_range_share = (range_low, max_vals + abs(0.1 * min_vals))

        # ---- Panel 1: scores against correlation ----
        source = ColumnDataSource(
            data={"corr": stats_corr, "r2": stats_r2, "q2": stats_q2})
        fig1 = figure(plot_width=470,
                      plot_height=410,
                      x_range=(-0.15, 1.15),
                      x_axis_label="Correlation",
                      y_range=y_range_share,
                      y_axis_label=full_text + " & " + cv_text)

        for gradient, intercept in line_params:
            fig1.add_layout(
                Slope(gradient=gradient,
                      y_intercept=intercept,
                      line_color="black",
                      line_width=2,
                      line_alpha=0.3))

        # (data column, colour, legend text) for the two point series.
        series = (("r2", "red", full_text), ("q2", "blue", cv_text))
        squares = [
            fig1.square("corr",
                        column,
                        size=6,
                        alpha=0.5,
                        color=colour,
                        legend=label,
                        source=source)
            for column, colour, label in series
        ]
        # One hover tool per series so each tooltip shows only its own value.
        for glyph, (column, _, label) in zip(squares, series):
            fig1.add_tools(
                HoverTool(renderers=[glyph],
                          tooltips=[(label + " Value", "@" + column)]))

        # Extra padding
        fig1.min_border_left = fig1.min_border_right = 20
        fig1.min_border_top = fig1.min_border_bottom = 20

        # ---- Density curves for panel 2 ----
        def kde_curve(sample):
            """Return (grid, density) for a 50-point KDE padded by 60 %."""
            values = np.array(sample)
            lowest, highest = values.min(), values.max()
            padding = (highest - lowest) * 0.6
            grid = np.linspace(lowest - padding, highest + padding, 50)
            return grid, scipy.stats.gaussian_kde(values, "scott")(grid)

        x1_grid, x1_pdf_grid = kde_curve(stats_r2[1:])
        x2_grid, x2_density = kde_curve(stats_q2[1:])
        # The cross-validated density is mirrored below the zero line.
        x2_pdf_grid = [-value for value in x2_density]

        r2_peak = max(x1_pdf_grid)
        q2_trough = min(x2_pdf_grid)

        # ---- Panel 2 ranges ----
        # The explicit comparison with True is deliberate: any value that
        # does not compare equal to True takes the wider layout.
        if hide_pval == True:
            y_range_share2 = (range_low, max_vals + abs(0.1 * max_vals))
            ymin = q2_trough - 1
            xmin = r2_peak + 1
            yy_range = (ymin - abs(0.1 * ymin), xmin + abs(0.1 * xmin))
        else:
            range_high = max_vals + 0.8
            if metric == "auc" and range_high > 1.5:
                range_high = 1.5
            y_range_share2 = (range_low, range_high)
            ymin = q2_trough - 1.2
            xmin = r2_peak + 1.2
            yy_range = (ymin - 1, xmin + 1)

        fig2 = figure(plot_width=470,
                      plot_height=410,
                      x_axis_label=full_text + " & " + cv_text,
                      y_axis_label="p.d.f.",
                      x_range=y_range_share2,
                      y_range=yy_range)
        # Horizontal zero line separating the two mirrored densities.
        fig2.add_layout(
            Span(location=0,
                 dimension="width",
                 line_color="black",
                 line_width=2,
                 line_alpha=0.3))

        # Plot distribution
        for grid, density, colour in ((x1_grid, x1_pdf_grid, "red"),
                                      (x2_grid, x2_pdf_grid, "blue")):
            fig2.patch(grid,
                       density,
                       alpha=0.35,
                       color=colour,
                       line_color="grey",
                       line_width=1)

        # Extra padding
        fig2.min_border_left = 60
        fig2.min_border_right = fig2.min_border_top = 20
        fig2.min_border_bottom = 20

        def add_lollipop(score, permuted, tip_y, colour, label):
            """Draw stem + head at ``score``; return the formatted p-value."""
            # Index 1 of the ttest_ind result is used as the p-value.
            pval = ttest_ind(permuted, [score], alternative='smaller')[1]
            if pval > 0.005:
                pval_text = "%0.2f" % pval
            else:
                pval_text = "%0.2e" % pval

            head_source = ColumnDataSource(data={
                "x": [score],
                "y": [tip_y],
                "hover": [pval_text]
            })
            stem_source = ColumnDataSource(data={
                "x": [score, score],
                "y": [tip_y, 0],
                "hover": [pval_text, pval_text]
            })
            fig2.line("x",
                      "y",
                      line_width=2.25,
                      line_color=colour,
                      alpha=0.5,
                      source=stem_source)
            fig2.circle("x",
                        "y",
                        fill_color=colour,
                        line_color="grey",
                        alpha=0.75,
                        size=7,
                        legend=label,
                        source=head_source)
            return pval_text

        # Lollipops: full score points up, cross-validated score points down.
        data2_manu = add_lollipop(stats_r2[0], stats_r2[1:], r2_peak + 1,
                                  "red", full_text)
        data3_manu = add_lollipop(stats_q2[0], stats_q2[1:], q2_trough - 1,
                                  "blue", cv_text)

        if hide_pval == False:
            # Add text
            textr2 = "True " + full_text + "\nP-Value: {}".format(data2_manu)
            textq2 = "True " + cv_text + "\nP-Value: {}".format(data3_manu)
            fig2.text(x=[stats_r2[0] + 0.05, stats_q2[0] + 0.05],
                      y=[r2_peak + 0.5, q2_trough - 1.5],
                      text=[textr2, textq2],
                      angle=0,
                      text_font_size="8pt")

        # Font-sizes
        for axis in (fig1.xaxis, fig1.yaxis):
            axis.axis_label_text_font_size = "13pt"
        for axis in (fig2.xaxis, fig2.yaxis):
            axis.axis_label_text_font_size = "12pt"

        fig1.legend.location = "bottom_right"
        fig2.legend.location = "top_left"
        fig1.legend.visible = True
        fig2.legend.visible = True

        if grid_line == False:
            for panel in (fig1, fig2):
                panel.xgrid.visible = False
                panel.ygrid.visible = False

        if legend == False:
            fig1.legend.visible = False
            fig2.legend.visible = False

        return gridplot([[fig1, fig2]])
Example #12
0
def scatter_matrix(
    df,
    *,
    xs: Sequence[str] = None,
    ys: Sequence[str] = None,
    width=None,
    height=None,
    regression=True,
    **kwargs,
):
    """Build a grid of pairwise scatter plots with optional OLS overlays.

    One plot is created per (x, y) column pair; the grid has one column per
    entry of ``xs`` and one row per entry of ``ys``. Pairs involving a
    non-numeric column show a 'Not numeric' text instead of points. When
    ``regression`` is true, every off-diagonal numeric plot additionally gets
    a fitted line and a text with R2 and the slope.

    Args:
        df: Non-empty data frame providing the columns.
        xs: Columns for the horizontal axis; defaults to all columns.
        ys: Columns for the vertical axis; defaults to all columns. The order
            is reversed before plotting.
        width: Total width of the grid; split evenly over the grid columns.
        height: Total height of the grid; split evenly over the grid rows.
        regression: Whether to add the regression overlays.
        **kwargs: Accepted but not used by this function.

    Returns:
        The object returned by ``bokeh.layouts.gridplot``.

    Raises:
        AssertionError: If ``df`` is empty.
    """
    from bokeh.layouts import gridplot
    from bokeh.models import Slope

    # Kept as an assert so callers see the same exception type as before.
    assert len(df) > 0, 'TODO handle this'

    # FIXME handle empty df
    source = CDS(df)
    # TODO what about non-numeric stuff?

    xs = df.columns if xs is None else xs
    ys = df.columns if ys is None else ys
    # reorder to move meaningful stuff to the top left corner
    ys = list(reversed(ys))

    def isnum(c):
        return is_numeric_dtype(df.dtypes[c])

    # reorder so non-numeric is in the back (the sort is stable, so the
    # relative order inside each group is kept)
    # todo mode to drop non-numeric? not sure.. definitely can drop 'error' and datetimish?
    xs = sorted(xs, key=isnum, reverse=True)
    ys = sorted(ys, key=isnum, reverse=True)

    # TODO not sure I wanna reuse axis?
    def make(xc: str, yc: str):
        p = figure(df=df)
        # TODO not sure if I even want them... move to the very end?
        if isnum(xc) and isnum(yc):
            p.scatter(x=xc, y=yc, source=source, size=3)
        else:
            # TODO ugh, doesn't want to show the label without any points??
            # FIXME how to make sure text fits into the plot??
            add_text(
                p,
                x=0.0,
                y=0.0,
                text='Not numeric',
                text_color='red',
            )
        p.xaxis.axis_label = xc
        p.yaxis.axis_label = yc
        return p

    grid = [[make(xc=x, yc=y) for x in xs] for y in ys]

    # Each grid row holds len(xs) plots and there are len(ys) rows, so the
    # requested total size is split by the matching dimension. (The two are
    # equal in the default xs == ys case.)
    w1 = None if width is None else width // len(xs)
    h1 = None if height is None else height // len(ys)
    grid_res = gridplot(grid, plot_width=w1, plot_height=h1)

    # TODO might be useful to include/exclude specific cols (e.g. datetime) while keeping them in annotations

    # TODO add the presence of the grid to the 'visual tests'
    # but if I switch it to raw bokeh -- it has Grid class.. might need to mess with
    # also maybe add extra axis under each plot in the grid? easier for a huge matrix of plots
    # some code in old dashboard
    if not regression:
        return grid_res

    # todo this would be need for plotly as well?
    import statsmodels.formula.api as smf  # type: ignore

    infinities = [float('inf'), float('-inf')]

    for plot in chain.from_iterable(grid):
        gs = plot.renderers
        if len(gs) == 0:
            # must be non-numeric? meh though
            continue
        [g] = gs
        xx = g.glyph.x
        yy = g.glyph.y

        if xx == yy:
            # diagonal thing, e.g. histogram. compute some stats??
            continue

        # Drop rows with NaN or +/-inf in either column; the regression
        # cannot use them. Done explicitly instead of via the deprecated
        # 'mode.use_inf_as_null' pandas option.
        # FIXME proper error handling, display number of dropped items?
        dd = df[[xx, yy]].replace(infinities, float('nan')).dropna()
        # todo would be nice to display stats on the number of points dropped

        udd = dd.drop_duplicates()
        if len(udd) <= 1:
            # can't perform a reasonable regression then
            add_text(
                plot,
                x=0.0,
                y=0.0,
                text='ERROR: no points to correlate',
                text_color='red',
            )
            continue

        res = smf.ols(f"{yy} ~ {xx}", data=dd).fit()
        intercept = res.params['Intercept']
        slope = res.params[xx]
        r2 = res.rsquared

        ## TODO crap. is it really the best way to figure out relative position??
        relx = 0.01
        rely = 0.1

        # todo highlight high enough R2?
        minx, maxx = min(dd[xx]), max(dd[xx])
        miny, maxy = min(dd[yy]), max(dd[yy])
        # todo font size dependent on width?? ugh.
        txt = f'R2 = {r2:.4f}\nY ~ {slope:.3f} X'

        # todo need to add various regression properties, like intercept, etc
        # TODO hopefuly this overlays correctly?? not sure about nans, again
        sl = Slope(gradient=slope,
                   y_intercept=intercept,
                   line_color='green',
                   line_width=3)
        plot.add_layout(sl)
        # Text is anchored near the lower-left corner of the data extent.
        add_text(
            plot,
            text=txt,
            x=minx + (maxx - minx) * relx,
            y=miny + (maxy - miny) * rely,
            text_color=g.glyph.line_color,
        )

    # TODO dynamic resizing would be nice
    return grid_res
Example #13
0
def triadEffortPlot(args):
    """Plot concatenated pickled data from triadEffortData.

    Reads pickled objects from stdin, draws one line per object on a unit
    square with a dashed diagonal for reference, adds a radio-button group
    that switches line visibility and axis ranges, and writes the resulting
    Bokeh JSON item to stdout.

    Args:
        args: Not used by this function.

    Returns:
        Always ``0``.

    Raises:
        AssertionError: If two objects share the same layout name.
    """
    from itertools import cycle

    from .stats import unpickleAll
    # Initializing bokeh is an expensive operation and this module is imported
    # alot, so only do it when necessary.
    from bokeh.palettes import Set3
    from bokeh.plotting import figure
    from bokeh.models import RadioButtonGroup, CustomJS, Slope
    from bokeh.embed import json_item
    from bokeh.layouts import column

    p = figure(
        plot_width=1000,
        plot_height=500,
        sizing_mode='scale_both',
        x_range=(0, 1),
        y_range=(0, 1),
        output_backend="webgl",
    )
    data = list(unpickleAll(sys.stdin.buffer))
    # Set3 only provides palettes with 3 to 12 colours, so indexing it with
    # the raw number of layouts fails outside that range. Clamp the palette
    # size and cycle the colours; for 3..12 layouts the result is unchanged.
    palette = Set3[min(max(len(data), 3), 12)]
    lines = dict()
    for o, color in zip(data, cycle(palette)):
        name = o['layout'].name
        assert name not in lines
        lines[name] = p.line(o['x'],
                             o['y'],
                             line_width=1,
                             color=color,
                             legend_label=name,
                             name=name)

    # color: base1
    slope = Slope(gradient=1,
                  y_intercept=0,
                  line_color='#93a1a1',
                  line_dash='dashed',
                  line_width=1)
    p.add_layout(slope)

    setPlotStyle(p)
    for axis, size, font in ((p.xaxis, '1em', 'IBM Plex Sans'),
                             (p.yaxis, '1em', 'IBM Plex Sans')):
        axis.major_label_text_font_size = size
        axis.major_label_text_font = font

    # Presets are addressed by the index of the active radio button.
    LABELS = ["All", "Standard", "Usable"]
    visible = {
        0: list(lines.keys()),
        1: ['ar-asmo663', 'ar-linux', 'ar-osx'],
        2: ['ar-lulua', 'ar-ergoarabic', 'ar-malas', 'ar-linux', 'ar-osx'],
    }
    # Per preset: [(x start, x end), (y start, y end)].
    ranges = {
        0: [(0, 1), (0, 1)],
        1: [(0, 0.5), (0, 0.4)],
        2: [(0, 0.5), (0, 0.4)],
    }
    presets = RadioButtonGroup(labels=LABELS, active=0)
    # Set visibility and x/yranges on click. Not sure if there’s a more pythonic way.
    presets.js_on_click(
        CustomJS(args=dict(lines=lines, plot=p, visible=visible,
                           ranges=ranges),
                 code="""
            for (const [k, line] of Object.entries (lines)) {
                line.visible = visible[this.active].includes (k);
            }
            const xrange = plot.x_range;
            xrange.start = ranges[this.active][0][0];
            xrange.end = ranges[this.active][0][1];
            const yrange = plot.y_range;
            yrange.start = ranges[this.active][1][0];
            yrange.end = ranges[this.active][1][1];
            """))

    json.dump(json_item(column(p, presets)), sys.stdout)

    return 0
Example #14
0
def figures_slopes(df_slopes, df_pop):
    """Scatter weekly test rate against weekly case rate per country/region.

    The slope table comes from ``determineSlope``; rows containing infinite
    or missing values are removed. Points are split into two colour groups
    depending on whether ``testsWeeklyPerc >= casesWeeklyPerc``, and a dashed
    y = x line marks the boundary between the groups.

    Args:
        df_slopes: Passed through to ``determineSlope``.
        df_pop: Passed through to ``determineSlope``.

    Returns:
        Tuple ``(source, figure)``: the ``ColumnDataSource`` built from the
        cleaned slope table and the Bokeh figure drawn from it.
    """
    nbStart = 7
    nbEnd = 0
    rolling = 7
    slopes = determineSlope(df_slopes, df_pop, nbStart, nbEnd, rolling)
    slopes = slopes.replace([np.inf, -np.inf], np.nan).dropna()

    # "temp" is a string flag consumed by the GroupFilters below:
    # "1" where tests grew at least as fast as cases, "0" otherwise.
    slopes["temp"] = "0"
    slopes.loc[slopes.testsWeeklyPerc >= slopes.casesWeeklyPerc,
               ['temp']] = "1"
    source = ColumnDataSource(slopes)

    view1 = CDSView(source=source,
                    filters=[GroupFilter(column_name='temp', group="1")])
    view2 = CDSView(source=source,
                    filters=[GroupFilter(column_name='temp', group="0")])

    TOOLTIPS = [
        ("Country/Region", "@CountryProv"),
        ("Cases Rate (%)", "@casesWeeklyPerc"),
        ("Tests Rate (%)", "@testsWeeklyPerc"),
    ]

    p1 = figure(tooltips=TOOLTIPS,
                tools=",pan,tap,box_zoom,reset",
                title="Generated on the basis of " + str(rolling) +
                " day moving average")
    p1.scatter('casesWeeklyPerc',
               'testsWeeklyPerc',
               source=source,
               size=12,
               color='#73b2ff',
               legend_label='Tests Rate > Cases Rate',
               view=view1)
    p1.scatter('casesWeeklyPerc',
               'testsWeeklyPerc',
               source=source,
               size=12,
               color='#ff7f7f',
               legend_label='Tests Rate < Cases Rate',
               view=view2)
    p1.xaxis.axis_label = 'Weekly Rate of Change for Positive Cases(%)'
    p1.yaxis.axis_label = 'Weekly Rate of Change for Nb. Tests(%)'

    # Four white rays from the origin, one per axis direction.
    for angle in (np.pi, 0, np.pi / 2, 3 * np.pi / 2):
        p1.ray([0], [0], length=0, angle=angle, color='white')

    editplotcolors(p1)
    # Dashed y = x reference line: points above it have a higher test rate
    # than case rate.
    slope = Slope(gradient=1,
                  y_intercept=0,
                  line_color='white',
                  line_dash='dashed',
                  line_width=2)

    p1.add_layout(slope)

    p1.legend.background_fill_alpha = 0.8
    p1.legend.background_fill_color = "#262626"
    p1.legend.border_line_alpha = 0
    p1.legend.label_text_color = "whitesmoke"
    p1.legend.location = 'top_right'
    p1.toolbar_location = "right"
    return source, p1
    def generate_figure(self,
                        columns,
                        category_select=None,
                        genome_features=None,
                        reaction_scores=None,
                        reaction_percentiles=None):

        # To include with figure object
        TOOLTIPS = [("reaction", "@tooltip"), ("(x,y)", "($x, $y)")]

        ##################################################################
        # The output figure will be saved using the 'grid' function
        # Each row in the figure will be from a pair of columns in the matrix
        # The first scatterplot will be the general "genome-features background"
        # The second scatterplot will be the reaction percentiles and subsystems
        # The third "column" in a row will contain the subsystem select

        figure_grid = list()
        for first_column in range(len(columns)):
            for second_column in range(len(columns)):
                if (first_column >= second_column):
                    continue

                # Row of figures for the pair of conditions
                figure_row = list()

                ##################################################################
                # For the first scatterplot, it is optional
                if (genome_features is not None
                        or reaction_scores is not None):

                    # Find range for axes
                    x_max = math.ceil(
                        max(genome_features[columns[first_column]]))
                    y_max = math.ceil(
                        max(genome_features[columns[second_column]]))
                    plot_max = max([x_max, y_max])

                    bokeh_fig = figure(x_range=(0.0, plot_max),
                                       y_range=(0.0, plot_max))
                    bokeh_fig.xaxis.axis_label = columns[first_column]
                    bokeh_fig.yaxis.axis_label = columns[second_column]
                    bokeh_fig.title.text = "Genome Features Expression Abundances"

                    genome_source = ColumnDataSource(
                        data=dict(genome_features))

                    # Plot as black and visible
                    scatter_fig = bokeh_fig.circle(x=columns[first_column],
                                                   y=columns[second_column],
                                                   source=genome_source,
                                                   color='black',
                                                   size=4,
                                                   visible=True)

                    reaction_source = ColumnDataSource(
                        data=dict(reaction_scores))

                    # Plot as red
                    scatter_fig = bokeh_fig.circle(x=columns[first_column],
                                                   y=columns[second_column],
                                                   source=reaction_source,
                                                   color='red',
                                                   size=6,
                                                   visible=True)

                    slope_line = Slope(gradient=1,
                                       y_intercept=0,
                                       line_color="red")
                    bokeh_fig.add_layout(slope_line)

                    figure_row.append(bokeh_fig)

                ##################################################################
                # For the second scatterplot
                if (reaction_percentiles is not None):

                    ##################################################################
                    # Set up parent figure object

                    bokeh_fig = figure(tooltips=TOOLTIPS,
                                       x_range=(0.0, 1.0),
                                       y_range=(0.0, 1.0))
                    bokeh_fig.xaxis.axis_label = columns[first_column]
                    bokeh_fig.yaxis.axis_label = columns[second_column]
                    bokeh_fig.xaxis.formatter = NumeralTickFormatter(
                        format="0.0")
                    bokeh_fig.yaxis.formatter = NumeralTickFormatter(
                        format="0.0")
                    bokeh_fig.title.text = "Model Reactions Percentile Rank (p<0.01)"

                    ##################################################################
                    # The data is transformed into ColumnDataSource object to allow for CustomJS to work
                    # The source_dict stores the data after it's been transformed into ColumnDataSource
                    # The scatter_dict stores the individual bokeh scatterplots for rendering in CustomJS
                    source_dict = dict()
                    scatter_dict = dict()

                    ##################################################################
                    # For the background data, all the data is captured under a single 'All' key
                    # It is added first, so that it will always be in the background
                    # It is intentionally made visible and won't be changed in the CustomJS
                    # Transform
                    source = ColumnDataSource(
                        data=dict(reaction_percentiles['All']))
                    # Store transformation
                    source_dict['All'] = source
                    # Plot as black and visible
                    scatter_fig = bokeh_fig.circle(x=columns[first_column],
                                                   y=columns[second_column],
                                                   source=source,
                                                   color='color',
                                                   size='size',
                                                   fill_alpha='fill_alpha',
                                                   visible=True)
                    # Store plot
                    scatter_dict['All'] = scatter_fig

                    ##################################################################
                    # For the foreground data, the scatter plot for each subsystem is create
                    # separately, but made invisible, to be used with the Select dropdown

                    for scatter in reaction_percentiles.keys():
                        # Not using the 'All' background data
                        if (scatter == 'All'):
                            continue

                        # Transform
                        source = ColumnDataSource(
                            data=dict(reaction_percentiles[scatter]))
                        # Store transformation
                        source_dict[scatter] = source
                        # Plot as red but not visible
                        scatter_fig = bokeh_fig.circle(
                            x=columns[first_column],
                            y=columns[second_column],
                            source=source,
                            color='color',
                            size='size',
                            fill_alpha='fill_alpha',
                            visible=False)
                        # Store plot
                        scatter_dict[scatter] = scatter_fig

                    # Add red central slope
                    slope_line = Slope(gradient=1,
                                       y_intercept=0,
                                       line_color="red")
                    bokeh_fig.add_layout(slope_line)

                    # Add parent figure to row of figures
                    figure_row.append(bokeh_fig)

                    # Add subsystem selector
                    # Starts with default value of "None" and allows user to pick one
                    # whereupon, according to CustomJS code below, it'll become visible
                    subsystem_select = Select(title="Select Subsystem:",
                                              value="None",
                                              options=['None'] +
                                              sorted(category_select))

                    # Add JS callback
                    callback = CustomJS(args=dict(
                        source=source_dict,
                        figs=scatter_dict,
                        subsystem_select=subsystem_select),
                                        code="""
console.log("Updating")
for (let scatter in source){

    // Only choose subsystem
    if(scatter == 'All'){
        continue
    }

    // Chosen subsystem
    if(scatter == subsystem_select.value){
        
        figs[scatter].visible=true

        // Iterate through datapoints to make sure they are red and of a larger size
        for (let i = 0; i < source[scatter].data['color'].length; i++) {
            // This is where I would scale with p-value
            source[scatter].data['color'][i] = 'red' 
            source[scatter].data['size'][i] = 8
        }

    } else {

        // Here we have to make sure that the non-chosen subsystems are not visible
        figs[scatter].visible=false

        // default values, but this is really un-necessary
        for (let i = 0; i < source[scatter].data['color'].length; i++) {
            source[scatter].data['color'][i] = 'black'
            source[scatter].data['size'][i] = 6
        }
    }
    // Actually show change in plot
    source[scatter].change.emit()
}
""")
                    subsystem_select.js_on_change('value', callback)

                    # Add subsystem selector to row of figures
                    figure_row.append(subsystem_select)

                    # Add row of figures to grid
                    figure_grid.append(figure_row)

        return figure_grid
Example #16
0
def figure_scatter_values(df_chisq):
    """Build a grid of scatter plots, one per (continent, window end date).

    Helper columns (row-to-row differences, "start" positions and a date
    string) are added to ``df_chisq`` in place.  The data is then cut into
    trailing windows: stepping back from the latest date in 14-day strides,
    the last 14 rows of every ``CountryProv`` up to that date are collected.
    Every (``Continent``, window end) group gets its own figure showing the
    points, an arrow from the previous position to the current one, and a
    text label on the row whose date equals the window end.

    Parameters
    ----------
    df_chisq : pandas.DataFrame
        Must provide the columns read below: ``case_ma07``, ``tests_ma07``,
        ``case_detrended``, ``caseDet_pct``, ``Date`` (datetime-like, the
        ``.dt`` accessor is used), ``CountryProv``, ``Continent`` and
        ``cp_code``.  The frame is modified in place.

    Returns
    -------
    The Bokeh layout produced by ``column(gridplot(...))``, with at most
    three figures per row.

    Raises
    ------
    Exception
        If no window could be built ("No data in moving window").
    """
    # Function-scope imports, as in the original code, gathered in one place.
    import datetime as dt
    from itertools import zip_longest

    from bokeh.models import Arrow, LabelSet, VeeHead

    # Row-to-row differences of each series.
    df_chisq["casema07_diff07"] = df_chisq.case_ma07.diff(periods=1)
    df_chisq["testsma07_diff07"] = df_chisq.tests_ma07.diff(periods=1)
    df_chisq["casedet_diff07"] = df_chisq.case_detrended.diff(periods=1)
    df_chisq["casedetpct_diff07"] = df_chisq.caseDet_pct.diff(periods=1)
    # Only referenced by a glyph that is currently disabled; the column is
    # still written so the resulting data sources keep the same schema.
    df_chisq[
        "angle"] = df_chisq.testsma07_diff07 / df_chisq.casema07_diff07 * 3.14
    # "start" = current value minus the difference, i.e. the previous row's
    # value; used as the tail of each arrow.
    df_chisq["casema07_start"] = df_chisq.case_ma07 - df_chisq.casema07_diff07
    df_chisq[
        "testsma07_start"] = df_chisq.tests_ma07 - df_chisq.testsma07_diff07
    df_chisq[
        "casedet_start"] = df_chisq.case_detrended - df_chisq.casedet_diff07
    df_chisq[
        "casedetpct_start"] = df_chisq.caseDet_pct - df_chisq.casedetpct_diff07
    df_chisq["dt_str"] = df_chisq.Date.dt.strftime("%Y-%m-%d")

    # FIXME (left open by the original author)

    print("gathering moving 14-day windows")
    df_sub = df_chisq
    dtmax_n = df_sub.Date.unique().max()
    dtmin_n = df_sub.Date.unique().min()
    # Window end dates: from the latest date backwards in 14-day strides.
    dt_range = np.arange(dtmax_n, dtmin_n, dt.timedelta(days=-14))

    windows = []
    for dt_i in dt_range:
        dt_delta = (dt_i - dtmin_n).astype('timedelta64[D]').astype(int)
        # Skip window ends that lie less than 14 days after the first date.
        if dt_delta < 14:
            continue
        print(dt_i, dt_delta)

        df_i = df_sub[df_sub.Date <= dt_i]
        df_i = df_i.groupby("CountryProv").apply(
            lambda g: g.tail(14)).reset_index(drop=True)
        df_i["color"] = "#73b2ff"
        df_i["dtLast"] = dt_i
        windows.append(df_i)

    if not windows:
        raise Exception("No data in moving window")
    df_latest = pd.concat(windows, axis=0)
    # Label text is only shown on the last row of each window.
    df_latest["display_cpcode"] = df_latest.apply(
        lambda g: "" if g.dtLast != g.Date else g.cp_code, axis=1)
    print("done")

    # A CDSView cannot be used with LabelSet / Arrow (per the original
    # author's notes), so a separate data source is created for every
    # (Continent, dtLast) group instead of filtering one shared source.
    srcLatest_continent = df_latest.groupby(
        ["Continent", "dtLast"]).apply(lambda g: ColumnDataSource(g))
    srcLatest_continent = srcLatest_continent.reset_index().rename(
        columns={0: "src"})

    # General-use reference lines; the same two annotation objects are added
    # to every figure.
    slope_y0 = Slope(gradient=0,
                     y_intercept=0,
                     line_color='orange',
                     line_width=50)
    # np.inf instead of np.Inf: the capitalised alias was removed in NumPy 2.0.
    # NOTE(review): presumably meant as a vertical line through x=0; whether
    # Bokeh renders an infinite gradient was not verified here.
    slope_x0 = Slope(gradient=np.inf,
                     y_intercept=0,
                     line_color='orange',
                     line_width=50)

    TOOLTIPS = [
        ("Country/Region", "@CountryProv"),
        ("Date", "@dt_str"),
    ]
    # One tuple per plot family:
    # (key, x column, y column, x start column, y start column, x label, y label).
    # Only the 'values' family is active, so p_all['diffs'] stays empty.
    params = (
        ('values', 'case_detrended', 'case_ma07', 'casedet_start',
         'casema07_start', 'detrended(cases)', 'ma07(Cases)'),
    )
    p_all = {'values': [], 'diffs': []}
    for k, fdxv, fdyv, fdxs, fdys, labx, laby in params:
        p_cont = []
        for _, srcCont_i in srcLatest_continent.iterrows():
            print("Adding plot for %s, %s" %
                  (srcCont_i.Continent, srcCont_i.dtLast))

            p_d1 = figure(plot_width=600,
                          plot_height=400,
                          tooltips=TOOLTIPS,
                          title="%s %s" %
                          (srcCont_i.Continent, srcCont_i.dtLast))

            p_d1.scatter(fdxv, fdyv, source=srcCont_i.src, size=3, color='red')
            # Arrow from the previous position to the current one.
            p_d1.add_layout(
                Arrow(end=VeeHead(size=6),
                      x_start=fdxs,
                      y_start=fdys,
                      x_end=fdxv,
                      y_end=fdyv,
                      line_color='blue',
                      source=srcCont_i.src))

            p_d1.xaxis.axis_label = labx
            p_d1.yaxis.axis_label = laby
            labels = LabelSet(x=fdxv,
                              y=fdyv,
                              text='display_cpcode',
                              level='glyph',
                              x_offset=5,
                              y_offset=5,
                              source=srcCont_i.src,
                              render_mode='canvas')
            p_d1.add_layout(labels)
            p_d1.add_layout(slope_y0)
            p_d1.add_layout(slope_x0)
            p_cont.append(p_d1)

        p_all[k] = p_cont

    # Group plots into rows of 3 (https://stackoverflow.com/a/1625013/4126114):
    # zipping the same iterator three times yields consecutive triples, and
    # the None padding of the last row is dropped again.
    for k in ['values', 'diffs']:
        rows = zip_longest(*(iter(p_all[k]), ) * 3)
        p_all[k] = [[e for e in t if e is not None] for t in rows]

    g = gridplot(p_all['values'] + p_all['diffs'])
    layout = column(g)

    return layout
Example #17
0
def scatter(x,
            y,
            label=None,
            group=None,
            title="Scatter Plot",
            xlabel="x",
            ylabel="y",
            width=600,
            height=600,
            legend=True,
            size=4,
            shape="circle",
            font_size="16pt",
            label_font_size="13pt",
            col_palette=None,
            hover_xy=True,
            gradient=False,
            gradient_alt=False,
            hline=False,
            vline=False,
            xrange=None,
            yrange=None,
            ci95=False,
            scatterplot=True,
            extraci95_x=False,
            extraci95_y=False,
            extraci95=False):
    """Creates a scatterplot using Bokeh.

    Required Parameters
    -------------------
    x : array-like, shape = [n_samples]
        Input data for x-axis.

    y : array-like, shape = [n_samples]
        Input data for y-axis.

    Optional Parameters
    -------------------
    label : pandas DataFrame or Series, optional
        Extra per-point values shown in the hover tooltip.  When None an
        ``Idx`` column holding the input order is used.

    group : array-like, optional
        Class of each point.  The two smallest unique values are drawn with
        ``col_palette[0]`` and ``col_palette[1]``, everything else (and all
        points when ``group`` is None) with ``col_palette[2]``.

    title, xlabel, ylabel, width, height, xrange, yrange
        Passed to the Bokeh figure.

    legend : bool
        Show the legend (bottom right) when True, hide it otherwise.

    size : marker size;  shape : "circle" or "triangle".

    font_size, label_font_size : title and axis-label font sizes.

    col_palette : list of three colours, default ["red", "blue", "green"].

    hover_xy : bool
        Include the x and y values in the tooltip.

    gradient : number or False
        When not False, draws a line with this gradient plus a dashed line
        perpendicular to it.  Both pass through the origin, or through
        (0.5, 0.5) when ``gradient_alt`` is not False.

    hline, vline : draw a horizontal / vertical line through 0.

    ci95 : bool
        Draw per-group 95% confidence ellipses and rescale both axes to a
        symmetric range around them.

    scatterplot : bool
        Draw the markers (and their hover tool).

    extraci95, extraci95_x, extraci95_y
        When ``ci95`` and ``extraci95`` are True, a second, dashed set of
        ellipses is drawn for these scores (both arrays must be given).

    Returns
    -------
    The Bokeh figure.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length, or ``shape`` is not supported.
    """

    # Error check
    if len(x) != len(y):
        raise ValueError("length of X does not match length of Y.")

    # If label is None, give an index based on input order
    if label is None:
        label_copy = {"Idx": list(range(len(x)))}
    else:
        # Work on a copy so the caller's label object is never overwritten.
        label2 = label.copy()
        try:
            label_copy = label2.to_dict("series")
        except TypeError:
            # A Series rejects the "series" orient; use its name/values.
            label_copy = {label2.name: label2.values.tolist()}

    # If colour palette is None (default):
    if col_palette is None:
        col_palette = ["red", "blue", "green"]

    # Group is None or allow for multiple classes
    if group is None:
        group_copy = [None] * len(x)
        col = [col_palette[2]] * len(x)
    else:
        group_copy = group.copy()
        group_unique = np.sort(np.unique(group_copy))
        col = []
        for i in range(len(group_copy)):
            if group_copy[i] == group_unique[0]:
                col.append(col_palette[0])
            elif group_copy[i] == group_unique[1]:
                col.append(col_palette[1])
            else:
                col.append(col_palette[2])

    # Bokeh data source with data labels
    data = {"x": x, "y": y, "group": group_copy, "col": col}
    data_label = dict(label_copy)
    data.update(data_label)
    source = ColumnDataSource(data=data)

    # Tool-tip (add everything in label_copy)
    TOOLTIPS = []
    if hover_xy is True:
        TOOLTIPS = [("x", "@x{1.111}"), ("y", "@y{1.111}")]
    for name in data_label:
        TOOLTIPS.append((str(name), "@" + str(name)))

    # Base figure
    fig = figure(title=title,
                 x_axis_label=xlabel,
                 y_axis_label=ylabel,
                 plot_width=width,
                 plot_height=height,
                 x_range=xrange,
                 y_range=yrange)

    # Add to plot
    if scatterplot is True:
        # Compare by value: `is` against a string literal only works when the
        # interpreter happens to intern both strings.
        if shape == "circle":
            marker = fig.circle
        elif shape == "triangle":
            marker = fig.triangle
        else:
            raise ValueError("shape has to be either 'circle' or 'triangle'.")
        renderer = marker("x",
                          "y",
                          size=size,
                          alpha=0.6,
                          color="col",
                          source=source)
        fig.add_tools(HoverTool(renderers=[renderer], tooltips=TOOLTIPS))

    if gradient is not False:

        def _intercept(g):
            # Through the origin by default; through (0.5, 0.5) in alt mode.
            return 0 if gradient_alt is False else 0.5 - g * 0.5

        fig.add_layout(
            Slope(gradient=gradient,
                  y_intercept=_intercept(gradient),
                  line_color="black",
                  line_width=2,
                  line_alpha=0.3))
        if gradient == 0:
            # The perpendicular of a horizontal line is vertical, which has
            # no finite gradient: draw it as a Span instead of dividing by 0.
            perpendicular = Span(location=0 if gradient_alt is False else 0.5,
                                 dimension="height",
                                 line_color="black",
                                 line_dash="dashed",
                                 line_width=2,
                                 line_alpha=0.10)
        else:
            new_gradient = -(1 / gradient)
            perpendicular = Slope(gradient=new_gradient,
                                  y_intercept=_intercept(new_gradient),
                                  line_color="black",
                                  line_dash="dashed",
                                  line_width=2,
                                  line_alpha=0.10)
        fig.add_layout(perpendicular)

    if hline is not False:
        h = Span(location=0,
                 dimension="width",
                 line_color="black",
                 line_width=3,
                 line_alpha=0.15)
        fig.add_layout(h)

    if vline is not False:
        v = Span(location=0,
                 dimension="height",
                 line_color="black",
                 line_width=3,
                 line_alpha=0.15)
        fig.add_layout(v)

    # if ci95 is true
    if ci95 is True:
        # Without groups every point belongs to one dummy group 0.
        group_label = [0] * len(x) if group is None else group_copy

        max_val = _scatter_add_ci95(fig, x, y, group_label, size, extra=False)
        if extraci95 is True:
            max_val += _scatter_add_ci95(fig,
                                         extraci95_x,
                                         extraci95_y,
                                         group_label,
                                         size,
                                         extra=True)

        # Symmetric axis range with 5% padding around the largest ellipse.
        max_range = max(max_val)
        new_range_min = -max_range - 0.05 * max_range
        new_range_max = max_range + 0.05 * max_range
        fig.y_range = Range1d(new_range_min, new_range_max)
        fig.x_range = Range1d(new_range_min, new_range_max)

    # Font-sizes
    fig.title.text_font_size = font_size
    fig.xaxis.axis_label_text_font_size = label_font_size
    fig.yaxis.axis_label_text_font_size = label_font_size

    # Extra padding
    fig.min_border_left = 20
    fig.min_border_right = 20
    fig.min_border_top = 20
    fig.min_border_bottom = 20

    # Show or remove legend
    if legend is True:
        fig.legend.visible = True
        fig.legend.location = "bottom_right"
    else:
        fig.legend.visible = False

    return fig


def _scatter_add_ci95(fig, x_score, y_score, group_label, size, extra):
    """Draw the 95% confidence ellipses of every group onto ``fig``.

    For each unique value in ``group_label`` the "mean" and "pop" ellipses
    returned by ``ci95_ellipse`` are drawn as outline plus shaded patch, and
    an x marker is placed at the median of the mean-ellipse coordinates.
    ``extra=False`` draws the primary set (solid mean outline with a legend
    entry); ``extra=True`` draws the additional set (dashed, no legend).

    Returns a list with the largest absolute "pop" ellipse coordinate of
    each group.
    """
    # Colours are reused cyclically when there are more groups than colours.
    palette = [
        "red", "blue", "green", "black", "orange", "yellow", "brown", "cyan"
    ]
    unique_group = np.sort(np.unique(group_label))

    extents = []
    for i, grp in enumerate(unique_group):
        color = palette[i % len(palette)]

        # Get scores for the corresponding group
        members = [
            j for j in range(len(group_label)) if group_label[j] == grp
        ]
        data_circ_group = pd.DataFrame({
            "0": [x_score[j] for j in members],
            "1": [y_score[j] for j in members]
        })

        # Calculate ci95 ellipse for each group
        m, _ = ci95_ellipse(data_circ_group, type="mean")
        p, _ = ci95_ellipse(data_circ_group, type="pop")

        # Plot ci95 ellipse outer line
        mean_style = dict(color=color, line_width=2, alpha=0.8)
        pop_style = dict(color=color, alpha=0.4)
        if extra:
            mean_style["line_dash"] = "dashed"
            pop_style["line_dash"] = "dashed"
        else:
            mean_style["line_dash"] = "solid"
            mean_style["legend"] = "{}".format(grp)
        fig.line(m[:, 0], m[:, 1], **mean_style)
        fig.line(p[:, 0], p[:, 1], **pop_style)

        # Plot ci95 ellipse shade
        fig.patch(m[:, 0], m[:, 1], color=color, alpha=0.07)
        fig.patch(p[:, 0], p[:, 1], color=color, alpha=0.01)
        fig.x(np.median(m[:, 0]),
              np.median(m[:, 1]),
              size=size,
              alpha=0.6,
              color=color,
              line_width=2)

        extents.append(max(np.abs(p).flatten()))

    return extents
Example #18
0
# Coefficients of the fitted line y = gradient * x + y_intercept
# (taken from an earlier simple linear regression).
gradient = 0.94
y_intercept = 0.1966

# Create the figure; the y-axis runs from 0 to 10% above the largest y value.
p = figure(
    title="Scatterplot with fitted regression line",
    plot_width=647,
    plot_height=480,
    y_range=(0, 1.1 * max(ypts)),
)

# Draw the observations as grey circles.
p.circle(xpts, ypts, color="#aeb3b7", size=10)

# Overlay the regression line as a dotted annotation.
slope = Slope(
    gradient=gradient,
    y_intercept=y_intercept,
    line_width=2,
    line_dash='dotted',
    line_color='#3a6587',
)
p.add_layout(slope)

# Drop the grid lines in both directions to reduce chart clutter.
p.ygrid.grid_line_color = None
p.xgrid.grid_line_color = None

# The outline cannot be removed with a width of 0, so it is made almost
# invisible with a very thin line instead.
p.outline_line_width = 0.1

# Label the x-axis.
p.xaxis.axis_label = "Independent Variable"
Example #19
0
def permutation_test(model, X, Y, nperm=100, folds=8, grid_line=True):
    """Creates permutation test plots using Bokeh.

    The model is scored on the true labels (full fit and stratified k-fold
    cross-validation), then re-trained and re-scored ``nperm`` times on
    shuffled labels.  Two figures are returned side by side: R²/Q² against
    the correlation between shuffled and true labels, and the density of the
    permuted R²/Q² values with the true values marked as "lollipops".

    Required Parameters
    -------------------

    model : object
        Must provide ``train(X, Y)`` and ``test(X)``.  A deep copy is used,
        so the caller's model is not re-trained.

    X : array-like, shape = [n_samples, n_features]
        Predictor variables, where n_samples is the number of samples and n_features is the number of predictors.

    Y : array-like, shape = [n_samples, 1]
        Response variables, where n_samples is the number of samples.

    Optional Parameters
    -------------------

    nperm : number of label permutations.

    folds : number of stratified cross-validation folds.

    grid_line : when equal to False the grid lines of both figures are hidden.

    Returns
    -------
    The Bokeh ``gridplot`` holding both figures in one row.
    """

    model = deepcopy(model)

    # Statistics on the true labels: full data first, then cross-validated.
    true_folds = _permtest_folds(X, Y, folds)
    true_full = binary_metrics(Y, model.test(X))
    true_cv = binary_metrics(Y, _permtest_cv_predict(model, X, Y, true_folds))

    # Each row: [R2, Q2, correlation with the true labels].
    stats = [[true_full["R²"], true_cv["R²"], 1]]

    # Same statistics for every shuffled copy of Y.
    for _ in tqdm(range(nperm), desc="Permutation Resample"):
        Y_shuff = Y.copy()
        np.random.shuffle(Y_shuff)

        model.train(X, Y_shuff)
        perm_full = binary_metrics(Y_shuff, model.test(X))

        perm_folds = _permtest_folds(X, Y_shuff, folds)
        perm_cv = binary_metrics(
            Y_shuff, _permtest_cv_predict(model, X, Y_shuff, perm_folds))

        # Absolute Pearson correlation between shuffled and true labels.
        corr = abs(np.corrcoef(Y_shuff, Y)[0, 1])
        stats.append([perm_full["R²"], perm_cv["R²"], corr])

    # Column-wise view of the collected rows; index 0 is the true model.
    stats_r2 = [row[0] for row in stats]
    stats_q2 = [row[1] for row in stats]
    stats_corr = [row[2] for row in stats]

    # Lines through the true value (at correlation 1) and the mean of the
    # permuted values (at their mean correlation).
    corr_gap = 1 - np.mean(stats_corr[1:])
    r2gradient = (stats_r2[0] - np.mean(stats_r2[1:])) / corr_gap
    q2gradient = (stats_q2[0] - np.mean(stats_q2[1:])) / corr_gap
    r2yintercept = stats_r2[0] - r2gradient
    q2yintercept = stats_q2[0] - q2gradient

    # ---- Figure 1: R² / Q² against correlation ----
    source = ColumnDataSource(
        data={"corr": stats_corr, "r2": stats_r2, "q2": stats_q2})
    fig1 = figure(plot_width=470,
                  plot_height=410,
                  x_range=(-0.15, 1.15),
                  x_axis_label="Correlation",
                  y_axis_label="R² & Q²")

    line_style = dict(line_color="black", line_width=2, line_alpha=0.3)
    fig1.add_layout(
        Slope(gradient=r2gradient, y_intercept=r2yintercept, **line_style))
    fig1.add_layout(
        Slope(gradient=q2gradient, y_intercept=q2yintercept, **line_style))

    r2_square = fig1.square("corr",
                            "r2",
                            size=6,
                            alpha=0.5,
                            color="red",
                            legend="R²",
                            source=source)
    q2_square = fig1.square("corr",
                            "q2",
                            size=6,
                            alpha=0.5,
                            color="blue",
                            legend="Q²",
                            source=source)

    fig1.add_tools(
        HoverTool(renderers=[r2_square], tooltips=[("R² Value", "@r2")]))
    fig1.add_tools(
        HoverTool(renderers=[q2_square], tooltips=[("Q² Value", "@q2")]))

    fig1.min_border_left = 20
    fig1.min_border_right = 20
    fig1.min_border_top = 20
    fig1.min_border_bottom = 20
    fig1.legend.location = "bottom_right"

    # ---- Figure 2: density of the permuted values ----
    # R² density is drawn above the axis, Q² density mirrored below it.
    x1_grid, x1_pdf_grid = _permtest_density(stats_r2[1:])
    x2_grid, x2_pdf_grid = _permtest_density(stats_q2[1:])
    x2_pdf_grid = [-x for x in x2_pdf_grid]
    pdf_top = max(x1_pdf_grid)
    pdf_bottom = min(x2_pdf_grid)

    fig2 = figure(plot_width=470,
                  plot_height=410,
                  x_range=(min(x2_grid) * 1.1,
                           max(stats_r2[0], max(x1_grid)) + 0.65),
                  y_range=((pdf_bottom - 1) * 1.2, (pdf_top + 1) * 1.1),
                  x_axis_label="R² & Q²",
                  y_axis_label="p.d.f.")
    fig2.add_layout(
        Span(location=0,
             dimension="width",
             line_color="black",
             line_width=2,
             line_alpha=0.3))

    patch_style = dict(alpha=0.35, line_color="grey", line_width=1)
    fig2.patch(x1_grid, x1_pdf_grid, color="red", **patch_style)
    fig2.patch(x2_grid, x2_pdf_grid, color="blue", **patch_style)

    fig2.min_border_left = 60
    fig2.min_border_right = 20
    fig2.min_border_top = 20
    fig2.min_border_bottom = 20

    # Lollipop for the true R²: stem from the axis up to a dot above the
    # density, annotated with the t-test p-value.
    data2_manu = _permtest_format_p(
        ttest_ind(stats_r2[1:], [stats_r2[0]], alternative="smaller")[1])
    source2 = ColumnDataSource(
        data={
            "x": [stats_r2[0]],
            "y": [pdf_top + 1],
            "hover": [data2_manu]
        })
    source2_line = ColumnDataSource(
        data={
            "x": [stats_r2[0], stats_r2[0]],
            "y": [pdf_top + 1, 0],
            "hover": [data2_manu, data2_manu]
        })
    fig2.line("x", "y", line_width=2, line_color="red", source=source2_line)
    fig2.circle("x",
                "y",
                fill_color="red",
                size=6,
                legend="R²",
                source=source2)

    # Lollipop for the true Q², hanging below the axis.
    data3_manu = _permtest_format_p(
        ttest_ind(stats_q2[1:], [stats_q2[0]], alternative="smaller")[1])
    source3 = ColumnDataSource(
        data={
            "x": [stats_q2[0]],
            "y": [pdf_bottom - 1],
            "hover": [data3_manu]
        })
    source3_line = ColumnDataSource(
        data={
            "x": [stats_q2[0], stats_q2[0]],
            "y": [(pdf_bottom - 1), 0],
            "hover": [data3_manu, data3_manu]
        })
    fig2.line("x", "y", line_width=2, line_color="blue", source=source3_line)
    fig2.circle("x",
                "y",
                fill_color="blue",
                size=6,
                legend="Q²",
                source=source3)

    # Text next to both lollipops.
    textr2 = "True R²\nP-Value: {}".format(data2_manu)
    textq2 = "True Q²\nP-Value: {}".format(data3_manu)
    fig2.text(x=[stats_r2[0] + 0.05, stats_q2[0] + 0.05],
              y=[(pdf_top + 0.5), (pdf_bottom - 1.5)],
              text=[textr2, textq2],
              angle=0,
              text_font_size="8pt")

    # Font sizes and legend position.
    fig1.xaxis.axis_label_text_font_size = "13pt"
    fig1.yaxis.axis_label_text_font_size = "13pt"
    fig2.xaxis.axis_label_text_font_size = "12pt"
    fig2.yaxis.axis_label_text_font_size = "12pt"
    fig2.legend.location = "top_left"

    # Remove grid lines
    if grid_line == False:
        for grid in (fig1.xgrid, fig1.ygrid, fig2.xgrid, fig2.ygrid):
            grid.visible = False

    return gridplot([[fig1, fig2]])


def _permtest_folds(X, Y, folds):
    """Return the (train indices, test indices) pairs of a stratified k-fold split."""
    splitter = StratifiedKFold(n_splits=folds)
    return [(train, test) for train, test in splitter.split(X, Y)]


def _permtest_cv_predict(model, X, Y, fold_indices):
    """Train on each fold's training rows and collect the held-out predictions.

    Returns a list aligned with ``Y`` in which every position holds the
    prediction made while that sample was in the test fold.
    """
    predictions = [None] * len(Y)
    for train_idx, test_idx in fold_indices:
        X_train = X[train_idx, :]
        Y_train = Y[train_idx]
        X_test = X[test_idx, :]
        model.train(X_train, Y_train)
        fold_pred = model.test(X_test)
        for idx, val in zip(test_idx, fold_pred):
            predictions[idx] = val.tolist()
    return predictions


def _permtest_density(values):
    """Evaluate a Gaussian KDE of ``values`` on 50 points spanning the padded data range."""
    sample = np.array(values)
    low, high = sample.min(), sample.max()
    padding = (high - low) * 0.6
    grid = np.linspace(low - padding, high + padding, 50)
    kde = scipy.stats.gaussian_kde(sample, "scott")
    return grid, kde(grid)


def _permtest_format_p(p_value):
    """Format a p-value with two decimals, switching to scientific notation at or below 0.005."""
    if p_value > 0.005:
        return "%0.2f" % p_value
    return "%0.2e" % p_value
import numpy as np

from bokeh.models import Slope
from bokeh.plotting import figure, output_file, show

# Demo script: scatter noisy samples of a straight line and overlay the
# underlying line as a Slope annotation, writing the result to slope.html.
output_file("slope.html", title="slope.py example")

# Parameters of the underlying line y = gradient * x + y_intercept
gradient = 2
y_intercept = 10

# Twenty integer x positions, with y values perturbed by Gaussian noise
# (mean 0, standard deviation 4)
xpts = np.arange(0, 20)
noise = np.random.normal(0, 4, 20)
ypts = gradient * xpts + y_intercept + noise

# Leave 10% headroom above the largest sample on the y axis
y_upper = 1.1 * max(ypts)
p = figure(plot_width=450, plot_height=450, y_range=(0, y_upper))
p.xaxis.axis_label = 'x'
p.yaxis.axis_label = 'y'

p.circle(xpts, ypts, size=5, color="skyblue")

# The noise-free line, drawn as an annotation rather than a glyph
slope = Slope(
    gradient=gradient,
    y_intercept=y_intercept,
    line_color='orange',
    line_dash='dashed',
    line_width=3.5,
)
p.add_layout(slope)

show(p)
Example #21
0
                    y_axis_label='Slugging Percentage',
                    y_axis_type='linear',
                    y_range=(0.28, 0.75),
                    title='Hall of Fame OPS Components',
                    tools='hover',
                    tooltips=tooltip,
                    toolbar_location=None)

# One translucent blue dot per row of `hofbat_cds`, positioned by its
# 'OBP' (x) and 'SLG' (y) columns.
obpslg_fig.circle(
    x='OBP',
    y='SLG',
    radius=0.0025,
    alpha=0.5,
    color='blue',
    source=hofbat_cds,
)

# Reference lines with gradient -1: every point on a line has the same
# x + y (OBP + SLG) total, equal to that line's y-intercept (0.7 to 1.0).
slope7 = Slope(gradient=-1, y_intercept=0.7, line_color='orange', line_width=1)
slope8 = Slope(gradient=-1, y_intercept=0.8, line_color='red', line_width=1)
slope9 = Slope(gradient=-1, y_intercept=0.9, line_color='white', line_width=1)
slope10 = Slope(gradient=-1, y_intercept=1.0, line_color='green', line_width=1)

# Attach the lines to the figure in ascending order of intercept
for reference_line in (slope7, slope8, slope9, slope10):
    obpslg_fig.add_layout(reference_line)

erahrr_fig = figure(x_axis_label='Home Run Rate',
                    x_axis_type='linear',
                    x_range=(0, 0.09),
                    y_axis_label='Era',
                    y_range=labels,
                    title='Home Run Rates by Era',
Example #22
0
    def bokeh_scatter_plot(self, tool1, tool2, **kwargs):
        """Return (and optionally show) an interactive scatter plot
        comparing 2 tools, rendered with the bokeh library.

        Needs bokeh. colorcet is needed only when `merge_same` is `True`
        and no `palette` is supplied.

        Always returns the figure created by `bokeh.plotting.figure`.
        This can be used to further tune the plot.

        `tool1` (axis `x`) and `tool2` (axis `y`) are the names of the
        compared tools; they are passed to `self.get_plot_data` and used
        as column names of the returned data.

        Possible kwargs
        ===============
        `show` : Bool
            if `True` (default), call `bokeh.plotting.output_notebook`
            and show the plot
        `col` : String
            name of ltlcross metric to plot, `states` by default
        `merge_same` : Bool
            if `True` (default), merge same instances and add colorbar
            for count, see `add_count` of `self.get_plot_data`.
        `include_equal` : Bool
            if `False` (default) do not include formulas with the same
            values for both tools

        And we have 4 arguments that control the appearance of the plot
        `palette` : color palette to use if `merge_same` is `True`
            default : `bgy` from `colorcet`
        `marker_color` : color to use if `merge_same` is `False`
            default : "navy"
        `alpha` : alpha of marks
            default `1` if `merge_same` and `.3` otherwise
        `marker_size` : int
            default `10`

        `palette` and `marker_color` are always consumed here, even when
        the value of `merge_same` makes them irrelevant, so they are never
        forwarded to bokeh.

        All remaining kwargs are supplied to the `scatter` method of the
        figure.
        """
        from bokeh.models import ColumnDataSource, CustomJS, ColorBar, TapTool, HoverTool, Slope
        from bokeh.transform import linear_cmap
        import bokeh.plotting as bplt

        # Get the arguments
        merge_same = kwargs.pop("merge_same", True)
        alpha = kwargs.pop("alpha", 1 if merge_same else .3)
        marker_size = kwargs.pop("marker_size", 10)
        show = kwargs.pop("show", True)
        include_equal = kwargs.pop("include_equal", False)
        col = kwargs.pop("col", "states")
        # Pop both appearance options unconditionally: whichever one does not
        # apply to the current `merge_same` mode must not leak into the
        # kwargs that are forwarded to `p.scatter` below.
        palette = kwargs.pop("palette", None)
        marker_color = kwargs.pop("marker_color", "navy")
        if merge_same and palette is None:
            # Import colorcet only when its default palette is really needed
            import colorcet as cc
            palette = cc.bgy

        # Make the graph render in notebooks
        if show:
            bplt.output_notebook()

        # Create the basic plot object
        p = bplt.figure(title=f"Numbers of {col}")
        p.xaxis.axis_label = f"{tool1}"
        p.yaxis.axis_label = f"{tool2}"

        # Prepare the data
        data = self.get_plot_data(tool1, tool2, add_count=merge_same, include_equal=include_equal, col=col)
        if not merge_same:
            # We want to have the form_id and formula fields available for tooltip
            data = data.reset_index()
        source = ColumnDataSource(data)

        # Tooltips; the doubled braces produce `@{name}` so that column
        # names are wrapped in braces in the tooltip field reference
        tooltips = [
            (tool1, f"@{{{tool1}}}"),
            (tool2, f"@{{{tool2}}}"),
        ]

        if merge_same:
            # Map count of cases to color
            mapper = linear_cmap(palette=palette, field_name="count", low=1, high=data["count"].max())
            color = mapper

            # Add count to tooltip
            tooltips.append(("count", "@count"))

            # Print command to display selected formulas
            callback = CustomJS(args=dict(source=source), code=f"""
                // Select the data (values of the first selected point)
                var inds = source.selected.indices;
                var data = source.data;
                var x = data['{tool1}'][inds[0]];
                var y = data['{tool2}'][inds[0]];

                // Create the two commands
                var fst_row = "data = a.get_plot_data('{tool1}','{tool2}',add_count=False)";
                var snd_row = "data[(data['{tool1}'] == " + x + ") & (data['{tool2}'] == " + y + ")]";

                // Instructions
                var instructions = "Use the following code to list the formulas.\\n";
                instructions += "Replace `a` with the ResAnalyzer` object:\\n\\n"
                alert(instructions + fst_row + "\\n" + snd_row);
                """)
        else:
            color = marker_color
            tooltips.append(("formula id", "@form_id"))

            # Print ids and formulas of the selected points
            callback = CustomJS(args=dict(source=source), code=f"""
                // Select the data
                var inds = source.selected.indices;
                var data = source.data;

                // Print formulas ids
                var output = data['form_id'][inds[0]];
                for (var i = 1; i < inds.length; i++) {{
                    var f = data['form_id'][inds[i]];
                    output += ', ' + f;
                }}
                output += '\\n'

                // Print formulas (1 per line)
                for (var i = 0; i < inds.length; i++) {{
                    var f = data['formula'][inds[i]];
                    output += f + '\\n';
                }}
                alert(output);
                """)

        # Plot data and add `y=x`
        slope = Slope(gradient=1, y_intercept=0, line_color="orange", line_width=2, line_dash="dashed")
        p.add_layout(slope)
        p.scatter(x=tool1, y=tool2, source=source, color=color, alpha=alpha, size=marker_size, **kwargs)

        # Add the hover & selecting tool
        p.add_tools(TapTool(callback=callback))
        p.add_tools(HoverTool(tooltips=tooltips, mode="mouse"))

        if merge_same:
            color_bar = ColorBar(color_mapper=mapper['transform'], width=16, location=(0, 0))
            p.add_layout(color_bar, 'right')

        if show:
            bplt.show(p)
        return p
Example #23
0
from sklearn import linear_model

# Fit a linear regression of 'price' on the single feature 'horsepower'
# taken from the `train` table.
model = linear_model.LinearRegression()
# reshape(-1, 1) turns the 1-D horsepower values into a single-column 2-D
# array (one row per sample) before fitting
training_x = np.array(train['horsepower']).reshape(-1, 1)
training_y = np.array(train['price'])
model.fit(training_x, training_y)
# `np.asscalar` was deprecated in NumPy 1.16 and removed in 1.23; it was a
# thin wrapper around `ndarray.item()`, which is called directly here.
slope = np.squeeze(model.coef_).item()
intercept = model.intercept_
print('slope:', slope, 'intercept:', intercept)


# In[25]:


# Overlay the fitted regression line on the existing figure `p` and
# display the figure again
from bokeh.models import Slope

best_line = Slope(
    gradient=slope,
    y_intercept=intercept,
    line_color='red',
    line_width=3,
)
p.add_layout(best_line)
show(p)

# In[26]:


from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def predict_metrics(lr, x, y):
    """Score the predictions of a fitted model against known targets.

    `lr` is any object exposing `predict(x)`; its predictions for `x`
    are compared with the true values `y`.

    Returns a tuple `(mae, mse, r2)`: the mean absolute error, the mean
    squared error and the r-squared score, in that order.
    """
    predicted = lr.predict(x)
    return (
        mean_absolute_error(y, predicted),
        mean_squared_error(y, predicted),
        r2_score(y, predicted),
    )
Example #24
0
# Figure comparing the 'AVG_HOME' (x) and 'AVG_VIS' (y) columns of
# `ump_cds`; hover is the only tool and the toolbar is hidden.
ump_fig = figure(
    x_axis_label='Home Avg. Runs',
    x_axis_type='linear',
    x_range=(4, 5.25),
    y_axis_label='Visitor Avg. Runs',
    y_axis_type='linear',
    y_range=(3.75, 5.25),
    title='Home Plate Umpire Average Runs per game',
    tools='hover',
    tooltips=tooltip,
    toolbar_location=None,
)

ump_fig.circle('AVG_HOME', 'AVG_VIS', size=20, color='#006BB6', source=ump_cds)

# Dashed y = x reference line: points on it have equal home and visitor values
slope = Slope(gradient=1, y_intercept=0, line_color='#CE1141', line_dash='dashed', line_width=3)

# Text labels at hard-coded positions
chuck = Label(x=5, y=5.04, text='Chuck Meriwether')
doug = Label(x=4.08, y=3.76, text='Doug Harvey')
alfonso = Label(x=4.8, y=4.09, text='Alfonso Marquez')

# Attach the annotations in the same order as before: line first, then labels
for annotation in (slope, chuck, doug, alfonso):
    ump_fig.add_layout(annotation)

attend_cds = ColumnDataSource(day_attend)
attend_fig = figure(x_axis_label='Day of the Week',
                    x_range=days,
                    y_axis_label='Avg. Attendance',
                    y_axis_type='linear',