Example #1
0
# selectCountry = alt.selection_single(
#     name='Select', # name the selection 'Select'
#     fields=['country'], # limit selection to the country field
#     init={'country': countries[0]}, # use first country entry as initial value
#     bind=alt.binding_select(options=countries) # bind to a menu of unique country values
# )

# Year selection: an interval brush restricted to the x channel, attached to a
# small line chart of the summed `value` per year.
brush = alt.selection_interval(encodings=['x'])
# NOTE(review): `selectCountry` is not defined in the visible code (the only
# definition above is commented out), so the filter below fails with a
# NameError unless the name is defined elsewhere -- TODO confirm.
years = alt.Chart(deaths).mark_line().add_selection(
    brush
).transform_filter(
    # keep only the rows whose `country` equals `selectCountry`
    alt.datum.country == selectCountry
).encode(
    alt.X('year:O', title='Year'),
    alt.Y('sum(value)', title='Smoking Deaths (all ages)')
).properties(
	width=400,
    height=100
)

# Area chart - Smoking deaths by ages
base = alt.Chart(deaths).mark_area().transform_filter(
    alt.datum.country == selectCountry
).transform_filter(
    brush
).encode(
    alt.X('year:O', title='Year'),
    y=alt.Y('value:Q', title='Smoking Deaths by Ages (normalized)', stack="normalize"),
    color=alt.Color('Age:O', scale=alt.Scale(scheme='lightorange')),
Example #2
0
    def plot_distance(
        self,
        rank="auto",
        metric="braycurtis",
        title=None,
        xlabel=None,
        ylabel=None,
        tooltip=None,
        return_chart=False,
        linkage="average",
        label=None,
    ):
        """Plot beta diversity distance matrix as a heatmap and dendrogram.

        Parameters
        ----------
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.
        metric : {'braycurtis', 'manhattan', 'jaccard', 'unifrac', 'unweighted_unifrac'}, optional
            Function to use when calculating the distance between two samples.
        linkage : {'average', 'single', 'complete', 'weighted', 'centroid', 'median'}
            The type of linkage to use when clustering axes.
        title : `string`, optional
            Text label at the top of the plot.
        xlabel : `string`, optional
            Text label along the horizontal axis.
        ylabel : `string`, optional
            Text label along the vertical axis.
        tooltip : `string` or `list`, optional
            A string or list containing strings representing metadata fields. When a point in the
            plot is hovered over, the value of the metadata associated with that sample will be
            displayed in a modal. A list passed here is not modified.
        return_chart : `bool`, optional
            When True, return the combined dendrogram and heatmap chart instead of displaying it.
        label : `string` or `callable`, optional
            A metadata field (or function) used to label each analysis. If passing a function, a
            dict containing the metadata for each analysis is passed as the first and only
            positional argument. The callable function must return a string.

        Raises
        ------
        OneCodexException
            If fewer than two classification results are available.

        Examples
        --------
        Plot the weighted UniFrac distance between all our samples, using counts at the genus level.

        >>> plot_distance(rank='genus', metric='unifrac')
        """
        import altair as alt
        import numpy as np
        import pandas as pd
        from onecodex.viz import dendrogram

        if len(self._results) < 2:
            raise OneCodexException(
                "`plot_distance` requires 2 or more valid classification results."
            )

        # this will be passed to the heatmap chart as a dataframe eventually
        plot_data = {
            "1) Label": [],
            "2) Label": [],
            "Distance": [],
            "classification_id": []
        }

        # here we figure out what to put in the tooltips and get the appropriate data
        if not tooltip:
            tooltip = []
        elif not isinstance(tooltip, list):
            tooltip = [tooltip]

        # build a new list instead of inserting in place, so that a list supplied
        # by the caller is left untouched
        tooltip = ["Label"] + tooltip

        magic_metadata, magic_fields = self._metadata_fetch(tooltip,
                                                            label=label)
        formatted_fields = []

        # every tooltip field gets one column per heatmap axis: "1) <field>" and "2) <field>"
        for magic_field in magic_fields.values():
            field_group = []

            for i in (1, 2):
                field = "{}) {}".format(i, magic_field)
                plot_data[field] = []
                field_group.append(field)

            formatted_fields.append(field_group)

        clust = self._cluster_by_sample(rank=rank,
                                        metric=metric,
                                        linkage=linkage)
        dist_matrix = clust["dist_matrix"]

        # must convert to long format for heatmap plotting
        for idx1, id1 in enumerate(dist_matrix.index):
            for idx2, id2 in enumerate(dist_matrix.index):
                if idx1 == idx2:
                    # the diagonal (a sample against itself) is stored as NaN
                    plot_data["Distance"].append(np.nan)
                else:
                    plot_data["Distance"].append(dist_matrix.iloc[idx1, idx2])

                plot_data["classification_id"].append(id1)

                for field_group, magic_field in zip(formatted_fields,
                                                    magic_fields.values()):
                    plot_data[field_group[0]].append(
                        magic_metadata[magic_field][id1])
                    plot_data[field_group[1]].append(
                        magic_metadata[magic_field][id2])

        plot_data = pd.DataFrame(data=plot_data)

        labels_in_order = magic_metadata["Label"][
            clust["ids_in_order"]].tolist()

        # it's important to tell altair to order the cells in the heatmap according to the clustering
        # obtained from scipy
        alt_kwargs = dict(
            x=alt.X("1) Label:N",
                    axis=alt.Axis(title=xlabel),
                    sort=labels_in_order),
            y=alt.Y("2) Label:N",
                    axis=alt.Axis(title=ylabel, orient="right"),
                    sort=labels_in_order),
            color="Distance:Q",
            tooltip=list(chain.from_iterable(formatted_fields)) +
            ["Distance:Q"],
            href="url:N",
            url="https://app.onecodex.com/classification/" +
            alt.datum.classification_id,
        )

        # "url" is not an encoding channel: it is popped and used as the calculated
        # field that the `href` channel refers to
        chart = (alt.Chart(
            plot_data,
            width=15 * len(dist_matrix.index),
            height=15 * len(dist_matrix.index),
        ).transform_calculate(url=alt_kwargs.pop("url")).mark_rect().encode(
            **alt_kwargs))

        if title:
            chart = chart.properties(title=title)

        dendro_chart = dendrogram(clust["scipy_tree"])

        if return_chart:
            return dendro_chart | chart
        else:
            (dendro_chart | chart).display()
Sources: [covidtracking.com](https://covidtracking.com/api), [census.gov](https://www.census.gov/data/datasets/time-series/demo/popest/2010s-state-total.html)

Hospitalization data is more normalized than case data because testing rates (and thus known cases) varies too much across states. 
While less of a leading indicator, the 7 day change gives a reasonable early sign of trouble.
""")

# Streamlit date picker. The default value is the latest date in usdata_diff,
# and the picked date is stored in picked_date as a 'YYYY-MM-DD' string.
picked_date = st.date_input("Date", value=usdata_diff['date'].max()).strftime('%Y-%m-%d')

# 7 day change bar chart
st.subheader('Change last 7 days')
# picked_date was converted to a string above because otherwise the date
# comparison that selects the rows in the line below would not work.
# Bars are sorted by descending x value and the x axis is drawn at the top.
st.write(alt.Chart(usdata_diff[usdata_diff['date'] == picked_date]).mark_bar().encode(
    y=alt.Y('state', sort='-x'),
    x=alt.X('hospitalizedPer100k7daychange:Q', axis=alt.Axis(orient='top')),
    tooltip=[alt.Tooltip("hospitalizedPer100k7daychange:Q", title="7 day change", format=',.0d')]
    ).properties(
        width=800
    )
)

st.subheader('Total')
st.markdown("Red bars are republican states, blue bars are democratic states per 2016 presidential election")
# picked_date was converted to a string above because otherwise the date
# comparison that selects the rows in the line below would not work.
st.write(alt.Chart(usdata[usdata['date'] == picked_date]).mark_bar().encode(
    y=alt.Y('state', sort='-x'),
    x=alt.X('hospitalizedCurrentlyPer100k', axis=alt.Axis(orient='top')),
    tooltip=[alt.Tooltip("hospitalizedCurrentlyPer100k:Q", title="hospitalized per 100k", format=',.2f')]
    ).properties(
Example #4
0
def graph_compare_cea2034(df, graph_params, speaker1, speaker2):
    """Build a layered, interactive chart comparing two speakers' measurements.

    The chart overlays two copies of the same plot, one drawn with solid lines
    and filtered by the first selection returned by ``build_selections``, the
    other drawn with dashed lines and filtered by the second selection. Each
    copy layers the 'On Axis', 'Listening Window', 'Early Reflections' and
    'Sound Power' curves with the 'Early Reflections DI' and 'Sound Power DI'
    curves, the two groups using independent y scales.

    NOTE(review): ``nearest`` is not defined in this function; it is presumably
    a module-level selection -- verify.

    :param df: data for the chart, with 'Freq', 'dB' and 'Measurements' fields
    :param graph_params: currently unused
    :param speaker1: forwarded to ``build_selections``
    :param speaker2: forwarded to ``build_selections``
    :return: the layered chart
    """
    selection1, selection2, selectorsMeasurements, scales = build_selections(
        df, speaker1, speaker2)

    # TODO(move to parameters)
    x_axis = alt.X('Freq:Q',
                   scale=alt.Scale(type="log", domain=[20, 20000], nice=False))
    y_axis = alt.Y('dB:Q', scale=alt.Scale(zero=False, domain=[-40, 10]))
    di_axis = alt.Y('dB:Q',
                    scale=alt.Scale(zero=False, domain=[-10, 40], nice=False))
    color = alt.Color('Measurements', type='nominal', sort=None)
    opacity = alt.condition(selectorsMeasurements, alt.value(1),
                            alt.value(0.2))
    # markers are fully transparent except where the `nearest` selection applies
    hover_opacity = alt.condition(nearest, alt.value(1), alt.value(0))
    hover_tooltip = ['Measurements', 'Freq', 'dB']

    line = alt.Chart(df).transform_filter(
        alt.FieldOneOfPredicate(field='Measurements',
                                oneOf=[
                                    'On Axis', 'Listening Window',
                                    'Early Reflections', 'Sound Power'
                                ])).encode(x=x_axis,
                                           y=y_axis,
                                           color=color,
                                           opacity=opacity)
    points = line.mark_circle(size=100).encode(opacity=hover_opacity,
                                               tooltip=hover_tooltip)

    di = alt.Chart(df).transform_filter(
        alt.FieldOneOfPredicate(
            field='Measurements',
            oneOf=['Early Reflections DI',
                   'Sound Power DI'])).encode(x=x_axis,
                                              y=di_axis,
                                              color=color,
                                              opacity=opacity)
    points_di = di.mark_circle(size=100).encode(opacity=hover_opacity,
                                                tooltip=hover_tooltip)

    # solid-line variant
    spin_full = alt.layer(points + line.mark_line(),
                          points_di + di.mark_line(clip=True)).resolve_scale(
                              y='independent').properties(width=600,
                                                          height=300)

    # dashed-line variant
    spin_dash = alt.layer(
        points + line.mark_line(strokeDash=[4, 2]),
        points_di + di.mark_line(clip=True, strokeDash=[4, 2])).resolve_scale(
            y='independent').properties(width=600, height=300)

    line1 = spin_full.add_selection(selection1).transform_filter(selection1)
    line2 = spin_dash.add_selection(selection2).transform_filter(selection2)

    # vertical rule at the frequency picked by the `nearest` selection
    rules = alt.Chart(df).mark_rule(color='gray').encode(
        x='Freq:Q').transform_filter(nearest)

    layers = alt.layer(
        line2, line1,
        rules).add_selection(selectorsMeasurements).add_selection(
            scales).add_selection(nearest).interactive()
    return layers
Example #5
0
def check_column(
    data,
    columns,
    bins=False,
    missing=0.1,
    cardinality=15,
    float_frequency=30,
    category_frequency=100,
    outlier_function=quartile,
):
    """
    Presents a summary of given column(s) of the given pandas dataframe
    including summary statistics, bar chart or histogram, and any abnormalities found

    :param data: a pandas dataframe

    :param columns: a single column name or a list of column names to analyze

    :param bins: a single value, or a list with one value per column, that controls binning of the
        histogram: a false value draws an unbinned bar chart, ``True`` uses Altair's default binning,
        and a number is used as the maximum number of bins. Columns of dtype object are never binned.

    :param missing: a cutoff point for high percentage of missing / zero values, defaults to 10%

    :param cardinality: a cutoff point for high cardinality of a categorical column, defaults to 15

    :param float_frequency: a cutoff point for high frequency of floating point numbers, defaults to 30

    :param category_frequency: a cutoff point for low frequency of categories in categorical columns, defaults to 100

    :param outlier_function: a function of the dataset and column name that returns the lower and upper limit for outliers,
        defaults to 1.5*IQR above the 3rd quartile or below the 1st quartile
    """

    if isinstance(columns, str):
        # with only one column, convert to lists
        columns = [columns]
        bins = [bins]
    elif isinstance(bins, (int, float)):
        # with multiple columns and a single bin specification (bool is a
        # subclass of int), convert to list of correct length
        bins = [bins] * len(columns)

    for position, col in enumerate(columns):
        bin_spec = bins[position]
        is_categorical = data[col].dtype == "O"

        if is_categorical:
            # cannot bin categorical data
            bin_spec = False

        chart = _column_chart(data, col, bin_spec, is_categorical)

        if data[col].dtype == "float64":
            stats = data[col].describe()
        else:
            # for every other dtype, report the count and proportion of each value
            stats = data.groupby(col)[col].agg(["count"])
            stats["prop"] = stats["count"] / len(data)

        stats = pd.DataFrame(stats).T

        display(Markdown("#### Column Summary: " + col.title()))
        display(stats)
        display(chart)
        check_data(
            data,
            [col],
            missing=missing,
            cardinality=cardinality,
            float_frequency=float_frequency,
            category_frequency=category_frequency,
            outlier_function=outlier_function,
            title=False,
        )


def _column_chart(data, col, bin_spec, is_categorical):
    """Return the bar chart (or histogram) of value counts for one column."""
    axis = alt.Axis(title=col.title())

    if is_categorical:
        x = alt.X(
            col,
            axis=axis,
            sort=alt.SortField(field="count()", order="descending", op="values"),
        )
    elif not bin_spec:
        x = alt.X(col, axis=axis)
    elif isinstance(bin_spec, bool):
        # True is not a valid `maxbins` value; request default binning instead
        x = alt.X(col, bin=True, axis=axis)
    else:
        x = alt.X(col, bin=alt.Bin(maxbins=bin_spec), axis=axis)

    return alt.Chart(data).mark_bar(color="#64b5f6").encode(x, alt.Y("count()"))
Example #6
0
def make_category_plot_separate_top_n(infile, n_to_separate=20):
    """Save a user-by-category traffic heatmap, faceted into top and bottom users.

    Reads the flow data from the parquet file ``infile``, totals the traffic
    per user and category in GB, and writes a heatmap with one column facet
    for the ``n_to_separate`` users with the highest total traffic and one
    for all remaining users to
    ``renders/users_per_category_split_outliers.png``.
    """
    for option_name in ('display.max_columns', 'display.max_rows',
                        'display.width'):
        pd.set_option(option_name, None)

    flows = infra.pd.read_parquet(infile).reset_index()
    flows["bytes_total"] = flows["bytes_up"] + flows["bytes_down"]

    # Rank categories and users by their total traffic, largest first.
    # set_index/reset_index leaves each ranking with a fresh 0..n-1 index.
    category_ranking = flows.groupby("category").sum().reset_index()
    category_ranking = category_ranking.sort_values("bytes_total",
                                                    ascending=False)
    category_ranking = category_ranking.set_index("bytes_total").reset_index()
    category_order = category_ranking["category"].tolist()

    user_ranking = flows.groupby("user").sum().reset_index()
    user_ranking = user_ranking.sort_values("bytes_total", ascending=False)
    user_ranking = user_ranking.set_index("bytes_total").reset_index()
    user_order = user_ranking["user"].tolist()

    # Label every user as belonging to the top group or to the remainder;
    # the positional index of the ranking identifies the first n rows.
    remaining_users = len(user_ranking) - n_to_separate
    rank_labels = user_ranking[["user"]].assign(
        topN="Bottom {}".format(remaining_users))
    in_top_group = rank_labels.index < n_to_separate
    rank_labels.loc[in_top_group, "topN"] = "Top {}".format(n_to_separate)

    flows["GB"] = flows["bytes_total"] / (1000**3)
    per_user_category = flows[["category", "user", "GB"]]
    per_user_category = per_user_category.groupby(["user", "category"]).sum()
    per_user_category = per_user_category.reset_index()
    per_user_category["logGB"] = per_user_category["GB"].transform(np.log10)
    per_user_category = per_user_category.merge(rank_labels, on="user")

    heatmap = alt.Chart(per_user_category).mark_rect().encode(
        x=alt.X(
            "user:N",
            title="User (Sorted by Total GB)",
            axis=alt.Axis(labels=False),
            sort=user_order,
        ),
        y=alt.Y(
            "category:N",
            title="Category (Sorted by Total GB)",
            sort=category_order,
        ),
        color=alt.Color(
            "GB:Q",
            title="Total GB",
            scale=alt.Scale(scheme="viridis"),
        ),
    )

    # One facet per group, each with its own x and color scale.
    faceted = heatmap.facet(
        column=alt.Column("topN:N", sort="descending", title=""))
    faceted = faceted.resolve_scale(x="independent", color="independent")
    faceted.save(
        "renders/users_per_category_split_outliers.png",
        scale_factor=2,
    )
Example #7
0
def make_category_plot(infile):
    """Save three user-by-category traffic heatmaps built from ``infile``.

    The flow data is read from the parquet file ``infile`` and restricted to
    users whose ``days_since_first_active`` is at least 7. Three PNG files are
    written under ``renders/``:

    * ``users_per_category.png`` -- log10 of the total GB,
    * ``users_per_category_normalized_user_total.png`` -- GB divided by the
      user's total traffic,
    * ``users_per_category_normalized_time.png`` -- log10 of MB per active day.
    """
    pd.set_option('display.max_columns', None)

    def save_heatmap(frame, color_field, color_title, axis_titles, out_path):
        """Write one user x category heatmap colored by ``color_field``."""
        user_title, category_title = axis_titles
        heatmap = alt.Chart(frame).mark_rect().encode(
            x=alt.X(
                "user:N",
                title=user_title,
                axis=alt.Axis(labels=False),
                sort=user_order,
            ),
            y=alt.Y(
                "category:N",
                title=category_title,
                sort=category_order,
            ),
            color=alt.Color(
                color_field,
                title=color_title,
                scale=alt.Scale(scheme="viridis"),
            ),
        )
        heatmap.properties(width=500).save(out_path, scale_factor=2)

    flows = infra.pd.read_parquet(infile).reset_index()
    flows["bytes_total"] = flows["bytes_up"] + flows["bytes_down"]

    # Rank categories and users by their total traffic, largest first.
    category_ranking = flows.groupby("category").sum().reset_index()
    category_ranking = category_ranking.sort_values("bytes_total",
                                                    ascending=False)
    category_ranking = category_ranking.set_index("bytes_total").reset_index()
    category_order = category_ranking["category"].tolist()

    user_totals = flows.groupby("user").sum().reset_index()
    user_ranking = user_totals.sort_values("bytes_total", ascending=False)
    user_ranking = user_ranking.set_index("bytes_total").reset_index()
    user_order = user_ranking["user"].tolist()

    flows["GB"] = flows["bytes_total"] / (1000**3)
    per_user_category = flows[["category", "user", "GB"]]
    per_user_category = per_user_category.groupby(["user", "category"]).sum()
    per_user_category = per_user_category.reset_index()
    per_user_category["logGB"] = per_user_category["GB"].transform(np.log10)

    # Filter users by time in network to eliminate early incomplete samples:
    # only users first active at least 7 days ago are kept. No filter on
    # `days_active` is applied here.
    activity = infra.pd.read_parquet("data/clean/user_active_deltas.parquet")
    activity = activity[["user", "days_since_first_active", "days_active"]]
    eligible_users = activity.loc[activity["days_since_first_active"] >= 7]

    per_user_category = per_user_category.merge(eligible_users,
                                                on="user",
                                                how="inner")

    gb_axis_titles = ("User (Sorted by Total GB)",
                      "Category (Sorted by Total GB)")

    save_heatmap(
        per_user_category,
        "logGB:Q",
        "log(Total GB)",
        gb_axis_titles,
        "renders/users_per_category.png",
    )

    # Normalize by each user's total spend to highlight categories
    totals_to_merge = user_totals[["user", "bytes_total"]]
    totals_to_merge = totals_to_merge.rename(
        columns={"bytes_total": "user_total_bytes"})
    share_of_user = per_user_category.copy().merge(totals_to_merge, on="user")
    share_of_user["user_total_bytes"] = (share_of_user["user_total_bytes"] /
                                         1000**3)
    share_of_user["normalized_bytes"] = (share_of_user["GB"] /
                                         share_of_user["user_total_bytes"])

    save_heatmap(
        share_of_user,
        "normalized_bytes:Q",
        "Normalized (Per User) Traffic",
        gb_axis_titles,
        "renders/users_per_category_normalized_user_total.png",
    )

    # Normalize by each user's time in network to better compare users.
    # The columns are added to the merged frame itself (no copy is taken).
    per_user_category["MB_per_day"] = (per_user_category["GB"] * 1000 /
                                       per_user_category["days_active"])
    per_user_category["log_MB_per_day"] = per_user_category[
        "MB_per_day"].transform(np.log10)

    save_heatmap(
        per_user_category,
        "log_MB_per_day:Q",
        "MB per Day (Log Transformed)",
        ("User (Sorted by Total)", "Category (Sorted by Total)"),
        "renders/users_per_category_normalized_time.png",
    )
Example #8
0
                                             x,
                                             reference_feature,
                                             grid_resolution=50)
    ice_chart_data = pd.DataFrame(ice_values.T)
    ice_chart_data.columns = [str(c) for c in ice_chart_data.columns]
    ice_chart_data["index"] = ice_grid

    # ALE
    ale_grid, ale_values, feature_type = ale(model_fit, x, reference_feature)
    ale_chart_data = pd.DataFrame({"x": ale_grid, "ALE": ale_values})

    col1, col2 = st.beta_columns(2)
    with col1:
        st.write(
            alt.Chart(pdp_chart_data, title="PDP").mark_line().encode(
                x=alt.X("x", title=pretty_feature_name),
                y=alt.Y("PD", title="")).configure_title(
                    fontSize=18).properties(width=450))
    with col2:
        base = alt.Chart(title="ICE").mark_line().encode(
            x=alt.X("index", title=pretty_feature_name))
        st.write(
            alt.layer(*[
                base.encode(y=alt.Y(col, title=""))
                for col in ice_chart_data.columns if col != "index"
            ],
                      data=ice_chart_data).configure_title(
                          fontSize=18).properties(width=450))

    st.write(
        alt.Chart(ale_chart_data, title="ALE").mark_line().encode(
"""
Connected Scatterplot (Lines with Custom Paths)
-----------------------------------------------

This example shows how layering can be used to build a plot. This dataset tracks miles driven per capita along with gas prices annually from 1956 to 2010. It is based on the May 2, 2010 New York Times article 'Driving Shifts Into Reverse'. See http://mbostock.github.io/protovis/ex/driving.html .
"""

import altair as alt
from vega_datasets import data

driving = data.driving()

# Path through the (miles, gas) points, connected in order of year.
lines = alt.Chart(driving).mark_line().encode(
    x=alt.X('miles', scale=alt.Scale(zero=False)),
    y=alt.Y('gas', scale=alt.Scale(zero=False)),
    order='year',
)

# One circle per row, using the same x/y encodings as the path.
points = alt.Chart(driving).mark_circle().encode(
    x=alt.X('miles', scale=alt.Scale(zero=False)),
    y=alt.Y('gas', scale=alt.Scale(zero=False)),
)

# Layer the circles on top of the path.
alt.layer(lines, points)
Example #10
0
    def plot(self):
        """Build the CO2 emission chart with a brushable overview underneath.

        The upper chart shows ``CO2Emission`` over ``Minutes5DK`` for the x
        range selected in the lower overview chart, on top of five colored
        bands spanning consecutive ``self.quintiles`` values and a rule at
        ``self.now``. The initial selection covers 12 hours (expressed in
        milliseconds) on either side of ``self.now_utc_int``, capped at the
        latest ``Minutes5UTC`` value.

        Returns the vertically concatenated chart with axis and legend
        configuration applied.
        """
        df_combined = self.df
        now_ms = self.now_utc_int

        # Initial brush extent around "now".
        half_window = 3600 * 1000 * 12
        window_start = now_ms - half_window
        # NOTE(review): dividing by 1e6 presumably converts ns to ms -- confirm
        latest_sample = self.df.Minutes5UTC.astype(int).max() / 1000000
        window_end = min(now_ms + half_window, latest_sample)

        # Convert np.int64 to int to ensure that result is JSON serializable
        y_max = max(250, int(df_combined.CO2Emission.max()) + 25)

        now_marker = pd.DataFrame({
            'x': [self.now, self.now],
            'y': [0, self.quintiles[-1]]
        })

        # One single-row frame per band between consecutive quintile values.
        band_frames = []
        for lower in range(5):
            band_frames.append(
                pd.DataFrame({
                    'x': [self.min_time],
                    'y': [self.quintiles[lower]],
                    'x2': [self.max_time],
                    'y2': [self.quintiles[lower + 1]]
                }))

        interval = alt.selection_interval(
            encodings=['x'],
            init={'x': [int(window_start), int(window_end)]})

        base = alt.Chart(df_combined).mark_line(strokeWidth=4).encode(
            alt.X('Minutes5DK:T', title=''),
            alt.Y('CO2Emission:Q',
                  title='Udledningsintensitet [g CO2/kWh]',
                  scale=alt.Scale(domain=(0, y_max))),
            alt.Color('Type:N'),
        )

        # The rule is built twice: once on the full x range for the overview,
        # once with its x scale bound to the brush for the detail chart.
        overview_rule = alt.Chart(now_marker).mark_rule(clip=True).encode(
            x='x:T', y='y:Q')
        detail_rule = alt.Chart(now_marker).mark_rule(clip=True).encode(
            x=alt.X('x:T', scale=alt.Scale(domain=interval.ref())),
            y='y:Q')

        band_opacity = 0.15
        band_colors = ['green', 'lightgreen', 'yellow', 'lightcoral', 'red']
        band_layers = [
            alt.Chart(frame).mark_rect(
                color=shade, opacity=band_opacity).encode(
                    x=alt.X('x:T', scale=alt.Scale(domain=interval.ref())),
                    x2='x2:T',
                    y='y:Q',
                    y2='y2:Q')
            for frame, shade in zip(band_frames, band_colors)
        ]
        shaded_bands = band_layers[0]
        for band in band_layers[1:]:
            shaded_bands = shaded_bands + band

        detail_x = alt.X('Minutes5DK:T',
                         axis=alt.Axis(format='%H'),
                         title='',
                         scale=alt.Scale(domain=interval.ref()))
        detail = base.properties(width='container',
                                 height=300).encode(x=detail_x)
        upper = detail + detail_rule + shaded_bands

        overview_y = alt.Y('CO2Emission:Q',
                           title='',
                           scale=alt.Scale(domain=(0, y_max)))
        overview = base.properties(width='container',
                                   height=50,
                                   selection=interval).encode(y=overview_y)

        stacked = upper & (overview + overview_rule)

        styled = stacked.configure_axis(titleX=-25,
                                        titleY=-20,
                                        titleAlign='left',
                                        titleAngle=0,
                                        titleFont='Inter Regular',
                                        titleFontWeight='normal',
                                        titleFontSize=13)
        return styled.configure_legend(title=None,
                                       orient='top-right',
                                       labelFont='Inter Regular',
                                       labelFontSize=12)
Example #11
0
def criar_histograma(coluna, df):
    """Build an interactive histogram of one column of a DataFrame.

    Args:
        coluna: Name of the column (Altair field shorthand) to bin on the
            x axis.
        df: Data passed to ``alt.Chart``.

    Returns:
        A 600-pixel-wide interactive bar chart whose y axis and tooltip show
        the record count per bin.
    """
    binned_x = alt.X(coluna, bin=True)
    bars = alt.Chart(df, width=600).mark_bar()
    return bars.encode(
        x=binned_x,
        y='count()',
        tooltip=[coluna, 'count()'],
    ).interactive()
    def state_length_vs_timestep_sim(self, chart_type='bar', background_color='#abb2bf', lower_bound=25, upper_bound=200):
        '''
        displays interactive charts showing distributions of on and off times for simulated data

        A bar chart of the maximum duration per timestep (left) doubles as a
        selector; the two stacked detail charts (right) plot frequency against
        duration for the selected timesteps, one for bit == 0 ("off") and one
        for bit == 1 ("on").  The data is read from self.full_sim_df, which
        must provide the fields timestep, duration, frequency and bit.

        chart types:
            bar: bar chart (histogram)
            area: area chart (filled line chart)
            (surrounding whitespace and letter case are ignored)

        background_color:
            hex code for chart background color, ex:
                #abb2bf - grey
                #ffffff - white

        lower_bound, upper_bound:
            exclusive limits on duration; the detail charts only keep rows
            with lower_bound < duration < upper_bound

        raises:
            ValueError if chart_type is neither 'bar' nor 'area'

        returns nothing; the combined chart is shown with display()
        '''

        chart_type = chart_type.strip().lower()
        # Fail early with a clear error instead of printing and then crashing
        # on an unbound detail chart further down.
        if chart_type not in ('bar', 'area'):
            raise ValueError(f'unsupported chart type {chart_type}')

        full_sim_df = self.full_sim_df

        # Clicking bars (y encoding = timestep) drives both the bar colours
        # and the filter of the detail charts.
        sel_timestep = alt.selection_multi(encodings=['y'])

        bar_chart = alt.Chart(
            full_sim_df,
            height=800,
            width=250
        ).mark_bar(
        ).encode(
            alt.X(
                'max(duration):Q',
                title='max packet duration',
                scale=alt.Scale(type='log')
            ),
            alt.Y(
                'timestep:N',
            ),
            color=alt.condition(
                sel_timestep,
                'timestep:N',
                alt.value('#96989b'),
                legend=None
            ),
            tooltip=[
                alt.Tooltip('timestep:N'),
                alt.Tooltip('duration:Q', aggregate='max')
            ]
        ).add_selection(
            sel_timestep
        )

        #---------- histograms ----------
        # The bar and area variants differ only in the mark, so the shared
        # pipeline is written once.
        detail_base = alt.Chart(
            full_sim_df,
            height=375,
            width=800
        )
        if chart_type == 'bar':
            detail_marked = detail_base.mark_bar(opacity=0.5)
        else:
            detail_marked = detail_base.mark_area(opacity=0.5)

        detail_chart = detail_marked.transform_filter(
            sel_timestep
        ).transform_filter(
            datum.duration > lower_bound
        ).transform_filter(
            datum.duration < upper_bound
        ).encode(
            alt.X('duration:Q'),
            alt.Y(
                'frequency:Q',
                #scale=alt.Scale(type='log')
            ),
            color=alt.Color('timestep:N', legend=None),
            tooltip=[
                alt.Tooltip('duration:Q'),
                alt.Tooltip('frequency:Q'),
                alt.Tooltip('timestep:N'),
                alt.Tooltip('bit:N'),
            ]
        )

        stacked_bit_details = alt.vconcat(
            detail_chart.transform_filter(datum.bit == 0).properties(title='off time distributions'),
            detail_chart.transform_filter(datum.bit == 1).properties(title='on time distributions'),
        )

        full = alt.hconcat(
            bar_chart,
            stacked_bit_details,
            background=background_color
        )

        display(full)
    
        
    
    
    
    
        
        
        
 def state_length_vs_timestep_real(self, chart_type='bar', background_color='#abb2bf', lower_bound=25, upper_bound=2500):
     '''
     displays interactive charts showing distributions of on and off times for real data

     self.full_real_df is aggregated by summing frequency per (bit, duration)
     pair; the two stacked charts then plot frequency against duration, one
     for bit == 0 ("off") and one for bit == 1 ("on").

     chart types:
         bar: bar chart (histogram)
         area: area chart (filled line chart)
         (surrounding whitespace and letter case are ignored)

     background_color:
         hex code for chart background color, ex:
             #abb2bf - grey
             #ffffff - white

     lower_bound, upper_bound:
         exclusive limits on duration; only rows with
         lower_bound < duration < upper_bound are kept

     raises:
         ValueError if chart_type is neither 'bar' nor 'area'

     returns nothing; the combined chart is shown with display()
     '''

     chart_type = chart_type.strip().lower()
     # Fail early with a clear error instead of printing and then crashing
     # on an unbound detail chart further down.
     if chart_type not in ('bar', 'area'):
         raise ValueError(f'unsupported chart type {chart_type}')

     full_real_df = (
         self
         .full_real_df
         .groupby(['bit','duration'])
         .frequency
         .sum()
         .reset_index()
     )

     # The bar and area variants differ only in the mark, so the shared
     # pipeline is written once.
     detail_base = alt.Chart(
         full_real_df,
         height=375,
         width=800
     )
     if chart_type == 'bar':
         detail_marked = detail_base.mark_bar(opacity=0.5)
     else:
         detail_marked = detail_base.mark_area(opacity=0.5)

     detail_chart = detail_marked.transform_filter(
         datum.duration > lower_bound
     ).transform_filter(
         datum.duration < upper_bound
     ).encode(
         alt.X('duration:Q'),
         alt.Y(
             'frequency:Q',
             #scale=alt.Scale(type='log')
         ),
         tooltip=[
             alt.Tooltip('duration:Q'),
             alt.Tooltip('frequency:Q'),
             alt.Tooltip('bit:N'),
         ]
     )

     stacked_bit_details = alt.vconcat(
         detail_chart.transform_filter(datum.bit == 0).properties(title='off time distributions'),
         detail_chart.transform_filter(datum.bit == 1).properties(title='on time distributions'),
         background=background_color
     )

     display(stacked_bit_details)
Example #14
0
}, {
    "x": 15,
    "y": 17
}, {
    "x": 16,
    "y": 27
}, {
    "x": 17,
    "y": 68
}, {
    "x": 18,
    "y": 16
}, {
    "x": 19,
    "y": 49
}, {
    "x": 20,
    "y": 15
}])

# First band: the raw y values, clipped to the 0-50 domain.
area1 = (
    alt.Chart(df)
    .properties(width=500, height=75)
    .mark_area(clip=True, interpolate='monotone')
    .encode(
        x=alt.X('x', scale=alt.Scale(zero=False, nice=False)),
        y=alt.Y('y', scale=alt.Scale(domain=[0, 50]), axis=alt.Axis(title='y')),
        opacity=alt.value(0.6),
    )
)

# Second band: the same chart with y shifted down by 50 ("ny"), so values
# above 50 fold back into the same 0-50 band.
area2 = area1.transform_calculate("ny", datum.y - 50).encode(
    y=alt.Y('ny:Q', scale=alt.Scale(domain=[0, 50])),
)

# Layer both bands; as the last expression this is what a notebook renders.
area1 + area2
Example #15
0
def create_graph():
    """Build the side-by-side author and paper network charts.

    Queries the ``author``, ``paper`` and ``author_paper`` tables through
    ``queryDB``, builds one graph linking authors that share a paper and one
    linking papers that share an author, lays both out with
    ``nx.spring_layout`` and renders them with Altair.

    Returns:
        The horizontally concatenated chart serialized with ``to_json()``.
    """
    # Lift Altair's default row limit so larger tables can be plotted.
    alt.data_transformers.disable_max_rows()

    # Query the database.
    autores = queryDB('author', ['ID_author','author'])
    artigos = queryDB('paper', ['ID_paper','paper'])
    author_paper = queryDB('author_paper', ['ID_paper','ID_author'])

    autores['ID_author'] = autores['ID_author'].astype(str)
    artigos['ID_paper'] = artigos['ID_paper'].astype(str)

    ### Render the charts

    ## Graph 1 - authors

    print('Preparando grafo dos autores...')
    graph = nx.Graph()

    # DataFrame indexed by paper with one column holding the list of authors.
    group = pd.DataFrame(author_paper.groupby('ID_paper')['ID_author'].apply(list))

    # Add edges: every author is paired with itself and with each later
    # co-author of the same paper (the self-pair also registers authors of
    # single-author papers as nodes).
    for _, row in group.iterrows():
        author_ids = row['ID_author']
        for start, source in enumerate(author_ids):
            for target in author_ids[start:]:
                graph.add_edge(source, target)

    pos = nx.spring_layout(graph,k=0.2, iterations=50, weight=0.1, center=(0.5,0.5)) # force-directed layout

    # Collect nodes.
    nodes = to_pandas_nodes(graph,pos)
    nodes.reset_index(inplace=True)
    nodes.rename(columns={'index':'ID_author'}, inplace=True)
    nodes = pd.merge(nodes,autores,on='ID_author')  # attach author names
    nodes = pd.merge(nodes,author_paper, on='ID_author')  # attach ID_paper

    # Collect edges.
    edges = to_pandas_edges(graph,pos)

    # Chart 1
    print('Criando interatividade com o Altair (autores) ...')

    # Single selection on ID_author, shared by both charts.
    selector = alt.selection_single(empty='all',fields=['ID_author'])

    # NOTE(review): the selector is attached twice here (add_selection and
    # properties(selection=...)); presumably redundant - confirm before removing.
    points = alt.Chart(nodes).add_selection(selector).mark_point(filled=True,size=90).encode(
                alt.X('x', axis=alt.Axis(title='')),
                alt.Y('y', axis=alt.Axis(title='')),
                tooltip='author',
                opacity=alt.condition(selector,alt.value(0.95),alt.value(0.4),legend=None),
                color=alt.condition(selector, 'ID_author', alt.value('lightgray'), legend=None)
            ).properties( selection=selector ).transform_filter(selector)

    # Grey background copy of the nodes so unselected points stay visible
    # while the selection filters the coloured layer.
    bk = alt.Chart(nodes).mark_point(color='lightgray',filled=True,size=90).encode(
                alt.X('x', axis=alt.Axis(title='')),
                alt.Y('y', axis=alt.Axis(title='')),
                tooltip='author',
                opacity=alt.value(0.4),
    )

    lines = alt.Chart(edges).mark_line(color='salmon').encode(
                alt.X('x', axis=alt.Axis(title='')),
                alt.Y('y', axis=alt.Axis(title='')),
                detail='edge',
                opacity=alt.value(0.15)
            )

    chart = alt.LayerChart(layer=(lines,bk+points)).properties(
                height=350,
                width=450
                ).interactive()

    ## Graph 2 - papers
    print('Preparando grafo dos artigos...')

    graph1 = nx.Graph()
    group1 = pd.DataFrame(author_paper.groupby('ID_author')['ID_paper'].apply(list))

    # Add edges: same pairing as above, now between papers of one author.
    for _, row in group1.iterrows():
        paper_ids = row['ID_paper']
        for start, source in enumerate(paper_ids):
            for target in paper_ids[start:]:
                graph1.add_edge(source, target)

    pos1 = nx.spring_layout(graph1,k=0.2, iterations=50, weight=0.1, center=(0.5,0.5))  # force-directed layout

    # Collect nodes.
    nodes1 = to_pandas_nodes(graph1, pos1)
    nodes1.reset_index(inplace=True)
    nodes1.rename(columns={'index':'ID_paper'}, inplace=True)
    nodes1 = pd.merge(nodes1,artigos,on='ID_paper')  # attach paper names
    nodes1 = pd.merge(nodes1,author_paper,on='ID_paper')  # attach ID_author

    # Collect edges.
    edges1 = to_pandas_edges(graph1,pos1)

    # Chart 2
    print('Criando interatividade com o Altair (artigos)...')

    points1 = alt.Chart(nodes1).add_selection(selector).mark_point(filled=True,size=90).encode(
                alt.X('x', axis=alt.Axis(title='')),
                alt.Y('y', axis=alt.Axis(title='')),
                tooltip='paper',
                opacity=alt.condition(selector,alt.value(0.95),alt.value(0.4),legend=None),
                color=alt.condition(selector, 'ID_author', alt.value('lightgray'), legend=None)
    ).transform_filter(selector)

    # Grey background copy of the nodes, as for chart 1.
    bk1 = alt.Chart(nodes1).mark_point(color='lightgray',filled=True,size=90).encode(
                alt.X('x', axis=alt.Axis(title='')),
                alt.Y('y', axis=alt.Axis(title='')),
                tooltip='paper',
                opacity=alt.value(0.4),
    )

    lines1 = alt.Chart(edges1).mark_line(color='lightblue').encode(
                alt.X('x', axis=alt.Axis(title='')),
                alt.Y('y', axis=alt.Axis(title='')),
                detail='edge',
                opacity=alt.value(0.2)
    )

    chart1 = alt.LayerChart(layer=(lines1,bk1 + points1)).properties(
                height=350,
                width=450
                ).interactive()

    ### Concatenate charts 1 and 2 horizontally, hiding all axis decoration.
    horiz_chart = alt.hconcat(chart, chart1 ).configure_axis( ticks=False,
                grid=False,
                domain=False,
                labels=False).configure_view(
                strokeWidth=0
            )

    return horiz_chart.to_json()
Example #16
0
    'country': 'United States',
    'animal': 'sheep'
}])

# Categories drawn in the chart; each entry is paired by position with the
# matching entry of a scale range.
domains = ['person', 'cattle', 'pigs', 'sheep']

# Shape scale: maps each category in `domains` to the SVG path string used as
# its point mark (one path per category, in the same order).
shape_scale = alt.Scale(
    domain=domains,
    range=[
        'M1.7 -1.7h-0.8c0.3 -0.2 0.6 -0.5 0.6 -0.9c0 -0.6 -0.4 -1 -1 -1c-0.6 0 -1 0.4 -1 1c0 0.4 0.2 0.7 0.6 0.9h-0.8c-0.4 0 -0.7 0.3 -0.7 0.6v1.9c0 0.3 0.3 0.6 0.6 0.6h0.2c0 0 0 0.1 0 0.1v1.9c0 0.3 0.2 0.6 0.3 0.6h1.3c0.2 0 0.3 -0.3 0.3 -0.6v-1.8c0 0 0 -0.1 0 -0.1h0.2c0.3 0 0.6 -0.3 0.6 -0.6v-2c0.2 -0.3 -0.1 -0.6 -0.4 -0.6z',
        'M4 -2c0 0 0.9 -0.7 1.1 -0.8c0.1 -0.1 -0.1 0.5 -0.3 0.7c-0.2 0.2 1.1 1.1 1.1 1.2c0 0.2 -0.2 0.8 -0.4 0.7c-0.1 0 -0.8 -0.3 -1.3 -0.2c-0.5 0.1 -1.3 1.6 -1.5 2c-0.3 0.4 -0.6 0.4 -0.6 0.4c0 0.1 0.3 1.7 0.4 1.8c0.1 0.1 -0.4 0.1 -0.5 0c0 0 -0.6 -1.9 -0.6 -1.9c-0.1 0 -0.3 -0.1 -0.3 -0.1c0 0.1 -0.5 1.4 -0.4 1.6c0.1 0.2 0.1 0.3 0.1 0.3c0 0 -0.4 0 -0.4 0c0 0 -0.2 -0.1 -0.1 -0.3c0 -0.2 0.3 -1.7 0.3 -1.7c0 0 -2.8 -0.9 -2.9 -0.8c-0.2 0.1 -0.4 0.6 -0.4 1c0 0.4 0.5 1.9 0.5 1.9l-0.5 0l-0.6 -2l0 -0.6c0 0 -1 0.8 -1 1c0 0.2 -0.2 1.3 -0.2 1.3c0 0 0.3 0.3 0.2 0.3c0 0 -0.5 0 -0.5 0c0 0 -0.2 -0.2 -0.1 -0.4c0 -0.1 0.2 -1.6 0.2 -1.6c0 0 0.5 -0.4 0.5 -0.5c0 -0.1 0 -2.7 -0.2 -2.7c-0.1 0 -0.4 2 -0.4 2c0 0 0 0.2 -0.2 0.5c-0.1 0.4 -0.2 1.1 -0.2 1.1c0 0 -0.2 -0.1 -0.2 -0.2c0 -0.1 -0.1 -0.7 0 -0.7c0.1 -0.1 0.3 -0.8 0.4 -1.4c0 -0.6 0.2 -1.3 0.4 -1.5c0.1 -0.2 0.6 -0.4 0.6 -0.4z',
        'M1.2 -2c0 0 0.7 0 1.2 0.5c0.5 0.5 0.4 0.6 0.5 0.6c0.1 0 0.7 0 0.8 0.1c0.1 0 0.2 0.2 0.2 0.2c0 0 -0.6 0.2 -0.6 0.3c0 0.1 0.4 0.9 0.6 0.9c0.1 0 0.6 0 0.6 0.1c0 0.1 0 0.7 -0.1 0.7c-0.1 0 -1.2 0.4 -1.5 0.5c-0.3 0.1 -1.1 0.5 -1.1 0.7c-0.1 0.2 0.4 1.2 0.4 1.2l-0.4 0c0 0 -0.4 -0.8 -0.4 -0.9c0 -0.1 -0.1 -0.3 -0.1 -0.3l-0.2 0l-0.5 1.3l-0.4 0c0 0 -0.1 -0.4 0 -0.6c0.1 -0.1 0.3 -0.6 0.3 -0.7c0 0 -0.8 0 -1.5 -0.1c-0.7 -0.1 -1.2 -0.3 -1.2 -0.2c0 0.1 -0.4 0.6 -0.5 0.6c0 0 0.3 0.9 0.3 0.9l-0.4 0c0 0 -0.4 -0.5 -0.4 -0.6c0 -0.1 -0.2 -0.6 -0.2 -0.5c0 0 -0.4 0.4 -0.6 0.4c-0.2 0.1 -0.4 0.1 -0.4 0.1c0 0 -0.1 0.6 -0.1 0.6l-0.5 0l0 -1c0 0 0.5 -0.4 0.5 -0.5c0 -0.1 -0.7 -1.2 -0.6 -1.4c0.1 -0.1 0.1 -1.1 0.1 -1.1c0 0 -0.2 0.1 -0.2 0.1c0 0 0 0.9 0 1c0 0.1 -0.2 0.3 -0.3 0.3c-0.1 0 0 -0.5 0 -0.9c0 -0.4 0 -0.4 0.2 -0.6c0.2 -0.2 0.6 -0.3 0.8 -0.8c0.3 -0.5 1 -0.6 1 -0.6z',
        'M-4.1 -0.5c0.2 0 0.2 0.2 0.5 0.2c0.3 0 0.3 -0.2 0.5 -0.2c0.2 0 0.2 0.2 0.4 0.2c0.2 0 0.2 -0.2 0.5 -0.2c0.2 0 0.2 0.2 0.4 0.2c0.2 0 0.2 -0.2 0.4 -0.2c0.1 0 0.2 0.2 0.4 0.1c0.2 0 0.2 -0.2 0.4 -0.3c0.1 0 0.1 -0.1 0.4 0c0.3 0 0.3 -0.4 0.6 -0.4c0.3 0 0.6 -0.3 0.7 -0.2c0.1 0.1 1.4 1 1.3 1.4c-0.1 0.4 -0.3 0.3 -0.4 0.3c-0.1 0 -0.5 -0.4 -0.7 -0.2c-0.3 0.2 -0.1 0.4 -0.2 0.6c-0.1 0.1 -0.2 0.2 -0.3 0.4c0 0.2 0.1 0.3 0 0.5c-0.1 0.2 -0.3 0.2 -0.3 0.5c0 0.3 -0.2 0.3 -0.3 0.6c-0.1 0.2 0 0.3 -0.1 0.5c-0.1 0.2 -0.1 0.2 -0.2 0.3c-0.1 0.1 0.3 1.1 0.3 1.1l-0.3 0c0 0 -0.3 -0.9 -0.3 -1c0 -0.1 -0.1 -0.2 -0.3 -0.2c-0.2 0 -0.3 0.1 -0.4 0.4c0 0.3 -0.2 0.8 -0.2 0.8l-0.3 0l0.3 -1c0 0 0.1 -0.6 -0.2 -0.5c-0.3 0.1 -0.2 -0.1 -0.4 -0.1c-0.2 -0.1 -0.3 0.1 -0.4 0c-0.2 -0.1 -0.3 0.1 -0.5 0c-0.2 -0.1 -0.1 0 -0.3 0.3c-0.2 0.3 -0.4 0.3 -0.4 0.3l0.2 1.1l-0.3 0l-0.2 -1.1c0 0 -0.4 -0.6 -0.5 -0.4c-0.1 0.3 -0.1 0.4 -0.3 0.4c-0.1 -0.1 -0.2 1.1 -0.2 1.1l-0.3 0l0.2 -1.1c0 0 -0.3 -0.1 -0.3 -0.5c0 -0.3 0.1 -0.5 0.1 -0.7c0.1 -0.2 -0.1 -1 -0.2 -1.1c-0.1 -0.2 -0.2 -0.8 -0.2 -0.8c0 0 -0.1 -0.5 0.4 -0.8z'
    ])

# One fill colour per category, listed in the same order as `domains`.
color_scale = alt.Scale(
    domain=domains,
    range=[
        'rgb(162,160,152)',
        'rgb(194,81,64)',
        'rgb(93,93,93)',
        'rgb(91,131,149)',
    ],
)

# One row per country; within a row each animal gets its own line of icons.
# The window transform ranks the records inside every (country, animal) group
# and that rank is used as the ordinal x position.
(
    alt.Chart(source)
    .transform_window(x='rank()', groupby=['country', 'animal'])
    .mark_point(filled=True, opacity=1, size=100)
    .encode(
        x=alt.X('x:O', axis=None),
        y=alt.Y('animal:O', axis=None),
        row=alt.Row('country:N', header=alt.Header(title='')),
        shape=alt.Shape('animal:N', legend=None, scale=shape_scale),
        color=alt.Color('animal:N', legend=None, scale=color_scale),
    )
    .properties(width=550, height=140)
)
Example #17
0
def Plots(result, nth, w, h):
    """Build the Altair line charts for one processed run.

    Args:
        result: Indexable collection of processing outputs. Positions read
            here: 4 (x), 5-10 (y17, y17w, y17b, y18, y40, y40b), 11-12
            (watstart, watend), 13 (ratio1718), 14-15 (TPDsi, TPDei), 16-17
            (Arsi, Arei) and 18 (wf).
        nth: Step used to thin every trace (every ``nth`` sample is kept).
        w: Width applied to every chart.
        h: Height applied to every chart.

    Returns:
        Tuple ``(raw_chart, TPD_chart, Ar_chart, T_chart, W_chart)``.
        ``W_chart`` is an empty ``alt.LayerChart`` unless ``wf == 1``.
    """
    import numpy as np  # local import so the array maths below is explicit

    # This allows for data greater than 5000 rows to be plotted
    alt.data_transformers.disable_max_rows()

    x, y17, y17w, y17b, y18, y40, y40b = (result[4], result[5], result[6],
                                          result[7], result[8], result[9],
                                          result[10])
    TPDsi, TPDei, Arsi, Arei = (result[14], result[15], result[16], result[17])
    wf, watstart, watend, ratio1718 = (result[18], result[11], result[12],
                                       result[13])

    def thin(values, start, stop):
        """Return every ``nth`` sample of ``values[start:stop]`` as a list."""
        return list(values[start:stop])[0::nth]

    # Temperature axis for the TPD window. The previous ``list - list[0]``
    # only worked when the elements happened to be NumPy scalars; converting
    # explicitly also supports plain Python numbers, and the size guard avoids
    # an IndexError on an empty window.
    # NOTE(review): 323 and 10 look like a start temperature (K) and a ramp
    # rate per x unit - confirm against the experiment settings.
    tpd_x = np.asarray(x[TPDsi:TPDei])
    if tpd_x.size:
        temp = list((323 + (tpd_x - tpd_x[0]) * 10)[0::nth])
    else:
        temp = []

    # cut out and keep every nth point
    # Raw data cuts
    xr = list(x[0::nth])
    y17r = list(y17[0::nth])
    y18r = list(y18[0::nth])
    y40r = list(y40[0::nth])

    # TPDsi:TPDei cuts
    xt = thin(x, TPDsi, TPDei)
    y17t = thin(y17, TPDsi, TPDei)
    y17wt = thin(y17w, TPDsi, TPDei)
    y17bt = thin(y17b, TPDsi, TPDei)
    y18t = thin(y18, TPDsi, TPDei)

    # Ar cuts
    xa = thin(x, Arsi, Arei)
    y40a = thin(y40, Arsi, Arei)
    y40ba = thin(y40b, Arsi, Arei)

    # Each multi-trace chart melts its wide table into (x, legend, y) rows so
    # the trace name can drive the colour encoding.
    raw_data = pd.DataFrame({'x': xr, 'y17': y17r, 'y18': y18r, 'y40': y40r})
    raw_reshape = pd.melt(raw_data,
                          id_vars=['x'],
                          value_vars=['y17', 'y18', 'y40'],
                          var_name='legend',
                          value_name='y')
    raw_chart = alt.Chart(raw_reshape).mark_line(size=3).encode(
        alt.X('x', axis=alt.Axis(tickCount=7, title='Time (min)')),
        alt.Y('y', axis=alt.Axis(tickCount=7, title='Intensity (counts)')),
        alt.Color('legend', legend=alt.Legend(
            orient='top-left'))).configure_axis(grid=False).properties(
                width=w, height=h, title='Raw Data from CSV').interactive()

    TPD_data = pd.DataFrame({
        'x': xt,
        'y17': y17t,
        'y17 water corrected': y17wt,
        'y17 baseline corrected': y17bt,
        'y18': y18t
    })
    TPD_reshape = pd.melt(TPD_data,
                          id_vars=['x'],
                          value_vars=[
                              'y17', 'y17 water corrected',
                              'y17 baseline corrected', 'y18'
                          ],
                          var_name='legend',
                          value_name='y')
    TPD_chart = alt.Chart(TPD_reshape).mark_line(size=3).encode(
        alt.X('x', axis=alt.Axis(title='Time (min)')),
        alt.Y('y', axis=alt.Axis(title='Intensity (counts)')),
        alt.Color('legend', legend=alt.Legend(
            orient='top-right'))).configure_axis(grid=False).properties(
                width=w,
                height=h,
                title='TPD: y17 Water and Baseline Correction ')

    Ar_data = pd.DataFrame({
        'x': xa,
        'y40': y40a,
        'y40 baseline corrected': y40ba
    })
    Ar_reshape = pd.melt(Ar_data,
                         id_vars=['x'],
                         value_vars=['y40', 'y40 baseline corrected'],
                         var_name='legend',
                         value_name='y')
    Ar_chart = alt.Chart(Ar_reshape).mark_line(size=3).encode(
        alt.X('x', axis=alt.Axis(title='Time (min)')),
        alt.Y('y', axis=alt.Axis(title='Intensity (counts)')),
        alt.Color('legend',
                  legend=alt.Legend(orient='top-right'))).configure_axis(
                      grid=False).properties(width=w,
                                             height=h,
                                             title='Ar Pulse')

    T_data = pd.DataFrame({'T': temp, 'y': y17bt})
    T_chart = alt.Chart(T_data).mark_line(size=3).encode(
        alt.X('T', axis=alt.Axis(title='Temp (K)')),
        alt.Y('y', axis=alt.Axis(title='Intensity (counts)')),
    ).configure_axis(grid=False).properties(
        width=w, height=h, title='TPD as a Function of Temperature')

    if wf == 1:
        # The water-correction window is plotted at full resolution (not
        # thinned by nth).
        W_data = pd.DataFrame({'x': x[watstart:watend], 'y': ratio1718})
        W_chart = alt.Chart(W_data).mark_line(size=3).encode(
            alt.X('x', axis=alt.Axis(title='Time (min)')),
            alt.Y('y', axis=alt.Axis(title='Value of y17/y18 Preceding TPD')),
        ).configure_axis(grid=False).properties(
            width=w, height=h, title='Water Correction Factor')
    else:
        # Empty placeholder so callers always receive five charts.
        W_chart = alt.LayerChart()

    return (raw_chart, TPD_chart, Ar_chart, T_chart, W_chart)
"""
Normalized Stacked Bar Chart
----------------------------
This example shows how to make a normalized stacked bar chart.
"""

import altair as alt
from altair.expr import datum, if_
from vega_datasets import data

# URL of the population dataset bundled with vega_datasets.
source = data.population.url

# Keep the year-2000 records, derive a readable "gender" label from the
# numeric sex code, then stack the summed population per age group and
# normalise every bar to 100 %.
(
    alt.Chart(source)
    .transform_filter(datum.year == 2000)
    .transform_calculate("gender", if_(datum.sex == 2, 'Female', 'Male'))
    .mark_bar()
    .encode(
        x=alt.X('age:O', scale=alt.Scale(rangeStep=17)),
        y=alt.Y(
            'sum(people):Q',
            axis=alt.Axis(title='population'),
            stack='normalize',
        ),
        color=alt.Color(
            'gender:N',
            scale=alt.Scale(range=["#EA98D2", "#659CCA"]),
        ),
    )
)
Example #19
0
def make_org_plot(infile):
    """Generate plots to explore the traffic distribution across organizations.

    Reads the grouped flow table from ``infile``, relabels organizations
    visited by fewer than five users, keeps only users whose
    ``days_since_first_active`` is at least seven, and saves two user-by-
    organization heatmaps under ``renders/``: one coloured by log10 of the
    total GB and one by each user's share of their own traffic.

    Args:
        infile: Path of the parquet file holding the grouped flows. After
            ``reset_index`` it must provide the ``org``, ``user``,
            ``bytes_up`` and ``bytes_down`` columns.
    """
    pd.set_option('display.max_columns', None)
    grouped_flows = infra.pd.read_parquet(infile)
    grouped_flows = grouped_flows.reset_index()
    grouped_flows["bytes_total"] = grouped_flows["bytes_up"] + grouped_flows[
        "bytes_down"]

    # If any orgs are visited by fewer than 5 participants, need to be "other" per IRB
    user_count = grouped_flows[["org", "user", "bytes_total"]]
    # Ignore zero-byte rows. A boolean filter is used instead of
    # set_index("bytes_total").drop(0), which raises KeyError when no row has
    # exactly zero bytes.
    user_count = user_count[user_count["bytes_total"] != 0]
    user_count = user_count.groupby(["org", "user"]).sum().reset_index()
    user_count = user_count.groupby(["org"]).count()
    small_orgs = user_count.loc[user_count["user"] < 5]
    small_orgs = small_orgs.reset_index()["org"]

    # NOTE(review): this replaces matching values in every column, not only
    # "org" - presumably harmless because other columns never hold org names.
    grouped_flows = grouped_flows.replace(small_orgs.values,
                                          value="Aggregated (Users < 5)")

    # Filter users by time in network to eliminate early incomplete samples
    user_active_ranges = infra.pd.read_parquet(
        "data/clean/user_active_deltas.parquet")[[
            "user", "days_since_first_active", "days_active"
        ]]
    # Keep users that first became active at least a week ago.
    # NOTE(review): "days_active" is loaded but not filtered on - confirm
    # whether a minimum activity period was intended as well.
    users_to_analyze = user_active_ranges.loc[
        user_active_ranges["days_since_first_active"] >= 7]

    grouped_flows = grouped_flows.merge(users_to_analyze,
                                        on="user",
                                        how="inner")
    print(user_active_ranges.head(10))

    # Figure out sorting order by total amount. Only bytes_total is summed;
    # the other columns are not needed for the ordering.
    org_totals = grouped_flows.groupby("org")["bytes_total"].sum().reset_index()
    cat_sort_list = org_totals.sort_values(
        "bytes_total", ascending=False)["org"].tolist()

    user_totals = grouped_flows.groupby("user")["bytes_total"].sum().reset_index()
    user_sort_list = user_totals.sort_values(
        "bytes_total", ascending=False)["user"].tolist()

    grouped_flows["GB"] = grouped_flows["bytes_total"] / (1000**3)
    grouped_flows = grouped_flows[["org", "user",
                                   "GB"]].groupby(["user", "org"]).sum()
    grouped_flows = grouped_flows.reset_index()
    grouped_flows["logGB"] = grouped_flows["GB"].transform(np.log10)

    _save_user_org_heatmap(
        grouped_flows,
        color_field="logGB:Q",
        color_title="log(Total GB)",
        out_path="renders/users_per_category_org.png",
        user_sort_list=user_sort_list,
        cat_sort_list=cat_sort_list,
    )

    # Normalize by each user's total spend to highlight categories
    user_total_to_merge = user_totals[[
        "user", "bytes_total"
    ]].rename(columns={"bytes_total": "user_total_bytes"})
    normalized_user_flows = grouped_flows.copy()
    normalized_user_flows = normalized_user_flows.merge(user_total_to_merge,
                                                        on="user")
    # Convert the per-user total to GB so the ratio below is unit-free.
    normalized_user_flows["user_total_bytes"] = normalized_user_flows[
        "user_total_bytes"] / 1000**3
    normalized_user_flows["normalized_bytes"] = normalized_user_flows[
        "GB"] / normalized_user_flows["user_total_bytes"]

    _save_user_org_heatmap(
        normalized_user_flows,
        color_field="normalized_bytes:Q",
        color_title="Normalized (Per User) Traffic",
        out_path="renders/users_per_category_org_normalized.png",
        user_sort_list=user_sort_list,
        cat_sort_list=cat_sort_list,
    )


def _save_user_org_heatmap(frame, color_field, color_title, out_path,
                           user_sort_list, cat_sort_list):
    """Save a user-by-organization rect heatmap coloured by ``color_field``."""
    alt.Chart(frame).mark_rect().encode(
        x=alt.X(
            "user:N",
            title="User (Sorted by Total GB)",
            axis=alt.Axis(labels=False),
            sort=user_sort_list,
        ),
        y=alt.Y(
            "org:N",
            title="Organization (Sorted by Total GB)",
            sort=cat_sort_list,
        ),
        color=alt.Color(
            color_field,
            title=color_title,
            scale=alt.Scale(scheme="viridis"),
        ),
    ).properties(width=500, ).save(
        out_path,
        scale_factor=2,
    )
Example #20
0
    st.write(
        """Customize the x and y axis through the sidebar visualization settings. 
                You can also select binary features as labels which will be in the form 
                of a color.""")
    select_graph = st.sidebar.radio('Select Graph',
                                    ('point', 'bar', 'area', 'line'))

    col1, col2, col3 = st.beta_columns([.5, .5, 1])
    graph_hgt = col1.slider('Height', 200, 600, 400, step=10)
    graph_wgt = col2.slider('Width', 400, 800, 600, step=10)

    df = df.loc[(df.creatinine_phosphokinase < 800) & (df.platelets < 500000) &
                (df.serum_creatinine < 2.2) & (df.age >= 40)]

    chart = alt.Chart(data=df, mark=select_graph).encode(
        alt.X(x_axis, scale=alt.Scale(zero=False)),
        alt.Y(y_axis, scale=alt.Scale(zero=False)),
        color=label).properties(height=graph_hgt, width=graph_wgt)
    st.write(chart)

    if y_axis == 'age' and x_axis == 'platelets' and label == 'DEATH_EVENT':
        st.write(
            'Majority of deceased patients had platelet count ranging from 150,000 - 300,000 and aged 58 - 75'
        )
    elif y_axis == 'age' and x_axis == 'creatinine_phosphokinase' and label == 'DEATH_EVENT':
        st.write(
            'Majority of deceased patients had creatinine phosphokinase count ranging from 100 - 250 and aged 55 - 70'
        )
    elif y_axis == 'age' and x_axis == 'serum_creatinine' and label == 'DEATH_EVENT':
        st.write(
            'Majority of deceased patients had serum creatinine count ranging from 1.2 - 1.9 and aged 50 - 75'
Example #21
0
def graph_spinorama(dfu, graph_params):
    """Build the layered spinorama chart: SPL curves plus directivity indices.

    Args:
        dfu: Data for ``alt.Chart`` providing the ``Freq``, ``dB`` and
            ``Measurements`` fields.
        graph_params: Mapping with the keys ``xmin``, ``xmax``, ``ymin``,
            ``ymax``, ``width`` and ``height``.

    Returns:
        A layered chart in which the SPL curves and the DI curves use
        independent y scales, with legend, zoom/pan and ``nearest``
        selections attached.
    """
    xmin = graph_params['xmin']
    xmax = graph_params['xmax']
    ymin = graph_params['ymin']
    ymax = graph_params['ymax']
    # A degenerate range is only logged; the chart is still built.
    if xmax == xmin:
        logging.error('Graph configuration is incorrect: xmin==xmax')
    if ymax == ymin:
        logging.error('Graph configuration is incorrect: ymin==ymax')

    # add selectors
    selectorsMeasurements = alt.selection_multi(fields=['Measurements'],
                                                bind='legend')
    scales = alt.selection_interval(bind='scales')
    # NOTE(review): `nearest` is not defined in this function; it is
    # presumably a selection defined at module scope - confirm.

    # main charts
    xaxis = alt.X('Freq:Q',
                  title='Frequency (Hz)',
                  scale=alt.Scale(type='log',
                                  base=10,
                                  nice=False,
                                  domain=[xmin, xmax]),
                  axis=alt.Axis(format='s'))
    yaxis = alt.Y('dB:Q',
                  title='Sound Pressure (dB)',
                  scale=alt.Scale(zero=False, domain=[ymin, ymax]))
    # The DI axis spans the same number of dB as the main axis but starts
    # at -5.
    di_yaxis = alt.Y('dB:Q',
                     title='Sound Pressure DI (dB)',
                     scale=alt.Scale(zero=False, domain=[-5, ymax - ymin - 5]))
    color = alt.Color('Measurements', type='nominal', sort=None)
    opacity = alt.condition(selectorsMeasurements, alt.value(1),
                            alt.value(0.2))
    # Hover markers are visible only for the point picked by `nearest`.
    hover_opacity = alt.condition(nearest, alt.value(1), alt.value(0))
    hover_tooltip = ['Measurements', 'Freq', 'dB']

    # The two curve families, each drawn as lines plus hover circles.
    spl_curves = alt.FieldOneOfPredicate(
        field='Measurements',
        oneOf=[
            'On Axis', 'Listening Window', 'Early Reflections', 'Sound Power'
        ])
    di_curves = alt.FieldOneOfPredicate(
        field='Measurements',
        oneOf=['Early Reflections DI', 'Sound Power DI'])

    line = alt.Chart(dfu).mark_line().transform_filter(spl_curves).encode(
        x=xaxis, y=yaxis, color=color, opacity=opacity)

    circle = alt.Chart(dfu).mark_circle(size=100).transform_filter(
        spl_curves).encode(x=xaxis,
                           y=yaxis,
                           color=color,
                           opacity=hover_opacity,
                           tooltip=hover_tooltip)

    di = alt.Chart(dfu).mark_line().transform_filter(di_curves).encode(
        x=xaxis, y=di_yaxis, color=color, opacity=opacity)

    circle_di = alt.Chart(dfu).mark_circle(size=100).transform_filter(
        di_curves).encode(x=xaxis,
                          y=di_yaxis,
                          color=color,
                          opacity=hover_opacity,
                          tooltip=hover_tooltip)

    # assemble elements together
    spin = alt.layer(circle + line, circle_di + di).resolve_scale(
        y='independent').add_selection(selectorsMeasurements).add_selection(
            scales).add_selection(nearest).properties(
                width=graph_params['width'], height=graph_params['height'])

    return spin
Example #22
0
def main():
    """Render the ACL Publications Explorer Streamlit page.

    Loads the publication table, the sentence-embedding model, the Faiss
    index and the author table, draws the sidebar filters, keeps the
    publications that satisfy every active filter, and shows them as a
    scatter plot stacked on top of a bar chart of Field-of-Study counts.
    The static "How to use", "About" and "Appendix" sections follow.
    """
    data = read_data()
    fos_level = unique_fos_level(data)
    model = load_bert_model()
    faiss_index = faiss.deserialize_index(load_faiss_index())
    author_data = read_author_data()

    st.title("ACL Publications Explorer")

    # Sidebar widgets. They are created in the same order as before because
    # creation order is what fixes their position in the sidebar.
    filter_year = st.sidebar.slider("Filter by year", 2000, 2020, (2000, 2020),
                                    1)
    filter_fos_level = st.sidebar.selectbox("Choose Field of Study level",
                                            fos_level)
    fields_of_study = unique_fos(data, filter_fos_level, 25)
    filter_fos = st.sidebar.multiselect("Choose Fields of Study",
                                        fields_of_study)
    author_input = st.sidebar.text_input("Search by author name")
    # User search
    user_input = st.sidebar.text_area("Search by paper title")
    num_results = st.sidebar.slider("Number of search results", 10, 150, 10)

    # Each filter is an independent boolean mask ANDed onto the year range;
    # this replaces eight hand-written combinations of the same three tests.
    # NOTE(review): the year bounds are compared as strings, so `data.year`
    # is presumably stored as text - confirm against read_data().
    first_year, last_year = (str(bound) for bound in filter_year)
    mask = (data.year >= first_year) & (data.year <= last_year)

    if filter_fos:
        mask = mask & data.name.isin(filter_fos)

    if user_input:
        # NOTE(review): the title+author search without a Field of Study has
        # always queried a fixed 150 neighbours and ignored the slider. That
        # behaviour is preserved here; confirm whether it is intentional.
        search_size = 150 if (author_input and not filter_fos) else num_results
        encoded_user_input = vector_search([user_input], model, faiss_index,
                                           search_size)
        mask = mask & data.id.isin(encoded_user_input)

    if author_input:
        # Exact-match lookup of the author's papers.
        ids = author_data[author_data.name == author_input]['paper_id']
        mask = mask & data.id.isin(ids)

    frame = data[mask]
    # Points are coloured by Field of Study only when one was selected.
    color_on_fos = bool(filter_fos)

    # Scatter plot: one point per publication id.
    channels = [
        alt.X('Component 1', scale=alt.Scale(domain=(1, 16))),
        alt.Y('Component 2', scale=alt.Scale(domain=(0, 18))),
        alt.Size('citations',
                 scale=alt.Scale(range=[10, 500]),
                 title='Citations'),
    ]
    if color_on_fos:
        channels.append(alt.Color('name', title='Field of Study'))

    chart = alt.Chart(frame.drop_duplicates('id')).mark_point().encode(
        *channels,
        href='source:N',
        tooltip=['title', 'year']).interactive().properties(width=650,
                                                            height=500)

    # Bar chart: the 30 most frequent Fields of Study at the chosen level.
    level_rows = frame[frame.level == filter_fos_level]
    bar_data = pd.DataFrame(
        level_rows.groupby('name')['id'].count()).reset_index().sort_values(
            'id', ascending=False)[:30]
    barchart = alt.Chart(bar_data).mark_bar().encode(
        alt.X('name', sort='-y', title='Fields of Study'),
        alt.Y('id', title='Count')).properties(width=650, height=150)
    c = (chart & barchart)
    st.altair_chart(c, use_container_width=True)

    st.subheader("How to use this app")
    st.write("""
    This application is intended for the visual exploration and discovery of research publications that have been presented at the ACL (Annual Meeting of the Association for Computational Linguistics).

Every particle in the scatterplot is an academic publication. The particles are positioned in space based on the semantic similarity of the paper titles; the closer two points are, the more semantically similar their titles. You can hover over the particles to read their titles and you can click them to be redirected to the original source. You can zoom in the visualisation by scrolling and you can reset the view by double clicking the white space within the figure. Regarding the bar chart, it shows the most used Fields of Study for the papers shown in the scatterplot.

You can also **search** for publications by paper titles (more information below). 

#### Filters
You can refine your query based on the publication year, paper content, field of study and author. You can also combine any of the filter for more granular searches.
- **Filter by year**: Select a time range for the papers. For example, drag both sliders to 2020 to find out the papers that will be presented at ACL 2020.
- **Field of Study level**: Microsoft Academic Graph uses a 6-level hierarchy where level 0 contains high level disciplines such as Computer science and level 5 contains the most granular paper keywords. This filter will change what's shown in the bar chart as well as the available options in the filter below.
- **Fields of Study**: Select the Fields of Study to be displayed in the visualisations. The available options are affected by your selection in the above filter.
- **Search by author name**: Find an author's publications. **Note**: You need to type in the exact name.
- **Search by paper title**: Type in a paper title and find its most relevant publications. You should use at least a sentence to receive meaningful results.
- **Number of search results**: Specify the number of papers to be returned when you search by paper title.
    """)

    st.subheader("About")
    st.write("""
I am [Kostas](http://kstathou.github.io/) and I work at the intersection of knowledge discovery, data engineering and scientometrics. I am a Mozilla Open Science Fellow and a Principal Data Science Researcher at Nesta. I am currently working on [Orion](https://orion-search.org/) (work in progress), an open-source knowledge discovery and research measurement tool. 

If you have any questions or would like to learn more about it, you can find me on [twitter](https://twitter.com/kstathou) or send me an email at [email protected]
    """)

    st.subheader("Appendix: Data & methods")
    st.write("""
    I collected all of the publications from [Microsoft Academic Graph](https://www.microsoft.com/en-us/research/project/academic-knowledge/) that were published between 2000 and 2020 and were presented at the ACL.

I fetched 8,724 publications. To create the 2D visualisation, I encoded the paper titles to dense vectors using a [sentence-DistilBERT](https://github.com/UKPLab/sentence-transformers) model. That produced a 768-dimensional vector for each paper which I projected to a 2D space with [UMAP](https://umap-learn.readthedocs.io/en/latest/). For the paper title search engine, I indexed the vectors with [Faiss](https://github.com/facebookresearch/faiss/tree/master/python).
""")
Example #23
0
# Finish and show the histogram figure built earlier in the script.
his.update_yaxes(title="Listings")
st.plotly_chart(his)

# --- Scatter plot: price against a user-chosen listing attribute ---
st.header("Scatter Visualization of Price")


st.markdown(
    "In addition, we want to learn how the number of bedrooms, "
    "ratings and capacity affect the price."
)

# Columns offered for the x axis.
column = [
    "neighbourhood_cleansed",
    "bedrooms",
    "beds",
    "review_scores_rating",
    "accommodates",
]
x_axis = st.selectbox("X Axis", column)

# Columns offered for the point colour.
color_list = [
    "room_type",
    "neighbourhood_cleansed",
    "bedrooms",
    "review_scores_rating",
]
scatter_color = st.selectbox("Color", color_list)

scatter = (
    alt.Chart(filtered_listing)
    .mark_point()
    .encode(
        alt.X(x_axis),
        alt.Y("price"),
        alt.Color(scatter_color),
    )
)

st.write(scatter)

# --- Price estimation inputs ---
st.header("Price Estimation")
st.markdown(
    "Finally, before we actually book an order, "
    "let's estimate how much we are going to pay."
)
st.write("Please provide the room type, location and the number of bedrooms")
est_list = ["room_type", "neighbourhood_cleansed", "bedrooms"]
# Each selector offers the distinct values found in the full listing table.
est_room_type = st.selectbox("Room Type", df_listing["room_type"].unique())
est_neigh = st.selectbox(
    "Location", df_listing["neighbourhood_cleansed"].unique()
)
est_bed = st.selectbox("Number of Bedroom", df_listing["bedrooms"].unique())

# kNN algorithm, take average for tie-break
Example #24
0
# --- Sidebar: year-range and origin filters ---
st.sidebar.header("Filter Data:")
year = st.sidebar.slider("Year", 1970, 1980, (1970, 1980))
origin = st.sidebar.multiselect(
    "Origin",
    ["Europe", "Japan", "USA"],
    ["Europe", "Japan", "USA"],
)

# Keep the rows whose year lies inside the selected range (the slider yields
# a (low, high) pair) and whose origin is one of the selected values.
cars = df[df["Year"].dt.year.between(*year) & df["Origin"].isin(origin)]
# Bare expression statements are kept on purpose: the script relies on them
# to render the value.
cars

# Summary of the selection: count per origin, next to count per year.
chart = (
    alt.Chart(cars)
    .mark_bar()
    .encode(x="count()", y="Origin", color="Origin")
    .properties(width=300, height=200)
) | (
    alt.Chart(cars)
    .mark_bar()
    .encode(alt.X("year(Year):N"), y="count()", color="Origin")
    .properties(width=300, height=200)
)
chart

st.markdown("## Projection of cars")

# Rows with any missing value are dropped before projecting.
projcars = cars.dropna().reset_index(drop=True)

features = st.multiselect(
    "Features to project:",
    [
        "Weight_in_lbs",
        "Horsepower",
        "Miles_per_Gallon",
        "Displacement",
        "Cylinders",
        "Acceleration",
    ],
    ["Weight_in_lbs", "Horsepower", "Miles_per_Gallon"],
)
method_name = st.selectbox("Projection method:", ("PCA", "MDS", "TSNE"))

# Drop every column that was not picked as a feature.
projData = projcars.drop(columns=projcars.columns.difference(features))
Example #25
0
    def plot_mds(
        self,
        rank="auto",
        metric="braycurtis",
        method="pcoa",
        title=None,
        xlabel=None,
        ylabel=None,
        color=None,
        size=None,
        tooltip=None,
        return_chart=False,
        label=None,
    ):
        """Plot beta diversity distance matrix using multidimensional scaling (MDS).

        Parameters
        ----------
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.
        metric : {'braycurtis', 'manhattan', 'jaccard', 'unifrac', 'unweighted_unifrac'}, optional
            Function to use when calculating the distance between two samples.
        method : {'pcoa', 'smacof'}
            Algorithm to use for ordination. PCoA uses eigenvalue decomposition and is not well
            suited to non-euclidean distance functions. SMACOF is an iterative optimization strategy
            that can be used as an alternative.
        title : `string`, optional
            Text label at the top of the plot.
        xlabel : `string`, optional
            Text label along the horizontal axis.
        ylabel : `string`, optional
            Text label along the vertical axis.
        size : `string` or `tuple`, optional
            A string or a tuple containing strings representing metadata fields. The size of points
            in the resulting plot will change based on the metadata associated with each sample.
        color : `string` or `tuple`, optional
            A string or a tuple containing strings representing metadata fields. The color of points
            in the resulting plot will change based on the metadata associated with each sample.
        tooltip : `string` or `list`, optional
            A string or list containing strings representing metadata fields. When a point in the
            plot is hovered over, the value of the metadata associated with that sample will be
            displayed in a modal. A list passed in is copied, never modified.
        return_chart : `bool`, optional
            When True the chart object is returned; otherwise the chart is made interactive and
            displayed, and nothing is returned.
        label : `string` or `callable`, optional
            A metadata field (or function) used to label each analysis. If passing a function, a
            dict containing the metadata for each analysis is passed as the first and only
            positional argument. The callable function must return a string.

        Returns
        -------
        The Altair chart when `return_chart` is True, otherwise None.

        Raises
        ------
        OneCodexException
            If fewer than two results are available, or if `method` is neither 'smacof' nor
            'pcoa'.

        Examples
        --------
        Scatter plot of weighted UniFrac distance between all our samples, using counts at the genus
        level.

        >>> plot_mds(rank='genus', metric='unifrac')

        Notes
        -----
        **For `smacof`**: The values reported on the axis labels are Pearson's correlations between
        the distances between points on each axis alone, and the corresponding distances in the
        distance matrix calculated using the user-specified metric. These values are related to the
        effectiveness of the MDS algorithm in placing points on the scatter plot in such a way that
        they truly represent the calculated distances. They do not reflect how well the distance
        metric captures similarities between the underlying data (in this case, an OTU table).
        """
        import altair as alt
        import numpy as np
        import pandas as pd
        from scipy.spatial.distance import squareform
        from scipy.stats import pearsonr
        from skbio.stats import ordination
        from sklearn import manifold
        from sklearn.metrics.pairwise import euclidean_distances

        if len(self._results) < 2:
            raise OneCodexException(
                "`plot_mds` requires 2 or more valid classification results.")

        dists = self._compute_distance(rank, metric).to_data_frame()

        # here we figure out what to put in the tooltips and get the appropriate data
        if tooltip:
            # Work on a copy: the inserts below would otherwise modify the
            # list object owned by the caller.
            tooltip = list(tooltip) if isinstance(tooltip, list) else [tooltip]
        else:
            tooltip = []

        tooltip.insert(0, "Label")

        if color and color not in tooltip:
            tooltip.insert(1, color)

        if size and size not in tooltip:
            tooltip.insert(2, size)

        magic_metadata, magic_fields = self._metadata_fetch(tooltip,
                                                            label=label)

        if method == "smacof":
            # adapted from https://scikit-learn.org/stable/auto_examples/manifold/plot_mds.html
            x_field = "MDS1"
            y_field = "MDS2"

            # fixed seed so repeated calls give the same embedding
            seed = np.random.RandomState(seed=3)
            mds = manifold.MDS(max_iter=3000,
                               eps=1e-12,
                               random_state=seed,
                               dissimilarity="precomputed",
                               n_jobs=1)
            pos = mds.fit(dists).embedding_
            plot_data = pd.DataFrame(pos,
                                     columns=[x_field, y_field],
                                     index=dists.index)
            # scale each axis by its largest absolute value -> range [-1, 1]
            plot_data = plot_data.div(plot_data.abs().max(axis=0), axis=1)

            # determine how much of the original distance is captured by each of the axes after MDS.
            # this implementation of MDS does not use eigen decomposition and so there's no simple
            # way of returning a 'percent of variance explained' value
            r_squared = []

            for axis in [0, 1]:
                mds_dist = pos.copy()
                # NOTE(review): zeroing column `axis` leaves only the *other*
                # coordinate, so r_squared[0] (shown on the x label) is
                # computed from the y coordinates alone and vice versa.
                # Confirm this pairing is intended.
                mds_dist[::, axis] = 0
                mds_dist = squareform(euclidean_distances(mds_dist).round(6))
                r_squared.append(pearsonr(mds_dist, squareform(dists))[0])

            # label the axes
            # NOTE(review): the label text says r² but the value is
            # Pearson's r (element 0 of pearsonr), not its square.
            x_extra_label = "r² = %.02f" % (r_squared[0], )
            y_extra_label = "r² = %.02f" % (r_squared[1], )
        elif method == "pcoa":
            # suppress eigenvalue warning from skbio--not because it's an invalid warning, but
            # because lots of folks in the field run pcoa on these distances functions, even if
            # statistically inappropriate. perhaps this will change if we ever become more
            # opinionated about the analyses that we allow our users to do (roo)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                ord_result = ordination.pcoa(
                    dists.round(6))  # round to avoid float precision errors

            # get first two components
            plot_data = ord_result.samples.iloc[:, [0, 1]]
            # scale each axis by its largest absolute value -> range [-1, 1]
            plot_data = plot_data.div(plot_data.abs().max(axis=0), axis=1)
            plot_data.index = dists.index
            # name of first two components
            x_field, y_field = plot_data.columns.tolist()

            x_extra_label = "%0.02f%%" % (ord_result.proportion_explained[0] *
                                          100, )
            y_extra_label = "%0.02f%%" % (ord_result.proportion_explained[1] *
                                          100, )
        else:
            raise OneCodexException("MDS method must be one of: smacof, pcoa")

        # label the axes
        if xlabel is None:
            xlabel = "{} ({})".format(x_field, x_extra_label)
        if ylabel is None:
            ylabel = "{} ({})".format(y_field, y_extra_label)

        # attach the tooltip metadata columns to the coordinates
        plot_data = pd.concat([plot_data, magic_metadata],
                              axis=1).reset_index()

        alt_kwargs = dict(
            x=alt.X(x_field, axis=alt.Axis(title=xlabel)),
            y=alt.Y(y_field, axis=alt.Axis(title=ylabel)),
            tooltip=[magic_fields[t] for t in tooltip],
            href="url:N",
            url="https://app.onecodex.com/classification/" +
            alt.datum.classification_id,
        )

        # only add these parameters if they are in use
        if color:
            alt_kwargs["color"] = magic_fields[color]
        if size:
            alt_kwargs["size"] = magic_fields[size]

        # "url" is a calculated field, not an encoding channel, so it is
        # popped out of the kwargs before they are passed to encode().
        chart = (alt.Chart(plot_data).transform_calculate(
            url=alt_kwargs.pop("url")).mark_circle().encode(**alt_kwargs))

        if title:
            chart = chart.properties(title=title)

        if return_chart:
            return chart
        else:
            chart.interactive().display()
# Environment ------------------------------------------------------------------
from pathlib import Path
import altair as alt
import pandas as pd


# Data I/O
data = pd.read_csv("summary.csv")

# Multi-selection keyed on the colour encoding; the chart below shows only
# the rows that match it.
click = alt.selection_multi(encodings=["color"])

# Mean framewise displacement per subject, coloured by task.
scatter_fd = (
    alt.Chart(data)
    .mark_circle(size=250)
    .encode(
        x=alt.X("sub:Q", title=""),
        y=alt.Y("fd-mean:Q", title="Mean Framewise Displacement"),
        color=alt.Color(
            "task:N",
            scale=alt.Scale(range=["#E7DECD", "#B9314F", "#312F2F"]),
            legend=None,
        ),
        tooltip=[
            alt.Tooltip("sub:N", title="Subject No."),
            alt.Tooltip("fd-per:Q", format=".2f", title=">0.2 (%)"),
            # The four summary statistics share one number format.
            *(
                alt.Tooltip("fd-" + stat + ":Q", format=".4f", title=caption)
                for stat, caption in (
                    ("mean", "Mean"),
                    ("std", "SD"),
                    ("max", "Max"),
                    ("min", "Min"),
                )
            ),
        ],
    )
    .properties(width=700, height=400)
    .transform_filter(click)
    .interactive()
)
Example #27
0
def plot_weather_data(obs_df, col_name, time_basis):
    """
    Visualizes the weather station observations including air temperature,
    atmospheric pressure, wind speed, and wind direction changing over time.

    Rows containing missing values are dropped, the remaining observations
    are averaged per month or per day, and the chosen variable is drawn as
    an orange line. The chart title carries the year of the first remaining
    observation.

    Parameters
    ----------
    obs_df : pandas.DataFrame
        A dataframe that contains a time series of weather station
        observations. It must have a ``datetime`` column and a column named
        like `col_name`.
    col_name : str
        Variable that users would like to plot on a timely basis,
        one of 'air_temp', 'atm_press', 'wind_spd', 'wind_dir'
    time_basis : str
        The users can choose to plot the observations on a 'monthly' or
        'daily' basis
    Returns
    -------
    altair.vegalite.v4.api.Chart
        A plot can visualize the changing of observation on the timely basis
        that user chooses.
    Raises
    ------
    AssertionError
        If an argument has the wrong type or value, or if fewer than three
        rows remain after dropping missing values or after resampling.
    Examples
    --------
    >>> plot_weather_data(obs_df, col_name="air_temp", time_basis="monthly")
    """
    # Human-readable name of every supported variable; used for both the
    # chart title and the y-axis title.
    variable_titles = {
        "air_temp": "Air Temperature",
        "atm_press": "Atmospheric Pressure",
        "wind_spd": "Wind Speed",
        "wind_dir": "Wind Direction",
    }
    # Per time basis: resampling rule, x encoding and x-axis title.
    # NOTE: recent pandas releases deprecate the "M" alias in favour of
    # "ME"; it is kept because older releases do not accept "ME".
    time_settings = {
        "monthly": ("M", "month(datetime)", "Month"),
        "daily": ("D", "datetime", "Date"),
    }

    # Validation raises AssertionError explicitly (the type callers already
    # see) instead of using `assert`, which is removed under `python -O`.
    if not isinstance(obs_df, pd.DataFrame):
        raise AssertionError("Weather data should be a Pandas DataFrame.")
    if not isinstance(col_name, str):
        raise AssertionError("Variable name must be entered as a string")
    if not isinstance(time_basis, str):
        raise AssertionError("Time basis must be entered as a string")
    if col_name not in variable_titles:
        raise AssertionError(
            "Variable can only be one of air_temp, atm_press, wind_spd or "
            "wind_dir"
        )
    if time_basis not in time_settings:
        raise AssertionError("Time basis can only be monthly or daily")

    df = obs_df.dropna()
    if len(df.index) <= 2:
        raise AssertionError("Dataset is not sufficient to visualize")
    # Positional access: after dropna() the row labelled 0 may be gone, so a
    # label lookup with [0] would raise KeyError.
    year = df.datetime.dt.year.iloc[0]

    rule, x_field, x_title = time_settings[time_basis]
    df = df.set_index("datetime").resample(rule).mean().reset_index()
    if len(df.index) <= 2:
        raise AssertionError("Dataset is not sufficient to visualize")

    variable_title = variable_titles[col_name]
    line = (
        alt.Chart(df, title=variable_title + " for " + str(year))
        .mark_line(color="orange")
        .encode(
            alt.X(x_field, title=x_title, axis=alt.Axis(labelAngle=-30)),
            alt.Y(
                col_name,
                title=variable_title,
                scale=alt.Scale(zero=False),
            ),
            alt.Tooltip(col_name),
        )
    )

    chart = (
        line.properties(width=500, height=350)
        .configure_axis(labelFontSize=15, titleFontSize=20, grid=False)
        .configure_title(fontSize=25)
    )

    return chart
Example #28
0
def main():
    """Render the COVID-19 hackathon dashboard.

    The sidebar select box picks one of four views of `dados_covid`: the raw
    table, charts by age, a chart by sex, or reported symptoms. The sidebar
    then lists the group members.
    """
    st.sidebar.title('HACKATHON 1')
    st.sidebar.subheader('Análise dados COVID-19')

    opcoes = ('Dados Gerais', 'Idade', 'Sexo', 'Sintomas')
    filtro_coluna = st.sidebar.selectbox('Selecione o filtro', opcoes)

    if filtro_coluna == 'Dados Gerais':
        # Raw data: first 1000 rows, plus per-column counts on demand.
        st.write(dados_covid.head(1000))
        if st.checkbox("Mostrar colunas"):
            st.write(dados_covid.count())

    elif filtro_coluna == 'Sexo':
        # Bar chart of sex, one panel per test result.
        st.subheader('Sexo por resultado do teste')
        cor_sexo = alt.Color(
            'sexo:N', scale=alt.Scale(range=["#EA98D2", "#659CCA"]))
        sexo_barras = (
            alt.Chart(dados_covid, width=200)
            .mark_bar()
            .encode(
                alt.X('sexo:O', axis=alt.Axis(title='')),
                alt.Y('count():Q'),
                alt.Column('resultadoTeste:O'),
                color=cor_sexo,
                tooltip='count()',
            )
            .interactive()
        )
        st.altair_chart(sexo_barras)

    elif filtro_coluna == 'Idade':
        st.markdown('Describe da coluna Idade')
        st.write(dados_covid.idade.describe())
        st.subheader('Idade por resultado teste')

        # Binned age histogram per test result, limited to ages 0-120.
        idade_barras = (
            alt.Chart(dados_covid, width=200)
            .mark_bar()
            .encode(
                alt.X('idade', bin=alt.Bin(maxbins=20)),
                alt.Y('count():Q'),
                alt.Column('resultadoTeste:O'),
                color='resultadoTeste',
                tooltip=['idade', 'count()'],
            )
            .transform_filter(
                alt.FieldRangePredicate(field='idade', range=[0, 120]))
            .interactive()
        )
        st.altair_chart(idade_barras, use_container_width=True)

        # Count by age as one line per test result, limited to ages 0-100.
        idade_line = (
            alt.Chart(dados_covid, width=600)
            .mark_line()
            .encode(x='idade', y='count():Q', color='resultadoTeste:O')
            .transform_filter(
                alt.FieldRangePredicate(field='idade', range=[0, 100]))
        )
        st.altair_chart(idade_line)

    elif filtro_coluna == 'Sintomas':
        st.subheader('Sintomas relatados')
        # Every column from the fifth onwards is treated as a symptom.
        sintomas = dados_covid.columns[4:]
        df_sintomas = dados_covid[sintomas].sum().reset_index()
        df_sintomas.columns = ['sintoma', 'count']
        st.write(df_sintomas)
        sintomas_bar = (
            alt.Chart(df_sintomas, width=700)
            .mark_bar()
            .encode(x='sintoma', y='count', tooltip=['count'])
            .interactive()
        )
        st.write("\n\n")
        st.altair_chart(sintomas_bar)

        # Symptom totals split by test result.
        st.subheader('Sintomas por resultado do teste')
        grouped = dados_covid.groupby(['resultadoTeste'])[sintomas].sum()
        st.write(grouped)
        st.write("\n\n\n")
        st.bar_chart(grouped)

        # Rows selected by the chosen symptom combination.
        select = st.multiselect(
            "Selecione combinação de sintomas apresentados",
            sintomas.tolist(),
            default=["Febre"])
        df_select = countsintomas(select, sintomas)
        # Bare expression kept as in the original statement sequence.
        df_select
        st.write(df_select.groupby(['resultadoTeste'])[sintomas].sum())

        st.markdown(
            '->Número de pessoas que aprensentaram apenas os sintomas selecionados'
        )

    # Sidebar credits.
    st.sidebar.subheader('Grupo 1')
    for integrante in (
            'Daniel Santos Pereira',
            'Fernando Henrique De Brito Borges',
            'Gláucio Ribeiro Santos',
            'Rafael Rodrigues dos Santos',
    ):
        st.sidebar.markdown(integrante)
Example #29
0
def horizon_selector(
    base: alt.Chart,
    horizon_selection_brush: alt.MultiSelection,
    belief_horizon_unit: str,
    intuitive_forecast_horizon: bool,
    unique_belief_horizons,
) -> alt.LayerChart:
    """Build the layered rule-and-circle chart used to pick a horizon.

    Rule marks are placed along the x axis at ``belief_horizon`` and overlaid
    with circle marks derived from the same chart. The rule layer carries
    ``horizon_selection_brush`` together with ``horizon_hover_brush``.

    Relies on ``time_selection_brush``, ``horizon_hover_brush`` and
    ``idle_color``, which are not parameters and are expected to be defined
    at module level (not visible in this block).

    Parameters
    ----------
    base
        Chart the rule marks are derived from.
    horizon_selection_brush
        Selection that, like the hover brush, switches a mark to the
        highlight color.
    belief_horizon_unit
        Text appended (after a space) to the belief horizon in the tooltip.
    intuitive_forecast_horizon
        When truthy the title and tooltip say "forecast horizon", otherwise
        "belief horizon".
    unique_belief_horizons
        Indexable collection; its first and last items become the x-scale
        domain.

    Returns
    -------
    The rule chart (with both selections added) layered with the circle chart.
    """
    horizon_label = (
        "forecast horizon" if intuitive_forecast_horizon else "belief horizon"
    )

    # Apply brush before calculating accuracy metrics for the selected events
    # on the fly.
    rules = base.mark_rule(orient="vertical")
    rules = rules.transform_filter(time_selection_brush)
    # event_start - event_start cancels out, so `constant` evaluates to 1
    # for rows where event_start is numeric.
    rules = rules.transform_calculate(
        constant=1 + alt.datum.event_start - alt.datum.event_start
    )
    rules = rules.transform_calculate(
        belief_horizon_str='datum.belief_horizon + " %s"' % belief_horizon_unit
    )

    # Trick to be able to apply the selection filter for event_start
    # (event_start must be a field in one of the encoding channels).
    event_opacity = alt.condition(
        time_selection_brush,
        alt.Opacity(
            "event_start:T",
            scale=alt.Scale(domain=(0.9999, 1)),
            legend=None,
        ),
        alt.value(0),
    )
    horizon_domain = (unique_belief_horizons[0], unique_belief_horizons[-1])
    horizon_x = alt.X(
        "belief_horizon:Q",
        axis=alt.Axis(labelFlush=False),
        scale=alt.Scale(zero=False, domain=horizon_domain),
        title="",
    )
    constant_y = alt.Y(
        "constant:Q",
        title=" ",
        axis=alt.Axis(values=[], domain=False, ticks=False),
    )
    highlight_color = alt.condition(
        horizon_selection_brush | horizon_hover_brush,
        alt.ColorValue("#c21431"),
        alt.ColorValue(idle_color),
    )
    rule_size = alt.value(1)
    click_hint = alt.Tooltip(
        "belief_horizon_str:N",
        title="Click to select %s" % horizon_label,
    )

    bar_chart = rules.encode(
        opacity=event_opacity,
        x=horizon_x,
        y=constant_y,
        color=highlight_color,
        size=rule_size,
        tooltip=[click_hint],
    )
    bar_chart = bar_chart.properties(
        height=30,
        title="Select %s" % horizon_label,
    )
    bar_chart = bar_chart.transform_filter(time_selection_brush)

    # Circles reuse the rule chart and sit at half the rule height.
    circle_chart = bar_chart.mark_circle()
    circle_chart = circle_chart.transform_calculate(
        half_constant=alt.datum.constant / 2
    )
    circle_chart = circle_chart.encode(
        y=alt.Y("half_constant:Q", title="", axis=alt.Axis(values=[])),
        size=alt.value(100),
    )

    selectable_bars = bar_chart.add_selection(
        horizon_selection_brush, horizon_hover_brush
    )
    return selectable_bars + circle_chart
"""
Error Bars showing Confidence Interval
======================================
This example shows how to show error bars using confidence intervals.
The confidence intervals are computed internally in vega by
a non-parametric [bootstrap of the mean](https://github.com/vega/vega-statistics/blob/master/src/bootstrapCI.js).
"""

import altair as alt
from vega_datasets import data

# Dataset returned by vega_datasets' `data.barley()`.
barley = data.barley()

# Filled black points at the mean yield of each variety; the x scale is not
# forced to include zero.
points = (
    alt.Chart(barley)
    .mark_point(filled=True)
    .encode(
        x=alt.X(
            'mean(yield)',
            scale=alt.Scale(zero=False),
            axis=alt.Axis(title='Barley Yield'),
        ),
        y='variety',
        color=alt.value('black'),
    )
)

# Horizontal rules spanning from ci0(yield) to ci1(yield) for each variety.
error_bars = (
    alt.Chart(barley)
    .mark_rule()
    .encode(
        x='ci0(yield)',
        x2='ci1(yield)',
        y='variety',
    )
)

# Layer the error bars on top of the points. As a bare expression, the result
# is only displayed where the environment shows the last expression's value
# (e.g. a notebook cell).
points + error_bars