Example #1
def run_trend_over_time():
    st.markdown('''
		## How is life expectancy associated with health and economic indicators?


		Having a healthy lifestyle can increase life expectancy [1], and the trend in life expectancy
		over time can reflect changes in population health and in the adequacy of health services.

		In this section, we will explore the relationship between life expectancy and health expenditure
		and economic indicators.

		## Let's see the data

		To compare data for specific countries, you can choose multiple countries in the **multi-selection box** on the left sidebar.
		If no countries are selected, we will show you a graph of life expectancy over time for all countries.

		Once you have selected one or more countries to focus on, you can use the **drop-down menu** in the sidebar to pick an additional
		indicator and explore its relationship with life expectancy over time while comparing the selected countries.
	''')
    data_cached, countries = load_merge_data()
    data = data_cached.copy()
    data['Year'] = pd.to_datetime(data['Year'], format='%Y')

    # drop-down box to select one variable to view
    st.sidebar.header("Adjust Parameters")

    factors = [
        'Current health expenditure (% of GDP)',
        'Current health expenditure per capita (current US$)',
        'GDP per capita (current US$)',
        'Unemployment, total (% of total labor force)',
        'Gini',
    ]

    factor = st.sidebar.selectbox("Additional Factors", factors)

    selected_countries = st.sidebar.multiselect('Select Countries to Compare',
                                                countries)

    # plot factor countries over time
    if selected_countries:

        curr_df = keep_only_selected_countries(data, selected_countries)

        curr_df = dropna_by_feature(
            curr_df, [factor, 'Life expectancy at birth, total (years)'])

        line_p = alt.Chart(curr_df).mark_line().encode(x=alt.X(
            'Year:T', axis=alt.Axis(title='Year', format=("%Y"))),
                                                       color='Country Name')
        upper = line_p.encode(y=str(factor))
        lower = line_p.encode(y='Life expectancy at birth, total (years)')

        # Create a selection that chooses the nearest point & selects based on x-value
        nearest = alt.selection(type='single',
                                nearest=True,
                                on='mouseover',
                                fields=['Year'],
                                empty='none')
        idx = 0
        plots = [upper, lower]
        result_plots = []
        for line in plots:

            # Transparent selectors across the chart. This is what tells us
            # the x-value of the cursor
            selectors = alt.Chart(curr_df).mark_point().encode(
                x='Year',
                opacity=alt.value(0),
            )
            if idx == 0:
                selectors = selectors.add_selection(nearest)

            # Draw a rule at the location of the selection
            rules = alt.Chart(curr_df).mark_rule(color='darkgray').encode(
                x='Year', ).transform_filter(nearest)
            # Draw points on the line, and highlight based on selection
            points = line.mark_point().encode(
                opacity=alt.condition(nearest, alt.value(1), alt.value(0)))
            if idx == 0:
                # Draw text labels near the points, and highlight based on selection
                text = line.mark_text(align='left', dx=5, dy=-5).encode(
                    text=alt.condition(nearest, str(factor), alt.value(' ')))
            else:
                # Draw text labels near the points, and highlight based on selection
                text = line.mark_text(
                    align='left', dx=10, dy=-10).encode(text=alt.condition(
                        nearest, 'Life expectancy at birth, total (years)',
                        alt.value(' ')))
            if idx == 0:
                # Put the five layers into a chart and bind the data
                result_plots.append(
                    alt.layer(line, selectors, points, rules, text))
            else:
                result_plots.append(
                    alt.layer(line, selectors, points, rules,
                              text).properties(height=200))
            idx += 1
        result_plot = alt.vconcat(result_plots[0], result_plots[1])
        st.altair_chart(result_plot, use_container_width=True)
        st.markdown('''
			The above graph consists of two line charts. The upper one displays your selected
			indicator for the selected countries over time. The lower chart, on the same time scale, displays
			the life expectancies of those countries. You can compare the trend and shape of your selected factor
			with the life expectancy line for one country or for multiple countries.

			To make the comparison meaningful, the graph only shows the time range for which both life expectancy
			and the selected indicator were collected for the specified countries.

			As you move your mouse over the upper graph, a vertical line marks the year nearest to your cursor,
			and the corresponding data points are highlighted, with their exact y values printed to the right of each
			data point. The values are printed in the same color as the corresponding line.
		''')

    else:
        country_filter = st.radio(
            '', ('All', 'Top 5 as of 2017', 'Bottom 5 as of 2017'))
        if country_filter == 'All':
            countries_keep = countries
        elif country_filter == 'Top 5 as of 2017':
            countries_keep = data[data['Year'] == pd.to_datetime(
                '2017', format='%Y')].sort_values(
                    'Life expectancy at birth, total (years)',
                    ascending=False).head(5)['Country Name']
        else:
            countries_keep = data[data['Year'] == pd.to_datetime(
                '2017', format='%Y')].sort_values(
                    'Life expectancy at birth, total (years)').head(
                        5)['Country Name']

        data = keep_only_selected_countries(data, countries_keep)
        # always plot the life expectancy
        life_exp = alt.Chart(data).mark_line(size=4).encode(
            x=alt.X(
                'Year:T',
                scale=alt.Scale(domain=(pd.to_datetime('1960', format='%Y'),
                                        pd.to_datetime('2017', format='%Y'))),
                axis=alt.Axis(title='Year', format=("%Y"))),
            y=alt.Y('Life expectancy at birth, total (years)',
                    scale=alt.Scale(domain=(0, 90))),
            color='Country Name',
            tooltip=['Country Name'])
        st.altair_chart(life_exp.properties(height=450),
                        use_container_width=True)
        st.markdown('''
			The above is a line graph of life expectancy at birth over time, where each line is a different country.
			Most of the lines are tangled together, as there are so many countries in the world. Rather than looking at the
			lines of specific countries, we hope to give you a sense of the general upward trend in life expectancy
			worldwide in recent years.

			If you find a particular line especially interesting, you can see the name of the country
			corresponding to that line by moving your mouse over it.

			We have provided two filter options to help you narrow down your exploration scope. The default 'All'
			shows all countries on the graph; 'Top 5 as of 2017' keeps only the 5 countries or regions with the highest
			life expectancy in 2017; similarly, 'Bottom 5 as of 2017' keeps only the 5 countries or regions with the lowest
			life expectancy in 2017.
		''')

    st.markdown('''
		### References
		[1]
		Li, Y., Pan, A., Wang, D. D., Liu, X., Dhana, K., Franco, O. H., & Hu, F. B. (2018).
		"Impact of Healthy Lifestyle Factors on Life Expectancies in the US Population."
		Circulation, 138(4), 345-355. doi:10.1161/circulationaha.117.032047
	''')
Example #2
def make_line_plot(date_list, damage):
    """
    Generates a filled line plot based on user selection of date and damage level

    Parameters
    ----------
    date_list - a list of 2 ints; the user's selection from a date slider, passed by callback
    damage - a list of strings; the user's selection from a dropdown, passed by callback

    Return
    ------
    an altair plot converted to html
    """

    alt.themes.register('mds_special', mds_special)
    alt.themes.enable('mds_special')
    #alt.themes.enable('none') # to return to default

    query_string = ""
    for user_select_damage in damage:
        query_string += 'damage_level == "' + user_select_damage + '" | '

    query_string = query_string[:-2]

    df_line = df.query('year >= @date_list[0] & year <= @date_list[1]')
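    # Note: an equivalent, arguably simpler way to apply the damage filter used
    # below (assuming `damage` is the list of selected damage levels) would be
    #   df_line[df_line["damage_level"].isin(damage)]
    # but the hand-built query string is kept here as in the original.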

    if len(query_string) != 0:
        label = alt.selection_single(encodings=['x'],
                                     on='mouseover',
                                     nearest=True,
                                     empty='none')

        #generate a line plot
        line_plot_base = alt.Chart(
            df_line.query(query_string),
            title='Bird Strike Damage over Time').mark_area(
                opacity=0.3, interpolate='monotone').encode(
                    alt.X('year:O', axis=alt.Axis(title="Year", labelAngle=0)),
                    alt.Y('count(damage_level):N',
                          axis=alt.Axis(title="Bird Strikes"),
                          stack=None),
                    alt.Color(
                        'damage_level',
                        scale=alt.Scale(
                            domain=['Substantial', 'Medium', 'Minor', 'None'],
                            range=['red', 'dodgerblue', 'grey', 'darkgreen']),
                        legend=alt.Legend(orient='bottom',
                                          titleOrient='left',
                                          title="Damage Level",
                                          labelFontSize=15,
                                          titleFontSize=15)),
                    alt.Order('damage_level_sort', sort='ascending'),
                    alt.Tooltip(
                        ['damage_level', 'year', 'count(damage_level)']))
        #create an interactive vertical bar that displays point values of line plots
        line_plot = alt.layer(
            line_plot_base,
            alt.Chart().mark_rule(color='grey').encode(
                x='year:O').transform_filter(label),
            line_plot_base.mark_circle().encode(opacity=alt.condition(
                label, alt.value(1), alt.value(0))).add_selection(label),
            line_plot_base.mark_text(
                align='left',
                dx=5,
                dy=-10,
                fontSize=15,
                fontWeight=600,
                stroke='grey',
                strokeWidth=1).encode(
                    text='count(damage_level):N').transform_filter(label),
            line_plot_base.mark_text(
                align='left', dx=5, dy=-10, fontSize=15,
                fontWeight=600).encode(
                    text='count(damage_level):N').transform_filter(label),
            data=df).properties(width=500, height=400)

        line_plot = line_plot.to_html()
    else:
        line_plot = None

    return line_plot
    def render(self, df):
        st.markdown(
            '# Percentage of men and women who enrolled, dropped out, and graduated, in each UFRN center'
        )
        st.markdown(
            'The following chart shows the percentage of female and male students who enrolled, graduated, and dropped out in each UFRN center.'
        )
        st.markdown('#### How to read the chart')
        st.markdown('''
                - Each bar represents the values for one of UFRN's teaching centers;
                - The upper part of the chart shows the percentage of male students, while the lower part shows the percentage of female students;
                - The stacked bar represents the percentage of all entrants of that sex who graduated or dropped out. The part closest to the X axis corresponds to the larger value. For example, if the graduation bar is closer to the X axis than the dropout bar, more people graduated than dropped out.
                ''')
        st.markdown(
            '**Note**: the negative values on the y axis ("% do total de discentes"), shown when looking at the percentage of female students, do not indicate an actually negative value - this format was used due to limitations of the tool.'
        )

        dfs = []
        df = df[df['nome_unidade_gestora'].notna()]
        df = df[df['nome_unidade_gestora'].str.contains('CENTRO')]

        for campus in df.nome_unidade_gestora.unique():
            df_campus = self._calcular_percentuais_by_campus(df, campus)
            df_campus['nome_unidade_gestora'] = campus
            dfs.append(df_campus)

        df_chart = pd.concat(dfs)
        filter_f = df_chart['sexo'] == 'F'
        df_chart.loc[filter_f,
                     'percentual'] = df_chart[filter_f]['percentual'] * -1
        df_chart['sexo'] = df_chart['sexo'].replace({
            'F': 'Feminino',
            'M': 'Masculino',
        })
        df_chart['sort'] = abs(df_chart['percentual'])
        df_chart['nome_unidade_gestora'] = df_chart[
            'nome_unidade_gestora'].str.title()
        df_chart = df_chart.sort_values(['sort'], ascending=False)
        df_chart = df_chart.reset_index(drop=True)

        for name, group in df_chart.groupby(
                by=['nome_unidade_gestora', 'sexo']):
            indexes = list(group.index)
            df_chart.at[indexes[0], 'size'] = 100
            df_chart.at[indexes[1], 'size'] = 50
            df_chart.at[indexes[2], 'size'] = 50

        chart_params = {
            'x':
            alt.X('nome_unidade_gestora:N',
                  title=None,
                  axis=alt.Axis(zindex=10)),
            'y':
            alt.Y('sum(percentual):Q',
                  stack=False,
                  title='% do total de discentes'),
            'color':
            alt.Color('tipo:N',
                      title='Status',
                      scale=alt.Scale(domain=['Evasão', 'Conclusão', 'Total'],
                                      range=['#fd8060', '#b0d8a4',
                                             '#fee191'])),
            'size':
            alt.Size('size:Q', legend=None, scale=alt.Scale(domain=[0, 40])),
            'order':
            alt.Order('sort', sort='descending'),
            'tooltip': [
                alt.Tooltip('sexo:N', title='Gênero'),
                alt.Tooltip('sum(percentual):Q',
                            title='% referente ao total',
                            format='.2f'),
                alt.Tooltip('tipo:N', title='Status'),
                alt.Tooltip('total:Q', title='Quantidade'),
            ]
        }

        alt_chart = alt.Chart(df_chart[df_chart['tipo'] == 'Total']).mark_bar()\
            .encode(**chart_params)\
            .properties(height=600)
        chart_params['y'] = alt.Y('sum(percentual):Q',
                                  title='% do total de discentes')
        alt_chart_stacked = alt.Chart(
            df_chart[df_chart['tipo'] != 'Total']).mark_bar().encode(
                **chart_params)
        line = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule().encode(y='y')

        st.altair_chart(alt_chart + alt_chart_stacked + line,
                        use_container_width=True)
Example #4
        svr_cv.predict(X_test)[i] + 1.5,
        y_test.to_numpy().ravel()[i] + 1.5
    ])
    data.append([
        'k Neighbours Regressor',
        knr_cv.predict(X_test)[i][0] + 1.5,
        y_test.to_numpy().ravel()[i] + 1.5
    ])
    data.append([
        'Random Forest Regressor',
        rfr_cv.predict(X_test)[i] + 1.5,
        y_test.to_numpy().ravel()[i] + 1.5
    ])
df = pd.DataFrame(data, columns=['Model', 'Predicted Age', 'Observed Age'])

df_line = pd.DataFrame([[0, 0], [25, 25]], columns=['x', 'y'])

scatter = alt.Chart(df, title='Predicted Age vs. Observed Age').mark_circle(
    opacity=0.3).encode(x=alt.X('Observed Age:Q'),
                        y=alt.Y('Predicted Age:Q'),
                        color=alt.Color('Model:N')).properties(width=1000,
                                                               height=500)

line = alt.Chart(df_line).mark_line().encode(
    x=alt.X('x:Q'), y=alt.Y('y:Q'),
    color=alt.value('red')).properties(width=1000, height=500)

line + scatter

# SVR does best as it has the lowest root mean squared error. However, the models are very similar in their performances.
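# A minimal sketch of how the RMSE comparison behind the comment above could be
# computed; it assumes svr_cv, knr_cv, rfr_cv, X_test and y_test from the
# surrounding (truncated) code, and is not necessarily how the original author
# did it.
import numpy as np
from sklearn.metrics import mean_squared_error

y_true = y_test.to_numpy().ravel()
for model_name, model in [('Support Vector Regressor', svr_cv),
                          ('k Neighbours Regressor', knr_cv),
                          ('Random Forest Regressor', rfr_cv)]:
    # squared=False returns the root mean squared error
    rmse = mean_squared_error(y_true,
                              np.asarray(model.predict(X_test)).ravel(),
                              squared=False)
    print(f'{model_name}: RMSE = {rmse:.3f}')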
    'Country name',
    df.groupby('Country').count().reset_index()['Country'].tolist())

# by country name
if len(country_name_input) > 0:
    subset_data = df[df['Country'].isin(country_name_input)]

########################################################################################
## linechart

st.subheader('Comparison of infection growth')

total_cases_graph = alt.Chart(subset_data).transform_filter(
    alt.datum.total_cases > 0).mark_line().encode(
        x=alt.X('date', type='temporal', title='Date'),
        y=alt.Y('sum(total_cases):Q', title='Confirmed cases'),
        color='Country',
        tooltip='sum(total_cases)',
    ).properties(width=1500, height=600).configure_axis(labelFontSize=17,
                                                        titleFontSize=20)

st.altair_chart(total_cases_graph)

########################################################################################
### SELECTBOX widgets
metrics = ['total_cases', 'new_cases', 'total_deaths', 'new_deaths']
#  'total_cases_per_million', 'new_cases_per_million',
# 'total_deaths_per_million', 'new_deaths_per_million', 'total_tests', 'new_tests', 'total_tests_per_thousand',
# 'new_tests_per_thousand']

mini_df = df[metrics + ['date', "Longitude", "Latitude"]]
Example #6
def project_progress(progress_df,
                     width=800,
                     heights=(50, 400),
                     line_size=5,
                     text_size=15,
                     opacity=0.3):
    """Creates an interactive project progress exploration chart.

    It lets you choose the resource you want to see ('experiment_count_day' or 'running_time_day'),
    shows the metric/id/tags for every experiment on mouseover, and lets you select the x range you want to
    investigate by brushing it on the top chart; the actual values are shown on mouseover.

    The chart is built on top of Altair, which in turn is built on top of Vega-Lite and Vega.
    That means you can use the objects produced by this script (after converting them to JSON with the .to_json() method)
    in your HTML webpage without any problem.

    Args:
        progress_df('pandas.DataFrame'): Dataframe containing ['id', 'metric', 'metric_best', 'running_time',
            'running_time_day', 'experiment_count_day', 'owner', 'tags', 'timestamp', 'timestamp_day'].
            It can be obtained from a list of experiments by using the
            `neptunecontrib.api.extract_project_progress_info` function.
            If the length of the dataframe exceeds 5000 rows, a MaxRowsError will be raised.
            Read the Note to learn why and how to disable it.
        width(int): width of the chart. Default is 800.
        heights(tuple): heights of the subcharts. The first value controls the top chart, the second
            controls the bottom chart. Default is (50,400).
        line_size(int): size of the lines. Default is 5.
        text_size(int): size of the text containing metric/id/tags in the middle. Default is 15.
        opacity(float): opacity of the resource bars in the background. Default is 0.3.

    Returns:
        `altair.Chart`: Altair chart object which will be automatically rendered in the notebook. You can
        also run the `.to_json()` method on it to convert it to the Vega-Lite json format.

    Examples:
        Instantiate a session::

            from neptunelib.api.session import Session
            session = Session()

        Fetch a project and the experiment view of that project::

            project = session.get_projects('neptune-ai')['neptune-ai/Salt-Detection']
            leaderboard = project.get_leaderboard()

        Create a progress info dataframe::

            from neptunecontrib.api.utils import extract_project_progress_info
            progress_df = extract_project_progress_info(leaderboard,
                                                        metric_colname='channel_IOUT',
                                                        time_colname='finished')

        Plot interactive chart in notebook::

            from neptunecontrib.viz.projects import project_progress
            project_progress(progress_df)

    Note:
        Because Vega-Lite visualizations keep all the chart data in the HTML, they can consume huge
        amounts of memory if not handled properly. That is why a hard limit of 5000 rows is imposed on
        the length of the dataframe by default. That said, you can disable it by adding the following lines in the notebook or code::

            import altair as alt
            alt.data_transformers.enable('default', max_rows=None)

    """
    top_height, bottom_height = heights

    progress_df = _prep_progress_df(progress_df)

    nearest = alt.selection(type='single',
                            nearest=True,
                            on='mouseover',
                            fields=['timestamp'],
                            empty='none')
    brush = alt.selection(type='interval', encodings=['x'])
    exp_box = alt.binding_select(
        options=['running_time_day', 'experiment_count_day'])
    exp_selection = alt.selection_single(name='select',
                                         fields=['resource'],
                                         bind=exp_box)

    top_view = alt.Chart(height=top_height, width=width).mark_line(
        interpolate='step-after', size=line_size).encode(
            x='timestamp:T',
            y=alt.Y('metric:Q', scale=alt.Scale(zero=False), axis=None),
            color=alt.Color(
                'actual_or_best:N',
                legend=alt.Legend(title='Metric actual or current best')),
        ).add_selection(brush)

    selectors = alt.Chart().mark_point().encode(
        x=alt.X('timestamp:T'),
        opacity=alt.value(0),
    ).add_selection(nearest).transform_filter(brush)
    line = alt.Chart().mark_line(
        interpolate='step-after', size=line_size).encode(
            x=alt.X('timestamp:T'),
            y=alt.Y('metric:Q', scale=alt.Scale(zero=False)),
            color=alt.Color(
                'actual_or_best:N',
                legend=alt.Legend(title='Metric actual or current best')),
        ).transform_filter(brush)
    points = line.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0)))
    text = line.mark_text(align='left', dx=5, dy=-5,
                          size=text_size).encode(text=alt.condition(
                              nearest, 'metric:Q', alt.value(' ')),
                                                 color='actual_or_best:N')
    rules = alt.Chart().mark_rule(color='gray').encode(
        x=alt.X('timestamp:T'), ).transform_filter(nearest)
    metrics = alt.layer(line, points, text, rules, selectors).properties(
        height=bottom_height,
        width=width,
    )

    exp_selector = alt.Chart().mark_area().encode(
        x=alt.X('timestamp:T'),
        opacity=alt.value(0),
    ).add_selection(exp_selection).transform_filter(
        exp_selection).transform_filter(brush)
    exp_line = alt.Chart().mark_area(interpolate='step-after').encode(
        x=alt.X('timestamp:T'),
        y=alt.Y('time_or_count:Q', scale=alt.Scale(zero=False)),
        color=alt.ColorValue('red'),
        opacity=alt.OpacityValue(opacity)).transform_filter(
            brush).transform_filter(exp_selection)
    exp_points = exp_line.mark_point(filled=True).encode(
        color=alt.ColorValue('black'),
        opacity=alt.condition(nearest, alt.value(1), alt.value(0)))
    exp_text = exp_line.mark_text(
        align='left', dx=5, dy=-5, fontWeight='bold',
        size=text_size).encode(text=alt.condition(nearest, 'time_or_count:Q',
                                                  alt.value(' ')),
                               color=alt.ColorValue('black'))
    exp_rules = alt.Chart().mark_rule(color='gray').encode(
        x=alt.X('timestamp:T'), ).transform_filter(nearest)
    exps = alt.layer(exp_line, exp_points, exp_rules, exp_text,
                     exp_selector).properties(
                         height=bottom_height,
                         width=width,
                     )

    main_view = alt.layer(exps, metrics).properties(
        height=bottom_height,
        width=width,
    ).resolve_scale(y='independent')

    tags = alt.Chart(height=1, width=1).mark_text(align='left',
                                                  size=text_size,
                                                  fontWeight='bold').encode(
                                                      x=alt.X('timestamp:T',
                                                              axis=None),
                                                      text=alt.condition(
                                                          nearest, 'text:N',
                                                          alt.value(' ')),
                                                  )

    combined = alt.vconcat(top_view, tags, main_view, data=progress_df)
    return combined
def make_plot(race='Black', place='Hospital'):
    # Don't forget to include imports

    def mds_special():
        font = "Arial"
        axisColor = "#000000"
        gridColor = "#DEDDDD"
        return {
            "config": {
                "title": {
                    "fontSize": 24,
                    "font": font,
                    "anchor": "start",  # equivalent of left-aligned.
                    "fontColor": "#000000"
                },
                'view': {
                    "height": 300,
                    "width": 400
                },
                "axisX": {
                    "domain": True,
                    #"domainColor": axisColor,
                    "gridColor": gridColor,
                    "domainWidth": 1,
                    "grid": False,
                    "labelFont": font,
                    "labelFontSize": 12,
                    "labelAngle": 0,
                    "tickColor": axisColor,
                    "tickSize":
                    5,  # default, including it just to show you can change it
                    "titleFont": font,
                    "titleFontSize": 16,
                    "titlePadding":
                    10,  # guessing, not specified in styleguide
                    "title": "X Axis Title (units)",
                },
                "axisY": {
                    "domain": False,
                    "grid": True,
                    "gridColor": gridColor,
                    "gridWidth": 1,
                    "labelFont": font,
                    "labelFontSize": 14,
                    "labelAngle": 0,
                    #"ticks": False, # even if you don't have a "domain" you need to turn these off.
                    "titleFont": font,
                    "titleFontSize": 16,
                    "titlePadding":
                    10,  # guessing, not specified in styleguide
                    "title": "Y Axis Title (units)",
                    # titles are by default vertical left of axis so we need to hack this
                    #"titleAngle": 0, # horizontal
                    #"titleY": -10, # move it up
                    #"titleX": 18, # move it to the right so it aligns with the labels
                },
            }
        }

    # register the custom theme under a chosen name
    alt.themes.register('mds_special', mds_special)

    # enable the newly registered theme
    alt.themes.enable('mds_special')
    #alt.themes.enable('none') # to return to default

    # Need to enable this to allow work with larger datasets (https://altair-viz.github.io/user_guide/faq.html)
    # alt.data_transformers.enable('json')
    alt.data_transformers.disable_max_rows()

    #################################################################### READING THE DATA #########################################################################################
    drug_overdose_wrangled_m = pd.read_csv(
        "../data/2012-2018_lab4_data_drug-overdose-deaths-connecticut-wrangled-melted.csv"
    )  # FOR THE BAR CHART

    drug_overdose_wrangled_p = pd.read_csv(
        "../data/2012-2018_lab4_data_drug-overdose-deaths-connecticut-wrangled-pivot.csv"
    )  # FOR THE LINE CHART

    ##################################################################### FILTERING BY race and place of death ####################################################################
    ####################################### HERE is Where we are taking the inputs of the function to update the data ##############################################################
    by_race_place = drug_overdose_wrangled_m[
        (drug_overdose_wrangled_m['Race'] == race) &
        (drug_overdose_wrangled_m['Location'] == place)]  # FOR THE BAR CHART
    by_race_place_p = drug_overdose_wrangled_p[
        (drug_overdose_wrangled_p['Race'] == race) &
        (drug_overdose_wrangled_p['Location'] == place)]  # FOR THE LINE CHART

    # WRANGLING
    drug_overdose_mpdrug = by_race_place.groupby(['Drug']).sum().drop(columns = 'Age')\
                                               .sort_values('Toxicity_test', ascending = False).reset_index()

    ######################### BAR PLOT using the filtered data ################################

    mp_drug = alt.Chart(drug_overdose_mpdrug).mark_bar(
        opacity=0.8, color='teal').encode(
            alt.Y('Drug:N',
                  title='',
                  sort=alt.EncodingSortField(field='Toxicity_test',
                                             order='descending')),
            alt.X('Toxicity_test:Q', title='Times a drug tested positive'),
            tooltip=[
                alt.Tooltip('Drug', title='Drug'),
                alt.Tooltip('Toxicity_test', title='Positives')
            ]).properties(width=200, height=400, title='Drugs in test')
    ######################### LINE PLOT  filtered data ######################################
    trend_AFTER = alt.Chart(by_race_place_p).mark_line(point=True).encode(
        alt.X('year(Date):O', title='Reported year of death'),
        alt.Y('count()',
              title='Count of people',
              scale=alt.Scale(domain=(0, 2000))),
        tooltip=[
            alt.Tooltip('year(Date)', title='Year'),
            alt.Tooltip('count()', title='Count of people')
        ]).properties(width=200, height=400, title="  Trend")

    return (mp_drug | trend_AFTER)
df = df.dropna()


# In[31]:


df


# In[5]:


alt.Chart(draft07).mark_point().encode(
    alt.X('PTS'),
    alt.Y('G'),
    alt.Size('Pk'),
    tooltip = [alt.Tooltip('Player'),
               alt.Tooltip('PTS'),
               alt.Tooltip('G'),
               alt.Tooltip('Pk')
              ]
).interactive()


# In[6]:


alt.Chart(draft07).mark_point().encode(
    x = "G",
    y = "PTS",
def carrier_delay():
    st.header("Is there any carrier that is more likely to delay?")
    """
    Carrier Delay represents the delay caused by the air carrier.
    Possible causes are: aircraft cleaning, aircraft damage, baggage handling, etc.

    """

    carrier_delay = alt.Chart(df).mark_bar().transform_filter(
        alt.datum['CARRIER_DELAY'] > 0).encode(
            x=alt.X("OP_CARRIER", sort='-y', title='Carrier'),
            y=alt.Y("average(CARRIER_DELAY)",
                    scale=alt.Scale(zero=False),
                    title='Average Carrier Delay'),
            tooltip=[
                alt.Tooltip("OP_CARRIER", title='Carrier'),
                alt.Tooltip("average(CARRIER_DELAY)",
                            title='Average Carrier Delay')
            ]).properties(width=600, height=250)
    st.write(carrier_delay)

    picked = alt.selection_interval()

    carrier_delay_dist = alt.Chart(df).mark_point(
    ).transform_filter(alt.datum['CARRIER_DELAY'] > 0).transform_calculate(
        CARRIER_DELAY="datum.CARRIER_DELAY < 350 ? datum.CARRIER_DELAY : 350",
    ).encode(x=alt.X("OP_CARRIER"),
             y=alt.Y("CARRIER_DELAY", scale=alt.Scale(zero=False)),
             order=alt.Order('average(CARRIER_DELAY)', sort='descending'),
             tooltip=['OP_CARRIER', 'CARRIER_DELAY', 'ARR_DELAY'
                      ]).properties(width=600,
                                    height=100).add_selection(picked)

    # st.write()

    # st.write(carrier_delay_dist.add_selection(picked) & carrier_delay.transform_filter(picked))
    # binding selection
    # input_dropdown = alt.binding_select(options=carrier_names, name="Carrier ")
    # dd_select = alt.selection_single(encodings=['color'], bind=input_dropdown)

    ## carrier delay vs. arrival delay
    "**Is there any relationship between Carrier Delay and other type delays?**"
    select = alt.selection_single(on='mouseover', fields=['OP_CARRIER'])

    delay_type_input = st.selectbox(
        "Show correlation between Carrier Delay and ",
        ('Arrival Delay', 'Departure Delay', 'Weather Delay', 'Nas Delay',
         'Security Delay', 'Late Aircraft Delay'))
    delay_type = get_delay_type(delay_type_input)

    carrier_vs_arr = alt.Chart(df).mark_point(
    ).transform_filter(alt.datum['CARRIER_DELAY'] >= 0).transform_calculate(
        SELECTED_DELAY=f"datum.{delay_type} < 350 ? datum.{delay_type} : 350",
        CARRIER_DELAY="datum.CARRIER_DELAY < 350 ? datum.CARRIER_DELAY : 350",
    ).encode(x='SELECTED_DELAY:Q',
             y=alt.Y("CARRIER_DELAY"),
             color=alt.Color('OP_CARRIER'),
             tooltip=[delay_type, 'CARRIER_DELAY',
                      'OP_CARRIER']).properties(width=600, height=300)
    """
    You can select an interval in the graph below to explore carrier delays within a specific range.
    """
    st.write(
        carrier_delay_dist & carrier_vs_arr.transform_filter(picked).encode(
            color=alt.condition(select, "OP_CARRIER:N", alt.value(
                'lightgray'))).add_selection(select))
Example #10
})
df_fi.sort_values(by="importance", ascending=False, inplace=True)
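
# +
# A hedged sketch of how the `importance` column of df_fi above could be
# produced with scikit-learn's permutation importance; rf, X_dev and y_dev are
# assumed from the surrounding code, and the exact construction of df_fi is
# truncated above, so this is illustrative rather than the author's code.
import pandas as pd
from sklearn.inspection import permutation_importance

perm = permutation_importance(rf, X_dev, y_dev,
                              scoring="neg_root_mean_squared_error",
                              n_repeats=10, random_state=0)
df_fi_sketch = pd.DataFrame({
    "feature": X_dev.columns,
    "importance": perm.importances_mean,
}).sort_values(by="importance", ascending=False)
# -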

# +
from sklearn.metrics import mean_squared_error

rmse_dev = mean_squared_error(y_dev, rf.predict(X_dev), squared=False)
print(f"RMSE on the dev set: {rmse_dev:0.4f}")
# -

# The following bar chart shows the permutation feature importance. If the bar is orange, then the permutation importance of the corresponding feature is smaller than 1% of the RMSE on the dev set.

alt.Chart(df_fi)\
   .mark_bar()\
   .encode(x=alt.X("importance:Q"), 
           y=alt.Y("feature:N", sort="-x"),
           tooltip=["feature:N", alt.Tooltip("importance:Q", format="0.5f")],
           color=alt.condition(alt.datum.importance > 0.01*rmse_dev, 
                               alt.value("#1f77b4"), alt.value("#ff7f0e")))\
   .properties(height=1024, title="Permutation feature importance")


# +
def scatter_plot(data: pd.DataFrame, field: str, width: int = 240) -> alt.Chart:
    chart = alt.Chart(data[[field, target]])\
               .mark_circle()\
               .encode(x=alt.X(field, scale=alt.Scale(zero=False), title=None), 
                       y=alt.Y(target, scale=alt.Scale(type="log"), title=f"Log{target}"))\
               .properties(width=width, title=field)
    return chart
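
# Hypothetical usage of the scatter_plot helper above; the feature names here
# are placeholders rather than columns from the original data:
#   panels = [scatter_plot(df_dev, f) for f in ["feature_a", "feature_b"]]
#   alt.hconcat(*panels)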
).transform_window(
    first_date='first_value(scaled_date)',
    last_date='last_value(scaled_date)',
    sort=[{"field": "scaled_date", "order": "ascending"}],
    groupby=['decade'],
    frame=[None, None]
).transform_calculate(
  end="datum.first_date === datum.scaled_date ? 'first' : datum.last_date === datum.scaled_date ? 'last' : null"
).encode(
    x=alt.X(
        "scaled_date:Q",
        axis=alt.Axis(title="Year into Decade", tickCount=11)
    ),
    y=alt.Y(
        "CO2:Q",
        title="CO2 concentration in ppm",
        scale=alt.Scale(zero=False)
    )
)

line = base.mark_line().encode(
    color=alt.Color(
        "decade:O",
        scale=alt.Scale(scheme="magma"),
        legend=None
    )
)

text = base.encode(text="year:N")

start_year = text.transform_filter(
Example #12
def analyse_view_clones_ts_fragments():

    log.info("read views/clones time series fragments (CSV docs)")

    basename_suffix = "_views_clones_series_fragment.csv"
    csvpaths = _glob_csvpaths(basename_suffix)

    dfs = []
    column_names_seen = set()

    for p in csvpaths:
        log.info("attempt to parse %s", p)
        snapshot_time = _get_snapshot_time_from_path(p, basename_suffix)

        df = pd.read_csv(
            p,
            index_col=["time_iso8601"],
            date_parser=lambda col: pd.to_datetime(col, utc=True),
        )

        # Skip logic for empty data frames. The CSV files written should never
        # be empty, but if such a bad file made it into the file system then
        # skipping it here facilitates debugging and enhances robustness.
        if len(df) == 0:
            log.warning("empty dataframe parsed from %s, skip", p)
            continue

        # A time series fragment might look like this:
        #
        # df_views_clones:
        #                            clones_total  ...  views_unique
        # time_iso8601                             ...
        # 2020-12-21 00:00:00+00:00           NaN  ...             2
        # 2020-12-22 00:00:00+00:00           2.0  ...            23
        # 2020-12-23 00:00:00+00:00           2.0  ...            20
        # ...
        # 2021-01-03 00:00:00+00:00           8.0  ...            21
        # 2021-01-04 00:00:00+00:00           7.0  ...            18
        #
        # Note the NaN and the floaty type.

        # All metrics are known to be integers by definition here. NaN values
        # are expected to be present anywhere in this dataframe, and they
        # semantically mean "0". Therefore, replace those with zeros. Also see
        # https://github.com/jgehrcke/github-repo-stats/issues/4
        df = df.fillna(0)
        # Make sure numbers are treated as integers from here on. This actually
        # matters in a cosmetic way only for outputting the aggregate CSV later
        # (not for plotting and number crunching).
        df = df.astype(int)

        # attach snapshot time as meta data prop to df
        df.attrs["snapshot_time"] = snapshot_time

        # The index is not of string type anymore, but of type
        # `pd.DatetimeIndex`. Reflect that in the name.
        df.index.rename("time", inplace=True)

        if column_names_seen and set(df.columns) != column_names_seen:
            log.error("columns seen so far: %s", column_names_seen)
            log.error("columns in %s: %s", p, df.columns)
            sys.exit(1)

        column_names_seen.update(df.columns)

        df = df.sort_index()

        # Sanity check: snapshot time _after_ latest timestamp in time series?
        # This could hit on a machine with a bad clock setting when fetching
        # data.
        if df.index.max() > snapshot_time:
            log.error(
                "for CSV file %s the snapshot time %s is older than the newest sample",
                p,
                snapshot_time,
            )
            sys.exit(1)

        dfs.append(df)

    # for df in dfs:
    #     print(df)

    log.info("total sample count: %s", sum(len(df) for df in dfs))

    if len(dfs) == 0:
        log.info("leave early: no data for views/clones")
        return

    newest_snapshot_time = max(df.attrs["snapshot_time"] for df in dfs)

    df_prev_agg = None
    if ARGS.views_clones_aggregate_inpath:
        if os.path.exists(ARGS.views_clones_aggregate_inpath):
            log.info("read previous aggregate: %s",
                     ARGS.views_clones_aggregate_inpath)
            df_prev_agg = pd.read_csv(
                ARGS.views_clones_aggregate_inpath,
                index_col=["time_iso8601"],
                date_parser=lambda col: pd.to_datetime(col, utc=True),
            )
            df_prev_agg.index.rename("time", inplace=True)
        else:
            log.info(
                "previous aggregate file does not exist: %s",
                ARGS.views_clones_aggregate_inpath,
            )

    log.info("time of newest snapshot: %s", newest_snapshot_time)
    log.info("build aggregate, drop duplicate data")

    # Each dataframe in `dfs` corresponds to one time series fragment
    # ("snapshot") obtained from the GitHub API. Each time series fragment
    # contains 15 samples (rows), with two adjacent samples being 24 hours
    # apart. Ideally, the time series fragments overlap in time. They overlap
    # potentially by a lot, depending on when the individual snapshots were
    # taken (think: take one snapshot per day; then 14 out of 15 data points
    # are expected to be "the same" as in the snapshot taken the day before).
    # Stitch these fragments together (with a bunch of duplicate samples), and
    # then sort the result by time.
    log.info("pd.concat(dfs)")
    dfall = pd.concat(dfs)

    if df_prev_agg is not None:
        if set(df_prev_agg.columns) != set(dfall.columns):
            log.error(
                "set(df_prev_agg.columns) != set (dfall.columns): %s, %s",
                df_prev_agg.columns,
                dfall.columns,
            )
            sys.exit(1)
        log.info("pd.concat(dfall, df_prev_agg)")
        dfall = pd.concat([dfall, df_prev_agg])

    dfall.sort_index(inplace=True)

    log.info("shape of dataframe before dropping duplicates: %s", dfall.shape)
    # print(dfall)

    # Now, the goal is to drop duplicate data. And again, as of a lot of
    # overlap between snapshots there's a lot of duplicate data to be expected.
    # What does "duplicat data" mean? We expect that there are multiple samples
    # from different snapshots with equivalent timestamp. OK, we should just
    # take any one of them. They should all be the same, right? They are not
    # all equivalent. I've found that at the boundaries of each time series
    # fragment, the values returned by the GitHub API are subject to a
    # non-obvious cutoff effect: for example, in a snapshot obtained on Dec 15,
    # the sample for Dec 7 is within the mid part of the fragment and shows a
    # value of 73 for `clones_total`. The snapshot obtained on Dec 21 has the
    # sample for Dec 7 at the boundary (left-hand, towards the past), and that
    # shows a value of 18 for `clones_total`. 73 vs 18 -- how is that possible?
    # That's easily possible, assuming that GitHub uses a rolling window of 14
    # days width with a precision higher than 1 day, so that the cutoff for
    # the data points at the boundary depends on the _exact time_ when the
    # snapshot was taken. That is, for aggregation (for dropping duplicate/bad
    # data) we want to look for the maximum data value for any given timestamp.
    # Using that method, we effectively ignore said cutoff artifact. In short:
    # group by timestamp (index), take the maximum.
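    # For instance, two overlapping fragments may both contain a sample for
    # Dec 7: one (from the Dec 15 snapshot) with clones_total=73 and one (from
    # the Dec 21 snapshot, where Dec 7 sits at the fragment boundary) with
    # clones_total=18. The groupby().max() below keeps the 73.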
    df_agg = dfall.groupby(dfall.index).max()

    log.info("shape of dataframe after dropping duplicates: %s", df_agg.shape)

    # Write aggregate
    # agg_fname = (
    #     datetime.strftime(newest_snapshot_time, "%Y-%m-%d_%H%M%S")
    #     + "_views_clones_aggregate.csv"
    # )
    # agg_fpath = os.path.join(ARGS.snapshotdir, agg_fname)
    if ARGS.views_clones_aggregate_outpath:

        if os.path.exists(ARGS.views_clones_aggregate_outpath):
            log.info("file exists: %s", ARGS.views_clones_aggregate_outpath)
            if not ARGS.views_clones_aggregate_inpath:
                log.error(
                    "would overwrite output aggregate w/o reading input aggregate -- you know what you're doing?"
                )
                sys.exit(1)

        log.info("write aggregate to %s", ARGS.views_clones_aggregate_outpath)
        # Pragmatic strategy against partial write / encoding problems.
        tpath = ARGS.views_clones_aggregate_outpath + ".tmp"
        df_agg.to_csv(tpath, index_label="time_iso8601")
        os.rename(tpath, ARGS.views_clones_aggregate_outpath)

        if ARGS.delete_ts_fragments:
            # Iterate through precisely the set of files that was read above.
            # If unlinking fails at the OS boundary then don't crash this program.
            for p in csvpaths:
                log.info("delete %s as of --delete-ts-fragments", p)
                try:
                    os.unlink(p)
                except Exception as e:
                    log.warning("could not unlink %s: %s", p, str(e))

    # print(df_agg)

    # matplotlib_config()
    # log.info("aggregated sample count: %s", len(df_agg))
    # df_agg.plot(
    #     linestyle="solid",
    #     marker="o",
    #     markersize=5,
    #     subplots=True,
    #     # ylabel="count",
    #     xlabel="",
    #     # logy="sym",
    # )
    # plt.ylim([0, None])
    # plt.tight_layout()
    # plt.show()

    # Why reset_index()? See
    # https://github.com/altair-viz/altair/issues/271#issuecomment-573480284
    df_agg = df_agg.reset_index()
    df_agg_views = df_agg.drop(columns=["clones_unique", "clones_total"])
    df_agg_clones = df_agg.drop(columns=["views_unique", "views_total"])

    PANEL_WIDTH = "container"
    PANEL_HEIGHT = 200

    panel_props = {"height": PANEL_HEIGHT, "width": PANEL_WIDTH, "padding": 10}

    chart_clones_unique = ((alt.Chart(df_agg_clones).mark_line(
        point=True).encode(
            alt.X("time",
                  type="temporal",
                  title="date",
                  timeUnit="yearmonthdate"),
            alt.Y(
                "clones_unique",
                type="quantitative",
                title="unique clones per day",
                scale=alt.Scale(
                    domain=(0, df_agg_clones["clones_unique"].max() * 1.1),
                    zero=True,
                ),
            ),
        )).configure_axisY(labelBound=True).configure_point(
            size=100).properties(**panel_props))

    chart_clones_total = ((alt.Chart(df_agg_clones).mark_line(
        point=True).encode(
            alt.X("time",
                  type="temporal",
                  title="date",
                  timeUnit="yearmonthdate"),
            alt.Y(
                "clones_total",
                type="quantitative",
                title="total clones per day",
                scale=alt.Scale(
                    domain=(0, df_agg_clones["clones_total"].max() * 1.1),
                    zero=True,
                ),
            ),
        )).configure_axisY(labelBound=True).configure_point(
            size=100).properties(**panel_props))

    chart_views_unique = ((alt.Chart(df_agg_views).mark_line(
        point=True).encode(
            alt.X("time",
                  type="temporal",
                  title="date",
                  timeUnit="yearmonthdate"),
            alt.Y(
                "views_unique",
                type="quantitative",
                title="unique views per day",
                scale=alt.Scale(
                    domain=(0, df_agg_views["views_unique"].max() * 1.1),
                    zero=True,
                ),
            ),
        )).configure_axisY(labelBound=True).configure_point(
            size=100).properties(**panel_props))

    chart_views_total = ((alt.Chart(df_agg_views).mark_line(point=True).encode(
        alt.X("time", type="temporal", title="date", timeUnit="yearmonthdate"),
        alt.Y(
            "views_total",
            type="quantitative",
            title="total views per day",
            scale=alt.Scale(
                domain=(0, df_agg_views["views_total"].max() * 1.1),
                zero=True,
            ),
        ),
    )).configure_axisY(labelBound=True).configure_point(size=100).properties(
        **panel_props))

    chart_views_unique_spec = chart_views_unique.to_json(indent=None)
    chart_views_total_spec = chart_views_total.to_json(indent=None)
    chart_clones_unique_spec = chart_clones_unique.to_json(indent=None)
    chart_clones_total_spec = chart_clones_total.to_json(indent=None)

    MD_REPORT.write(
        textwrap.dedent("""


    ## Views

    #### Unique visitors
    <div id="chart_views_unique" class="full-width-chart"></div>

    #### Total views
    <div id="chart_views_total" class="full-width-chart"></div>

    <div class="pagebreak-for-print"> </div>


    ## Clones

    #### Unique cloners
    <div id="chart_clones_unique" class="full-width-chart"></div>

    #### Total clones
    <div id="chart_clones_total" class="full-width-chart"></div>

    """))
    JS_FOOTER_LINES.extend([
        f"vegaEmbed('#chart_views_unique', {chart_views_unique_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);",
        f"vegaEmbed('#chart_views_total', {chart_views_total_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);",
        f"vegaEmbed('#chart_clones_unique', {chart_clones_unique_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);",
        f"vegaEmbed('#chart_clones_total', {chart_clones_total_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);",
    ])
Example #13
def analyse_top_x_snapshots(entity_type):
    assert entity_type in ["referrer", "path"]

    log.info("read 'top %s' snapshots (CSV docs)", entity_type)
    basename_suffix = f"_top_{entity_type}s_snapshot.csv"
    csvpaths = _glob_csvpaths(basename_suffix)
    snapshot_dfs = _get_snapshot_dfs(csvpaths, basename_suffix)

    # for df in snapshot_dfs:
    #     print(df)

    # Keep in mind: an entity_type is either a top 'referrer', or a top 'path'.
    # Find all entities seen across snapshots, by their name. For type 'referrer',
    # a specific entity (referrer) name might be `github.com`.

    def _get_uens(snapshot_dfs):
        unique_entity_names = set()
        for df in snapshot_dfs:
            unique_entity_names.update(df[entity_type].values)

        return unique_entity_names

    unique_entity_names = _get_uens(snapshot_dfs)
    log.info("all %s entities seen: %s", entity_type, unique_entity_names)

    # Clarification: each snapshot dataframe corresponds to a single point in
    # time (the snapshot time) and contains information about multiple top
    # referrers/paths. Now, invert that structure: work towards individual
    # dataframes where each dataframe corresponds to a single referrer/path,
    # and contains information about multiple timestamps.

    # First, create a dataframe containing all information.
    dfa = pd.concat(snapshot_dfs)

    if len(dfa) == 0:
        log.info("leave early: no data for entity of type %s", entity_type)
        return

    # Build a dict: key is path/referrer name, and value is DF with
    # corresponding raw time series.
    entity_dfs = _build_entity_dfs(dfa, entity_type, unique_entity_names)

    # It's important to clarify what each data point in a per-referrer raw time
    # series means. Each data point has been returned by the GitHub traffic
    # API. Each sample (row in the df) can, I think, be looked at as the
    # result of a rolling window analysis that shows cumulative values
    # summed up over a period of 14 days, noted at the _right edge_ of the
    # rolling time window.

    # This needs further verification, but I think the boundaries of the time
    # window actually move with sub-day resolution, i.e. the same query
    # performed within the same day may yield different outcomes. If that's
    # true, the rolling time window analysis performed internally at GitHub can
    # be perfectly inverted, yielding per-referrer traffic statistics at a
    # sub-day time resolution. That of course will require predictable,
    # periodic sampling. Let's keep that in mind for now.

    # One interesting way to look at the data: find the top 5 referrers based
    # on unique views, and for the entire time range seen.

    max_vu_map = {}
    for ename, edf in entity_dfs.items():
        max_vu_map[ename] = edf["views_unique"].max()
    del ename

    # Sort dict so that the first item is the referrer/path with the highest
    # views_unique seen.
    sorted_dict = {
        k: v
        for k, v in sorted(
            max_vu_map.items(), key=lambda i: i[1], reverse=True)
    }

    top_n = 10
    top_n_enames = list(sorted_dict.keys())[:top_n]

    # simulate a case where there are different timestamps across per-referrer
    # dfs: copy a 'row', and re-insert it with a different timestamp.
    # row = referrer_dfs["t.co"].take([-1])
    # print(row)
    # referrer_dfs["t.co"].loc["2020-12-30 12:25:08+00:00"] = row.iloc[0]
    # print(referrer_dfs["t.co"])

    df_top_vu = pd.DataFrame()
    for ename in top_n_enames:
        edf = entity_dfs[ename]
        # print(edf)
        df_top_vu[ename] = edf["views_unique"]
    # del ename

    log.info(
        "The top %s %s based on unique views, for the entire time range seen:\n%s",
        top_n,
        entity_type,
        df_top_vu,
    )

    # For plotting with Altair, reshape the data using pd.melt() to combine the
    # multiple columns into one, where the referrer name is not a column label,
    # but a value in a column. Or we could use the
    # transform_fold() technique
    # https://altair-viz.github.io/user_guide/data.html#converting-between-long-form-and-wide-form-pandas
    # with .transform_fold(top_n_rnames, as_=["referrer", "views_unique"])
    # Also copy index into a normal column via `reset_index()` for
    # https://altair-viz.github.io/user_guide/data.html#including-index-data
    df_melted = df_top_vu.melt(var_name=entity_type,
                               value_name="views_unique",
                               ignore_index=False).reset_index()
    # print(df_melted)

    # Normalize main metric to show a view count _per day_, and clarify in the
    # plot that this is a _mean_ value derived from the _last 14 days_.
    df_melted["views_unique_norm"] = df_melted["views_unique"] / 14.0

    # For paths, it's relevant to identify the common prefix (repo owner/name)

    # cmn_ename_prefix = os.path.commonprefix(list(unique_entity_names))
    # log.info("cmn_ename_prefix: %s", cmn_ename_prefix)

    # if entity_type == "path":
    #     log.info("remove common path prefix")
    #     df_melted["path"] = df_melted["path"].str.slice(start=len(cmn_ename_prefix))
    #     # The root path (e.g., `owner/repo`) is not an empty string. That's
    #     # not so cool, make the root be represented by a single slash.
    #     # df_melted[df_melted["path"] == ""]["path"] = "/"
    #     df_melted["path"].replace("", "/", inplace=True)

    panel_props = {"height": 300, "width": "container", "padding": 10}
    chart = (
        alt.Chart(df_melted).mark_line(point=True)
        # .encode(x="time:T", y="views_unique:Q", color="referrer:N")
        # the pandas dataframe datetimeindex contains timing information at
        # much higher resolution than 1 day. The resulting vega spec may
        # then see time values like this: `"time": "2021-01-03T00:00:00+00:00"`
        # -- suggesting to vega that we care about showing hours and minutes.
        # instruct Vega to only care about _days_ (dates), via an Altair-based
        # time unit transformation. Ref:
        # https://altair-viz.github.io/user_guide/transform/timeunit.html
        .encode(
            alt.X("time",
                  type="temporal",
                  title="date",
                  timeUnit="yearmonthdate"),
            alt.Y(
                "views_unique_norm",
                type="quantitative",
                title="unique visitors per day (mean from last 14 days)",
                scale=alt.Scale(
                    domain=(0, df_melted["views_unique_norm"].max() * 1.1),
                    zero=True,
                ),
            ),
            alt.Color(
                entity_type,
                type="nominal",
                sort=alt.SortField("order"),
            ),
        ).configure_point(size=50).properties(**panel_props))

    chart_spec = chart.to_json(indent=None)

    # From
    # https://altair-viz.github.io/user_guide/customization.html:
    # "Note that this will only scale with the container if its parent element
    # has a size determined outside the chart itself; For example, the
    # container may be a <div> element that has style width: 100%; height:
    # 300px."

    heading = "Top referrers" if entity_type == "referrer" else "Top paths"

    # Textual form: use a larger N and no cutoff (arbitrary-length names and
    # the plot legend don't go well with each other).
    top_n = 15
    top_n_enames = list(sorted_dict.keys())[:top_n]
    top_n_enames_string_for_md = ", ".join(
        f"{str(i).zfill(2)}: `{n}`" for i, n in enumerate(top_n_enames, 1))

    MD_REPORT.write(
        textwrap.dedent(f"""


    #### {heading}


    <div id="chart_{entity_type}s_top_n_alltime" class="full-width-chart"></div>

    Top {top_n} {entity_type}s: {top_n_enames_string_for_md}


    """))
    JS_FOOTER_LINES.append(
        f"vegaEmbed('#chart_{entity_type}s_top_n_alltime', {chart_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);"
    )
Exemple #14
0
def run_relationship_per_year_all_countries():
    @st.cache
    def get_by_year(df, year):
        return df[df['Year'] == year]

    st.markdown('''
		## How is an economy indicator associated with a health indicator among different countries?
		
		As argued in existing literature [1], "in the long term, growing economies are associated with longer and healthier lives," whereas
		"in the short term, that may not be the case—economic booms can boost mortality rates and busts can reduce them." Thus, it is particularly
		important and interesting to visualize the trends in economy and health of a country.

		In this section, we explore the relationship between any pair of a national economy indicator and a national health indicator for a specific year,
		among all countries. We also hope to show whether the relationship between the two selected indicators exhibits any
		correlation with life expectancy.
		
		## Let's look at the data
		
		Using the sidebar, you are free to choose 

		1. a specific year, 
		2. an economy indicator (one of Gini, GDP per capita, and unemployment rate),
		and finally 
		3. a health indicator (one of health expenditure as % of GDP and health expenditure per capita).
		
		This way, you can easily visualize the interaction between your selected pair of indicators.
		
		You can select a specific year to explore with the **slider** on the left sidebar.
	''')

    st.sidebar.header("Adjust Parameters")

    data, countries = load_merge_data()

    econ_factors = [
        'GDP per capita (current US$)',
        'Unemployment, total (% of total labor force)',
        'Gini',
    ]

    health_factors = [
        'Current health expenditure (% of GDP)',
        'Current health expenditure per capita (current US$)',
        #'Life expectancy at birth, total (years)',
    ]

    e_factor = st.sidebar.radio("Economics Factor", (econ_factors))
    h_factor = st.sidebar.radio("Health Factor", (health_factors))

    curr_data = dropna_by_feature(data, [e_factor, h_factor])

    max_year = curr_data['Year'].max().item()
    year = st.sidebar.select_slider("Year",
                                    options=list(
                                        np.sort(curr_data['Year'].unique())),
                                    value=max_year)

    curr_data = get_by_year(curr_data, year)
    #st.dataframe(curr_data[['Country Name', e_factor, h_factor]].assign(hack='').set_index('hack'))

    # plot an auxiliary life expectancy graph below

    # get max and min y
    max_life = curr_data['Life expectancy at birth, total (years)'].max().item(
    )
    min_life = curr_data['Life expectancy at birth, total (years)'].min().item(
    )

    # double click to clear brush
    brush = alt.selection_interval(encodings=['x'])
    highlight = alt.selection_single(encodings=['color'],
                                     on='mouseover',
                                     nearest=False,
                                     clear="mouseout")

    stripplot = alt.Chart(curr_data).mark_circle(size=40).encode(
        x=alt.X('Life expectancy at birth, total (years):Q',
                scale=alt.Scale(domain=(min_life, max_life))),
        y=alt.Y(
            'jitter:Q',
            title=None,
            axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
            scale=alt.Scale(),
        ),
        color=alt.Color('Country Name', legend=None),
        opacity=alt.condition(
            highlight, alt.value(0.7), alt.value(0.1))).transform_calculate(
                # Generate Gaussian jitter with a Box-Muller transform
                jitter='sqrt(-2*log(random()))*cos(2*PI*random())').properties(
                    width=700, height=50).add_selection(brush)
    #.transform_filter(highlight)
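    # Note on the jitter expression above: the Box-Muller transform turns two
    # independent Uniform(0, 1) draws U1, U2 into a standard normal value via
    # Z = sqrt(-2 * ln(U1)) * cos(2 * pi * U2), which is exactly the Vega
    # expression passed to transform_calculate().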

    # get max and min y
    maxy = curr_data[h_factor].max().item()
    miny = curr_data[h_factor].min().item()
    maxx = curr_data[e_factor].max().item()
    minx = curr_data[e_factor].min().item()
    plot = alt.Chart(curr_data).mark_point().encode(
        x=alt.X(e_factor, scale=alt.Scale(domain=(minx, maxx))),
        y=alt.Y(h_factor, scale=alt.Scale(domain=(miny, maxy))),
        color=alt.Color('Country Name', legend=None),
        tooltip=alt.Tooltip(['Country Name', e_factor,
                             h_factor])).transform_filter(brush).properties(
                                 width=700).add_selection(highlight)
    result = alt.vconcat(stripplot, plot)

    st.altair_chart(result, use_container_width=True)

    st.markdown('''
		The above graph consists of two charts.
		The upper chart displays a one-dimensional graph of life expectancy (we added some jitter to the vertical
		position of each data point, so that the points are not clumped together).
		The lower chart displays a scatter plot of your selected indicators in the chosen year for all countries.
		Its x-axis corresponds to your selected economy indicator, and its y-axis corresponds to your selected
		health indicator.
		In both sub-graphs, a dot represents one particular country.

		By moving your mouse over a specific dot (a specific country) in the lower chart, the name of the country and its exact values
		for the two indicators will be shown, and its corresponding life expectancy dot will be highlighted in the
		upper chart. You can compare its relative position among all data points in both graphs.

		You can also hold and drag to select a life expectancy interval in the upper graph. Only countries whose
		life expectancy at birth falls in this interval in the selected year will be shown in the lower chart. If you
		already have a selection interval (shown with a gray background), you can hold and drag it to move it around;
		the lower chart will update as you shift the interval. Double-clicking
		on the upper chart will reset the selection interval.


		### References
		[1]
		Austin B. Frakt (2018) - "How the Economy Affects Health". JAMA. 319(12):1187–1188. doi:10.1001/jama.2018.1739
	''')
Exemple #15
0
def criar_scatterplot(x, y, color, df):
    scatter = alt.Chart(df, width=800, height=400).mark_circle().encode(
        alt.X(x), alt.Y(y), color=color, tooltip=[x, y]).interactive()
    return scatter
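
# A hypothetical usage sketch: the DataFrame and column names below are made
# up for illustration (any numeric x/y columns plus a categorical color
# column would work the same way).
import altair as alt  # used inside criar_scatterplot
import pandas as pd

exemplo_df = pd.DataFrame({
    "altura": [1.60, 1.72, 1.81, 1.65],
    "peso": [55, 70, 84, 60],
    "sexo": ["F", "M", "M", "F"],
})
criar_scatterplot("altura", "peso", "sexo", exemplo_df)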
Exemple #16
0
    x='company_size:O', y='campaign_spend:Q',
    color=alt.Color('company_size')).interactive()
st.altair_chart(c, use_container_width=True)

st.markdown("""
The data is consistent with the intuition that company size tracks budget and spend,
although there are quite a few instances where spend is quite high for smaller companies """
            )

st.text("""Lets take a look at some of the mean campaign spend and budget
** Greater than 95 Percentile data removed, to reflect a better picture**""")
bars = alt.Chart(df[df['campaign_spend'] < a]).mark_bar(
    cornerRadiusTopLeft=3,
    cornerRadiusTopRight=3).encode(x=alt.X('mean(campaign_spend):Q',
                                           stack='zero'),
                                   y=alt.Y('company_size:N'),
                                   color=alt.Color('treatment')).interactive()

text = alt.Chart(df[df['campaign_budget'] < a]).mark_text(
    dx=-50, dy=3, color='white').encode(x=alt.X('mean(campaign_spend):Q',
                                                stack='zero'),
                                        y=alt.Y('company_size:N'),
                                        detail='treatment:N',
                                        text=alt.Text('mean(campaign_spend):Q',
                                                      format='.1f'))

st.altair_chart(bars + text, use_container_width=True)

bars = alt.Chart(df[df['campaign_spend'] < b]).mark_bar(
    cornerRadiusTopLeft=3,
    cornerRadiusTopRight=3).encode(x=alt.X('mean(campaign_budget):Q',
Exemple #17
0
"""
Cumulative Wikipedia Donations
==============================

This chart shows cumulative donations to Wikipedia over the past 10 years. Inspired by this [Reddit post](https://www.reddit.com/r/dataisbeautiful/comments/7guwd0/cumulative_wikimedia_donations_over_the_past_10/) but using lines instead of areas.
"""
# category: line charts
import altair as alt

data = "https://frdata.wikimedia.org/donationdata-vs-day.csv"

alt.Chart(data).mark_line().encode(
    alt.X('date:T',
          timeUnit='monthdate',
          axis=alt.Axis(format='%B', title='Month')),
    alt.Y('max(ytdsum):Q',
          stack=None,
          axis=alt.Axis(title='Cumulative Donations')),
    alt.Color('date:O', timeUnit='year', legend=alt.Legend(title='Year')),
    alt.Order('date:O', timeUnit='year'))
Exemple #18
0
def hist(data, variable):
    """
    Creates an Altair histogram indicating the position of the mean and median.

    Parameters
    ----------
    data : dataframe
        A pandas dataframe.
    variable : string
        A name of the variable inside data used to make the histogram.

    Returns
    --------
    altair plot
        Produces an Altair histogram with vertical
        lines marking the mean and median.

    Examples
    --------
    >>> from altairexpress.hist import hist
    >>> from gapminder import gapminder
    >>> gapminder.head()
    >>> hist(gapminder, 'lifeExp')
    """

    # Check if data is dataframe
    assert isinstance(data, pd.DataFrame), "TypeError: Data must be a pandas "\
                                           "dataframe. "

    # Check if variable name is a string
    assert isinstance(variable, str), "TypeError: Variable must be supplied " \
                                      "as a string "

    # # Check that variable is continuous numeric data
    # assert pd.api.types.is_numeric_dtype(
    #     data[variable]), "Variable needs to be numeric. Your data must be " \
    #                      "have a continuous numeric data type. "

    # check that variable is contained in data
    # assert variable in data.columns, "NameError: Variable provided is not " \
    #                                  "contained in data "

    # extract the variable
    v = data[variable]

    # get the variable statistics
    mean = np.round(np.mean(v), 2)
    median = np.round(np.median(v), 2)
    std_dev = np.round(np.std(v), 2)

    # set the x-axis position for annotations
    annotation_x = np.max(v) * 0.9

    # make base histogram
    p1 = alt.Chart(data).mark_bar().encode(
        alt.X(variable, bin=True, title=variable),
        alt.Y('count()', title="Count")).properties(
            title="Distribution of {0}".format(variable))

    # Specify summary statistic annotations
    annot1 = (alt.Chart(data).mark_text(
        color='pink', text="Mean is {0:.2f}".format(mean)).encode(
            alt.X('xval:Q'), alt.Y('yval:Q')).transform_calculate(
                xval=str(annotation_x), yval=str(np.mean(np.arange(0, 500)))))

    annot2 = (alt.Chart(data).mark_text(
        color='red', text="Median is {0:.2f}".format(median)).encode(
            alt.X("xval:Q"), alt.Y("yval:Q")).transform_calculate(
                xval=str(annotation_x), yval=str(np.mean(np.arange(0, 450)))))

    annot3 = (alt.Chart(data).mark_text(
        color='blue', text="Standard Dev is {0:.2f}".format(std_dev)).encode(
            alt.X("xval:Q"), alt.Y("yval:Q")).transform_calculate(
                xval=str(annotation_x), yval=str(np.mean(np.arange(0, 400)))))

    # make vertical bars for mean and median
    mean_string = 'mean(' + variable + '):Q'
    median_string = 'median(' + variable + '):Q'

    mean_line = (alt.Chart(data).mark_rule(color='pink', size=5).encode(
        alt.X(mean_string, title=variable)))

    median_line = (alt.Chart(data).mark_rule(color='red', size=5).encode(
        alt.X(median_string, title=variable)))

    return p1 + annot1 + annot2 + annot3 + mean_line + median_line
Exemple #19
0
def main():
    st.text(
        'Este aplicativo utiliza as informações atualizadas contidas no site Covid19Brazil'
    )
    st.write('https://covid19-brazil-api.now.sh/')
    st.title('Covid pelo Brasil')

    data = st.selectbox('Escolha a data que deseja visualizar', datas)
    data_aux = data.split('/')
    data = ''.join(data_aux[::-1])
    dados_brasil = get_dados_brasil_por_data(data)

    df = pd.DataFrame(dados_brasil['data'])
    if len(df) == 0:
        st.text('Não há dados para essa data no Brasil')
    else:
        df = df.merge(coordenadas_df.T)

        bar = alt.Chart(df).mark_bar().encode(
            alt.X('state:O', title='Estados'),
            alt.Y('deaths:Q'),
            color=alt.condition(
                alt.datum.deaths > 2072, alt.value('red'),
                alt.value('black'))).properties(
                    title='Mortes por Estado e linha de média de morte')

        rule = alt.Chart(df).mark_rule(color='red').encode(
            alt.Y('mean(deaths):Q', title='Quantidade de Mortes'))

        text = bar.mark_text(
            align='center',
            color='black',
            baseline='bottom',
            dx=3  # Nudges text to right so it doesn't appear on top of the bar
        ).encode(text='deaths:Q')

        st.write((bar + rule + text).properties(width=600, height=400))

        st.subheader('Números do Brasil')
        total_casos = df['cases'].sum()
        total_suspeitas = df['suspects'].sum()
        total_mortes = df['deaths'].sum()

        html_temp = """
         <div style="
            display:flex;
            margin-bottom:20px">
            <div style="
                background-color:#025951;
                display:flex;
                justify-content:center;
                align-items:center;
                color:white;
                border-radius:5px;
                width:150px;
                height:150px;
                padding:10px;
                font-size:15px;
                margin-right:10px;"
            >
                Total de Casos Confirmados
                <br>
                %d
            </div>
            <div style="
                background-color:#048ABF;
                display:flex;
                justify-content:center;
                align-items:center;
                color:white;
                border-radius:5px;
                width:150px;
                height:150px;
                padding:10px;
                font-size:15px;
                margin-right:10px;";
            >
                Total de Suspeitas    
                %d
            </div>
            <div style="
                background-color:#F25116;
                display:flex;
                justify-content:center;
                align-items:center;
                color:white;
                border-radius:5px;
                width:150px;
                height:150px;
                padding:10px;
                font-size:15px;
                margin-right:10px;";
            >
                Total de Mortes
                %d
            </div>
         </div>
        """ % (total_casos, total_suspeitas, total_mortes)
        st.write(html_temp, unsafe_allow_html=True)

        st.subheader('Números dos Estados do Brasil')
        estados = list(df['state'])
        estado = st.selectbox('Escolha o estado que deseja visualizar',
                              estados)
        estado_casos = df[df['state'] == estado]['cases'].sum()
        estado_suspeita = df[df['state'] == estado]['suspects'].sum()
        estado_morte = df[df['state'] == estado]['deaths'].sum()

        html_temp = """
         <div style="
            display:flex;
            margin-bottom:20px">
            <div style="
                background-color:#025951;
                display:flex;
                justify-content:center;
                align-items:center;
                color:white;
                border-radius:5px;
                width:150px;
                height:150px;
                padding:10px;
                font-size:15px;
                margin-right:10px;"
            >
                Total de Casos Confirmados
                <br>
                %d
            </div>
            <div style="
                background-color:#048ABF;
                display:flex;
                justify-content:center;
                align-items:center;
                color:white;
                border-radius:5px;
                width:150px;
                height:150px;
                padding:10px;
                font-size:15px;
                margin-right:10px;";
            >
                Total de Suspeitas    
                %d
            </div>
            <div style="
                background-color:#F25116;
                display:flex;
                justify-content:center;
                align-items:center;
                color:white;
                border-radius:5px;
                width:150px;
                height:150px;
                padding:10px;
                font-size:15px;
                margin-right:10px;";
            >
                Total de Mortes
                %d
            </div>
         </div>
        """ % (estado_casos, estado_suspeita, estado_morte)
        st.write(html_temp, unsafe_allow_html=True)

    #sidebar
    dados_paises = get_dados_paises()
    dados_paises_df = pd.DataFrame(dados_paises['data'])
    paises = list(dados_paises_df['country'])

    st.sidebar.title('Números do Mundo')
    st.sidebar.text('Total de Casos Confirmados')
    st.sidebar.text(dados_paises_df['confirmed'].sum())
    st.sidebar.text('Total de Mortes')
    st.sidebar.text(dados_paises_df['deaths'].sum())
    st.sidebar.text('Total de Recuperações')
    st.sidebar.text(dados_paises_df['recovered'].sum())

    st.sidebar.title('Covid pelos Países')
    pais = st.sidebar.selectbox('Escolha o País', paises)

    st.sidebar.text('Total de Casos Confirmados')
    st.sidebar.text(
        dados_paises_df[dados_paises_df['country'] == pais]['confirmed'].sum())

    st.sidebar.text('Total de Mortes')
    st.sidebar.text(
        dados_paises_df[dados_paises_df['country'] == pais]['deaths'].sum())
    st.sidebar.text('Total de Recuperações')
    st.sidebar.text(
        dados_paises_df[dados_paises_df['country'] == pais]['recovered'].sum())

    html = """
    <style>
        .reportview-container {
        flex-direction: row-reverse;
       
        }
        header > .toolbar {
            flex-direction: row-reverse;
            left: 1rem;
            right: auto;
           
        }
        .sidebar .sidebar-collapse-control,
        .sidebar.--collapsed .sidebar-collapse-control {
            left: auto;
            right: 0.5rem;
            
        }

        .sidebar .sidebar-content {
            transition: margin-right .3s, box-shadow .3s;
            
        }

        .sidebar.--collapsed .sidebar-content {
            margin-left: auto;
            margin-right: -21rem;
            
        }

        @media (max-width: 991.98px) {
        .sidebar .sidebar-content {
            margin-left: auto;
        }
        }
    </style>
    """
    st.markdown(html, unsafe_allow_html=True)
Exemple #20
0
                                   scale=alt.Scale(scheme="category20")),
                         alt.value("lightgray"),
                     ),
                     tooltip=[
                         alt.Tooltip("properties.name:N", title="County"),
                         alt.Tooltip("properties.region:N", title="Region"),
                     ],
                 ).add_selection(selector_map).properties(width=500,
                                                          height=800))

bar_chart = (alt.Chart(
    alt.Data(url=data_url,
             format=alt.DataFormat(
                 property="features", type="json"))).transform_fold(
                     ["properties." + c for c in columns],
                     as_=["metric", "value"]).mark_bar().encode(
                         x=alt.X("metric:N"),
                         y=alt.Y("value:Q", axis=alt.Axis(format="%")),
                         color="properties.region:N",
                     ).add_selection(selector_map).transform_filter(
                         selector_map).properties(width=200, height=200))

chart = CA_map | bar_chart

if __name__ == "__main__":

    docs_dir = Path(__file__).parent.parent.absolute()

    with open(docs_dir / "docs" / "_vega_out.html", "w") as f:
        f.write(chart.to_html())
Exemple #21
0
def visualize_cummulative_charts(df_filter):

	#Visualize is_tested distribution
	is_tested_is_tested = df_filter['is_tested'].value_counts().reset_index()
	is_tested_chart = alt.Chart(is_tested_is_tested).mark_bar().encode(
		x = alt.X('index',title='is_tested',sort=['Influencer', 'Facebook']),
		y = alt.Y('is_tested',title='Count'),
		tooltip = [alt.Tooltip('is_tested',title='count')],
		color = alt.value('darkorange')
		).properties(
	    title='is_tested Distribution'
		)
	st.altair_chart(is_tested_chart.properties(height=400,width=600))


	#Visualize respondent's wage
	item_price_is_tested = df_filter['wage'].value_counts().reset_index()
	item_price_chart = alt.Chart(item_price_is_tested).mark_bar().encode(
		x = alt.X('index',title="Respondent's Wage",sort=['Less than 10 million','10 - 25 million','25 - 50 million','50 - 100 million','100 - 250 million','250 - 500 million','500+ million']),
		y = alt.Y('wage',title='Count'),
		tooltip = [alt.Tooltip('wage',title='count')],
		color = alt.value('darkorange')
		).properties(
	    title="Respondent's Wage Distribution"
		)
	st.altair_chart(item_price_chart.properties(height=400,width=600))


	#Visualize Age
	age_is_tested = df_filter['age'].value_counts().reset_index()
	age_chart = alt.Chart(age_is_tested).mark_bar().encode(
		x = alt.X('index',title="Respondent's Age",sort=['18-30', '30-50','50+']),
		y = alt.Y('age',title='Count'),
		tooltip = [alt.Tooltip('age',title='count')],
		color = alt.value('darkorange')
		).properties(
	    title="Respondent's Age Distribution"
		)
	st.altair_chart(age_chart.properties(height=400,width=600))


	#Visualize Gender
	gender_is_tested = df_filter['gender'].value_counts().reset_index()
	gender_chart = alt.Chart(gender_is_tested).mark_bar().encode(
		x = alt.X('index',title="Respondent's Gender",sort=['Female', 'Male']),
		y = alt.Y('gender',title='Count'),
		tooltip = [alt.Tooltip('gender',title='count')],
		color = alt.condition(
		alt.datum.index == 'Male',  
		alt.value('darkblue'),    
		alt.value('pink')   
		)
		).properties(
	    title="Respondent's Gender Distribution"
		)
	st.altair_chart(gender_chart.properties(height=400,width=600))


	#Visualize distribution of region
	region_is_tested = df_filter['region'].value_counts().reset_index()
	region_chart = alt.Chart(region_is_tested).mark_bar().encode(
		x = alt.X('index',title='Region'),
		y = alt.Y('region',title='Count'),
		tooltip = [alt.Tooltip('region',title='count')],
		color = alt.value('darkorange')
		).properties(
	    title='Region Distribution'
		)
	st.altair_chart(region_chart.properties(height=400,width=600))
Exemple #22
0
    include_special_char=False,
    include_numeric=False,
    include_upper_case=False,
)

clf = RasaClassifier(pathlib.Path(model_folder) / model_file)

augs = aug.augment(text_input, n=n_generate)

data = reduce(lambda a, b: a + b,
              [clf.fetch_info_from_message(a)["intent_ranking"] for a in augs])
source = pd.DataFrame(data)[["name",
                             "confidence"]].rename(columns={"name": "intent"})

error_bars = (alt.Chart(source).mark_errorbar(extent="stdev").encode(
    x=alt.X("confidence:Q", scale=alt.Scale(zero=False)), y=alt.Y("intent:N")))

points = (alt.Chart(source).mark_point(filled=True, color="black").encode(
    x=alt.X("confidence:Q", aggregate="mean"),
    y=alt.Y("intent:N"),
))

st.markdown("## Simple Line View")
st.altair_chart(error_bars + points, use_container_width=True)

hist = (alt.Chart(source).mark_area(opacity=0.3, interpolate="step").encode(
    alt.X("confidence:Q", bin=alt.Bin(maxbins=100)),
    alt.Y("count()", stack=None),
    alt.Color("intent:N"),
))
Exemple #23
0
##
import pandas as pd
import altair as alt
from json import loads

G = 1024 * 1024 * 1024
##
df = pd.read_json("results/report.1.json", lines=True)
df["DVM"] = df["mem_size"] / df["memory"]
df["MachineMem"] = df["memory"].apply(lambda value: f"{value / G:.1f}G")
df["DatasetMemSize"] = df["mem_size"].apply(lambda value: f"{value / G:.1f}G")
##
chartDVM = (alt.Chart(
    df[df["DVM"] < 10],
    title="plot_correlation(df) Comparison").mark_line(point=True).encode(
        y=alt.Y("elapsed", title="Elapsed (s)"),
        x=alt.X("DVM", title="Dataset Size / Memory Size"),
        color="name:N",
        tooltip=[
            alt.Tooltip("name:N"),
            alt.Tooltip("elapsed:Q", format=".0s"),
            alt.Tooltip("MachineMem"),
            alt.Tooltip("DatasetMemSize"),
        ],
        column=alt.Column("format", title="Data Format"),
        row=alt.Column("reader", title="Data Reader"),
    ))
chartDVM
##
# chartBar = (
#     alt.Chart(
Exemple #24
0
    baselines = {
        day: pd.read_csv(path / day, index_col=0)
        for day in days
    }

    day = st.selectbox(
        'Select baseline day:',
        sorted(list(baselines.keys()))
    )

    data = baselines[day]
    data = data.set_index('target_index')
    baseline = data.loc[:, ['pred', 'target']].reset_index().melt('target_index')
    chart = alt.Chart(baseline).mark_line().encode(
        alt.X('target_index:T'),
        alt.Y('value:Q'),
        color='variable:N'
    )
    st.altair_chart(chart.properties(width=600, height=400))

    st.header('Both Baseline Error Analysis')
    error_chart = []
    for name in ['yesterday', 'week']:
        errors = DATAHOME / 'final' / 'baselines' / name / 'errors.csv'
        errors = pd.read_csv(errors)
        errors = errors.query('statistic == "mean"').query('variable == "abs-errors"').sort_values('target_day')
        errors = errors.drop(['statistic', 'variable'], axis=1)
        errors.columns = [name, errors.columns[1]]
        errors = errors.set_index(errors.columns[1])
        error_chart.append(errors)
Exemple #25
0
def plot_iroas_over_time(iroas_df: pd.DataFrame,
                         experiment_dates: pd.DataFrame,
                         cooldown_date: pd.DataFrame):
    """Returns a chart of the iROAS estimate over time with confidence bands.

  This function provides a visualization of the evolution of the iROAS estimate
  over the duration of the experiment and cooldown, together with confidence
  bands.

  Args:
    iroas_df: a dataframe with columns: date, lower, mean, upper
    experiment_dates: dataframe with columns (date, color) which contains two
      dates for each period (start, end), and the column color is the label
      used in the chart to refer to the corresponding period, e.g. "Experiment
      period" or "Pretes period".
    cooldown_date: dataframe with column (date, color) with only one entry,
      where date indicates the last day in the cooldown period, and color is the
      label used in the plot legend, e.g. "End of cooldown period".

  Returns:
    iroas_chart: Chart containing the plot.
  """
    iroas_base = alt.Chart(iroas_df).mark_line().encode(
        x=alt.X('date:T', axis=alt.Axis(title='', format=('%b %e'))))

    iroas_selection = alt.selection_single(fields=['date'],
                                           nearest=True,
                                           on='mouseover',
                                           empty='none',
                                           clear='mouseout')

    iroas_lines = iroas_base.mark_line().encode(
        y=alt.Y('mean:Q', axis=alt.Axis(title=' ', format='.3')))

    iroas_points = iroas_lines.mark_point().transform_filter(iroas_selection)

    iroas_rule1 = iroas_base.mark_rule().encode(
        tooltip=['date:T', 'mean:Q', 'lower:Q', 'upper:Q'])

    iroas_rule = iroas_rule1.encode(
        opacity=alt.condition(iroas_selection, alt.value(0.3), alt.value(
            0))).add_selection(iroas_selection)

    iroas_ci_bands_rule = alt.Chart(iroas_df).mark_area(color='gray').encode(
        alt.X('date:T'), y='lower:Q', y2='upper:Q', opacity=alt.value(0.5))

    date_rule = alt.Chart(experiment_dates[
        experiment_dates['color'] == 'Experiment period']).mark_rule(
            strokeWidth=2).encode(x='date:T',
                                  color=alt.Color('color',
                                                  scale=alt.Scale(domain=[
                                                      'Experiment period',
                                                      'End of cooldown period',
                                                      'iROAS estimate'
                                                  ],
                                                                  range=[
                                                                      'black',
                                                                      'black',
                                                                      '#1f77b4'
                                                                  ])))
    cooldown_date_rule = alt.Chart(cooldown_date).mark_rule(
        strokeWidth=2, strokeDash=[5,
                                   2], color='black').encode(x='date:T',
                                                             color='color:N')
    # Compile chart
    iroas_chart = alt.layer(iroas_lines, iroas_rule, iroas_points, date_rule,
                            cooldown_date_rule, iroas_ci_bands_rule)

    return iroas_chart
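
# A hypothetical usage sketch with synthetic data matching the documented
# column layout (dates and values below are made up for illustration):
import altair as alt  # used inside plot_iroas_over_time
import pandas as pd

iroas_df_demo = pd.DataFrame({
    "date": pd.date_range("2021-01-01", periods=4, freq="7D"),
    "lower": [0.5, 0.8, 1.0, 1.1],
    "mean": [1.0, 1.3, 1.5, 1.6],
    "upper": [1.5, 1.8, 2.0, 2.1],
})
experiment_dates_demo = pd.DataFrame({
    "date": pd.to_datetime(["2021-01-01", "2021-01-22"]),
    "color": ["Experiment period", "Experiment period"],
})
cooldown_date_demo = pd.DataFrame({
    "date": pd.to_datetime(["2021-01-29"]),
    "color": ["End of cooldown period"],
})
demo_chart = plot_iroas_over_time(iroas_df_demo, experiment_dates_demo,
                                  cooldown_date_demo)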
Exemple #26
0
import pandas as pd
import altair as alt
import datapane as dp

# download data & group by manufacturer
df = pd.read_csv(
    'https://covid.ourworldindata.org/data/vaccinations/vaccinations-by-manufacturer.csv',
    parse_dates=['date'])
df = df.groupby(['vaccine',
                 'date'])['total_vaccinations'].sum().tail(1000).reset_index()

# plot vaccinations over time using Altair
plot = alt.Chart(df).mark_area(opacity=0.4, stroke='black').encode(
    x='date:T',
    y=alt.Y('total_vaccinations:Q'),
    color=alt.Color('vaccine:N', scale=alt.Scale(scheme='set1')),
    tooltip='vaccine:N').interactive().properties(width='container')

# tabulate total vaccinations by manufacturer
total_df = df[df["date"] == df["date"].max()].sort_values(
    "total_vaccinations", ascending=False).reset_index(drop=True)
total_styled = total_df.style.bar(subset=["total_vaccinations"],
                                  color='#5fba7d',
                                  vmax=total_df["total_vaccinations"].sum())

# embed into a Datapane Report
report = dp.Report(
    "## Vaccination Report",
    dp.Plot(plot, caption="Vaccinations by manufacturer over time"),
    dp.Table(total_styled,
Exemple #27
0
def boxplot(
    x=None, y=None, hue=None, data=None, size=None, aspect=1,
    orient=None, color=None, palette=None, saturation=.75, dodge=True
):
    xs, ys = "x", "y"
    if data is None:
        data = pd.DataFrame({"x": x})
        x = "x"
        if y:
            data["y"] = y
            y = "y"

    if x is None and y is None:
        # Make a box plot for each numeric column
        numeric_cols = [c for c in data if data[c].dtype in [np.float32, np.float64]]
        col = []
        val = []
        for c in numeric_cols:
            for v in data[c]:
                col.append(c)
                val.append(v)

        data = pd.DataFrame({"column": col, "value": val})
        x = "column"
        y = "value"
        if orient == "h":
            x, y = y, x

    if y:
        orient = infer_orient(data[x], data[y], orient)
    elif orient is None:
        orient = "h"

    if orient == "h":
        x, y = y, x
        xs, ys = ys, xs

    xf = hue if hue and dodge else x

    # Main bar
    encodings = {
        ys: alt.Y(field=y, aggregate="q1", type="quantitative", axis={"title": y}),
        "%s2" % ys: alt.Y(field=y, aggregate="q3", type="quantitative"),
        "color": alt.Color(field=x ,type="nominal", legend=None),
        xs: alt.X(field=xf, type="nominal")
    }
    if x is None:
        del encodings["color"]
        del encodings[xs]
    if hue:
        legend = None if dodge else alt.Undefined
        encodings["color"] = alt.Color(field=hue ,type="nominal", legend=legend)
    bar_layer = alt.Chart().mark_bar().encode(**encodings)

    # Min/max range line
    range_encodings = {
        ys: alt.Y(field=y, aggregate="min", type="quantitative"),
        "%s2" % ys: alt.Y(field=y, aggregate="max", type="quantitative"),
        xs: alt.X(field=xf, type="nominal")
    }
    if x is None:
        del range_encodings[xs]
    range_layer = alt.Chart().mark_rule().encode(**range_encodings)

    # Median line
    median_encodings = {
        ys: alt.Y(field=y, aggregate="median", type="quantitative"),
        xs: alt.X(field=xf, type="nominal")
    }
    if x is None:
        del median_encodings[xs]
    median_layer = alt.Chart().mark_tick(size=18, color="black").encode(**median_encodings)

    chart = alt.LayerChart(data=data, layer=[range_layer, bar_layer, median_layer])

    if hue and dodge:
        facet_dir = "column" if orient == "v" else "row"
        chart = chart.facet(**{facet_dir: "%s:N" % x})

    size_chart(chart, size, aspect)

    pal = vega_palette(palette, color, saturation)
    return chart.configure_range(category=pal)
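
# A hypothetical usage sketch. It assumes the module-level helpers referenced
# above (infer_orient, size_chart, vega_palette) are importable alongside
# boxplot; the DataFrame below is made up for illustration:
#
#   import pandas as pd
#   tips = pd.DataFrame({
#       "day": ["Thu", "Thu", "Fri", "Fri", "Sat", "Sat"],
#       "total_bill": [10.5, 14.0, 20.3, 8.9, 25.1, 30.0],
#   })
#   chart = boxplot(x="day", y="total_bill", data=tips)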
Exemple #28
0
def criar_barras(coluna_num, coluna_cat, df):
    bars = alt.Chart(df, width=600).mark_bar().encode(
        x=alt.X(coluna_num, stack='zero'),
        y=alt.Y(coluna_cat),
        tooltip=[coluna_cat, coluna_num]).interactive()
    return bars
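
# A hypothetical usage sketch (the DataFrame and column names are made up for
# illustration):
import altair as alt  # used inside criar_barras
import pandas as pd

vendas_df = pd.DataFrame({
    "categoria": ["A", "B", "C"],
    "total": [120, 80, 45],
})
criar_barras("total", "categoria", vendas_df)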
Exemple #29
0
            <p><i>Use the brush feature to focus on points in either scatterplot.</i></p>
            """
st.markdown(multiIntro, unsafe_allow_html=True)
### Visualization 1

domain = ['Default', 'No Default']
range_ = ['#800080', 'steelblue']

brush = alt.selection_interval()

bars = alt.Chart(sample).mark_bar().encode(
    y='TARGET:O', color='TARGET:O',
    x='count(TARGET)').transform_filter(brush).properties(width=700)

brush_scatter = alt.Chart(sample).mark_circle(opacity=0.5).encode(
    y=alt.Y('AMT_INCOME_TOTAL'),
    color=alt.condition(brush,
                        'TARGET:N',
                        alt.value('lightgray'),
                        scale=alt.Scale(domain=domain,
                                        range=range_))).properties(
                                            width=350,
                                            height=350).add_selection(brush)
st.write(bars & (brush_scatter.encode(x='AMT_CREDIT')
                 | brush_scatter.encode(x='AMT_ANNUITY')))

### Visualization 2

st.markdown("<p> Click on the legend to filter by target. </p>",
            unsafe_allow_html=True)
Exemple #30
0
def run_var_relationship_per_country():
    st.markdown('''
	## How is an economy indicator associated with a health indicator for a specific country?
	
	As argued in existing literature [1], "in the long term, growing economies are associated with longer and healthier lives," whereas
	"in the short term, that may not be the case—economic booms can boost mortality rates and busts can reduce them." Thus, it is particularly
	important and interesting to visualize the trends in economy and health of a country.
	

	In this section, we explore the relationship between any pair of a national economy indicator and a national health indicator over time. 
	
	## Let's look at the data
	
	Using the sidebar, you are free to choose 

	1. a specific country, 
	2. an economy indicator (one of Gini, GDP per capita, and unemployment rate),
	and finally 
	3. a health indicator (one of health expenditure as % of GDP, health expenditure per capita, and life expectancy).
	
	This way, you can easily visualize the interaction between your selected pair of indicators. Each plot will have two y-axes, one corresponding to each of the indicators.
	
	Our visualization is interactive. You can check out the specific value of an indicator in a specific year by moving your mouse over the desired point on a line.
	In addition, you can drag the graph to adjust the time range. Finally, you can use your touchpad to zoom in or out. Again, double-clicking will reset the plot.
	
	Please note that the original data has many missing entries. When there is no data for either indicator in the selected pair for the selected country, 
	we cannot generate a visualization for you, as indicated by "Data Not Available." Please try another pair of indicators.
	''')
    df, countries, econ_indicators, health_indicators = load_other_data()
    other_data_df = df.copy()
    other_data_df['Year'] = pd.to_datetime(other_data_df['Year'], format='%Y')
    st.sidebar.header("Adjust Parameters")

    country = st.sidebar.selectbox("Country", countries)
    country_df = other_data_df[other_data_df["Country Name"] == country]

    econ_indicator = st.sidebar.selectbox("Economy Indicator",
                                          econ_indicators,
                                          index=1)
    health_indicator = st.sidebar.selectbox("Health Indicator",
                                            health_indicators,
                                            index=2)
    bi_var_df = country_df[["Year", econ_indicator, health_indicator]]

    if bi_var_df.dropna().empty:
        st.write("Data Not Available")
    else:
        nearest1 = alt.selection(type='single',
                                 nearest=True,
                                 on='mouseover',
                                 fields=['Year'],
                                 empty='none')
        nearest2 = alt.selection(type='single',
                                 nearest=True,
                                 on='mouseover',
                                 fields=['x'],
                                 empty='none')
        base = alt.Chart(bi_var_df).encode(
            alt.X('Year:T', axis=alt.Axis(title='Year', format=("%Y"))))
        line1 = base.mark_line(color='#5276A7').encode(
            alt.Y(econ_indicator,
                  axis=alt.Axis(title=econ_indicator, titleColor='#5276A7')),
            tooltip=[alt.Tooltip(econ_indicator, title=econ_indicator)
                     ]).add_selection(nearest1).interactive()
        line2 = base.mark_line(color='#57A44C').encode(
            alt.Y(health_indicator,
                  axis=alt.Axis(title=health_indicator, titleColor='#57A44C')),
            tooltip=[alt.Tooltip(health_indicator, title=health_indicator)
                     ]).add_selection(nearest2).interactive()
        line_plot = alt.layer(line1, line2).resolve_scale(y='independent')
        st.altair_chart(line_plot, use_container_width=True)
    st.markdown('''
				### References
				[1]
				Austin B. Frakt (2018) - "How the Economy Affects Health". JAMA. 319(12):1187–1188. doi:10.1001/jama.2018.1739
			''')