if st.checkbox('Show Raw Data'):
    st.write(df_filtered)

st.write(
    "Let's look at how the sentiment of Trump's tweets changes over time, from 2009-05 to 2020-06"
)
st.write(
    "Sentiment is a value between -1 and 1, where -1 stands for 100% negative and 1 stands for 100% positive"
)
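
# The sentiment score itself is assumed to be precomputed upstream, e.g. with
# TextBlob's polarity (hypothetical; the actual scoring pipeline isn't shown):
#
#     from textblob import TextBlob
#     df_filtered['sentiment'] = df_filtered['text'].apply(
#         lambda t: TextBlob(t).sentiment.polarity)  # polarity lies in [-1, 1]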

brush = alt.selection_interval(encodings=['x'])
chart2 = alt.Chart(df_filtered).mark_bar().encode(
    x=alt.X('date:T'),
    y=alt.Y('sentiment:Q'),
    tooltip=['date:T', 'sentiment:Q']).properties(width=800, height=400)
chart2 = chart2.encode(color=alt.condition(
    brush, 'favorites:Q', alt.value('lightgray'))).add_selection(brush)

chart = alt.Chart(df_filtered).mark_circle().encode(
    x=alt.X('date:T'),
    y=alt.Y('sentiment:Q'),
    color=alt.condition(alt.datum.sentiment > .15, alt.value('red'),
                        alt.value('blue')),
    size=alt.Size('retweets:Q', scale=alt.Scale(range=[20, 200])),
    tooltip=['date:T', 'sentiment:Q', 'retweets:Q',
             'favorites:Q']).properties(width=800,
                                        height=400).transform_filter(brush)
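
# The `&` operator vertically concatenates the two charts, so the interval
# brush on the bar chart filters the scatter plot below it.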

st.write(chart2 & chart)
profile_df_list = []
for i, row in profile_and_dem_df.iterrows():
    profile_df = get_raster_values_for_line(row.geometry, row.filename)
    profile_df['area'] = row['area']
    profile_df['date'] = row['date']
    profile_df_list.append(profile_df)
profile_df = pd.concat(profile_df_list)

profile_df

alt.Chart(profile_df).mark_line().encode(
    x=alt.X('path_distance:Q', title='Pathwise Distance (m)'),
    y=alt.Y('raster_value:Q',
            title='Elevation (m)',
            scale=alt.Scale(zero=False)),
    color='date:N'
).properties(
    height=400,
    width=800,
    title={
        'text': ['Uncertainty analysis, Paradise Road, Mt. Rainier'],
        'subtitle': [
            'Comparing HSFM DEMs with and without error surface fitting correction'
        ]
    }).resolve_scale(x='independent', y='independent')

profile_df.date.unique()

# +
src = profile_df[profile_df['date'] == 'USGS LIDAR 2007/08']
Example 3
def analyse_top_x_snapshots(entity_type):
    assert entity_type in ["referrer", "path"]

    log.info("read 'top %s' snapshots (CSV docs)", entity_type)
    basename_suffix = f"_top_{entity_type}s_snapshot.csv"
    csvpaths = _glob_csvpaths(basename_suffix)
    snapshot_dfs = _get_snapshot_dfs(csvpaths, basename_suffix)

    # for df in snapshot_dfs:
    #     print(df)

    # Keep in mind: an entity_type is either a top 'referrer', or a top 'path'.
    # Find all entities seen across snapshots, by their name. For type referrer
    # a specific entity (referrer) name might be `github.com`.

    def _get_uens(snapshot_dfs):
        unique_entity_names = set()
        for df in snapshot_dfs:
            unique_entity_names.update(df[entity_type].values)

        return unique_entity_names

    unique_entity_names = _get_uens(snapshot_dfs)
    log.info("all %s entities seen: %s", entity_type, unique_entity_names)

    # Clarification: each snapshot dataframe corresponds to a single point in
    # time (the snapshot time) and contains information about multiple top
    # referrers/paths. Now, invert that structure: work towards individual
    # dataframes where each dataframe corresponds to a single referrer/path,
    # and contains information about multiple timestamps.

    # First, create a dataframe containing all information.
    dfa = pd.concat(snapshot_dfs)

    if len(dfa) == 0:
        log.info("leave early: no data for entity of type %s", entity_type)
        return

    # Build a dict: key is path/referrer name, and value is DF with
    # corresponding raw time series.
    entity_dfs = _build_entity_dfs(dfa, entity_type, unique_entity_names)
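
    # entity_dfs is expected to map e.g. "github.com" -> a DataFrame indexed
    # by snapshot time, with columns such as "views_unique" (used below).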

    # It's important to clarify what each data point in a per-referrer raw time
    # series means. Each data point has been returned by the GitHub traffic
    # API. Each sample (row in the df) can, I think, be read as the result of
    # a rolling-window analysis: cumulative values summed up over a period of
    # 14 days, noted at the _right edge_ of the rolling time window.

    # This needs further verification, but I think the boundaries of the time
    # window actually move with sub-day resolution, i.e. the same query
    # performed twice within the same day may yield different outcomes. If
    # that's true, the rolling-window analysis performed internally at GitHub
    # could be perfectly inverted, yielding per-referrer traffic statistics at
    # a sub-day time resolution. That of course will require predictable,
    # periodic sampling. Let's keep that in mind for now.
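
    # A minimal sketch of that inversion (an assumption, not verified against
    # the API): with exactly one sample per day, the 14-day rolling sum s[t]
    # satisfies s[t] - s[t-1] = v[t] - v[t-14], so daily values could be
    # recovered as
    #
    #     v[t] = s[t] - s[t-1] + v[t-14]
    #
    # given a bootstrap window of 14 known daily values.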

    # One interesting way to look at the data: find the top 5 referrers based
    # on unique views, and for the entire time range seen.

    max_vu_map = {}
    for ename, edf in entity_dfs.items():
        max_vu_map[ename] = edf["views_unique"].max()
    del ename

    # Sort dict so that the first item is the referrer/path with the highest
    # views_unique seen.
    sorted_dict = dict(
        sorted(max_vu_map.items(), key=lambda i: i[1], reverse=True))

    top_n = 10
    top_n_enames = list(sorted_dict.keys())[:top_n]

    # simulate a case where there are different timestamps across per-referrer
    # dfs: copy a 'row', and re-insert it with a different timestamp.
    # row = referrer_dfs["t.co"].take([-1])
    # print(row)
    # referrer_dfs["t.co"].loc["2020-12-30 12:25:08+00:00"] = row.iloc[0]
    # print(referrer_dfs["t.co"])

    df_top_vu = pd.DataFrame()
    for ename in top_n_enames:
        edf = entity_dfs[ename]
        # print(edf)
        df_top_vu[ename] = edf["views_unique"]
    # del ename

    log.info(
        "The top %s %ss based on unique views, for the entire time range seen:\n%s",
        top_n,
        entity_type,
        df_top_vu,
    )

    # For plotting with Altair, reshape the data using pd.melt() to combine the
    # multiple columns into one, where the referrer name is not a column label,
    # but a value in a column. Or we could use the transform_fold() technique
    # https://altair-viz.github.io/user_guide/data.html#converting-between-long-form-and-wide-form-pandas
    # with .transform_fold(top_n_rnames, as_=["referrer", "views_unique"])
    # Also copy index into a normal column via `reset_index()` for
    # https://altair-viz.github.io/user_guide/data.html#including-index-data
    df_melted = df_top_vu.melt(var_name=entity_type,
                               value_name="views_unique",
                               ignore_index=False).reset_index()
    # print(df_melted)

    # Normalize main metric to show a view count _per day_, and clarify in the
    # plot that this is a _mean_ value derived from the _last 14 days_.
    df_melted["views_unique_norm"] = df_melted["views_unique"] / 14.0
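    # Example: a raw data point of 42 unique views over a 14-day window
    # becomes 42 / 14.0 = 3.0 unique views per day.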

    # For paths, it's relevant to identify the common prefix (repo owner/name)

    # cmn_ename_prefix = os.path.commonprefix(list(unique_entity_names))
    # log.info("cmn_ename_prefix: %s", cmn_ename_prefix)

    # if entity_type == "path":
    #     log.info("remove common path prefix")
    #     df_melted["path"] = df_melted["path"].str.slice(start=len(cmn_ename_prefix))
    #     # The root path (e.g., `owner/repo`) is not an empty string. That's
    #     # not so cool, make the root be represented by a single slash.
    #     # df_melted[df_melted["path"] == ""]["path"] = "/"
    #     df_melted["path"].replace("", "/", inplace=True)

    panel_props = {"height": 300, "width": "container", "padding": 10}
    chart = (
        alt.Chart(df_melted).mark_line(point=True)
        # .encode(x="time:T", y="views_unique:Q", color="referrer:N")
        # the pandas dataframe datetimeindex contains timing information at
        # much higher resolution than 1 day. The resulting vega spec may
        # then see time values like this: `"time": "2021-01-03T00:00:00+00:00"`
        # -- suggesting to vega that we care about showing hours and minutes.
        # instruct vega to only care about _days_ (dates), via an altair-based
        # time unit transformation. Ref:
        # https://altair-viz.github.io/user_guide/transform/timeunit.html
        .encode(
            alt.X("time",
                  type="temporal",
                  title="date",
                  timeUnit="yearmonthdate"),
            alt.Y(
                "views_unique_norm",
                type="quantitative",
                title="unique visitors per day (mean from last 14 days)",
                scale=alt.Scale(
                    domain=(0, df_melted["views_unique_norm"].max() * 1.1),
                    zero=True,
                ),
            ),
            alt.Color(
                entity_type,
                type="nominal",
                sort=alt.SortField("order"),
            ),
        ).configure_point(size=50).properties(**panel_props))

    chart_spec = chart.to_json(indent=None)

    # From
    # https://altair-viz.github.io/user_guide/customization.html
    # "Note that this will only scale with the container if its parent element
    # has a size determined outside the chart itself; For example, the
        # container may be a <div> element that has style width: 100%; height:
        # 300px."
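    # For the `full-width-chart` containers emitted below, the report's CSS
    # is assumed to provide something along those lines, e.g.:
    #
    #     .full-width-chart { width: 100%; height: 300px; }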

    heading = "Top referrers" if entity_type == "referrer" else "Top paths"

    # Textual form: larger N, and no cutoff (arbitrarily long name lists and
    # the plot legend don't go well with each other).
    top_n = 15
    top_n_enames = list(sorted_dict.keys())[:top_n]
    top_n_enames_string_for_md = ", ".join(
        f"{str(i).zfill(2)}: `{n}`" for i, n in enumerate(top_n_enames, 1))

    MD_REPORT.write(
        textwrap.dedent(f"""


    #### {heading}


    <div id="chart_{entity_type}s_top_n_alltime" class="full-width-chart"></div>

    Top {top_n} {entity_type}s: {top_n_enames_string_for_md}


    """))
    JS_FOOTER_LINES.append(
        f"vegaEmbed('#chart_{entity_type}s_top_n_alltime', {chart_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);"
    )
Example 4
def gen_sample_plot(metadata):
    """Uses Altair to generate a JSON Vega-Lite spec for the sample plot.

    Parameters
    ----------

    metadata: pd.DataFrame
        DataFrame containing sample metadata information. (Indices correspond
        to samples, and columns correspond to sample metadata fields.)

        This should have already been matched with the BIOM table, had empty
        samples removed, etc.

    Returns
    -------

    sample_chart_json: dict
        A dict version of the alt.Chart for the sample plot.
    """
    sample_metadata = metadata.copy()

    # Used to set color
    default_metadata_col = sample_metadata.columns[0]

    # Since we don't bother setting a default log-ratio, we set the balance for
    # every sample to None so that Vega* will filter them out (producing an
    # empty scatterplot by default, which makes sense).
    sample_metadata["qurro_balance"] = None

    # "Reset the index" -- make the sample IDs a column (on the leftmost side)
    # First we rename the index "Sample ID", though. (Note that our use of
    # check_column_names() means that there shouldn't be any sample metadata
    # fields named "Sample ID".)
    sample_metadata.rename_axis("Sample ID", axis="index", inplace=True)
    sample_metadata.reset_index(inplace=True)

    # Create sample plot chart Vega-Lite spec using Altair.
    sample_chart = (alt.Chart(
        sample_metadata,
        title="Samples",
        background="#FFFFFF",
        autosize=alt.AutoSizeParams(resize=True),
    ).mark_circle().encode(
        alt.X(
            default_metadata_col,
            type="nominal",
            axis=alt.Axis(labelAngle=-45),
            scale=alt.Scale(zero=False),
        ),
        alt.Y(
            "qurro_balance:Q",
            title="Current Natural Log-Ratio",
            type="quantitative",
            scale=alt.Scale(zero=False),
        ),
        color=alt.Color(default_metadata_col, type="nominal"),
        tooltip=["Sample ID:N", "qurro_balance:Q"],
    ).configure_range(
        ramp=alt.SchemeConfig(scheme="blues"),
        category=alt.SchemeConfig(scheme="tableau10"),
    ).configure_axis(labelBound=True).interactive())

    # Replace the "mark": "circle" definition with a more explicit one. This
    # will be useful when adding attributes to the boxplot mark in the
    # visualization. (We have to resort to this hack because I haven't been
    # able to successfully use alt.MarkDef in the alt.Chart definition above.)
    sample_chart_dict = sample_chart.to_dict()
    sample_chart_dict["mark"] = {"type": "circle"}

    sm_fields = "qurro_sample_metadata_fields"
    check_json_dataset_names(sample_chart_dict, sm_fields)
    # Specify an alphabetical ordering for the sample metadata field names.
    # This will be used for populating the x-axis / color field selectors in
    # Qurro's sample plot controls.
    #
    # Importantly, this is case insensitive (by default, the json.dumps
    # sort_keys parameter considers names like "Sample ID" to occur before
    # names like "age" due to casing -- we use this list to get around this).
    # Solution based on this article:
    # https://www.afternerd.com/blog/python-sort-list/#sort-strings-case-insensitive
    #
    # Also, we remove qurro_balance from this list because it shouldn't be
    # exposed to the user in the Qurro interface. (It's already used on the
    # y-axis of the sample plot automatically.)
    sorted_md_cols = list(sorted(sample_metadata.columns, key=str.lower))
    sorted_md_cols.remove("qurro_balance")
    sample_chart_dict["datasets"][sm_fields] = sorted_md_cols
    return sample_chart_dict
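
# A minimal usage sketch for gen_sample_plot (the `metadata` frame below is
# hypothetical; real callers pass sample metadata matched to a BIOM table):
#
#     metadata = pd.DataFrame({"age": [3, 5]}, index=["S1", "S2"])
#     spec = gen_sample_plot(metadata)  # dict, ready for json.dumps()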
def main():
    """ ETF Sparplan """
    ##General Settings
    st.set_page_config(page_title='ETF Sparplan Rechner')#, page_icon='logo.jpg')
    
    ## Hide Hamburger Menu
    hide_menu_style = """
        <style>
        #MainMenu {visibility: hidden;}
        </style>
        """
    st.markdown(hide_menu_style, unsafe_allow_html=True)

    st.success('ETF Sparplan Rechner')

###Input
    df_etf = pd.read_excel('ETF.xls', index_col='ETF')
    etf_name = st.selectbox('Wähle deinen ETF:', df_etf.index)  # don't shadow the builtin `list`
    entry = df_etf['RIC']
    entry_list = entry[etf_name]
    inf = df_etf['Branche/Region']

###Column definition
    col0_1, col0_2 = st.beta_columns(2)
    col1, col2 = st.beta_columns(2)
    col2_1, col2_2 = st.beta_columns([3,1])
    
    with col0_1:
        st.info(inf[etf_name])
    
    with col0_2:
        url = 'https://de.finance.yahoo.com/quote/' + entry_list + '?p=' + entry_list
        req = r.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')
        def parse_fund_volume(raw):
            # Yahoo reports net assets as e.g. '1.23M' or '4.56B'; turn that
            # into a readable EUR string.
            for suffix, label in (('M', ' Mio. EUR'), ('B', ' Mrd. EUR')):
                try:
                    return str(float(raw.replace(suffix, ''))) + label
                except ValueError:
                    continue
            return 'N/A'

        # The data-reactid values differ between page variants, so try the
        # known (net TER, net assets) id pairs in turn.
        for reactid_ter, reactid_vol in (('115', '85'), ('113', '83'),
                                         ('111', '81')):
            try:
                cont_Kostenquote = soup.body.div.find(
                    'span', {'data-reactid': reactid_ter}).text.replace(',', '.')
                cont_Nettoverm = soup.body.div.find(
                    'span', {'data-reactid': reactid_vol}).text.replace(',', '.')
            except AttributeError:
                continue
            vol = parse_fund_volume(cont_Nettoverm)
            st.success('Nettovermögen d. Fonds: ' + vol)
            st.error('Netto Kostenquote (TER) p.a.: ' + cont_Kostenquote)
            break

    with col1:
        entry_money = st.number_input('Wie viel willst du pro Monat einzahlen?', min_value=(25), max_value=(1500), value=(500))
        
        start = st.date_input('Anfangsdatum', dt.datetime(2010, 1, 1), min_value=dt.datetime(2010, 1, 1), max_value=dt.datetime(2019, 1, 1))
        end = dt.datetime.now()
 
    @st.cache
    def key_data(entry_list, start, end, entry_money):
        data = web.DataReader(entry_list, 'yahoo', start, end)
        ###Base data at the relevant points in time
        df = pd.DataFrame(data).reset_index()
        df = df[['Date', 'Close', 'Volume']]
        ##Extract year, month and day of Date
        df['year'] = df['Date'].dt.year
        df['month'] = df['Date'].dt.month
        df['day'] = df['Date'].dt.day
        ##Get max of each month
        df_time = pd.DataFrame(df.groupby(['year', 'month'])['day'].max()).reset_index()
        #Merge year, month and day of max
        df_time['Date'] = df_time['year'].astype(str) + '-' + df_time['month'].astype(str) + '-' + df_time['day'].astype(str)
        #convert to datetime
        df_time['Date'] = pd.to_datetime(df_time['Date'])
        #drop not needed columns
        df_time = df_time.drop(columns=['year', 'month', 'day'])
        ##merge original df and the needed timeseries df
        out = pd.merge(df, df_time, left_on='Date', right_on='Date').drop(columns=['year', 'month', 'day'])
        out['Close'] = round(out['Close'] ,2)

        ###Basis for the chart
        df_out = pd.DataFrame(out).reset_index()
        df_out['index'] += 1
        df_out['Stueckzahl kum.'] = round(entry_money / df_out['Close'], 2).cumsum()
        df_out['Wertentwicklung Sparplan in EUR'] = round(df_out['Stueckzahl kum.'] * df_out['Close'], 2)
        df_out['Investiert in EUR'] = entry_money * df_out['index']
        df_out['Performance in %'] = round((df_out['Wertentwicklung Sparplan in EUR'] / df_out['Investiert in EUR'] -1) * 100, 2)
        df_out['max Kurs'] = df_out['Close'].cummax()
        df_out['Differenz zu max Kurs'] = round(((df_out['Close'] - df_out['max Kurs']) / df_out['max Kurs']) * 100, 2)
        df_out = df_out.rename(columns={'Date': 'Datum', 'Close': 'Tageschlusskurs', 'Volume': 'Handelsvolumen'})
        return df_out
    
    df_out = key_data(entry_list, start, end, entry_money)

###Additional info on product and chart
    with col2:
        count = df_out.index.max()
        perf_pyear = round(df_out['Performance in %'][count] / df_out['index'].max() * 12, 2).astype(str)
        max_drawdown = df_out['Differenz zu max Kurs'].min().astype(str)
        st.success('Durchschnittlich Performance pro Jahr seit Anfangsdatum: ' + perf_pyear + '%')
        st.error('max. Drawdown seit Anfangsdatum: ' + max_drawdown + '%')

###Chart: historical
    with col2_1:
        option = st.selectbox('Wertentwicklung anzeigen als: Zukünftig oder historisch und grafisch oder als Tabelle?', ('Zukünftig mit Grafik', 'Zukünftig mit Tabelle', 'Historisch mit Grafik', 'Historisch mit Tabelle'))
        breit = 500
        hoch = 450

        if option == 'Historisch mit Grafik':
            chart_plan = alt.Chart(df_out).mark_trail(point=True, clip=True, opacity=0.8).encode(
                alt.X('Datum',
                    #scale=alt.Scale(domain=(df_hist['Datum'].astype(int).min() -1, df_hist['Datum'].astype(int).max() + 1)),
                    title='Datum'),
                alt.Y('Wertentwicklung Sparplan in EUR',
                    scale=alt.Scale(domain=(df_out['Wertentwicklung Sparplan in EUR'].min() -1, df_out['Wertentwicklung Sparplan in EUR'].max() + 1)),
                    title='Wertentwicklung Sparplan in EUR'),
                tooltip=['Datum', 'Wertentwicklung Sparplan in EUR', 'Performance in %', 'Investiert in EUR'],
                size=alt.Size('Wertentwicklung Sparplan in EUR', scale=alt.Scale(range=[1, 4, 10]), legend=None),
            ).interactive().properties(
                width=breit,
                height=hoch
            )

            chart_invest = alt.Chart(df_out).mark_trail(point=True, clip=True, color='yellow', opacity=0.8).encode(
                alt.X('Datum',
                    title='Datum'),
                alt.Y('Investiert in EUR',
                    title='Investiert in EUR'),
                tooltip=['Datum', 'Wertentwicklung Sparplan in EUR', 'Performance in %', 'Investiert in EUR'],
                size=alt.Size('Investiert in EUR', scale=alt.Scale(range=[1, 4, 10]), legend=None),
            ).interactive()

            chart = chart_plan + chart_invest
            st.altair_chart(chart)
        elif option == 'Zukünftig mit Grafik':
            Laufzeit = st.number_input('Wie viele Jahre planst du zu investieren?', min_value=5, max_value=50, value=10)

            df_fut = pd.DataFrame([], columns=['Sparbetrag', 'Wertentwicklung', 'Zinsertrag (brutto)'], index=range(Laufzeit*12)).reset_index().rename(columns={'index': 'Monat'})
            df_fut['Monat'] = df_fut.index + 1 
            df_fut['Sparbetrag'] = (entry_money * df_fut['Monat'])
            perf_year = round(df_out['Performance in %'][count] / df_out['index'].max() * 12, 2)
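            # np.fv computes the future value of an annuity of monthly
            # deposits at perf_year/12 percent (payments due at period start,
            # when=1). Note: np.fv was removed in NumPy 1.20; its successor is
            # numpy_financial.fv.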
            df_fut['Wertentwicklung'] = -1 * np.fv(perf_year/100/12, df_fut['Monat'], entry_money, 0, when=1)
            df_fut['Zinsertrag (brutto)'] = df_fut['Wertentwicklung'] - df_fut['Sparbetrag']
            df_fut['Wertentwicklung'] = round(df_fut['Wertentwicklung'] *1, 2)
            df_fut['Zinsertrag (brutto)'] = round(df_fut['Zinsertrag (brutto)'] *1, 2)
        
###Chart: future
            chart_fut = alt.Chart(df_fut).mark_trail(point=True, clip=True, opacity=0.8).encode(
                alt.X('Monat',
                    #scale=alt.Scale(domain=(df_hist['Datum'].astype(int).min() -1, df_hist['Datum'].astype(int).max() + 1)),
                    title='Monat'),
                alt.Y('Wertentwicklung',
                    scale=alt.Scale(domain=(df_fut['Wertentwicklung'].min() -1, df_fut['Wertentwicklung'].max() + 1)),
                    title='Wertentwicklung Sparplan in EUR'),
                tooltip=['Monat', 'Wertentwicklung', 'Sparbetrag', 'Zinsertrag (brutto)'],
                size=alt.Size('Wertentwicklung', scale=alt.Scale(range=[1, 4, 10]), legend=None),
            ).interactive().properties(
                width=breit,
                height=hoch
            )

            chart_spar = alt.Chart(df_fut).mark_trail(point=True, clip=True, color='yellow', opacity=0.8).encode(
                alt.X('Monat',
                    title='Monat'),
                alt.Y('Sparbetrag'),
                tooltip=['Monat', 'Wertentwicklung', 'Sparbetrag', 'Zinsertrag (brutto)'],
                size=alt.Size('Sparbetrag', scale=alt.Scale(range=[1, 4, 10]), legend=None),
            ).interactive().properties(
                width=breit,
                height=hoch
            )

            chart = chart_fut + chart_spar
            chart

###Table: historical
        elif option == 'Historisch mit Tabelle':
            df_out = df_out[['Datum', 'Investiert in EUR', 'Wertentwicklung Sparplan in EUR', 'Performance in %']].rename(columns={'Investiert in EUR': 'Sparbetrag in EUR', 'Wertentwicklung Sparplan in EUR': 'Wertentwicklung in EUR'})
            df_outhtml = df_out.to_html(escape=False, index=False)
            st.markdown(df_outhtml, unsafe_allow_html=True)
            

###Table: future
        else:
            Laufzeit = st.number_input('Wie viele Jahre planst du zu investieren?', min_value=5, max_value=50, value=10)

            df_fut = pd.DataFrame([], columns=['Sparbetrag in EUR', 'Wertentwicklung in EUR', 'Zinsertrag in EUR'], index=range(Laufzeit*12)).reset_index().rename(columns={'index': 'Monat'})
            df_fut['Monat'] = df_fut.index + 1 
            df_fut['Sparbetrag in EUR'] = (entry_money * df_fut['Monat'])
            perf_year = round(df_out['Performance in %'][count] / df_out['index'].max() * 12, 2)
            df_fut['Wertentwicklung in EUR'] = -1 * np.fv(perf_year/100/12, df_fut['Monat'], entry_money, 0, when=1)
            df_fut['Zinsertrag in EUR'] = df_fut['Wertentwicklung in EUR'] - df_fut['Sparbetrag in EUR']
            df_fut['Wertentwicklung in EUR'] = round(df_fut['Wertentwicklung in EUR'] *1, 2)
            df_fut['Zinsertrag in EUR'] = round(df_fut['Zinsertrag in EUR'] *1, 2)
            df_futhtml = df_fut.to_html(escape=False, index=False)
            st.markdown(df_futhtml, unsafe_allow_html=True)

###Advertising
    with col2_2:
        st.text_area('', 'Diesen Broker nutze ich - nur zu empfehlen:')
        url_neu = 'https://financeads.net/tc.php?t=19947C274449896B'    
        link = pd.DataFrame(['<a href="' +url_neu+ '" target="_blank"><img src="https://etf-blog.com/wp-content/uploads/2020/10/trade_republic_sparplan.png" width="145" ></a>'], columns=[''])
        html = link.to_html(escape=False, index=False)   
        st.markdown(html, unsafe_allow_html=True)

###Sector Info
    ##Create Expander
    my_expander = st.beta_expander("Weitere Infos: Sektorgewichtung", expanded=False)
    with my_expander:
        @st.cache
        def key_sector(entry_list):
            url_sec = 'https://de.finance.yahoo.com/quote/' + entry_list + '/holdings?p=' + entry_list
            req_sec = r.get(url_sec)
            dat_sec = BeautifulSoup(req_sec.content, 'html.parser')
            cont_sec = dat_sec.body('div', {'class': 'Mb(25px)'})
            df_sec = pd.DataFrame(cont_sec[1])
            sec = df_sec[0].astype(str).str.split('</span>').to_list()
            df_sec2 = pd.DataFrame(sec).dropna().transpose()
            sec2 = df_sec2[1].astype(str).str.split('">').to_list()
            df_sec3 = pd.DataFrame(sec2)
            sec_industry = df_sec3[4][1:].dropna().reset_index().drop(columns=['index'])
            sec_percent = df_sec3[1].str.replace(',', '.').str.replace('%', '').apply(pd.to_numeric, errors='coerce').dropna().reset_index().drop(columns=['index'])
            df_merge = pd.merge(sec_industry, sec_percent, left_index=True, right_index=True).rename(columns={4: 'Sektor', 1: 'Gewichtung in %'}).sort_values(by=['Gewichtung in %'], ascending=False).reset_index().drop(columns=['index'])
            return df_merge
        df_merge = key_sector(entry_list)

        table = pd.DataFrame(df_merge).style.set_precision(2)
        st.table(table)
base = (
    alt.Chart(selected_frame)
    .mark_point()
    .encode(
        opacity=alt.value(0.5),
        tooltip=["id", "correct", "clustering", "edit_distance", color_by],
    )
    .add_selection(brush, zoom)
)

if color_type == "Q":
    base = base.encode(
        color=alt.condition(
            brush,
            alt.Color(f"{color_by}:{color_type}", scale=alt.Scale(scheme="viridis"),),
            alt.ColorValue("gray"),
        ),
    )
else:
    base = base.encode(
        color=alt.condition(
            brush, alt.Color(f"{color_by}:{color_type}",), alt.ColorValue("gray"),
        ),
    )


st.header("Data visualization")
(
    base.encode(
        x=alt.X("pca_x", axis=alt.Axis(labels=False, title="")),
Example 7
            "type": "HexagonLayer",
            "data": data,
            "radius": 100,
            "elevationScale": 4,
            "elevationRange": [0, 1000],
            "pickable": True,
            "extruded": True,
        }
    ],
)

st.subheader("Breakdown by minute between %i:00 and %i:00" % (hour, (hour + 1) % 24))
filtered = data[
    (data[DATE_TIME].dt.hour >= hour) & (data[DATE_TIME].dt.hour < (hour + 1))
]
hist = np.histogram(filtered[DATE_TIME].dt.minute, bins=60, range=(0, 60))[0]
chart_data = pd.DataFrame({"minute": range(60), "pickups": hist})
st.write(alt.Chart(chart_data, height=150)
    .mark_area(
        interpolate='step-after',
        line=True
    ).encode(
        x=alt.X("minute:Q", scale=alt.Scale(nice=False)),
        y=alt.Y("pickups:Q"),
        tooltip=['minute', 'pickups']
    ))

if st.checkbox("Show raw data", False):
    st.subheader("Raw data by minute between %i:00 and %i:00" % (hour, (hour + 1) % 24))
    st.write(data)
# drop NaNs
deaths_long = deaths_long.dropna()
#deaths_long

# Selection tool
selection = alt.selection_single(fields=['state'])
# Color change when clicked
color = alt.condition(selection, alt.Color('state:N'), alt.value('lightgray'))

# Base altair plot
base = alt.Chart(
    deaths_long,
    title="Mortes confirmadas pelo COVID-19 por estado").mark_line(
        strokeWidth=4, opacity=0.7).encode(
            x=alt.X('Day'),
            y=alt.Y('Deaths', scale=alt.Scale(type='log')),
            color=alt.Color('state', legend=None),
        )

# Chart
chart = base.encode(color=alt.condition(selection, 'state:N',
                                        alt.value('lightgray')),
                    tooltip=[
                        alt.Tooltip('state:N', title='Estado'),
                        alt.Tooltip('yearmonthdate(Date):N',
                                    title='Data',
                                    format='%d/%m/%Y'),
                        alt.Tooltip('Deaths:N', title='Mortes')
                    ]).add_selection(selection)

# Overlay
Example 9
                                        left_on='state',
                                        right_on='state',
                                        how='left')
    source = source.replace(np.nan, 0)

    highlight = alt.selection_single(on='mouseover',
                                     fields=['state'],
                                     empty='none')
    states = alt.topo_feature(data.us_10m.url, 'states')
    ## map
    state_map = alt.Chart(
        states, title='States Heatmap').mark_geoshape().encode(
            color=alt.condition(
                highlight, alt.value('yellow'),
                alt.Color(f'{pollutant}:Q',
                          scale=alt.Scale(scheme='lightorange'))),
            tooltip=['state:N', f'{pollutant}:Q']).transform_lookup(
                lookup='id',
                from_=alt.LookupData(
                    source, 'id',
                    [pollutant, 'state'])).add_selection(highlight).project(
                        type='albersUsa').properties(width=700, height=400)
    state_map
elif graph == 'Pollutants Relationship':
    pollutant1 = st.sidebar.selectbox('Select the first Pollutant ',
                                      pollutant_list,
                                      index=2,
                                      key='pollutant1')
    pollutant2 = st.sidebar.selectbox('Select the second Pollutant ',
                                      pollutant_list,
                                      index=1,
Example 10
chart.save('Simple Area Chart6.html')


"""
Area chart (streamgraph)
"""
source = data.unemployment_across_industries.url

base = alt.Chart()

# When interactive() is applied, the chart can be panned in all directions, even into regions with no data
area = base.mark_area().encode(
    alt.X('yearmonth(date):T', axis=alt.Axis(format='%Y', domain=True, tickSize=0)),
    alt.Y('sum(count):Q', stack='center', axis=None),
    alt.Color('series:N',scale=alt.Scale(scheme='category20b'))
).interactive()

chart = alt.layer(area, data = source)
chart.save('Simple Area Chart7.html')


"""
Area chart, drawn split into separate panels
"""
source = data.iowa_electricity()

base = alt.Chart()

area = base.mark_area().encode(
    x="year:T",
Example 11
        .encode(x=alt.X('days_since_100:Q', axis=alt.Axis(title='Days since 100th confirmed case')),
                y=alt.Y('predictions:Q', 
                        axis=alt.Axis(title='Confirmed cases')),
                color=alt.Color('pred_idx:Q', legend=None, scale=None),)
        .transform_filter(selectCountry)
       ).properties(
    width=width,
    height=height
)  

predlog = (alt.Chart(predictionsDF_filtered)
        .mark_line(opacity=.15)
        .encode(x=alt.X('days_since_100:Q', axis=alt.Axis(title='Days since 100th confirmed case')),
                y=alt.Y('predictions:Q', 
                        axis=alt.Axis(title=None),
                        scale=alt.Scale(type='log', base=10)),
                color=alt.Color('pred_idx:Q', legend=None, scale=None),)
        .transform_filter(selectCountry)
        ).properties(
    width=width,
    height=height
)   

##### Mark The Last Case Count #####

# Point

last_point = (alt.Chart(lastpointDF)
              .mark_circle(color="black", size=40)
              .encode(x='days_since_100:Q',
                      y='confirmed:Q')
Example 12
    def plot_mds(
        self,
        rank=Rank.Auto,
        metric=BetaDiversityMetric.BrayCurtis,
        method=OrdinationMethod.Pcoa,
        title=None,
        xlabel=None,
        ylabel=None,
        color=None,
        size=None,
        tooltip=None,
        return_chart=False,
        label=None,
        mark_size=100,
        width=None,
        height=None,
    ):
        """Plot beta diversity distance matrix using multidimensional scaling (MDS).

        Parameters
        ----------
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.
        metric : {'braycurtis', 'cityblock', 'manhattan', 'jaccard', 'unifrac', 'unweighted_unifrac', 'aitchison'}, optional
            Function to use when calculating the distance between two samples.
            Note that 'cityblock' and 'manhattan' are equivalent metrics.
        method : {'pcoa', 'smacof'}
            Algorithm to use for ordination. PCoA uses eigenvalue decomposition and is not well
            suited to non-Euclidean distance functions. SMACOF is an iterative optimization strategy
            that can be used as an alternative.
        title : `string`, optional
            Text label at the top of the plot.
        xlabel : `string`, optional
            Text label along the horizontal axis.
        ylabel : `string`, optional
            Text label along the vertical axis.
        size : `string` or `tuple`, optional
            A string or a tuple containing strings representing metadata fields. The size of points
            in the resulting plot will change based on the metadata associated with each sample.
        color : `string` or `tuple`, optional
            A string or a tuple containing strings representing metadata fields. The color of points
            in the resulting plot will change based on the metadata associated with each sample.
        tooltip : `string` or `list`, optional
            A string or list containing strings representing metadata fields. When a point in the
            plot is hovered over, the value of the metadata associated with that sample will be
            displayed in a modal.
        label : `string` or `callable`, optional
            A metadata field (or function) used to label each analysis. If passing a function, a
            dict containing the metadata for each analysis is passed as the first and only
            positional argument. The callable function must return a string.

        Examples
        --------
        Scatter plot of weighted UniFrac distance between all our samples, using counts at the genus
        level.

        >>> plot_mds(rank='genus', metric='unifrac')
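
        Same idea, but coloring points by a metadata field (`sample_type` is
        just an illustrative field name):

        >>> plot_mds(metric='braycurtis', color='sample_type')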

        Notes
        -----
        **For `smacof`**: The values reported on the axis labels are Pearson's correlations between
        the distances between points on each axis alone, and the corresponding distances in the
        distance matrix calculated using the user-specified metric. These values are related to the
        effectiveness of the MDS algorithm in placing points on the scatter plot in such a way that
        they truly represent the calculated distances. They do not reflect how well the distance
        metric captures similarities between the underlying data (in this case, an OTU table).
        """
        import altair as alt
        import numpy as np
        import pandas as pd
        from scipy.spatial.distance import squareform
        from scipy.stats import pearsonr
        from skbio.stats import ordination
        from sklearn import manifold
        from sklearn.metrics.pairwise import euclidean_distances

        if len(self._results) < 3:
            raise PlottingException(
                "There are too few samples for MDS/PCoA after filtering. Please select 3 or more "
                "samples to plot.")

        dists = self._compute_distance(rank, metric).to_data_frame()

        # here we figure out what to put in the tooltips and get the appropriate data
        if tooltip:
            if not isinstance(tooltip, list):
                tooltip = [tooltip]
        else:
            tooltip = []

        tooltip.insert(0, "Label")

        if color and color not in tooltip:
            tooltip.insert(1, color)

        if size and size not in tooltip:
            tooltip.insert(2, size)

        magic_metadata, magic_fields = self._metadata_fetch(tooltip,
                                                            label=label)

        if method == OrdinationMethod.Smacof:
            # adapted from https://scikit-learn.org/stable/auto_examples/manifold/plot_mds.html
            x_field = "MDS1"
            y_field = "MDS2"

            seed = np.random.RandomState(seed=3)
            mds = manifold.MDS(max_iter=3000,
                               eps=1e-12,
                               random_state=seed,
                               dissimilarity="precomputed",
                               n_jobs=1)
            pos = mds.fit(dists).embedding_
            plot_data = pd.DataFrame(pos,
                                     columns=[x_field, y_field],
                                     index=dists.index)
            plot_data = plot_data.div(plot_data.abs().max(axis=0),
                                      axis=1)  # normalize to [0,1]

            # determine how much of the original distance is captured by each of the axes after MDS.
            # this implementation of MDS does not use eigen decomposition and so there's no simple
            # way of returning a 'percent of variance explained' value
            r_squared = []

            for axis in [0, 1]:
                mds_dist = pos.copy()
                mds_dist[::, axis] = 0
                mds_dist = squareform(euclidean_distances(mds_dist).round(6))
                r_squared.append(pearsonr(mds_dist, squareform(dists))[0])

            # label the axes
            x_extra_label = "r² = %.02f" % (r_squared[0], )
            y_extra_label = "r² = %.02f" % (r_squared[1], )
        elif method == OrdinationMethod.Pcoa:
            # suppress eigenvalue warning from skbio--not because it's an invalid warning, but
            # because lots of folks in the field run pcoa on these distances functions, even if
            # statistically inappropriate. perhaps this will change if we ever become more
            # opinionated about the analyses that we allow our users to do (roo)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                ord_result = ordination.pcoa(
                    dists.round(6))  # round to avoid float precision errors

            plot_data = ord_result.samples.iloc[:,
                                                [0, 1
                                                 ]]  # get first two components
            plot_data = plot_data.div(plot_data.abs().max(axis=0),
                                      axis=1)  # normalize to [0,1]
            plot_data.index = dists.index
            x_field, y_field = plot_data.columns.tolist(
            )  # name of first two components

            x_extra_label = "%0.02f%%" % (ord_result.proportion_explained[0] *
                                          100, )
            y_extra_label = "%0.02f%%" % (ord_result.proportion_explained[1] *
                                          100, )
        else:
            raise OneCodexException("MDS method must be one of: {}".format(
                ", ".join(OrdinationMethod.values)))

        # label the axes
        if xlabel is None:
            xlabel = "{} ({})".format(x_field, x_extra_label)
        if ylabel is None:
            ylabel = "{} ({})".format(y_field, y_extra_label)

        plot_data = pd.concat([plot_data, magic_metadata],
                              axis=1).reset_index()

        alt_kwargs = dict(
            x=alt.X(x_field, axis=alt.Axis(title=xlabel)),
            y=alt.Y(y_field, axis=alt.Axis(title=ylabel)),
            tooltip=[magic_fields[t] for t in tooltip],
            href="url:N",
            url=get_base_classification_url() + alt.datum.classification_id,
        )

        # only add these parameters if they are in use
        if color:
            color_kwargs = {
                "legend": alt.Legend(title=magic_fields[color]),
            }
            if not is_continuous(plot_data[color]) or has_missing_values(
                    plot_data[color]):
                plot_data[color] = plot_data[color].fillna("N/A").astype(str)
                domain = plot_data[color].values
                color_range = interleave_palette(domain)
                color_kwargs["scale"] = alt.Scale(domain=domain,
                                                  range=color_range)

            alt_kwargs["color"] = alt.Color(magic_fields[color],
                                            **color_kwargs)
        if size:
            alt_kwargs["size"] = magic_fields[size]

        chart = (alt.Chart(plot_data).transform_calculate(
            url=alt_kwargs.pop("url")).mark_circle(size=mark_size).encode(
                **alt_kwargs))

        chart = chart.properties(
            **prepare_props(title=title, height=height, width=width))

        if return_chart:
            return chart
        else:
            chart.interactive().display()
Example 13
def __draw_metric_line_titles(metrics, size_constants):
    """Draws left hand side titles for metrics."""

    metric_line_titles = []

    for metric in metrics:
        # METRIC TITLE
        metric_title = (alt.Chart(DUMMY_DF).transform_calculate(
            y_position="1.2").mark_text(
                align="center",
                baseline="middle",
                font=FONT,
                fontWeight=Title.font_weight,
                size=Title.font_size,
                color=Title.font_color,
            ).encode(
                alt.Y("y_position:Q",
                      scale=alt.Scale(domain=[3, 1]),
                      axis=no_axis()),
                text=alt.value(metric.upper()),
            ))

        # GROUPS TEXT
        group_circles_title = (alt.Chart(DUMMY_DF).transform_calculate(
            y_position="2").mark_text(
                align="center",
                baseline="middle",
                font=FONT,
                size=Subtitle.font_size,
                color=Subtitle.font_color,
            ).encode(
                alt.Y("y_position:Q",
                      scale=alt.Scale(domain=[3, 1]),
                      axis=no_axis()),
                text=alt.value("Groups"),
            ))

        # PERCENT. POP TEXT
        population_percentage_title = (alt.Chart(DUMMY_DF).transform_calculate(
            y_position="2.7").mark_text(
                align="center",
                baseline="middle",
                font=FONT,
                size=Subtitle.font_size,
                color=Subtitle.font_color,
            ).encode(
                alt.Y("y_position:Q",
                      scale=alt.Scale(domain=[3, 1]),
                      axis=no_axis()),
                text=alt.value("% Pop."),
            ))

        metric_line_titles.append(
            (metric_title + group_circles_title +
             population_percentage_title).properties(
                 height=size_constants["line_height"],
                 width=size_constants["metric_titles_width"],
             ))

    # EMPTY CORNER SPACE
    # To make sure that the attribute columns align properly with the title column, we need to create a blank
    # space of the same size of the attribute titles. For this purpose, we use the same function (__draw_attribute_title)
    # and pass in an empty string so that nothing is actually drawn.
    top_left_corner_space = __draw_attribute_title(
        "", size_constants["metric_titles_width"], size_constants)

    # CONCATENATE SUBPLOTS
    metric_titles = alt.vconcat(
        top_left_corner_space,
        *metric_line_titles,
        spacing=size_constants["line_spacing"],
        bounds="flush",
    )

    return metric_titles
def app():
    st.title("Smoking Deaths")
    st.header("Why is tobacco a deadly threat?")
    
    @st.cache(allow_output_mutation=True)
    def load_data():
        deaths = pd.read_csv('data/smoking-deaths-by-age.csv',
                            header=0,
                            names=[
                                'country',
                                'code',
                                'year',
                                '15 to 49',
                                '50 to 69',
                                'Above 70'])

        factors = pd.read_csv('data/number-of-deaths-by-risk-factor.csv',
                            header=0,
                            index_col=False,
                            names=[
                                'country',
                                'code',
                                'year',
                                'Diet low in vegetables',
                                'Diet low in whole grains',
                                'Diet low in nuts and seeds',
                                'Diet low in calcium',
                                'Unsafe sex',
                                'No access to handwashing facility',
                                'Child wasting',
                                'Child stunting',
                                'Diet high in red meat',
                                'Diet low in fiber',
                                'Diet low in seafood omega-3 fatty acids',
                                'Diet high in sodium',
                                'Low physical activity',
                                'Non-exclusive breastfeeding',
                                'Discontinued breastfeeding',
                                'Iron deficiency',
                                'Vitamin A deficiency',
                                'Zinc deficiency',
                                'Smoking',
                                'Secondhand smoke',
                                'Alcohol use',
                                'Drug use',
                                'High fasting plasma glucose',
                                'High total cholesterol',
                                'High systolic blood pressure',
                                'High body-mass index',
                                'Low bone mineral density',
                                'Diet low in fruits',
                                'Diet low in legumes',
                                'Low birth weight for gestation',
                                'Unsafe water source',
                                'Unsafe sanitation',
                                'Household air pollution from solid fuels',
                                'Air pollution',
                                'Outdoor air pollution'])

        # Drop columns with missing values and extremely low values
        factors.drop(columns=['Vitamin A deficiency', 'High total cholesterol', 'Zinc deficiency', 'Child stunting', 'Discontinued breastfeeding',
                                'Iron deficiency', 'Non-exclusive breastfeeding','Diet high in red meat', 'Unsafe sanitation', 
                                'No access to handwashing facility','Household air pollution from solid fuels', 'Unsafe water source', 'Child wasting',
                                'Low birth weight for gestation', 'Diet low in calcium', 'Low bone mineral density',], inplace=True)

        # Filter data with years
        factors = factors.drop(factors[factors.year > 2012].index)
        deaths = deaths.drop(deaths[deaths.year > 2012].index)

        # Convert data from wide to long
        deaths = pd.melt(deaths, id_vars=['country', 'year'], value_vars=['15 to 49', '50 to 69', 'Above 70'], var_name='Age')
        factors = pd.melt(factors, id_vars=['country', 'year'], value_vars=['Diet low in vegetables',
                                'Diet low in nuts and seeds',
                                'Unsafe sex',
                                'Diet low in fiber',
                                'Diet low in seafood omega-3 fatty acids',
                                'Diet high in sodium',
                                'Low physical activity',
                                'Smoking',
                                'Secondhand smoke',
                                'Alcohol use',
                                'Drug use',
                                'High fasting plasma glucose',
                                'High systolic blood pressure',
                                'High body-mass index',
                                'Diet low in fruits',
                                'Diet low in legumes',
                                'Air pollution',
                                'Outdoor air pollution'], var_name='risk_factor')

        countries = deaths['country'].unique() # get unique country names
        countries.sort() # sort alphabetically
        minyear = deaths.loc[:, 'year'].min()
        maxyear = deaths.loc[:, 'year'].max()
        return deaths, factors, countries, minyear, maxyear

    # Load data
    deaths, factors, countries, minyear, maxyear = load_data()

    # Country Selection
    selectCountry = st.selectbox('Select a country: ', countries, 77)

    # Year selection
    slider = st.slider('Select a period of time', int(str(minyear)), int(str(maxyear)), (1994, 2004))

    # Bar chart - Risk factors
    bar_factors = alt.Chart(factors, title="Ranking of the top 10 risk factors leading to deaths in "
                 + selectCountry + " from " + str(slider[0]) + " to " + str(slider[1])).mark_bar().transform_filter({'and': [{'field': 'country', 'equal': selectCountry},
                                                                                                                            {'field': 'year', 'range': slider}]}
    ).transform_aggregate(
        sum_deaths='sum(value)', # Calculate the total number of deaths
        groupby=["risk_factor"]
    ).transform_window(
        rank='rank(sum_deaths)',
        sort=[alt.SortField('sum_deaths', order='descending')]
    ).transform_filter(
        alt.datum.rank < 11 # keep only the top 10 factors
    ).encode(
        alt.X('sum_deaths:Q', title='Total number of deaths'),
        y=alt.Y('risk_factor:O',sort='-x', title='Risk factor'),
        tooltip=alt.Tooltip(["sum_deaths:Q"],format=",.0f",title="Deaths"),
        color=alt.condition(
          alt.datum['risk_factor'] == 'Smoking',
          alt.value("red"),  # Color for the smoking factor
          alt.value("lightgray")  # Color for the rest
        )
    ).properties(
        width=660,
        height=300
    )
    
    # Stacked bar chart - Smoking deaths by ages
    base = alt.Chart(deaths, title='Smoking deaths by age in ' + selectCountry).mark_bar().transform_filter({'and': [{'field': 'country', 'equal': selectCountry},
                                                                                                                    {'field': 'year', 'range': slider}]}
    ).encode(
        alt.X('year:O', title='Year'),
        y=alt.Y('value:Q', title='Number of smoking deaths'),
        order=alt.Order('Age:O', sort='ascending'),
        color=alt.Color('Age:O',
                        scale = alt.Scale(domain=['Above 70', '50 to 69', '15 to 49'], scheme='lightorange')), 
        tooltip=alt.Tooltip(["value:Q"],format=",.0f",title="Deaths"),
    ).properties(
        width=720,
        height=300
    )

    
    # Render the charts
    container1 = st.beta_container()
    with container1:
        st.altair_chart(base)

    st.markdown("From the chart above we can see that smoking is a critical factor leading to deaths, especially for older people. The numbers of people aged over 70 who died because of smoking are extremely high in all countries. \
                In the bar chart below, we can see how smoking ranks among the top 10 risk factors leading to deaths in the chosen country over the chosen period of time.")

    container2 = st.beta_container()
    with container2:
        st.altair_chart(bar_factors)
Example 15
-----------------
This example shows a ranged dot plot that uses 'layer' to convey changing life expectancy for the five most populous countries (between 1955 and 2000).
"""

import altair as alt
from vega_datasets import data

source = data.countries()


line = alt.Chart().mark_line(color='#db646f').encode(
    x = 'life_expect',
    y = 'country',
    detail = 'country'
    ).interactive()

point = alt.Chart().mark_point(size = 100, opacity = 1, filled = True).encode(
    x = 'life_expect',
    y = 'country',
    color=alt.Color('year:O',
        scale=alt.Scale(
            domain=['1955', '2000'],
            range=['#e6959c', '#911a24']
        )
    )
)

chart = alt.layer(line + point, data = source,
                    transform = [{'filter': {"field": 'country', "oneOf": ["China", "India", "United States", "Indonesia", "Brazil"]}},
                                  {'filter': {"field": 'year', "oneOf": [1955, 2000]}}])
Example 16
c1 = alt.Chart(df_inf).properties(width=150, height=300).mark_bar().encode(
    x=alt.X("Max Infection Rate:Q", title="Max Infection Rate"),
    y=alt.Y("countryy:N", title="Countries", sort=None),
    color=alt.Color('countryy:N', title="Country"),
    tooltip=[
        alt.Tooltip('countryy:N', title='Country'),
        alt.Tooltip('Max Infection Rate:Q', title='Max Infection Rate'),
        alt.Tooltip('inhabitants:Q', title='Inhabitants [mio]')
    ])
st.altair_chart(c1, use_container_width=True)
selection = st.selectbox("Which country to look at:", country)
df = df[df["Country"] == selection]
variables = ["Confirmed", "Recovered", "Deaths"]
colors = ["steelblue", "orange", "black"]
value_vars = variables
SCALE = alt.Scale(domain=variables, range=colors)
dfm = pd.melt(df.reset_index(), id_vars=["Date"], value_vars=value_vars)
dfm['order'] = dfm['variable'].replace(
    {val: i
     for i, val in enumerate(variables[::-1])})
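# 'order' ranks each category (Deaths=0, Recovered=1, Confirmed=2) so the
# stacked bars are always drawn in a fixed stacking order.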
if s2 == "cases":
    c = alt.Chart(dfm.reset_index()).mark_bar().properties(
        height=400,
        width=350).encode(x=alt.X("Date:T", title="Date"),
                          y=alt.Y("sum(value):Q",
                                  title="Cases",
                                  scale=alt.Scale(type='linear')),
                          color=alt.Color('variable:N',
                                          title="Category",
                                          scale=SCALE),
                          order='order')
Esempio n. 17
0
    def plotAltairLineChart(self):
        """ Returns an Altair line chart.

            Returns
            -------
                An Altair line chart.
        """

        df = self.getData(self.confirmed_deaths)

        iso_date: str = self.covid19_date.strftime('%Y-%m-%d')

        # Altair requires dataframe to be in "long format"
        df_countries = (df.drop(columns=['Province/State', 'Lat', 'Long'])
                          .groupby('Country/Region').agg('sum')
                          .sort_values(by=df.columns[-1], ascending=False)
                          .transpose()
                          .reset_index()
                          .melt(id_vars='index', value_name='Qty')
                          .rename(columns={'index': 'Date',
                                   'Country/Region': 'Country_Region'
                                  }
                          )
                          .set_index('Date')
                       )

        # Make index values actual datetime objects so that we can
        # leverage Panda's date filtering API
        df_countries.index = [datetime.strptime(day, '%m/%d/%y')
                              for day in df_countries.index
                             ]
    
        alt_chart = alt.Chart(df_countries[: iso_date]
                              .query("Country_Region in(@self.country)")
                              .reset_index()
                              .rename(columns={'index': 'Date'})
                             ).mark_line().encode(
                              x=alt.X(title='Date', field='Date', type='temporal'),
                              y=alt.Y(title='# of ' + self.confirmed_deaths, field='Qty',
                               type='quantitative', scale=alt.Scale(type=self.ylog)
                              ),
                              color=alt.Color(field='Country_Region', type='nominal',
                               legend=alt.Legend(title="Country/Region")
                              ),
                              tooltip=[alt.Tooltip(field='Country_Region', type='nominal'),
                                       alt.Tooltip(field='Qty', type='quantitative'),
                                       alt.Tooltip(field='Date', type='temporal')
                                      ]
                            )
        # To create filled circles in the legend per
        # https://github.com/altair-viz/altair/issues/1206
        points = alt.Chart(df_countries[: iso_date]
                  .query("Country_Region in(@self.country)")
                  .reset_index()
                  .rename(columns={'index': 'Date'})
                 ).mark_circle(size=0).encode(
                                       color='Country_Region'
                                      )

        # To add hover tips, but make it less sensitive per
        # https://github.com/altair-viz/altair/issues/1812
        tooltips = alt_chart.mark_point(size=100, opacity=0,
                    tooltip=alt.TooltipContent("data")
                   )

        alt_chart = alt_chart + points + tooltips

        return alt_chart.properties(
                title='COVID-19 ' + self.confirmed_deaths,
                width='container',
                height=400
               )
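
# Hypothetical usage sketch (names are illustrative): with `dashboard` an
# instance of the enclosing class and its attributes `confirmed_deaths`,
# `covid19_date`, `country`, and `ylog` already set (all referenced in the
# method body above), the chart renders in Streamlit via:
#
#     st.altair_chart(dashboard.plotAltairLineChart(), use_container_width=True)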
Esempio n. 18
0
        1370, 1468, 1566, 1664, 1762, 1860, 1958
    ]
}

# In[ ]:

scores_df = pd.DataFrame(scores_dict)
scores_df = scores_df.melt(id_vars=['n_features'],
                           value_vars=['mutual_info_classif', 'f_classif'],
                           var_name='metric',
                           value_name='mae')
max_value = scores_df['mae'].max() * 1.05
min_value = scores_df['mae'].min() * 0.95
artgor_utils.render(
    alt.Chart(scores_df).mark_line().encode(
        y=alt.Y('mae:Q', scale=alt.Scale(domain=(min_value, max_value))),
        x='n_features:O',
        color='metric:N',
        tooltip=['metric:N', 'n_features:O', 'mae:Q']).properties(
            title='Top N features by SelectPercentile vs CV').interactive())

# ### SelectKBest
#
# **Important notice**: I ran the cell below in `version 14` and printed the resulting `scores_dict`. In the following versions I'll reuse that saved `scores_dict` and plot the results instead of re-running feature selection each time

# In[ ]:

# %%time
# scores_dict = {'f_classif': [], 'mutual_info_classif': [], 'n_features': []}
# for i in np.arange(10, 1958, 100):
#     print(i)
                                        default=regions)
st.sidebar.info(
    "Merci à tous contributeurs du projet [opencovid19-fr](https://github.com/opencovid19-fr/data) pour leur travail de collecte des données officielles sur la progression de l'épidémie en France."
)

# get df_covid19_region based on region in multiselection
df_covid19_region = df_covid19_region[df_covid19_region["maille_nom"].isin(
    multiselection)].sort_values(by=["maille_nom", "date"],
                                 ascending=[True, False])

if check_box_table:
    st.write(df_covid19_region)

if option == "graph":
    if st.checkbox("Log Scale"):
        scale = alt.Scale(type="log", domain=[10, 5000], clamp=True)
    else:
        scale = alt.Scale(type="linear")

    if check_box_analyse:
        st.info(
            "[03/22] Les régions Grand-Est, Ile-de-France et Haut-de-France sont les plus touchées par l'épidémie. "
            "Par ailleurs l'affiche en échelle Log, nous montre que l'ensemble des régions suivent la même croissance en terme d'évolution"
        )
    # make plot on nb of deces by regions
    c_deces = (alt.Chart(df_covid19_region).mark_line(point=True).encode(
        alt.X("days_after_5_deaths"),
        alt.Y("deces", scale=scale),
        alt.Color("maille_nom"),
        tooltip=["days_after_5_deaths", "deces", "maille_nom"],
    ).interactive())
# year slider for Maps
slider = alt.binding_range(min=source['year'].min(),
                           max=source['year'].max(),
                           step=1)
select_year = alt.selection_single(name="year",
                                   fields=['year'],
                                   bind=slider,
                                   init={'year': source['year'].min()})
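
# (The map layer that consumes this selection is not part of this snippet;
#  the usual wiring would be along the lines of
#  `map_chart.add_selection(select_year).transform_filter(select_year)`.)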

base = alt.Chart(source)

### trend line plot

plot = base.mark_point(filled=True).encode(alt.X('year:O',
                                                 scale=alt.Scale(zero=False)),
                                           alt.Y(
                                               'average(Female, Income: Q1):Q',
                                               scale=alt.Scale(zero=False)),
                                           color=alt.value("salmon"))

plot += base.mark_point(filled=True).encode(
    alt.X('year:O', scale=alt.Scale(zero=False)),
    alt.Y('average(Female, Income: Q2):Q', scale=alt.Scale(zero=False)),
    color=alt.value("salmon"),
    shape=alt.value("cross"))

plot += base.mark_point(filled=True).encode(
    alt.X('year:O', scale=alt.Scale(zero=False)),
    alt.Y('average(Female, Income: Q3):Q', scale=alt.Scale(zero=False)),
    color=alt.value("salmon"),
Esempio n. 21
0
def gen_rank_plot(V, rank_type, ranking_ids, feature_metadata_cols, table_sdf):
    """Uses Altair to generate a JSON Vega-Lite spec for the rank plot.

    Parameters
    ----------

    V: pd.DataFrame
        DataFrame containing feature rank (and feature metadata, if applicable)
        information. (Indices correspond to features, and columns correspond
        to feature ranking or feature metadata fields.)

        This should have already been matched with the BIOM table, filtered (if
        -x passed), had empty features removed, etc.

    rank_type: str
        Human-readable name for a given ranking column that will be used as the
        prefix for each y-axis label in the rank plot. (This should be either
        "Differential" or "Feature Loading".)

    ranking_ids: pd.Index
        IDs of the actual "feature ranking" columns in V.

    feature_metadata_cols: pd.Index or list
        IDs of the "feature metadata" columns in V (if there wasn't any
        feature metadata provided, this can just be an empty list).

    table_sdf: pd.SparseDataFrame
        A representation of the input BIOM table containing count data. This
        is used to calculate qurro_spc (the number of samples a feature is
        present in) for each feature in V. This should ONLY contain samples
        that will be used in the Qurro visualization -- the presence of extra
        samples will mess up _df_utils.add_sample_presence_count().

    Returns
    -------

    rank_chart_json: dict
        A dict version of the alt.Chart for the rank plot, with
        qurro_rank_ordering and qurro_feature_metadata_ordering datasets
        added in indicating which columns describe feature rankings and
        which describe feature metadata. (Also has a qurro_rank_type "dataset"
        (really just a string) that points to the specified rank_type.)
    """

    rank_data = V.copy()

    # NOTE that until this point we've treated the actual rank values as just
    # "objects", as far as pandas is concerned. However, if we continue to
    # treat them as objects when sorting them, we'll get a list of feature
    # ranks in lexicographic order... which is not what we want. So we just
    # ensure that all of the columns contain numeric data.
    for col in ranking_ids:
        rank_data[col] = pd.to_numeric(rank_data[col])

    # The default rank column is just whatever the first rank is. This is what
    # the rank plot will use when it's first drawn.
    default_rank_col = ranking_ids[0]

    # Set default classification of every feature to "None"
    # (This value will be updated when a feature is selected in the rank plot
    # as part of the numerator, denominator, or both parts of the current log
    # ratio.)
    rank_data["qurro_classification"] = "None"

    # Add a "qurro_spc" column indicating how many samples each feature is
    # present in.
    rank_data = add_sample_presence_count(rank_data, table_sdf)

    # Replace "index" with "Feature ID". looks nicer in the visualization :)
    rank_data.rename_axis("Feature ID", axis="index", inplace=True)
    rank_data.reset_index(inplace=True)

    # Now, we can actually create the rank plot.
    rank_chart = (
        alt.Chart(
            rank_data,
            title="Features",
            background="#FFFFFF",
            autosize=alt.AutoSizeParams(resize=True),
        ).mark_bar().transform_window(
            sort=[alt.SortField(field=default_rank_col, order="ascending")],
            # We don't use an alt.WindowFieldDef here because python gets
            # confused when you use "as" as an actual argument name. So we just
            # use this syntax.
            window=[{
                "op": "row_number",
                "as": "qurro_x"
            }],
        ).encode(
            # type="ordinal" needed on the scale here to make bars adjacent;
            # see https://stackoverflow.com/a/55544817/10730311.
            x=alt.X(
                "qurro_x",
                title="Feature Rankings",
                type="ordinal",
                scale=alt.Scale(paddingOuter=1, paddingInner=0, rangeStep=1),
                axis=alt.Axis(ticks=False, labelAngle=0),
            ),
            y=alt.Y(default_rank_col, type="quantitative"),
            color=alt.Color(
                "qurro_classification",
                title="Log-Ratio Classification",
                scale=alt.Scale(
                    domain=["None", "Numerator", "Denominator", "Both"],
                    range=["#e0e0e0", "#f00", "#00f", "#949"],
                ),
            ),
            tooltip=[
                alt.Tooltip(
                    field="qurro_x",
                    title="Current Ranking",
                    type="quantitative",
                ),
                alt.Tooltip(
                    field="qurro_classification",
                    title="Log-Ratio Classification",
                    type="nominal",
                ),
                alt.Tooltip(
                    field="qurro_spc",
                    title="Sample Presence Count",
                    type="quantitative",
                ),
                "Feature ID",
                *feature_metadata_cols,
                *ranking_ids,
            ],
        ).configure_axis(
            # Done in order to differentiate "None"-classification features
            # from grid lines
            gridColor="#f2f2f2",
            labelBound=True,
        ).interactive())

    rank_chart_json = rank_chart.to_dict()
    rank_ordering = "qurro_rank_ordering"
    fm_col_ordering = "qurro_feature_metadata_ordering"
    dataset_name_for_rank_type = "qurro_rank_type"
    check_json_dataset_names(rank_chart_json, rank_ordering, fm_col_ordering,
                             rank_type)

    # Note we don't use rank_data.columns for setting the rank ordering. This
    # is because rank_data's columns now include both the ranking IDs and the
    # "Feature ID" and "qurro_classification" columns (as well as any feature
    # metadata the user saw fit to pass in).
    rank_chart_json["datasets"][rank_ordering] = list(ranking_ids)
    rank_chart_json["datasets"][fm_col_ordering] = list(feature_metadata_cols)
    rank_chart_json["datasets"][dataset_name_for_rank_type] = rank_type
    return rank_chart_json
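
# A hypothetical call sketch (argument values are illustrative, not from the
# Qurro codebase): V holds the matched feature ranks, table_sdf the matched
# BIOM table, and the first two columns of V are assumed to be rankings.
#
#     rank_spec = gen_rank_plot(V, "Differential", V.columns[:2], [], table_sdf)
#     with open("rank_plot.json", "w") as f:
#         json.dump(rank_spec, f)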
import pandas as pd
import altair as alt

office_ratings = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv')

#alt.data_transformers.disable_max_rows()
chart = alt.Chart(office_ratings).mark_tick().encode(
    #x='imdb_rating:Q',
    alt.X('imdb_rating:Q',
        scale=alt.Scale(domain=(6, 10)),
        axis=alt.Axis(title='IMDb Rating') #(format='%', title='percentage')
    ),
    alt.Y('season:O',
        axis=alt.Axis(title='Season') 
    )
).properties(
    #width=550, height=140, 
    title={
      "text": ['The Office IMDb Ratings Distribution by Season'], 
      "fontSize": 18,
      "font": 'Courier',
      "anchor": 'middle',
      "color": 'gray'
    }
)
alt.concat(chart, 
    title=alt.TitleParams(
        ['', '#30DayChartChallenge - strips - 2021/04/12', 
        'Dataset: TidyTuesday Dataset 2020-03-17', 
        'twitter.com/vivekparasharr | github.com/vivekparasharr | vivekparasharr.medium.com'],
    )
)
def get_plot(df, max_x: int, fields: list):
    data = df[df['legend'].isin(fields)]
    return alt.Chart(data).mark_line().encode(
        x=alt.X('day', scale=alt.Scale(domain=(0, max_x))),
        y=alt.Y('cases'),
        color='legend').properties(width=800, height=400)
Esempio n. 24
0
def get_interactive_proportions_plot(gender_balance):
    source = data_frames[gender_balance]
    sort_order = sort_orders[gender_balance]
    pts = alt.selection(type="multi", encodings=['x'], empty='none')
    # `label` is referenced below but was defined outside this snippet; a
    # nearest-point selection (in the spirit of Altair's multi-line tooltip
    # example) is assumed here.
    label = alt.selection(type='single', nearest=True, on='mouseover',
                          encodings=['x'], empty='none')

    lin = alt.Chart(source).mark_line().encode(
        alt.X('year:O', title='Year', axis=alt.Axis(labelAngle=-45)),
        alt.Y('female_prop:Q',
              title="Female Percentage",
              axis=alt.Axis(format='%'),
              scale=alt.Scale(domain=[0, 1])),
        alt.Color('job:N', legend=None)).transform_filter(pts).properties(
            width=450, height=325, title="Female Percentage in a Job by Year")

    hrule = alt.Chart(pd.DataFrame({'y': [0.5]})).mark_rule(
        color='red', strokeDash=[5, 5]).encode(y=alt.Y('y:Q'))

    vrule = alt.Chart(pd.DataFrame({'x': [0.5]})).mark_rule(
        color='red', strokeDash=[5, 5]).encode(x=alt.X('x:Q'))

    lin_w_interaction = alt.layer(
        lin,  # base line chart
        alt.Chart().mark_rule(color='#aaa').encode(
            x='year:O').transform_filter(label),
        lin.mark_circle().encode(opacity=alt.condition(label, alt.value(
            1), alt.value(0))).add_selection(label),
        lin.mark_text(
            align='left', dx=5, dy=-5, stroke='white',
            strokeWidth=2).encode(text=alt.Text(
                'female_prop:Q', format='.2%')).transform_filter(label),
        lin.mark_text(align='left', dx=5, dy=-5).encode(text=alt.Text(
            'female_prop:Q', format='.2%')).transform_filter(label),
        hrule,
        data=source)

    bar = alt.Chart(source).mark_bar(size=30).encode(
        y=alt.Y('job:N', title='', sort=sort_order),
        x=alt.X('total_prop_female:Q',
                title="Female Percentage",
                axis=alt.Axis(format='%')),
        color=alt.condition(pts, alt.Color('job:N', legend=None),
                            alt.ColorValue("grey")),
        tooltip=[
            alt.Tooltip(field="job", type="nominal", title="Job"),
            alt.Tooltip(field="total_prop_female",
                        type="quantitative",
                        title="Total Female Percentage",
                        format='.2%')
        ]).properties(
            width=225,
            height=350,
            title="Jobs by Total Female Percentage (For the 10 most " +
            gender_balance + " jobs)").add_selection(pts)

    bar_w_vrule = alt.layer(bar, vrule, data=source)
    if (gender_balance == 'male dominated'):
        interactive_job_chart = alt.hconcat(
            lin_w_interaction, bar).resolve_legend(
                color="independent",
                size="independent").configure_axis(labelFontSize=13,
                                                   titleFontSize=14)
    else:
        interactive_job_chart = alt.hconcat(
            lin_w_interaction,
            bar_w_vrule).resolve_legend(color="independent",
                                        size="independent").configure_axis(
                                            labelFontSize=13, titleFontSize=14)

    # Save html as a StringIO object in memory
    job_gender_proportions_html = io.StringIO()
    interactive_job_chart.save(job_gender_proportions_html, 'html')

    # Return the html from StringIO object
    return job_gender_proportions_html.getvalue()
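
# Hypothetical usage (the module-level `data_frames` and `sort_orders` dicts
# are referenced above but defined elsewhere):
#
#     html = get_interactive_proportions_plot("male dominated")
#     with open("job_gender_proportions.html", "w") as f:
#         f.write(html)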
Esempio n. 25
0
def visualize_sentiment(sentiment_df, plot_type="Standard"):
    """
    Takes in the output of sentiment_analysis and creates
    a visualization of user's tweets with sentimental analysis.

    Parameters:
    -----------
    sentiment_df : dataframe
        Output of tweet_sentiment_analysis,
        dataframe that contains added columns from tweet_sentiment_analysis

    plot_type : string
        Optional: Type of plot to return, 3 options:'Standard', 'Stacked', and 'Separate'
        'Standard' Returns bar plot of most common words tweeted color coded by sentiment
        'Stacked' Returns same as 'Standard' but if words are found in other sentiments they are stacked together
        'Separate' Returns 3 bar plots with the sentiment of 'Positive', 'Neutral', and 'Negative' separated

    Returns:
    --------
     plot:
        A bar plot of the user's tweets containing in order
        the most common words, colour coded by the word's sentiment class.
    """

    # check inputs
    options = ("Standard", "Stacked", "Separate")
    if plot_type not in options:
        raise TypeError("Invalid argument for plot_type: You must enter one of 'Standard', 'Stacked', 'Separate'")
    elif not isinstance(sentiment_df, pd.DataFrame):
        raise Exception("""The input of sentiment_df should be a Pandas DataFrame,
                           did you use output of tweet_sentiment_analysis?""")
    elif 'sentiment' not in sentiment_df:
        raise KeyError("Input does not contain column for sentiment, did you use output of tweet_sentiment_analysis?")

    # Define tweet_rank function
    def tweet_rank(df, sentiment):
        """function to return most common words tweeted for a specified sentiment"""
        df_senti = df[df['sentiment'] == sentiment]
        countVectorizer = CountVectorizer(analyzer=text_cleaning, stop_words='english')
        countVector = countVectorizer.fit_transform(df_senti['tweet'])
        count_vect_df = pd.DataFrame(countVector.toarray(), columns=countVectorizer.get_feature_names())
        count_vect_df.head()
        count = pd.DataFrame(count_vect_df.sum())
        countdf = count.sort_values(0, ascending=False).head(20)
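        # drop the single most frequent entry and keep the next ten words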
        return countdf[1:11]

    dataframes = dict()            # create empty dictionary to store sentiment dataframes
    for sentiment in np.unique(sentiment_df["sentiment"]):
        sent_df = tweet_rank(sentiment_df, sentiment)
        sent_df.columns = ['frequency']                # rename columns and include column for sentiment and word
        sent_df["sentiment"] = sentiment
        sent_df['Word'] = sent_df.index
        dataframes[sentiment] = sent_df     # append sentiment dataframe to dictionary

    # add all dataframes together, may need adjustment later
    top_words_df = pd.concat([dataframes['positive'], dataframes['neutral'], dataframes['negative']])

    # Plot if standard is selected
    if plot_type == "Standard":
        top_words_df['Word'] = top_words_df['Word'] + ' (' + top_words_df["sentiment"] + ')'
        standard_plot = alt.Chart(top_words_df, title='Most Common Words used by Twitter User').mark_bar().encode(
            x=alt.X('frequency', title='Number of Occurrences'),
            y=alt.Y('Word', sort='-x'),
            color=alt.Color("sentiment", scale=alt.Scale(domain=['positive', 'neutral', 'negative'],
                            range=['blue', 'orange', 'red'])))
        return standard_plot

    # Plot if stacked is selected
    elif plot_type == "Stacked":
        top_words_df['Word'] = top_words_df.index
        stacked_plot = alt.Chart(top_words_df, title='Most Common Words used by Twitter User').mark_bar().encode(
            x=alt.X('frequency', title='Number of Occurrences'),
            y=alt.Y('Word', sort='-x'),
            color=alt.Color("sentiment", scale=alt.Scale(domain=['positive', 'neutral', 'negative'],
                            range=['blue', 'orange', 'red'])))
        return stacked_plot

    # Plot if Separate is selected
    elif plot_type == "Separate":
        negative = alt.Chart(dataframes['negative'],
                             title='Most Common Negative Words used by Twitter User').mark_bar().encode(
            x=alt.X('frequency', title='Number of Occurrences'),
            y=alt.Y('Word', sort='-x'),
            color=alt.value("red"))

        positive = alt.Chart(dataframes['positive'],
                             title='Most Common Positive Words used by Twitter User').mark_bar().encode(
            x=alt.X('frequency', title='Number of Occurrences'),
            y=alt.Y('Word', sort='-x'),
            color=alt.value("blue"))

        neutral = alt.Chart(dataframes['neutral'],
                            title='Most Common Neutral Words used by Twitter User').mark_bar().encode(
            x=alt.X('frequency', title='Number of Occurrences'),
            y=alt.Y('Word', sort='-x'),
            color=alt.value("orange"))

        separate_plot = positive | neutral | negative
        return separate_plot
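
# A toy invocation sketch; the `text_cleaning` helper referenced above is not
# defined in this snippet, so the call itself is left commented out.
#
#     demo_df = pd.DataFrame({
#         "tweet": ["I love this!", "This is fine.", "I hate waiting."],
#         "sentiment": ["positive", "neutral", "negative"],
#     })
#     visualize_sentiment(demo_df, plot_type="Stacked")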
Esempio n. 26
0
import pandas as pd
import altair as alt
import streamlit as st

st.title("Let's analyze some Penguin Data 🐧📊.")

@st.cache  # add caching so we load the data only once
def load_data():
    # Load the penguin data from https://github.com/allisonhorst/palmerpenguins.
    penguins_url = "https://raw.githubusercontent.com/allisonhorst/palmerpenguins/v0.1.0/inst/extdata/penguins.csv"
    return pd.read_csv(penguins_url)

df = load_data()

st.write("Let's look at raw data in the Pandas Data Frame.")

st.write(df)

st.write("Hmm 🤔, is there some correlation between body mass and flipper length? Let's make a scatterplot with [Altair](https://altair-viz.github.io/) to find.")

chart = alt.Chart(df).mark_point().encode(
    x=alt.X("body_mass_g", scale=alt.Scale(zero=False)),
    y=alt.Y("flipper_length_mm", scale=alt.Scale(zero=False)),
    color=alt.Y("species")
).properties(
    width=600, height=400
).interactive()

st.write(chart)

st.markdown("This project was created by Student1 and Student2 for the [Interactive Data Science](https://dig.cmu.edu/ids2022) course at [Carnegie Mellon University](https://www.cmu.edu).")
Esempio n. 27
0
"""
Natural Disasters
-----------------
This example shows a visualization of global deaths from natural disasters.
"""
# category: scatter plots
import altair as alt
from altair.expr import datum
from vega_datasets import data

source = data.disasters.url

alt.Chart(source).mark_circle(
    opacity=0.8, stroke='black', strokeWidth=1).encode(
        alt.X('Year:O', axis=alt.Axis(labelAngle=0)), alt.Y('Entity:N'),
        alt.Size('Deaths:Q',
                 scale=alt.Scale(range=[0, 5000]),
                 legend=alt.Legend(title='Annual Global Deaths')),
        alt.Color('Entity:N', legend=None)).properties(
            width=480, height=350).transform_filter(
                datum.Entity != 'All natural disasters')
"""
Ranged Dot Plot
-----------------
This example shows a ranged dot plot that uses 'layer' to convey changing life expectancy for the five most populous countries (between 1955 and 2000).
"""
# category: other charts
import altair as alt
from vega_datasets import data

source = data.countries.url

chart = alt.layer(data=source).transform_filter(
    filter={
        "field": 'country',
        "oneOf": ["China", "India", "United States", "Indonesia", "Brazil"]
    }).transform_filter(filter={
        'field': 'year',
        "oneOf": [1955, 2000]
    })

chart += alt.Chart().mark_line(color='#db646f').encode(x='life_expect:Q',
                                                       y='country:N',
                                                       detail='country:N')
# Add points for life expectancy in 1955 & 2000
chart += alt.Chart().mark_point(size=100, opacity=1, filled=True).encode(
    x='life_expect:Q',
    y='country:N',
    color=alt.Color('year:O',
                    scale=alt.Scale(domain=['1955', '2000'],
                                    range=['#e6959c',
                                           '#911a24']))).interactive()

chart
Esempio n. 29
0
def analyse_view_clones_ts_fragments():

    log.info("read views/clones time series fragments (CSV docs)")

    basename_suffix = "_views_clones_series_fragment.csv"
    csvpaths = _glob_csvpaths(basename_suffix)

    dfs = []
    column_names_seen = set()

    for p in csvpaths:
        log.info("attempt to parse %s", p)
        snapshot_time = _get_snapshot_time_from_path(p, basename_suffix)

        df = pd.read_csv(
            p,
            index_col=["time_iso8601"],
            date_parser=lambda col: pd.to_datetime(col, utc=True),
        )

        # A time series fragment might look like this:
        #
        # df_views_clones:
        #                            clones_total  ...  views_unique
        # time_iso8601                             ...
        # 2020-12-21 00:00:00+00:00           NaN  ...             2
        # 2020-12-22 00:00:00+00:00           2.0  ...            23
        # 2020-12-23 00:00:00+00:00           2.0  ...            20
        # ...
        # 2021-01-03 00:00:00+00:00           8.0  ...            21
        # 2021-01-04 00:00:00+00:00           7.0  ...            18
        #
        # Note the NaN and the floaty type.

        # All metrics are known to be integers by definition here. NaN values
        # are expected to be present anywhere in this dataframe, and they
        # semantically mean "0". Therefore, replace those with zeros. Also see
        # https://github.com/jgehrcke/github-repo-stats/issues/4
        df = df.fillna(0)
        # Make sure numbers are treated as integers from here on. This matters
        # only cosmetically, for outputting the aggregate CSV later (not for
        # plotting and number crunching).
        df = df.astype(int)

        # attach snapshot time as meta data prop to df
        df.attrs["snapshot_time"] = snapshot_time

        # The index is not of string type anymore, but of type
        # `pd.DatetimeIndex`. Reflect that in the name.
        df.index.rename("time", inplace=True)

        if column_names_seen and set(df.columns) != column_names_seen:
            log.error("columns seen so far: %s", column_names_seen)
            log.error("columns in %s: %s", p, df.columns)
            sys.exit(1)

        column_names_seen.update(df.columns)

        df = df.sort_index()

        # Sanity check: is the snapshot time _after_ the latest timestamp in
        # the time series? A violation could happen on a machine with a badly
        # set clock at data-fetch time.
        if df.index.max() > snapshot_time:
            log.error(
                "for CSV file %s the snapshot time %s is older than the newest sample",
                p,
                snapshot_time,
            )
            sys.exit(1)

        dfs.append(df)

    # for df in dfs:
    #     print(df)

    log.info("total sample count: %s", sum(len(df) for df in dfs))

    newest_snapshot_time = max(df.attrs["snapshot_time"] for df in dfs)

    df_prev_agg = None
    if ARGS.views_clones_aggregate_inpath:
        if os.path.exists(ARGS.views_clones_aggregate_inpath):
            log.info("read previous aggregate: %s",
                     ARGS.views_clones_aggregate_inpath)
            df_prev_agg = pd.read_csv(
                ARGS.views_clones_aggregate_inpath,
                index_col=["time_iso8601"],
                date_parser=lambda col: pd.to_datetime(col, utc=True),
            )
            df_prev_agg.index.rename("time", inplace=True)
        else:
            log.info(
                "previous aggregate file does not exist: %s",
                ARGS.views_clones_aggregate_inpath,
            )

    log.info("time of newest snapshot: %s", newest_snapshot_time)
    log.info("build aggregate, drop duplicate data")

    # Each dataframe in `dfs` corresponds to one time series fragment
    # ("snapshot") obtained from the GitHub API. Each time series fragment
    # contains 15 samples (rows), with two adjacent samples being 24 hours
    # apart. Ideally, the time series fragments overlap in time. They overlap
    # potentially by a lot, depending on when the individual snapshots were
    # taken (think: take one snapshot per day; then 14 out of 15 data points
    # are expected to be "the same" as in the snapshot taken the day before).
    # Stitch these fragments together (yielding a bunch of duplicate samples),
    # and then sort the result by time.
    log.info("pd.concat(dfs)")
    dfall = pd.concat(dfs)

    if df_prev_agg is not None:
        if set(df_prev_agg.columns) != set(dfall.columns):
            log.error(
                "set(df_prev_agg.columns) != set (dfall.columns): %s, %s",
                df_prev_agg.columns,
                dfall.columns,
            )
            sys.exit(1)
        log.info("pd.concat(dfall, df_prev_agg)")
        dfall = pd.concat([dfall, df_prev_agg])

    dfall.sort_index(inplace=True)

    log.info("shape of dataframe before dropping duplicates: %s", dfall.shape)
    # print(dfall)

    # Now, the goal is to drop duplicate data. Again, given the large overlap
    # between snapshots, a lot of duplicate data is to be expected. What does
    # "duplicate data" mean? We expect multiple samples from different
    # snapshots with equivalent timestamps. OK, we should just take any one of
    # them. They should all be the same, right? They are not all equivalent.
    # I've found that at the boundaries of each time series fragment, the
    # values returned by the GitHub API are subject to a non-obvious cutoff
    # effect: for example, in a snapshot obtained on Dec 15, the sample for
    # Dec 7 is within the mid part of the fragment and shows a value of 73 for
    # `clones_total`. The snapshot obtained on Dec 21 has the sample for Dec 7
    # at the boundary (left-hand, towards the past), and that shows a value of
    # 18 for `clones_total`. 73 vs 18 -- how is that possible? It's easily
    # possible, assuming that GitHub uses a rolling window of 14 days width
    # with a precision higher than 1 day, so that the cutoff for the data
    # points at the boundary depends on the _exact time_ when the snapshot was
    # taken. That is, for aggregation (for dropping duplicate/bad data) we
    # want to look for the maximum data value for any given timestamp. Using
    # that method, we effectively ignore said cutoff artifact. In short: group
    # by timestamp (index), take the maximum.
    df_agg = dfall.groupby(dfall.index).max()

    log.info("shape of dataframe after dropping duplicates: %s", df_agg.shape)

    # Write aggregate
    # agg_fname = (
    #     datetime.strftime(newest_snapshot_time, "%Y-%m-%d_%H%M%S")
    #     + "_views_clones_aggregate.csv"
    # )
    # agg_fpath = os.path.join(ARGS.snapshotdir, agg_fname)
    if ARGS.views_clones_aggregate_outpath:

        if os.path.exists(ARGS.views_clones_aggregate_outpath):
            log.info("file exists: %s", ARGS.views_clones_aggregate_outpath)
            if not ARGS.views_clones_aggregate_inpath:
                log.error(
                    "would overwrite output aggregate w/o reading input aggregate -- you know what you're doing?"
                )
                sys.exit(1)

        log.info("write aggregate to %s", ARGS.views_clones_aggregate_outpath)
        # Pragmatic strategy against partial write / encoding problems.
        tpath = ARGS.views_clones_aggregate_outpath + ".tmp"
        df_agg.to_csv(tpath, index_label="time_iso8601")
        os.rename(tpath, ARGS.views_clones_aggregate_outpath)

        if ARGS.delete_ts_fragments:
            # Iterate through precisely the set of files that was read above.
            # If unlinking fails at the OS level, don't crash this program.
            for p in csvpaths:
                log.info("delete %s as of --delete-ts-fragments", p)
                try:
                    os.unlink(p)
                except Exception as e:
                    log.warning("could not unlink %s: %s", p, str(e))

    # print(df_agg)

    # matplotlib_config()
    # log.info("aggregated sample count: %s", len(df_agg))
    # df_agg.plot(
    #     linestyle="solid",
    #     marker="o",
    #     markersize=5,
    #     subplots=True,
    #     # ylabel="count",
    #     xlabel="",
    #     # logy="sym",
    # )
    # plt.ylim([0, None])
    # plt.tight_layout()
    # plt.show()

    # Why reset_index()? See
    # https://github.com/altair-viz/altair/issues/271#issuecomment-573480284
    df_agg = df_agg.reset_index()
    df_agg_views = df_agg.drop(columns=["clones_unique", "clones_total"])
    df_agg_clones = df_agg.drop(columns=["views_unique", "views_total"])

    PANEL_WIDTH = "container"
    PANEL_HEIGHT = 200

    panel_props = {"height": PANEL_HEIGHT, "width": PANEL_WIDTH, "padding": 10}

    chart_clones_unique = ((alt.Chart(df_agg_clones).mark_line(
        point=True).encode(
            alt.X("time",
                  type="temporal",
                  title="date",
                  timeUnit="yearmonthdate"),
            alt.Y(
                "clones_unique",
                type="quantitative",
                title="unique clones per day",
                scale=alt.Scale(
                    domain=(0, df_agg_clones["clones_unique"].max() * 1.1),
                    zero=True,
                ),
            ),
        )).configure_axisY(labelBound=True).configure_point(
            size=100).properties(**panel_props))

    chart_clones_total = ((alt.Chart(df_agg_clones).mark_line(
        point=True).encode(
            alt.X("time",
                  type="temporal",
                  title="date",
                  timeUnit="yearmonthdate"),
            alt.Y(
                "clones_total",
                type="quantitative",
                title="total clones per day",
                scale=alt.Scale(
                    domain=(0, df_agg_clones["clones_total"].max() * 1.1),
                    zero=True,
                ),
            ),
        )).configure_axisY(labelBound=True).configure_point(
            size=100).properties(**panel_props))

    chart_views_unique = ((alt.Chart(df_agg_views).mark_line(
        point=True).encode(
            alt.X("time",
                  type="temporal",
                  title="date",
                  timeUnit="yearmonthdate"),
            alt.Y(
                "views_unique",
                type="quantitative",
                title="unique views per day",
                scale=alt.Scale(
                    domain=(0, df_agg_views["views_unique"].max() * 1.1),
                    zero=True,
                ),
            ),
        )).configure_axisY(labelBound=True).configure_point(
            size=100).properties(**panel_props))

    chart_views_total = ((alt.Chart(df_agg_views).mark_line(point=True).encode(
        alt.X("time", type="temporal", title="date", timeUnit="yearmonthdate"),
        alt.Y(
            "views_total",
            type="quantitative",
            title="total views per day",
            scale=alt.Scale(
                domain=(0, df_agg_views["views_total"].max() * 1.1),
                zero=True,
            ),
        ),
    )).configure_axisY(labelBound=True).configure_point(size=100).properties(
        **panel_props))

    chart_views_unique_spec = chart_views_unique.to_json(indent=None)
    chart_views_total_spec = chart_views_total.to_json(indent=None)
    chart_clones_unique_spec = chart_clones_unique.to_json(indent=None)
    chart_clones_total_spec = chart_clones_total.to_json(indent=None)

    MD_REPORT.write(
        textwrap.dedent("""


    ## Views

    #### Unique visitors
    <div id="chart_views_unique" class="full-width-chart"></div>

    #### Total views
    <div id="chart_views_total" class="full-width-chart"></div>

    <div class="pagebreak-for-print"> </div>


    ## Clones

    #### Unique cloners
    <div id="chart_clones_unique" class="full-width-chart"></div>

    #### Total clones
    <div id="chart_clones_total" class="full-width-chart"></div>

    """))
    JS_FOOTER_LINES.extend([
        f"vegaEmbed('#chart_views_unique', {chart_views_unique_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);",
        f"vegaEmbed('#chart_views_total', {chart_views_total_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);",
        f"vegaEmbed('#chart_clones_unique', {chart_clones_unique_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);",
        f"vegaEmbed('#chart_clones_total', {chart_clones_total_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);",
    ])
Esempio n. 30
0
    'Select region', ("Asia & Australasia", "Eastern Europe", "Latin America",
                      "Middle East and North Africa", "North America",
                      "Sub-Saharan Africa", "Western Europe"),
    key='1')

df_combined = df_combined[df_combined['Region'].isin(choice)]

# Configure the options common to all layers
brush = alt.selection(type='interval')
base = alt.Chart(df_combined).add_selection(brush)

points = alt.Chart(df_combined).mark_circle().encode(
    alt.X('Democracy_Score', title='Democracy Index'),
    alt.Y('SocialProgress_Score',
          title='Social Progress Index',
          scale=alt.Scale(domain=(30, 100))),
    color=('Region:N'),
    size='Population')

# Configure the ticks
tick_axis = alt.Axis(labels=False, domain=False, ticks=False)

x_ticks = base.mark_tick().encode(alt.X('Democracy_Score',
                                        title='',
                                        axis=tick_axis),
                                  alt.Y('Region', title='', axis=tick_axis),
                                  color=alt.condition(brush, 'Region',
                                                      alt.value('lightgrey')))

y_ticks = base.mark_tick().encode(alt.X('Region', title='', axis=tick_axis),
                                  alt.Y('SocialProgress_Score',