if st.checkbox('Show Raw Data'):
    st.write(df_filtered)

st.write(
    "Let's look at how Trump's sentiment changes over time from 2009-05 to 2020-06"
)
st.write(
    "Sentiment is a value between -1 and 1, where -1 stands for 100% negative and 1 stands for 100% positive"
)

brush = alt.selection_interval(encodings=['x'])

chart2 = alt.Chart(df_filtered).mark_bar().encode(
    x=alt.X('date:T'),
    y=alt.Y('sentiment:Q'),
    tooltip=['date:T', 'sentiment:Q']).properties(width=800, height=400)
chart2 = chart2.encode(color=alt.condition(
    brush, 'favorites:Q', alt.value('lightgray'))).add_selection(brush)

chart = alt.Chart(df_filtered).mark_circle().encode(
    x=alt.X('date:T'),
    y=alt.Y('sentiment:Q'),
    color=alt.condition(alt.datum.sentiment > .15, alt.value('red'),
                        alt.value('blue')),
    size=alt.Size('retweets:Q', scale=alt.Scale(range=[20, 200])),
    tooltip=['date:T', 'sentiment:Q', 'retweets:Q',
             'favorites:Q']).properties(width=800,
                                        height=400).transform_filter(brush)

st.write(chart2 & chart)
profile_df_list = []
for i, row in profile_and_dem_df.iterrows():
    profile_df = get_raster_values_for_line(row.geometry, row.filename)
    profile_df['area'] = row['area']
    profile_df['date'] = row['date']
    profile_df_list.append(profile_df)

profile_df = pd.concat(profile_df_list)
profile_df

alt.Chart(profile_df).mark_line().encode(
    x=alt.X('path_distance:Q', title='Pathwise Distance (m)'),
    y=alt.Y('raster_value:Q', title='Elevation (m)',
            scale=alt.Scale(zero=False)),
    color='date:N'
).properties(
    height=400,
    width=800,
    title={
        'text': ['Uncertainty analysis, Paradise road Mt. Rainier'],
        'subtitle': [
            'Comparing HSFM DEMs with and without error surface fitting correction'
        ]
    }).resolve_scale(x='independent', y='independent')

profile_df.date.unique()

# +
src = profile_df[profile_df['date'] == 'USGS LIDAR 2007/08']
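# Note: get_raster_values_for_line() is called above but not defined in this
# snippet. A minimal sketch of what such a helper might look like, assuming
# rasterio + shapely geometries (the function name suffix, the column names
# 'path_distance'/'raster_value', and the fixed number of sample points are
# assumptions, not the original implementation):
import numpy as np
import rasterio


def get_raster_values_for_line_sketch(line, raster_path, n_points=500):
    """Sample the first raster band along a shapely LineString."""
    distances = np.linspace(0, line.length, n_points)
    points = [line.interpolate(d) for d in distances]
    coords = [(p.x, p.y) for p in points]
    with rasterio.open(raster_path) as src:
        values = [v[0] for v in src.sample(coords)]
    return pd.DataFrame({'path_distance': distances, 'raster_value': values})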
def analyse_top_x_snapshots(entity_type):
    assert entity_type in ["referrer", "path"]

    log.info("read 'top %s' snapshots (CSV docs)", entity_type)

    basename_suffix = f"_top_{entity_type}s_snapshot.csv"
    csvpaths = _glob_csvpaths(basename_suffix)
    snapshot_dfs = _get_snapshot_dfs(csvpaths, basename_suffix)

    # for df in snapshot_dfs:
    #     print(df)

    # Keep in mind: an entity_type is either a top 'referrer', or a top 'path'.
    # Find all entities seen across snapshots, by their name. For type referrer
    # a specific entity (referrer) name might be `github.com`.
    def _get_uens(snapshot_dfs):
        unique_entity_names = set()
        for df in snapshot_dfs:
            unique_entity_names.update(df[entity_type].values)
        return unique_entity_names

    unique_entity_names = _get_uens(snapshot_dfs)
    log.info("all %s entities seen: %s", entity_type, unique_entity_names)

    # Clarification: each snapshot dataframe corresponds to a single point in
    # time (the snapshot time) and contains information about multiple top
    # referrers/paths. Now, invert that structure: work towards individual
    # dataframes where each dataframe corresponds to a single referrer/path,
    # and contains information about multiple timestamps.

    # First, create a dataframe containing all information.
    dfa = pd.concat(snapshot_dfs)

    if len(dfa) == 0:
        log.info("leave early: no data for entity of type %s", entity_type)

    # Build a dict: key is path/referrer name, and value is DF with
    # corresponding raw time series.
    entity_dfs = _build_entity_dfs(dfa, entity_type, unique_entity_names)

    # It's important to clarify what each data point in a per-referrer raw time
    # series means. Each data point has been returned by the GitHub traffic
    # API. Each sample (row in the df) can/should be looked at as the result of
    # a rolling window analysis that shows cumulative values summed up over a
    # period of 14 days, noted at the _right edge_ of the rolling time window.

    # This needs further verification, but I think the boundaries of the time
    # window actually move with sub-day resolution, i.e. the same query
    # performed within the same day may yield different outcomes. If that's
    # true, the rolling time window analysis performed internally at GitHub can
    # be perfectly inverted, yielding per-referrer traffic statistics at a
    # sub-day time resolution. That of course will require predictable,
    # periodic sampling. Let's keep that in mind for now.

    # One interesting way to look at the data: find the top N referrers based
    # on unique views, for the entire time range seen.
    max_vu_map = {}
    for ename, edf in entity_dfs.items():
        max_vu_map[ename] = edf["views_unique"].max()
    del ename

    # Sort dict so that the first item is the referrer/path with the highest
    # views_unique seen.
    sorted_dict = {
        k: v
        for k, v in sorted(
            max_vu_map.items(), key=lambda i: i[1], reverse=True)
    }

    top_n = 10
    top_n_enames = list(sorted_dict.keys())[:top_n]

    # simulate a case where there are different timestamps across per-referrer
    # dfs: copy a 'row', and re-insert it with a different timestamp.
    # row = referrer_dfs["t.co"].take([-1])
    # print(row)
    # referrer_dfs["t.co"].loc["2020-12-30 12:25:08+00:00"] = row.iloc[0]
    # print(referrer_dfs["t.co"])

    df_top_vu = pd.DataFrame()
    for ename in top_n_enames:
        edf = entity_dfs[ename]
        # print(edf)
        df_top_vu[ename] = edf["views_unique"]
    # del ename

    log.info(
        "The top %s %s based on unique views, for the entire time range seen:\n%s",
        top_n,
        entity_type,
        df_top_vu,
    )

    # For plotting with Altair, reshape the data using pd.melt() to combine
    # the multiple columns into one, where the referrer name is not a column
    # label, but a value in a column. Or we could use the transform_fold()
    # technique:
    # https://altair-viz.github.io/user_guide/data.html#converting-between-long-form-and-wide-form-pandas
    # with .transform_fold(top_n_rnames, as_=["referrer", "views_unique"])
    #
    # Also copy the index into a normal column via `reset_index()`, for
    # https://altair-viz.github.io/user_guide/data.html#including-index-data
    df_melted = df_top_vu.melt(var_name=entity_type,
                               value_name="views_unique",
                               ignore_index=False).reset_index()
    # print(df_melted)

    # Normalize the main metric to show a view count _per day_, and clarify in
    # the plot that this is a _mean_ value derived from the _last 14 days_.
    df_melted["views_unique_norm"] = df_melted["views_unique"] / 14.0

    # For paths, it's relevant to identify the common prefix (repo owner/name)
    # cmn_ename_prefix = os.path.commonprefix(list(unique_entity_names))
    # log.info("cmn_ename_prefix: %s", cmn_ename_prefix)
    # if entity_type == "path":
    #     log.info("remove common path prefix")
    #     df_melted["path"] = df_melted["path"].str.slice(start=len(cmn_ename_prefix))
    #     # The root path (e.g., `owner/repo`) is now an empty string. That's
    #     # not so cool, make the root be represented by a single slash.
    #     # df_melted[df_melted["path"] == ""]["path"] = "/"
    #     df_melted["path"].replace("", "/", inplace=True)

    panel_props = {"height": 300, "width": "container", "padding": 10}
    chart = (
        alt.Chart(df_melted).mark_line(point=True)
        # .encode(x="time:T", y="views_unique:Q", color="referrer:N")
        #
        # The pandas dataframe datetimeindex contains timing information at
        # much higher resolution than 1 day. The resulting vega spec may then
        # see time values like this: `"time": "2021-01-03T00:00:00+00:00"`
        # -- suggesting to vega that we care about showing hours and minutes.
        # Instruct vega to only care about _days_ (dates), via an altair-based
        # timeunit transformation. Ref:
        # https://altair-viz.github.io/user_guide/transform/timeunit.html
        .encode(
            alt.X("time", type="temporal", title="date", timeUnit="yearmonthdate"),
            alt.Y(
                "views_unique_norm",
                type="quantitative",
                title="unique visitors per day (mean from last 14 days)",
                scale=alt.Scale(
                    domain=(0, df_melted["views_unique_norm"].max() * 1.1),
                    zero=True,
                ),
            ),
            alt.Color(
                entity_type,
                type="nominal",
                sort=alt.SortField("order"),
            ),
        ).configure_point(size=50).properties(**panel_props))

    chart_spec = chart.to_json(indent=None)

    # From https://altair-viz.github.io/user_guide/customization.html
    # "Note that this will only scale with the container if its parent element
    # has a size determined outside the chart itself; For example, the
    # container may be a <div> element that has style width: 100%; height:
    # 300px."
    heading = "Top referrers" if entity_type == "referrer" else "Top paths"

    # Textual form: larger N, and no cutoff (arbitrary length and legend of
    # plot don't go well with each other).
    top_n = 15
    top_n_enames = list(sorted_dict.keys())[:top_n]
    top_n_enames_string_for_md = ", ".join(
        f"{str(i).zfill(2)}: `{n}`" for i, n in enumerate(top_n_enames, 1))

    MD_REPORT.write(
        textwrap.dedent(f"""

    #### {heading}

    <div id="chart_{entity_type}s_top_n_alltime" class="full-width-chart"></div>

    Top {top_n} {entity_type}s: {top_n_enames_string_for_md}

    """))
    JS_FOOTER_LINES.append(
        f"vegaEmbed('#chart_{entity_type}s_top_n_alltime', {chart_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);"
    )
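# As noted in the comments above, the pd.melt() reshape could instead be done
# inside Altair with transform_fold(). A minimal sketch of that alternative,
# assuming df_top_vu / top_n_enames / entity_type as built above and a
# datetime index named "time" (these assumptions mirror the melt-based code):
chart_folded = (
    alt.Chart(df_top_vu.reset_index())
    .mark_line(point=True)
    .transform_fold(top_n_enames, as_=[entity_type, "views_unique"])
    .encode(
        x=alt.X("time:T", title="date"),
        y=alt.Y("views_unique:Q"),
        color=alt.Color(f"{entity_type}:N"),
    )
)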
def gen_sample_plot(metadata): """Uses Altair to generate a JSON Vega-Lite spec for the sample plot. Parameters ---------- metadata: pd.DataFrame DataFrame containing sample metadata information. (Indices correspond to samples, and columns correspond to sample metadata fields.) This should have already been matched with the BIOM table, had empty samples removed, etc. Returns ------- sample_chart_json: dict A dict version of the alt.Chart for the sample plot. """ sample_metadata = metadata.copy() # Used to set color default_metadata_col = sample_metadata.columns[0] # Since we don't bother setting a default log-ratio, we set the balance for # every sample to None so that Vega* will filter them out (producing an # empty scatterplot by default, which makes sense). sample_metadata["qurro_balance"] = None # "Reset the index" -- make the sample IDs a column (on the leftmost side) # First we rename the index "Sample ID", though. (Note that our use of # check_column_names() means that there shouldn't be any sample metadata # fields named "Sample ID".) sample_metadata.rename_axis("Sample ID", axis="index", inplace=True) sample_metadata.reset_index(inplace=True) # Create sample plot chart Vega-Lite spec using Altair. sample_chart = (alt.Chart( sample_metadata, title="Samples", background="#FFFFFF", autosize=alt.AutoSizeParams(resize=True), ).mark_circle().encode( alt.X( default_metadata_col, type="nominal", axis=alt.Axis(labelAngle=-45), scale=alt.Scale(zero=False), ), alt.Y( "qurro_balance:Q", title="Current Natural Log-Ratio", type="quantitative", scale=alt.Scale(zero=False), ), color=alt.Color(default_metadata_col, type="nominal"), tooltip=["Sample ID:N", "qurro_balance:Q"], ).configure_range( ramp=alt.SchemeConfig(scheme="blues"), category=alt.SchemeConfig(scheme="tableau10"), ).configure_axis(labelBound=True).interactive()) # Replace the "mark": "circle" definition with a more explicit one. This # will be useful when adding attributes to the boxplot mark in the # visualization. (We have to resort to this hack because I haven't been # able to successfully use alt.MarkDef in the alt.Chart definition above.) sample_chart_dict = sample_chart.to_dict() sample_chart_dict["mark"] = {"type": "circle"} sm_fields = "qurro_sample_metadata_fields" check_json_dataset_names(sample_chart_dict, sm_fields) # Specify an alphabetical ordering for the sample metadata field names. # This will be used for populating the x-axis / color field selectors in # Qurro's sample plot controls. # # Importantly, this is case insensitive (by default, the json.dumps # sort_keys parameter considers names like "Sample ID" to occur before # names like "age" due to casing -- we use this list to get around this). # Solution based on this article: # https://www.afternerd.com/blog/python-sort-list/#sort-strings-case-insensitive # # Also, we remove qurro_balance from this list because it shouldn't be # exposed to the user in the Qurro interface. (It's already used on the # y-axis of the sample plot automatically.) sorted_md_cols = list(sorted(sample_metadata.columns, key=str.lower)) sorted_md_cols.remove("qurro_balance") sample_chart_dict["datasets"][sm_fields] = sorted_md_cols return sample_chart_dict
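# A minimal usage sketch for gen_sample_plot() above (the metadata values here
# are made up for illustration; real callers pass sample metadata that has
# already been matched against the BIOM table):
import json

demo_metadata = pd.DataFrame(
    {"BodySite": ["gut", "skin", "gut"], "Age": [30, 41, 25]},
    index=["S1", "S2", "S3"],
)
sample_spec = gen_sample_plot(demo_metadata)
print(json.dumps(sample_spec)[:200])  # a Vega-Lite spec, embeddable with vega-embed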
def main(): """ ETF Sparplan """ ##General Settings st.set_page_config(page_title='ETF Sparplan Rechner')#, page_icon='logo.jpg') ## Hide Hamburger Menu hide_menu_style = """ <style> #MainMenu {visibility: hidden;} </style> """ st.markdown(hide_menu_style, unsafe_allow_html=True) st.success('ETF Sparplan Rechner') ###Eingabe df_etf = pd.read_excel('ETF.xls', index_col='ETF') list = st.selectbox('Wähle deinen ETF:', df_etf.index) entry = df_etf['RIC'] entry_list = entry[list] inf = df_etf['Branche/Region'] ###Col Defintion col0_1, col0_2 = st.beta_columns(2) col1, col2 = st.beta_columns(2) col2_1, col2_2 = st.beta_columns([3,1]) with col0_1: st.info(inf[list]) with col0_2: url = 'https://de.finance.yahoo.com/quote/' + entry_list + '?p=' + entry_list req = r.get(url) soup = BeautifulSoup(req.content, 'html.parser') try: cont_Kostenquote = soup.body.div.find('span', {'data-reactid': '115'}).text.replace(',', '.') #netto cont_Nettoverm = soup.body.div.find('span', {'data-reactid': '85'}).text.replace(',', '.') try: size = pd.DataFrame([], columns=['Volumen'], index=[0]) size['Volumen'] = cont_Nettoverm size = size['Volumen'].str.replace('M', '') test = size.astype(float) * 1 vol = test[0].astype(str) + ' Mio. EUR' except: try: size = pd.DataFrame([], columns=['Volumen'], index=[0]) size['Volumen'] = cont_Nettoverm size = size['Volumen'].str.replace('B', '') test = size.astype(float) * 1 vol = test[0].astype(str) + ' Mrd. EUR' except: vol = 'N/A' st.success('Nettovermögen d. Fonds: ' + vol) st.error('Netto Kostenquote (TER) p.a.: ' + cont_Kostenquote) except: try: cont_Kostenquote = soup.body.div.find('span', {'data-reactid': '113'}).text.replace(',', '.') #netto cont_Nettoverm = soup.body.div.find('span', {'data-reactid': '83'}).text.replace(',', '.') try: size = pd.DataFrame([], columns=['Volumen'], index=[0]) size['Volumen'] = cont_Nettoverm size = size['Volumen'].str.replace('M', '') test = size.astype(float) * 1 vol = test[0].astype(str) + ' Mio. EUR' except: try: size = pd.DataFrame([], columns=['Volumen'], index=[0]) size['Volumen'] = cont_Nettoverm size = size['Volumen'].str.replace('B', '') test = size.astype(float) * 1 vol = test[0].astype(str) + ' Mrd. EUR' except: vol = 'N/A' st.success('Nettovermögen d. Fonds: ' + vol) st.error('Netto Kostenquote (TER) p.a.: ' + cont_Kostenquote) except: cont_Kostenquote = soup.body.div.find('span', {'data-reactid': '111'}).text.replace(',', '.') #netto cont_Nettoverm = soup.body.div.find('span', {'data-reactid': '81'}).text.replace(',', '.') try: size = pd.DataFrame([], columns=['Volumen'], index=[0]) size['Volumen'] = cont_Nettoverm size = size['Volumen'].str.replace('M', '') test = size.astype(float) * 1 vol = test[0].astype(str) + ' Mio. EUR' except: try: size = pd.DataFrame([], columns=['Volumen'], index=[0]) size['Volumen'] = cont_Nettoverm size = size['Volumen'].str.replace('B', '') test = size.astype(float) * 1 vol = test[0].astype(str) + ' Mrd. EUR' except: vol = 'N/A' st.success('Nettovermögen d. 
Fonds: ' + vol) st.error('Netto Kostenquote (TER) p.a.: ' + cont_Kostenquote) with col1: entry_money = st.number_input('Wie viel willst du pro Monat einzahlen?', min_value=(25), max_value=(1500), value=(500)) start = st.date_input('Anfangsdatum', dt.datetime(2010, 1, 1), min_value=dt.datetime(2010, 1, 1), max_value=dt.datetime(2019, 1, 1)) end = dt.datetime.now() @st.cache def key_data(key_data): data = web.DataReader(entry_list, 'yahoo', start, end) ###Basis zu relevanten Zeitwerten df = pd.DataFrame(data).reset_index() df = df[['Date', 'Close', 'Volume']] ##Extract year, month and day of Date df['year'] = df['Date'].dt.year df['month'] = df['Date'].dt.month df['day'] = df['Date'].dt.day ##Get max of each month df_time = pd.DataFrame(df.groupby(['year', 'month'])['day'].max()).reset_index() #Merge year, month and day of max df_time['Date'] = df_time['year'].astype(str) + '-' + df_time['month'].astype(str) + '-' + df_time['day'].astype(str) #convert to datetime df_time['Date'] = pd.to_datetime(df_time['Date']) #drop not needed columns df_time = df_time.drop(columns=['year', 'month', 'day']) ##merge oroginal df and needed timeseries df out = pd.merge(df, df_time, left_on='Date', right_on='Date').drop(columns=['year', 'month', 'day']) out['Close'] = round(out['Close'] ,2) ###Grundlage für Grafik df_out = pd.DataFrame(out).reset_index() df_out['index'] += 1 df_out['Stueckzahl kum.'] = round(entry_money / df_out['Close'], 2).cumsum() df_out['Wertentwicklung Sparplan in EUR'] = round(df_out['Stueckzahl kum.'] * df_out['Close'], 2) df_out['Investiert in EUR'] = entry_money * df_out['index'] df_out['Performance in %'] = round((df_out['Wertentwicklung Sparplan in EUR'] / df_out['Investiert in EUR'] -1) * 100, 2) df_out['max Kurs'] = df_out['Close'].cummax() df_out['Differenz zu max Kurs'] = round(((df_out['Close'] - df_out['max Kurs']) / df_out['max Kurs']) * 100, 2) df_out = df_out.rename(columns={'Date': 'Datum', 'Close': 'Tageschlusskurs', 'Volume': 'Handelsvolumen'}) return df_out df_out = key_data(key_data) ###Zusatzinfo zu Produkt und Grafik with col2: count = df_out.index.max() perf_pyear = round(df_out['Performance in %'][count] / df_out['index'].max() * 12, 2).astype(str) max_drawdown = df_out['Differenz zu max Kurs'].min().astype(str) st.success('Durchschnittlich Performance pro Jahr seit Anfangsdatum: ' + perf_pyear + '%') st.error('max. 
Drawdown seit Anfangsdatum: ' + max_drawdown + '%') ###Grafik Historisch with col2_1: option = st.selectbox('Wertentwicklung anzeigen als: Zukünftig oder historisch und grafisch oder als Tabelle?', ('Zukünftig mit Grafik', 'Zukünftig mit Tabelle', 'Historisch mit Grafik', 'Historisch mit Tabelle')) breit = 500 hoch = 450 if option == 'Historisch mit Grafik': chart_plan = alt.Chart(df_out).mark_trail(point=True, clip=True, opacity=0.8).encode( alt.X('Datum', #scale=alt.Scale(domain=(df_hist['Datum'].astype(int).min() -1, df_hist['Datum'].astype(int).max() + 1)), title='Datum'), alt.Y('Wertentwicklung Sparplan in EUR', scale=alt.Scale(domain=(df_out['Wertentwicklung Sparplan in EUR'].min() -1, df_out['Wertentwicklung Sparplan in EUR'].max() + 1)), title='Wertentwicklung Sparplan in EUR'), tooltip=['Datum', 'Wertentwicklung Sparplan in EUR', 'Performance in %', 'Investiert in EUR'], size=alt.Size('Wertentwicklung Sparplan in EUR', scale=alt.Scale(range=[1, 4, 10]), legend=None), ).interactive().properties( width=breit, height=hoch ) chart_invest = alt.Chart(df_out).mark_trail(point=True, clip=True, color='yellow', opacity=0.8).encode( alt.X('Datum', title='Datum'), alt.Y('Investiert in EUR', title='Investiert in EUR'), tooltip=['Datum', 'Wertentwicklung Sparplan in EUR', 'Performance in %', 'Investiert in EUR'], size=alt.Size('Investiert in EUR', scale=alt.Scale(range=[1, 4, 10]), legend=None), ).interactive() chart = chart_plan + chart_invest st.altair_chart(chart) elif option == 'Zukünftig mit Grafik': Laufzeit = st.number_input('Wie viele Jahre planst du zu investieren?', min_value=5, max_value=50, value=10) df_fut = pd.DataFrame([], columns=['Sparbetrag', 'Wertentwicklung', 'Zinsertrag (brutto)'], index=range(Laufzeit*12)).reset_index().rename(columns={'index': 'Monat'}) df_fut['Monat'] = df_fut.index + 1 df_fut['Sparbetrag'] = (entry_money * df_fut['Monat']) perf_year = round(df_out['Performance in %'][count] / df_out['index'].max() * 12, 2) df_fut['Wertentwicklung'] = -1 * np.fv(perf_year/100/12, df_fut['Monat'], entry_money, 0, when=1) df_fut['Zinsertrag (brutto)'] = df_fut['Wertentwicklung'] - df_fut['Sparbetrag'] df_fut['Wertentwicklung'] = round(df_fut['Wertentwicklung'] *1, 2) df_fut['Zinsertrag (brutto)'] = round(df_fut['Zinsertrag (brutto)'] *1, 2) ###Grafik Zukunft chart_fut = alt.Chart(df_fut).mark_trail(point=True, clip=True, opacity=0.8).encode( alt.X('Monat', #scale=alt.Scale(domain=(df_hist['Datum'].astype(int).min() -1, df_hist['Datum'].astype(int).max() + 1)), title='Monat'), alt.Y('Wertentwicklung', scale=alt.Scale(domain=(df_fut['Wertentwicklung'].min() -1, df_fut['Wertentwicklung'].max() + 1)), title='Wertentwicklung Sparplan in EUR'), tooltip=['Monat', 'Wertentwicklung', 'Sparbetrag', 'Zinsertrag (brutto)'], size=alt.Size('Wertentwicklung', scale=alt.Scale(range=[1, 4, 10]), legend=None), ).interactive().properties( width=breit, height=hoch ) chart_spar = alt.Chart(df_fut).mark_trail(point=True, clip=True, color='yellow', opacity=0.8).encode( alt.X('Monat', title='Monat'), alt.Y('Sparbetrag'), tooltip=['Monat', 'Wertentwicklung', 'Sparbetrag', 'Zinsertrag (brutto)'], size=alt.Size('Sparbetrag', scale=alt.Scale(range=[1, 4, 10]), legend=None), ).interactive().properties( width=breit, height=hoch ) chart = chart_fut + chart_spar chart ###Tabelle historisch elif option == 'Historisch mit Tabelle': df_out = df_out[['Datum', 'Investiert in EUR', 'Wertentwicklung Sparplan in EUR', 'Performance in %']].rename(columns={'Investiert in EUR': 'Sparbetrag in EUR', 
'Wertentwicklung Sparplan in EUR': 'Wertentwicklung in EUR'}) df_outhtml = df_out.to_html(escape=False, index=False) st.markdown(df_outhtml, unsafe_allow_html=True) ###Tabelle zukünftig else: Laufzeit = st.number_input('Wie viele Jahre planst du zu investieren?', min_value=5, max_value=50, value=10) df_fut = pd.DataFrame([], columns=['Sparbetrag in EUR', 'Wertentwicklung in EUR', 'Zinsertrag in EUR'], index=range(Laufzeit*12)).reset_index().rename(columns={'index': 'Monat'}) df_fut['Monat'] = df_fut.index + 1 df_fut['Sparbetrag in EUR'] = (entry_money * df_fut['Monat']) perf_year = round(df_out['Performance in %'][count] / df_out['index'].max() * 12, 2) df_fut['Wertentwicklung in EUR'] = -1 * np.fv(perf_year/100/12, df_fut['Monat'], entry_money, 0, when=1) df_fut['Zinsertrag in EUR'] = df_fut['Wertentwicklung in EUR'] - df_fut['Sparbetrag in EUR'] df_fut['Wertentwicklung in EUR'] = round(df_fut['Wertentwicklung in EUR'] *1, 2) df_fut['Zinsertrag in EUR'] = round(df_fut['Zinsertrag in EUR'] *1, 2) df_futhtml = df_fut.to_html(escape=False, index=False) st.markdown(df_futhtml, unsafe_allow_html=True) ###Werbung with col2_2: st.text_area('', 'Diesen Broker nutze ich - nur zu empfehlen:') url_neu = 'https://financeads.net/tc.php?t=19947C274449896B' link = pd.DataFrame(['<a href="' +url_neu+ '" target="_blank"><img src="https://etf-blog.com/wp-content/uploads/2020/10/trade_republic_sparplan.png" width="145" ></a>'], columns=['']) html = link.to_html(escape=False, index=False) st.markdown(html, unsafe_allow_html=True) ###Sector Info ##Create Expander my_expander = st.beta_expander("Weitere Infos: Sektorgewichtung", expanded=False) with my_expander: @st.cache def key_sector(key_sector): url_sec = 'https://de.finance.yahoo.com/quote/' + entry_list + '/holdings?p=' + entry_list req_sec = r.get(url_sec) dat_sec = BeautifulSoup(req_sec.content, 'html.parser') cont_sec = dat_sec.body('div', {'class': 'Mb(25px)'}) df_sec = pd.DataFrame(cont_sec[1]) sec = df_sec[0].astype(str).str.split('</span>').to_list() df_sec2 = pd.DataFrame(sec).dropna().transpose() sec2 = df_sec2[1].astype(str).str.split('">').to_list() df_sec3 = pd.DataFrame(sec2) sec_industry = df_sec3[4][1:].dropna().reset_index().drop(columns=['index']) sec_percent = df_sec3[1].str.replace(',', '.').str.replace('%', '').apply(pd.to_numeric, errors='coerce').dropna().reset_index().drop(columns=['index']) df_merge = pd.merge(sec_industry, sec_percent, left_index=True, right_index=True).rename(columns={4: 'Sektor', 1: 'Gewichtung in %'}).sort_values(by=['Gewichtung in %'], ascending=False).reset_index().drop(columns=['index']) return df_merge df_merge = key_sector(key_sector) table = pd.DataFrame(df_merge).style.set_precision(2) st.table(table)
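# Note: np.fv() used above was removed from NumPy (>= 1.20); the financial
# routines now live in the separate numpy-financial package. A minimal sketch
# of the same savings-plan future-value calculation with numpy_financial
# (the rate, horizon and monthly payment are illustrative values):
import numpy_financial as npf

monthly_rate = 0.05 / 12       # assumed 5% p.a., compounded monthly
months = 10 * 12               # 10-year horizon
payment = 500                  # EUR paid at the start of each month (when=1)
future_value = -npf.fv(monthly_rate, months, payment, 0, when=1)
print(round(future_value, 2))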
base = (
    alt.Chart(selected_frame)
    .mark_point()
    .encode(
        opacity=alt.value(0.5),
        tooltip=["id", "correct", "clustering", "edit_distance", color_by],
    )
    .add_selection(brush, zoom)
)
if color_type == "Q":
    base = base.encode(
        color=alt.condition(
            brush,
            alt.Color(f"{color_by}:{color_type}", scale=alt.Scale(scheme="viridis")),
            alt.ColorValue("gray"),
        ),
    )
else:
    base = base.encode(
        color=alt.condition(
            brush,
            alt.Color(f"{color_by}:{color_type}"),
            alt.ColorValue("gray"),
        ),
    )

st.header("Data visualization")
(
    base.encode(
        x=alt.X("pca_x", axis=alt.Axis(labels=False, title="")),
"type": "HexagonLayer", "data": data, "radius": 100, "elevationScale": 4, "elevationRange": [0, 1000], "pickable": True, "extruded": True, } ], ) st.subheader("Breakdown by minute between %i:00 and %i:00" % (hour, (hour + 1) % 24)) filtered = data[ (data[DATE_TIME].dt.hour >= hour) & (data[DATE_TIME].dt.hour < (hour + 1)) ] hist = np.histogram(filtered[DATE_TIME].dt.minute, bins=60, range=(0, 60))[0] chart_data = pd.DataFrame({"minute": range(60), "pickups": hist}) st.write(alt.Chart(chart_data, height=150) .mark_area( interpolate='step-after', line=True ).encode( x=alt.X("minute:Q", scale=alt.Scale(nice=False)), y=alt.Y("pickups:Q"), tooltip=['minute', 'pickups'] )) if st.checkbox("Show raw data", False): st.subheader("Raw data by minute between %i:00 and %i:00" % (hour, (hour + 1) % 24)) st.write(data)
# drop NaNs
deaths_long = deaths_long.dropna()
# deaths_long

# Selection tool
selection = alt.selection_single(fields=['state'])
# Color change when clicked
color = alt.condition(selection, alt.Color('state:N'), alt.value('lightgray'))

# Base altair plot
base = alt.Chart(
    deaths_long,
    title="Mortes confirmadas pelo COVID-19 por estado").mark_line(
        strokeWidth=4, opacity=0.7).encode(
            x=alt.X('Day'),
            y=alt.Y('Deaths', scale=alt.Scale(type='log')),
            color=alt.Color('state', legend=None),
        )

# Chart
chart = base.encode(color=alt.condition(selection, 'state:N',
                                        alt.value('lightgray')),
                    tooltip=[
                        alt.Tooltip('state:N', title='Estado'),
                        alt.Tooltip('yearmonthdate(Date):N',
                                    title='Data',
                                    format='%d/%m/%Y'),
                        alt.Tooltip('Deaths:N', title='Mortes')
                    ]).add_selection(selection)

# Overlay
                      left_on='state',
                      right_on='state',
                      how='left')
    source = source.replace(np.nan, 0)

    highlight = alt.selection_single(on='mouseover',
                                     fields=['state'],
                                     empty='none')
    states = alt.topo_feature(data.us_10m.url, 'states')

    ## map
    state_map = alt.Chart(
        states, title='States Heatmap').mark_geoshape().encode(
            color=alt.condition(
                highlight, alt.value('yellow'),
                alt.Color(f'{pollutant}:Q',
                          scale=alt.Scale(scheme='lightorange'))),
            tooltip=['state:N', f'{pollutant}:Q']).transform_lookup(
                lookup='id',
                from_=alt.LookupData(
                    source, 'id',
                    [pollutant, 'state'])).add_selection(highlight).project(
                        type='albersUsa').properties(width=700, height=400)
    state_map

elif graph == 'Pollutants Relationship':
    pollutant1 = st.sidebar.selectbox('Select the first Pollutant',
                                      pollutant_list,
                                      index=2,
                                      key='pollutant1')
    pollutant2 = st.sidebar.selectbox('Select the second Pollutant',
                                      pollutant_list,
                                      index=1,
chart.save('Simple Area Chart6.html')

""" Area chart (stream graph) """
source = data.unemployment_across_industries.url

base = alt.Chart()
# With interactive() the chart can be panned in all directions, even where there is no data
area = base.mark_area().encode(
    alt.X('yearmonth(date):T',
          axis=alt.Axis(format='%Y', domain=True, tickSize=0)),
    alt.Y('sum(count):Q', stack='center', axis=None),
    alt.Color('series:N', scale=alt.Scale(scheme='category20b'))
).interactive()

chart = alt.layer(area, data=source)
chart.save('Simple Area Chart7.html')

""" Area chart - drawn as separate panels """
source = data.iowa_electricity()

base = alt.Chart()
area = base.mark_area().encode(
    x="year:T",
.encode(x=alt.X('days_since_100:Q', axis=alt.Axis(title='Days since 100th confirmed case')), y=alt.Y('predictions:Q', axis=alt.Axis(title='Confirmed cases')), color=alt.Color('pred_idx:Q', legend=None, scale=None),) .transform_filter(selectCountry) ).properties( width=width, height=height ) predlog = (alt.Chart(predictionsDF_filtered) .mark_line(opacity=.15) .encode(x=alt.X('days_since_100:Q', axis=alt.Axis(title='Days since 100th confirmed case')), y=alt.Y('predictions:Q', axis=alt.Axis(title=None), scale=alt.Scale(type='log', base=10)), color=alt.Color('pred_idx:Q', legend=None, scale=None),) .transform_filter(selectCountry) ).properties( width=width, height=height ) ##### Mark The Last Case Count ##### # Point last_point = (alt.Chart(lastpointDF) .mark_circle(color="black", size=40) .encode(x='days_since_100:Q', y='confirmed:Q')
def plot_mds( self, rank=Rank.Auto, metric=BetaDiversityMetric.BrayCurtis, method=OrdinationMethod.Pcoa, title=None, xlabel=None, ylabel=None, color=None, size=None, tooltip=None, return_chart=False, label=None, mark_size=100, width=None, height=None, ): """Plot beta diversity distance matrix using multidimensional scaling (MDS). Parameters ---------- rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional Analysis will be restricted to abundances of taxa at the specified level. metric : {'braycurtis', 'cityblock', 'manhattan', 'jaccard', 'unifrac', 'unweighted_unifrac', 'aitchison'}, optional Function to use when calculating the distance between two samples. Note that 'cityblock' and 'manhattan' are equivalent metrics. method : {'pcoa', 'smacof'} Algorithm to use for ordination. PCoA uses eigenvalue decomposition and is not well suited to non-euclidean distance functions. SMACOF is an iterative optimization strategy that can be used as an alternative. title : `string`, optional Text label at the top of the plot. xlabel : `string`, optional Text label along the horizontal axis. ylabel : `string`, optional Text label along the vertical axis. size : `string` or `tuple`, optional A string or a tuple containing strings representing metadata fields. The size of points in the resulting plot will change based on the metadata associated with each sample. color : `string` or `tuple`, optional A string or a tuple containing strings representing metadata fields. The color of points in the resulting plot will change based on the metadata associated with each sample. tooltip : `string` or `list`, optional A string or list containing strings representing metadata fields. When a point in the plot is hovered over, the value of the metadata associated with that sample will be displayed in a modal. label : `string` or `callable`, optional A metadata field (or function) used to label each analysis. If passing a function, a dict containing the metadata for each analysis is passed as the first and only positional argument. The callable function must return a string. Examples -------- Scatter plot of weighted UniFrac distance between all our samples, using counts at the genus level. >>> plot_mds(rank='genus', metric='unifrac') Notes ----- **For `smacof`**: The values reported on the axis labels are Pearson's correlations between the distances between points on each axis alone, and the corresponding distances in the distance matrix calculated using the user-specified metric. These values are related to the effectiveness of the MDS algorithm in placing points on the scatter plot in such a way that they truly represent the calculated distances. They do not reflect how well the distance metric captures similarities between the underlying data (in this case, an OTU table). """ import altair as alt import numpy as np import pandas as pd from scipy.spatial.distance import squareform from scipy.stats import pearsonr from skbio.stats import ordination from sklearn import manifold from sklearn.metrics.pairwise import euclidean_distances if len(self._results) < 3: raise PlottingException( "There are too few samples for MDS/PCoA after filtering. 
Please select 3 or more " "samples to plot.") dists = self._compute_distance(rank, metric).to_data_frame() # here we figure out what to put in the tooltips and get the appropriate data if tooltip: if not isinstance(tooltip, list): tooltip = [tooltip] else: tooltip = [] tooltip.insert(0, "Label") if color and color not in tooltip: tooltip.insert(1, color) if size and size not in tooltip: tooltip.insert(2, size) magic_metadata, magic_fields = self._metadata_fetch(tooltip, label=label) if method == OrdinationMethod.Smacof: # adapted from https://scikit-learn.org/stable/auto_examples/manifold/plot_mds.html x_field = "MDS1" y_field = "MDS2" seed = np.random.RandomState(seed=3) mds = manifold.MDS(max_iter=3000, eps=1e-12, random_state=seed, dissimilarity="precomputed", n_jobs=1) pos = mds.fit(dists).embedding_ plot_data = pd.DataFrame(pos, columns=[x_field, y_field], index=dists.index) plot_data = plot_data.div(plot_data.abs().max(axis=0), axis=1) # normalize to [0,1] # determine how much of the original distance is captured by each of the axes after MDS. # this implementation of MDS does not use eigen decomposition and so there's no simple # way of returning a 'percent of variance explained' value r_squared = [] for axis in [0, 1]: mds_dist = pos.copy() mds_dist[::, axis] = 0 mds_dist = squareform(euclidean_distances(mds_dist).round(6)) r_squared.append(pearsonr(mds_dist, squareform(dists))[0]) # label the axes x_extra_label = "r² = %.02f" % (r_squared[0], ) y_extra_label = "r² = %.02f" % (r_squared[1], ) elif method == OrdinationMethod.Pcoa: # suppress eigenvalue warning from skbio--not because it's an invalid warning, but # because lots of folks in the field run pcoa on these distances functions, even if # statistically inappropriate. perhaps this will change if we ever become more # opinionated about the analyses that we allow our users to do (roo) with warnings.catch_warnings(): warnings.simplefilter("ignore") ord_result = ordination.pcoa( dists.round(6)) # round to avoid float precision errors plot_data = ord_result.samples.iloc[:, [0, 1 ]] # get first two components plot_data = plot_data.div(plot_data.abs().max(axis=0), axis=1) # normalize to [0,1] plot_data.index = dists.index x_field, y_field = plot_data.columns.tolist( ) # name of first two components x_extra_label = "%0.02f%%" % (ord_result.proportion_explained[0] * 100, ) y_extra_label = "%0.02f%%" % (ord_result.proportion_explained[1] * 100, ) else: raise OneCodexException("MDS method must be one of: {}".format( ", ".join(OrdinationMethod.values))) # label the axes if xlabel is None: xlabel = "{} ({})".format(x_field, x_extra_label) if ylabel is None: ylabel = "{} ({})".format(y_field, y_extra_label) plot_data = pd.concat([plot_data, magic_metadata], axis=1).reset_index() alt_kwargs = dict( x=alt.X(x_field, axis=alt.Axis(title=xlabel)), y=alt.Y(y_field, axis=alt.Axis(title=ylabel)), tooltip=[magic_fields[t] for t in tooltip], href="url:N", url=get_base_classification_url() + alt.datum.classification_id, ) # only add these parameters if they are in use if color: color_kwargs = { "legend": alt.Legend(title=magic_fields[color]), } if not is_continuous(plot_data[color]) or has_missing_values( plot_data[color]): plot_data[color] = plot_data[color].fillna("N/A").astype(str) domain = plot_data[color].values color_range = interleave_palette(domain) color_kwargs["scale"] = alt.Scale(domain=domain, range=color_range) alt_kwargs["color"] = alt.Color(magic_fields[color], **color_kwargs) if size: alt_kwargs["size"] = magic_fields[size] chart = 
(alt.Chart(plot_data).transform_calculate( url=alt_kwargs.pop("url")).mark_circle(size=mark_size).encode( **alt_kwargs)) chart = chart.properties( **prepare_props(title=title, height=height, width=width)) if return_chart: return chart else: chart.interactive().display()
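# A minimal usage sketch for plot_mds() above, mirroring the docstring example
# (`samples` stands for the sample collection object this method belongs to,
# and the metadata field names are assumptions):
mds_chart = samples.plot_mds(
    rank="genus",
    metric="unifrac",
    method=OrdinationMethod.Pcoa,
    color="geo_loc_name",       # color points by a metadata field
    tooltip=["sample_type"],    # shown on hover alongside the label
    return_chart=True,          # return the Altair chart instead of displaying it
)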
def __draw_metric_line_titles(metrics, size_constants): """Draws left hand side titles for metrics.""" metric_line_titles = [] for metric in metrics: # METRIC TITLE metric_title = (alt.Chart(DUMMY_DF).transform_calculate( y_position="1.2").mark_text( align="center", baseline="middle", font=FONT, fontWeight=Title.font_weight, size=Title.font_size, color=Title.font_color, ).encode( alt.Y("y_position:Q", scale=alt.Scale(domain=[3, 1]), axis=no_axis()), text=alt.value(metric.upper()), )) # GROUPS TEXT group_circles_title = (alt.Chart(DUMMY_DF).transform_calculate( y_position="2").mark_text( align="center", baseline="middle", font=FONT, size=Subtitle.font_size, color=Subtitle.font_color, ).encode( alt.Y("y_position:Q", scale=alt.Scale(domain=[3, 1]), axis=no_axis()), text=alt.value("Groups"), )) # PERCENT. POP TEXT population_percentage_title = (alt.Chart(DUMMY_DF).transform_calculate( y_position="2.7").mark_text( align="center", baseline="middle", font=FONT, size=Subtitle.font_size, color=Subtitle.font_color, ).encode( alt.Y("y_position:Q", scale=alt.Scale(domain=[3, 1]), axis=no_axis()), text=alt.value("% Pop."), )) metric_line_titles.append( (metric_title + group_circles_title + population_percentage_title).properties( height=size_constants["line_height"], width=size_constants["metric_titles_width"], )) # EMPTY CORNER SPACE # To make sure that the attribute columns align properly with the title column, we need to create a blank # space of the same size of the attribute titles. For this purpose, we use the same function (__draw_attribute_title) # and pass in an empty string so that nothing is actually drawn. top_left_corner_space = __draw_attribute_title( "", size_constants["metric_titles_width"], size_constants) # CONCATENATE SUBPLOTS metric_titles = alt.vconcat( top_left_corner_space, *metric_line_titles, spacing=size_constants["line_spacing"], bounds="flush", ) return metric_titles
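# The function above positions fixed text labels by layering mark_text charts
# over a dummy dataframe and computing a constant y via transform_calculate.
# A stripped-down sketch of that trick in isolation (the one-row dataframe,
# font size and y domain are illustrative):
import altair as alt
import pandas as pd

dummy_df = pd.DataFrame({"dummy": [0]})

title_text = (
    alt.Chart(dummy_df)
    .transform_calculate(y_position="1.2")
    .mark_text(align="center", baseline="middle", size=14)
    .encode(
        alt.Y("y_position:Q", scale=alt.Scale(domain=[3, 1]), axis=None),
        text=alt.value("METRIC NAME"),
    )
)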
def app(): st.title("Smoking Deaths") st.header("Why Tobacco is a deadly threat?") @st.cache(allow_output_mutation=True) def load_data(): deaths = pd.read_csv('data/smoking-deaths-by-age.csv', header=0, names=[ 'country', 'code', 'year', '15 to 49', '50 to 69', 'Above 70']) factors = pd.read_csv('data/number-of-deaths-by-risk-factor.csv', header=0, index_col=False, names=[ 'country', 'code', 'year', 'Diet low in vegetables', 'Diet low in whole grains', 'Diet low in nuts and seeds', 'Diet low in calcium', 'Unsafe sex', 'No access to handwashing facility', 'Child wasting', 'Child stunting', 'Diet high in red meat', 'Diet low in fiber', 'Diet low in seafood omega-3 fatty acids', 'Diet high in sodium', 'Low physical activity', 'Non-exclusive breastfeeding', 'Discontinued breastfeeding', 'Iron deficiency', 'Vitamin A deficiency', 'Zinc deficiency', 'Smoking', 'Secondhand smoke', 'Alcohol use', 'Drug use', 'High fasting plasma glucose', 'High total cholesterol', 'High systolic blood pressure', 'High body-mass index', 'Low bone mineral density', 'Diet low in fruits', 'Diet low in legumes', 'Low birth weight for gestation', 'Unsafe water source', 'Unsafe sanitation', 'Household air pollution from solid fuels', 'Air pollution', 'Outdoor air pollution']) # Drop columns with missing values and extremely low values factors.drop(columns=['Vitamin A deficiency', 'High total cholesterol', 'Zinc deficiency', 'Child stunting', 'Discontinued breastfeeding', 'Iron deficiency', 'Non-exclusive breastfeeding','Diet high in red meat', 'Unsafe sanitation', 'No access to handwashing facility','Household air pollution from solid fuels', 'Unsafe water source', 'Child wasting', 'Low birth weight for gestation', 'Diet low in calcium', 'Low bone mineral density',], inplace=True) # Filter data with years factors = factors.drop(factors[factors.year > 2012].index) deaths = deaths.drop(deaths[deaths.year > 2012].index) # Convert data from wide to long deaths = pd.melt(deaths, id_vars=['country', 'year'], value_vars=['15 to 49', '50 to 69', 'Above 70'], var_name='Age') factors = pd.melt(factors, id_vars=['country', 'year'], value_vars=['Diet low in vegetables', 'Diet low in nuts and seeds', 'Unsafe sex', 'Diet low in fiber', 'Diet low in seafood omega-3 fatty acids', 'Diet high in sodium', 'Low physical activity', 'Smoking', 'Secondhand smoke', 'Alcohol use', 'Drug use', 'High fasting plasma glucose', 'High systolic blood pressure', 'High body-mass index', 'Diet low in fruits', 'Diet low in legumes', 'Air pollution', 'Outdoor air pollution'], var_name='risk_factor') countries = deaths['country'].unique() # get unique country names countries.sort() # sort alphabetically minyear = deaths.loc[:, 'year'].min() maxyear = deaths.loc[:, 'year'].max() return deaths, factors, countries, minyear, maxyear # Load data deaths, factors, countries, minyear, maxyear = load_data() # Country Selection selectCountry = st.selectbox('Select a country: ', countries, 77) # Year selection slider = st.slider('Select a period of time', int(str(minyear)), int(str(maxyear)), (1994, 2004)) # Bar chart - Risk factors bar_factors = alt.Chart(factors, title="Ranking of the top 10 risk factors leading to deaths in " + selectCountry + " from " + str(slider[0]) + " to " + str(slider[1])).mark_bar().transform_filter({'and': [{'field': 'country', 'equal': selectCountry}, {'field': 'year', 'range': slider}]} ).transform_aggregate( sum_deaths='sum(value)', # Calculate the total number of deaths groupby=["risk_factor"] ).transform_window( rank='rank(sum_deaths)', 
sort=[alt.SortField('sum_deaths', order='descending')] ).transform_filter( alt.datum.rank < 11 # Filter out top 10 factors ).encode( alt.X('sum_deaths:Q', title='Total number of deaths'), y=alt.Y('risk_factor:O',sort='-x', title='Risk factor'), tooltip=alt.Tooltip(["sum_deaths:Q"],format=",.0f",title="Deaths"), color=alt.condition( alt.datum['risk_factor'] == 'Smoking', alt.value("red"), # Color for the smoking factor alt.value("lightgray") # Color for the rest ) ).properties( width=660, height=300 ) # Stacked bar chart - Smoking deaths by ages base = alt.Chart(deaths, title='Smoking deaths by age in ' + selectCountry).mark_bar().transform_filter({'and': [{'field': 'country', 'equal': selectCountry}, {'field': 'year', 'range': slider}]} ).encode( alt.X('year:O', title='Year'), y=alt.Y('value:Q', title='Number of smoking deaths'), order=alt.Order('Age:O', sort='ascending'), color=alt.Color('Age:O', scale = alt.Scale(domain=['Above 70', '50 to 69', '15 to 49'], scheme='lightorange')), tooltip=alt.Tooltip(["value:Q"],format=",.0f",title="Deaths"), ).properties( width=720, height=300 ) # Render the charts container1 = st.beta_container() with container1: st.altair_chart(base) st.markdown("From the chart above we can see that smoking is a critical factor leading to deaths, especially for old people. The numbers of people aged over 70 who died because of smoking are extremely high in all countries. \ In the bar chart below, we can see how smoking ranks in the list of top 10 risk factors that lead to deaths in the chosen country in the chosen period of time.") container2 = st.beta_container() with container2: st.altair_chart(bar_factors)
-----------------
This example shows a ranged dot plot that uses 'layer' to convey changing life
expectancy for the five most populous countries (between 1955 and 2000).
"""
import altair as alt
from vega_datasets import data

source = data.countries()

line = alt.Chart().mark_line(color='#db646f').encode(
    x='life_expect',
    y='country',
    detail='country'
).interactive()

point = alt.Chart().mark_point(size=100, opacity=1, filled=True).encode(
    x='life_expect',
    y='country',
    color=alt.Color('year:O',
                    scale=alt.Scale(
                        domain=['1955', '2000'],
                        range=['#e6959c', '#911a24']
                    ))
)

chart = alt.layer(line + point, data=source,
                  transform=[{'filter': {"field": 'country',
                                         "oneOf": ["China", "India", "United States",
                                                   "Indonesia", "Brazil"]}},
                             {'filter': {"field": 'year', "oneOf": [1955, 2000]}}])
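# The same country/year filtering can also be expressed with typed Altair
# predicate objects instead of raw Vega-Lite dicts -- a sketch of that
# alternative for the layered chart above:
chart_alt = alt.layer(
    line, point,
    data=source,
    transform=[
        alt.FilterTransform(filter=alt.FieldOneOfPredicate(
            field='country',
            oneOf=['China', 'India', 'United States', 'Indonesia', 'Brazil'])),
        alt.FilterTransform(filter=alt.FieldOneOfPredicate(
            field='year', oneOf=[1955, 2000])),
    ],
)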
c1 = alt.Chart(df_inf).properties(width=150, height=300).mark_bar().encode( x=alt.X("Max Infection Rate:Q", title="Max Infection Rate"), y=alt.Y("countryy:N", title="Countries", sort=None), color=alt.Color('countryy:N', title="Country"), tooltip=[ alt.Tooltip('countryy:N', title='Country'), alt.Tooltip('Max Infection Rate:Q', title='Max Infection Rate'), alt.Tooltip('inhabitants:Q', title='Inhabitants [mio]') ]) st.altair_chart(c1, use_container_width=True) selection = st.selectbox("Which country to look at:", country) df = df[df["Country"] == selection] variables = ["Confirmed", "Recovered", "Deaths"] colors = ["steelblue", "orange", "black"] value_vars = variables SCALE = alt.Scale(domain=variables, range=colors) dfm = pd.melt(df.reset_index(), id_vars=["Date"], value_vars=value_vars) dfm['order'] = dfm['variable'].replace( {val: i for i, val in enumerate(variables[::-1])}) if s2 == "cases": c = alt.Chart(dfm.reset_index()).mark_bar().properties( height=400, width=350).encode(x=alt.X("Date:T", title="Date"), y=alt.Y("sum(value):Q", title="Cases", scale=alt.Scale(type='linear')), color=alt.Color('variable:N', title="Category", scale=SCALE), order='order')
def plotAltairLineChart(self): """ Returns an Altair line chart. Returns ------- An Altair line chart. """ df = self.getData(self.confirmed_deaths) iso_date: str = self.covid19_date.strftime('%Y-%m-%d') # Altair requires dataframe to be in "long format" df_countries = (df.drop(columns=['Province/State', 'Lat', 'Long']) .groupby('Country/Region').agg('sum') .sort_values(by=df.columns[-1], ascending=False) .transpose() .reset_index() .melt(id_vars='index', value_name='Qty') .rename(columns={'index': 'Date', 'Country/Region': 'Country_Region' } ) .set_index('Date') ) # Make index values actual datetime objects so that we can # leverage Panda's date filtering API df_countries.index = [datetime.strptime(day, '%m/%d/%y') for day in df_countries.index ] alt_chart = alt.Chart(df_countries[: iso_date] .query("Country_Region in(@self.country)") .reset_index() .rename(columns={'index': 'Date'}) ).mark_line().encode( x=alt.X(title='Date', field='Date', type='temporal'), y=alt.Y(title='# of ' + self.confirmed_deaths, field='Qty', type='quantitative', scale=alt.Scale(type=self.ylog) ), color=alt.Color(field='Country_Region', type='nominal', legend=alt.Legend(title="Country/Region") ), tooltip=[alt.Tooltip(field='Country_Region', type= 'nominal'), alt.Tooltip(field='Qty', type= 'quantitative'), alt.Tooltip(field='Date', type= 'temporal') ] ) # To create filled circles in the legend per # https://github.com/altair-viz/altair/issues/1206 points = alt.Chart(df_countries[: iso_date] .query("Country_Region in(@self.country)") .reset_index() .rename(columns={'index': 'Date'}) ).mark_circle(size=0).encode( color='Country_Region' ) # To add hover tips, but make it less sensitive per # https://github.com/altair-viz/altair/issues/1812 tooltips = alt_chart.mark_point(size=100, opacity=0, tooltip=alt.TooltipContent("data") ) alt_chart = alt_chart + points + tooltips return alt_chart.properties( title='COVID-19 ' + self.confirmed_deaths, width='container', height=400 )
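# The method above layers two helper charts: zero-size circles to get filled
# legend symbols, and large invisible points to make hover tooltips less
# sensitive (see the two GitHub issues referenced in the comments). A
# stripped-down sketch of that layering pattern on a toy dataframe:
import altair as alt
import pandas as pd

toy = pd.DataFrame({'x': [1, 2, 3], 'y': [2, 1, 3], 'grp': ['a', 'a', 'b']})
lines = alt.Chart(toy).mark_line().encode(x='x:Q', y='y:Q', color='grp:N')
legend_points = lines.mark_circle(size=0)          # filled circles in the legend
hover_points = lines.mark_point(size=100, opacity=0,
                                tooltip=alt.TooltipContent("data"))
combined = lines + legend_points + hover_points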
        1370, 1468, 1566, 1664, 1762, 1860, 1958
    ]
}

# In[ ]:

scores_df = pd.DataFrame(scores_dict)
scores_df = scores_df.melt(id_vars=['n_features'],
                           value_vars=['mutual_info_classif', 'f_classif'],
                           var_name='metric',
                           value_name='mae')
max_value = scores_df['mae'].max() * 1.05
min_value = scores_df['mae'].min() * 0.95
artgor_utils.render(
    alt.Chart(scores_df).mark_line().encode(
        y=alt.Y('mae:Q', scale=alt.Scale(domain=(min_value, max_value))),
        x='n_features:O',
        color='metric:N',
        tooltip=['metric:N', 'n_features:O', 'mae:Q']).properties(
            title='Top N features by SelectPercentile vs CV').interactive())

# ### SelectKBest
#
# **Important notice**: I ran the cell below in `version 14` and printed the
# scores_dict. In the following versions I'll use `scores_dict` and plot the
# results instead of running feature selection each time.

# In[ ]:

# %%time
# scores_dict = {'f_classif': [], 'mutual_info_classif': [], 'n_features': []}
# for i in np.arange(10, 1958, 100):
#     print(i)
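# The commented-out cell above is truncated. A hedged sketch of what such a
# SelectKBest scoring loop typically looks like (X_train, y_train and the
# cross_validate_score() helper are placeholders, not the notebook's actual
# setup):
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

scores_dict = {'f_classif': [], 'mutual_info_classif': [], 'n_features': []}
for n in np.arange(10, 1958, 100):
    scores_dict['n_features'].append(n)
    for name, score_func in [('f_classif', f_classif),
                             ('mutual_info_classif', mutual_info_classif)]:
        selector = SelectKBest(score_func, k=n).fit(X_train, y_train)
        X_selected = selector.transform(X_train)
        # cross_validate_score() stands in for whatever CV routine the
        # notebook uses to produce the MAE recorded in scores_dict.
        scores_dict[name].append(cross_validate_score(X_selected, y_train))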
                                       default=regions)
st.sidebar.info(
    "Merci à tous contributeurs du projet [opencovid19-fr](https://github.com/opencovid19-fr/data) pour leur travail de collecte des données officielles sur la progression de l'épidémie en France."
)

# get df_covid19_region based on region in multiselection
df_covid19_region = df_covid19_region[df_covid19_region["maille_nom"].isin(
    multiselection)].sort_values(by=["maille_nom", "date"],
                                 ascending=[True, False])

if check_box_table:
    st.write(df_covid19_region)

if option == "graph":
    if st.checkbox("Log Scale"):
        scale = alt.Scale(type="log", domain=[10, 5000], clamp=True)
    else:
        scale = alt.Scale(type="linear")

    if check_box_analyse:
        st.info(
            "[03/22] Les régions Grand-Est, Ile-de-France et Haut-de-France sont les plus touchées par l'épidémie. "
            "Par ailleurs l'affiche en échelle Log, nous montre que l'ensemble des régions suivent la même croissance en terme d'évolution"
        )

    # plot the number of deaths (deces) by region
    c_deces = (alt.Chart(df_covid19_region).mark_line(point=True).encode(
        alt.X("days_after_5_deaths"),
        alt.Y("deces", scale=scale),
        alt.Color("maille_nom"),
        tooltip=["days_after_5_deaths", "deces", "maille_nom"],
    ).interactive())
# year slider for Maps slider = alt.binding_range(min=source['year'].min(), max=source['year'].max(), step=1) select_year = alt.selection_single(name="year", fields=['year'], bind=slider, init={'year': source['year'].min()}) base = alt.Chart(source) ### trend line plot plot = base.mark_point(filled=True).encode(alt.X('year:O', scale=alt.Scale(zero=False)), alt.Y( 'average(Female, Income: Q1):Q', scale=alt.Scale(zero=False)), color=alt.value("salmon")) plot += base.mark_point(filled=True).encode( alt.X('year:O', scale=alt.Scale(zero=False)), alt.Y('average(Female, Income: Q2):Q', scale=alt.Scale(zero=False)), color=alt.value("salmon"), shape=alt.value("cross")) plot += base.mark_point(filled=True).encode( alt.X('year:O', scale=alt.Scale(zero=False)), alt.Y('average(Female, Income: Q3):Q', scale=alt.Scale(zero=False)), color=alt.value("salmon"),
def gen_rank_plot(V, rank_type, ranking_ids, feature_metadata_cols, table_sdf): """Uses Altair to generate a JSON Vega-Lite spec for the rank plot. Parameters ---------- V: pd.DataFrame DataFrame containing feature rank (and feature metadata, if applicable) information. (Indices correspond to features, and columns correspond to feature ranking or feature metadata fields.) This should have already been matched with the BIOM table, filtered (if -x passed), had empty features removed, etc. rank_type: str Human-readable name for a given ranking column that will be used as the prefix for each y-axis label in the rank plot. (This should be either "Differential" or "Feature Loading".) ranking_ids: pd.Index IDs of the actual "feature ranking" columns in V. feature_metadata_cols: pd.Index or list IDs of the "feature metadata" columns in V (if there wasn't any feature metadata provided, this can just be an empty list). table_sdf: pd.SparseDataFrame A representation of the input BIOM table containing count data. This is used to calculate qurro_spc (the number of samples a feature is present in) for each feature in V. This should ONLY contain samples that will be used in the Qurro visualization -- the presence of extra samples will mess up _df_utils.add_sample_presence_count(). Returns ------- rank_chart_json: dict A dict version of the alt.Chart for the rank plot, with qurro_rank_ordering and qurro_feature_metadata_ordering datasets added in indicating which columns describe feature rankings and which describe feature metadata. (Also has a qurro_rank_type "dataset" (really just a string) that points to the specified rank_type.) """ rank_data = V.copy() # NOTE that until this point we've treated the actual rank values as just # "objects", as far as pandas is concerned. However, if we continue to # treat them as objects when sorting them, we'll get a list of feature # ranks in lexicographic order... which is not what we want. So we just # ensure that all of the columns contain numeric data. for col in ranking_ids: rank_data[col] = pd.to_numeric(rank_data[col]) # The default rank column is just whatever the first rank is. This is what # the rank plot will use when it's first drawn. default_rank_col = ranking_ids[0] # Set default classification of every feature to "None" # (This value will be updated when a feature is selected in the rank plot # as part of the numerator, denominator, or both parts of the current log # ratio.) rank_data["qurro_classification"] = "None" # Add a "qurro_spc" column indicating how many samples each feature is # present in. rank_data = add_sample_presence_count(rank_data, table_sdf) # Replace "index" with "Feature ID". looks nicer in the visualization :) rank_data.rename_axis("Feature ID", axis="index", inplace=True) rank_data.reset_index(inplace=True) # Now, we can actually create the rank plot. rank_chart = ( alt.Chart( rank_data, title="Features", background="#FFFFFF", autosize=alt.AutoSizeParams(resize=True), ).mark_bar().transform_window( sort=[alt.SortField(field=default_rank_col, order="ascending")], # We don't use an alt.WindowFieldDef here because python gets # confused when you use "as" as an actual argument name. So we just # use this syntax. window=[{ "op": "row_number", "as": "qurro_x" }], ).encode( # type="ordinal" needed on the scale here to make bars adjacent; # see https://stackoverflow.com/a/55544817/10730311. 
x=alt.X( "qurro_x", title="Feature Rankings", type="ordinal", scale=alt.Scale(paddingOuter=1, paddingInner=0, rangeStep=1), axis=alt.Axis(ticks=False, labelAngle=0), ), y=alt.Y(default_rank_col, type="quantitative"), color=alt.Color( "qurro_classification", title="Log-Ratio Classification", scale=alt.Scale( domain=["None", "Numerator", "Denominator", "Both"], range=["#e0e0e0", "#f00", "#00f", "#949"], ), ), tooltip=[ alt.Tooltip( field="qurro_x", title="Current Ranking", type="quantitative", ), alt.Tooltip( field="qurro_classification", title="Log-Ratio Classification", type="nominal", ), alt.Tooltip( field="qurro_spc", title="Sample Presence Count", type="quantitative", ), "Feature ID", *feature_metadata_cols, *ranking_ids, ], ).configure_axis( # Done in order to differentiate "None"-classification features # from grid lines gridColor="#f2f2f2", labelBound=True, ).interactive()) rank_chart_json = rank_chart.to_dict() rank_ordering = "qurro_rank_ordering" fm_col_ordering = "qurro_feature_metadata_ordering" dataset_name_for_rank_type = "qurro_rank_type" check_json_dataset_names(rank_chart_json, rank_ordering, fm_col_ordering, rank_type) # Note we don't use rank_data.columns for setting the rank ordering. This # is because rank_data's columns now include both the ranking IDs and the # "Feature ID" and "qurro_classification" columns (as well as any feature # metadata the user saw fit to pass in). rank_chart_json["datasets"][rank_ordering] = list(ranking_ids) rank_chart_json["datasets"][fm_col_ordering] = list(feature_metadata_cols) rank_chart_json["datasets"][dataset_name_for_rank_type] = rank_type return rank_chart_json
import pandas as pd
import altair as alt

office_ratings = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv')
# alt.data_transformers.disable_max_rows()

chart = alt.Chart(office_ratings).mark_tick().encode(
    # x='imdb_rating:Q',
    alt.X('imdb_rating:Q',
          scale=alt.Scale(domain=(6, 10)),
          axis=alt.Axis(title='IMDb Rating')  # (format='%', title='percentage')
          ),
    alt.Y('season:O',
          axis=alt.Axis(title='Season')
          )
).properties(
    # width=550,
    height=140,
    title={
        "text": ['The Office IMDb Ratings Distribution by Season'],
        "fontSize": 18,
        "font": 'Courier',
        "anchor": 'middle',
        "color": 'gray'
    }
)

alt.concat(chart, title=alt.TitleParams(
    ['', '#30DayChartChallenge - strips - 2021/04/12',
     'Dataset: TidyTuesday Dataset 2020-03-17',
     'twitter.com/vivekparasharr | github.com/vivekparasharr | vivekparasharr.medium.com'],
def get_plot(df, max_x: int, fields: list):
    data = df[df['legend'].isin(fields)]
    return alt.Chart(data).mark_line().encode(
        x=alt.X('day', scale=alt.Scale(domain=(0, max_x))),
        y=alt.Y('cases'),
        color='legend').properties(width=800, height=400)
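# A minimal usage sketch for get_plot() above (the column names 'day', 'cases'
# and 'legend' match the encodings; the values are made up):
import pandas as pd

demo_df = pd.DataFrame({
    'day': [0, 1, 2, 0, 1, 2],
    'cases': [10, 20, 35, 5, 9, 14],
    'legend': ['confirmed'] * 3 + ['deaths'] * 3,
})
demo_chart = get_plot(demo_df, max_x=2, fields=['confirmed', 'deaths'])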
def get_interactive_proportions_plot(gender_balance): source = data_frames[gender_balance] sort_order = sort_orders[gender_balance] pts = alt.selection(type="multi", encodings=['x'], empty='none') lin = alt.Chart(source).mark_line().encode( alt.X('year:O', title='Year', axis=alt.Axis(labelAngle=-45)), alt.Y('female_prop:Q', title="Female Percentage", axis=alt.Axis(format='%'), scale=alt.Scale(domain=[0, 1])), alt.Color('job:N', legend=None)).transform_filter(pts).properties( width=450, height=325, title="Female Percentage in a Job by Year") hrule = alt.Chart(pd.DataFrame({'y': [0.5]})).mark_rule( color='red', strokeDash=[5, 5]).encode(y=alt.Y('y:Q')) vrule = alt.Chart(pd.DataFrame({'x': [0.5]})).mark_rule( color='red', strokeDash=[5, 5]).encode(x=alt.X('x:Q')) lin_w_interaction = alt.layer( lin, # base line chart alt.Chart().mark_rule(color='#aaa').encode( x='year:O').transform_filter(label), lin.mark_circle().encode(opacity=alt.condition(label, alt.value( 1), alt.value(0))).add_selection(label), lin.mark_text( align='left', dx=5, dy=-5, stroke='white', strokeWidth=2).encode(text=alt.Text( 'female_prop:Q', format='.2%')).transform_filter(label), lin.mark_text(align='left', dx=5, dy=-5).encode(text=alt.Text( 'female_prop:Q', format='.2%')).transform_filter(label), hrule, data=source) bar = alt.Chart(source).mark_bar(size=30).encode( y=alt.Y('job:N', title='', sort=sort_order), x=alt.X('total_prop_female:Q', title="Female Percentage", axis=alt.Axis(format='%')), color=alt.condition(pts, alt.Color('job:N', legend=None), alt.ColorValue("grey")), tooltip=[ alt.Tooltip(field="job", type="nominal", title="Job"), alt.Tooltip(field="total_prop_female", type="quantitative", title="Total Female Percentage", format='.2%') ]).properties( width=225, height=350, title="Jobs by Total Female Percentage (For the 10 most " + gender_balance + " jobs)").add_selection(pts) bar_w_vrule = alt.layer(bar, vrule, data=source) if (gender_balance == 'male dominated'): interactive_job_chart = alt.hconcat( lin_w_interaction, bar).resolve_legend( color="independent", size="independent").configure_axis(labelFontSize=13, titleFontSize=14) else: interactive_job_chart = alt.hconcat( lin_w_interaction, bar_w_vrule).resolve_legend(color="independent", size="independent").configure_axis( labelFontSize=13, titleFontSize=14) # Save html as a StringIO object in memory job_gender_proportions_html = io.StringIO() interactive_job_chart.save(job_gender_proportions_html, 'html') # Return the html from StringIO object return job_gender_proportions_html.getvalue()
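# Note: the layered line chart above filters several layers on a selection
# named `label` that is not defined in this excerpt. A plausible definition,
# judging from how it is used (nearest-point hover keyed on the year axis),
# might look like the following -- an assumption, not the original code:
label = alt.selection_single(
    nearest=True,      # snap to the nearest data point
    on='mouseover',    # trigger on hover
    fields=['year'],   # the field used by the rule/text layers
    empty='none',
)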
import altair as alt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# `text_cleaning` (the analyzer passed to CountVectorizer) is assumed to be
# defined elsewhere in the same package.


def visualize_sentiment(sentiment_df, plot_type="Standard"):
    """
    Takes in the output of sentiment_analysis and creates a visualization of a
    user's tweets with sentiment analysis.

    Parameters:
    -----------
    sentiment_df : dataframe
        Output of tweet_sentiment_analysis, a dataframe that contains the
        columns added by tweet_sentiment_analysis.
    plot_type : string
        Optional: type of plot to return, one of 'Standard', 'Stacked', and
        'Separate'.
        'Standard' returns a bar plot of the most common words tweeted,
        colour coded by sentiment.
        'Stacked' returns the same as 'Standard', but words that appear under
        more than one sentiment are stacked together.
        'Separate' returns 3 bar plots with the 'Positive', 'Neutral', and
        'Negative' sentiments separated.

    Returns:
    --------
    plot:
        A bar plot of the user's tweets containing, in order, the most common
        words, colour coded by the word's sentiment class.
    """
    # check inputs
    options = ("Standard", "Stacked", "Separate")
    if plot_type not in options:
        raise TypeError("Invalid argument for plot_type: You must enter one "
                        "of 'Standard', 'Stacked', 'Separate'")
    elif not isinstance(sentiment_df, pd.DataFrame):
        raise Exception("The input of sentiment_df should be a Pandas DataFrame, "
                        "did you use output of tweet_sentiment_analysis?")
    elif 'sentiment' not in sentiment_df:
        raise KeyError("Input does not contain column for sentiment, "
                       "did you use output of tweet_sentiment_analysis?")

    # Define tweet_rank function
    def tweet_rank(df, sentiment):
        """Return the most common words tweeted for a specified sentiment."""
        df_senti = df[df['sentiment'] == sentiment]
        countVectorizer = CountVectorizer(analyzer=text_cleaning,
                                          stop_words='english')
        countVector = countVectorizer.fit_transform(df_senti['tweet'])
        # Note: scikit-learn >= 1.2 renamed this method to get_feature_names_out().
        count_vect_df = pd.DataFrame(countVector.toarray(),
                                     columns=countVectorizer.get_feature_names())
        count = pd.DataFrame(count_vect_df.sum())
        countdf = count.sort_values(0, ascending=False).head(20)
        return countdf[1:11]

    dataframes = dict()  # empty dictionary to store per-sentiment dataframes
    for sentiment in np.unique(sentiment_df["sentiment"]):
        sent_df = tweet_rank(sentiment_df, sentiment)
        # rename columns and include columns for sentiment and word
        sent_df.columns = ['frequency']
        sent_df["sentiment"] = sentiment
        sent_df['Word'] = sent_df.index
        dataframes[sentiment] = sent_df  # store the sentiment dataframe

    # combine all dataframes, may need adjustment later
    top_words_df = pd.concat([dataframes['positive'],
                              dataframes['neutral'],
                              dataframes['negative']])

    # Plot if Standard is selected
    if plot_type == "Standard":
        top_words_df['Word'] = (top_words_df['Word'] + ' ('
                                + top_words_df["sentiment"] + ')')
        standard_plot = alt.Chart(
            top_words_df,
            title='Most Common Words used by Twitter User').mark_bar().encode(
                x=alt.X('frequency', title='Number of Occurrences'),
                y=alt.Y('Word', sort='-x'),
                color=alt.Color("sentiment", scale=alt.Scale(
                    domain=['positive', 'neutral', 'negative'],
                    range=['blue', 'orange', 'red'])))
        return standard_plot

    # Plot if Stacked is selected
    elif plot_type == "Stacked":
        top_words_df['Word'] = top_words_df.index
        stacked_plot = alt.Chart(
            top_words_df,
            title='Most Common Words used by Twitter User').mark_bar().encode(
                x=alt.X('frequency', title='Number of Occurrences'),
                y=alt.Y('Word', sort='-x'),
                color=alt.Color("sentiment", scale=alt.Scale(
                    domain=['positive', 'neutral', 'negative'],
                    range=['blue', 'orange', 'red'])))
        return stacked_plot

    # Plot if Separate is selected
    elif plot_type == "Separate":
        negative = alt.Chart(
            dataframes['negative'],
            title='Most Common Negative Words used by Twitter User'
        ).mark_bar().encode(
            x=alt.X('frequency', title='Number of Occurrences'),
            y=alt.Y('Word', sort='-x'),
            color=alt.value("red"))
        positive = alt.Chart(
            dataframes['positive'],
            title='Most Common Positive Words used by Twitter User'
        ).mark_bar().encode(
            x=alt.X('frequency', title='Number of Occurrences'),
            y=alt.Y('Word', sort='-x'),
            color=alt.value("blue"))
        neutral = alt.Chart(
            dataframes['neutral'],
            title='Most Common Neutral Words used by Twitter User'
        ).mark_bar().encode(
            x=alt.X('frequency', title='Number of Occurrences'),
            y=alt.Y('Word', sort='-x'),
            color=alt.value("orange"))
        separate_plot = positive | neutral | negative
        return separate_plot
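# Hedged usage sketch for visualize_sentiment(). `tweet_sentiment_analysis`
# and `tweets_df` are assumed upstream pieces (the docstring refers to the
# former); only the call pattern is illustrated here.
sentiment_df = tweet_sentiment_analysis(tweets_df)
chart = visualize_sentiment(sentiment_df, plot_type="Separate")
chart.save('sentiment_words.html')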
import altair as alt
import pandas as pd
import streamlit as st

st.title("Let's analyze some Penguin Data 🐧📊.")


@st.cache  # add caching so we load the data only once
def load_data():
    # Load the penguin data from https://github.com/allisonhorst/palmerpenguins.
    penguins_url = "https://raw.githubusercontent.com/allisonhorst/palmerpenguins/v0.1.0/inst/extdata/penguins.csv"
    return pd.read_csv(penguins_url)


df = load_data()

st.write("Let's look at the raw data in the Pandas Data Frame.")
st.write(df)

st.write("Hmm 🤔, is there some correlation between body mass and flipper "
         "length? Let's make a scatterplot with "
         "[Altair](https://altair-viz.github.io/) to find out.")
chart = alt.Chart(df).mark_point().encode(
    x=alt.X("body_mass_g", scale=alt.Scale(zero=False)),
    y=alt.Y("flipper_length_mm", scale=alt.Scale(zero=False)),
    color=alt.Color("species")
).properties(
    width=600, height=400
).interactive()
st.write(chart)

st.markdown("This project was created by Student1 and Student2 for the "
            "[Interactive Data Science](https://dig.cmu.edu/ids2022) course at "
            "[Carnegie Mellon University](https://www.cmu.edu).")
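# Since the app asks whether body mass and flipper length are correlated, a
# quick numeric check can sit next to the scatterplot. A minimal sketch using
# the same `df`; rows with missing measurements are ignored by .corr().
overall_r = df["body_mass_g"].corr(df["flipper_length_mm"])
per_species_r = df.groupby("species").apply(
    lambda g: g["body_mass_g"].corr(g["flipper_length_mm"]))
st.write(f"Overall Pearson r: {overall_r:.2f}")
st.write(per_species_r)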
""" Natural Disasters ----------------- This example shows a visualization of global deaths from natural disasters. """ # category: scatter plots import altair as alt from altair.expr import datum from vega_datasets import data source = data.disasters.url alt.Chart(source).mark_circle( opacity=0.8, stroke='black', strokeWidth=1).encode( alt.X('Year:O', axis=alt.Axis(labelAngle=0)), alt.Y('Entity:N'), alt.Size('Deaths:Q', scale=alt.Scale(range=[0, 5000]), legend=alt.Legend(title='Annual Global Deaths')), alt.Color('Entity:N', legend=None)).properties( width=480, height=350).transform_filter( datum.Entity != 'All natural disasters')
"""
Ranged Dot Plot
---------------
This example shows a ranged dot plot that uses 'layer' to convey changing life
expectancy for the five most populous countries (between 1955 and 2000).
"""
# category: other charts
import altair as alt
from vega_datasets import data

source = data.countries.url

chart = alt.layer(data=source).transform_filter(
    filter={
        "field": 'country',
        "oneOf": ["China", "India", "United States", "Indonesia", "Brazil"]
    }
).transform_filter(
    filter={'field': 'year', "oneOf": [1955, 2000]}
)

# Add a line connecting the 1955 and 2000 values for each country
chart += alt.Chart().mark_line(color='#db646f').encode(
    x='life_expect:Q',
    y='country:N',
    detail='country:N'
)

# Add points for life expectancy in 1955 & 2000
chart += alt.Chart().mark_point(size=100, opacity=1, filled=True).encode(
    x='life_expect:Q',
    y='country:N',
    color=alt.Color('year:O',
                    scale=alt.Scale(domain=['1955', '2000'],
                                    range=['#e6959c', '#911a24']))
).interactive()

chart
def analyse_view_clones_ts_fragments():

    log.info("read views/clones time series fragments (CSV docs)")

    basename_suffix = "_views_clones_series_fragment.csv"
    csvpaths = _glob_csvpaths(basename_suffix)

    dfs = []
    column_names_seen = set()

    for p in csvpaths:
        log.info("attempt to parse %s", p)
        snapshot_time = _get_snapshot_time_from_path(p, basename_suffix)

        df = pd.read_csv(
            p,
            index_col=["time_iso8601"],
            date_parser=lambda col: pd.to_datetime(col, utc=True),
        )

        # A time series fragment might look like this:
        #
        # df_views_clones:
        #                            clones_total  ...  views_unique
        # time_iso8601                             ...
        # 2020-12-21 00:00:00+00:00           NaN  ...             2
        # 2020-12-22 00:00:00+00:00           2.0  ...            23
        # 2020-12-23 00:00:00+00:00           2.0  ...            20
        # ...
        # 2021-01-03 00:00:00+00:00           8.0  ...            21
        # 2021-01-04 00:00:00+00:00           7.0  ...            18
        #
        # Note the NaN and the floaty type.

        # All metrics are known to be integers by definition here. NaN values
        # are expected to be present anywhere in this dataframe, and they
        # semantically mean "0". Therefore, replace those with zeros. Also see
        # https://github.com/jgehrcke/github-repo-stats/issues/4
        df = df.fillna(0)
        # Make sure numbers are treated as integers from here on. This
        # actually matters in a cosmetic way only for outputting the aggregate
        # CSV later (not for plotting and number crunching). A toy sketch of
        # this step follows after this function.
        df = df.astype(int)

        # Attach the snapshot time as a metadata prop to the df.
        df.attrs["snapshot_time"] = snapshot_time

        # The index is not of string type anymore, but of type
        # `pd.DatetimeIndex`. Reflect that in the name.
        df.index.rename("time", inplace=True)

        if column_names_seen and set(df.columns) != column_names_seen:
            log.error("columns seen so far: %s", column_names_seen)
            log.error("columns in %s: %s", p, df.columns)
            sys.exit(1)

        column_names_seen.update(df.columns)

        df = df.sort_index()

        # Sanity check: the snapshot time should be _after_ the latest
        # timestamp in the time series. This could trigger on a machine with a
        # bad time setting when fetching data.
        if df.index.max() > snapshot_time:
            log.error(
                "for CSV file %s the snapshot time %s is older than the newest sample",
                p,
                snapshot_time,
            )
            sys.exit(1)

        dfs.append(df)

    # for df in dfs:
    #     print(df)

    log.info("total sample count: %s", sum(len(df) for df in dfs))

    newest_snapshot_time = max(df.attrs["snapshot_time"] for df in dfs)

    df_prev_agg = None
    if ARGS.views_clones_aggregate_inpath:
        if os.path.exists(ARGS.views_clones_aggregate_inpath):
            log.info("read previous aggregate: %s", ARGS.views_clones_aggregate_inpath)
            df_prev_agg = pd.read_csv(
                ARGS.views_clones_aggregate_inpath,
                index_col=["time_iso8601"],
                date_parser=lambda col: pd.to_datetime(col, utc=True),
            )
            df_prev_agg.index.rename("time", inplace=True)
        else:
            log.info(
                "previous aggregate file does not exist: %s",
                ARGS.views_clones_aggregate_inpath,
            )

    log.info("time of newest snapshot: %s", newest_snapshot_time)
    log.info("build aggregate, drop duplicate data")

    # Each dataframe in `dfs` corresponds to one time series fragment
    # ("snapshot") obtained from the GitHub API. Each time series fragment
    # contains 15 samples (rows), with two adjacent samples being 24 hours
    # apart. Ideally, the time series fragments overlap in time. They overlap
    # potentially by a lot, depending on when the individual snapshots were
    # taken (think: take one snapshot per day; then 14 out of 15 data points
    # are expected to be "the same" as in the snapshot taken the day before).
    # Stitch these fragments together (with a bunch of duplicate samples), and
    # then sort the result by time.
log.info("pd.concat(dfs)") dfall = pd.concat(dfs) if df_prev_agg is not None: if set(df_prev_agg.columns) != set(dfall.columns): log.error( "set(df_prev_agg.columns) != set (dfall.columns): %s, %s", df_prev_agg.columns, dfall.columns, ) sys.exit(1) log.info("pd.concat(dfall, df_prev_agg)") dfall = pd.concat([dfall, df_prev_agg]) dfall.sort_index(inplace=True) log.info("shape of dataframe before dropping duplicates: %s", dfall.shape) # print(dfall) # Now, the goal is to drop duplicate data. And again, as of a lot of # overlap between snapshots there's a lot of duplicate data to be expected. # What does "duplicat data" mean? We expect that there are multiple samples # from different snapshots with equivalent timestamp. OK, we should just # take any one of them. They should all be the same, right? They are not # all equivalent. I've found that at the boundaries of each time series # fragment, the values returned by the GitHub API are subject to a # non-obvious cutoff effect: for example, in a snapshot obtained on Dec 15, # the sample for Dec 7 is within the mid part of the fragment and shows a # value of 73 for `clones_total`. The snapshot obtained on Dec 21 has the # sample for Dec 7 at the boundary (left-hand, towards the past), and that # shows a value of 18 for `clones_total`. 73 vs 18 -- how is that possible? # That's easily possible, assuming that GitHub uses a rolling window of 14 # days width with a precision higher than 1 day and after all the cutoff # for the data points at the boundary depends on the _exact time_ when the # snapshot was taken. That is, for aggregation (for dropping duplicate/bad # data) we want to look for the maximum data value for any given timestamp. # Using that method, we effectively ignore said cutoff artifact. In short: # group by timestamp (index), take the maximum. df_agg = dfall.groupby(dfall.index).max() log.info("shape of dataframe after dropping duplicates: %s", df_agg.shape) # Write aggregate # agg_fname = ( # datetime.strftime(newest_snapshot_time, "%Y-%m-%d_%H%M%S") # + "_views_clones_aggregate.csv" # ) # agg_fpath = os.path.join(ARGS.snapshotdir, agg_fname) if ARGS.views_clones_aggregate_outpath: if os.path.exists(ARGS.views_clones_aggregate_outpath): log.info("file exists: %s", ARGS.views_clones_aggregate_outpath) if not ARGS.views_clones_aggregate_inpath: log.error( "would overwrite output aggregate w/o reading input aggregate -- you know what you're doing?" ) sys.exit(1) log.info("write aggregate to %s", ARGS.views_clones_aggregate_outpath) # Pragmatic strategy against partial write / encoding problems. tpath = ARGS.views_clones_aggregate_outpath + ".tmp" df_agg.to_csv(tpath, index_label="time_iso8601") os.rename(tpath, ARGS.views_clones_aggregate_outpath) if ARGS.delete_ts_fragments: # Iterate through precisely the set of files that was read above. # If unlinkling fails at OS boundary then don't crash this program. for p in csvpaths: log.info("delete %s as of --delete-ts-fragments", p) try: os.unlink(p) except Exception as e: log.warning("could not unlink %s: %s", p, str(e)) # print(df_agg) # matplotlib_config() # log.info("aggregated sample count: %s", len(df_agg)) # df_agg.plot( # linestyle="solid", # marker="o", # markersize=5, # subplots=True, # # ylabel="count", # xlabel="", # # logy="sym", # ) # plt.ylim([0, None]) # plt.tight_layout() # plt.show() # Why reset_index()? 
    # See
    # https://github.com/altair-viz/altair/issues/271#issuecomment-573480284
    df_agg = df_agg.reset_index()

    df_agg_views = df_agg.drop(columns=["clones_unique", "clones_total"])
    df_agg_clones = df_agg.drop(columns=["views_unique", "views_total"])

    PANEL_WIDTH = "container"
    PANEL_HEIGHT = 200

    panel_props = {"height": PANEL_HEIGHT, "width": PANEL_WIDTH, "padding": 10}

    chart_clones_unique = (
        alt.Chart(df_agg_clones).mark_line(point=True).encode(
            alt.X("time", type="temporal", title="date", timeUnit="yearmonthdate"),
            alt.Y(
                "clones_unique",
                type="quantitative",
                title="unique clones per day",
                scale=alt.Scale(
                    domain=(0, df_agg_clones["clones_unique"].max() * 1.1),
                    zero=True,
                ),
            ),
        )
        .configure_axisY(labelBound=True)
        .configure_point(size=100)
        .properties(**panel_props)
    )

    chart_clones_total = (
        alt.Chart(df_agg_clones).mark_line(point=True).encode(
            alt.X("time", type="temporal", title="date", timeUnit="yearmonthdate"),
            alt.Y(
                "clones_total",
                type="quantitative",
                title="total clones per day",
                scale=alt.Scale(
                    domain=(0, df_agg_clones["clones_total"].max() * 1.1),
                    zero=True,
                ),
            ),
        )
        .configure_axisY(labelBound=True)
        .configure_point(size=100)
        .properties(**panel_props)
    )

    chart_views_unique = (
        alt.Chart(df_agg_views).mark_line(point=True).encode(
            alt.X("time", type="temporal", title="date", timeUnit="yearmonthdate"),
            alt.Y(
                "views_unique",
                type="quantitative",
                title="unique views per day",
                scale=alt.Scale(
                    domain=(0, df_agg_views["views_unique"].max() * 1.1),
                    zero=True,
                ),
            ),
        )
        .configure_axisY(labelBound=True)
        .configure_point(size=100)
        .properties(**panel_props)
    )

    chart_views_total = (
        alt.Chart(df_agg_views).mark_line(point=True).encode(
            alt.X("time", type="temporal", title="date", timeUnit="yearmonthdate"),
            alt.Y(
                "views_total",
                type="quantitative",
                title="total views per day",
                scale=alt.Scale(
                    domain=(0, df_agg_views["views_total"].max() * 1.1),
                    zero=True,
                ),
            ),
        )
        .configure_axisY(labelBound=True)
        .configure_point(size=100)
        .properties(**panel_props)
    )

    chart_views_unique_spec = chart_views_unique.to_json(indent=None)
    chart_views_total_spec = chart_views_total.to_json(indent=None)
    chart_clones_unique_spec = chart_clones_unique.to_json(indent=None)
    chart_clones_total_spec = chart_clones_total.to_json(indent=None)

    MD_REPORT.write(
        textwrap.dedent(
            """

    ## Views

    #### Unique visitors
    <div id="chart_views_unique" class="full-width-chart"></div>

    #### Total views
    <div id="chart_views_total" class="full-width-chart"></div>

    <div class="pagebreak-for-print"> </div>

    ## Clones

    #### Unique cloners
    <div id="chart_clones_unique" class="full-width-chart"></div>

    #### Total clones
    <div id="chart_clones_total" class="full-width-chart"></div>

    """
        )
    )
    JS_FOOTER_LINES.extend(
        [
            f"vegaEmbed('#chart_views_unique', {chart_views_unique_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);",
            f"vegaEmbed('#chart_views_total', {chart_views_total_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);",
            f"vegaEmbed('#chart_clones_unique', {chart_clones_unique_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);",
            f"vegaEmbed('#chart_clones_total', {chart_clones_total_spec}, {VEGA_EMBED_OPTIONS_JSON}).catch(console.error);",
        ]
    )
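# Toy sketch of the two cleanup steps referenced inside
# analyse_view_clones_ts_fragments(): NaN-to-zero / int casting, and
# duplicate-dropping by taking the per-timestamp maximum. Column names mirror
# the fragment layout described in that function; all values are made up.
import pandas as pd

idx = pd.to_datetime(["2020-12-07", "2020-12-08"], utc=True)

# Two overlapping fragments disagree about Dec 7 because of the rolling-window
# cutoff at the fragment boundary; frag_a also contains a NaN ("no data" -> 0).
frag_a = pd.DataFrame({"clones_total": [73.0, float("nan")]}, index=idx)
frag_b = pd.DataFrame({"clones_total": [18.0, 5.0]}, index=idx)

frags = [f.fillna(0).astype(int) for f in (frag_a, frag_b)]

dfall = pd.concat(frags).sort_index()
df_agg = dfall.groupby(dfall.index).max()
print(df_agg)  # 2020-12-07 -> 73, 2020-12-08 -> 5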
# The opening of this snippet is truncated in the original; `st.multiselect`
# is an assumed reconstruction of the widget that produces `choice`.
choice = st.multiselect(
    'Select region',
    ("Asia & Australasia", "Eastern Europe", "Latin America",
     "Middle East and North Africa", "North America",
     "Sub-Saharan Africa", "Western Europe"),
    key='1')

df_combined = df_combined[df_combined['Region'].isin(choice)]

# Configure the options common to all layers
brush = alt.selection(type='interval')
base = alt.Chart(df_combined).add_selection(brush)

points = alt.Chart(df_combined).mark_circle().encode(
    alt.X('Democracy_Score', title='Democracy Index'),
    alt.Y('SocialProgress_Score',
          title='Social Progress Index',
          scale=alt.Scale(domain=(30, 100))),
    color='Region:N',
    size='Population')

# Configure the ticks
tick_axis = alt.Axis(labels=False, domain=False, ticks=False)

x_ticks = base.mark_tick().encode(
    alt.X('Democracy_Score', title='', axis=tick_axis),
    alt.Y('Region', title='', axis=tick_axis),
    color=alt.condition(brush, 'Region', alt.value('lightgrey')))

y_ticks = base.mark_tick().encode(
    alt.X('Region', title='', axis=tick_axis),
    alt.Y('SocialProgress_Score',