def explore_dataset(users, movies, ratings):
    """Print summary tables and show interactive charts for the MovieLens data.

    The user section prints ``describe()`` summaries and shows histograms of
    the per-user rating count and mean rating, filterable by occupation.
    The movie section prints the ten most-rated movies and the ten
    best-rated movies with more than 20 ratings, then shows the per-movie
    histograms, filterable by genre.

    Args:
        users: DataFrame of users; it is merged on ``user_id`` and charted
            by its ``occupation`` column.
        movies: DataFrame of movies, forwarded to ``get_movie_ratings``.
        ratings: DataFrame of ratings; grouped by ``user_id`` with the
            ``rating`` column aggregated.

    Note:
        ``flatten_cols`` and the ``mask(key, function)`` call form are not
        stock pandas ``DataFrame`` methods -- presumably they are patched
        onto ``DataFrame`` elsewhere in the project (TODO confirm).
    """
    # Exploring the MovieLens Data (Users)
    print(users.describe())  # User features
    # `np.object` was removed in NumPy 1.24; the builtin `object` is the
    # exact equivalent and works on every NumPy version.
    print(users.describe(include=[object]))  # Categorical user features

    # Create filters to slice the data
    occupation_filter = alt.selection_multi(fields=["occupation"])
    occupation_chart = alt.Chart().mark_bar().encode(
        x="count()",
        y=alt.Y("occupation:N"),
        color=alt.condition(
            occupation_filter,
            alt.Color("occupation:N", scale=alt.Scale(scheme='category20')),
            alt.value("lightgray")),
    ).properties(width=300, height=300, selection=occupation_filter)

    # Create the chart
    users_ratings = (
        ratings.groupby('user_id', as_index=False)
        .agg({'rating': ['count', 'mean']})
        .flatten_cols()
        .merge(users, on='user_id'))

    # Create a chart for the count, and one for the mean.
    altair_viewer.show(
        alt.hconcat(
            filtered_hist('rating count', '# ratings / user',
                          occupation_filter),
            filtered_hist('rating mean', 'mean user rating',
                          occupation_filter),
            occupation_chart,
            data=users_ratings))

    # Exploring the MovieLens Data (Movies)
    movies_ratings = get_movie_ratings(movies, ratings)
    genre_filter, genre_chart = filter_and_chart()

    # These tables used to be computed and thrown away (a notebook cell
    # would have displayed them); print them so a script run shows them too.
    summary_cols = ['title', 'rating count', 'rating mean']
    print(movies_ratings[summary_cols]
          .sort_values('rating count', ascending=False)
          .head(10))
    print(movies_ratings[summary_cols]
          .mask('rating count', lambda x: x > 20)
          .sort_values('rating mean', ascending=False)
          .head(10))

    # Display the number of ratings and average rating per movie
    altair_viewer.show(
        alt.hconcat(
            filtered_hist('rating count', '# ratings / movie', genre_filter),
            filtered_hist('rating mean', 'mean movie rating', genre_filter),
            genre_chart,
            data=movies_ratings))
def movie_embedding_norm(models):
    """Plot movie-embedding norm against number of ratings, one chart per model.

    Args:
        models: A single model or a list of models (presumably MFModel
            objects). Each must expose ``embeddings["movie_id"]``; a lone
            model is wrapped in a one-element list.

    Returns:
        The result of ``altair_viewer.show`` for the horizontally
        concatenated charts.

    Note:
        ``movies`` and ``movies_ratings`` are read from the enclosing scope,
        not passed in.
    """
    model_list = models if isinstance(models, list) else [models]

    frame = pd.DataFrame({
        'title': movies['title'],
        'genre': movies['genre'],
        'num_ratings': movies_ratings['rating count'],
    })

    # One interval brush is shared by every chart.
    brush = alt.selection_interval()
    panels = []
    for idx, model in enumerate(model_list):
        norm_col = f'norm{idx}'
        # Row-wise norm of the movie embedding matrix.
        frame[norm_col] = np.linalg.norm(model.embeddings["movie_id"], axis=1)

        # A fresh hover selection per chart drives the title label.
        hover = alt.selection(
            type='single', encodings=['x', 'y'], on='mouseover',
            nearest=True, empty='none')

        dot_color = alt.condition(
            brush, alt.value('#4c78a8'), alt.value('lightgray'))
        scatter = alt.Chart().mark_circle().encode(
            x='num_ratings', y=norm_col, color=dot_color)
        scatter = scatter.properties(selection=hover).add_selection(brush)

        labels = alt.Chart().mark_text(align='center', dx=5, dy=-5).encode(
            x='num_ratings',
            y=norm_col,
            text=alt.condition(hover, 'title', alt.value('')))

        panels.append(alt.layer(scatter, labels))

    return altair_viewer.show(alt.hconcat(*panels, data=frame))
def bias_line_chart(data3):
    """Show an interactive line chart of ``bias_squared`` against ``variance``.

    Args:
        data3: Data source handed straight to ``alt.Chart``; the chart
            encodes its ``variance`` field on x and ``bias_squared`` on y.

    Returns:
        The result of ``altair_viewer.show``.
    """
    line = alt.Chart(data3).mark_line()
    line = line.encode(x='variance', y='bias_squared')
    return altair_viewer.show(line.interactive())
def test_bar(data):
    """Show an interactive bar chart of field ``y`` against field ``x``.

    Args:
        data: Data source handed straight to ``alt.Chart``.

    Returns:
        The result of ``altair_viewer.show``.
    """
    bars = alt.Chart(data).mark_bar()
    bars = bars.encode(x='x', y='y')
    return altair_viewer.show(bars.interactive())
def basic_bar(data2):
    """Show a basic interactive bar chart of ``num_oscars`` per ``movies``.

    The data must already be a DataFrame or another data source that
    Altair accepts.

    Args:
        data2: Data source handed straight to ``alt.Chart``; bars are also
            colored by the ``movies`` field.

    Returns:
        The result of ``altair_viewer.show``.
    """
    encodings = {'x': 'movies', 'y': 'num_oscars', 'color': 'movies'}
    bars = alt.Chart(data2).mark_bar().encode(**encodings)
    return altair_viewer.show(bars.interactive())
def bar_chart(data2):
    """Show the ``basic_bar``-style chart with custom axis titles and width.

    Args:
        data2: Data source handed straight to ``alt.Chart``; it is encoded
            with the ``movies`` and ``num_oscars`` fields.

    Returns:
        The result of ``altair_viewer.show``.
    """
    x_axis = alt.X('movies', title="My Favorite Movies")
    y_axis = alt.Y('num_oscars', title="# of Academy Awards")
    bars = alt.Chart(data2).mark_bar().encode(x_axis, y_axis, color='movies')
    bars = bars.properties(width=200)
    return altair_viewer.show(bars.interactive())
def visualize_movie_embeddings(data, x, y):
    """Show a scatter plot of movie embeddings next to the genre chart.

    Points are colored by genre when selected through ``genre_filter``;
    hovering near a point reveals its title.

    Args:
        data: Data source shared by all concatenated charts.
        x: Encoding for the x channel.
        y: Encoding for the y channel.

    Returns:
        The result of ``altair_viewer.show``.

    Note:
        ``genre_filter`` and ``genre_chart`` are read from the enclosing
        scope, not passed in.
    """
    hover = alt.selection(
        type='single', encodings=['x', 'y'], on='mouseover',
        nearest=True, empty='none')

    point_color = alt.condition(genre_filter, "genre", alt.value("whitesmoke"))
    scatter = alt.Chart().mark_circle().encode(x=x, y=y, color=point_color)
    scatter = scatter.properties(width=600, height=600, selection=hover)

    title_text = alt.condition(hover, 'title', alt.value(''))
    labels = alt.Chart().mark_text(align='left', dx=5, dy=-5).encode(
        x=x, y=y, text=title_text)

    combined = alt.hconcat(alt.layer(scatter, labels), genre_chart, data=data)
    return altair_viewer.show(combined)
def av(chart):
    """Print the chart's JSON specification, then show it via ``altview``.

    Args:
        chart: Object providing ``to_json()``; it is passed unchanged to
            ``altview.show``.
    """
    spec_json = chart.to_json()
    print(spec_json)
    altview.show(chart)
""" base = alt.Chart().mark_bar().encode(x=alt.X(field, bin=alt.Bin(maxbins=10), title=label), y="count()", ).properties(width=300, ) return alt.layer( base.transform_filter(filter), base.encode(color=alt.value('lightgray'), opacity=alt.value(.7)), ).resolve_scale(y='independent') # Create the chart users_ratings = ( ratings.groupby('user_id', as_index=False).agg({'rating': ['count', 'mean']}).flatten_cols().merge(users, on='user_id')) # Create a chart for the count, and one for the mean. altair_viewer.show( alt.hconcat(filtered_hist('rating count', '# ratings / user', occupation_filter), filtered_hist('rating mean', 'mean user rating', occupation_filter), occupation_chart, data=users_ratings)) # Exploring the MovieLens Data (Movies) movies_ratings = movies.merge( ratings.groupby('movie_id', as_index=False).agg({'rating': ['count', 'mean']}).flatten_cols(), on='movie_id') genre_filter = alt.selection_multi(fields=['genre']) genre_chart = alt.Chart().mark_bar().encode(x="count()", y=alt.Y('genre'), color=alt.condition( genre_filter, alt.Color("genre:N"), alt.value('lightgray'))).properties(height=300, selection=genre_filter) (movies_ratings[['title', 'rating count', 'rating mean']] .sort_values('rating count', ascending=False) .head(10)) (movies_ratings[['title', 'rating count', 'rating mean']]
cornerRadiusTopLeft=3, cornerRadiusTopRight=3).encode( x='category:N', y=alt.Y('sum(positive_comments):Q', title='Sum of Positive comments', axis=alt.Axis(labelOverlap=True)), color='category:N').properties( title='Distribution of Positive Comments', width=300) chart2 = alt.Chart(df).mark_bar( cornerRadiusTopLeft=3, cornerRadiusTopRight=3).encode( x='category:N', y=alt.Y('sum(negative_comments):Q', title='Sum of Negative comments', axis=alt.Axis(labelOverlap=True)), color='category:N').properties( title='Distribution of Negative Comments', width=300) chart3 = alt.Chart(df).mark_bar( cornerRadiusTopLeft=3, cornerRadiusTopRight=3).encode( x='category:N', y=alt.Y('sum(neutral_comments):Q', title='Sum of Neutral comments', axis=alt.Axis(labelOverlap=True)), color='category:N').properties( title='Distribution of Neutral Comments', width=300) concat = alt.hconcat(chart1, chart2, chart3).resolve_scale(y='shared').configure_title( fontSize=15, offset=5, orient='top', anchor='middle') viewer.show(concat)
def open_chart_in_browser(chart: alt.Chart) -> NoReturnType:
    """Show ``chart`` through ``altview.show``.

    A small convenience wrapper so callers do not touch ``altview``
    directly.

    Args:
        chart: The Altair chart to display.
    """
    altview.show(chart)
def plot_both(county='Alle_Fylker', periodicity='day', start=None, end=None,
              plot_lib='Altair', view_plot=False, plot_title=False,
              width=800, height=500):
    """
    Function to generate a combined plot of Covid19's cumulative and newly
    reported cases from FHI/Norway.

    Parameters:
    ~~~~~~~~~~~~~~~~~~~~
    :param county: str: Name of the county to be analyzed, which could be
    'Agder', 'Innlandet', 'More_og_Romsdal', 'Nordland', 'Oslo', 'Rogaland',
    'Troms_og_Finnmark', 'Trondelag', 'Vestfold_og_Telemark', 'Vestland',
    'Viken' or 'Alle_Fylker' for all counties.
    :param periodicity: str: Periodicity to display the plot, which could be
    'day' or 'week'. Default 'day'. Only read by the 'Altair' branch.
    :param start: str or date object: The starting date for the analysis.
    :param end: str or date object: The ending date for the analysis.
    :param plot_lib: str: The name of the plotting library to be used.
    Default is 'Altair' which uses Altair. An option is 'plt' which uses
    matplotlib.
    :param view_plot: bool: The default is False to not show the plot.
    :param plot_title: bool: The default is False to not display the plot's
    title.
    :param width: int: Only used for 'Altair' plot. Define the width of the
    plotting. Default is 800.
    :param height: int: Only used for 'Altair' plot. Define the height of the
    plotting. Default is 500.

    Returns:
    ~~~~~~~~~~~~~~~~~~~~
    The layered Altair chart when plot_lib is 'Altair'; None when plot_lib
    is 'plt'.

    Raises:
    ~~~~~~~~~~~~~~~~~~~~
    ValueError: if plot_lib is neither 'plt' nor 'Altair', or (for 'Altair')
    if periodicity is neither 'day' nor 'week'.

    Notes:
    ~~~~~~~~~~~~~~~~~~~~
    - If start and end are None, then it will plot the entire data-base.
    """
    # creating Matplotlib or Seaborn plot
    if plot_lib == 'plt':
        # plotting cumulative cases
        plot_cumulative_cases(county=county, start=start, end=end,
                              plot_lib=plot_lib, view_plot=False,
                              plot_title=False)

        # setting plots together but with independent y-axis
        plt.twinx()

        # plotting daily reported cases
        plot_reported_cases(county=county, start=start, end=end,
                            plot_lib=plot_lib, view_plot=False,
                            plot_title=False)

        # setting plot's title
        if plot_title is True:
            plt.title(f'Data Vs (cumulative and daily cases) '
                      f'for {county}')

        # viewing plot
        if view_plot is True:
            plt.show()

    elif plot_lib == 'Altair':
        # setting fixed parameters
        col1 = 'Kumulativt antall'
        title1 = 'Cumulative nrs.'
        col2 = 'Nye tilfeller'
        title2 = 'New report nrs.'

        # setting prefix and suffix for the data-files
        path = 'data/'
        if periodicity == 'day':
            prefix = path + 'antall-meldte-covid-19-t_day_'
        elif periodicity == 'week':
            prefix = path + 'antall-meldte-covid-19-t_week_'
        else:
            raise ValueError("Error: The given periodicity is not valid.")
        suffix = '.csv'

        # reading data as DataFrame. `infer_datetime_format` was dropped:
        # it is deprecated since pandas 2.0 and never had any effect here
        # because no `parse_dates` is requested.
        df = pd.read_csv(filepath_or_buffer=prefix + county + suffix,
                         sep=',', index_col='Dato')

        # setting data range
        df = df[start:end]

        # fixing datetime for Altair
        df.reset_index(inplace=True)

        # periodicity was validated above, so it is either 'day' or 'week'
        if periodicity == 'day':
            df['Dato'] = pd.to_datetime(df["Dato"], dayfirst=True)

            # creating base for Altair plot as a function of daily-dates
            base = alt.Chart(df).encode(
                alt.X('Dato:T', axis=alt.Axis(format='%Y-%m-%d'),
                      scale=alt.Scale(zero=False)))
        else:
            # creating base for Altair plot as a function of weekly-dates
            base = alt.Chart(df).encode(
                alt.X('Dato', scale=alt.Scale(zero=False)))

        # drawing bars
        bar = base.mark_bar(opacity=0.3).encode(
            y=alt.Y(col2, axis=alt.Axis(title=title2, orient='left')),
        )

        # drawing line
        line = base.mark_line(color='red').encode(
            y=alt.Y(col1, axis=alt.Axis(title=title1, orient='right')),
        )

        # creating x-axis selections
        nearest = alt.selection(type='single', nearest=True, on='mouseover',
                                fields=['Dato'], empty='none')

        # Transparent selectors across the chart. This is what tells us
        # the x-value of the cursor
        selectors = alt.Chart(df).mark_point().encode(
            x='Dato',
            opacity=alt.value(0),
        ).add_selection(nearest)

        # drawing points on the line, and highlight based on selection
        points = line.mark_point().encode(
            opacity=alt.condition(nearest, alt.value(1), alt.value(0)))

        # drawing a rule at the location of the selection
        rules = alt.Chart(df).mark_rule(color='gray').encode(
            x='Dato',
        ).transform_filter(nearest)

        # setting dual-axis
        chart = alt.layer(line, bar, selectors, points,
                          rules).resolve_scale(y='independent')

        # setting properties and interactiveness; the title is only added
        # on request, matching the documented default and the 'plt' branch
        chart_props = {'width': width, 'height': height}
        if plot_title is True:
            chart_props['title'] = f'Covid19 reports for {county}'
        chart = chart.properties(**chart_props).interactive()

        # adding tooltips
        chart = chart.encode(tooltip=[
            alt.Tooltip('Dato', title='Date'),
            alt.Tooltip(col1, title=title1),
            alt.Tooltip(col2, title=title2)
        ])

        # viewing the plot
        if view_plot is True:
            altair_viewer.show(chart)

        return chart

    else:
        raise ValueError(f"Error: Plot type {plot_lib} not implemented.")