def generate_bokeh_umap(self, media_type):
    """Build, show and return an interactive Bokeh page of a UMAP embedding.

    The documents in ``self.vectorized_out`` are projected to 2-D with UMAP
    (Hellinger metric), coloured by their dominant topic taken from
    ``self.doc_topic_dists`` and rendered as a scatter plot with a hover
    tool, a tap callback, a topic slider and a keyword search box.

    Reads from ``self``:
        top_words_map: mapping whose keys are used as topic labels and whose
            values are handed to ``input_callback``.
        vectorized_out: matrix passed to ``UMAP.fit_transform``.
        doc_topic_dists: per-document topic weights; the column with the
            largest weight in each row becomes the document's topic.
        original_df: DataFrame providing the per-document metadata columns.
        nr_of_topics: upper bound and initial value of the topic slider.

    Args:
        media_type: ``'videos'`` or ``'articles'``; selects which metadata
            columns are plotted and which tap-callback JS code is used.

    Returns:
        The Bokeh layout that was shown.

    Raises:
        ValueError: if ``media_type`` is not one of the supported values.
    """
    # Fail fast: without this check an unknown media type only surfaced as
    # a NameError on `source`/`hover` after the expensive UMAP fit.
    if media_type not in ('videos', 'articles'):
        raise ValueError(
            "media_type must be 'videos' or 'articles', got {!r}".format(
                media_type))

    output_notebook()

    labels = list(self.top_words_map.keys())
    topics = list(self.top_words_map.values())

    # Pick a larger neighbourhood for larger inputs.  The second test must
    # be `elif`: with a plain `if` the n_neighbors=100 reducer was always
    # replaced by the n_neighbors=50 one.
    # NOTE(review): the size tested here is the number of topic labels, not
    # the number of documents -- confirm this is the intended measure.
    n_labels = len(labels)
    if n_labels >= 20000:
        reducer = umap.UMAP(n_neighbors=100, metric='hellinger')
    elif n_labels >= 5000:
        reducer = umap.UMAP(n_neighbors=50, metric='hellinger')
    else:
        reducer = umap.UMAP(metric='hellinger')

    X = self.vectorized_out.copy()
    X_embedded = reducer.fit_transform(X)

    # Dominant topic per document: column label of the row-wise maximum.
    y_labels = pd.DataFrame(self.doc_topic_dists).idxmax(axis=1).values
    topic_names = ["Topic " + str(labels[i]) for i in y_labels]

    df = self.original_df.copy()

    # Columns and tooltips shared by both media types.
    data = dict(
        x=X_embedded[:, 0],
        y=X_embedded[:, 1],
        x_backup=X_embedded[:, 0],
        y_backup=X_embedded[:, 1],
        desc=y_labels,
        titles=df['title'],
        text=df['text'],
        labels=topic_names,
        links=df['link'],
    )
    tooltips = [
        ("Id", "@ids{safe}"),
        ("Title", "@titles{safe}"),
        ("Published", "@published_times{safe}"),
    ]

    if media_type == 'videos':
        data.update(
            ids=df['id'],
            published_times=df['first_airing'],
            publication_end_times=df['publication_end_time'],
            media_availables=df['media_available'],
            duration_minutes=df['duration_minutes'],
            finnpanel_genres=df['finnpanel_genre'],
        )
        tooltips += [
            ("Publication ends", "@publication_end_times{safe}"),
            ("Currently available", "@media_availables{safe}"),
            ("Duration (minutes)", "@duration_minutes{safe}"),
            ("Finnpanel genres", "@finnpanel_genres{safe}"),
        ]
    else:  # 'articles'
        data.update(
            ids=df.index,
            published_times=df['published_time'].dt.strftime(
                '%Y-%m-%d %H:%M'),
        )
    tooltips.append(("Link", "@links"))

    source = ColumnDataSource(data=data)
    hover = HoverTool(tooltips=tooltips, point_policy="follow_mouse")

    # Map topic indices onto the 20-colour categorical palette.
    mapper = linear_cmap(field_name='desc',
                         palette=Category20[20],
                         low=min(y_labels),
                         high=max(y_labels))

    plot = figure(
        plot_width=1200,
        plot_height=850,
        tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset', 'save',
               'tap'],
        title="Clustering of the content with UMAP and NMF",
        toolbar_location="above")
    plot.scatter('x',
                 'y',
                 size=5,
                 source=source,
                 fill_color=mapper,
                 line_alpha=0.3,
                 line_color="black",
                 legend='labels')
    plot.legend.background_fill_alpha = 0.6

    # Banner that the slider/search callback fills with topic keywords.
    text_banner = Paragraph(
        text='Keywords: Slide to specific cluster to see the keywords.',
        height=45)
    input_callback_1 = input_callback(plot, source, text_banner, topics,
                                      self.nr_of_topics)

    # Panel showing details of the currently tapped point.
    div_curr = Div(
        text="""Click on a plot to see the link to the article.""",
        height=150)
    if media_type == 'videos':
        selected_code = selected_code_videos()
    else:
        selected_code = selected_code_articles()
    callback_selected = CustomJS(
        args=dict(source=source, current_selection=div_curr),
        code=selected_code)
    taptool = plot.select(type=TapTool)
    taptool.callback = callback_selected

    # Widgets; both drive the same JS callback.
    slider = Slider(start=0,
                    end=self.nr_of_topics,
                    value=self.nr_of_topics,
                    step=1,
                    title="Topic #")
    slider.js_on_change("value", input_callback_1)
    keyword = TextInput(title="Search:")
    keyword.js_on_change("value", input_callback_1)

    # The callback reads the current widget state through its args.
    input_callback_1.args["text"] = keyword
    input_callback_1.args["slider"] = slider

    # Styling.  The font stack carries no trailing ';' -- a semicolon is
    # not valid inside a CSS property value.
    font_family = 'Helvetica Neue, Helvetica, Arial, sans-serif'

    slider.sizing_mode = "stretch_width"
    slider.margin = 15

    keyword.sizing_mode = "scale_both"
    keyword.margin = 15

    div_curr.style = {
        'color': '#BF0A30',
        'font-family': font_family,
        'font-size': '1.1em'
    }
    div_curr.sizing_mode = "scale_both"
    div_curr.margin = 20

    text_banner.style = {
        'color': '#0269A4',
        'font-family': font_family,
        'font-size': '1.1em'
    }
    text_banner.sizing_mode = "scale_both"
    text_banner.margin = 20

    plot.sizing_mode = "scale_both"
    plot.margin = 5

    # Page layout: controls on top, then keywords, selection, plot.
    page = layout([
        [slider, keyword],
        [text_banner],
        [div_curr],
        [plot],
    ])
    page.sizing_mode = "scale_both"

    output_file('t-sne_interactive_streamlit.html')
    show(page)

    return page
# WIDGETS slider = Slider(start=0, end=20, value=20, step=1, title="Cluster #", callback=input_callback_1) keyword = TextInput(title="Search:", callback=input_callback_1) # pass call back arguments input_callback_1.args["text"] = keyword input_callback_1.args["slider"] = slider # STYLE slider.sizing_mode = "stretch_width" slider.margin=15 keyword.sizing_mode = "scale_both" keyword.margin=15 div_curr.style={'color': '#BF0A30', 'font-family': 'Helvetica Neue, Helvetica, Arial, sans-serif;', 'font-size': '1.1em'} div_curr.sizing_mode = "scale_both" div_curr.margin = 20 text_banner.style={'color': '#0269A4', 'font-family': 'Helvetica Neue, Helvetica, Arial, sans-serif;', 'font-size': '1.1em'} text_banner.sizing_mode = "scale_both" text_banner.margin = 20 plot.sizing_mode = "scale_both" plot.margin = 5 r = row(div_curr,text_banner) r.sizing_mode = "stretch_width"