def generate_bokeh_umap(self, media_type):
    """Build, save and show an interactive Bokeh page around a UMAP embedding.

    The vectorised documents in ``self.vectorized_out`` are projected to 2-D
    with UMAP, every point is coloured by the column with the highest value
    in ``self.doc_topic_dists``, and the scatter plot is combined with a
    topic slider, a search box, a keyword banner and a "current selection"
    panel.

    Reads ``self.top_words_map``, ``self.vectorized_out``,
    ``self.doc_topic_dists``, ``self.original_df`` and ``self.nr_of_topics``.

    Parameters
    ----------
    media_type : str
        Either ``'videos'`` or ``'articles'``; selects which columns of
        ``self.original_df`` feed the data source, the hover tooltips and
        the tap callback code.

    Returns
    -------
    The Bokeh layout object that was passed to ``show``.

    Raises
    ------
    ValueError
        If ``media_type`` is not one of the two supported values.

    Notes
    -----
    Side effects: calls ``output_notebook()``, sets the output file to
    ``'t-sne_interactive_streamlit.html'`` and calls ``show`` on the layout.
    """
    # Fail before the (expensive) UMAP fit; previously an unknown media type
    # surfaced much later as a NameError on ``hover``/``source``.
    if media_type not in ('videos', 'articles'):
        raise ValueError(
            "media_type must be 'videos' or 'articles', got {!r}".format(
                media_type))

    output_notebook()

    # Keys are the topic labels, values are passed on to input_callback.
    labels = list(self.top_words_map)
    topics = list(self.top_words_map.values())

    # NOTE(review): the thresholds are compared against the number of entries
    # in top_words_map, not the number of documents -- confirm this is the
    # intended size measure.
    n_labels = len(labels)
    if n_labels >= 20000:
        reducer = umap.UMAP(n_neighbors=100, metric='hellinger')
    elif n_labels >= 5000:
        # ``elif`` matters: a plain ``if`` here overrode the branch above.
        reducer = umap.UMAP(n_neighbors=50, metric='hellinger')
    else:
        reducer = umap.UMAP(metric='hellinger')

    X = self.vectorized_out.copy()
    X_embedded = reducer.fit_transform(X)

    # Column index of the largest value in each row of doc_topic_dists.
    y_labels = pd.DataFrame(self.doc_topic_dists).idxmax(axis=1).values
    y_labels_new = [labels[i] for i in y_labels]

    df = self.original_df.copy()

    # Columns shared by both media types.
    data = dict(
        x=X_embedded[:, 0],
        y=X_embedded[:, 1],
        x_backup=X_embedded[:, 0],
        y_backup=X_embedded[:, 1],
        desc=y_labels,
        titles=df['title'],
        text=df['text'],
        labels=["Topic " + str(x) for x in y_labels_new],
        links=df['link'],
    )

    # Media-specific columns, hover tooltips and tap-callback code.
    if media_type == 'videos':
        data.update(
            ids=df['id'],
            published_times=df['first_airing'],
            publication_end_times=df['publication_end_time'],
            media_availables=df['media_available'],
            duration_minutes=df['duration_minutes'],
            finnpanel_genres=df['finnpanel_genre'],
        )
        tooltips = [
            ("Id", "@ids{safe}"),
            ("Title", "@titles{safe}"),
            ("Published", "@published_times{safe}"),
            ("Publication ends", "@publication_end_times{safe}"),
            ("Currently available", "@media_availables{safe}"),
            ("Duration (minutes)", "@duration_minutes{safe}"),
            ("Finnpanel genres", "@finnpanel_genres{safe}"),
            ("Link", "@links"),
        ]
        selected_code = selected_code_videos()
    else:  # 'articles'
        data.update(
            ids=df.index,
            published_times=df['published_time'].dt.strftime(
                '%Y-%m-%d %H:%M'),
        )
        tooltips = [
            ("Id", "@ids{safe}"),
            ("Title", "@titles{safe}"),
            ("Published", "@published_times{safe}"),
            ("Link", "@links"),
        ]
        selected_code = selected_code_articles()

    source = ColumnDataSource(data=data)
    hover = HoverTool(tooltips=tooltips, point_policy="follow_mouse")

    # map colors
    mapper = linear_cmap(field_name='desc',
                         palette=Category20[20],
                         low=min(y_labels),
                         high=max(y_labels))

    # prepare the figure
    plot = figure(
        plot_width=1200,
        plot_height=850,
        tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset', 'save',
               'tap'],
        title="Clustering of the content with UMAP and NMF",
        toolbar_location="above")

    # plot settings
    plot.scatter('x',
                 'y',
                 size=5,
                 source=source,
                 fill_color=mapper,
                 line_alpha=0.3,
                 line_color="black",
                 legend='labels')
    plot.legend.background_fill_alpha = 0.6

    # Keywords
    text_banner = Paragraph(
        text='Keywords: Slide to specific cluster to see the keywords.',
        height=45)
    input_callback_1 = input_callback(plot, source, text_banner, topics,
                                      self.nr_of_topics)

    # currently selected article
    div_curr = Div(
        text="""Click on a plot to see the link to the article.""",
        height=150)
    callback_selected = CustomJS(
        args=dict(source=source, current_selection=div_curr),
        code=selected_code)
    taptool = plot.select(type=TapTool)
    taptool.callback = callback_selected

    # WIDGETS
    slider = Slider(start=0,
                    end=self.nr_of_topics,
                    value=self.nr_of_topics,
                    step=1,
                    title="Topic #")
    slider.js_on_change("value", input_callback_1)
    keyword = TextInput(title="Search:")
    keyword.js_on_change("value", input_callback_1)

    # pass call back arguments
    input_callback_1.args["text"] = keyword
    input_callback_1.args["slider"] = slider

    # STYLE
    slider.sizing_mode = "stretch_width"
    slider.margin = 15

    keyword.sizing_mode = "scale_both"
    keyword.margin = 15

    # NOTE(review): the font-family values end with ';' inside the string;
    # kept byte-identical here -- confirm the browser actually applies them.
    div_curr.style = {
        'color': '#BF0A30',
        'font-family': 'Helvetica Neue, Helvetica, Arial, sans-serif;',
        'font-size': '1.1em'
    }
    div_curr.sizing_mode = "scale_both"
    div_curr.margin = 20

    text_banner.style = {
        'color': '#0269A4',
        'font-family': 'Helvetica Neue, Helvetica, Arial, sans-serif;',
        'font-size': '1.1em'
    }
    text_banner.sizing_mode = "scale_both"
    text_banner.margin = 20

    plot.sizing_mode = "scale_both"
    plot.margin = 5

    # LAYOUT OF THE PAGE
    page = layout([
        [slider, keyword],
        [text_banner],
        [div_curr],
        [plot],
    ])
    page.sizing_mode = "scale_both"

    # show
    output_file('t-sne_interactive_streamlit.html')
    show(page)

    return page
def _log10_or_nan(image):
    """Return log10 of *image* as floats, NaN wherever the value is not > 0."""
    values = np.asarray(image, dtype=float)
    # log10 of zero/negative entries is discarded by np.where below, so the
    # warnings it would emit are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(values > 0, np.log10(values), np.nan)


def _expand_limit(limit, shape):
    """Expand a limit (None, scalar or array) to a float array of *shape*."""
    if limit is None:
        return np.full(shape, np.nan)
    return np.array(np.broadcast_to(np.asarray(limit, dtype=float), shape))


def show_camera(content, geom, pad_width, pad_height, label, titles=None,
                showlog=True, display_range=None,
                content_lowlim=None, content_upplim=None):
    """Build the bokeh widgets for a camera display of pixel-wise data.

    Parameters
    ----------
    content: pixel-wise quantity to be plotted, ndarray with shape
        (N, number_of_pixels) where N is the number of different sets of
        pixel values, for example N different data runs. The shape can also
        be just (number_of_pixels), in case a single camera display is to be
        shown.
    geom: camera geometry (a deep copy is used internally)
    pad_width: width in pixels of each of the 3 pads in the plot
    pad_height: height in pixels of each of the 3 pads in the plot
    label: string to label the quantity which is displayed, the same for the
        N sets of pixels inside "content"
    titles: list of N strings, with the title specific to each of the sets
        of pixel values to be displayed: for example, indicating run numbers.
        Defaults to empty titles.
    showlog: if True, a second tab with log10 of "content" is added to the
        camera display (non-positive values are shown as NaN)
    display_range: (min, max) range of "content" to be displayed. By default
        the 0.001 and 0.999 quantiles of "content" are used.
    content_lowlim: scalar or ndarray broadcastable to the shape of
        "content": lowest value of "content" which is considered healthy.
        It is drawn as a dashed orange line in p2.
    content_upplim: highest value considered healthy, same as above (drawn
        as a dashed red line)

    Returns
    -------
    [slider, p1, range_slider, p2, p3]: three bokeh figures, intended for
    showing them on the same row, and two sliders, one for the run numbers
    (or whatever "sets" of data we are displaying) and the other for the
    z-range of the plots. "slider" is None when there is only one set.
    p1 is the camera display (tabs with "content" in linear and, if showlog,
    logarithmic scale)
    p2: content vs. pixel
    p3: histogram of content (with one entry per pixel)
    """
    # Work on a private copy of the geometry object.
    camgeom = copy.deepcopy(geom)

    # Normalise to (numsets, number_of_pixels) so that a single 1-D set is
    # handled exactly like N sets (this also keeps the limits per-pixel).
    images = np.atleast_2d(content)
    # numsets is the number of different sets of pixel data to be displayed
    numsets = images.shape[0]
    allimages = list(images)

    if titles is None:
        titles = [''] * numsets

    if display_range is not None:
        display_min = display_range[0]
        display_max = display_range[1]
    else:
        # By default we plot the range which contains 99.8% of all values,
        # so that outliers do not prevent us from seeing the bulk of the
        # data:
        display_min = np.nanquantile(allimages, 0.001)
        display_max = np.nanquantile(allimages, 0.999)

    cam = CameraDisplay(camgeom, display_min, display_max,
                        label, titles[0], use_notebook=False, autoshow=False)
    cam.image = allimages[0]
    cam.figure.title.text = titles[0]

    allimageslog = []
    camlog = None
    source1log = None
    color_mapper_log = None
    titlelog = None

    if showlog:
        # workaround as long as log z-scale is not implemented in bokeh
        # camera: display log10(content) on a linear scale.
        allimageslog = [_log10_or_nan(image) for image in allimages]
        camlog = CameraDisplay(camgeom,
                               np.nanquantile(allimageslog, 0.001),
                               np.nanquantile(allimageslog, 0.999),
                               label, titles[0], use_notebook=False,
                               autoshow=False)
        camlog.image = allimageslog[0]
        camlog.figure.title.text = titles[0]
        source1log = camlog.datasource
        color_mapper_log = camlog._color_mapper
        titlelog = camlog.figure.title

    # Extra per-pixel columns for the hover tool, taken from the first three
    # items returned by get_pixel_location.
    cluster_i = []
    cluster_j = []
    pix_id_in_cluster = []
    for pix in camgeom.pix_id:
        location = get_pixel_location(pix)
        cluster_i.append(location[0])
        cluster_j.append(location[1])
        pix_id_in_cluster.append(location[2])

    for display in (cam, camlog):
        if display is None:
            continue
        display.datasource.add(list(display.geom.pix_id), 'pix_id')
        display.datasource.add(cluster_i, 'cluster_i')
        display.datasource.add(cluster_j, 'cluster_j')
        display.datasource.add(pix_id_in_cluster, 'pix_id_in_cluster')

        display.figure.plot_width = pad_width
        display.figure.plot_height = int(pad_height * 0.85)
        display.figure.grid.visible = False
        display.figure.axis.visible = True
        display.figure.xaxis.axis_label = 'X position (m)'
        display.figure.yaxis.axis_label = 'Y position (m)'
        display.figure.add_tools(
            HoverTool(tooltips=[('pix_id', '@pix_id'),
                                ('value', '@image'),
                                ('cluster (i,j)',
                                 '(@cluster_i, @cluster_j)'),
                                ('pix # in cluster',
                                 '@pix_id_in_cluster')],
                      mode='mouse', point_policy='snap_to_data'))

    tab1 = Panel(child=cam.figure, title='linear')
    if showlog:
        tab2 = Panel(child=camlog.figure, title='logarithmic')
        p1 = Tabs(tabs=[tab1, tab2])
    else:
        p1 = Tabs(tabs=[tab1])
    p1.margin = (0, 0, 0, 25)

    # p2: content vs. pixel id, with the "healthy" limits as dashed lines.
    p2 = figure(background_fill_color='#ffffff',
                y_range=(display_min, display_max),
                x_axis_label='Pixel id',
                y_axis_label=label)
    p2.min_border_top = 60
    p2.min_border_bottom = 70

    source2 = ColumnDataSource(
        data=dict(pix_id=cam.geom.pix_id, value=cam.image))
    pixel_data = p2.circle(x='pix_id', y='value', size=2, source=source2)

    # One row of limits per set of pixel values; missing limits become NaN.
    lowlim = _expand_limit(content_lowlim, images.shape)
    upplim = _expand_limit(content_upplim, images.shape)

    source2_lowlim = ColumnDataSource(
        data=dict(pix_id=cam.geom.pix_id, value=lowlim[0]))
    p2.line(x='pix_id', y='value', source=source2_lowlim,
            line_dash='dashed', color='orange', line_width=2)

    source2_upplim = ColumnDataSource(
        data=dict(pix_id=cam.geom.pix_id, value=upplim[0]))
    p2.line(x='pix_id', y='value', source=source2_upplim,
            line_dash='dashed', color='red')

    p2.add_tools(
        HoverTool(tooltips=[('(pix_id, value)', '(@pix_id, @value)')],
                  mode='mouse', point_policy='snap_to_data',
                  renderers=[pixel_data]))
    p2.y_range = Range1d(display_min, display_max)

    # p3: histogram of content.
    # We define 100 bins between display_min and display_max
    # Note that values beyond that range won't be histogrammed and hence will
    # not appear on the "p3" figure below.
    nbins = 100
    allhists = []
    alledges = []
    for image in allimages:
        hist, edges = np.histogram(image[~np.isnan(image)], bins=nbins,
                                   range=(display_min, display_max))
        allhists.append(hist)
        alledges.append(edges)

    # The bars start at 0.7 (not 0) because the y axis is logarithmic.
    source3 = ColumnDataSource(
        data=dict(top=allhists[0],
                  bottom=0.7 * np.ones_like(allhists[0]),
                  left=alledges[0][:-1],
                  right=alledges[0][1:]))

    p3 = figure(background_fill_color='#ffffff',
                y_range=(0.7, np.max(allhists) * 1.1),
                x_range=(display_min, display_max),
                x_axis_label=label,
                y_axis_label='Number of pixels',
                y_axis_type='log')
    p3.quad(top='top', bottom='bottom', left='left', right='right',
            source=source3)

    cdsdata = dict(z=allimages, hist=allhists, edges=alledges, titles=titles)
    # BEWARE!! these have to be lists of arrays. Not 2D numpy arrays!!
    cdsdata['lowlim'] = list(lowlim)
    cdsdata['upplim'] = list(upplim)
    if showlog:
        cdsdata['zlog'] = allimageslog

    cds_allimages = ColumnDataSource(data=cdsdata)

    # One has to add here everything that must change when moving the slider:
    callback = CustomJS(args=dict(source1=cam.datasource,
                                  source1log=source1log,
                                  source2=source2,
                                  source2_lowlim=source2_lowlim,
                                  source2_upplim=source2_upplim,
                                  source3=source3,
                                  zz=cds_allimages,
                                  title=cam.figure.title,
                                  titlelog=titlelog,
                                  showlog=showlog),
                        code="""
        var slider_value = cb_obj.value
        var z = zz.data['z']
        var varzlow = zz.data['lowlim']
        var varzupp = zz.data['upplim']
        var edges = zz.data['edges']
        var hist = zz.data['hist']
        for (var i = 0; i < source1.data['image'].length; i++) {
            source1.data['image'][i] = z[slider_value-1][i]
            if (showlog) {
                var zlog = zz.data['zlog']
                source1log.data['image'][i] = zlog[slider_value-1][i]
            }
            source2.data['value'][i] = source1.data['image'][i]
            source2_lowlim.data['value'][i] = varzlow[slider_value-1][i]
            source2_upplim.data['value'][i] = varzupp[slider_value-1][i]
        }
        for (var j = 0; j < source3.data['top'].length; j++) {
            source3.data['top'][j] = hist[slider_value-1][j]
            source3.data['left'][j] = edges[slider_value-1][j]
            source3.data['right'][j] = edges[slider_value-1][j+1]
        }
        title.text = zz.data['titles'][slider_value-1]
        source1.change.emit()
        if (showlog) {
            titlelog.text = title.text
            source1log.change.emit()
        }
        source2.change.emit()
        source2_lowlim.change.emit()
        source2_upplim.change.emit()
        source3.change.emit()
    """)

    slider = None
    if numsets > 1:
        slider_height = 300
        # WARNING: the html won't look nice for number of sets much larger
        # than 300! But in this way we avoid that the slider skips elements:
        if numsets > 299:
            slider_height = numsets + 1
        slider = Slider(start=1, end=numsets, value=1, step=1, title="run",
                        orientation='vertical', show_value=False,
                        height=slider_height)
        slider.margin = (0, 0, 0, 35)
        slider.js_on_change('value', callback)

    # The range slider rescales the colour mappers of both camera displays;
    # the logarithmic one receives log10 of the selected range.
    callback2 = CustomJS(args=dict(color_mapper=cam._color_mapper,
                                   color_mapper_log=color_mapper_log,
                                   showlog=showlog),
                         code="""
        var range = cb_obj.value
        color_mapper.low = range[0]
        color_mapper.high = range[1]
        color_mapper.change.emit()
        if (showlog) {
            if (range[0] > 0.)
                color_mapper_log.low = Math.log(range[0])/Math.LN10
            color_mapper_log.high = Math.log(range[1])/Math.LN10
            color_mapper_log.change.emit()
        }
    """)

    step = (display_max - display_min) / 100.
    range_slider = RangeSlider(start=display_min, end=display_max,
                               value=(display_min, display_max), step=step,
                               title="z_range", orientation='vertical',
                               direction='rtl', height=300,
                               show_value=False)
    range_slider.js_on_change('value', callback2)

    return [slider, p1, range_slider, p2, p3]
# Run the JS returned by selected_code() whenever a point is tapped; it gets
# the data source and the "current selection" Div as arguments.
callback_selected = CustomJS(
    args=dict(source=source, current_selection=div_curr),
    code=selected_code())
taptool = plot.select(type=TapTool)
taptool.callback = callback_selected

# WIDGETS
# The callback is attached with js_on_change rather than the widget
# ``callback=`` keyword, which newer Bokeh releases no longer accept.
slider = Slider(start=0, end=20, value=20, step=1, title="Cluster #")
slider.js_on_change("value", input_callback_1)
keyword = TextInput(title="Search:")
keyword.js_on_change("value", input_callback_1)

# pass call back arguments
input_callback_1.args["text"] = keyword
input_callback_1.args["slider"] = slider

# STYLE
slider.sizing_mode = "stretch_width"
slider.margin = 15

keyword.sizing_mode = "scale_both"
keyword.margin = 15

div_curr.style = {
    'color': '#BF0A30',
    'font-family': 'Helvetica Neue, Helvetica, Arial, sans-serif;',
    'font-size': '1.1em',
}
div_curr.sizing_mode = "scale_both"
div_curr.margin = 20

text_banner.style = {
    'color': '#0269A4',
    'font-family': 'Helvetica Neue, Helvetica, Arial, sans-serif;',
    'font-size': '1.1em',
}
text_banner.sizing_mode = "scale_both"
text_banner.margin = 20

plot.sizing_mode = "scale_both"
plot.margin = 5