def prep_palette(self, pname, binverse=False):
    """
    Look up a colour palette from ``palettes`` by its display name.

    Names that are not recognised fall back to the 256-step grey palette.

    :param pname: palette name, e.g. ``'Viridis256'`` or ``'Spectral11'``
    :param binverse: when truthy, the palette is returned in reverse order
    :return: the selected palette
    """
    # (display name, generator function in ``palettes``) for 256-step palettes.
    generated = (
        ('Greys256', 'grey'),
        ('Inferno256', 'inferno'),
        ('Magma256', 'magma'),
        ('Plasma256', 'plasma'),
        ('Viridis256', 'viridis'),
        ('Cividis256', 'cividis'),
        ('Turbo256', 'turbo'),
    )
    # (display name, family, size) for entries of ``palettes.small_palettes``.
    fixed_size = (
        ('Bokeh8', 'Bokeh', 8),
        ('Spectral11', 'Spectral', 11),
        ('RdGy11', 'RdGy', 11),
        ('PiYG11', 'PiYG', 11),
    )

    # Default used when no entry below matches.
    chosen = palettes.grey(256)

    for label, maker in generated:
        if pname == label:
            chosen = getattr(palettes, maker)(256)
            break
    else:
        for label, family, size in fixed_size:
            if pname == label:
                chosen = palettes.small_palettes[family][size]
                break

    return chosen[::-1] if binverse else chosen
def test_cmap_generator_function():
    """Each 256-step generator in ``pal`` must equal its named palette constant."""
    # (generator function name, palette constant name), checked in this order.
    expected = (
        ('viridis', 'Viridis256'),
        ('magma', 'Magma256'),
        ('plasma', 'Plasma256'),
        ('inferno', 'Inferno256'),
        ('gray', 'Greys256'),
        ('grey', 'Greys256'),
        ('turbo', 'Turbo256'),
    )
    for generator, constant in expected:
        assert getattr(pal, generator)(256) == getattr(pal, constant)
def test_cmap_generator_function():
    """Check the palette generators in ``pal`` against the named constants.

    Covers the 256-step generators and one ``diverging_palette`` combination.
    """
    # (generator function name, palette constant name), checked in this order.
    expected = (
        ('viridis', 'Viridis256'),
        ('magma', 'Magma256'),
        ('plasma', 'Plasma256'),
        ('inferno', 'Inferno256'),
        ('gray', 'Greys256'),
        ('grey', 'Greys256'),
        ('turbo', 'Turbo256'),
    )
    for generator, constant in expected:
        assert getattr(pal, generator)(256) == getattr(pal, constant)

    # Two 9-colour palettes joined at the midpoint: the first as-is followed
    # by the second reversed.
    combined = pal.diverging_palette(pal.Reds9, pal.Greys9, n=18, midpoint=0.5)
    assert combined == pal.Reds9 + pal.Greys9[::-1]
def prepare_colors(labels, colors):
    """
    Work out the label set and a colour for every label index.

    :param list labels: Label of each data point. numpy.ndarray is accepted.
        If None, all data are treated as belonging to a single label ``0``.
        Labels are assumed to start at 0, so the number of labels is taken
        to be ``labels.max() + 1`` (gaps in the label values still count).
    :param list colors: Colours to use. numpy.ndarray is accepted.
        If None, colours are chosen automatically.
        A single string is repeated once per label index.
    :returns: n_labels, unique_labels, colors
        - n_labels : Largest label value plus one
        - unique_labels : Sorted array of the distinct labels
        - colors : Colour codes. When chosen automatically there is exactly
          one per label index (length ``n_labels``); a user supplied sequence
          is returned unchanged.
    :raises ValueError: if ``colors`` has fewer than ``n_labels`` entries, or
        if colours must be generated for more than 256 labels.

    Usage

        >>> labels = [0, 0, 1, 1, 2, 5]
        >>> colors = None
        >>> n_labels, unique_labels, colors = prepare_colors(labels, colors)
        >>> print(n_labels)
        6
    """
    # Find the unique labels and derive the number of labels.
    if labels is not None:
        unique_labels = np.unique(labels)
    else:
        # The ``np.int`` alias was removed in NumPy 1.24; the builtin ``int``
        # selects the same default integer dtype on every NumPy version.
        unique_labels = np.zeros(1, dtype=int)
    n_labels = unique_labels.max() + 1

    # Colours supplied by the caller only need validating.
    if colors is not None:
        if isinstance(colors, str):
            colors = [colors] * n_labels
        if len(colors) < n_labels:
            raise ValueError(
                f'There exists {n_labels}.'
                ' However, the length of colors is too short'
            )
        return n_labels, unique_labels, colors

    # Otherwise pick a palette: Set1 for up to 9 labels, turbo up to 256.
    if n_labels <= 9:
        colors = Set1[9][:n_labels]
    elif n_labels > 256:
        raise ValueError(f'There exists {n_labels}, too many labels')
    else:
        colors = turbo(n_labels)
    return n_labels, unique_labels, colors
# print(mcs[pd.to_datetime('2020-11-22'):])
# Span annotation placed at the date currently selected by ``itr``.
iqr_d = Span(
    location=pd.to_datetime(dates[itr]),
    dimension='height',
    line_color='grey',
    line_width=2,
    line_alpha=0.3,
)
# pos = nx.spring_layout(G,pos=fixed_positions, fixed = init_names)
# for i in range(0, len(init_x), 2):
#     print("x:", init_x[i:i+2])
#     print("y:", init_y[i:i+2])

# One turbo colour per distinct category; anything unmapped is drawn black.
unique_categories = pd.Series(init_categories).unique()
pal_len = len(unique_categories)
new_pal = turbo(pal_len)
init_colors = factor_cmap('category', new_pal, unique_categories,
                          nan_color='black')

# Column data for the initial node layout.
ds = ColumnDataSource(dict(
    x=init_x,
    y=init_y,
    name=init_names,
    full_name=init_full_names,
    sector=init_sectors,
    category=init_categories,
    size=init_sizes,
    label_offset=init_offsets,
))
mt.cols().show() mt.rows().show() # mt qc check mt_qc = hl.sample_qc(mt) p = hl.plot.histogram(mt_qc.sample_qc.call_rate, range=(0.88, 1), legend='Call Rate') p_2 = hl.plot.histogram(mt_qc.sample_qc.gq_stats.mean, legend='Mean Sample GQ') # PCA columns = mt.cols() pca_scores = columns.population_inference.pca_scores labels = columns.population_inference.pop pops = list(set(labels.collect())) mapper = CategoricalColorMapper(palette=turbo(8), factors=pops) # plot the first 5 PCs p = hl.plot.scatter( pca_scores[0], pca_scores[1], label=labels, title='PCA', xlabel='PC1', ylabel='PC2', collect_all=True, colors=mapper, ) p = hl.plot.scatter( pca_scores[1],
def scheduling_plot(result):
    """
    Draw a Gantt chart from a scheduling result file and save it as a PNG.

    The first token of the first line is the number of processes. Every
    following line is read as ``<process id> <start> <end>`` (whitespace
    separated integers); the end value of the last line is used as the
    overall finish time and determines the x-axis extent.

    :param result: path of the scheduling result text file
    :return: None; the chart is written to ``gantt_chart.png``
    """
    # Read the result file, stripping whitespace. Blank lines are dropped so
    # that a stray empty line cannot break the token parsing below.
    with open(result, 'rt') as file:
        rows = [line.strip() for line in file if line.strip()]

    # Number of processes (header line); the rest are execution segments.
    num = int(rows[0].split()[0])
    segments = [row.split() for row in rows[1:]]

    # Overall finish time, taken from the last segment.
    end_time = int(segments[-1][2])

    # Create the plot; its size scales with finish time and process count.
    _, gantt = plt.subplots(figsize=(end_time * 0.5, num * 1.5))
    gantt.set_xlim(0, end_time + 1)
    gantt.set_ylim(0, (PROC_HEIGHT * num) + (PROC_SPACING * (num + 1)))
    gantt.set_xlabel("Time")
    gantt.set_ylabel("Process")

    # One y tick per process, placed at the position given by ``bar_mid``.
    process_ids = list(range(1, num + 1))
    gantt.set_yticks([bar_mid(pid) for pid in process_ids])
    gantt.set_yticklabels(process_ids)

    # One x tick per time unit.
    time_ticks = list(range(end_time + 1))
    gantt.set_xticks(time_ticks)
    gantt.set_xticklabels(time_ticks)
    gantt.grid(True)

    # One colour per process. The palette does not change between segments,
    # so it is built once instead of on every loop iteration.
    palette = turbo(num)

    # Draw one bar per execution segment.
    for segment in segments:
        pid = int(segment[0])
        begin = int(segment[1])
        duration = int(segment[2]) - begin
        gantt.broken_barh([(begin, duration)],
                          (bar_bottom(pid), PROC_HEIGHT),
                          color=palette[pid - 1])

    # Reduce surrounding whitespace.
    plt.tight_layout()

    # Showing the figure must stay disabled when the result is saved to file.
    # plt.show()

    # Save the result as a PNG file.
    plt.savefig("gantt_chart.png")
def generate_correlation_graph(correlation_matrix_csv_path, path_to_save, title='Correlation Matrix',plot_height=1000, plot_width=1600):
    """
    Render a correlation matrix CSV as an interactive Bokeh heatmap.

    The CSV is read with its ``'Unnamed: 0'`` column as the row index, then
    stacked into one row per (row label, column label, correlation) triple.
    The heatmap is saved to ``path_to_save`` together with a dropdown that
    switches the colour palette in the browser.

    :param correlation_matrix_csv_path: path of the correlation matrix CSV
    :param path_to_save: output file path passed to ``output_file``
    :param title: plot title
    :param plot_height: plot height passed to ``figure``
    :param plot_width: plot width passed to ``figure``
    :return: None
    """
    ## PREPARING CORRELATION MATRIX
    df = pd.read_csv(correlation_matrix_csv_path)
    df = df.set_index('Unnamed: 0').rename_axis('parameters', axis=1)
    df.index.name = 'level_0'

    ## AXIS LABELS FOR PLOT
    common_axes_val = list(df.index)
    df = pd.DataFrame(df.stack(), columns=['correlation']).reset_index()
    source = ColumnDataSource(df)

    ## FINDING LOWEST AND HIGHEST OF CORRELATION VALUES
    corr_min = df.correlation.min()
    corr_max = df.correlation.max()

    # The Bokeh palette generators are built on 256-colour palettes and
    # reject larger requests, so cap the size; a matrix with more than 256
    # distinct values would otherwise fail here.
    max_palette_size = 256
    no_of_colors = min(len(df.correlation.unique()), max_palette_size)

    ## AVAILABLE COLOR SCHEMES
    COLOR_SCHEME = {
        'Cividis': get_reversed_list(cividis(no_of_colors)),
        'Gray': get_reversed_list(gray(no_of_colors)),
        'Inferno': get_reversed_list(inferno(no_of_colors)),
        'Magma': get_reversed_list(magma(no_of_colors)),
        'Viridis': get_reversed_list(viridis(no_of_colors)),
        'Turbo': get_reversed_list(turbo(no_of_colors)),
    }
    # Must be one of the COLOR_SCHEME keys: it is both the palette drawn
    # initially and the value pre-selected in the dropdown.
    default_scheme = 'Cividis'

    ### PLOT PARTICULARS
    ## CHOOSING DEFAULT COLORS
    mapper = LinearColorMapper(palette=COLOR_SCHEME[default_scheme],
                               low=corr_min, high=corr_max)

    ## SETTING UP THE PLOT
    p = figure(title=title,
               x_range=common_axes_val,
               y_range=list(common_axes_val),
               x_axis_location="below",
               plot_width=plot_width,
               plot_height=plot_height,
               tools=BOKEH_TOOLS,
               toolbar_location='above',
               tooltips=[('Parameters', '@level_0 - @parameters'),
                         ('Correlation', '@correlation')])
    p.toolbar.autohide = True

    ## SETTING UP PLOT PROPERTIES
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "12pt"
    p.xaxis.major_label_orientation = pi/2

    ## SETTING UP HEATMAP RECTANGLES
    cir = p.rect(x="level_0", y="parameters", width=1, height=1,
                 source=source,
                 fill_color={'field': 'correlation', 'transform': mapper},
                 line_color=None)

    ## SETTING UP COLOR BAR
    color_bar = ColorBar(color_mapper=mapper,
                         major_label_text_font_size="5pt",
                         ticker=BasicTicker(desired_num_ticks=10),
                         formatter=PrintfTickFormatter(format="%.1f"),
                         label_standoff=6,
                         border_line_color=None,
                         location=(0, 0))
    p.add_layout(color_bar, 'right')

    ## JS CALLBACK
    callback = CustomJS(
        args=dict(col_sch=COLOR_SCHEME, low=corr_min, high=corr_max,
                  cir=cir, color_bar=color_bar),
        code="""
        // JavaScript code goes here
        var chosen_color = cb_obj.value;
        var color_mapper = new Bokeh.LinearColorMapper({palette:col_sch[chosen_color], low:low, high:high});
        cir.glyph.fill_color = {field: 'correlation', transform: color_mapper};
        color_bar.color_mapper.low = low;
        color_bar.color_mapper.high = high;
        color_bar.color_mapper.palette = col_sch[chosen_color];
        """)

    ## SELECT OPTION FOR INTERACTIVITY GIVEN TO USER
    # The initial value has to match an option exactly; the option names are
    # the capitalised COLOR_SCHEME keys.
    select = Select(title='Color Palette', value=default_scheme,
                    options=list(COLOR_SCHEME.keys()), width=200, height=50)

    ## CALL BACK TO BE TRIGGERED WHENEVER USER SELECTS A COLOR PALETTE
    select.js_on_change('value', callback)

    ## GENERATION FINAL PLOT BY BINDING PLOT AND SELECT OPTION
    final_plot = layout([[select], [p]])
    curdoc().add_root(final_plot)
    output_file(path_to_save)
    save(final_plot)
    carry_bokeh_correction(path_to_save)
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point.

    Plots every pair of consecutive principal components twice: once
    labelled by whether the sample id contains ``'TOB'`` and once labelled
    by sub-population. Each plot is written as a PNG under ``output`` and as
    a local HTML file that is then copied to ``output`` with ``gsutil``.
    """
    hl.init()

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    # Get NFE samples only
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB'))
    )
    scores = hl.read_table(SCORES)
    mt = mt.annotate_cols(scores=scores[mt.s].scores)
    mt = mt.annotate_cols(TOB_WGS=mt.s.contains('TOB'))

    # PCA plot must all come from the same object
    columns = mt.cols()
    pca_scores = columns.scores
    labels = columns.TOB_WGS
    hover_fields = {'s': columns.s}

    # get percent variance explained
    eigenvalue_table = hl.import_table(EIGENVALUES).to_pandas()
    eigenvalue_table.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalue_table.eigenvalue)
    variance = (eigenvalues.divide(float(eigenvalues.sum())) * 100).round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    def axis_title(pc_index):
        # Axis label such as "PC1 (12.34%)" for a zero-based PC index.
        return 'PC' + str(pc_index + 1) + ' (' + str(variance[pc_index]) + '%)'

    def export(plot, stem, pc_number):
        # PNG goes straight to ``output``; the HTML is saved locally and
        # then copied across with gsutil.
        png_path = f'{output}/{stem}_pc' + str(pc_number) + '.png'
        with hl.hadoop_open(png_path, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html_name = stem + '_pc' + str(pc_number) + '.html'
        output_file(html_name)
        save(plot)
        subprocess.run(['gsutil', 'cp', html_name, output], check=False)

    def plot_consecutive_pcs(point_labels, plot_title, stem, extra_kwargs):
        # ``extra_kwargs`` is called once per plot so that every plot gets
        # freshly built keyword arguments.
        for pc1 in range(0, number_of_pcs - 1):
            pc2 = pc1 + 1
            print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
            p = hl.plot.scatter(
                pca_scores[pc1],
                pca_scores[pc2],
                label=point_labels,
                title=plot_title,
                xlabel=axis_title(pc1),
                ylabel=axis_title(pc2),
                **extra_kwargs(),
            )
            export(p, stem, pc2)

    print('Making PCA plots labelled by study')
    plot_consecutive_pcs(
        labels, 'TOB-WGS', 'study',
        lambda: {'hover_fields': hover_fields},
    )

    print('Making PCA plots labelled by the subpopulation')
    labels = columns.hgdp_1kg_metadata.labeled_subpop
    pops = list(set(labels.collect()))
    plot_consecutive_pcs(
        labels, 'Sub-Population', 'subpopulation',
        lambda: {
            'collect_all': True,
            'colors': CategoricalColorMapper(palette=turbo(len(pops)),
                                             factors=pops),
        },
    )
y = X[:, 1]
# Sort the points by x and reorder y with the same permutation.
index = np.argsort(x)
# import ipdb; ipdb.set_trace()
x = np.sort(x)
y = y[index]

# Placeholders, one entry per row of X, filled with NaN until computed.
n_points = len(X)
pred = np.full(n_points, np.nan)
error = np.full(n_points, np.nan)
# Points spanning each residual (two values per entry).
error_0s = [np.full(2, np.nan) for _ in range(n_points)]
error_1s = [np.full(2, np.nan) for _ in range(n_points)]
error_index = np.full(n_points, np.nan)
error_value = np.zeros(n_points)
error_colors = palettes.turbo(size)

# create datasource
source = ColumnDataSource(data={
    'x': x,
    'y': y,
    'pred': pred,
    'error_0s': error_0s,
    'error_1s': error_1s,
    'error_index': error_index,
    'error_value': error_value,
    'color': error_colors,
})

# Show the original points.
x_low, x_high = min(x), max(x)
y_low, y_high = min(y), max(y)
plot = figure(
    plot_height=800,
    plot_width=800,
    title="不同参数变化残差变化",
    x_range=[min(x_low * .1, x_low * 1.5), max(x_high * .2, x_high * 1.5)],
    y_range=[min(y_low * .1, y_low * 1.5), max(y_high * .2, y_high * 1.5)],
)
plot.grid.visible = False

# Variance change plot.
var_plot = figure(plot_width=600, title="方差变化")
def draw_plot(name, color):
    """Add the ``open`` series of one symbol to the shared figure ``p``.

    Rows of ``data`` whose ``Symbol`` equals ``name`` are drawn against
    ``date`` as a line plus circle markers, both in ``color`` and both under
    the legend entry ``name``. The legend settings are re-applied on every
    call.

    :param name: symbol to select from ``data``
    :param color: colour used for the line and the markers
    :return: the figure ``p``
    """
    selected = data.Symbol == name
    dates = data['date'][selected]
    opens = data['open'][selected]

    p.line(dates, opens, color=color, legend_label=name)
    p.circle(dates, opens, color=color, legend_label=name)

    # The location of the legend labels is controlled by the location property
    legend = p.legend
    legend.location = "top_left"
    legend.click_policy = "hide"
    legend.title = 'Ticker'
    legend.title_text_font_style = "bold"
    legend.title_text_font_size = "15pt"
    return p


# with concurrent.futures.ThreadPoolExecutor() as executor:
#     args = ((name, color) for name, color in zip(company_list, turbo(n)))
#     executor.map(lambda x: draw_plot(*x), args)

# Draw every company with its own turbo colour.
for name, color in zip(company_list, turbo(n)):
    draw_plot(name, color)

# Specify the name of the output file and show the result
output_file('materials.html')
show(p)
def get_data(company):
    """Return the rows of ``data`` for one company, indexed by ``date``.

    A ``color`` column is added holding the turbo colour paired with the
    company in ``company_list``; when the company is not in that list no
    column is added.

    :param company: symbol to select
    :return: the filtered frame
    """
    dtf = data[data.Symbol == company].set_index('date')
    # Colours of every list entry equal to ``company``; the last one wins.
    matching = [
        shade
        for ticker, shade in zip(company_list, turbo(n))
        if ticker == company
    ]
    if matching:
        dtf['color'] = matching[-1]
    return dtf
TOOLS = "pan,wheel_zoom,reset,box_select,lasso_select,help"
data = pd.read_csv("factbook.csv")
TOOLTIPS = [
    ("index", "$index"),
    ("Country", "@Country"),
    ("GDP per Capita", "@{GDP per capita}"),
    ("Life expectancy at birth", "@{Life expectancy at birth}"),
    ("Population", "@{Population}"),
    ("Birth rate", "@{Birth rate}"),
]
length = len(data.index)

# Four identically configured figures.
plot1 = figure(tools=TOOLS, tooltips=TOOLTIPS, plot_width=800, plot_height=400)
plot2 = figure(tools=TOOLS, tooltips=TOOLTIPS, plot_width=800, plot_height=400)
plot3 = figure(tools=TOOLS, tooltips=TOOLTIPS, plot_width=800, plot_height=400)
plot4 = figure(tools=TOOLS, tooltips=TOOLTIPS, plot_width=800, plot_height=400)

# 11 colours, indexed 0..10 by the scaled birth rate further down.
colors = turbo(11)

# Column names in the CSV may carry surrounding whitespace.
data = data.rename(columns=lambda x: x.strip())

# regex=False makes the replacement literal on every pandas version; "$"
# would otherwise be the end-of-string anchor when treated as a pattern.
GDPperCapita = (
    data['GDP per capita']
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip()
    .astype(float)
)
Life = np.array(data['Life expectancy at birth'].values)

# Rescale population linearly onto [5, 34]. The extremes are computed once
# instead of once per element.
population = data['Population'].str.replace(",", "", regex=False).astype(float)
population_min = min(population.values)
population_max = max(population.values)
population_span = float(population_max - population_min)
population = np.array([
    float(x - population_min) / population_span * 29.0 + 5
    for x in population.values
])
#print(population)

# Map each birth rate onto one of the 11 colours, relative to the maximum
# (also computed once).
birthrate = np.array(data['Birth rate'])
birthrate_max = max(birthrate)
colorlist = [colors[int((i / birthrate_max) * 10)] for i in birthrate]
GDPperCapita = np.array(GDPperCapita.values)
def Electron_Energy_Graph_Old(conn): ############################################################################ #################### CREATE THE DATA FOR THE GRAPH ######################### output_file( "Electron_Output_Graph.html" ) #???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? # Use the connection passed to the function to read the data into a # dataframe via an SQL query. df = pd.read_sql('SELECT * FROM [eEnergyICP]', conn) print(df) # Delete cells where 'protocol id' is empty df = df.dropna(subset=['protocol id']) # With any luck this can be removed after chatting to AW and MB ????????????????????????????????????????????????????????????????????????????????? # Get the date and machinename from the protocol id' field # Seperate on the first '_' df_left = df['protocol id'].str.partition(sep='_') # Seperate on the last '_' df_right = df['protocol id'].str.rpartition(sep='_') # From these sperated dataframes add the appropriate columns back into the # main dataframe. df.loc[:, 'adate'] = df_left[0] df.loc[:, 'machinename'] = df_right[2] # Turn 'adate' into datetime. Problems with this function as it assumes american date formats over british. ????????????????????????????????????????????????????????????????????????????????? # Talk to AW and MB about getting date from other tables in the database and pulling them into the query. ??????????????????????????????????????????????????????????????????????????????????? # This way the date should be in a set format that the datetime function can be told, which should resolve this issue. ?????????????????????????????????????????????????????????????????????? # # Need to turn the date fields into a Dateime object (either 'adate' # (protons) or the newly created 'adate' (photons)). The date field should # always be named 'adate' for consistency. 
df.loc[:, 'adate'] = pd.to_datetime(df.loc[:, 'adate']) # When starting a new graph can be useful to print the dataframe after any # manipulations to make sure the code has done what you expected. print(df) # Create a list of the fields using the dataframe TableFields = (list(df.columns)) ############################################################################ ############################################################################ ############################################################################ ################ CREATE THE DATAFRAME FOR THE TOLERANCES ################### # The purpose of this plot is generally to be as general as possible but # there are only a few parameters that will have defined tolerances. # Therefore the tolerance section can be a bit more specific and a dataframe # containing tolereances can be manually created for many cases and # extracted from the database in others (in a manner similar to above but # calling from a different table/query with the SQL statement) # # The format of the dataframe should be the first line being the x_axis # (with some values taken from the main dataframe to get the right # formatting). The subsequent columns are the tolerances [low, high]. # NB: column names should match those from the main dataframe. 
df_tol1 = pd.read_sql('SELECT * FROM [ElectronFWHMLimits]', conn) print(df_tol1) df_tol1 = df_tol1.set_index('class') print(df_tol1) df_tol_TB = pd.DataFrame({ 'adate': [df['adate'].max(), df['adate'].max()], '6fwhm': [df_tol1.loc['TBUCLH', 'lower6'], df_tol1.loc['TBUCLH', 'upper6']], '9fwhm': [df_tol1.loc['TBUCLH', 'lower9'], df_tol1.loc['TBUCLH', 'upper9']], '12fwhm': [df_tol1.loc['TBUCLH', 'lower12'], df_tol1.loc['TBUCLH', 'upper12']], '15fwhm': [df_tol1.loc['TBUCLH', 'lower15'], df_tol1.loc['TBUCLH', 'upper15']] }) print(df_tol_TB) df_tol_Classic = pd.DataFrame({ 'adate': [df['adate'].max(), df['adate'].max()], '6fwhm': [df_tol1.loc['Classic', 'lower6'], df_tol1.loc['Classic', 'upper6']], '9fwhm': [df_tol1.loc['Classic', 'lower9'], df_tol1.loc['Classic', 'upper9']], '12fwhm': [df_tol1.loc['Classic', 'lower12'], df_tol1.loc['Classic', 'upper12']], '16fwhm': [df_tol1.loc['Classic', 'lower16'], df_tol1.loc['Classic', 'upper16']], '20fwhm': [df_tol1.loc['Classic', 'lower20'], df_tol1.loc['Classic', 'upper20']] }) print(df_tol_Classic) ############################################################################ ############################################################################ ########################################################################## ################### CREATE THE COLUMNS FOR THE LEGEND ###################### # NB: The following section has been designed to be as general as possible # but in reality it might be preferable to more manually choose the markers # and colors based on optimising the most likey things to be plotted. # # This code is a way of creating a legend with markers based on one # parameter (e.g. machine name) and color on another parameter (e.g. energy) ######### Colors: # Create a sorted list of the unique values in a dataframe column that the # colors will be based on. 
list_forcolor = sorted(df['machinename'].unique().tolist()) # If the length of the list is <9 then we can use the colorblind palette, # which contains 8 colors. This should be the default for accessability # reasons unless there are compeling reasons otherwise. if len(list_forcolor) < 9: color_palette = Colorblind[len(list_forcolor)] # If not <9 then we can use the much larger Turbo palette which contains # 256 colors. Will check if there are more than 256 options though and # throw an error if so. elif len(list_forcolor) > 256: print( 'Error - Name of Function: >256 unique energies in database ' \ 'causing failure of the turbo color palette function (only ' \ '256 availible colors.' ) exit() # If it passes the check then use the built in turbo function that splits # the turbo palette into roughly equal sections based on a supplied integer # number. else: color_palette = turbo(len(list_forcolor)) ######### Markers: # Doesn't seem to be a way to create a simple list of all the Bokeh marker # options so just do this manually. May want to re-order to improve # visibility of commonly used options. markers = [ 'asterisk', 'circle', 'circle_cross', 'circle_x', 'cross', 'dash', 'diamond', 'diamond_cross', 'hex', 'inverted_triangle', 'square', 'square_cross', 'square_x', 'triangle', 'x' ] # Create a sorted list of the unique values in a dataframe column that the # markers will be based on. list_formarker = sorted(df['machinename'].unique().tolist()) # Check that there are enough markers to give a unique marker to each option # but otherwise throw an error. if len(list_formarker) > len(markers): print( 'Error - Name of Function: Not enough markers to assign a ' \ 'unique marker to each option.' ) exit() ######### Legend Key: # Create a function that will be used to run through the dataframe looking # at the energy and machine column and creating a new column that will have # values for both seperated by a '_', stored as a string. 
def add_legend(row): return str(str(row['machinename'])) # Run the function and also copy the other columns into new columns so that # when ther are renamed to 'x' and 'y' later they are still availible for # the legend if needed. df.loc[:, 'legend'] = df.apply(lambda row: add_legend(row), axis=1) df.loc[:, 'machinename1'] = df.loc[:, 'machinename'] print(df) ############################################################################ ############################################################################ ############################################################################ ################## FORMATTING AND CREATING A BASIC PLOT #################### ############################################################################ ############################# USER INPUTS ################################## # Decide what the default viewing option is going to be. (i.e. the fields to # be plotted on the x and y axis when the graph is opened, the plot size # etc.). # From the legend defined above give the values that will be pre-ticked when # the plot is opened color_to_plot = ['TrueBeam B', 'TrueBeam C'] marker_to_plot = color_to_plot # Decide on what data to plot on the x/y axis when opened. x_data1 = 'adate' y_data1 = '6fwhm' # Decide what the plot formatting will be, inluding the plot title, axis # titles and the size of the plot. plot_title1 = 'Electron Energy' x_axis_title1 = x_data1 y_axis_title1 = y_data1 plot_size_height1 = 450 plot_size_width1 = 800 legend_location = 'bottom_left' # Create a list of the plot parameters that will be used as input to a # function later. 
list_plot_parameters = [ x_data1, y_data1, plot_title1, x_axis_title1, y_axis_title1, plot_size_height1, plot_size_width1, legend_location ] ############################################################################ ############################################################################ ############################################################################ ########################### CREATE THE PLOT ################################ # Create the actual ploy. Generally it's a good idea to do this by defining # functions as they can then be used in the callbacks later without having # a lot of redundant very similar code. ######### Make Dataset: # Define a make dataset function that can be used now but also called later # in the callback functions to save re-writing similar code later. def make_dataset(color_to_plot, marker_to_plot, x_data1, y_data1): # Create a sub dataframe Sub_df1 = df.copy() # Delete any rows in the sub-dataframes that do not exist in the # checkboxes/default user choices. (e.g. if you've selected 6MV in the # checkbox this will remove any rows that have something other than 6MV) Sub_df1 = Sub_df1[Sub_df1['machinename'].isin(color_to_plot)] Sub_df1 = Sub_df1[Sub_df1['machinename'].isin(marker_to_plot)] # Search for the columns with the x_data and y_data names and replace # them with 'x' and 'y'. Unless plotting the same data on both in which # case add an extra column for 'y' that's a copy of 'x' if x_data1 == y_data1: Sub_df1.rename(columns={x_data1: 'x'}, inplace=True) Sub_df1.loc[:, 'y'] = Sub_df1.loc[:, 'x'] else: Sub_df1.rename(columns={x_data1: 'x'}, inplace=True) Sub_df1.rename(columns={y_data1: 'y'}, inplace=True) # Return the newly created Sub_df1 return Sub_df1 # Run the make_dataset function to create a sub dataframe that the plot will # be made from. Sub_df1 = make_dataset(color_to_plot, marker_to_plot, x_data1, y_data1) # Create a Column Data Source. This is important as it is the data format # needed for Bokeh. 
When making this it is useful to convert the dataframe # into a dictionary, which seems to help with the callback function (see # 'Common Issues' for details). src1 = ColumnDataSource(Sub_df1.to_dict(orient='list')) ######### Make Plot: # Create an empty plot (plot parameters will be applied later in a way that # can be manipulated in the callbacks) p1 = figure() # Create a scatter plot. p1.scatter( # source = The ColumnDataSource made above. source=src1, # x/y = 'x'/'y' which are fields that were renamed as such in # the make_dataset function x='x', y='y', # Some general parameters about marker size. These seem like # reasonable values but possible could alter these in a # callback? fill_alpha=0.4, size=12, # Create the legend using the created fields added in the legend # section. Use the factor_mark and factor_cmap functions to # match the colors/markers to the right lists. # NB: Always use legend_field for this not legend_group as the # former acts on the javascript side but the latter the Python # side. Therefore the former will update automatically when the # plot is changed with no need for a callback. legend_field='legend', marker=factor_mark('machinename1', markers, list_formarker), color=factor_cmap('machinename1', color_palette, list_forcolor)) ######### Add plot parameters: # Define a define plot parameters factor that can be used now but also # called later in the callback functions. def define_plot_parameters(list): # Input is a List of format: # list_plot_parameters = [ x_data1, y_data1, # plot_title1, x_axis_title1, y_axis_title1, # plot_size_height1, plot_size_width1, # legend_location ] # The parameters have to be controlled like this in a callback to allow # for them to be adjusted. Otherwise the plot parameters are not # interactive. # Yes! - p1.xaxis.axis_label = 'X_axis_title' # No! 
- p1 = figure(x_axis_label = 'X_axis_title') p1.title.text = list[2] p1.xaxis.axis_label = list[3] p1.yaxis.axis_label = list[4] p1.plot_height = list[5] p1.plot_width = list[6] p1.legend.location = list[7] # If the user wants to plot an axis as datetime then the axis needs to # be reformatted. Will do this by checking if the x_data1/y_data1 is # =='adate'. # NB: This only works if 'adate' is used as the name for the date column # and also that this is the only date column. if list[0] == 'adate': p1.xaxis.formatter = DatetimeTickFormatter(days=['%d/%m', '%a%d']) else: p1.xaxis.formatter = BasicTickFormatter() if list[1] == 'adate': p1.yaxis.formatter = DatetimeTickFormatter(days=['%d/%m', '%a%d']) else: p1.yaxis.formatter = BasicTickFormatter() return # Run the define_plot_parameters function to format the plot. define_plot_parameters(list_plot_parameters) ############################################################################ ############################################################################ ############################################################################ ############################################################################ ############################################################################ ############################ ADD TOLERANCES ################################ # We defined the tolerances further up and now want to add the correct ones # to the plot (having created the plot above). Again this will be done with # functions and in a way that the functions can be used in the callbacks # later. # # NB: At the moment this is still a bit of a work in progress and shows the # way to add line tolerances. Another option might be to add colorblocks # using varea and/or varea_stack. # # NB: Also this funcion assumes that tolerances will all be against one # x_axis value (e.g. date). This is probably the majority of use cases but # probably relatively trivial to add further toleraces against other x_axis # data. 
# Create a function that will create a dataframe that can be used to supply # a plot of two tolerance lines. This will including 'appearing' and # 'disappearing' depending on whether tolerances are defined or not. def tolerances(x_data1, y_data1, Sub_df1, df_tol1): # Get a list of the column headers from the tolerance table defined # earlier. headers1 = df_tol1.columns.values.tolist() # Check if the xdata is what is in the df_tol1 as the x_axis (if not no # point plotting tolerances as all tolerances are vs this tolerance). if x_data1 != headers1[0]: # x_data1 doesn't match so going to output something that should # basically just not plot but also won't throw the viewing range. data = { 'x': [Sub_df1['x'].max(), Sub_df1['x'].max()], 'y_low': [Sub_df1['y'].max(), Sub_df1['y'].max()], 'y_high': [Sub_df1['y'].max(), Sub_df1['y'].max()] } Sub_df1_tol1 = pd.DataFrame(data) return Sub_df1_tol1 # Otherwise we have the right x_data1 so now just check if it's datetime # or not. if x_data1 == 'adate': # It has the format 'adate' so should be datetime. So find the max # min dates in the Sub_df1 and add a couple of weeks either side so # that it plots the full range (plus a little bit for visualisation # reasons). max_x = Sub_df1['x'].max() + pd.DateOffset(weeks=2) min_x = Sub_df1['x'].min() + pd.DateOffset(weeks=-2) else: # If it's not datetime then just add about 5% of the range to # either side to make the plot look nicer. # NB: This has not been checked extensively as most tolerances are # vs. time. max_x = Sub_df1['x'].max() min_x = Sub_df1['x'].min() range = max_x - min_x max_x = max_x + (range / 20) min_x = min_x - (range / 20) # Used the x part so now remove the element from the list. This will # help for the small case where x_data1 == ydata1. headers1.remove(x_data1) if y_data1 in headers1: # If y_data1 is in the list then loop through to find out where and # get the data from the tolerance dataframe. 
for x in headers1: if y_data1 == x: # When the loop has found where it is then can output a # dataframe of the form: # x = [far left of plot, far right of plot] # y_low = [low_tolerance, low_tolerance] # y_high = [high_tolerance, high_tolerance] data = { 'x': [min_x, max_x], 'y_low': [df_tol1[x][0], df_tol1[x][0]], 'y_high': [df_tol1[x][1], df_tol1[x][1]] } Sub_df1_tol1 = pd.DataFrame(data) else: # If y_data1 is not in the headers1 list then there are no # tolerances to plot so going to output something that should # basically just not plot but also won't throw the viewing range. data = { 'x': [Sub_df1['x'].max(), Sub_df1['x'].max()], 'y_low': [Sub_df1['y'].max(), Sub_df1['y'].max()], 'y_high': [Sub_df1['y'].max(), Sub_df1['y'].max()] } Sub_df1_tol1 = pd.DataFrame(data) return Sub_df1_tol1 return Sub_df1_tol1 def choose_tolerances(x_data1, y_data1, Sub_df1, color_to_plot): if any(item in color_to_plot for item in ['TrueBeam B', 'TrueBeam C', 'TrueBeam D', 'TrueBeam F']): # If this is true then will need to run the df_tol_TB tolerances Sub_df1_tol_TB = tolerances(x_data1, y_data1, Sub_df1, df_tol_TB) else: data = { 'x': [Sub_df1['x'].max(), Sub_df1['x'].max()], 'y_low': [Sub_df1['y'].max(), Sub_df1['y'].max()], 'y_high': [Sub_df1['y'].max(), Sub_df1['y'].max()] } Sub_df1_tol_TB = pd.DataFrame(data) if any(item in color_to_plot for item in ['Linac B', 'Linac C', 'Linac D', 'Linac E']): # If this is true then will need to run the df_tol_TB tolerances Sub_df1_tol_Classic = tolerances(x_data1, y_data1, Sub_df1, df_tol_Classic) else: data = { 'x': [Sub_df1['x'].max(), Sub_df1['x'].max()], 'y_low': [Sub_df1['y'].max(), Sub_df1['y'].max()], 'y_high': [Sub_df1['y'].max(), Sub_df1['y'].max()] } Sub_df1_tol_Classic = pd.DataFrame(data) return Sub_df1_tol_TB, Sub_df1_tol_Classic # Run the tolerances function to output the new dataframe Sub_df1_tol_TB, Sub_df1_tol_Classic = choose_tolerances( x_data1, y_data1, Sub_df1, color_to_plot) # Turn the dataframe into a new 
ColumnDataSource (again turning it into a # dictionary) src1_tol_TB = ColumnDataSource(Sub_df1_tol_TB.to_dict(orient='list')) src1_tol_Classic = ColumnDataSource( Sub_df1_tol_Classic.to_dict(orient='list')) # Add two lines to the plot using the new ColumnDataSource as the source, # one line for low tolerance and one line for high. p1.line(source=src1_tol_TB, x='x', y='y_low', color='firebrick') p1.line(source=src1_tol_TB, x='x', y='y_high', color='firebrick') p1.line(source=src1_tol_Classic, x='x', y='y_low', color='hotpink') p1.line(source=src1_tol_Classic, x='x', y='y_high', color='hotpink') ############################################################################ ############################################################################ ############################################################################ ################## ADD MORE COMPLEX TOOLS TO THE PLOT ###################### # Create tools here that will allow for some manipulation or inspection of # plotted data. # # As an example a 'HoverTool' will be added to the plot. # # Other useful tools and details of the syntax can be found here: # https://docs.bokeh.org/en/latest/docs/user_guide/tools.html # Create the hover tool (see website above for syntax/details). # This example creates a hover tool that displays: # Date: The value of the data-point as measued on the x-axis # (formatted for datetime) # Y-Axis: The value of the data-point as measued on the y-axis # (x,y): The x and y co-ordinates in plot space # Chamber Comb.: The data stored under the 'Chamber' column for that # data-point. # Comments: The data stored under the 'comments' column for that # data-point. hover = HoverTool(tooltips=[('Date', '@x{%F}'), ('Y-Axis', '@y'), ('(x,y)', '($x, $y)'), ('Chamber Comb.', '@Chamber'), ('Comments', '@comments')], formatters={'x': 'datetime'}) # Add the newly created tool to the plot. 
p1.add_tools(hover) ############################################################################ ############################################################################ ############################################################################ ################# CREATE WIDGETS TO BE ADDED TO THE PLOT ################### # Create widgets here that will allow for some manipulation of the plotted # data. These widgets provide an interactive ability that can alter the data # that is plotted, provide update fnctions and call other programmatic # functions. This is done either using built in Bokeh functionality or # using more powerful but complex python and javascript based callbacks. # # As an example some 'Select' widgets, 'Checkbox' widgets and 'RangeSliders' # will be added to the plot. # # Other useful widgets and details of the syntax can be found here: # https://docs.bokeh.org/en/latest/docs/user_guide/interaction/widgets.html ######## 1) # Create the select widget (see website above for syntax/details). This # widget will be used for the callback example later to change data plotted # on the x/y-axis. # This example creates a select tool that displays: # Dropdown list containing a list of every field that was downloaded from # the database. # NB: When making a list it may be worth manually creating it to limit # it to the fields that can be plotted (e.g. not including fields # like 'Comments'). This will shorten the dropdown list but you # should err on the side of inclusion to make the final plot as # flexible as possible. # # Create a list of the availible options menu_axis = [] for field in TableFields: menu_axis.append(field) menu_axis = sorted(menu_axis) # Select tool needs inputs for the title, a starting value and the just # created list to supply the available options. 
select_xaxis = Select(title='X-Axis Fields Available:', value=x_axis_title1, options=menu_axis) select_yaxis = Select(title='Y-Axis Fields Available:', value=y_axis_title1, options=menu_axis) ######## 2) # This select widget will be made in the same way and used to create a # dropdown list to change the legend position. # # Create a list of the availible options menu_legend = [ 'top_left', 'top_center', 'top_right', 'center_left', 'center', 'center_right', 'bottom_left', 'bottom_center', 'bottom_right' ] # Create the select tool as above select_legend = Select(title='Legend Position', value=legend_location, options=menu_legend) ######## 3) # These checkbox widgets will be used to create a tool to select the # values that are being plotted from the fields that the legend is based on. # # NB: There is some built in Bokeh functionality for interavtive legends # that can fulfill some of the same goals where the number of options is # limited to something that can display on a reasonably sized legend. May # be a better and more robust solution where possible. # Create a list of all unique names in the column chosen to be matched to # markers (sorted). options_marker = sorted(df['machinename'].unique().tolist()) # Create an index list for all of the values that should be pre-ticked. index_marker = [ i for i in range(len(options_marker)) if options_marker[i] in marker_to_plot ] # Create the checkbox, providing the list of availible options and a list # of what should be active (pre-ticked). checkbox_marker = CheckboxGroup(labels=options_marker, active=index_marker, visible=False) # Do the same for the column that was matched to colors. options_color = sorted(df['machinename'].unique().tolist()) index_color = [ i for i in range(len(options_color)) if options_color[i] in color_to_plot ] checkbox_color = CheckboxGroup(labels=options_color, active=index_color) ######## 4) # Make some range sliders that will be used to manipulate the x-axis and # y-axis range. 
# Most of the manipulation will be done using a later function but will need # to create the bare minimum rangeslider first that can later be manipulated # (This seems to be the minimum number of parameters needed to create these # widgets). Note that a RangeSliders AND a DateRangeSlider needs to be # created for each axis. range_slider_x = RangeSlider(title='X-Axis Range', start=0, end=1, value=(0, 1), step=0.1) range_slider_y = RangeSlider(title='Y-Axis Range', start=0, end=1, value=(0, 1), step=0.1) range_slider_xdate = DateRangeSlider(title='X-Axis Range (Date)', start=date(2017, 1, 1), end=date(2017, 1, 2), value=(date(2017, 1, 1), date(2017, 1, 2)), step=1) range_slider_ydate = DateRangeSlider(title='Y-Axis Range (Date)', start=date(2017, 1, 1), end=date(2017, 1, 2), value=(date(2017, 1, 1), date(2017, 1, 2)), step=1) # Define the function that will be used now and also in the callbacks later. # This will allow the range_sliders to adjust to match any changes in the # data being plotted on the x/y axis. def range_slider(x_data1, y_data1, Sub_df1): # Start with the y-axis. # First need to check if 'adate' and if so edit the date range slider # but otherwise edit the normal slider. if y_data1 == 'adate': # Set the start, end and value fields to the full range. range_slider_ydate.start = Sub_df1['y'].min() range_slider_ydate.end = Sub_df1['y'].max() range_slider_ydate.value = (Sub_df1['y'].min(), Sub_df1['y'].max()) # Step to 1 works for DateRangeSlider range_slider_ydate.step = 1 # Make the DateRangeSlider visible and hide the normal RangeSlider range_slider_ydate.visible = True range_slider_y.visible = False else: # Set the start, end and value fields to the full range. 
range_slider_y.start = Sub_df1['y'].min() range_slider_y.end = Sub_df1['y'].max() range_slider_y.value = (Sub_df1['y'].min(), Sub_df1['y'].max()) # Step to range/10000 should give sufficient granularity range_slider_y.step = (Sub_df1['y'].max() - Sub_df1['y'].min()) / 100000 # Make the normal RangeSlider visible and hide the DateRangeSlider range_slider_y.visible = True range_slider_ydate.visible = False # Do the same for the x-axis if x_data1 == 'adate': range_slider_xdate.start = Sub_df1['x'].min() range_slider_xdate.end = Sub_df1['x'].max() range_slider_xdate.value = (Sub_df1['x'].min(), Sub_df1['x'].max()) range_slider_xdate.step = 1 range_slider_xdate.visible = True range_slider_x.visible = False else: range_slider_x.start = Sub_df1['x'].min() range_slider_x.end = Sub_df1['x'].max() range_slider_x.value = (Sub_df1['x'].min(), Sub_df1['x'].max()) range_slider_x.step = (Sub_df1['x'].max() - Sub_df1['x'].min()) / 100000 range_slider_x.visible = True range_slider_xdate.visible = False return # Run the function. range_slider(x_data1, y_data1, Sub_df1) ############################################################################ ############################################################################ ############################################################################ ########################### CREATE A LAYOUT ################################ # Create a layout to add widgets and arrange the display. This simple layout # displays the select widgets above the plot with the checkboxes to the # right (one above the other). # # More details can be found at: # https://docs.bokeh.org/en/latest/docs/user_guide/layout.html # # NB: More work to do here to make plots responsive to browser window size # (e.g. using sizing_mode = scale_both) but need to invstigate with/without # remote desktops. 
layout_checkbox = column([checkbox_marker, checkbox_color]) layout_plots = column([ select_xaxis, select_yaxis, select_legend, range_slider_x, range_slider_y, range_slider_xdate, range_slider_ydate, p1 ]) tab_layout = row([layout_plots, layout_checkbox]) ############################################################################ ############################################################################ ############################################################################ ####################### CREATE CALLBACK FUNCTIONS ########################## # CAVEAT: Callback functions are very complex and below is my (CB) rough # understanding of how they function based mainly on experience/trial and # error while writting these functions for other graphs. It should be taken # as a starting point but not as a definitive user guide. # # Callback functions are very powerful and can be based off of javascript or # python. The example presented here uses python but in future a javascript # copy should also be added. ######## 1) # This callback is designed to take inputs from the select and checkbox # widgets update the graph to plot the new data requested by the user. # # Syntax: # attr = The value passed from the on_change function before the callback # was named (e.g. in this example attr = 'value') # old = The value of the widget before it was changed (I.e. If a select # widget is changed from 'Output' to 'T/P Correction', then # old = 'Output' # new = The value of the widget after it was changed (I.e. If a select # widget is changed from 'Output' to 'T/P Correction', then # old = 'T/P Correction' # # NB: In general seen little need to use these inputs as you can generally # access the value of the widgets directly which seems to be more powerful # and flexible # # First define the callback function. def callback(attr, old, new): # Want to acquire the current values of all of the checkboxes and select # widgets to provide as inputs for the re-plot. 
For the checkboxes this # means itterating through the active list and outputting the labels # that are active color_to_plot = [ checkbox_color.labels[i] for i in checkbox_color.active ] marker_to_plot = color_to_plot plot1_xdata_to_plot = select_xaxis.value plot1_ydata_to_plot = select_yaxis.value legend_location = select_legend.value # Use the pre-defined make_dataset function with these new inputs to # create a new version of the sub dataframe. Sub_df1 = make_dataset(color_to_plot, marker_to_plot, plot1_xdata_to_plot, plot1_ydata_to_plot) # Use the pre-defined define_plot_parameters function with these new # inputs to update the plot parameters. x_axis_title1 = plot1_xdata_to_plot y_axis_title1 = plot1_ydata_to_plot define_plot_parameters([ plot1_xdata_to_plot, plot1_ydata_to_plot, plot_title1, x_axis_title1, y_axis_title1, plot_size_height1, plot_size_width1, legend_location ]) # Use the pre-defined tolerances function with these new inputs to # make a new version of the tolerances sub dataframe. Sub_df1_tol_TB, Sub_df1_tol_Classic = choose_tolerances( plot1_xdata_to_plot, plot1_ydata_to_plot, Sub_df1, color_to_plot) # Use the pre-defined range_slider function with these new inputs to # update the range sliders (this will make sure that the range sliders # start/end etc. match up with what's being plotted, as well as # displaying/hiding the RangeSlider/DateRangeSlider as needed range_slider(plot1_xdata_to_plot, plot1_ydata_to_plot, Sub_df1) # Update the ColumnDataSources using the newly created dataframes. The # plots look to these as the source so this changes what is being # plotted. src1.data = Sub_df1.to_dict(orient='list') src1_tol_TB.data = Sub_df1_tol_TB.to_dict(orient='list') src1_tol_Classic.data = Sub_df1_tol_Classic.to_dict(orient='list') return # Use the on_change function to call the now defined callback function # whenever the user changes the value in the widget. # NB: Other functions such as on_click are availible for other widgets. 
# Syntax: # First argument is passed to the callback as attr (see callback section # above) # Second argument is the name of the callback function to be called. select_xaxis.on_change('value', callback) select_yaxis.on_change('value', callback) select_legend.on_change('value', callback) checkbox_color.on_change('active', callback) checkbox_marker.on_change('active', callback) ######## 2) # This callback is designed to take inputs from the range sliders to change # visible range def callback_range(attr, old, new): # Check what is currently being plotted. Need this to know whether to # look for the values from the DateRangeSlider or the RangeSlider plot1_xdata_to_plot = select_xaxis.value plot1_ydata_to_plot = select_yaxis.value # Start with the x-axis if plot1_xdata_to_plot == 'adate': # If it's 'adate' then need to look at the DateRangeSlider and # update the start and end values of the range using the values from # the slider. # NB: range_slider.value = left_value, right_value p1.x_range.start, p1.x_range.end = range_slider_xdate.value else: # If it's not 'adate' then need to look at the normal RangeSlider p1.x_range.start, p1.x_range.end = range_slider_x.value # Do the same for the y-axis if plot1_ydata_to_plot == 'adate': p1.y_range.start, p1.y_range.end = range_slider_ydate.value else: p1.y_range.start, p1.y_range.end = range_slider_y.value return # Use the on_change function to call the now defined callback function # whenever the user changes the value in the widget. 
range_slider_x.on_change('value', callback_range) range_slider_y.on_change('value', callback_range) range_slider_xdate.on_change('value', callback_range) range_slider_ydate.on_change('value', callback_range) ############################################################################ ############################################################################ ############################################################################ ####################### RETURN TO THE MAIN SCRIPT ########################## # Now that the script is finished and the plot created we can return to the # main script. # # To pass back the data for the tab we need to return a Panel with: # child = layout (the one that we made earlier with the widget and plot) # title = 'Something that makes sense as a tab label for the user' return Panel(child=tab_layout, title='Electron Energy')
def query():
    """Query script entry point.

    Reads the combined matrix table (``HGDP1KG_TOBWGS``) and the PCA scores
    table (``SCORES``), keeps only the samples present in the scores table,
    and writes one PNG and one HTML scatter plot per consecutive pair of
    principal components. Two series of plots are produced: one coloured by
    study ('TOB-WGS' vs 'HGDP-1kG') and one coloured by labelled
    subpopulation. Output locations come from ``output_path(..., 'web')``.
    """
    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)

    # Filter outliers and related samples
    mt = mt.semi_join_cols(scores)
    mt = mt.annotate_cols(scores=scores[mt.s].scores)
    mt = mt.annotate_cols(
        study=hl.if_else(mt.s.contains('TOB'), 'TOB-WGS', 'HGDP-1kG'))

    # PCA plot must all come from the same object
    columns = mt.cols()
    pca_scores = columns.scores
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    # Each .collect() executes a Hail query. These values do not change
    # between plots, so collect them once instead of once per loop iteration.
    sample_names = columns.s.collect()
    study_labels = columns.study.collect()
    pc_values = [pca_scores[pc].collect() for pc in range(number_of_pcs)]

    def save_pc_plots(title, labels, palette, factors, stem):
        """Plot every consecutive PC pair and save it as PNG and HTML."""
        for pc1 in range(number_of_pcs - 1):
            pc2 = pc1 + 1
            print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
            plot = figure(
                title=title,
                x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
                y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
                tooltips=tooltips,
            )
            source = ColumnDataSource(
                dict(
                    x=pc_values[pc1],
                    y=pc_values[pc2],
                    label=labels,
                    samples=sample_names,
                ))
            plot.circle(
                'x',
                'y',
                alpha=0.5,
                source=source,
                size=4,
                # A fresh colour mapper is built for every figure.
                color=factor_cmap('label', palette, factors),
                legend_group='label',
            )
            plot.add_layout(plot.legend[0], 'left')

            # File names are numbered with pc2, i.e. the 1-based number of
            # the PC on the x-axis.
            plot_filename = output_path(f'{stem}_pc{pc2}.png', 'web')
            with hl.hadoop_open(plot_filename, 'wb') as f:
                get_screenshot_as_png(plot).save(f, format='PNG')
            html = file_html(plot, CDN, 'my plot')
            plot_filename_html = output_path(f'{stem}_pc{pc2}.html', 'web')
            with hl.hadoop_open(plot_filename_html, 'w') as f:
                f.write(html)

    print('Making PCA plots labelled by study')
    # Sort the factors so the colour given to each study is the same on
    # every run (iteration order of a set of strings is not stable).
    cohort_sample_codes = sorted(set(study_labels))
    save_pc_plots('TOB-WGS + HGDP/1kG Dataset', study_labels,
                  ['#1b9e77', '#d95f02'], cohort_sample_codes, 'study')

    print('Making PCA plots labelled by the subpopulation')
    subpop_labels = columns.hgdp_1kg_metadata.labeled_subpop.collect()
    subpop_labels = ['TOB-WGS' if x is None else x for x in subpop_labels]
    subpopulation = sorted(set(subpop_labels))
    # change ordering of subpopulations
    # so TOB-WGS is at the end and glyphs appear on top
    subpopulation.append(subpopulation.pop(subpopulation.index('TOB-WGS')))
    save_pc_plots('Subpopulation', subpop_labels,
                  turbo(len(subpopulation)), subpopulation, 'subpopulation')
def Create_Legend(df, color_column, custom_color_boolean, custom_color_palette,
                  marker_column, custom_marker_boolean, custom_marker_palette):
    """Build the color/marker lists and add legend columns to ``df``.

    :param df: DataFrame holding the data to plot. It is modified in place:
        'legend', 'color1' and 'marker1' columns are written to it.
    :param color_column: Name of the column whose unique values get a color.
    :param custom_color_boolean: If truthy, ``custom_color_palette`` is used
        instead of the default palettes.
    :param custom_color_palette: A function called with the number of unique
        colors, a dict keyed by the number of unique colors, or a list/tuple
        of colors.
    :param marker_column: Name of the column whose unique values get a marker.
    :param custom_marker_boolean: If truthy, ``custom_marker_palette`` is used
        instead of the built-in marker list.
    :param custom_marker_palette: Sequence of marker names.
    :returns: ``(color_list, color_palette, marker_list, marker_palette, df,
        add_legend_to_df)`` where ``add_legend_to_df`` is the helper that adds
        the legend columns, returned so it can be re-applied to other frames.

    On an invalid palette an error is printed and ``exit()`` is called.
    """
    ######### Colors:
    # Create a color list based on the unique entries in one of the database
    # columns (specified by the tab author).
    color_list = sorted(df[color_column].unique().tolist())
    n_colors = len(color_list)

    # The custom set can be entered as a function (e.g. turbo), a dictionary
    # (e.g. Colorblind) or a list/tuple (e.g. a user specified list of hex
    # values), so check the type to build the palette correctly.
    if custom_color_boolean:
        if isinstance(custom_color_palette, types.FunctionType):
            # Probably one of the 256 value large palettes supplied by Bokeh.
            # This will throw an error if there are more unique items than
            # the function accepts.
            color_palette = list(custom_color_palette(n_colors))
        elif isinstance(custom_color_palette, dict):
            # Probably one of the smaller palettes supplied by Bokeh. This
            # will throw an error if the number of unique items is not a key
            # of the dictionary.
            color_palette = list(custom_color_palette[n_colors])
        elif isinstance(custom_color_palette, (tuple, list)):
            if n_colors > len(custom_color_palette):
                print(
                    'Error - Not enough colors in custom palette to '
                    'assign a unique color to each option.'
                )
                exit()
            # Turn it into a list as this will help if it needs to be changed
            # later (tuples cannot be altered).
            color_palette = list(custom_color_palette)
        else:
            print('Error - Unsuported type of custom_color_palette')
            exit()
    # No custom palette requested: use Colorblind for 3 to 7 unique items and
    # Turbo otherwise.
    # NOTE(review): Bokeh's Colorblind appears to also provide an 8-color
    # entry; confirm whether the `< 8` bound is intentional.
    elif 2 < n_colors < 8:
        color_palette = list(Colorblind[n_colors])
    else:
        # This will throw an error if there are more than 256 unique items
        # (max number of colors in the Turbo palette).
        color_palette = list(turbo(n_colors))

    ######### Markers:
    # Create a marker list based on the unique entries in one of the database
    # columns (specified by the tab author).
    marker_list = sorted(df[marker_column].unique().tolist())

    if custom_marker_boolean:
        marker_palette = custom_marker_palette
    else:
        # Default list, ordered to keep as good a contrast between items as
        # possible as the marker_list grows (i.e. 'better' markers first,
        # 'worse' ones at the end where they may not be used).
        marker_palette = [
            'circle', 'square', 'triangle', 'diamond', 'inverted_triangle',
            'hex', 'circle_cross', 'square_cross', 'diamond_cross',
            'asterisk', 'cross', 'x', 'circle_x', 'square_x', 'dash'
        ]

    # Make sure there are enough markers to assign unique markers to each
    # option.
    if len(marker_list) > len(marker_palette):
        print(
            'Error - Not enough markers to assign a unique marker to '
            'each option.'
        )
        exit()

    ######### Legend Key:
    def add_legend_to_df(df):
        """Add 'legend', 'color1' and 'marker1' columns to ``df``.

        'legend' is '<marker>_<color>' per row, or just the marker value when
        the same column is used for both marker and color.
        """
        def add_legend(row):
            if marker_column == color_column:
                return str(row[marker_column])
            return str(row[marker_column]) + '_' + str(row[color_column])

        df.loc[:, 'legend'] = df.apply(add_legend, axis=1)
        df.loc[:, 'color1'] = df.loc[:, color_column]
        df.loc[:, 'marker1'] = df.loc[:, marker_column]
        return df

    # Run the now defined function
    df = add_legend_to_df(df)

    return (color_list, color_palette, marker_list, marker_palette, df,
            add_legend_to_df)
def query():
    """Query script entry point.

    Reads the PCA scores table (``SCORES``) and writes, for every consecutive
    pair of principal components, PNG and HTML scatter plots coloured by
    study, by continental population and by subpopulation. Finally writes a
    loadings plot (PNG and HTML) for every principal component. Output
    locations come from ``output_path(..., 'web')``.
    """
    hl.init(default_reference='GRCh38')

    scores = hl.read_table(SCORES)
    scores = scores.annotate(
        study=hl.if_else(scores.s.contains('TOB'), 'TOB-WGS', 'HGDP-1kG'))
    sample_names = scores.s.collect()
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    def save_plot(plot, stem):
        """Write ``plot`` as '<stem>.png' and '<stem>.html'."""
        plot_filename = output_path(f'{stem}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'{stem}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    def save_pc_plots(table, title, labels, palette, factors, stem):
        """Plot every consecutive PC pair of ``table`` and save each plot."""
        for pc1 in range(number_of_pcs - 1):
            pc2 = pc1 + 1
            plot = figure(
                title=title,
                # Both axis labels use the same 'PCn (xx.xx%)' format.
                x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
                y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
                tooltips=tooltips,
            )
            source = ColumnDataSource(
                dict(
                    x=table.scores[pc1].collect(),
                    y=table.scores[pc2].collect(),
                    label=labels,
                    samples=sample_names,
                ))
            plot.circle(
                'x',
                'y',
                alpha=0.5,
                source=source,
                size=4,
                # A fresh colour mapper is built for every figure.
                color=factor_cmap('label', palette, factors),
                legend_group='label',
            )
            plot.add_layout(plot.legend[0], 'left')
            save_plot(plot, f'{stem}_pc{pc2}')

    # plot by study
    labels = scores.study.collect()
    # Factors are sorted so each label gets the same colour on every run
    # (iteration order of a set of strings is not stable between runs).
    study = sorted(set(labels))
    save_pc_plots(scores, 'Study', labels, ['#1b9e77', '#d95f02'], study,
                  'study')

    # plot by continental population
    hgdp1kg_tobwgs = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = scores.annotate(continental_pop=hgdp1kg_tobwgs.cols()[
        scores.s].hgdp_1kg_metadata.population_inference.pop)
    labels = scores.continental_pop.collect()
    # Change missing (None) values to 'TOB-NFE'
    labels = ['TOB-NFE' if x is None else x for x in labels]
    continental_population = sorted(set(labels))
    save_pc_plots(scores, 'Continental Population', labels,
                  turbo(len(continental_population)), continental_population,
                  'continental_pop')

    # plot by subpopulation
    scores = scores.annotate(subpop=hgdp1kg_tobwgs.cols()[
        scores.s].hgdp_1kg_metadata.labeled_subpop)
    labels = scores.subpop.collect()
    labels = ['TOB-NFE' if x is None else x for x in labels]
    sub_population = sorted(set(labels))
    save_pc_plots(scores, 'Subpopulation', labels,
                  turbo(len(sub_population)), sub_population, 'subpop')

    # Plot loadings
    loadings_ht = hl.read_table(LOADINGS)
    for i in range(number_of_pcs):
        pc = i + 1
        plot = manhattan_loadings(
            pvals=hl.abs(loadings_ht.loadings[i]),
            locus=loadings_ht.locus,
            title='Loadings of PC ' + str(pc),
            collect_all=True,
        )
        save_plot(plot, f'loadings_pc{pc}')
def _bokeh_timeseries_plots(varnames, time_units, var_units, phase_names, phases_node_path,
                            last_solution_case, last_simulation_case, plot_dir_path,
                            num_cols=2, bg_fill_color='#282828', grid_line_color='#666666',
                            open_browser=False):
    """
    Build a Bokeh grid of timeseries figures, one per variable, with a shared legend.

    Solution values are drawn as circles and, when ``last_simulation_case`` is truthy,
    simulation values as thin lines. Each phase gets its own color. Output goes to the
    notebook when ``dymos_options['notebook_mode']`` is set, otherwise to ``plots.html``
    inside ``plot_dir_path``; the result is shown if in notebook mode or ``open_browser``
    is truthy, and saved otherwise.

    Parameters
    ----------
    varnames : iterable of str
        Names of the timeseries variables to plot.
    time_units : dict
        Maps each variable name to the units string used in the time axis label.
    var_units : dict
        Maps each variable name to the units string used in the y axis label.
    phase_names : sequence of str
        Names of the phases to plot.
    phases_node_path : str
        Path prefix of the phases in the case outputs; may be empty.
    last_solution_case : object
        Object whose ``outputs`` mapping holds the solution values.
    last_simulation_case : object or None
        Object whose ``outputs`` mapping holds the simulation values, if any.
    plot_dir_path : str
        Directory in which ``plots.html`` is written when not in notebook mode.
    num_cols : int
        Number of columns in the grid of figures.
    bg_fill_color : str
        Background fill color of the figures.
    grid_line_color : str
        Color of the grid lines.
    open_browser : bool
        If True, show the plots rather than only saving them.
    """
    from bokeh.io import output_notebook, output_file, save, show
    from bokeh.layouts import gridplot, column, row, grid, layout
    from bokeh.models import Legend, LegendItem
    from bokeh.plotting import figure
    import bokeh.palettes as bp

    if dymos_options['notebook_mode']:
        output_notebook()
    else:
        output_file(os.path.join(plot_dir_path, 'plots.html'))

    # Ask for two extra colors and drop the first and last, pruning the edges of the map.
    palette = bp.turbo(len(phase_names) + 2)[1:-1]

    # Output names are '<node path>.<phase>.timeseries.<name>', or without the node path
    # when it is empty.
    node_prefix = f'{phases_node_path}.' if phases_node_path else ''

    # Find the overall time span across all phases, so a variable that only exists in a
    # few phases is still plotted against the entire time range.
    t_min = 1.0E21
    t_max = -1.0E21
    phase_colors = {}
    for phase_name, phase_color in zip(phase_names, palette):
        phase_times = last_solution_case.outputs[f'{node_prefix}{phase_name}.timeseries.time']
        t_min = min(t_min, np.min(phase_times))
        t_max = max(t_max, np.max(phase_times))
        phase_colors[phase_name] = phase_color

    pad = 0.05 * (t_max - t_min)

    figures = []
    solution_glyphs = {}
    simulation_glyphs = {}

    for var_name in varnames:
        x_label = f'time ({time_units[var_name]})'
        y_label = f'{var_name} ({var_units[var_name]})'

        fig = figure(title=f'timeseries.{var_name}',
                     background_fill_color=bg_fill_color,
                     x_range=(t_min - pad, t_max + pad),
                     plot_width=180, plot_height=180)
        fig.xaxis.axis_label = x_label
        fig.yaxis.axis_label = y_label
        fig.xgrid.grid_line_color = grid_line_color
        fig.ygrid.grid_line_color = grid_line_color

        for phase_name, phase_color in zip(phase_names, palette):
            var_path = f'{node_prefix}{phase_name}.timeseries.{var_name}'
            time_path = f'{node_prefix}{phase_name}.timeseries.time'

            # Phases that do not provide this variable are left out of the figure.
            if var_path not in last_solution_case.outputs:
                continue

            sol_values = last_solution_case.outputs[var_path]
            sol_times = last_solution_case.outputs[time_path]

            # One set of circles per index over the trailing dimensions of the values.
            for idxs, _ in np.ndenumerate(np.zeros(sol_values.shape[1:])):
                solution_glyphs[phase_name] = fig.circle(sol_times.ravel(),
                                                         sol_values[:, idxs].ravel(),
                                                         size=5,
                                                         color=phase_color,
                                                         name='sol:' + phase_name)

            if not last_simulation_case:
                continue

            # When phases_node_path is empty the simulation outputs are prefixed with
            # "sim_traj.", as that is pre-pended in Trajectory.simulate code.
            sim_prefix = '' if phases_node_path else 'sim_traj.'
            sim_values = last_simulation_case.outputs[sim_prefix + var_path]
            sim_times = last_simulation_case.outputs[sim_prefix + time_path]

            for idxs, _ in np.ndenumerate(np.zeros(sim_values.shape[1:])):
                simulation_glyphs[phase_name] = fig.line(sim_times.ravel(),
                                                         sim_values[:, idxs].ravel(),
                                                         line_dash='solid',
                                                         line_width=0.5,
                                                         color=phase_color,
                                                         name='sim:' + phase_name)

        figures.append(fig)

    # A single legend for all figures, following the example here:
    # https://stackoverflow.com/a/56825812/754536

    # The legend lives in a dummy figure whose own decorations are hidden.
    dum_fig = figure(outline_line_alpha=0, toolbar_location=None,
                     background_fill_color=bg_fill_color,
                     plot_width=250, max_width=250)
    for hidden in (dum_fig.grid, dum_fig.ygrid, dum_fig.xaxis, dum_fig.yaxis):
        hidden.visible = False

    # The glyphs referred to by the legend need to be present in the figure that holds
    # the legend, so add one placeholder glyph per phase: all circles first, then all lines.
    sol_entries = []
    for phase_name in phase_names:
        marker = dum_fig.circle([0], [0], size=5, color=phase_colors[phase_name],
                                tags=['sol:' + phase_name])
        sol_entries.append((phase_name + ' solution', [marker]))

    sim_entries = []
    for phase_name in phase_names:
        trace = dum_fig.line([0], [0], line_dash='solid', line_width=0.5,
                             color=phase_colors[phase_name],
                             tags=['sim:' + phase_name])
        sim_entries.append((phase_name + ' simulation', [trace]))

    # Interleave so each phase lists its solution entry followed by its simulation entry.
    legend_items = []
    for sol_entry, sim_entry in zip(sol_entries, sim_entries):
        legend_items.append(sol_entry)
        legend_items.append(sim_entry)

    # Set the figure range outside of the range of all glyphs.
    dum_fig.x_range.end = 1005
    dum_fig.x_range.start = 1000

    legend = Legend(click_policy='hide', location='top_left', border_line_alpha=0,
                    items=legend_items, background_fill_alpha=0.0,
                    label_text_color='white', label_width=120, spacing=10)
    dum_fig.add_layout(legend, place='center')

    gd = gridplot(figures, ncols=num_cols, sizing_mode='scale_both')
    plots = gridplot([[gd, column(dum_fig, sizing_mode='stretch_height')]],
                     toolbar_location=None, sizing_mode='scale_both')

    if dymos_options['notebook_mode'] or open_browser:
        show(plots)
    else:
        save(plots)
def _plot_pc_pairs(
    pca_scores,
    variance,
    number_of_pcs,
    labels,
    title,
    factors=None,
    hover_fields=None,
):
    """Show one scatter plot per consecutive pair of principal components.

    :param pca_scores: indexable per-sample PCA scores; ``pca_scores[i]`` is
        passed to ``hl.plot.scatter`` as one axis.
    :param variance: percent variance explained, indexed by 0-based component.
    :param number_of_pcs: number of plots to draw; plot ``i`` (0-based) puts
        component ``i`` on x and component ``i + 1`` on y.
    :param labels: expression passed as ``label`` to ``hl.plot.scatter``.
    :param title: plot title.
    :param factors: optional list of category values. When given, the plot is
        drawn with ``collect_all=True`` and a turbo-palette
        ``CategoricalColorMapper`` built over these values.
    :param hover_fields: optional dict forwarded as ``hover_fields``.
    """
    # NOTE(review): the last iteration reads component index
    # ``number_of_pcs`` (i.e. PC number_of_pcs + 1). Confirm the scores and
    # eigenvalues hold at least number_of_pcs + 1 components, or whether the
    # range was meant to stop one earlier.
    for first in range(number_of_pcs):
        second = first + 1
        print(f'PC{first + 1} vs PC{second + 1}')

        extra = {}
        if factors is not None:
            extra['collect_all'] = True
            # Build a fresh mapper per plot so no Bokeh model is shared
            # between separately shown figures.
            extra['colors'] = CategoricalColorMapper(
                palette=turbo(len(factors)), factors=factors
            )
        if hover_fields is not None:
            extra['hover_fields'] = hover_fields

        plot = hl.plot.scatter(
            pca_scores[first],
            pca_scores[second],
            label=labels,
            title=title,
            xlabel=f'PC{first + 1} ({variance[first]!s}%)',
            ylabel=f'PC{second + 1} ({variance[second]!s}%)',
            **extra,
        )
        show(plot)


def main(number_of_pcs: int):  # pylint: disable=too-many-locals
    """Query script entry point.

    Reads the matrix table at ``HGDP1KG_TOBWGS`` and the PCA scores table at
    ``SCORES``, reads eigenvalues from the CSV at ``EIGENVALUES``, and shows
    three series of PCA scatter plots: labelled by study (whether the sample
    ID contains ``'TOB'``), by continental population, and by subpopulation.

    :param number_of_pcs: number of consecutive-component plots to draw in
        each of the three series.
    """
    hl.init()

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)
    mt = mt.annotate_cols(scores=scores[mt.s].scores)
    mt = mt.annotate_cols(TOB_WGS=mt.s.contains('TOB'))

    # PCA plot must all come from the same object
    columns = mt.cols()
    pca_scores = columns.scores

    # Percent variance explained by each component, rounded to 2 decimals.
    eigenvalues = pd.read_csv(EIGENVALUES)
    eigenvalues.columns = ['eigenvalue']
    eigenvalue_column = eigenvalues['eigenvalue']
    # Sum the column itself: float() on the one-element Series returned by
    # DataFrame.sum() is deprecated in recent pandas.
    total = float(eigenvalue_column.sum())
    variance = (eigenvalue_column.divide(total) * 100).round(2)

    print('Making PCA plots labelled by the study ID')
    _plot_pc_pairs(
        pca_scores,
        variance,
        number_of_pcs,
        labels=columns.TOB_WGS,
        title='TOB-WGS',
    )

    print('Making PCA plots labelled by the continental population')
    labels = columns.hgdp_1kg_metadata.population_inference.pop
    pops = list(set(labels.collect()))
    _plot_pc_pairs(
        pca_scores,
        variance,
        number_of_pcs,
        labels=labels,
        title='Continental Population',
        factors=pops,
        hover_fields={'s': columns.s},
    )

    print('Making PCA plots labelled by the subpopulation')
    labels = columns.hgdp_1kg_metadata.labeled_subpop
    pops = list(set(labels.collect()))
    _plot_pc_pairs(
        pca_scores,
        variance,
        number_of_pcs,
        labels=labels,
        title='Sub-Population',
        factors=pops,
    )