def combine_replicates(TaXon_table_xlsx, suffix_list, path_to_outdirs):
    """Merge technical replicates of a TaXon table by summing their read columns.

    For every base sample name (column name minus its trailing replicate
    suffix) the replicate columns ``<sample>_<suffix>`` are summed into a
    single ``<sample>_comb`` column and the originals are dropped. The result
    is written to ``<path_to_outdirs>/TaXon_tables/<stem>_derep.xlsx``.

    Parameters:
        TaXon_table_xlsx: path to the input TaXon table (.xlsx).
        suffix_list: replicate suffixes, e.g. ["a", "b"].
        path_to_outdirs: project output directory root.
    """
    import PySimpleGUI as sg
    import pandas as pd
    from pathlib import Path

    TaXon_table_file = Path(TaXon_table_xlsx)
    output_file = Path(
        str(path_to_outdirs) + "/" + "TaXon_tables" + "/" +
        str(TaXon_table_file.stem) + "_derep.xlsx")

    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_xlsx)
    df = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)

    ## base sample name = column name with the trailing replicate suffix removed
    ## (columns 0-9 are taxonomy/metadata; samples start at column 10)
    sample_names = df.columns[10:]
    unique_sample_names_list = [
        "_".join(sample.split("_")[0:-1]) for sample in sample_names
    ]
    unique_sample_names_set = sorted(set(unique_sample_names_list))

    for sample in unique_sample_names_set:
        replicate_names_list = [sample + "_" + suffix for suffix in suffix_list]
        combined = sample + "_comb"
        try:
            ## sum all replicates into one column, then drop the originals
            df[combined] = df[replicate_names_list].sum(axis=1)
            df = df.drop(replicate_names_list, axis=1)
        except KeyError:
            ## at least one expected replicate column does not exist
            print("Warning! No replicates found for: " + sample)

    df.to_excel(output_file, index=False, sheet_name='TaXon table')

    closing_text = "Taxon table is found under:\n" + '/'.join(
        str(output_file).split("/")[-4:])
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    from taxontabletools.create_log import ttt_log
    ttt_log("replicate merging", "processing", TaXon_table_file.name,
            output_file.name, "nan", path_to_outdirs)
def convert_to_presence_absence(TaXon_table_xlsx, path_to_outdirs):
    """Convert a TaXon table's read counts to presence/absence (1/0) values.

    The first 10 columns (taxonomy/metadata) are kept as-is; every sample
    column value is replaced by 1 when reads are non-zero, else 0. The result
    is written to ``<path_to_outdirs>/TaXon_tables/<stem>_pa.xlsx``.

    Parameters:
        TaXon_table_xlsx: path to the input TaXon table (.xlsx).
        path_to_outdirs: project output directory root.
    """
    from pathlib import Path
    import PySimpleGUI as sg
    import pandas as pd

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0)

    ## keep the 10 leading metadata columns, binarise the sample columns
    presence_absence_list = [
        row[0:10] + [1 if reads != 0 else 0 for reads in row[10:]]
        for row in TaXon_table_df.values.tolist()
    ]
    df_pa = pd.DataFrame(presence_absence_list,
                         columns=TaXon_table_df.columns.tolist())

    output_file = Path(
        str(path_to_outdirs) + "/" + "TaXon_tables" + "/" +
        TaXon_table_xlsx.stem + "_pa.xlsx")
    df_pa.to_excel(output_file, index=False, sheet_name='TaXon table')

    closing_text = "Presence absence tables is found in: " + str(
        path_to_outdirs) + "/TaXon_tables/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    from taxontabletools.create_log import ttt_log
    ttt_log("presence absence conversion", "processing",
            TaXon_table_xlsx.name, output_file.name, "nan", path_to_outdirs)
def _venn_taxa_labels(df, taxon):
    """Return the sorted unique string entries of one taxonomy column, 'nan' dropped."""
    return sorted({str(t) for t in df[taxon].values.tolist() if str(t) != "nan"})


def venn_diagram(file_a, file_b, file_c, venn_diagram_name, path_to_outdirs, clustering_unit):
    """Draw two- or three-way Venn diagrams for all taxonomic levels of two
    (or three) TaXon tables and write the underlying sets to a spreadsheet.

    Parameters:
        file_a, file_b: paths to the TaXon tables to compare.
        file_c: path to a third table, or ``False`` for a two-way comparison.
        venn_diagram_name: name of the output sub-folder under Venn_diagrams.
        path_to_outdirs: project output directory root.
        clustering_unit: label of the OTU-like level (e.g. "OTUs", "ESVs").

    Raises:
        RuntimeError: when the user cancels, or a third file is expected but empty.
    """
    import os
    import PySimpleGUI as sg
    import pandas as pd
    import matplotlib.pyplot as plt
    from matplotlib_venn import venn2, venn3
    from pathlib import Path

    file_a = Path(file_a)
    file_b = Path(file_b)
    venn_font = 20

    ## file_c == False -> two-way comparison; anything else must name a third table
    two_way = file_c == False
    if not two_way:
        if file_c == '':
            sg.PopupError("Please provide a file", keep_on_top=True)
            raise RuntimeError()
        file_c = Path(file_c)

    allowed_taxa = [
        "A_Phylum", "B_Class", "C_Order", "D_Family", "E_Genus", "F_Species",
        "G_" + clustering_unit
    ]
    venn_dict = {}

    ## read each TaXon table once (previously every file was re-read from disk
    ## on each of the 7 taxonomic-level iterations)
    data_file_a = pd.read_excel(file_a, 'TaXon table', header=0)
    data_file_b = pd.read_excel(file_b, 'TaXon table', header=0)
    if not two_way:
        data_file_c = pd.read_excel(file_c, 'TaXon table', header=0)

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 167 * 2
    ############################################################################

    ## one sub-folder per named comparison
    dirName = Path(str(path_to_outdirs) + "/Venn_diagrams/" + venn_diagram_name)
    if not os.path.exists(dirName):
        os.mkdir(dirName)

    for output_name in allowed_taxa:
        ## strip the sort prefix ("F_Species" -> "Species"); OTU-like levels
        ## are stored in the "ID" column but keep their own column label
        taxon = output_name[2:]
        col_name = taxon
        if taxon in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
            taxon = "ID"

        labels_a = set(_venn_taxa_labels(data_file_a, taxon))
        labels_b = set(_venn_taxa_labels(data_file_b, taxon))

        plt.figure(figsize=(20, 10))
        if two_way:
            a_only = labels_a - labels_b
            b_only = labels_b - labels_a
            shared = labels_a & labels_b
            venn_dict[col_name + "_a_only"] = a_only
            venn_dict[col_name + "_shared"] = shared
            venn_dict[col_name + "_b_only"] = b_only
            ## venn2 subset order: (Ab, aB, AB)
            out = venn2(subsets=(len(a_only), len(b_only), len(shared)),
                        set_labels=(file_a.stem, file_b.stem))
        else:
            labels_c = set(_venn_taxa_labels(data_file_c, taxon))
            a_only = labels_a - labels_b - labels_c
            b_only = labels_b - labels_a - labels_c
            c_only = labels_c - labels_a - labels_b
            shared_all = labels_a & labels_b & labels_c
            shared_a_b = (labels_a & labels_b) - labels_c
            shared_a_c = (labels_a & labels_c) - labels_b
            shared_b_c = (labels_b & labels_c) - labels_a
            venn_dict[col_name + "_a_only"] = a_only
            venn_dict[col_name + "_b_only"] = b_only
            venn_dict[col_name + "_c_only"] = c_only
            venn_dict[col_name + "_shared_all"] = shared_all
            venn_dict[col_name + "_shared_a_b"] = shared_a_b
            venn_dict[col_name + "_shared_a_c"] = shared_a_c
            venn_dict[col_name + "_shared_b_c"] = shared_b_c
            ## venn3 subset order: (Abc, aBc, ABc, abC, AbC, aBC, ABC)
            out = venn3(subsets=(len(a_only), len(b_only), len(shared_a_b),
                                 len(c_only), len(shared_a_c), len(shared_b_c),
                                 len(shared_all)),
                        set_labels=(file_a.stem, file_b.stem, file_c.stem))

        for text in out.set_labels:
            text.set_fontsize(venn_font)
        for subset_label in out.subset_labels:
            if subset_label is not None:
                subset_label.set_fontsize(venn_font)

        output_pdf = Path(str(dirName) + "/" + output_name + ".pdf")
        plt.title(output_name[2:])
        plt.savefig(output_pdf, bbox_inches='tight')

        ## optionally display the species-level diagram before moving on
        if taxon == "Species":
            answer = sg.PopupYesNo('Show last plot?', keep_on_top=True)
            if answer == "Yes":
                plt.show(block=False)
                sg.Popup("Close")
        plt.close()

        ############################################################################
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar so that it eventually reaches the maximum
        progress_update += 167
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    ## write all computed sets to one comparison spreadsheet
    output_xlsx = Path(str(dirName) + "/" + "Venn_comparison_results.xlsx")
    df = pd.DataFrame.from_dict(venn_dict, orient='index').transpose()
    df.to_excel(output_xlsx, index=False)

    sg.Popup("Venn diagrams are found in", path_to_outdirs, "Venn_diagrams/",
             title="Finished", keep_on_top=True)

    from taxontabletools.create_log import ttt_log
    ttt_log("venn diagram", "analysis", file_a.name, output_xlsx.name,
            venn_diagram_name, path_to_outdirs)
    ttt_log("venn diagram", "analysis", file_b.name, output_xlsx.name,
            venn_diagram_name, path_to_outdirs)
    if not two_way:
        ttt_log("venn diagram", "analysis", file_c.name, output_xlsx.name,
                venn_diagram_name, path_to_outdirs)
def replicate_analysis(TaXon_table_xlsx, height, width, suffix_list, path_to_outdirs, template, theme, font_size, custom_colors, clustering_unit):
    """Analyse agreement between technical replicates of each sample.

    Produces three plots and a statistics file under
    ``<path_to_outdirs>/Replicate_analysis/<table name>/``:
      1. percentage of OTUs shared between all replicates, per sample;
      2. percentage of reads belonging to those shared OTUs, per sample;
      3. shared vs non-shared OTUs binned by relative read abundance.

    Parameters:
        TaXon_table_xlsx: path to the input TaXon table (.xlsx).
        height, width: plot dimensions (coerced to int).
        suffix_list: replicate suffixes appended to each base sample name.
        path_to_outdirs: project output directory root.
        template, theme, font_size, custom_colors: plotly styling options;
            theme is (bar color, bar line color, opacity).
        clustering_unit: label of the OTU-like level (e.g. "OTUs").

    Raises:
        RuntimeError: when the user cancels the progress dialog.
    """
    import os, webbrowser
    import PySimpleGUI as sg
    import pandas as pd
    from statistics import mean
    from pathlib import Path
    import plotly.express as px
    import plotly.graph_objects as go

    ## unpack plot theme
    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]
    height = int(height)
    width = int(width)

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx)
    sample_names = TaXon_table_df.columns[10:].tolist()

    ## base sample names = column name minus the trailing replicate suffix
    unique_sample_names_list = [
        "_".join(sample.split("_")[0:-1]) for sample in sample_names
    ]
    unique_sample_names_set = sorted(set(unique_sample_names_list))

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(unique_sample_names_set) + 1
    ############################################################################

    replicate_perc_shared_dict = {}
    fig_main_dict = {}
    reads_dict = {}

    ## create an output folder named after the input table
    replicate_analysis_name = Path(TaXon_table_xlsx).name.replace(".xlsx", "")
    dirName = Path(
        str(path_to_outdirs) + "/Replicate_analysis/" + replicate_analysis_name)
    if not os.path.exists(dirName):
        os.mkdir(dirName)

    for sample in unique_sample_names_set:
        replicate_names_list = [sample + "_" + suffix for suffix in suffix_list]
        try:
            ## KeyError here when a replicate column is missing for this sample
            rows = TaXon_table_df[replicate_names_list].values.tolist()

            ## shared = OTU present in every replicate; present = in at least one
            shared_OTUs_list = [row for row in rows if 0 not in row]
            present_OTUs_list = [
                row for row in rows if row != [0] * len(replicate_names_list)
            ]
            perc_shared = round(
                len(shared_OTUs_list) / len(present_OTUs_list) * 100, 2)
            replicate_perc_shared_dict[sample] = perc_shared

            ## percentage of reads kept (in shared OTUs) vs discarded
            reads_total = sum(sum(row) for row in rows)
            reads_kept_perc = round(
                sum(sum(row) for row in shared_OTUs_list) / reads_total * 100, 2)
            reads_discarded_perc = round(100 - reads_kept_perc, 2)
            reads_dict[sample] = [reads_kept_perc, reads_discarded_perc]

            ## per-OTU relative abundance, tagged shared / non-shared
            for i, OTU in enumerate(present_OTUs_list):
                if 0 not in OTU:
                    fig_main_dict[i + 1, sample, "Blue", "shared"] = [
                        sum(OTU) / reads_total * 100
                    ]
                else:
                    fig_main_dict[i + 1, sample, "Red", "non-shared"] = [
                        sum(OTU) / reads_total * 100
                    ]
        except (KeyError, ZeroDivisionError):
            ## missing replicate columns, or a sample with zero reads overall
            print("Warning! No replicates found for: " + sample)

        ############################################################################
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar so that it eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    ########################################################################################################################
    ## figure 1: shared OTUs per sample
    samples = list(replicate_perc_shared_dict.keys())
    shared_otus = list(replicate_perc_shared_dict.values())
    y_title = "shared " + clustering_unit
    fig = px.bar(x=samples, y=shared_otus,
                 labels={"y": y_title, "x": "Sample", "text": y_title},
                 text=shared_otus)
    y_title = 'shared ' + clustering_unit + ' (%)'
    fig.update_yaxes(title=y_title, range=[0, 100], dtick=10, autorange=False)
    fig.update_xaxes(title='', tickmode='linear')
    fig.update_xaxes(tickangle=-90)
    fig.update_layout(width=int(width), height=int(height), template=template,
                      font_size=font_size, title_font_size=font_size)
    fig.update_traces(marker_color=color1, marker_line_color=color2,
                      marker_line_width=1.5, opacity=opacity_value)

    output_pdf = Path(str(dirName) + "/" + TaXon_table_xlsx.stem + "_shared_" +
                      clustering_unit + ".pdf")
    output_html = Path(str(dirName) + "/" + TaXon_table_xlsx.stem + "_shared_" +
                       clustering_unit + ".html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ########################################################################################################################
    ## figure 2: kept (shared) reads per sample
    samples = list(reads_dict.keys())
    shared_reads = [reads[0] for reads in list(reads_dict.values())]
    fig = px.bar(x=samples, y=shared_reads,
                 labels={"y": "shared reads (%)", "x": "Sample",
                         "text": "shared reads (%)"},
                 text=shared_reads)
    fig.update_yaxes(title='shared reads (%)', range=[0, 100], dtick=10,
                     autorange=False)
    fig.update_xaxes(title='', tickmode='linear')
    fig.update_xaxes(tickangle=-90)
    fig.update_layout(width=int(width), height=int(height), template=template,
                      font_size=font_size, title_font_size=font_size)
    fig.update_traces(marker_color=color1, marker_line_color=color2,
                      marker_line_width=1.5, opacity=opacity_value)

    output_pdf2 = Path(str(dirName) + "/" + TaXon_table_xlsx.stem + "_shared" +
                       clustering_unit + "_reads.pdf")
    output_html2 = Path(str(dirName) + "/" + TaXon_table_xlsx.stem + "_shared" +
                        clustering_unit + "_reads.html")
    fig.write_image(str(output_pdf2))
    fig.write_html(str(output_html2))

    ########################################################################################################################
    ## figure 3: shared vs non-shared OTUs binned by read abundance

    ## sort the per-OTU abundances, highest first
    fig_main_dict_sorted = dict(
        sorted(fig_main_dict.items(), key=lambda item: item[1], reverse=True))
    y1, y2 = [], []
    for key, value in fig_main_dict_sorted.items():
        if key[3] == 'shared':
            y1 = y1 + value
        else:
            y2 = y2 + value

    categories = [[100, 10], [10, 1], [1, 0.1], [0.1, 0]]
    n_OTUs_shared, n_OTUs_nonshared, names = [], [], []
    fig = go.Figure()
    for category in categories:
        upper = category[0]
        lower = category[1]
        ## NOTE(review): strict comparisons — a value exactly on a bin edge
        ## falls into no bin; kept as-is to preserve existing binning
        shared = len([y for y in y1 if (y > lower and y < upper)])
        nonshared = len([y for y in y2 if (y > lower and y < upper)])
        n_OTUs = shared + nonshared
        ## guard empty bins (previously raised ZeroDivisionError)
        shared_perc = shared / n_OTUs * 100 if n_OTUs else 0
        nonshared_perc = nonshared / n_OTUs * 100 if n_OTUs else 0
        n_OTUs_shared.append(shared_perc)
        n_OTUs_nonshared.append(nonshared_perc)
        if category != [0.1, 0]:
            text = str(category[0]) + "%-" + str(category[1]) + "%"
        else:
            text = "<0.1%"
        names.append(text)
        fig.add_annotation(x=text, y=100, text="n=" + str(n_OTUs),
                           font=dict(size=font_size - 2), showarrow=False,
                           yshift=10)

    fig.add_trace(go.Bar(x=names, y=n_OTUs_shared, name='shared',
                         marker_color=custom_colors[0]))
    fig.add_trace(go.Bar(x=names, y=n_OTUs_nonshared, name='non-shared',
                         marker_color=custom_colors[1]))
    fig.update_layout(width=int(width), height=int(height), template=template,
                      font_size=font_size, title_font_size=font_size)
    y_title = clustering_unit + ' per bin (%)'
    fig.update_yaxes(title=y_title)
    fig.update_xaxes(title='read abundance')

    output_pdf3 = Path(str(dirName) + "/" + TaXon_table_xlsx.stem +
                       "_shared_nonshared.pdf")
    output_html3 = Path(str(dirName) + "/" + TaXon_table_xlsx.stem +
                        "_shared_nonshared.html")
    fig.write_image(str(output_pdf3))
    fig.write_html(str(output_html3))

    ########################################################################################################################
    ## write statistics file
    output_txt = Path(str(dirName) + "/" + TaXon_table_xlsx.stem + "_stats.txt")
    avg_shared_otus = round(mean(shared_otus), 2)
    avg_shared_reads = round(mean(shared_reads), 2)
    n_samples = len(samples)
    text = ("Average shared " + clustering_unit + ": " + str(avg_shared_otus) +
            "%\n" + "Average shared reads: " + str(avg_shared_reads) + "%\n" +
            "Number of samples: " + str(n_samples))
    with open(output_txt, "w") as f:
        f.write(text)

    ## ask to show the plots
    answer = sg.PopupYesNo(text + '\n\nShow all three plots?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html3))
        webbrowser.open('file://' + str(output_html2))
        webbrowser.open('file://' + str(output_html))

    ## print closing text
    closing_text = ("The three plots are found under:\n" +
                    "Projects/Replicate_analysis/")
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("replicate analysis", "analysis", TaXon_table_xlsx.name,
            output_pdf.name, "nan", path_to_outdirs)
def rarefaction_curve_taxa(TaXon_table_xlsx, repetitions, path_to_outdirs, template, font_size, taxonomic_level_1, taxonomic_level_2, color_discrete_sequence):
    """Draw rarefaction curves (taxa discovered vs samples drawn), one trace
    per taxon of ``taxonomic_level_2``, counting taxa of ``taxonomic_level_1``.

    Samples are drawn without replacement in random order; this is repeated
    ``repetitions`` times and the mean and standard deviation per draw index
    are plotted with error bars.

    Parameters:
        TaXon_table_xlsx: path to the input TaXon table (.xlsx).
        repetitions: number of random draw orders to average over.
        path_to_outdirs: project output directory root.
        template, font_size: plotly styling options.
        taxonomic_level_1: level that is counted (or an OTU-like label,
            which maps to the "ID" column).
        taxonomic_level_2: level that groups the curves (one trace each).
        color_discrete_sequence: colors cycled over the traces.
    """
    import random
    import webbrowser
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    from pathlib import Path

    ## load the TaXon table; normalise missing values to the string "nan"
    TaXon_table_file = Path(TaXon_table_xlsx)
    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_xlsx)
    df = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)
    df = df.replace(np.nan, "nan")

    ## y-axis label; OTU-like levels are stored in the "ID" column
    taxon_title = taxonomic_level_1.lower()
    if taxonomic_level_1 in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level_1
        taxonomic_level_1 = "ID"

    ## samples start at column 10 (columns 0-9 are taxonomy/metadata)
    available_samples = df.columns.tolist()[10:]
    available_taxa = [
        taxon for taxon in set(df[taxonomic_level_2].values.tolist())
        if taxon != "nan"
    ]

    ## one fixed color per taxon; repeat the sequence so it cannot run out
    color_discrete_sequence = color_discrete_sequence * len(available_taxa)
    color_dict = {
        taxon: color_discrete_sequence[i]
        for i, taxon in enumerate(available_taxa)
    }

    ## per-taxon list of mean counts per draw
    increase_dict = {}
    fig = go.Figure()

    for taxon in sorted(available_taxa):
        df_filtered = df.loc[df[taxonomic_level_2] == taxon]

        ## taxa of taxonomic_level_1 present (reads != 0) in each sample
        sample_dict_clean = {}
        for sample in available_samples:
            pairs = df_filtered[[sample, taxonomic_level_1]].values.tolist()
            sample_dict_clean[sample] = list({
                pair[1] for pair in pairs if pair[0] != 0 and pair[1] != "nan"
            })

        ## draw once for each sample
        number_of_draws = len(sample_dict_clean)

        ## draw_dictionary[i] collects, over all repetitions, the cumulative
        ## number of unique taxa seen after i+1 draws
        draw_dictionary = {}
        for _rep in range(repetitions):
            ## copy: samples are removed as they are drawn (no replacement)
            remaining_samples = dict(sample_dict_clean)
            cumulative_taxa = []
            for i in range(number_of_draws):
                random_choice = random.choice(list(remaining_samples.keys()))
                cumulative_taxa = cumulative_taxa + sample_dict_clean[random_choice]
                draw_dictionary.setdefault(i, []).append(len(set(cumulative_taxa)))
                remaining_samples.pop(random_choice)

        ## mean and standard deviation per draw index
        rarefaction_dict_average, rarefaction_dict_stdef = {}, {}
        for key, counts in draw_dictionary.items():
            rarefaction_dict_average[key] = sum(counts) / len(counts)
            rarefaction_dict_stdef[key] = np.std(counts, dtype=np.float64)

        ## add the taxon's trace with error bars
        draws = [i + 1 for i in rarefaction_dict_average.keys()]
        mean_taxa_counts = list(rarefaction_dict_average.values())
        increase_dict[taxon] = mean_taxa_counts
        error_bar = list(rarefaction_dict_stdef.values())
        fig.add_trace(
            go.Scatter(x=draws, y=mean_taxa_counts, name=taxon,
                       marker_color=color_dict[taxon],
                       error_y=dict(type='data', array=error_bar,
                                    thickness=0.5, width=3, visible=True)))

    ## update figure (using `repetitions` directly avoids a NameError when
    ## the taxon loop body never ran)
    y_axis_title = "# " + taxon_title
    fig.update_layout(title_text="repetitions = " + str(repetitions),
                      yaxis_title=y_axis_title, xaxis_title="# samples")
    fig.update_layout(height=700, width=1200, template="simple_white",
                      showlegend=True, font_size=font_size,
                      title_font_size=font_size)
    fig.update_xaxes(rangemode="tozero")
    fig.update_yaxes(rangemode="tozero")
    fig.update_layout(height=800, width=1200, template=template,
                      showlegend=True, font_size=font_size,
                      title_font_size=font_size)

    ## write files
    out_name = taxonomic_level_1.lower() + "_" + taxonomic_level_2.lower()
    output_pdf = Path(
        str(path_to_outdirs) + "/" + "Rarefaction_curves" + "/" +
        TaXon_table_file.name + "_rarefaction_" + out_name + ".pdf")
    output_html = Path(
        str(path_to_outdirs) + "/" + "Rarefaction_curves" + "/" +
        TaXon_table_file.name + "_rarefaction_" + out_name + ".html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## print closing text
    closing_text = "Rarefaction curves are found in: " + str(
        path_to_outdirs) + "/rarefaction_curves/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("rarefaction curve per taxon", "analysis", TaXon_table_file.name,
            output_pdf.name, "nan", path_to_outdirs)
def create_krona_chart_multi(TaXon_table_xlsx, path_to_outdirs):
    """Create a multi-sample Krona chart from a TaXon table via ktImportText.

    Writes one taxonomy .tsv per sample (reads replaced by 1 for
    presence/absence tables), then calls Krona's ``ktImportText`` to build a
    single interactive HTML chart.

    Parameters:
        TaXon_table_xlsx: path to the input TaXon table (.xlsx).
        path_to_outdirs: project output directory root.

    Raises:
        RuntimeError: when Krona tools is not installed / not on the PATH.
    """
    import os
    import subprocess
    import webbrowser
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    from pathlib import Path

    ## verify that Krona tools is available before doing any work;
    ## DEVNULL avoids the file-handle leak of open(os.devnull, 'wb')
    try:
        subprocess.call(["ktImportText"], stdout=subprocess.DEVNULL)
    except OSError:
        sg.PopupError(
            "Krona tools must be manually installed first!" + "\n" * 2 +
            "Note: Krona tools is currently not supported on Windows!" + "\n",
            title="Error")
        raise RuntimeError("Krona tools needs to be installed")

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx)
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    TaXon_table_df = TaXon_table_df.replace(np.nan, '__', regex=True)
    samples = TaXon_table_df.columns.tolist()[10:]
    columns = TaXon_table_df.columns.tolist()[:10]

    ## detect presence/absence tables: every sample value is 0 or 1
    pa_test = set([
        val for sublist in TaXon_table_df[TaXon_table_samples].values.tolist()
        for val in sublist
    ])
    pa_data = pa_test == {1, 0}

    ## create an output folder
    krona_chart_name = Path(TaXon_table_xlsx).name.replace(".xlsx", "")
    dirName = Path(str(path_to_outdirs) + "/Krona_charts/" + krona_chart_name)
    if not os.path.exists(dirName):
        os.mkdir(dirName)

    ## write a separate tsv file for each sample in the TaXon table
    sample_tsv_path = []
    for sample in samples:
        krona_taxonomy_list = [
            ["sample-ID", "", "", "", "", "", ""],
            ["count", "phylum", "class", "order", "family", "genus", "species"],
        ]
        for OTU in TaXon_table_df[columns + [sample]].values.tolist():
            taxonomy = OTU[1:7]
            ## the row has 11 entries (10 metadata + this sample), so this
            ## sums exactly the one read count of the current sample
            reads = sum(OTU[10:])
            if reads != 0:
                if pa_data == True:
                    krona_taxonomy_list.append([1] + taxonomy)
                else:
                    krona_taxonomy_list.append([reads] + taxonomy)

        krona_taxonomy_df = pd.DataFrame(krona_taxonomy_list)
        krona_table_tsv = Path(
            str(dirName) + "/" + sample.replace(" ", "_") + "_krona_table.tsv")
        sample_tsv_path.append(str(krona_table_tsv))
        krona_taxonomy_df.to_csv(krona_table_tsv, sep="\t", header=False,
                                 index=False)

    krona_chart_html = Path(str(dirName) + "_krona_multi.html")
    ## argument-list call instead of os.system: no shell parsing, and paths
    ## containing spaces or shell metacharacters are passed through safely
    subprocess.call(["ktImportText"] + sample_tsv_path +
                    ["-o", str(krona_chart_html)])

    ## finish script
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(krona_chart_html))

    closing_text = "Krona chart is found under:\n" + '/'.join(
        str(krona_chart_html).split("/")[-4:])
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    from taxontabletools.create_log import ttt_log
    ttt_log("krona chart", "analysis", TaXon_table_xlsx.name,
            krona_chart_html.name, "nan", path_to_outdirs)
def beta_diversity(TaXon_table_xlsx, width, heigth, cmap, meta_data_to_test, taxonomic_level, path_to_outdirs, template, font_size, diss_metric):
    """Compute a beta-diversity distance matrix and ANOSIM test, plot a heatmap.

    Reads the TaXon table and its companion metadata table, aggregates reads
    at ``taxonomic_level``, computes pairwise ``diss_metric`` distances between
    samples (scikit-bio), runs ANOSIM against ``meta_data_to_test``, and writes
    a plotly heatmap (pdf/html) plus the matrix (xlsx) into ``Beta_diversity``.
    """
    import pandas as pd
    import numpy as np
    # NOTE: this local import shadows the enclosing function's own name inside
    # the body, so the call below resolves to skbio's beta_diversity.
    from skbio.diversity import beta_diversity
    from skbio.stats.distance import anosim
    import plotly.express as px
    from pathlib import Path
    import PySimpleGUI as sg
    import webbrowser
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    # Metadata table is expected beside the outputs, named <table>_metadata.xlsx.
    Meta_data_table_xlsx = Path(str(path_to_outdirs) + "/" + "Meta_data_table" + "/" + TaXon_table_xlsx.stem + "_metadata.xlsx")
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0).fillna("unidentified")
    # Columns 0-9 are taxonomy/metadata; sample read counts start at column 10.
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx, header=0).fillna("nan")
    Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()
    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()
    metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test)
    ## drop samples with metadata called nan (= empty cell in the metadata table)
    drop_samples = [i[0] for i in Meta_data_table_df.values.tolist() if i[metadata_loc] == "nan"]
    if drop_samples != []:
        ## filter the TaXon table
        TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1)
        TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
        ## also remove OTUs that have zero reads in every remaining sample
        row_filter_list = []
        for row in TaXon_table_df.values.tolist():
            reads = set(row[10:])
            if reads != {0}:
                row_filter_list.append(row)
        columns = TaXon_table_df.columns.tolist()
        TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns)
        # Keep the metadata table in sync with the surviving samples.
        Meta_data_table_df = pd.DataFrame(
            [i for i in Meta_data_table_df.values.tolist() if i[0] not in drop_samples],
            columns=Meta_data_table_df.columns.tolist())
        Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()
        metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()
    ## create a y axis title text
    taxon_title = taxonomic_level
    ## sequence-level categories all map to the "ID" column of the table
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"
    # ANOSIM needs groups: reject metadata that is unique per sample...
    if len(set(Meta_data_table_df[meta_data_to_test])) == len(Meta_data_table_df['Samples'].tolist()):
        sg.Popup("The meta data is unique for all samples. Please adjust the meta data table!", title=("Error"))
        raise RuntimeError
    # ...or identical for all samples (only one group).
    if len(set(Meta_data_table_df[meta_data_to_test])) == 1:
        sg.Popup("The meta data is similar for all samples. Please adjust the meta data table!", title=("Error"))
        raise RuntimeError
    if sorted(TaXon_table_samples) == sorted(Meta_data_table_samples):
        ## collect samples for plot
        samples = Meta_data_table_samples
        ## extract the relevant data
        TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]
        ## define an aggregation function to combine multiple hits of one taxonomic level
        aggregation_functions = {}
        ## 'sum' merges read counts of rows sharing the same taxon
        for sample in samples:
            aggregation_functions[sample] = 'sum'
        ## keep the first taxon name per group
        aggregation_functions[taxonomic_level] = 'first'
        ## create condensed dataframe, one row per taxon
        df_new = TaXon_table_df.groupby(TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)
        if 'unidentified' in df_new.index:
            df_new = df_new.drop('unidentified')
        ## collect reads: one row per sample (samples x taxa), as skbio expects
        data = df_new[samples].transpose().values.tolist()
        ## calculate dissimilarity distances
        dissimilarity_dm = beta_diversity(diss_metric, data, samples)
        anosim_results = anosim(dissimilarity_dm, metadata_list, permutations=999)
        anosim_r = round(anosim_results['test statistic'], 5)
        anosim_p = anosim_results['p-value']
        # Plot title carries the ANOSIM statistics (plotly accepts <br> line breaks).
        textbox = "Anosim (" + meta_data_to_test + ", " + taxon_title + ")<br>" + "R = " + str(anosim_r) + "<br>" + "p = " + str(anosim_p)
        matrix = dissimilarity_dm.data
        matrix_df = pd.DataFrame(matrix)
        matrix_df.columns = samples
        matrix_df.index = samples
        # create plot
        color_label = diss_metric + " distance"
        fig = px.imshow(matrix, x=samples, y=samples, color_continuous_scale=cmap, labels=dict(color=color_label))
        fig.update_layout(height=int(heigth), width=int(width), template=template, showlegend=True, title=textbox, font_size=font_size, title_font_size=font_size)
        # finish script: write the figure and the raw distance matrix
        output_pdf = Path(str(path_to_outdirs) + "/" + "Beta_diversity" + "/" + TaXon_table_xlsx.stem + "_" + meta_data_to_test + "_" + taxon_title + "_" + diss_metric + ".pdf")
        output_html = Path(str(path_to_outdirs) + "/" + "Beta_diversity" + "/" + TaXon_table_xlsx.stem + "_" + meta_data_to_test + "_" + taxon_title + "_" + diss_metric + ".html")
        output_xlsx = Path(str(path_to_outdirs) + "/" + "Beta_diversity" + "/" + TaXon_table_xlsx.stem + "_" + meta_data_to_test + "_" + taxon_title + "_" + diss_metric + ".xlsx")
        fig.write_image(str(output_pdf))
        fig.write_html(str(output_html))
        matrix_df.to_excel(output_xlsx)
        ## ask to show plot
        answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
        if answer == "Yes":
            webbrowser.open('file://' + str(output_html))
        ## write to log file
        sg.Popup("Beta diversity estimate are found in", path_to_outdirs, "/Beta_diversity/", title="Finished", keep_on_top=True)
        from taxontabletools.create_log import ttt_log
        ttt_log("beta diversity", "analysis", TaXon_table_xlsx.name, output_pdf.name, meta_data_to_test, path_to_outdirs)
    else:
        sg.PopupError("Error: The samples between the taxon table and meta table do not match!", keep_on_top=True)
def site_occupancy_heatmap(TaXon_table_xlsx, path_to_outdirs, template, height, width, meta_data_to_test, taxonomic_level, font_size, color_discrete_sequence, add_categories_sum):
    """Draw a presence/absence heatmap of taxa per sample, grouped by metadata.

    Reads the TaXon table plus its companion metadata table, aggregates reads
    at ``taxonomic_level``, builds one heatmap subplot per metadata category
    (optionally plus a per-category summary row when ``add_categories_sum``),
    and writes the figure as pdf/html into ``Site_occupancy_plots``.
    """
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    from pathlib import Path
    import webbrowser, os
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    # Metadata table is expected beside the outputs, named <table>_metadata.xlsx.
    Meta_data_table_xlsx = Path(str(path_to_outdirs) + "/" + "Meta_data_table" + "/" + TaXon_table_xlsx.stem + "_metadata.xlsx")
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0).fillna("unidentified")
    # Columns 0-9 are taxonomy/metadata; sample read counts start at column 10.
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx, header=0).fillna("nan")
    Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()
    ## drop samples with metadata called nan (= empty)
    # NOTE(review): this checks column index 1, not the index of
    # meta_data_to_test as sibling functions do — TODO confirm intended.
    drop_samples = [i[0] for i in Meta_data_table_df.values.tolist() if i[1] == "nan"]
    if drop_samples != []:
        ## filter the TaXon table
        TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1)
        TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
        ## also remove OTUs that have zero reads in every remaining sample
        row_filter_list = []
        for row in TaXon_table_df.values.tolist():
            reads = set(row[10:])
            if reads != {0}:
                row_filter_list.append(row)
        columns = TaXon_table_df.columns.tolist()
        TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns)
        # Keep the metadata table in sync with the surviving samples.
        Meta_data_table_df = pd.DataFrame(
            [i for i in Meta_data_table_df.values.tolist() if i[0] not in drop_samples],
            columns=Meta_data_table_df.columns.tolist())
        Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()
    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()
    ## create a y axis title text
    taxon_title = taxonomic_level
    ## sequence-level categories all map to the "ID" column of the table
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"
    if len(set(metadata_list)) == 1:
        sg.PopupError("Please choose more than one meta data category.")
    else:
        if sorted(TaXon_table_samples) == sorted(Meta_data_table_samples):
            ## define variables
            samples = TaXon_table_samples
            OTU_abundances_dict = {}
            samples_metadata_list = []
            ## extract the relevant data
            TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]
            ## define an aggregation function to combine multiple hits of one taxonomic level
            aggregation_functions = {}
            ## 'sum' merges read counts of rows sharing the same taxon
            for sample in samples:
                aggregation_functions[sample] = 'sum'
            ## keep the first taxon name per group
            aggregation_functions[taxonomic_level] = 'first'
            ## create condensed dataframe, one row per taxon
            TaXon_table_df = TaXon_table_df.groupby(TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)
            if 'unidentified' in TaXon_table_df.index:
                TaXon_table_df = TaXon_table_df.drop('unidentified')
            ## create a list of samples for each metadata category
            category_dict = {}
            for sample, category in zip(Meta_data_table_samples, metadata_list):
                if category not in category_dict.keys():
                    category_dict[category] = [sample]
                else:
                    category_dict[category] = category_dict[category] + [sample]
            ## collect all available taxa
            taxa = TaXon_table_df[taxonomic_level].values.tolist()
            taxon_presence_dict = {}
            n_rows, row_heights = [], []
            # Repeat the palette so indexing by subplot row never runs out of colors.
            color_discrete_sequence = color_discrete_sequence * len(category_dict.keys())
            # Italicize scientific names at genus/species rank (plotly HTML tags).
            if (taxonomic_level == "Species" or taxonomic_level == "Genus"):
                x_values = ["<i>" + taxon + "</i>" for taxon in taxa]
            else:
                x_values = taxa
            # One subplot per category; row heights proportional to sample count.
            if add_categories_sum == True:
                for samples in category_dict.values():
                    row_heights.append(len(samples))
                row_heights.append(len(set(metadata_list)))
                fig = make_subplots(rows=len(set(metadata_list)) + 1, cols=1, shared_xaxes=True, vertical_spacing=0.05, row_heights=row_heights)
            else:
                for samples in category_dict.values():
                    row_heights.append(len(samples))
                fig = make_subplots(rows=len(set(metadata_list)), cols=1, shared_xaxes=True, vertical_spacing=0.05, row_heights=row_heights)
            row = 1
            for metadata, samples in category_dict.items():
                # BUGFIX: was `type(samples) == "str"`, which compares a type
                # object to a string literal and is always False.
                if isinstance(samples, str):
                    samples = [samples]
                # Binarize reads: 1 = taxon detected in the sample, 0 = absent.
                z_values = []
                for sample in samples:
                    reads = TaXon_table_df[sample].values.tolist()
                    z_values = z_values + [[1 if x > 0 else 0 for x in reads]]
                y_values = samples
                fig.add_trace(go.Heatmap(z=z_values, x=x_values, y=y_values, showscale=False, xgap=1, ygap=1, hoverongaps=False, colorscale=[[0, "White"], [1, color_discrete_sequence[row - 1]]]), row=row, col=1)
                row += 1
            if add_categories_sum == True:
                # Extra bottom subplot: one grey presence row per whole category.
                z_values, y_values = [], []
                for metadata, samples in category_dict.items():
                    reads = [sum(reads) for reads in TaXon_table_df[samples].values.tolist()]
                    z_values = z_values + [[1 if x > 0 else 0 for x in reads]]
                    y_values.append(metadata)
                fig.add_trace(go.Heatmap(z=z_values[::-1], x=x_values, y=y_values[::-1], showscale=False, xgap=1, ygap=1, hoverongaps=False, colorscale=[[0, "White"], [1, "Grey"]]), row=row, col=1)
                row += 1
            fig.update_layout(width=int(width), height=int(height), template="seaborn", font_size=font_size, yaxis_nticks=5, title_font_size=font_size)
            fig.update_xaxes(tickmode='linear')
            fig.update_yaxes(tickmode='linear')
            fig.update_xaxes(tickangle=-90)
            occupancy_plot_directory = Path(str(path_to_outdirs) + "/" + "Site_occupancy_plots" + "/" + TaXon_table_xlsx.stem)
            if not os.path.exists(occupancy_plot_directory):
                os.mkdir(occupancy_plot_directory)
            ## define output files
            output_pdf = Path(str(occupancy_plot_directory) + "/" + taxonomic_level + "_" + meta_data_to_test + "_heatmap.pdf")
            output_html = Path(str(occupancy_plot_directory) + "/" + taxonomic_level + "_" + meta_data_to_test + "_heatmap.html")
            ## write output files
            fig.write_image(str(output_pdf))
            fig.write_html(str(output_html))
            ## ask to show file
            answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
            if answer == "Yes":
                webbrowser.open('file://' + str(output_html))
            ## print closing text
            closing_text = "Site occupancy heatmaps are found under:\n" + '/'.join(str(output_pdf).split("/")[-4:])
            sg.Popup(closing_text, title="Finished", keep_on_top=True)
            ## write to log
            from taxontabletools.create_log import ttt_log
            placeholder = TaXon_table_xlsx.name + " (multiple site occupancy plots)"
            ttt_log("site occupancy", "analysis", TaXon_table_xlsx.name, "", meta_data_to_test, path_to_outdirs)
        else:
            sg.Popup("The metdata table and taXon table are not matching!")
def replicate_consistency_filter(TaXon_table_xlsx, suffix_list, path_to_outdirs, consistency):
    """Merge PCR/sequencing replicates of each sample, optionally consistency-filtered.

    Replicate columns are recognized by stripping the last ``_``-suffix of each
    sample name and re-appending the suffixes from ``suffix_list``. With
    ``consistency=True`` an OTU's reads are zeroed for a sample unless it was
    detected in all replicates; the merged table is written as ``*_cons.xlsx``.
    Otherwise replicates are simply summed and written as ``*_merged.xlsx``.
    """
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    from pathlib import Path
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx)
    # Columns 0-9 are taxonomy/metadata; sample read counts start at column 10.
    sample_names = TaXon_table_df.columns[10:].tolist()
    OTUs = TaXon_table_df["ID"].values.tolist()
    derep_sample_names_dict = {}
    unique_sample_names_list = []
    replicates_dict = {}
    # Derive base sample names by dropping the trailing replicate suffix.
    for sample in sample_names:
        sample_name = sample.split("_")[0:-1]
        unique_sample_names_list.append("_".join(sample_name))
    unique_sample_names_set = sorted(set(unique_sample_names_list))
    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(unique_sample_names_set) + 1
    ############################################################################
    ## merge and replicate consistency version
    if consistency == True:
        no_replicates_list = []
        for sample in unique_sample_names_set:
            # Build the expected replicate column names for this sample.
            for i, suffix in enumerate(suffix_list):
                replicates_dict["rep_" + str(i)] = sample + "_" + str(suffix_list[i])
            replicate_names_list = list(replicates_dict.values())
            # Missing replicate columns raise a KeyError; the bare except
            # records the sample as having no replicates instead of crashing.
            try:
                new_df = TaXon_table_df[replicate_names_list]
                header = new_df.columns.tolist()
                processed_reads = []
                for n_reads in new_df.values.tolist():
                    # Consistency rule: if any replicate is 0 while another is
                    # not, the OTU is considered inconsistent -> zero all reads.
                    if 0 in n_reads:
                        if len(set(n_reads)) > 1:
                            n_reads = len(n_reads) * [0]
                    processed_reads.append(n_reads)
                df_out = pd.DataFrame(processed_reads)
                df_out.columns = header
                # Replace the replicate columns by their (filtered) sum.
                TaXon_table_df = TaXon_table_df.drop(replicate_names_list, axis=1)
                TaXon_table_df[sample] = df_out.sum(axis=1)
            except:
                no_replicates_list.append(sample)
            ############################################################################
            event, values = window_progress_bar.read(timeout=10)
            if event == 'Cancel' or event is None:
                print('Cancel')
                window_progress_bar.Close()
                raise RuntimeError
            # update bar with loop value +1 so that bar eventually reaches the maximum
            progress_update += progress_increase
            progress_bar.UpdateBar(progress_update)
            ############################################################################
        window_progress_bar.Close()
        if len(no_replicates_list) == len(unique_sample_names_set):
            # Nothing matched the suffix scheme at all.
            sg.PopupError("No replicates found. Please check your replicate suffixes.")
        else:
            dropped_OTUs_list = []
            # filter for 0 hit OTUs (can happen after consistency filtering)
            columns = TaXon_table_df.columns.tolist()
            TaXon_table_list = TaXon_table_df.values.tolist()
            TaXon_table_list_final = []
            for entry in TaXon_table_list:
                if sum(entry[10:]) != 0:
                    TaXon_table_list_final.append(entry)
                else:
                    print("Dropped:", entry[0], "(0 reads)")
                    dropped_OTUs_list.append(entry[0])
            taxon_tables_directory = Path(str(path_to_outdirs) + "/" + "TaXon_tables" + "/" + TaXon_table_xlsx.stem)
            output_xlsx = Path(str(taxon_tables_directory) + "_cons.xlsx")
            TaXon_table_df = pd.DataFrame(TaXon_table_list_final, columns=columns)
            TaXon_table_df.to_excel(output_xlsx, sheet_name='TaXon table', index=False)
            closing_text = "Taxon table is found under:\n" + '/'.join(str(output_xlsx).split("/")[-4:]) + "\n\n" + str(len(dropped_OTUs_list)) + " OTUs were removed."
            sg.Popup(closing_text, title="Finished", keep_on_top=True)
            from taxontabletools.create_log import ttt_log
            ttt_log("replicate consistency", "processing", TaXon_table_xlsx.name, output_xlsx.name, "consistency merged", path_to_outdirs)
    ## merge only version
    else:
        no_replicates_list = []
        for sample in unique_sample_names_set:
            for i, suffix in enumerate(suffix_list):
                replicates_dict["rep_" + str(i)] = sample + "_" + str(suffix_list[i])
            replicate_names_list = list(replicates_dict.values())
            # Same bare-except pattern: missing columns mark the sample as
            # replicate-less rather than aborting the whole run.
            try:
                new_df = TaXon_table_df[replicate_names_list]
                TaXon_table_df = TaXon_table_df.drop(replicate_names_list, axis=1)
                TaXon_table_df[sample] = new_df.sum(axis=1)
            except:
                no_replicates_list.append(sample)
            ############################################################################
            event, values = window_progress_bar.read(timeout=10)
            if event == 'Cancel' or event is None:
                print('Cancel')
                window_progress_bar.Close()
                raise RuntimeError
            # update bar with loop value +1 so that bar eventually reaches the maximum
            progress_update += progress_increase
            progress_bar.UpdateBar(progress_update)
            ############################################################################
        window_progress_bar.Close()
        if len(no_replicates_list) == len(unique_sample_names_set):
            sg.PopupError("No replicates found. Please check your replicate suffixes.")
        else:
            taxon_tables_directory = Path(str(path_to_outdirs) + "/" + "TaXon_tables" + "/" + TaXon_table_xlsx.stem)
            output_xlsx = Path(str(taxon_tables_directory) + "_merged.xlsx")
            TaXon_table_df.to_excel(output_xlsx, sheet_name='TaXon table', index=False)
            closing_text = "Taxon table is found under:\n" + '/'.join(str(output_xlsx).split("/")[-4:])
            sg.Popup(closing_text, title="Finished", keep_on_top=True)
            from taxontabletools.create_log import ttt_log
            ttt_log("replicate merging", "processing", TaXon_table_xlsx.name, output_xlsx.name, "merged", path_to_outdirs)
def taxon_table_converter_qiime2(read_table_tsv, taxonomy_results_xlsx, TaXon_table_name, sheet_name, path_to_outdirs):
    """Merge a QIIME2 read table (tsv) and a taxonomy table (xlsx) into a TaXon table.

    Verifies that both inputs list the same OTU IDs in the same order, appends
    the sequences and per-sample read counts to the taxonomy frame, normalizes
    the Species column to "Genus Epithet", and writes the result to
    ``<path_to_outdirs>/TaXon_tables/<TaXon_table_name>.xlsx``.
    """
    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    import numpy as np
    from pathlib import Path
    taxonomy_results_xlsx = Path(taxonomy_results_xlsx)
    read_table_tsv = Path(read_table_tsv)
    # create filename and path for output file
    Output_name = TaXon_table_name + ".xlsx"
    Output_file = path_to_outdirs / "TaXon_tables" / Output_name
    # store the file name for later use
    file_name = taxonomy_results_xlsx.name
    # create dataframes for both files
    taxonomy_df = pd.read_excel(taxonomy_results_xlsx, sheet_name, header=0)
    # BOLDigger output carries an extra 'Flags' column not part of the TaXon format.
    if sheet_name == "BOLDigger hit":
        taxonomy_df = taxonomy_df.drop(columns=['Flags'])
    read_table_df = pd.read_csv(Path(read_table_tsv), sep="\t")
    # drop the first row (QIIME2 type-annotation line below the header)
    read_table_df = read_table_df.iloc[1:]
    read_table_df = read_table_df.reset_index(drop=True)
    ## create a new dataframe
    TaXon_table_df = taxonomy_df
    # check if all OTUs are correctly sorted and present in both files
    if taxonomy_df["ID"].to_list() == read_table_df["id"].to_list():
        ## append the sequences to the TaXon table
        TaXon_table_df["seq"] = read_table_df["Sequence"].values.tolist()
        ## remove the sequence column from the read table
        read_table_df.drop('Sequence', axis='columns', inplace=True)
        ## remove the ID column from the read table
        read_table_df.drop('id', axis='columns', inplace=True)
        ## add samples to the dataframe
        TaXon_table_df = pd.concat([TaXon_table_df, read_table_df], axis=1)
        ## check if species are present as "Genus" + "Epithet"
        new_species_column = []
        for OTU in TaXon_table_df[["Genus", "Species"]].fillna("nan").values.tolist():
            if (OTU != ["nan", "nan"] and OTU[1] != 'nan'):
                # Prepend the genus only when the epithet does not already contain it.
                if OTU[0] not in OTU[1]:
                    new_species_column.append(OTU[0] + " " + OTU[1])
                else:
                    new_species_column.append(OTU[1])
            else:
                new_species_column.append("")
        ## add new species column to the dataframe
        TaXon_table_df["Species"] = new_species_column
        ## save the newly created Taxon table in TaXon format as excel file
        TaXon_table_df.to_excel(Output_file, sheet_name='TaXon table', index=False)
        closing_text = "Taxon table is found under:\n" + '/'.join(str(Output_file).split("/")[-4:])
        sg.Popup(closing_text, title="Finished", keep_on_top=True)
        from taxontabletools.create_log import ttt_log
        # FIX: renamed from `input`, which shadowed the builtin of the same name.
        log_input = taxonomy_results_xlsx.name + " + " + read_table_tsv.name
        ttt_log("taXon table converter", "processing", log_input, Output_file.name, "qiime2", path_to_outdirs)
    else:
        sg.PopupError("Error: The IDs of the read table and taxonomy table do not match!")
def site_occupancy_barchart(TaXon_table_xlsx, meta_data_to_test, taxonomic_level, path_to_outdirs, x_site_occ, y_site_occ, template, theme, font_size):
    """Plot per-site taxon occupancy bar charts (one figure per metadata site).

    For every distinct value ("site") of ``meta_data_to_test``, computes for
    each taxon the percentage of that site's samples in which it was detected,
    then writes a bar chart (pdf/html) and an occupancy table (xlsx) into
    ``Site_occupancy_plots/<table-stem>/``.
    """
    import os, webbrowser
    import pandas as pd
    from pandas import DataFrame
    from pathlib import Path
    import plotly.graph_objects as go
    import PySimpleGUI as sg
    # theme is an ordered triple: bar color, bar outline color, bar opacity.
    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]
    ## sequence-level categories all map to the "ID" column of the table
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0)
    # Columns 0-9 are taxonomy/metadata; sample read counts start at column 10.
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    # Metadata table is expected beside the outputs, named <table>_metadata.xlsx.
    Meta_data_table_xlsx = Path(str(path_to_outdirs) + "/" + "Meta_data_table" + "/" + TaXon_table_xlsx.stem + "_metadata.xlsx")
    Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx, header=0).fillna("nan")
    Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()
    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()
    metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test)
    ## drop samples with metadata called nan (= empty cell in the metadata table)
    drop_samples = [i[0] for i in Meta_data_table_df.values.tolist() if i[metadata_loc] == "nan"]
    if drop_samples != []:
        ## filter the TaXon table
        TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1)
        TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
        ## also remove OTUs that have zero reads in every remaining sample
        row_filter_list = []
        for row in TaXon_table_df.values.tolist():
            reads = set(row[10:])
            if reads != {0}:
                row_filter_list.append(row)
        columns = TaXon_table_df.columns.tolist()
        TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns)
        # Keep the metadata table in sync with the surviving samples.
        Meta_data_table_df = pd.DataFrame(
            [i for i in Meta_data_table_df.values.tolist() if i[0] not in drop_samples],
            columns=Meta_data_table_df.columns.tolist())
        Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()
    TaXon_table_n_samples = len(TaXon_table_samples)
    n_sites = len(set(Meta_data_table_df[meta_data_to_test].tolist()))
    answer = "Ask"
    output_message = "No"
    # Require matching sample sets and metadata that actually groups samples
    # (unique-per-sample metadata would make occupancy meaningless).
    if (sorted(TaXon_table_samples) == sorted(Meta_data_table_samples) and TaXon_table_n_samples != n_sites):
        site_occupancy_dict = {}
        sites = set(Meta_data_table_df[meta_data_to_test].tolist())
        for site in sites:
            # this can either be a species name or the above specified taxonomic level
            present_OTU_list = []
            # extract samples that belong to the site from the metadata file
            # NOTE(review): matches `site` against ANY cell of the row, not just
            # the meta_data_to_test column — verify no other column can collide.
            included_samples_list = Meta_data_table_df[Meta_data_table_df.values == site]['Samples'].values.tolist()
            # count the number of samples per site to calculate the site occupancy
            n_samples = len(included_samples_list)
            # create a list of all species (or the specified taxonomic level)
            if taxonomic_level == "OTUs":
                taxonomic_level = "ID"
            overall_included_species_list = TaXon_table_df[taxonomic_level].values.tolist()
            # make the list unique
            overall_included_species_set = set(overall_included_species_list)
            # remove potential 'nan's from the list
            overall_included_species_set = [x for x in overall_included_species_set if str(x) != 'nan']
            # create a set of species that is present at the sites
            for sample in included_samples_list:
                OTUs_per_species_list = []
                # check the read abundances for each sample
                read_abundace_list = TaXon_table_df[sample].values.tolist()
                # enumerate the read abundances for each sample and collect all lines that have more than one read
                for i, read_abundance in enumerate(read_abundace_list):
                    species = TaXon_table_df[taxonomic_level][i]
                    # if reads are present, collect the species name (or the specified taxonomic level) from the TaXon table
                    if read_abundance != 0:
                        OTUs_per_species_list.append(species)
                # remove all nans
                OTUs_per_species_list = [x for x in OTUs_per_species_list if str(x) != 'nan']
                # make list unique
                OTUs_per_species_list = list(set(OTUs_per_species_list))
                # append to list of species for the current site
                present_OTU_list.append(OTUs_per_species_list)
            # flatten the list of present species per site
            present_OTU_list_flattened = [val for sublist in present_OTU_list for val in sublist]
            # store occupancy of each species in a dict, will be accessed by position in list
            occupancy_dict = {}
            # count the number of occurences for each species and calculate the occupancy based on the number of samples
            for species in overall_included_species_set:
                count = present_OTU_list_flattened.count(species)
                occupancy = count / n_samples * 100
                occupancy_dict[species] = occupancy
            # Sort taxa by ascending occupancy for the bar chart.
            occupancy_dict = {k: v for k, v in sorted(occupancy_dict.items(), key=lambda item: item[1])}
            occupancy_list = list(occupancy_dict.values())
            species_list = list(occupancy_dict.keys())
            # Italicize scientific names at genus/species rank (plotly HTML tags).
            if (taxonomic_level == "Species" or taxonomic_level == "Genus"):
                x_values = ["<i>" + taxon + "</i>" for taxon in species_list]
            else:
                x_values = species_list
            occupancy_plot_directory = Path(str(path_to_outdirs) + "/" + "Site_occupancy_plots" + "/" + TaXon_table_xlsx.stem)
            if not os.path.exists(occupancy_plot_directory):
                os.mkdir(occupancy_plot_directory)
            fig = go.Figure(data=[go.Bar(x=x_values, y=occupancy_list)])
            fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=0.6, opacity=opacity_value)
            fig.update_layout(title_text=site + " (" + taxonomic_level + ")", yaxis_title="occupancy (%)")
            fig.update_layout(height=int(y_site_occ), width=int(x_site_occ), template=template, font_size=font_size, title_font_size=font_size)
            fig.update_yaxes(range=[0, 100])
            fig.update_xaxes(tickmode='linear')
            fig.update_xaxes(tickangle=-90)
            output_pdf = Path(str(occupancy_plot_directory) + "/" + site + "_" + taxonomic_level + ".pdf")
            output_html = Path(str(occupancy_plot_directory) + "/" + site + "_" + taxonomic_level + ".html")
            occupancy_table = Path(str(occupancy_plot_directory) + "/" + site + "_" + taxonomic_level + ".xlsx")
            fig.write_image(str(output_pdf))
            fig.write_html(str(output_html))
            occupancy_df = pd.DataFrame(occupancy_list, species_list)
            occupancy_df.columns = ["Occupancy"]
            occupancy_df.index.name = "Taxon"
            occupancy_df = occupancy_df.sort_values("Occupancy")
            # sort the table numerically if OTUs were chosen: assumes IDs look
            # like "<prefix>_<number>" — TODO confirm against real tables.
            if taxonomic_level == "ID":
                sort_list = []
                for OTU in occupancy_df.index.tolist():
                    sort_list.append(int(OTU.split("_")[1]))
                occupancy_df["sort"] = sort_list
                occupancy_df = occupancy_df.sort_values("sort")
                occupancy_df = occupancy_df.drop("sort", axis=1)
            occupancy_df.to_excel(occupancy_table)
        ## ask to show file (opens the plot of the last processed site)
        answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
        if answer == "Yes":
            webbrowser.open('file://' + str(output_html))
        ## print closing text
        closing_text = "Site occupancy plots are found under:\n" + '/'.join(str(output_pdf).split("/")[-4:])
        sg.Popup(closing_text, title="Finished", keep_on_top=True)
        ## write to log
        from taxontabletools.create_log import ttt_log
        placeholder = TaXon_table_xlsx.name + " (multiple site occupancy plots)"
        ttt_log("site occupancy", "analysis", TaXon_table_xlsx.name, placeholder, meta_data_to_test, path_to_outdirs)
    else:
        sg.PopupError("Please check your Metadata file and Taxon table file: The samples do not match or the metadata is unique for all samples!", keep_on_top=True)
def read_proportions_pie(TaXon_table_xlsx, taxonomic_level, path_to_outdirs, width_value, height_value, template, font_size, color_discrete_sequence):
    """Plot read-proportion pie charts: one per sample plus one overall chart.

    Aggregates reads at ``taxonomic_level`` (optionally back-filling missing
    taxonomy with the best available higher-rank hit, chosen via popup), then
    writes per-sample pies into ``Read_proportions_plots/<stem>/samples`` and
    the overall pie into ``Read_proportions_plots/<stem>``. Rejects
    presence/absence input, since proportions would be meaningless.
    """
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    from pathlib import Path
    import os, webbrowser
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx).fillna("unidentified")
    # Columns 0-9 are taxonomy/metadata; sample read counts start at column 10.
    samples_list = TaXon_table_df.columns.tolist()[10:]
    Species_read_proportion_dict = {}
    # check for presence absence data; otherwise abort and print error message
    pa_test = set([val for sublist in TaXon_table_df[samples_list].values.tolist() for val in sublist])
    if pa_test == {1, 0}:
        sg.Popup("Please do not use presence absence data!", title=("Error"))
        raise RuntimeError
    ## check for the taxonomic level to analyse
    if taxonomic_level not in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        ## create a y axis title text
        taxon_title = taxonomic_level
        answer = sg.PopupYesNo("Shall missing taxonomy be replaced by the best hit?\n\nYes => Replace missing taxonomy with the best available hit.\nNo => Display missing taxonomy as \'unidentified\'.", title="Plotting strategy")
        if answer == "Yes":
            ## replace 'unidentified' with the best hit from a higher rank
            taxon_levels_dict = {"Phylum": 1, "Class": 2, "Order": 3, "Family": 4, "Genus": 5, "Species": 6}
            value_taxonomic_level = taxon_levels_dict[taxonomic_level]
            best_hit_list = []
            for taxon in TaXon_table_df[list(taxon_levels_dict.keys())].values.tolist():
                ## walk ranks upwards (e.g. from species index down to phylum)
                ## until an identified name is found
                for test in range(value_taxonomic_level - 1, -1, -1):
                    if taxon[test] != "unidentified":
                        best_hit_list.append(taxon[test])
                        break
            TaXon_table_df[taxonomic_level] = best_hit_list
    else:
        ## sequence-level categories all map to the "ID" column of the table
        taxon_title = taxonomic_level
        taxonomic_level = "ID"
    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(samples_list) + 1
    ############################################################################
    # FIX: sentinel is now None; the original used `TaXon_table_df_2 is ""`,
    # an identity comparison against a str literal (SyntaxWarning on
    # CPython >= 3.8 and not guaranteed to hold across implementations).
    TaXon_table_df_2 = None
    for sample in samples_list:
        # Per-sample relative read proportion, summed over rows sharing a taxon.
        df = TaXon_table_df[['ID', "Phylum", "Class", "Order", "Family", "Genus", "Species", sample]]
        df_2 = df[[sample]] / df[[sample]].sum()
        df = df.assign(perc=df_2.values)
        df["perc"] = df.groupby([taxonomic_level])['perc'].transform('sum')
        df_3 = df.drop_duplicates(subset=[taxonomic_level, 'perc'])
        df_3 = df_3.drop([sample], axis=1)
        df_3 = df_3.rename(columns={"perc": sample})
        if TaXon_table_df_2 is None:
            TaXon_table_df_2 = df_3
        else:
            TaXon_table_df_2 = TaXon_table_df_2.join(df_3[[sample]])
        ############################################################################
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################
    window_progress_bar.Close()
    ## create dataframe for plot: proportions indexed by taxon
    plot_df = TaXon_table_df_2[samples_list]
    plot_df.index = TaXon_table_df_2[taxonomic_level]
    ##############################################################################
    ## create a subfolder for better sorting and overview
    dirName = Path(str(path_to_outdirs) + "/" + "Read_proportions_plots" + "/" + TaXon_table_xlsx.stem + "/")
    dirName_samples = Path(str(path_to_outdirs) + "/" + "Read_proportions_plots" + "/" + TaXon_table_xlsx.stem + "/samples")
    if not os.path.exists(dirName):
        os.mkdir(dirName)
    if not os.path.exists(dirName_samples):
        os.mkdir(dirName_samples)
    ## read abundance pie chart per sample (only taxa with non-zero proportion)
    for sample in samples_list:
        sample_df = plot_df.loc[plot_df[sample] > 0.0, [sample]]
        labels = sample_df.index.tolist()
        values = sample_df[sample].values.tolist()
        fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
        fig.update_layout(title=sample, annotations=[dict(text=taxonomic_level, x=0.5, y=0.5, showarrow=False)])
        fig.update_traces(textposition='inside')
        fig.update_layout(width=int(width_value), height=int(height_value), template=template, font_size=font_size, title_font_size=font_size)
        output_pdf = Path(str(dirName_samples) + "/" + sample + "_" + taxon_title + "_pie.pdf")
        output_html = Path(str(dirName_samples) + "/" + sample + "_" + taxon_title + "_pie.html")
        fig.write_image(str(output_pdf))
        fig.write_html(str(output_html))
    ## main read abundance pie chart across all samples combined
    main_df = pd.DataFrame(TaXon_table_df[taxonomic_level].values.tolist(), list(TaXon_table_df[samples_list].sum(axis=1)), columns=["Taxon"])
    main_df["Reads"] = main_df.index
    df_2 = main_df["Reads"] / main_df["Reads"].sum()
    main_df = main_df.assign(perc=df_2.values * 100)
    fig = go.Figure(data=[go.Pie(labels=main_df["Taxon"], values=main_df["perc"], marker_colors=color_discrete_sequence, hole=.3)])
    fig.update_traces(textposition='inside')
    fig.update_layout(annotations=[dict(text=taxon_title, x=0.5, y=0.5, showarrow=False)])
    fig.update_layout(width=int(width_value), height=int(height_value), template=template, font_size=font_size, title_font_size=font_size)
    ## write files
    output_pdf = Path(str(dirName) + "/" + taxonomic_level + "_pie.pdf")
    output_html = Path(str(dirName) + "/" + taxonomic_level + "_pie.html")
    output_xlsx = Path(str(dirName) + "/" + taxonomic_level + "_pie.xlsx")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))
    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))
    ## print closing text
    closing_text = "Read proportion plot is found under:\n" + '/'.join(str(output_pdf).split("/")[-4:])
    sg.Popup(closing_text, title="Finished", keep_on_top=True)
    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("read proportions pie chart", "analysis", TaXon_table_xlsx.name, output_pdf.name, "", path_to_outdirs)
def read_proportions_bar(TaXon_table_xlsx, taxonomic_level, path_to_outdirs, width_value, height_value, template, font_size, color_discrete_sequence):
    """Plot per-sample read proportions as a stacked bar chart.

    Reads a TaXon table (Excel), optionally replaces missing taxonomy with the
    best available hit, converts reads to per-sample percentages aggregated at
    *taxonomic_level*, and writes the chart as PDF and HTML to
    <path_to_outdirs>/Read_proportions_plots/<table stem>/.

    Raises RuntimeError when presence/absence data is supplied or the user
    cancels via the progress-bar window.
    """
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.express as px
    from pathlib import Path
    import os, webbrowser

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx).fillna("unidentified")
    # first 10 columns are taxonomy/metadata, the rest are samples
    samples_list = TaXon_table_df.columns.tolist()[10:]

    ## check for presence absence data, otherwise abort and print error message
    pa_test = set([val for sublist in TaXon_table_df[samples_list].values.tolist() for val in sublist])
    if pa_test == {1, 0}:
        sg.Popup("Please do not use presence absence data!", title=("Error"))
        raise RuntimeError

    ## check for the taxonomic level to analyse
    if taxonomic_level not in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        ## create a y axis title text
        taxon_title = taxonomic_level.lower()
        # ask how to handle missing taxonomy
        answer = sg.PopupYesNo("Shall missing taxonomy be replaced by the best hit?\n\nYes => Replace missing taxonomy with the best available hit.\nNo => Display missing taxonomy as \'unidentified\'.", title="Plotting strategy")
        if answer == "Yes":
            ## replace 'unidentified' with the best available hit
            taxon_levels_dict = {"Phylum": 1, "Class": 2, "Order": 3, "Family": 4, "Genus": 5, "Species": 6}
            value_taxonomic_level = taxon_levels_dict[taxonomic_level]
            best_hit_list = []
            for taxon in TaXon_table_df[list(taxon_levels_dict.keys())].values.tolist():
                ## walk upwards from the requested level (e.g. 5..0 for species)
                for test in range(value_taxonomic_level - 1, -1, -1):
                    if taxon[test] != "unidentified":
                        best_hit_list.append(taxon[test])
                        break
                else:
                    ## BUGFIX: rows with no identified level at all previously
                    ## produced a list shorter than the dataframe and broke the
                    ## column assignment below
                    best_hit_list.append("unidentified")
            TaXon_table_df[taxonomic_level] = best_hit_list
    else:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    ## create a subfolder for better sorting and overview
    dirName = Path(str(path_to_outdirs) + "/" + "Read_proportions_plots" + "/" + TaXon_table_xlsx.stem + "/")
    if not os.path.exists(dirName):
        os.mkdir(dirName)
    output_pdf = Path(str(dirName) + "/" + taxon_title + "_bar.pdf")
    output_html = Path(str(dirName) + "/" + taxon_title + "_bar.html")
    output_xlsx = Path(str(dirName) + "/" + taxon_title + "_bar.xlsx")

    ## create the progress bar window
    layout = [[sg.Text('Progress bar')], [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')], [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(samples_list) + 1

    ## per-sample read percentages aggregated at the chosen taxonomic level
    TaXon_table_df_2 = None
    for sample in samples_list:
        df = TaXon_table_df[['ID', "Phylum", "Class", "Order", "Family", "Genus", "Species", sample]]
        df_2 = df[[sample]] / df[[sample]].sum()
        df = df.assign(perc=df_2.values * 100)
        df["perc"] = df.groupby([taxonomic_level])['perc'].transform('sum')
        df_3 = df.drop_duplicates(subset=[taxonomic_level, 'perc'])
        df_3 = df_3.drop([sample], axis=1)
        df_3 = df_3.rename(columns={"perc": sample})
        ## BUGFIX: use a None sentinel instead of the fragile `is ""` identity test
        if TaXon_table_df_2 is None:
            TaXon_table_df_2 = df_3
        else:
            TaXon_table_df_2 = TaXon_table_df_2.join(df_3[[sample]])
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
    window_progress_bar.Close()

    ## create a long-format dataframe for plotly express
    sample_frames = []
    for sample in samples_list:
        sample_frames.append(pd.DataFrame([[sample] + entry for entry in TaXon_table_df_2[[taxonomic_level, sample]].values.tolist()], columns=["Sample", "Taxon", "Reads"]))
    ## BUGFIX: DataFrame.append was removed in pandas 2.0 — concatenate instead
    plot_df = pd.concat(sample_frames)

    n_taxa = len(TaXon_table_df_2[taxonomic_level].values.tolist())
    plot_df["Color"] = list(np.linspace(0, 100, n_taxa)) * len(samples_list)

    fig = px.bar(plot_df, x="Sample", y="Reads", color="Taxon", color_discrete_sequence=color_discrete_sequence, labels={"Color": "Taxon"})
    fig.update_layout(barmode='stack', width=int(width_value), height=int(height_value), template=template, font_size=font_size, title_font_size=font_size)
    fig.update_yaxes(title_text="reads (%)")
    fig.update_xaxes(title_text="")

    ## write files
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## print closing text
    closing_text = "Read proportion plot is found under:\n" + '/'.join(str(output_pdf).split("/")[-4:])
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("read proportions bar plot", "analysis", TaXon_table_xlsx.name, output_pdf.name, "", path_to_outdirs)
def read_proportions_heatmap(TaXon_table_xlsx, taxonomic_level, path_to_outdirs, width_value, height_value, template, font_size):
    """Plot per-sample read proportions as a heatmap (samples x taxa).

    Reads a TaXon table (Excel), optionally replaces missing taxonomy with the
    best available hit, aggregates read percentages at *taxonomic_level*, and
    writes the heatmap as PDF and HTML to
    <path_to_outdirs>/Read_proportions_plots/<table stem>/.

    Raises RuntimeError when presence/absence data is supplied or the user
    cancels via the progress-bar window.
    """
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    from pathlib import Path
    import os, webbrowser

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx).fillna("unidentified")
    # first 10 columns are taxonomy/metadata, the rest are samples
    samples_list = TaXon_table_df.columns.tolist()[10:]

    ## check for presence absence data, otherwise abort and print error message
    pa_test = set([val for sublist in TaXon_table_df[samples_list].values.tolist() for val in sublist])
    if pa_test == {1, 0}:
        sg.Popup("Please do not use presence absence data!", title=("Error"))
        raise RuntimeError

    ## check for the taxonomic level to analyse
    if taxonomic_level not in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        ## create a y axis title text
        taxon_title = taxonomic_level.lower()
        # ask how to handle missing taxonomy
        answer = sg.PopupYesNo("Shall missing taxonomy be replaced by the best hit?\n\nYes => Replace missing taxonomy with the best available hit.\nNo => Display missing taxonomy as \'unidentified\'.", title="Plotting strategy")
        if answer == "Yes":
            ## replace 'unidentified' with the best available hit
            taxon_levels_dict = {"Phylum": 1, "Class": 2, "Order": 3, "Family": 4, "Genus": 5, "Species": 6}
            value_taxonomic_level = taxon_levels_dict[taxonomic_level]
            best_hit_list = []
            for taxon in TaXon_table_df[list(taxon_levels_dict.keys())].values.tolist():
                ## walk upwards from the requested level (e.g. 5..0 for species)
                for test in range(value_taxonomic_level - 1, -1, -1):
                    if taxon[test] != "unidentified":
                        best_hit_list.append(taxon[test])
                        break
                else:
                    ## BUGFIX: rows with no identified level at all previously
                    ## produced a list shorter than the dataframe and broke the
                    ## column assignment below
                    best_hit_list.append("unidentified")
            TaXon_table_df[taxonomic_level] = best_hit_list
    else:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    ## create a subfolder for better sorting and overview
    dirName = Path(str(path_to_outdirs) + "/" + "Read_proportions_plots" + "/" + TaXon_table_xlsx.stem + "/")
    if not os.path.exists(dirName):
        os.mkdir(dirName)
    output_pdf = Path(str(dirName) + "/" + taxonomic_level + "_heatmap.pdf")
    output_html = Path(str(dirName) + "/" + taxonomic_level + "_heatmap.html")
    output_xlsx = Path(str(dirName) + "/" + taxonomic_level + "_heatmap.xlsx")

    ## create the progress bar window
    layout = [[sg.Text('Progress bar')], [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')], [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(samples_list) + 1

    ## per-sample read percentages aggregated at the chosen taxonomic level
    TaXon_table_df_2 = None
    for sample in samples_list:
        df = TaXon_table_df[['ID', "Phylum", "Class", "Order", "Family", "Genus", "Species", sample]]
        df_2 = df[[sample]] / df[[sample]].sum()
        df = df.assign(perc=df_2.values * 100)
        df["perc"] = df.groupby([taxonomic_level])['perc'].transform('sum')
        df_3 = df.drop_duplicates(subset=[taxonomic_level, 'perc'])
        df_3 = df_3.drop([sample], axis=1)
        df_3 = df_3.rename(columns={"perc": sample})
        ## BUGFIX: use a None sentinel instead of the fragile `is ""` identity test
        if TaXon_table_df_2 is None:
            TaXon_table_df_2 = df_3
        else:
            TaXon_table_df_2 = TaXon_table_df_2.join(df_3[[sample]])
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
    window_progress_bar.Close()

    ## build the plot matrix: samples as columns, taxa as rows
    plot_df = TaXon_table_df_2[samples_list]
    plot_df.index = TaXon_table_df_2[taxonomic_level]

    ## custom stepped colorscale: grey for exact zero, then white->blue in 5% bins
    cs = [
        [0, "rgb(220,220,220)"], [0.00001, "rgb(255,255,255)"],
        [0.05, "rgb(255,255,255)"], [0.05, "rgb(242,242,255)"],
        [0.1, "rgb(242,242,255)"], [0.1, "rgb(229,229,255)"],
        [0.15, "rgb(229,229,255)"], [0.15, "rgb(216,216,255)"],
        [0.2, "rgb(216,216,255)"], [0.2, "rgb(203,203,255)"],
        [0.25, "rgb(203,203,255)"], [0.25, "rgb(190,190,255)"],
        [0.3, "rgb(190,190,255)"], [0.3, "rgb(177,177,255)"],
        [0.35, "rgb(177,177,255)"], [0.35, "rgb(164,164,255)"],
        [0.4, "rgb(164,164,255)"], [0.4, "rgb(155,155,255)"],
        [0.45, "rgb(155,155,255)"], [0.45, "rgb(138,138,255)"],
        [0.5, "rgb(138,138,255)"], [0.5, "rgb(125,125,255)"],
        [0.55, "rgb(125,125,255)"], [0.55, "rgb(112,112,255)"],
        [0.6, "rgb(112,112,255)"], [0.6, "rgb(99,99,255)"],
        [0.65, "rgb(99,99,255)"], [0.65, "rgb(86,86,255)"],
        [0.7, "rgb(86,86,255)"], [0.7, "rgb(73,73,255)"],
        [0.75, "rgb(73,73,255)"], [0.75, "rgb(60,60,255)"],
        [0.8, "rgb(60,60,255)"], [0.8, "rgb(47,47,255)"],
        [0.85, "rgb(47,47,255)"], [0.85, "rgb(34,34,255)"],
        [0.9, "rgb(34,34,255)"], [0.9, "rgb(21,21,255)"],
        [0.95, "rgb(21,21,255)"], [0.95, "rgb(8,8,255)"],
        [1, "rgb(8,8,255)"],
    ]

    ## italicise genus/species names in the y axis labels
    if (taxonomic_level == "Species" or taxonomic_level == "Genus"):
        y_values = ["<i>" + taxon + "</i>" for taxon in plot_df.index.tolist()[::-1]]
    else:
        y_values = plot_df.index.tolist()[::-1]

    ## v2 heatmap
    fig = go.Figure(data=go.Heatmap(z=plot_df.values.tolist()[::-1], x=plot_df.columns.tolist(), y=y_values, colorscale=cs))
    # NOTE(review): xaxis_nticks also uses the row count — verify that this is intended
    fig.update_layout(width=int(width_value), height=int(height_value), template=template, font_size=font_size, title_font_size=font_size, yaxis_nticks=len(plot_df.index.tolist()), xaxis_nticks=len(plot_df.index.tolist()), legend_title_text='reads (%)')

    ## write files
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## print closing text
    ## BUGFIX: message previously pointed to "Read_proportion_plots", which is
    ## not the directory written above
    closing_text = "Read proportion plots are found in: " + str(path_to_outdirs) + "/Read_proportions_plots/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("read proportions heatmap", "analysis", TaXon_table_xlsx.name, output_pdf.name, "", path_to_outdirs)
def gbif_check_taxonomy(TaXon_table_xlsx, path_to_outdirs):
    """Validate and repair the taxonomy of a TaXon table against GBIF.

    For every OTU the taxonomy (Phylum..Species) is checked via the external
    helper `gbif_parent_check`. Differing GBIF results are collected and
    applied, the corrected table is written to
    <path_to_outdirs>/TaXon_tables/<stem>_gbif.xlsx and a change log to
    <path_to_outdirs>/GBIF/<stem>_gbif_log.xlsx.

    Raises RuntimeError when the user cancels via the progress-bar window.
    """
    import requests_html, json
    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    import numpy as np
    from pathlib import Path

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx)
    taxon_levels = ["Phylum", "Class", "Order", "Family", "Genus", "Species"]
    OTUs_list = TaXon_table_df["ID"].values.tolist()
    # maps the original comma-joined taxonomy string -> GBIF-corrected list
    taxonomy_check_dict = {}

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')], [
        sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')
    ], [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(OTUs_list)
    ############################################################################

    for OTU in TaXon_table_df[[
            "Phylum", "Class", "Order", "Family", "Genus", "Species"
    ]].fillna("").values.tolist():
        for i, taxonomy in enumerate(OTU):
            if taxonomy == "":
                ## first empty level: query GBIF with the last identified taxon
                ## and the level names identified so far
                phylum_name = OTU[0]
                # NOTE(review): for i == 0 this picks OTU[-1] (Species) — confirm intent
                taxon_name = OTU[i - 1]
                taxonomy_check = taxon_levels[0:i]
                result = gbif_parent_check(phylum_name, taxon_name, taxonomy_check)
                query = OTU[0:i]
                if (query != result and result != "ERROR"):
                    ## pad both lists to six levels before storing
                    if len(query) != 6:
                        add = 6 - len(query)
                        query = query + [''] * add
                    if len(result) != 6:
                        add = 6 - len(result)
                        result = result + [''] * add
                    query = ",".join(query)
                    taxonomy_check_dict[query] = result
                break
            elif i == 5:
                ## fully identified OTU: verify the complete taxonomy string
                phylum_name = OTU[0]
                taxon_name = OTU[5]
                taxonomy_check = taxon_levels
                result = gbif_parent_check(phylum_name, taxon_name, taxonomy_check)
                if (OTU != result and result != "ERROR"):
                    query = ",".join(OTU)
                    taxonomy_check_dict[query] = result
        ############################################################################
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    ## apply the collected corrections to the full table
    TaXon_table_list = []
    for OTU in TaXon_table_df.fillna("").values.tolist():
        # columns 1..6 hold Phylum..Species
        taxonomy = OTU[1:7]
        search_key = ','.join(taxonomy)
        if (search_key in taxonomy_check_dict.keys()
                and taxonomy_check_dict[search_key] != [''] * 6):
            replacement_taxonomy = taxonomy_check_dict[search_key]
            replacement_OTU = [OTU[0]] + replacement_taxonomy + OTU[7:]
            TaXon_table_list.append(replacement_OTU)
        else:
            TaXon_table_list.append(OTU)

    file_name = TaXon_table_xlsx.stem
    output_name = Path(
        str(path_to_outdirs) + "/TaXon_tables/" + file_name + "_gbif" + ".xlsx")
    df_new = pd.DataFrame(TaXon_table_list,
                          columns=(TaXon_table_df.columns.values.tolist()))
    df_new.to_excel(output_name, sheet_name='TaXon table', index=False)

    ## build the change log: one "Input:" row and one "Gbif:" row per change
    change_log_list = []
    for key, value in taxonomy_check_dict.items():
        change_log_list.append(["Input:"] + key.split(","))
        change_log_list.append(["Gbif:"] + value)
    change_log_df = pd.DataFrame(change_log_list,
                                 columns=(["Change"] + taxon_levels))
    change_log_name = Path(
        str(path_to_outdirs) + "/GBIF/" + file_name + "_gbif_log" + ".xlsx")
    # NOTE(review): duplicate construction of change_log_df — harmless but redundant
    change_log_df = pd.DataFrame(change_log_list,
                                 columns=(["Change"] + taxon_levels))
    change_log_df.to_excel(change_log_name, sheet_name='TaXon table', index=False)

    closing_text = "Taxon table is found under:\n" + '/'.join(
        str(output_name).split("/")
        [-4:]) + "\n\n" + "Log file is found under:\n" + '/'.join(
            str(change_log_name).split("/")[-4:])
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    from taxontabletools.create_log import ttt_log
    ttt_log("gbif check", "processing", TaXon_table_xlsx.name, output_name.name,
            "nan", path_to_outdirs)
def create_metadata_table(TaXon_table_xlsx, path_to_outdirs):
    """Create a metadata table skeleton from the sample names of a TaXon table.

    Each sample name is split at '_' and the parts become metadata columns.
    The result is written to
    <path_to_outdirs>/Meta_data_table/<stem>_metadata.xlsx.
    An existing table is only overwritten after explicit user confirmation.

    Raises RuntimeError when the user cancels via the progress-bar window.
    """
    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    import numpy as np
    import sys, subprocess, os
    from pathlib import Path

    def open_table(table):
        ## open the file with the OS default application
        if sys.platform == "win32":
            os.startfile(table)
        else:
            opener = "open" if sys.platform == 'darwin' else 'xdg-open'
            subprocess.call([opener, table])

    def write_table(samples_metadata_list, Meta_data_table_xlsx, TaXon_table_xlsx):
        ## write the table, offer to open it, and log the run
        ## (previously duplicated verbatim in both branches)
        metadata_df = pd.DataFrame(samples_metadata_list)
        metadata_df.columns = ["Samples"] + [
            "col_" + str(column) for column in metadata_df.columns.tolist()[1:]
        ]
        metadata_df.to_excel(Meta_data_table_xlsx, index=False)
        answer = sg.PopupYesNo("Open metadata table?", title="Finished", keep_on_top=True)
        if answer == "Yes":
            open_table(Meta_data_table_xlsx)
        from taxontabletools.create_log import ttt_log
        ttt_log("meta data table", "analysis", TaXon_table_xlsx.name,
                Meta_data_table_xlsx.name, "nan", path_to_outdirs)

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    Meta_data_table_xlsx = Path(
        str(path_to_outdirs) + "/" + "Meta_data_table" + "/" +
        TaXon_table_xlsx.stem + "_metadata.xlsx")
    TaXon_table_xslx_df = pd.read_excel(TaXon_table_xlsx)
    # first 10 columns are taxonomy/metadata, the rest are samples
    samples_list = TaXon_table_xslx_df.columns.tolist()[10:]
    samples_metadata_list = []

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')], [
        sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')
    ], [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(samples_list) + 1
    ############################################################################

    for sample in samples_list:
        ## one row per sample: the full name first, then its '_'-separated parts
        samples_metadata_list.append([sample] + sample.split("_"))
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
    window_progress_bar.Close()

    if Meta_data_table_xlsx.exists():
        ## BUGFIX: previously the table was rewritten even when the user
        ## declined to overwrite it — now the function aborts instead
        if sg.PopupYesNo("Metadata tables already exists! Overwrite?") != "Yes":
            return
    write_table(samples_metadata_list, Meta_data_table_xlsx, TaXon_table_xlsx)
def per_taxon_analysis(TaXon_table_xlsx, height, width, taxonomic_level, path_to_outdirs, template, theme, font_size, clustering_unit):
    """Plot per-taxon statistics as a two-panel figure.

    Panel A: percentage of reads per taxon at *taxonomic_level*.
    Panel B: number of OTUs per taxon, annotated with the number of OTUs
    identified to species level. Output is written as PDF and HTML to
    <path_to_outdirs>/Per_taxon_statistics/.

    theme is a (bar color, line color, opacity) triple.
    """
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    from pathlib import Path
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    import itertools, webbrowser

    ## save the taxon title
    taxon_title = clustering_unit

    ## collect plot variables
    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]
    height = int(height)
    width = int(width)

    ## load taxon table
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx)
    TaXon_table_df = TaXon_table_df.fillna("nan")

    ## collect the taxa to test on
    taxa = sorted(list(set([taxon for taxon in TaXon_table_df[taxonomic_level].values.tolist() if taxon != "nan"])))

    ## check if there are more than 8 taxa
    answer = "Yes"
    if len(taxa) > 8:
        answer = sg.PopupYesNo("There are more than 8 taxa detected. This can render the plot difficult to read. Continue anyway?")

    # silently do nothing when the user declines
    if answer == "Yes":
        ## collect the OTUs
        OTUs = TaXon_table_df["ID"].values.tolist()
        ## count the number of OTUs per taxon
        n_OTUs = [TaXon_table_df[taxonomic_level].values.tolist().count(taxon) for taxon in taxa]
        ## collect all OTUs on species level
        OTU_species = [OTU for OTU in TaXon_table_df[[taxonomic_level, "Species"]].values.tolist() if OTU[1] != "nan"]
        ## deduplicate (taxon, species) pairs, then keep the taxon column only
        OTU_species.sort()
        OTU_species = list(k for k,_ in itertools.groupby(OTU_species))
        OTU_species = [OTU[0] for OTU in OTU_species]
        n_species = [OTU_species.count(taxon) for taxon in taxa]
        ## count reads for each taxon (columns 10+ are samples)
        n_reads = []
        for taxon in taxa:
            n_reads.append(sum([sum(OTU[10:]) for OTU in TaXon_table_df[TaXon_table_df[taxonomic_level]==taxon].values.tolist()]))
        ## italicise genus/species names in the axis labels
        if (taxonomic_level == "Species" or taxonomic_level == "Genus"):
            x_values = ["<i>" + taxon + "</i>" for taxon in taxa]
        else:
            x_values = taxa
        ## calculate the read proportions
        reads_sum = sum(n_reads)
        n_reads = [round(reads / reads_sum * 100, 2) for reads in n_reads]
        ## create subplots
        fig = make_subplots(rows=1, cols=2, subplot_titles=("A)", "B)"))
        ## percentage of reads per taxonomic level
        hovertext = 'Taxon: %{x}, Reads: %{y}'
        fig.add_trace(go.Bar(hovertemplate=hovertext, name="",x=x_values, y=n_reads),row=1, col=1)
        fig.update_yaxes(title_text = "reads (%)", title_standoff=5, row=1, col=1)
        fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1, opacity=opacity_value, showlegend=False, row=1, col=1)
        ## Number of OTUs
        hovertext = 'Taxon: %{x}, OTUs: %{y}'
        title_text = "# " + taxon_title
        fig.add_trace(go.Bar(hovertemplate=hovertext, name="",x=x_values, y=n_OTUs, text=n_OTUs, showlegend=False),row=1, col=2)
        fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1, opacity=opacity_value, row=1, col=2)
        fig.update_yaxes(title_text=title_text, title_standoff=5, row=1, col=2, rangemode="tozero")
        ## Number of OTUs on species level, drawn as text labels above the bars
        hovertext = 'Taxon: %{x}, Species: %{text}'
        fig.add_trace(go.Scatter(textposition = "top center", hovertemplate=hovertext, text=n_species, name="Species",x=x_values, y=n_OTUs, showlegend=False, mode='text'),row=1, col=2)
        fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1, opacity=opacity_value, row=1, col=2)
        ## legend-like annotation next to panel B
        fig.add_annotation( text='─ Species', align='left', showarrow=False, xref='paper', yref='paper', x=1.05, y=0.5, bordercolor='black', borderwidth=1)
        ## update the layout
        fig.update_layout(barmode='stack', height=int(height), width=int(width), template=template, showlegend=False, font_size=font_size, title_font_size=font_size)
        fig.update_xaxes(tickmode='linear')
        fig.update_xaxes(tickangle=-90)
        ## write ouput files
        output_pdf = Path(str(path_to_outdirs) + "/Per_taxon_statistics/" + TaXon_table_xlsx.stem + "_" + taxonomic_level + ".pdf")
        output_html = Path(str(path_to_outdirs) + "/Per_taxon_statistics/" + TaXon_table_xlsx.stem + "_" + taxonomic_level + ".html")
        fig.write_image(str(output_pdf))
        fig.write_html(str(output_html))
        ## ask to show file
        answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
        if answer == "Yes":
            webbrowser.open('file://' + str(output_html))
        ## print closing text
        closing_text = "Plots are found under: " + str(path_to_outdirs) + "/Per_taxon_statistics/"
        sg.Popup(closing_text, title="Finished", keep_on_top=True)
        ## write to log
        from taxontabletools.create_log import ttt_log
        ttt_log("per taxon statistics", "analysis", TaXon_table_xlsx.name, output_pdf.name, "", path_to_outdirs)
def filter_samples(TaXon_table_xlsx, selected_samples, appendix_name, path_to_outdirs, sample_filter_method):
    """Keep or remove selected samples from a TaXon table.

    sample_filter_method == "exclude" drops the selected samples; any other
    value keeps only the selected samples. OTUs left with zero reads in every
    remaining sample are removed. The result is written to
    <path_to_outdirs>/TaXon_tables/<stem>_<appendix_name>.xlsx and the run is
    logged.
    """
    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    from pathlib import Path

    TaXon_table_file = Path(TaXon_table_xlsx)
    TaXon_table_xlsx_path = TaXon_table_xlsx
    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_xlsx)
    df = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)
    n_old_OTUs = len(df["ID"].values.tolist())

    ## a single sample may be handed over as a plain string
    if type(selected_samples) == str:
        selected_samples = [selected_samples]

    if sample_filter_method == "exclude":
        ## drop the selected samples
        for sample in selected_samples:
            df = df.drop(sample, axis=1)
    else:
        ## keep only the selected samples (first 10 columns are taxonomy/meta)
        available_samples = df.columns.tolist()[10:]
        for sample in available_samples:
            if sample not in selected_samples:
                df = df.drop(sample, axis=1)

    ## drop OTUs without any reads left
    header = df.columns.values.tolist()
    row_filter_list = [row for row in df.values.tolist() if set(row[10:]) != {0}]
    ## BUGFIX: pass the header directly — the previous separate
    ## `df.columns = header` assignment crashed when no OTU survived
    df = pd.DataFrame(row_filter_list, columns=header)

    file_name = TaXon_table_file.stem
    output_name = Path(
        str(path_to_outdirs) + "/" + "TaXon_tables" + "/" + file_name + "_" +
        appendix_name + ".xlsx")
    df.to_excel(output_name, sheet_name='TaXon table', index=False)

    ## print results for the user
    n_remaining_OTUs = len(df["ID"].values.tolist())
    diff_abs = n_old_OTUs - n_remaining_OTUs
    diff_rel = round(100 - n_remaining_OTUs / n_old_OTUs * 100, 2)

    ## finish script
    closing_text = "Removed " + str(diff_abs) + " OTUs (" + str(
        diff_rel) + "%).\n\n" + "Taxon table is found under:\n" + '/'.join(
            str(output_name).split("/")[-4:])
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log (import deduplicated — ttt_log was previously imported twice)
    from taxontabletools.create_log import ttt_log
    log_text = str(diff_abs) + " OTUs ; " + str(diff_rel) + "%"
    ttt_log("sample filter", "processing", TaXon_table_file.name,
            output_name.name, log_text, path_to_outdirs)
def betadiv_clustering(TaXon_table_xlsx, height, width, threshold, betadiv_linkage, taxonomic_level, path_to_outdirs, template, font_size, diss_metric):
    """Hierarchically cluster samples by beta diversity and plot a dendrogram.

    Reads are aggregated at *taxonomic_level*, a pairwise dissimilarity matrix
    (*diss_metric*, e.g. 'jaccard') is computed with scikit-bio, and samples
    are clustered with scipy linkage method *betadiv_linkage*. Writes the
    dendrogram (PDF, HTML) and the distance matrix (XLSX) to
    <path_to_outdirs>/Beta_diversity/.
    """
    from scipy.cluster.hierarchy import dendrogram, linkage
    import plotly.figure_factory as ff
    import numpy as np
    import pandas as pd
    from skbio.diversity import beta_diversity
    from pathlib import Path
    import PySimpleGUI as sg
    import webbrowser

    ## import table
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0).fillna("unidentified")
    ## create a y axis title text
    taxon_title = taxonomic_level.lower()
    ## adjust taxonomic level if neccessary
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"
    ## collect samples for plot (first 10 columns are taxonomy/metadata)
    samples = TaXon_table_df.columns.tolist()[10:]
    ## extract the relevant data
    TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]
    ## define an aggregation function to combine multiple hit of one taxonimic level
    aggregation_functions = {}
    ## define samples functions
    for sample in samples:
        ## 'sum' will calculate the sum of p/a data
        aggregation_functions[sample] = 'sum'
    ## define taxon level function
    aggregation_functions[taxonomic_level] = 'first'
    ## create condensed dataframe
    df_new = TaXon_table_df.groupby(
        TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)
    if 'unidentified' in df_new.index:
        df_new = df_new.drop('unidentified')
    ## collect reads: one row per sample, one column per taxon
    data = df_new[samples].transpose().values.tolist()
    ## calculate jaccard distances
    dissimilarity_dm = beta_diversity(diss_metric, data, samples)
    ## convert to distance matrix (square form, samples x samples)
    X1 = dissimilarity_dm.data
    matrix_df = pd.DataFrame(X1)
    matrix_df.columns = samples
    matrix_df.index = samples
    ## convert to 2D array (condensed form for scipy linkage)
    X2 = dissimilarity_dm.condensed_form()
    ## cluster dendrogram
    # NOTE(review): the linkagefun closure ignores its argument and always
    # links the condensed matrix X2 — presumably intentional; verify
    fig = ff.create_dendrogram(
        X1,
        labels=samples,
        color_threshold=float(threshold),
        orientation="left",
        linkagefun=lambda x: linkage(X2, betadiv_linkage, metric=diss_metric))
    fig.update_yaxes(ticks="")
    fig.update_xaxes(title="A")
    title = str(diss_metric) + " distance"
    fig.update_layout(xaxis_title=title, height=int(height), width=int(width), template=template, font_size=font_size, title_font_size=font_size)
    # finish script
    output_pdf = Path(
        str(path_to_outdirs) + "/" + "Beta_diversity" + "/" +
        TaXon_table_xlsx.stem + "_" + taxon_title + "_dendrogram_" +
        diss_metric + ".pdf")
    output_html = Path(
        str(path_to_outdirs) + "/" + "Beta_diversity" + "/" +
        TaXon_table_xlsx.stem + "_" + taxon_title + "_dendrogram_" +
        diss_metric + ".html")
    output_xlsx = Path(
        str(path_to_outdirs) + "/" + "Beta_diversity" + "/" +
        TaXon_table_xlsx.stem + "_" + taxon_title + "_dendrogram_" +
        diss_metric + ".xlsx")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))
    matrix_df.to_excel(output_xlsx)
    ## ask to show plot
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))
    ## write to log file
    sg.Popup(diss_metric + " clustering dendrograms are found in", path_to_outdirs, "/Beta_diversity/", title="Finished", keep_on_top=True)
    from taxontabletools.create_log import ttt_log
    ttt_log(diss_metric + " clustering", "analysis", TaXon_table_xlsx.name, output_pdf.name, "", path_to_outdirs)
def read_filter(TaXon_table_xlsx, path_to_outdirs, read_filter_method, read_filter_treshold):
    """Filter low-abundance reads from a TaXon table.

    read_filter_method == "absolute_filtering": set every read count below
    the absolute threshold to zero.
    read_filter_method == "relative_filtering": treat the threshold as a
    percentage of each sample's total reads and zero counts below it.
    OTUs with no reads left are removed; the result is written to
    <path_to_outdirs>/TaXon_tables/<stem>_<threshold>.xlsx and logged.
    """
    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    from pathlib import Path
    import numpy as np

    TaXon_table_file = Path(TaXon_table_xlsx)
    TaXon_table_xlsx_path = TaXon_table_xlsx
    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)
    # first 10 columns are taxonomy/metadata, the rest are samples
    samples = TaXon_table_df.columns.tolist()[10:]

    def _finish(filtered_source_df, log_label):
        ## shared tail (previously duplicated in both branches):
        ## drop all-zero OTUs, save, report and log
        row_filter_list = []
        for row in filtered_source_df.values.tolist():
            reads = set(row[10:])
            if reads != {0}:
                row_filter_list.append(row)
        ## pass columns directly so an empty result still has a valid header
        TaXon_table_df_filtered = pd.DataFrame(
            row_filter_list, columns=filtered_source_df.columns.tolist())

        ## save filtered dataframe to file
        file_name = TaXon_table_file.stem
        ## BUGFIX: str() so a numeric threshold no longer raises a TypeError
        ## during path concatenation
        output_name = Path(
            str(path_to_outdirs) + "/TaXon_tables/" + file_name + "_" +
            str(read_filter_treshold) + ".xlsx")
        TaXon_table_df_filtered.to_excel(output_name,
                                         sheet_name='TaXon table',
                                         index=False)

        ## print results for the user
        n_old_OTUs = len(filtered_source_df["ID"].values.tolist())
        n_remaining_OTUs = len(TaXon_table_df_filtered["ID"].values.tolist())
        diff_abs = n_old_OTUs - n_remaining_OTUs
        diff_rel = round(100 - n_remaining_OTUs / n_old_OTUs * 100, 2)

        ## finish script
        closing_text = "Removed " + str(diff_abs) + " OTUs (" + str(
            diff_rel) + "%).\n\n" + "Taxon table is found under:\n" + '/'.join(
                str(output_name).split("/")[-4:])
        sg.Popup(closing_text, title="Finished", keep_on_top=True)

        from taxontabletools.create_log import ttt_log
        log_text = str(read_filter_treshold) + " ; " + str(
            diff_abs) + " OTUs ; " + str(diff_rel) + "%"
        ttt_log(log_label, "processing", TaXon_table_file.name,
                output_name.name, log_text, path_to_outdirs)

    if read_filter_method == "absolute_filtering":
        ## transform dataframe to array and apply the absolute threshold
        a = np.array(TaXon_table_df[samples].values.tolist())
        TaXon_table_df[samples] = np.where(a < int(read_filter_treshold), 0, a).tolist()
        _finish(TaXon_table_df, "absolute read filter")

    elif read_filter_method == "relative_filtering":
        ## transform to percentage
        read_filter_rel = float(read_filter_treshold) / 100
        for sample in samples:
            a = np.array(TaXon_table_df[sample].values.tolist())
            ## calculate threshold for each sample
            sample_threshold = sum(a) * read_filter_rel
            # NOTE(review): int() truncates the per-sample threshold — kept for
            # behavioural parity with the original
            TaXon_table_df[sample] = np.where(a < int(sample_threshold), 0, a).tolist()
        _finish(TaXon_table_df, "relative read filter")
def calculate_taxonomic_resolution(TaXon_table_xlsx, path_to_outdirs, x_tax_res, y_tax_res, figure_type, template, theme, font_size, clustering_unit):
    """Plot the taxonomic resolution of a TaXon table as a bar chart.

    figure_type "a": number of OTUs whose most resolved assignment is each level.
    any other value: total number of assignments per level.
    Writes a .pdf and .html plot to <path_to_outdirs>/Taxonomic_resolution_plots/
    and logs the run via ttt_log.
    """
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    from pathlib import Path
    import webbrowser

    color1, color2, opacity_value = theme[0], theme[1], theme[2]

    TaXon_table_file = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(Path(TaXon_table_xlsx))
    ## normalize missing values to the literal string 'nan' for counting
    TaXon_table_df = TaXon_table_df.replace(np.nan, 'nan', regex=True)

    taxonomic_levels = ["Phylum", "Class", "Order", "Family", "Genus", "Species"]
    title = "# " + clustering_unit

    ## count the non-missing assignments per taxonomic level
    statistics_dict = {}
    for taxon_to_evaluate in taxonomic_levels:
        taxa_list = [x for x in TaXon_table_df[taxon_to_evaluate].values.tolist() if str(x) != 'nan']
        statistics_dict[taxon_to_evaluate] = len(taxa_list)

    ## OTUs resolved *exactly* to a level = assignments at that level minus
    ## assignments at the next (more resolved) level; Species keeps its total
    highest_level_dict = {}
    for higher, lower in zip(taxonomic_levels, taxonomic_levels[1:]):
        highest_level_dict[higher] = statistics_dict[higher] - statistics_dict[lower]
    highest_level_dict["Species"] = statistics_dict["Species"]

    taxon_levels = list(highest_level_dict.keys())

    ## the two figure types differ only in the y values, titles and file suffix
    if figure_type == "a":
        y_values = list(highest_level_dict.values())
        plot_title = 'Taxonomic resolution (highest taxonomic level)'
        suffix = "_taxonomic_resolution_a"
        log_text = "plot a"
    else:
        y_values = list(statistics_dict.values())
        plot_title = 'Taxonomic resolution (total number of OTUs)'
        suffix = "_taxonomic_resolution_b"
        log_text = "plot b"

    fig = go.Figure(data=[go.Bar(x=taxon_levels, y=y_values, name="Taxon", textposition="outside", text=y_values)])
    fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1, opacity=opacity_value)
    fig.update_layout(title_text=plot_title, yaxis_title=title)
    fig.update_layout(height=int(y_tax_res), width=int(x_tax_res), template=template, font_size=font_size, title_font_size=font_size)

    ## write plot files
    output_pdf = Path(str(path_to_outdirs) + "/" + "Taxonomic_resolution_plots" + "/" + TaXon_table_file.stem + suffix + ".pdf")
    output_html = Path(str(path_to_outdirs) + "/" + "Taxonomic_resolution_plots" + "/" + TaXon_table_file.stem + suffix + ".html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## write log file
    from taxontabletools.create_log import ttt_log
    ttt_log("taxonomic resolution", "analysis", TaXon_table_file.name, output_pdf.name, log_text, path_to_outdirs)

    closing_text = "\n" + "Taxonomic resolution plots are found in: " + str(path_to_outdirs) + "/taxonomic_resolution_plots/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)
def taxon_filter(TaXon_table_xlsx, filtered_taxa, mask, appendix_name, threshold, path_to_outdirs, taxon_filter_method):
    """Filter a TaXon table by taxonomy and similarity threshold.

    mask: column name to evaluate (e.g. "Phylum").
    taxon_filter_method "keep": keep only the listed taxa (unassigned 'nan'
    entries are dropped too); any other value: exclude the listed taxa.
    Rows below the integer similarity threshold (or with 'No Match') are removed.
    Writes "<stem>_<appendix_name>.xlsx" with the table and the filter criteria.
    """
    import PySimpleGUI as sg
    import pandas as pd
    from pathlib import Path

    TaXon_table_file = Path(TaXon_table_xlsx)
    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_file)
    df = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)

    # convert taxa to exclude to a list if only one taxon is given (which is then a string)
    if type(filtered_taxa) == str:
        filtered_taxa = [filtered_taxa]

    if taxon_filter_method == "keep":
        ## invert the selection: everything NOT kept is filtered out
        available_taxa = set(df[mask].values.tolist())
        available_taxa = [x for x in available_taxa if str(x) != 'nan']
        available_taxa = sorted(list(available_taxa))
        filtered_taxa = list(set(available_taxa) - set(filtered_taxa))
        drop_nan = True  # "keep" mode also removes unassigned entries
    else:
        drop_nan = False

    ## collect rows passing the taxonomy filter
    mask_position = list(df.columns).index(mask)
    df_columns = df.columns
    rows_to_keep = []
    for row in df.values.tolist():
        taxon_to_evaluate = row[mask_position]
        if taxon_to_evaluate not in filtered_taxa:
            if not drop_nan or str(taxon_to_evaluate) != 'nan':
                rows_to_keep.append(row)

    df_out = pd.DataFrame(rows_to_keep)

    ## apply the similarity threshold; 'No Match' hits are always removed
    similarity_position = list(df_columns).index("Similarity")
    threshold = int(threshold)
    filtered_rows = []
    for index, row in df_out.iterrows():
        similarity = list(row)[similarity_position]
        if similarity != 'No Match':
            if int(similarity) >= threshold:
                filtered_rows.append(list(row))
    df_out = pd.DataFrame(filtered_rows)

    if df_out.empty:
        ## fixed message (was: "Filter theshold were to harsh: Nothing to print")
        sg.PopupError('Filter threshold was too harsh: nothing to write', title="Error", keep_on_top=True)
    else:
        df_out.columns = df_columns

        # write output file
        file_name = TaXon_table_file.stem
        output_name = Path(str(path_to_outdirs) + "/" + "TaXon_tables" + "/" + file_name + "_" + appendix_name + ".xlsx")

        threshold_output = "Similarity threshold = " + str(threshold)
        filtered_taxa.append(threshold_output)
        df_filtered_taxa = pd.DataFrame(filtered_taxa)
        df_filtered_taxa.columns = ['Filter criteria']

        writer = pd.ExcelWriter(output_name, engine='xlsxwriter')
        df_out.to_excel(writer, sheet_name='TaXon table', index=False)
        df_filtered_taxa.to_excel(writer, sheet_name='Filter criteria', index=False)
        ## close() saves the workbook; the previous extra save() call was
        ## redundant and is deprecated in current pandas
        writer.close()

        ## print results for the user
        n_old_OTUs = len(df["ID"].values.tolist())
        n_remaining_OTUs = len(df_out["ID"].values.tolist())
        diff_abs = n_old_OTUs - n_remaining_OTUs
        diff_rel = round(100 - n_remaining_OTUs / n_old_OTUs * 100, 2)

        ## finish script
        closing_text = "Removed " + str(diff_abs) + " OTUs (" + str(diff_rel) + "%).\n\n" + "Taxon table is found under:\n" + '/'.join(str(output_name).split("/")[-4:])
        sg.Popup(closing_text, title="Finished", keep_on_top=True)

        from taxontabletools.create_log import ttt_log
        log_text = str(diff_abs) + " OTUs ; " + str(diff_rel) + "%"
        ttt_log("taxon filter", "processing", TaXon_table_file.name, output_name.name, log_text, path_to_outdirs)
def rarefaction_curve_legacy(TaXon_table_xlsx, repetitions, path_to_outdirs, template, theme, font_size, taxonomic_level_1):
    """Draw a sample-based rarefaction curve (mean +/- stdev taxa per number of
    samples drawn, over `repetitions` random draw orders).

    Writes a .pdf and .html plot to <path_to_outdirs>/Rarefaction_curves/.
    Raises RuntimeError when the user cancels the progress bar.
    """
    import random
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    from pathlib import Path
    import webbrowser

    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]

    ## create a y axis title text
    taxon_title = taxonomic_level_1.lower()
    ## adjust taxonomic level if necessary
    if taxonomic_level_1 in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level_1
        taxonomic_level_1 = "ID"

    TaXon_table_file = Path(TaXon_table_xlsx)
    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_xlsx)
    df = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)
    df = df.replace(np.nan, "nan")

    available_samples = df.columns.tolist()[10:]
    sample_dict_clean = {}
    ## per sample: the set of taxa with reads > 0 and a real assignment
    for sample in available_samples:
        sample_OTU_list = df[[sample, taxonomic_level_1]].values.tolist()
        sample_species_list = list(set([OTU[1] for OTU in sample_OTU_list if (OTU[0] != 0 and OTU[1] != "nan")]))
        sample_dict_clean[sample] = sample_species_list

    # draw once for each sample
    number_of_draws = len(sample_dict_clean.keys())
    # dictionary to store the drawing results: draw index -> list of taxon counts (one per repetition)
    draw_dictionary = {}

    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / repetitions

    for n_reps in range(0, repetitions):
        ## fresh copy of the sample pool: samples are removed as they are drawn
        sample_dict_to_draw = dict(sample_dict_clean)
        species_list = []
        for i in range(0, number_of_draws):
            ## choose a random remaining sample and accumulate its taxa
            random_choice = random.choice(list(sample_dict_to_draw.keys()))
            species_list = species_list + sample_dict_clean[random_choice]
            n_species = len(set(species_list))
            ## collect the cumulative taxon count for this draw position
            if i not in draw_dictionary.keys():
                draw_dictionary[i] = [n_species]
            else:
                draw_dictionary[i].append(n_species)
            # remove the sample to draw only once
            sample_dict_to_draw.pop(random_choice)

        event, values = window_progress_bar.read(timeout=1)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)

    window_progress_bar.Close()

    ## average and stdev of taxon counts per draw position
    rarefaction_dict_average, rarefaction_dict_stdef = {}, {}

    def average(lst):
        return sum(lst) / len(lst)

    for key, value in draw_dictionary.items():
        rarefaction_dict_average[key] = average(draw_dictionary[key])
        rarefaction_dict_stdef[key] = np.std(draw_dictionary[key], dtype=np.float64)

    # draw the plot
    draws = [i + 1 for i in rarefaction_dict_average.keys()]
    n_species = list(rarefaction_dict_average.values())
    error_bar = list(rarefaction_dict_stdef.values())
    y_axis_title = "# " + taxon_title

    fig = go.Figure(data=[go.Scatter(x=draws, y=n_species, error_y=dict(type='data', array=error_bar, thickness=0.5, width=3, visible=True))])
    fig.update_layout(title_text="repetitions = " + str(repetitions), yaxis_title=y_axis_title, xaxis_title="# samples")
    fig.update_traces(marker_color=color1, marker_line_color=color2, opacity=opacity_value)
    fig.update_layout(height=800, width=1200, template=template, showlegend=False, font_size=font_size, title_font_size=font_size)

    ## write files
    ## BUGFIX: use .stem instead of .name (which kept the .xlsx extension and
    ## produced "<file>.xlsx_rarefaction_*"), and use the same base name for
    ## the pdf and html outputs (they previously differed)
    output_pdf = Path(str(path_to_outdirs) + "/" + "Rarefaction_curves" + "/" + TaXon_table_file.stem + "_rarefaction_" + taxon_title + ".pdf")
    output_html = Path(str(path_to_outdirs) + "/" + "Rarefaction_curves" + "/" + TaXon_table_file.stem + "_rarefaction_" + taxon_title + ".html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## print closing text
    closing_text = "Rarefaction curves are found in: " + str(path_to_outdirs) + "/rarefaction_curves/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("rarefaction curve all-in-one", "analysis", TaXon_table_file.name, output_pdf.name, "nan", path_to_outdirs)
def subtract_NCs(TaXon_table_xlsx, path_to_outdirs, negative_controls):
    """Subtract the summed reads of the negative controls (NCs) from every
    sample, clip negatives to 0, drop OTUs left without reads, and write
    "<stem>_NCsub.xlsx".

    negative_controls: list of NC column names present in the table.
    """
    import PySimpleGUI as sg
    import pandas as pd
    from pathlib import Path

    ## load taxon table
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0)

    ## collect samples (all read columns that are not negative controls)
    samples = [sample for sample in TaXon_table_df.columns.to_list()[10:] if sample not in negative_controls]

    ## calculate sum of NCs per OTU
    df_nc_sum = TaXon_table_df[negative_controls].sum(axis=1)

    ## start from the 10 metadata columns; .copy() avoids mutating a view of
    ## the original dataframe below
    df_out = TaXon_table_df[TaXon_table_df.columns.tolist()[0:10]].copy()

    # subtract the sum of reads found in the NCs from each OTU of the samples
    # NOTE: insert(10, ...) prepends each column, so the sample columns end up
    # in reversed order — kept as-is for output compatibility
    for sample in samples:
        df_out.insert(10, sample, (TaXon_table_df[sample] - df_nc_sum).values.tolist())

    ## replace negative values with 0
    num = df_out._get_numeric_data()
    num[num < 0] = 0

    ## remove empty OTUs
    out_list = [OTU for OTU in df_out.values.tolist() if sum(OTU[10:]) != 0]

    ## check if the table still contains reads
    ## BUGFIX: test the filtered row list — the previous check (df_out.empty)
    ## was done on the unfiltered dataframe and could never trigger
    if not out_list:
        sg.PopupError('Filter threshold was too harsh: nothing to write', title="Error", keep_on_top=True)
    else:
        output_xlsx = Path(str(path_to_outdirs) + "/" + "TaXon_tables" + "/" + TaXon_table_xlsx.stem + "_NCsub.xlsx")
        df_out = pd.DataFrame(out_list, columns=df_out.columns.tolist()).replace("nan", "")
        df_out.to_excel(output_xlsx, sheet_name="TaXon table", index=False)

        from taxontabletools.create_log import ttt_log
        ttt_log("nc subtract", "processing", TaXon_table_xlsx.name, output_xlsx.name, "nan", path_to_outdirs)

        ## finish script
        closing_text = str(len(TaXon_table_df) - len(df_out)) + " OTUs were removed. The Taxon table is found under:\n" + '/'.join(str(output_xlsx).split("/")[-4:])
        sg.Popup(closing_text, title="Finished", keep_on_top=True)
def rarefaction_curve_reads(TaXon_table_xlsx, repetitions, width, height, path_to_outdirs, template, theme, font_size):
    """Draw per-sample read-based rarefaction curves: for each sample, the mean
    number of distinct OTUs observed when subsampling 0%..100% of its reads
    (in 5% steps, averaged over `repetitions` random subsamples), as a grid of
    subplots (4 per row). Writes a .pdf and .html plot to
    <path_to_outdirs>/Rarefaction_curves/. Raises RuntimeError on cancel.
    """
    import pandas as pd
    import PySimpleGUI as sg
    import numpy as np
    from statistics import mean
    from pathlib import Path
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    import math, webbrowser

    TaXon_table_file = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx).fillna("")
    # read columns start after the 10 taxonomy/metadata columns
    samples = TaXon_table_df.columns.tolist()[10:]
    scatter_size = 5
    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]
    height = int(height)

    ## count rows and columns to create subplots
    # NOTE(review): n_columns is 5 while the grid has cols=4 — the counter is
    # reset *after* the 4th column is filled, so the layout works, but the
    # naming is misleading; confirm before changing
    n_rows = math.ceil(len(samples) / 4)
    n_columns = 5
    column_count = 1
    row_count = 1
    fig = make_subplots(rows=n_rows, cols=4, subplot_titles=samples, shared_yaxes=True)

    ## calculate maximum number of OTUs (shared y-limit across subplots)
    max_OTUs = []
    for sample in samples:
        max_OTUs.append(len([OTU for OTU in TaXon_table_df[sample] if OTU != 0]))
    y_limit = max(max_OTUs) + 20

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')], [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')], [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(samples)
    ############################################################################

    ############################################################################
    # initial poll: gives the user a chance to cancel before the first sample
    event, values = window_progress_bar.read(timeout=1)
    if event == 'Cancel' or event is None:
        window_progress_bar.Close()
        raise RuntimeError
    # update bar with loop value +1 so that bar eventually reaches the maximum
    progress_update += 0
    progress_bar.UpdateBar(progress_update)
    ############################################################################

    for sample in samples:
        ## filter sample from data
        read_df = TaXon_table_df[[sample, "ID"]]
        ## drop empty OTUs
        read_df = read_df[read_df[sample] != 0]
        ## create read list to draw the subsamples from: each OTU ID is
        ## repeated once per read, so sampling the series = sampling reads
        read_list = pd.Series(np.repeat(read_df['ID'].to_list(), read_df[sample].to_list()))
        output = []
        ## draw random sample
        for perc in np.arange(0.00, 1.05, 0.05):
            ## calculate sample size
            sub_sample_size = int(len(read_list) * perc)
            ## draw X subsamples of that size; nunique() = distinct OTUs seen
            mean_species = mean([read_list.sample(n = sub_sample_size).nunique() for i in range(repetitions)])
            output.append(mean_species)
        output = pd.DataFrame({'percentage': np.arange(0.00, 1.05, 0.05), 'mean_OTUs': output})

        ## write plot
        fig.add_trace(go.Scatter(x=output["percentage"], y=output["mean_OTUs"], name=sample, mode='markers+lines', marker=dict(size=int(scatter_size))), row=row_count, col=column_count)
        fig.update_traces(marker_color=color1, marker_line_color=color2, opacity=opacity_value, row=row_count, col=column_count)
        fig.update_yaxes(range=[0, y_limit], row=row_count, col=column_count)
        ## add a y axis title to all left bound plots
        if column_count == 1:
            fig.update_yaxes(title_text="# OTUs", row=row_count, col=column_count)
        ## add x axis title to all plots in the last row
        # if row_count == n_rows:
        #     fig.update_xaxes(title_text="subsample (%)", row=row_count, col=column_count)
        # advance through the 4-wide grid; grow the figure by 100px per new row
        column_count += 1
        if column_count == n_columns:
            column_count = 1
            row_count += 1
            height += 100

        ############################################################################
        event, values = window_progress_bar.read(timeout=1)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    fig.update_layout(height=int(height), width=int(width), template=template, font_size=font_size, title_font_size=font_size, showlegend=False)
    fig.update_yaxes(rangemode="tozero")
    fig.update_xaxes(rangemode="tozero")

    ## write files
    # NOTE(review): TaXon_table_file.name keeps the ".xlsx" extension, so the
    # output is "<file>.xlsx_rarefaction_reads.pdf" — .stem is likely intended
    output_pdf = Path(str(path_to_outdirs) + "/" + "Rarefaction_curves" + "/" + TaXon_table_file.name + "_rarefaction_reads.pdf")
    output_html = Path(str(path_to_outdirs) + "/" + "Rarefaction_curves" + "/" + TaXon_table_file.name + "_rarefaction_reads.html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## print closing text
    closing_text = "Rarefaction curves are found in: " + str(path_to_outdirs) + "/rarefaction_curves/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("rarefaction curve reads", "analysis", TaXon_table_file.name, output_pdf.name, repetitions, path_to_outdirs)
def NMDS_analysis(TaXon_table_xlsx, meta_data_to_test, taxonomic_level, width, height, nmds_s, max_iter_val, n_init_val, path_to_outdirs, template, font_size, color_discrete_sequence, nmds_dissimilarity):
    """Run a non-metric MDS ordination on a TaXon table and plot the results.

    Aggregates reads per taxon at `taxonomic_level`, computes a dissimilarity
    matrix (`nmds_dissimilarity`, e.g. 'jaccard') between samples, fits NMDS
    for 1..10 dimensions, and writes stress / 2D / 3D plots (pdf + html) to
    <path_to_outdirs>/NMDS_plots/<table stem>/. Samples whose metadata value
    is empty are dropped first. Raises RuntimeError when the metadata is
    unique per sample (nothing to group by).
    """
    import pandas as pd
    import numpy as np
    from skbio.diversity import beta_diversity
    from sklearn.manifold import MDS
    import plotly.graph_objects as go
    import plotly.express as px
    from pathlib import Path
    import PySimpleGUI as sg
    import os, webbrowser
    from itertools import combinations

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    # metadata table is expected next to the project output dirs, named "<stem>_metadata.xlsx"
    Meta_data_table_xlsx = Path(str(path_to_outdirs) + "/" + "Meta_data_table" + "/" + TaXon_table_xlsx.stem + "_metadata.xlsx")

    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0).fillna("unidentified")
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx, header=0).fillna("nan")
    Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()
    metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test)

    ## drop samples with metadata called nan (= empty)
    drop_samples = [i[0] for i in Meta_data_table_df.values.tolist() if i[metadata_loc] == "nan"]

    if drop_samples != []:
        ## filter the TaXon table
        TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1)
        TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
        ## also remove empty OTUs (rows whose remaining read columns are all 0)
        row_filter_list = []
        for row in TaXon_table_df.values.tolist():
            reads = set(row[10:])
            if reads != {0}:
                row_filter_list.append(row)
        columns = TaXon_table_df.columns.tolist()
        TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns)
        ## keep the metadata table in sync with the dropped samples
        Meta_data_table_df = pd.DataFrame([i for i in Meta_data_table_df.values.tolist() if i[0] not in drop_samples], columns=Meta_data_table_df.columns.tolist())
        Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    ## create a y axis title text
    taxon_title = taxonomic_level.lower()
    ## adjust taxonomic level if necessary
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    ## create a subfolder for better sorting and overview
    dirName = Path(str(path_to_outdirs) + "/" + "NMDS_plots" + "/" + TaXon_table_xlsx.stem + "/")
    if not os.path.exists(dirName):
        os.mkdir(dirName)

    # check if the meta data differs: one distinct value per sample means
    # there are no groups to ordinate against
    if len(set(Meta_data_table_df[meta_data_to_test])) == len(Meta_data_table_df['Samples'].tolist()):
        sg.Popup("The meta data is unique for all samples. Please adjust the meta data table!", title=("Error"))
        raise RuntimeError

    # only proceed when the taxon table and metadata table contain the same samples
    if sorted(TaXon_table_samples) == sorted(Meta_data_table_samples):
        samples = Meta_data_table_samples

        ## extract the relevant data
        TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]
        ## define an aggregation function to combine multiple hits of one taxonomic level
        aggregation_functions = {}
        ## define samples functions
        for sample in samples:
            ## 'sum' will calculate the sum of p/a data
            aggregation_functions[sample] = 'sum'
        ## define taxon level function
        aggregation_functions[taxonomic_level] = 'first'
        ## create condensed dataframe (one row per taxon)
        df_new = TaXon_table_df.groupby(TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)

        if 'unidentified' in df_new.index:
            df_new = df_new.drop('unidentified')

        ## collect reads (one row per sample for the distance calculation)
        data = df_new[samples].transpose().values.tolist()
        ## calculate the sample-by-sample dissimilarity matrix
        jaccard_dm = beta_diversity(nmds_dissimilarity, data, samples)

        ## NMDS function
        # NOTE(review): the `matrix` parameter is never used — the function
        # closes over jaccard_dm and slices it to the first 100 samples
        # (jaccard_dm[:100]); confirm whether the slice is intentional
        def nmds_function(matrix, dimensions):
            nmds = MDS(n_components=dimensions, metric=False, dissimilarity='precomputed', max_iter=int(max_iter_val), n_init=int(n_init_val))
            nmds_results = nmds.fit(jaccard_dm[:100])
            stress = round(nmds_results.stress_, 2)
            nmds_array = nmds_results.embedding_
            return ({"stress": stress, "nmds_results": nmds_array})

        answer = sg.PopupOKCancel("The NMDS calculation may take a while. Continue?")
        if answer == "OK":
            ## test different dimensions (1..10), keeping results and stress per dimension
            nmds_results_dict = {}
            stress_dict = {}
            for i in range(1, 11):
                nmds_results = nmds_function(jaccard_dm, i)
                nmds_results_dict[i] = nmds_results
                stress_dict[i] = nmds_results["stress"]

            ####################################################################################################
            ## options dialog: which plots to open, and whether to connect samples of the same category
            win2_active = True
            layout2 = [
                [sg.Text("NMDS analysis options", size=(20, 1))],
                [sg.CB("Show stress plot", default=True, key="stress_plot")],
                [sg.CB("Show NMDS 2D plot", default=True, key="2d_plot")],
                [sg.CB("Show NMDS 3D plot", default=True, key="3d_plot")],
                [sg.CB("Connect categories", default=True, key="draw_mesh")],
                [sg.Text("")],
                [sg.Button("Apply")]
            ]
            win2 = sg.Window('NMDS analysis', layout2, keep_on_top=False)
            while True:
                event2, values2 = win2.Read()
                if event2 is None or event2 == 'Apply':
                    win2.close()
                    win2_active = False
                    break
            ####################################################################################################
            ## plot stress and dimensions (scree-style plot to pick a dimensionality)
            # NOTE(review): name=sample reuses the last value of the earlier
            # loop variable; the legend is hidden so it has no visible effect
            fig = go.Figure()
            fig.add_trace(go.Scatter(x=list(stress_dict.keys()), y=list(stress_dict.values()), mode='markers+lines', name=sample, marker=dict(color="Blue", size=int(10))))
            fig.update_layout(showlegend=False, xaxis_title="Dimensions", yaxis_title="Stress")
            fig.update_layout(height=int(600), width=int(800), template=template, showlegend=False, font_size=font_size, title_font_size=font_size)
            ## define output files
            output_pdf = Path(str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + "_stress.pdf")
            output_html = Path(str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + "_stress.html")
            ## write output files
            fig.write_image(str(output_pdf))
            fig.write_html(str(output_html))
            ## ask to show file
            if values2['stress_plot'] == True:
                webbrowser.open('file://' + str(output_html))
            ####################################################################################################
            ## plot 2D (uses the 2-dimensional NMDS solution)
            stress = stress_dict[2]
            if values2["draw_mesh"] == True:
                ## create dataframe from NMDS results
                nmds_results_df = pd.DataFrame(nmds_results_dict[2]["nmds_results"], index=[samples])
                nmds_results_df.rename(columns={0: 'NMDS1', 1: 'NMDS2'}, inplace=True)
                nmds_results_df["Sample"] = samples
                nmds_results_df[meta_data_to_test] = Meta_data_table_df[meta_data_to_test].values.tolist()
                ## duplicate each within-category pair of points so plotly draws
                ## connecting lines between samples of the same metadata value
                combinations_list = []
                for metadata in nmds_results_df[meta_data_to_test]:
                    ## collect all entries for the respective metadata
                    arr = nmds_results_df.loc[nmds_results_df[meta_data_to_test] == metadata][['NMDS1', 'NMDS2', meta_data_to_test, "Sample"]].to_numpy()
                    ## create a df for all possible combinations using itertools combinations
                    for entry in list(combinations(arr, 2)):
                        combinations_list.append(list(entry[0]))
                        combinations_list.append(list(entry[1]))
                ## create a dataframe to draw the plot from
                df = pd.DataFrame(combinations_list)
                df.columns = ['NMDS1', 'NMDS2', meta_data_to_test, "Sample"]
                ## plot NMDS
                fig = go.Figure()
                fig = px.scatter(df, x="NMDS1", y="NMDS2", hover_data=['Sample'], color=meta_data_to_test, color_discrete_sequence=color_discrete_sequence)
                fig.update_traces(marker_size=int(nmds_s), mode="markers+lines", line=dict(width=0.5))
                fig.update_layout(title="Stress=" + str(stress), yaxis_title="NMDS1", xaxis_title="NMDS2")
                fig.update_layout(height=int(height), width=int(width), template=template, showlegend=True, font_size=font_size, title_font_size=font_size)
            else:
                ## create dataframe from NMDS results
                nmds_results_df = pd.DataFrame(nmds_results_dict[2]["nmds_results"], index=[samples])
                nmds_results_df.rename(columns={0: 'X', 1: 'Y'}, inplace=True)
                nmds_results_df[meta_data_to_test] = Meta_data_table_df[meta_data_to_test].values.tolist()
                nmds_results_df["Sample"] = samples
                ## plot NMDS
                fig = go.Figure()
                fig = px.scatter(nmds_results_df, x="X", y="Y", hover_data=['Sample'], color=meta_data_to_test, color_discrete_sequence=color_discrete_sequence)
                fig.update_traces(marker_size=int(nmds_s), mode="markers")
                fig.update_layout(title="Stress=" + str(stress), yaxis_title="NMDS1", xaxis_title="NMDS2")
                fig.update_layout(height=int(height), width=int(width), template=template, showlegend=True, font_size=font_size, title_font_size=font_size)
            ## define output files
            output_pdf = Path(str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + "_2d.pdf")
            output_html = Path(str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + "_2d.html")
            ## write output files
            fig.write_image(str(output_pdf))
            fig.write_html(str(output_html))
            ## ask to show file
            if values2['2d_plot'] == True:
                webbrowser.open('file://' + str(output_html))
            ####################################################################################################
            ## plot 3D (uses the 3-dimensional NMDS solution)
            stress = stress_dict[3]
            if values2["draw_mesh"] == True:
                ## create dataframe from NMDS results
                nmds_results_df = pd.DataFrame(nmds_results_dict[3]["nmds_results"], index=[samples])
                nmds_results_df["Sample"] = samples
                nmds_results_df[meta_data_to_test] = Meta_data_table_df[meta_data_to_test].values.tolist()
                nmds_results_df.rename(columns={0: 'NMDS1', 1: 'NMDS2', 2: 'NMDS3'}, inplace=True)
                ## same pair-duplication trick as in 2D to connect categories
                combinations_list = []
                for metadata in nmds_results_df[meta_data_to_test]:
                    ## collect all entries for the respective metadata
                    arr = nmds_results_df.loc[nmds_results_df[meta_data_to_test] == metadata][['NMDS1', 'NMDS2', 'NMDS3', meta_data_to_test, "Sample"]].to_numpy()
                    ## create a df for all possible combinations using itertools combinations
                    for entry in list(combinations(arr, 2)):
                        combinations_list.append(list(entry[0]))
                        combinations_list.append(list(entry[1]))
                ## create a dataframe to draw the plot from
                df = pd.DataFrame(combinations_list)
                df.columns = ['NMDS1', 'NMDS2', 'NMDS3', meta_data_to_test, "Sample"]
                ## plot NMDS
                fig = go.Figure()
                ## draw the plot
                fig = px.scatter_3d(df, x="NMDS1", y="NMDS2", z="NMDS3", color=meta_data_to_test, text="Sample", title="textbox", color_discrete_sequence=color_discrete_sequence)
                fig.update_traces(marker_size=int(12), mode="markers+lines", line=dict(width=1))
                fig.update_layout(height=int(height), width=int(width), template=template, title="Stress=" + str(stress), showlegend=True, font_size=font_size, title_font_size=font_size)
                fig.update_layout(scene=dict(xaxis_title="NMDS1", yaxis_title="NMDS2", zaxis_title="NMDS3"))
            else:
                ## create dataframe from NMDS results
                nmds_results_df = pd.DataFrame(nmds_results_dict[3]["nmds_results"], index=[samples])
                nmds_results_df["Sample"] = samples
                nmds_results_df[meta_data_to_test] = Meta_data_table_df[meta_data_to_test].values.tolist()
                nmds_results_df.rename(columns={0: 'NMDS1', 1: 'NMDS2', 2: 'NMDS3'}, inplace=True)
                ## plot NMDS
                fig = go.Figure()
                ## draw the plot
                fig = px.scatter_3d(nmds_results_df, x="NMDS1", y="NMDS2", z="NMDS3", color=meta_data_to_test, color_discrete_sequence=color_discrete_sequence)
                fig.update_traces(marker_size=int(12), mode="markers", line=dict(width=1))
                fig.update_layout(height=int(height), width=int(width), template=template, title="Stress=" + str(stress), showlegend=True, font_size=font_size, title_font_size=font_size)
                fig.update_layout(scene=dict(xaxis_title="NMDS1", yaxis_title="NMDS2", zaxis_title="NMDS3"))
            ## define output files
            output_pdf = Path(str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + "_3d.pdf")
            output_html = Path(str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + "_3d.html")
            ## write output files
            fig.write_image(str(output_pdf))
            fig.write_html(str(output_html))
            ## ask to show file
            if values2['3d_plot'] == True:
                webbrowser.open('file://' + str(output_html))
            ####################################################################################################
            ## print closing text
            closing_text = "NMDS plots are found in: " + str(path_to_outdirs) + "/NMDS_plots/"
            sg.Popup(closing_text, title="Finished", keep_on_top=True)
            ## write log file
            from taxontabletools.create_log import ttt_log
            ttt_log("nmds analysis", "analysis", TaXon_table_xlsx.name, output_pdf.name, meta_data_to_test, path_to_outdirs)
def basic_stats(TaXon_table_xlsx, heigth, width, path_to_outdirs, template, theme, font_size, taxonomic_level):
    """Calculate, plot and display basic summary statistics for a TaXon table.

    Reads the TaXon table, counts samples, OTUs and distinct taxa per
    taxonomic rank, summarises database status entries, sequence lengths and
    per-sample read statistics. Writes a three-panel bar plot (PDF + HTML)
    and an xlsx summary to <path_to_outdirs>/Basic_stats/ and shows the
    general information in a PySimpleGUI window.

    Args:
        TaXon_table_xlsx: path to the TaXon table xlsx file.
        heigth: plot height in pixels (misspelling kept for caller compatibility).
        width: plot width in pixels.
        path_to_outdirs: project output directory.
        template: plotly layout template name.
        theme: (fill color, line color, opacity) for the bar traces.
        font_size: plotly font size.
        taxonomic_level: label used in plot titles, e.g. "OTUs" or "Species".
    """
    import webbrowser
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    from pathlib import Path
    from plotly.subplots import make_subplots
    import plotly.graph_objects as go

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx)
    TaXon_table_df = TaXon_table_df.replace(np.nan, 'nan', regex=True)

    ## title used for the OTU-like entries in plot titles.
    ## BUGFIX: previously taxon_title was only assigned inside
    ## `if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]`, which raised
    ## a NameError for any other taxonomic level.
    taxon_title = taxonomic_level

    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]

    # number of samples (sample columns start at column 11 of a TaXon table)
    n_samples = len(TaXon_table_df.columns[10:].tolist())
    # total number of OTUs
    n_OTUs_total = len(TaXon_table_df['ID'].tolist())

    # number of distinct taxa per taxonomic rank, excluding 'nan' placeholders
    def _n_taxa(level):
        # one-line purpose: count unique non-'nan' entries in the given rank column
        taxa = set(TaXon_table_df[level].tolist())
        taxa.discard("nan")
        return len(taxa)

    n_Phyla = _n_taxa('Phylum')
    n_Classes = _n_taxa('Class')
    n_Orders = _n_taxa('Order')
    n_Families = _n_taxa('Family')
    n_Genera = _n_taxa('Genus')
    n_Species = _n_taxa('Species')

    # number of OTUs per database status entry
    # (status list hoisted out of the loop; it was previously recomputed per status)
    status_list = TaXon_table_df['Status'].tolist()
    status_dict = {status: status_list.count(status) for status in set(status_list)}

    # sequence length statistics
    sequence_len_list = [len(sequence) for sequence in TaXon_table_df['seq'].tolist()]
    min_len_seq = min(sequence_len_list)
    max_len_seq = max(sequence_len_list)
    ## BUGFIX: the average was previously computed over the SET of unique
    ## lengths, which biases the mean; average over all sequences instead.
    avg_len_seq = round(sum(sequence_len_list) / len(sequence_len_list))

    # read stats per sample: [total reads, avg reads, n OTUs, n species]
    samples = TaXon_table_df.columns[10:].tolist()
    reads_dict = {}
    for sample in samples:
        # read stats
        reads_list = TaXon_table_df[sample].tolist()
        reads_sum = sum(reads_list)
        reads_avg = round(sum(reads_list) / len(reads_list))
        # OTU stats: OTUs present in this sample (non-zero read count)
        n_OTUs = len([reads for reads in reads_list if reads != 0])
        # Species stats: identified species with non-zero reads in this sample
        species_list = [entry[0] for entry in TaXon_table_df[["Species", sample]].values.tolist()
                        if entry[0] != 'nan' and entry[1] != 0]
        n_species = len(set(species_list))
        # combine to dict
        reads_dict[sample] = [reads_sum, reads_avg, n_OTUs, n_species]

    # read stats total
    read_sum_total = sum(stats[0] for stats in reads_dict.values())

    #####################################################################################
    # Plot reads, OTUs and species per sample as three stacked subplots
    reads = [i[0] for i in reads_dict.values()]
    otus = [i[2] for i in reads_dict.values()]
    species = [i[3] for i in reads_dict.values()]
    max_otus = max(otus) + 20  # headroom above the tallest OTU bar
    width, heigth = int(width), int(heigth)

    # create subplots
    y_title = "# " + taxon_title
    title_3 = taxon_title + " on species level"
    fig = make_subplots(rows=3, cols=1, subplot_titles=("Reads", taxon_title, title_3),
                        vertical_spacing=0.05, shared_xaxes=True)
    # reads
    fig.add_trace(go.Bar(name="reads", x=samples, y=reads), row=1, col=1)
    fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1.5,
                      opacity=opacity_value, row=1, col=1)
    fig.update_yaxes(title_text="# reads", row=1, col=1)
    # OTUs
    fig.add_trace(go.Bar(name=taxon_title, x=samples, y=otus), row=2, col=1)
    fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1.5,
                      opacity=opacity_value, row=2, col=1)
    fig.update_yaxes(range=[0, max_otus], title_text=y_title, row=2, col=1)
    # OTUs on species level
    fig.add_trace(go.Bar(name=title_3, x=samples, y=species), row=3, col=1)
    fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1.5,
                      opacity=opacity_value, row=3, col=1)
    fig.update_yaxes(range=[0, max_otus], title_text=y_title, row=3, col=1)
    # update the layout
    fig.update_layout(height=heigth, width=width, template=template, showlegend=False,
                      font_size=font_size, title_font_size=font_size)

    ## write the figure files
    basic_stats_directory = Path(str(path_to_outdirs) + "/" + "Basic_stats" + "/" + TaXon_table_xlsx.stem)
    output_pdf = Path(str(basic_stats_directory) + "_basic_stats.pdf")
    output_html = Path(str(basic_stats_directory) + "_basic_stats.html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show plot
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    #####################################################################################
    # Build the two summary tables: general information and per-sample stats
    output_list_1 = []
    output_list_2 = []
    output_list_1.append([' Samples', n_samples, ''])
    output_list_1.append([' ' + taxon_title, n_OTUs_total, ''])
    output_list_1.append(['Number of taxa per taxon level', '#', ''])
    output_list_1.append([' Phyla', n_Phyla, ''])
    output_list_1.append([' Classes', n_Classes, ''])
    output_list_1.append([' Orders', n_Orders, ''])
    output_list_1.append([' Families', n_Families, ''])
    output_list_1.append([' Genera', n_Genera, ''])
    output_list_1.append([' Species', n_Species, ''])
    output_list_1.append(['Database status', '#', ''])
    for status, count in status_dict.items():
        output_list_1.append([" " + status, count, ''])
    output_list_1.append(['Sequence length', '(bp)', ''])
    output_list_1.append([' Min', min_len_seq, ''])
    output_list_1.append([' Avg', avg_len_seq, ''])
    output_list_1.append([' Max', max_len_seq, ''])
    for sample, reads_stats in reads_dict.items():
        # columns: sample, avg reads, total reads, n OTUs, n species
        output_list_2.append([sample, reads_stats[1], reads_stats[0], reads_stats[2], reads_stats[3]])
    output_list_2.append(['Total reads', '', read_sum_total, '', ''])

    df_1 = pd.DataFrame(output_list_1, columns=['Category', '#', ''])
    df_2 = pd.DataFrame(output_list_2, columns=["Sample", "avg reads", "total reads", "n OTUs", "n Species"])
    df_out = pd.concat([df_1, df_2], axis=1)
    df_out = df_out.replace(np.nan, '', regex=True)

    ## write the summary table next to the figures
    basic_stats_xlsx = Path(str(basic_stats_directory) + "_basic_stats.xlsx")
    df_out.to_excel(basic_stats_xlsx, index=False)

    ## show the general information in a simple window until the user closes it
    table_1 = [['Category', '#']] + df_1.values.tolist()
    table_layout_1 = [[sg.Text(' '.join(list(map(str, row))), size=(70, 1))] for row in table_1]
    layout = [
        [sg.TabGroup([[sg.Tab('General information', table_layout_1)]])],
        [sg.Button("Close", key="Close")]]
    window_basic_stats = sg.Window('Basic stats', layout, keep_on_top=True)
    while True:
        event, values = window_basic_stats.Read()
        if event is None or event == 'Close':
            window_basic_stats.close()
            break

    ## write to log file
    from taxontabletools.create_log import ttt_log
    ttt_log("basic stats", "analysis", TaXon_table_xlsx.name, basic_stats_xlsx.name, "nan", path_to_outdirs)
def gbif_occurrence(TaXon_table_xlsx, width, height, continents_to_check, template, theme, font_size, path_to_outdirs):
    """Query GBIF for per-country occurrence counts of all species in a TaXon table.

    Each species in the table is looked up once per country of the selected
    continents via the GBIF occurrence REST API. Countries without a single
    hit for any species are dropped, the remaining counts are converted to
    relative abundances per species and drawn as a stacked bar chart
    (PDF + HTML). Absolute and relative count tables are written to one xlsx
    file under <path_to_outdirs>/Occurrence_analysis/<table name>/.

    Parameters:
        TaXon_table_xlsx: path to the TaXon table xlsx file.
        width, height: figure dimensions in pixels.
        continents_to_check: continent names restricting the queried countries.
        template: plotly layout template name.
        theme: color theme (not referenced inside this function).
        font_size: plotly font size.
        path_to_outdirs: project output directory.

    Raises:
        RuntimeError: when the user cancels the progress-bar window.
    """
    import requests_html, json
    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    import numpy as np
    from pathlib import Path
    import plotly.graph_objects as go
    import os, webbrowser

    ## dictionary with all country codes of the Earth:
    ## country name -> [ISO 3166-1 alpha-2 code, continent]
    country_codes_dict = {
        'Andorra': ['AD', 'Europe'], 'United Arab Emirates': ['AE', 'Asia'], 'Afghanistan': ['AF', 'Asia'],
        'Antigua and Barbuda': ['AG', 'North America'], 'Anguilla': ['AI', 'North America'], 'Albania': ['AL', 'Europe'],
        'Armenia': ['AM', 'Asia'], 'Angola': ['AO', 'Africa'], 'Antarctica': ['AQ', 'Antarctica'],
        'Argentina': ['AR', 'South America'], 'American Samoa': ['AS', 'Oceania'], 'Austria': ['AT', 'Europe'],
        'Australia': ['AU', 'Oceania'], 'Aruba': ['AW', 'North America'], 'Åland Islands': ['AX', 'Europe'],
        'Azerbaijan': ['AZ', 'Asia'], 'Bosnia and Herzegovina': ['BA', 'Europe'], 'Barbados': ['BB', 'North America'],
        'Bangladesh': ['BD', 'Asia'], 'Belgium': ['BE', 'Europe'], 'Burkina Faso': ['BF', 'Africa'],
        'Bulgaria': ['BG', 'Europe'], 'Bahrain': ['BH', 'Asia'], 'Burundi': ['BI', 'Africa'],
        'Benin': ['BJ', 'Africa'], 'Saint Barthélemy': ['BL', 'North America'], 'Bermuda': ['BM', 'North America'],
        'Brunei Darussalam': ['BN', 'Asia'], 'Bolivia': ['BO', 'South America'],
        'Bonaire, Sint Eustatius and Saba': ['BQ', 'North America'], 'Brazil': ['BR', 'South America'],
        'Bahamas': ['BS', 'North America'], 'Bhutan': ['BT', 'Asia'], 'Bouvet Island': ['BV', 'Antarctica'],
        'Botswana': ['BW', 'Africa'], 'Belarus': ['BY', 'Europe'], 'Belize': ['BZ', 'North America'],
        'Canada': ['CA', 'North America'], 'Cocos (Keeling) Islands': ['CC', 'Asia'],
        'Congo (Democratic Republic)': ['CD', 'Africa'], 'Central African Republic': ['CF', 'Africa'],
        'Congo': ['CG', 'Africa'], 'Switzerland': ['CH', 'Europe'], "Côte d'Ivoire": ['CI', 'Africa'],
        'Cook Islands': ['CK', 'Oceania'], 'Chile': ['CL', 'South America'], 'Cameroon': ['CM', 'Africa'],
        'China': ['CN', 'Asia'], 'Colombia': ['CO', 'South America'], 'Costa Rica': ['CR', 'North America'],
        'Cuba': ['CU', 'North America'], 'Cabo Verde': ['CV', 'Africa'], 'Curaçao': ['CW', 'North America'],
        'Christmas Island': ['CX', 'Asia'], 'Cyprus': ['CY', 'Asia'], 'Czechia': ['CZ', 'Europe'],
        'Germany': ['DE', 'Europe'], 'Djibouti': ['DJ', 'Africa'], 'Denmark': ['DK', 'Europe'],
        'Dominica': ['DM', 'North America'], 'Dominican Republic': ['DO', 'North America'], 'Algeria': ['DZ', 'Africa'],
        'Ecuador': ['EC', 'South America'], 'Estonia': ['EE', 'Europe'], 'Egypt': ['EG', 'Africa'],
        'Western Sahara': ['EH', 'Africa'], 'Eritrea': ['ER', 'Africa'], 'Spain': ['ES', 'Europe'],
        'Ethiopia': ['ET', 'Africa'], 'Finland': ['FI', 'Europe'], 'Fiji': ['FJ', 'Oceania'],
        'Falkland Islands': ['FK', 'South America'], 'Micronesia': ['FM', 'Oceania'], 'Faroe Islands': ['FO', 'Europe'],
        'France': ['FR', 'Europe'], 'Gabon': ['GA', 'Africa'], 'United Kingdom': ['GB', 'Europe'],
        'Grenada': ['GD', 'North America'], 'Georgia': ['GE', 'Asia'], 'French Guiana': ['GF', 'South America'],
        'Guernsey': ['GG', 'Europe'], 'Ghana': ['GH', 'Africa'], 'Gibraltar': ['GI', 'Europe'],
        'Greenland': ['GL', 'North America'], 'Gambia': ['GM', 'Africa'], 'Guinea': ['GN', 'Africa'],
        'Guadeloupe': ['GP', 'North America'], 'Equatorial Guinea': ['GQ', 'Africa'], 'Greece': ['GR', 'Europe'],
        'South Georgia and the South Sandwich Islands': ['GS', 'Antarctica'], 'Guatemala': ['GT', 'North America'],
        'Guam': ['GU', 'Oceania'], 'Guinea-Bissau': ['GW', 'Africa'], 'Guyana': ['GY', 'South America'],
        'Hong Kong': ['HK', 'Asia'], 'Heard Island and McDonald Islands': ['HM', 'Antarctica'],
        'Honduras': ['HN', 'North America'], 'Croatia': ['HR', 'Europe'], 'Haiti': ['HT', 'North America'],
        'Hungary': ['HU', 'Europe'], 'Indonesia': ['ID', 'Asia'], 'Ireland': ['IE', 'Europe'],
        'Israel': ['IL', 'Asia'], 'Isle of Man': ['IM', 'Europe'], 'India': ['IN', 'Asia'],
        'British Indian Ocean Territory': ['IO', 'Asia'], 'Iraq': ['IQ', 'Asia'], 'Iran': ['IR', 'Asia'],
        'Iceland': ['IS', 'Europe'], 'Italy': ['IT', 'Europe'], 'Jersey': ['JE', 'Europe'],
        'Jamaica': ['JM', 'North America'], 'Jordan': ['JO', 'Asia'], 'Japan': ['JP', 'Asia'],
        'Kenya': ['KE', 'Africa'], 'Kyrgyzstan': ['KG', 'Asia'], 'Cambodia': ['KH', 'Asia'],
        'Kiribati': ['KI', 'Oceania'], 'Comoros': ['KM', 'Africa'], 'Saint Kitts and Nevis': ['KN', 'North America'],
        "Korea (Democratic People's Republic)": ['KP', 'Asia'], 'Korea (Republic)': ['KR', 'Asia'],
        'Kuwait': ['KW', 'Asia'], 'Cayman Islands': ['KY', 'North America'], 'Kazakhstan': ['KZ', 'Asia'],
        "Lao People's Democratic Republic": ['LA', 'Asia'], 'Lebanon': ['LB', 'Asia'],
        'Saint Lucia': ['LC', 'North America'], 'Liechtenstein': ['LI', 'Europe'], 'Sri Lanka': ['LK', 'Asia'],
        'Liberia': ['LR', 'Africa'], 'Lesotho': ['LS', 'Africa'], 'Lithuania': ['LT', 'Europe'],
        'Luxembourg': ['LU', 'Europe'], 'Latvia': ['LV', 'Europe'], 'Libya': ['LY', 'Africa'],
        'Morocco': ['MA', 'Africa'], 'Monaco': ['MC', 'Europe'], 'Moldova (the Republic of)': ['MD', 'Europe'],
        'Montenegro': ['ME', 'Europe'], 'Saint Martin (French part)': ['MF', 'North America'],
        'Madagascar': ['MG', 'Africa'], 'Marshall Islands': ['MH', 'Oceania'],
        'Republic of North Macedonia': ['MK', 'Europe'], 'Mali': ['ML', 'Africa'], 'Myanmar': ['MM', 'Asia'],
        'Mongolia': ['MN', 'Asia'], 'Macao': ['MO', 'Asia'], 'Northern Mariana Islands': ['MP', 'Oceania'],
        'Martinique': ['MQ', 'North America'], 'Mauritania': ['MR', 'Africa'], 'Montserrat': ['MS', 'North America'],
        'Malta': ['MT', 'Europe'], 'Mauritius': ['MU', 'Africa'], 'Maldives': ['MV', 'Asia'],
        'Malawi': ['MW', 'Africa'], 'Mexico': ['MX', 'North America'], 'Malaysia': ['MY', 'Asia'],
        'Mozambique': ['MZ', 'Africa'], 'Namibia': ['NA', 'Africa'], 'New Caledonia': ['NC', 'Oceania'],
        'Niger': ['NE', 'Africa'], 'Norfolk Island': ['NF', 'Oceania'], 'Nigeria': ['NG', 'Africa'],
        'Nicaragua': ['NI', 'North America'], 'Netherlands': ['NL', 'Europe'], 'Norway': ['NO', 'Europe'],
        'Nepal': ['NP', 'Asia'], 'Nauru': ['NR', 'Oceania'], 'Niue': ['NU', 'Oceania'],
        'New Zealand': ['NZ', 'Oceania'], 'Oman': ['OM', 'Asia'], 'Panama': ['PA', 'North America'],
        'Peru': ['PE', 'South America'], 'French Polynesia': ['PF', 'Oceania'],
        'Papua New Guinea': ['PG', 'Oceania'], 'Philippines': ['PH', 'Asia'], 'Pakistan': ['PK', 'Asia'],
        'Poland': ['PL', 'Europe'], 'Saint Pierre and Miquelon': ['PM', 'North America'], 'Pitcairn': ['PN', 'Oceania'],
        'Puerto Rico': ['PR', 'North America'], 'Palestine, State of': ['PS', 'Asia'], 'Portugal': ['PT', 'Europe'],
        'Palau': ['PW', 'Oceania'], 'Paraguay': ['PY', 'South America'], 'Qatar': ['QA', 'Asia'],
        'Réunion': ['RE', 'Africa'], 'Romania': ['RO', 'Europe'], 'Serbia': ['RS', 'Europe'],
        'Russian Federation': ['RU', 'Europe'], 'Rwanda': ['RW', 'Africa'], 'Saudi Arabia': ['SA', 'Asia'],
        'Solomon Islands': ['SB', 'Oceania'], 'Seychelles': ['SC', 'Africa'], 'Sudan': ['SD', 'Africa'],
        'Sweden': ['SE', 'Europe'], 'Singapore': ['SG', 'Asia'],
        'Saint Helena, Ascension and Tristan da Cunha': ['SH', 'Africa'], 'Slovenia': ['SI', 'Europe'],
        'Svalbard and Jan Mayen': ['SJ', 'Europe'], 'Slovakia': ['SK', 'Europe'], 'Sierra Leone': ['SL', 'Africa'],
        'San Marino': ['SM', 'Europe'], 'Senegal': ['SN', 'Africa'], 'Somalia': ['SO', 'Africa'],
        'Suriname': ['SR', 'South America'], 'South Sudan': ['SS', 'Africa'],
        'Sao Tome and Principe': ['ST', 'Africa'], 'El Salvador': ['SV', 'North America'],
        'Syrian Arab Republic': ['SY', 'Asia'], 'Eswatini': ['SZ', 'Africa'],
        'Turks and Caicos Islands': ['TC', 'North America'], 'Chad': ['TD', 'Africa'],
        'French Southern Territories': ['TF', 'Antarctica'], 'Togo': ['TG', 'Africa'], 'Thailand': ['TH', 'Asia'],
        'Tajikistan': ['TJ', 'Asia'], 'Tokelau': ['TK', 'Oceania'], 'Timor-Leste': ['TL', 'Asia'],
        'Turkmenistan': ['TM', 'Asia'], 'Tunisia': ['TN', 'Africa'], 'Tonga': ['TO', 'Oceania'],
        'Turkey': ['TR', 'Europe'], 'Trinidad and Tobago': ['TT', 'North America'], 'Tuvalu': ['TV', 'Oceania'],
        'Taiwan': ['TW', 'Asia'], 'Tanzania': ['TZ', 'Africa'], 'Ukraine': ['UA', 'Europe'],
        'Uganda': ['UG', 'Africa'], 'United States Minor Outlying Islands': ['UM', 'Oceania'],
        'United States of America': ['US', 'North America'], 'Uruguay': ['UY', 'South America'],
        'Uzbekistan': ['UZ', 'Asia'], 'Holy See': ['VA', 'Europe'],
        'Saint Vincent and the Grenadines': ['VC', 'North America'],
        'Venezuela (Bolivarian Republic of)': ['VE', 'South America'],
        'Virgin Islands (British)': ['VG', 'North America'], 'Virgin Islands (U.S.)': ['VI', 'North America'],
        'Viet Nam': ['VN', 'Asia'], 'Vanuatu': ['VU', 'Oceania'], 'Wallis and Futuna': ['WF', 'Oceania'],
        'Samoa': ['WS', 'Oceania'], 'Yemen': ['YE', 'Asia'], 'Mayotte': ['YT', 'Africa'],
        'South Africa': ['ZA', 'Africa'], 'Zambia': ['ZM', 'Africa'], 'Zimbabwe': ['ZW', 'Africa']
    }

    ## load Taxon table
    #TaXon_table_xlsx = "/Users/tillmacher/Desktop/Projects/TTT_Projects/Projects/Sicliy_MZB/TaXon_tables/Sicily_eDNA_MZB_taxon_table_renamed_cons_derep_no_match_excluded_blanks_excluded_species.xlsx"
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx).fillna("nan")

    ## get a unique list of taxa (rows without a species-level hit are skipped)
    taxa = set([
        taxon for taxon in TaXon_table_df["Species"].values.tolist()
        if taxon != "nan"
    ])

    ## create a dataframe to store the results
    ## select only countries for the selected continents
    selected_countries_list = [
        country for country, values in country_codes_dict.items()
        if values[1] in continents_to_check
    ]
    occurrence_df = pd.DataFrame(selected_countries_list, columns=["Country"])

    ## calculate runtime
    t_single_request = 0.15  # assumed seconds per API request, used for the estimate only
    n_countries = len(selected_countries_list)
    n_species = len(taxa)
    t_total = round(t_single_request * n_species * n_countries / 60, 1)

    ## ask to continue the script
    answer = sg.PopupOKCancel("This will take roughly " + str(t_total) + " minutes. Continue?", title="Runtime")

    if answer == 'OK':
        ## create a subfolder for better sorting and overview
        dirName = Path(str(path_to_outdirs) + "/" + "Occurrence_analysis" + "/" + TaXon_table_xlsx.stem + "/")
        if not os.path.exists(dirName):
            os.mkdir(dirName)

        ############################################################################
        ## create the progress bar window
        layout = [[sg.Text('Progress bar')],
                  [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
                  [sg.Cancel()]]
        window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
        progress_bar = window_progress_bar['progressbar']
        progress_update = 0
        # NOTE(review): parsed as (1000 / len(taxa)) + 1, so the bar can slightly
        # overshoot the 1000 maximum; confirm whether 1000 / (len(taxa) + 1) was intended.
        progress_increase = 1000 / len(taxa) + 1
        ############################################################################
        ############################################################################
        # poll the window once so a Cancel before the first request is honoured
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += 0
        progress_bar.UpdateBar(progress_update)
        ############################################################################

        ## request gbif for the occurrence data
        for taxon_name in taxa:
            occurrence_list = []
            for country, values in country_codes_dict.items():
                country_code = values[0]
                continent = values[1]
                ## only check selected continents to reduce runtime
                if continent in continents_to_check:
                    ## create an html session
                    # NOTE(review): a fresh session is opened per request; hoisting it
                    # outside the loops would reuse connections — confirm before changing.
                    with requests_html.HTMLSession() as session:
                        ## generate html request name
                        request_name = '%20'.join(taxon_name.split(' '))
                        ## request that name
                        r = session.get("https://api.gbif.org/v1/occurrence/search?scientificName=" + request_name + "&country=" + country_code)
                        ## parse json
                        res = json.loads(r.text)
                        ## get number of occurrences
                        occurrence_list.append(res["count"])
            ## store the results in the dataframe; the order matches selected_countries_list
            ## because both iterate country_codes_dict with the same continent filter
            occurrence_df[taxon_name] = occurrence_list

            ############################################################################
            event, values = window_progress_bar.read(timeout=10)
            if event == 'Cancel' or event is None:
                window_progress_bar.Close()
                raise RuntimeError
            # update bar with loop value +1 so that bar eventually reaches the maximum
            progress_update += progress_increase
            progress_bar.UpdateBar(progress_update)
            ############################################################################

        window_progress_bar.Close()

        ## remove countries that have 0 hits
        occurrence_df_filtered_list = []
        for row in occurrence_df.values.tolist():
            occurrences = set(row[1:])
            if occurrences != {0}:
                occurrence_df_filtered_list.append(row)

        ## create a dataframe with relative values
        occurrence_df_filtered_relative = pd.DataFrame(occurrence_df_filtered_list)
        occurrence_df_filtered_relative.columns = occurrence_df.columns.tolist()
        ## create a dataframe with absolute values
        occurrence_df_filtered_absolute = pd.DataFrame(occurrence_df_filtered_list)
        occurrence_df_filtered_absolute.columns = occurrence_df.columns.tolist()

        ## convert dataframe to relative occurrence abundance
        # NOTE(review): if a taxon has 0 occurrences in every remaining country,
        # the division by df[taxon].sum() yields NaN for that column — verify acceptable.
        for taxon in taxa:
            df = occurrence_df_filtered_relative[["Country", taxon]]
            df_2 = df[taxon] / df[taxon].sum()
            df = df.assign(perc=df_2.values * 100)
            df = df.drop([taxon], axis=1)
            df = df.rename(columns={"perc": taxon})
            occurrence_df_filtered_relative[taxon] = df[taxon]

        ## draw one stacked bar segment per country
        fig = go.Figure()
        for row in occurrence_df_filtered_relative.values.tolist():
            occurrences = row[1:]
            country = row[0]
            fig.add_trace(
                go.Bar(x=list(taxa),
                       y=list(occurrences),
                       text=country,
                       name=country,
                       textposition='auto'))
        fig.update_layout(barmode='stack',
                          width=int(width),
                          height=int(height),
                          template=template,
                          font_size=font_size,
                          title_font_size=font_size)
        fig.update_yaxes(title="GBIF occurrence references (%)")

        ## number of countries with at least one hit per taxon,
        ## rendered as text just above the 100 % stacks (y=105)
        n_occurrences = []
        for taxon in taxa:
            n_occurrences.append(
                len([
                    value for value in
                    occurrence_df_filtered_relative[taxon].values.tolist()
                    if value != 0
                ]))
        fig.add_trace(
            go.Scatter(x=list(taxa),
                       y=[105] * len(taxa),
                       text=n_occurrences,
                       name="countries",
                       mode="text"))

        ## define output files
        output_pdf = Path(str(dirName) + "/" + '_'.join(continents_to_check) + ".pdf")
        output_html = Path(str(dirName) + "/" + '_'.join(continents_to_check) + ".html")
        output_xlsx = Path(str(dirName) + "/" + '_'.join(continents_to_check) + ".xlsx")

        ## write to different sheets, one for absolute data, one for relative
        with pd.ExcelWriter(output_xlsx) as writer:
            occurrence_df_filtered_relative.to_excel(writer, sheet_name='relative', index=False)
            occurrence_df_filtered_absolute.to_excel(writer, sheet_name='absolute', index=False)

        ## write figures
        fig.write_image(str(output_pdf))
        fig.write_html(str(output_html))

        ## ask to show file
        answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
        if answer == "Yes":
            webbrowser.open('file://' + str(output_html))

        ## print closing text
        closing_text = "GBIF occurrence plots and tables are found under: " + str(path_to_outdirs) + "/Occurrence_analysis/"
        sg.Popup(closing_text, title="Finished", keep_on_top=True)

        ## write log file
        from taxontabletools.create_log import ttt_log
        ttt_log("occurrence analysis", "analysis", TaXon_table_xlsx.name, output_pdf.name, "", path_to_outdirs)
def PCoA_analysis(TaXon_table_xlsx, meta_data_to_test, taxonomic_level, width, height, pcoa_s, path_to_outdirs, template, font_size, color_discrete_sequence, pcoa_dissimilarity): import pandas as pd import numpy as np from skbio.diversity import beta_diversity from skbio.stats.ordination import pcoa from skbio.stats.distance import anosim import plotly.graph_objects as go from plotly.subplots import make_subplots import plotly.express as px from pathlib import Path import PySimpleGUI as sg import os, webbrowser from itertools import combinations TaXon_table_xlsx = Path(TaXon_table_xlsx) Meta_data_table_xlsx = Path( str(path_to_outdirs) + "/" + "Meta_data_table" + "/" + TaXon_table_xlsx.stem + "_metadata.xlsx") TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0).fillna("unidentified") TaXon_table_samples = TaXon_table_df.columns.tolist()[10:] Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx, header=0).fillna("nan") Meta_data_table_samples = Meta_data_table_df['Samples'].tolist() metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist() metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test) ## drop samples with metadata called nan (= empty) drop_samples = [ i[0] for i in Meta_data_table_df.values.tolist() if i[metadata_loc] == "nan" ] if drop_samples != []: ## filter the TaXon table TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1) TaXon_table_samples = TaXon_table_df.columns.tolist()[10:] ## also remove empty OTUs row_filter_list = [] for row in TaXon_table_df.values.tolist(): reads = set(row[10:]) if reads != {0}: row_filter_list.append(row) columns = TaXon_table_df.columns.tolist() TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns) Meta_data_table_df = pd.DataFrame( [ i for i in Meta_data_table_df.values.tolist() if i[0] not in drop_samples ], columns=Meta_data_table_df.columns.tolist()) Meta_data_table_samples = Meta_data_table_df['Samples'].tolist() ## create a y axis title text taxon_title = 
taxonomic_level.lower() ## adjust taxonomic level if neccessary if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]: taxon_title = taxonomic_level taxonomic_level = "ID" # check if the meta data differs if len(set(Meta_data_table_df[meta_data_to_test])) == len( Meta_data_table_df['Samples'].tolist()): sg.Popup( "The meta data is unique for all samples. Please adjust the meta data table!", title=("Error")) raise RuntimeError # check if the meta data differs if len(set(Meta_data_table_df[meta_data_to_test])) == 1: sg.Popup( "The meta data is similar for all samples. Please adjust the meta data table!", title=("Error")) raise RuntimeError if sorted(TaXon_table_samples) == sorted(Meta_data_table_samples): samples = Meta_data_table_samples ## extract the relevant data TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples] ## define an aggregation function to combine multiple hit of one taxonimic level aggregation_functions = {} ## define samples functions for sample in samples: ## 'sum' will calculate the sum of p/a data aggregation_functions[sample] = 'sum' ## define taxon level function aggregation_functions[taxonomic_level] = 'first' ## create condensed dataframe TaXon_table_df = TaXon_table_df.groupby( TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions) if 'unidentified' in TaXon_table_df.index: TaXon_table_df = TaXon_table_df.drop('unidentified') data = TaXon_table_df[samples].transpose().values.tolist() jc_dm = beta_diversity(pcoa_dissimilarity, data, samples) ordination_result = pcoa(jc_dm) metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist() anosim_results = anosim(jc_dm, metadata_list, permutations=999) anosim_r = round(anosim_results['test statistic'], 5) anosim_p = anosim_results['p-value'] textbox = meta_data_to_test + ", " + taxon_title + "<br>Anosim " + "R = " + str( anosim_r) + " " + "p = " + str(anosim_p) ####################################################################################### # create window to ask 
for PCoA axis to test def slices(list, slice): for i in range(0, len(list), slice): yield list[i:i + slice] # collect the PCoA proportion explained values proportion_explained_list = [] for i, pcoa_axis in enumerate(ordination_result.proportion_explained): if round(pcoa_axis * 100, 2) >= 1: proportion_explained_list.append("PC" + str(i + 1) + " (" + str(round(pcoa_axis * 100, 2)) + " %)") pcoa_axis_checkboxes = list( slices([ sg.Checkbox(name, key=name, size=(15, 1)) for name in proportion_explained_list ], 10)) pcoa_window_layout = [ [sg.Text('Check up to four axes to be displayed')], [sg.Frame(layout=pcoa_axis_checkboxes, title='')], [sg.Text('Only axes >= 1 % explained variance are shown')], [sg.CB("Connect categories", default=True, key="draw_mesh")], [sg.Text('')], [sg.Button('Plot', key='Plot')], [sg.Button('Back')], ] pcoa_window = sg.Window('PCoA axis', pcoa_window_layout, keep_on_top=True) while True: event, values = pcoa_window.read() draw_mesh = values["draw_mesh"] if event is None or event == 'Back': break if event == 'Plot': ## create a subfolder for better sorting and overview dirName = Path( str(path_to_outdirs) + "/" + "PCoA_plots" + "/" + TaXon_table_xlsx.stem + "/") if not os.path.exists(dirName): os.mkdir(dirName) # collect the pcoa axis values axis_to_plot = [ key for key, value in values.items() if value == True and "PC" in key ] # pass on only if two pcoa axes were checked if len(axis_to_plot) == 2: cat1 = axis_to_plot[1].split()[0] cat2 = axis_to_plot[0].split()[0] df_pcoa = ordination_result.samples[[cat1, cat2]] df_pcoa.insert( 2, "Metadata", Meta_data_table_df[meta_data_to_test].values.tolist(), True) df_pcoa.insert( 3, "Samples", Meta_data_table_df["Samples"].values.tolist(), True) if draw_mesh == True: combinations_list = [] for metadata in df_pcoa["Metadata"]: ## collect all entries for the respective metadata arr = df_pcoa.loc[df_pcoa['Metadata'] == metadata][ [cat1, cat2, "Metadata", "Samples"]].to_numpy() ## create a df for all 
possible combinations using itertools combinations for entry in list(combinations(arr, 2)): combinations_list.append(list(entry[0])) combinations_list.append(list(entry[1])) ## create a dataframe to draw the plot from df = pd.DataFrame(combinations_list) df.columns = [cat1, cat2, "Metadata", "Samples"] fig = px.scatter( df, x=cat1, y=cat2, color="Metadata", text="Samples", title=textbox, color_discrete_sequence=color_discrete_sequence) fig.update_traces(marker_size=int(pcoa_s), mode="markers+lines") fig.update_layout(height=int(height), width=int(width), template=template, showlegend=True, font_size=font_size, title_font_size=font_size) fig.update_xaxes(title=axis_to_plot[1]) fig.update_yaxes(title=axis_to_plot[0]) else: fig = px.scatter( df_pcoa, x=cat1, y=cat2, color="Metadata", text="Samples", title=textbox, color_discrete_sequence=color_discrete_sequence) fig.update_traces(marker_size=int(pcoa_s), mode="markers") fig.update_layout(height=int(height), width=int(width), template=template, showlegend=True, font_size=font_size, title_font_size=font_size) fig.update_xaxes(title=axis_to_plot[1]) fig.update_yaxes(title=axis_to_plot[0]) ## define output files output_pdf = Path( str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + ".pdf") output_html = Path( str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + ".html") output_xlsx = Path( str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + ".xlsx") ## write files fig.write_image(str(output_pdf)) fig.write_html(str(output_html)) ordination_result.samples[[cat1, cat2]].to_excel(output_xlsx) ## ask to show file answer = sg.PopupYesNo('Show plot?', keep_on_top=True) if answer == "Yes": webbrowser.open('file://' + str(output_html)) ## print closing text closing_text = "\n" + "PCoA plots are found in: " + str( path_to_outdirs) + "/PCoA_plots/" sg.Popup(closing_text, title="Finished", keep_on_top=True) ## write to log from taxontabletools.create_log import ttt_log ttt_log("pcoa analysis", "analysis", 
TaXon_table_xlsx.name, output_pdf.name, meta_data_to_test, path_to_outdirs) break elif len(axis_to_plot) == 3: cat1 = axis_to_plot[0].split()[0] cat2 = axis_to_plot[1].split()[0] cat3 = axis_to_plot[2].split()[0] df_pcoa = ordination_result.samples[[cat1, cat2, cat3]] df_pcoa.insert( 3, "Metadata", Meta_data_table_df[meta_data_to_test].values.tolist(), True) df_pcoa.insert( 4, "Samples", Meta_data_table_df["Samples"].values.tolist(), True) ## check if lines are to be drawn between the dots if draw_mesh == True: combinations_list = [] for metadata in df_pcoa["Metadata"]: ## collect all entries for the respective metadata arr = df_pcoa.loc[df_pcoa['Metadata'] == metadata][ [cat1, cat2, cat3, "Metadata", "Samples"]].to_numpy() ## create a df for all possible combinations using itertools combinations for entry in list(combinations(arr, 2)): combinations_list.append(list(entry[0])) combinations_list.append(list(entry[1])) ## create a dataframe to draw the plot from df = pd.DataFrame(combinations_list) df.columns = [cat1, cat2, cat3, "Metadata", "Samples"] ## draw the plot fig = px.scatter_3d( df, x=cat1, y=cat2, z=cat3, color="Metadata", text="Samples", title=textbox, color_discrete_sequence=color_discrete_sequence) fig.update_traces(marker_size=int(pcoa_s), mode="markers+lines", line=dict(width=0.5)) fig.update_layout(height=int(height), width=int(width), template=template, title=textbox, showlegend=True, font_size=font_size, title_font_size=font_size) fig.update_layout( scene=dict(xaxis_title=axis_to_plot[0], yaxis_title=axis_to_plot[1], zaxis_title=axis_to_plot[2])) else: fig = px.scatter_3d( df_pcoa, x=cat1, y=cat2, z=cat3, color="Metadata", text="Samples", color_discrete_sequence=color_discrete_sequence) fig.update_traces(marker_size=int(pcoa_s), mode="markers") fig.update_layout(height=int(height), width=int(width), template=template, showlegend=True, title=textbox, font_size=font_size, title_font_size=font_size) fig.update_layout( 
scene=dict(xaxis_title=axis_to_plot[0], yaxis_title=axis_to_plot[1], zaxis_title=axis_to_plot[2])) ## define output files output_pdf = Path( str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + "_3d.pdf") output_html = Path( str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + "_3d.html") output_xlsx = Path( str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + "_3d.xlsx") ## write output files fig.write_image(str(output_pdf)) fig.write_html(str(output_html)) ordination_result.samples[[cat1, cat2]].to_excel(output_xlsx) ## ask to show file answer = sg.PopupYesNo('Show plot?', keep_on_top=True) if answer == "Yes": webbrowser.open('file://' + str(output_html)) ## print closing text closing_text = "PCoA plots are found in: " + str( path_to_outdirs) + "/PCoA_plots/" sg.Popup(closing_text, title="Finished", keep_on_top=True) ## write log file from taxontabletools.create_log import ttt_log ttt_log("pcoa analysis", "analysis", TaXon_table_xlsx.name, output_pdf.name, meta_data_to_test, path_to_outdirs) break else: sg.Popup("Please choose not more than 3 PCoA axes", title="Error", keep_on_top=True) if event == 'Plot matrix': if len(proportion_explained_list) >= 4: ## create a subfolder for better sorting and overview dirName = Path( str(path_to_outdirs) + "/" + "PCoA_plots" + "/" + TaXon_table_xlsx.stem + "/") if not os.path.exists(dirName): os.mkdir(dirName) df_pcoa = ordination_result.samples[[ "PC1", "PC2", "PC3", "PC4" ]] df_pcoa.insert( 4, "Metadata", Meta_data_table_df[meta_data_to_test].values.tolist(), True) df_pcoa.insert( 5, "Sample", Meta_data_table_df["Samples"].values.tolist(), True) fig = make_subplots(rows=4, cols=4) ########### 1 ########### fig.add_trace(go.Scatter(), row=1, col=1) fig.update_layout(template=template, font_size=font_size, title_font_size=font_size) text = "PC1 (" + str( round( ordination_result.proportion_explained["PC1"] * 100, 2)) + " %)" fig.add_annotation(text=text, showarrow=False) 
fig.update_xaxes(showticklabels=False, showgrid=False) fig.update_yaxes(showticklabels=False, showgrid=False) ########### 2 ########### df = df_pcoa[["PC1", "PC2", "Metadata", "Sample"]] for metadata in set(metadata_list): df_metadata = df[df['Metadata'] == metadata] #fig = px.scatter(df_pcoa, x="PC1", y="PC2", , ) fig.add_trace(go.Scatter( x=df_metadata["PC1"].values.tolist(), y=df_metadata["PC2"].values.tolist(), mode='markers', name=metadata, text=df_metadata["Sample"].values.tolist()), row=1, col=2) ########### 3 ########### df = df_pcoa[["PC1", "PC3", "Metadata", "Sample"]] for metadata in set(metadata_list): df_metadata = df[df['Metadata'] == metadata] #fig = px.scatter(df_pcoa, x="PC1", y="PC2", , ) fig.add_trace(go.Scatter( x=df_metadata["PC1"].values.tolist(), y=df_metadata["PC3"].values.tolist(), mode='markers', name=metadata, showlegend=False, text=df_metadata["Sample"].values.tolist()), row=1, col=3) ########### 4 ########### df = df_pcoa[["PC1", "PC4", "Metadata", "Sample"]] for metadata in set(metadata_list): df_metadata = df[df['Metadata'] == metadata] fig.add_trace(go.Scatter( x=df_metadata["PC1"].values.tolist(), y=df_metadata["PC4"].values.tolist(), mode='markers', name=metadata, showlegend=False, text=df_metadata["Sample"].values.tolist()), row=1, col=4) fig.update_traces(marker_size=int(pcoa_s), mode="markers") fig.update_xaxes(showgrid=False, row=1, col=4) fig.update_yaxes(showgrid=False, row=1, col=4) ########### 5 ########### fig.add_trace(go.Scatter(), row=2, col=2) fig.update_layout(template=template, font_size=font_size, title_font_size=font_size) text = "PC2 (" + str( round( ordination_result.proportion_explained["PC2"] * 100, 2)) + " %)" fig.add_annotation(text=text, showarrow=False, row=2, col=2) ########### 6 ########### df = df_pcoa[["PC2", "PC3", "Metadata", "Sample"]] for metadata in set(metadata_list): df_metadata = df[df['Metadata'] == metadata] #fig = px.scatter(df_pcoa, x="PC1", y="PC2", , ) fig.add_trace(go.Scatter( 
x=df_metadata["PC2"].values.tolist(), y=df_metadata["PC3"].values.tolist(), mode='markers', name=metadata, showlegend=False, text=df_metadata["Sample"].values.tolist()), row=2, col=3) ########### 7 ########### df = df_pcoa[["PC2", "PC4", "Metadata", "Sample"]] for metadata in set(metadata_list): df_metadata = df[df['Metadata'] == metadata] fig.add_trace(go.Scatter( x=df_metadata["PC2"].values.tolist(), y=df_metadata["PC4"].values.tolist(), mode='markers', name=metadata, showlegend=False, text=df_metadata["Sample"].values.tolist()), row=2, col=4) ########### 8 ########### fig.add_trace(go.Scatter(), row=3, col=3) fig.update_layout(template=template, font_size=font_size, title_font_size=font_size) text = "PC3 (" + str( round( ordination_result.proportion_explained["PC3"] * 100, 2)) + " %)" fig.add_annotation(text=text, showarrow=False, row=3, col=3) ########### 9 ########### df = df_pcoa[["PC3", "PC4", "Metadata", "Sample"]] for metadata in set(metadata_list): df_metadata = df[df['Metadata'] == metadata] #fig = px.scatter(df_pcoa, x="PC1", y="PC2", , ) fig.add_trace(go.Scatter( x=df_metadata["PC3"].values.tolist(), y=df_metadata["PC4"].values.tolist(), mode='markers', name=metadata, showlegend=False, text=df_metadata["Sample"].values.tolist()), row=3, col=4) ########### 5 ########### fig.add_trace(go.Scatter(), row=4, col=4) fig.update_layout(template=template, font_size=font_size, title_font_size=font_size) text = "PC4 (" + str( round( ordination_result.proportion_explained["PC4"] * 100, 2)) + " %)" fig.add_annotation(text=text, showarrow=False, row=4, col=4) ###################### fig.update_xaxes(showline=True, mirror=True, linewidth=1, linecolor='black') fig.update_yaxes(showline=True, mirror=True, linewidth=1, linecolor='black') fig.update_traces(marker_size=int(pcoa_s), mode="markers") # finish plot matrix fig.update_layout(height=1000, width=1000, title_text=textbox) ## define output files output_pdf = Path( str(dirName) + "/" + meta_data_to_test + "_" + 
taxon_title + "_matrix.pdf") output_html = Path( str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + "_matrix.html") ## write output files fig.write_image(str(output_pdf)) fig.write_html(str(output_html)) ## ask to show file answer = sg.PopupYesNo('Show plot?', keep_on_top=True) if answer == "Yes": webbrowser.open('file://' + str(output_html)) ## print closing text closing_text = "\n" + "PCoA plots are found in: " + str( path_to_outdirs) + "/PCoA_plots/" sg.Popup(closing_text, title="Finished", keep_on_top=True) ## write to log file from taxontabletools.create_log import ttt_log ttt_log("pcoa analysis", "analysis", TaXon_table_xlsx.name, output_pdf.name, meta_data_to_test, path_to_outdirs) break else: sg.Popup( "There must be at least 4 PCoA axis available to plot the matrix!" ) pcoa_window.close() else: sg.PopupError( "The sample of both the TaXon table and the metadata table have to match!" )
def create_taxon_table_per_sample(TaXon_table_xlsx, path_to_outdirs):
    """Split a TaXon table into one Excel file per sample.

    For every sample column (columns 11+) of the input TaXon table, write a
    workbook ``<sample>.xlsx`` into ``<path_to_outdirs>/TaXon_tables_per_sample/``
    containing the first ten (taxonomy/meta) columns plus that sample's read
    numbers, keeping only OTU rows with reads > 0. A PySimpleGUI progress-bar
    window is shown while writing; finishes with a popup and a ttt_log entry.

    Parameters:
        TaXon_table_xlsx: path (str or Path) to the TaXon table workbook,
            which must contain a 'TaXon table' sheet.
        path_to_outdirs: Path to the project output directory; assumed to be a
            pathlib.Path (it is joined with the ``/`` operator).

    Raises:
        RuntimeError: if the user cancels or closes the progress-bar window.
    """
    import PySimpleGUI as sg
    import pandas as pd
    from pathlib import Path

    TaXon_table_file = Path(TaXon_table_xlsx)
    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_file)
    TaXon_datasheet = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)

    # Columns 0-9 hold taxonomy/meta information; samples start at column 10.
    samples_to_process = TaXon_datasheet.columns[10:]
    first_ten_columns_header = TaXon_datasheet.columns[:10].values.tolist()
    first_ten_columns = TaXon_datasheet.iloc[:, :10].values.tolist()

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    # +1 overshoot so the bar visibly reaches the maximum on the last sample
    progress_increase = 1000 / len(samples_to_process) + 1
    ############################################################################

    for sample in samples_to_process:
        Output_name = Path(sample + ".xlsx")
        Output_file = path_to_outdirs / "TaXon_tables_per_sample" / Output_name
        read_numbers = TaXon_datasheet[sample].values.tolist()

        # Keep only OTUs that were actually detected in this sample.
        sample_rows_list = [first_ten_columns[i] + [read_number]
                            for i, read_number in enumerate(read_numbers)
                            if read_number > 0]

        # Build header row + data rows; DataFrame.append was removed in
        # pandas 2.0, so concatenate explicitly instead.
        headers_df = pd.DataFrame([first_ten_columns_header + [sample]])
        sample_df = pd.DataFrame(sample_rows_list)
        sample_df = pd.concat([headers_df, sample_df], ignore_index=True)
        sample_df.to_excel(Output_file, engine='xlsxwriter',
                           sheet_name='TaXon table', index=False, header=False)

        ############################################################################
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError("Export cancelled by user")
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    closing_text = "\n" + "Taxon tables are found in: " + str(path_to_outdirs) + "/TaXon_tables_per_sample/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    from taxontabletools.create_log import ttt_log
    placeholder = TaXon_table_file.name + " (multiple files)"
    ttt_log("taXon table per sample", "analysis", TaXon_table_file.name, placeholder, "nan", path_to_outdirs)