def output_barplot(df, path, settings, title=None):
    """Create barplots based on number of reads and total sum of nucleotides sequenced.

    Two bar charts are produced, each with one bar per value of the
    ``dataset`` column of ``df``:

    * the number of rows per dataset, ordered by dataset label;
    * the summed ``aligned_lengths`` (when that column exists, otherwise
      ``lengths``) per dataset, in order of first appearance in ``df``.

    Args:
        df: DataFrame with a ``dataset`` column and a ``lengths`` and/or
            ``aligned_lengths`` column.
        path: Prefix prepended to the output file names.
        settings: Passed unchanged to ``Plot.save``.
        title: Optional title overriding the default title of both plots.

    Returns:
        Tuple ``(read_count, throughput_bases)`` of ``Plot`` objects.
    """
    logging.info(
        "NanoComp: Creating barplots for number of reads and total throughput.")

    read_count = Plot(path=path + "NanoComp_number_of_reads.html",
                      title="Comparing number of reads")
    read_count.fig = go.Figure()
    counts = df['dataset'].value_counts(sort=False).sort_index()
    for dataset, count in counts.items():
        read_count.fig.add_trace(go.Bar(x=[dataset], y=[count], name=dataset))
    read_count.fig.update_layout(
        title_text=title or read_count.title,
        title_x=0.5,
        yaxis_title="Number of reads",
    )
    read_count.html = read_count.fig.to_html(full_html=False,
                                             include_plotlyjs='cdn')
    read_count.save(settings)

    throughput_bases = Plot(path=path + "NanoComp_total_throughput.html",
                            title="Comparing throughput in bases")
    if "aligned_lengths" in df:
        throughput = df.groupby('dataset')['aligned_lengths'].sum()
        ylabel = 'Total bases aligned'
    else:
        throughput = df.groupby('dataset')['lengths'].sum()
        ylabel = 'Total bases sequenced'

    throughput_bases.fig = go.Figure()
    # groupby() orders its result by dataset label while unique() keeps the
    # order of appearance, so each total is looked up by label rather than
    # paired by position (which could attach a total to the wrong dataset).
    for dataset in df["dataset"].unique():
        throughput_bases.fig.add_trace(
            go.Bar(x=[dataset], y=[throughput.loc[dataset]], name=dataset))
    throughput_bases.fig.update_layout(
        title=title or throughput_bases.title,
        title_x=0.5,
        yaxis_title=ylabel,
    )
    throughput_bases.html = throughput_bases.fig.to_html(
        full_html=False, include_plotlyjs='cdn')
    throughput_bases.save(settings)
    return read_count, throughput_bases
def overlay_histogram_phred(df, path, settings, palette=None):
    """Create an overlay histogram of Phred-scaled reference identity.

    The ``percentIdentity`` column is converted with
    ``-10 * log10(1 - percentIdentity / 100)`` and stored in a new
    ``phredIdentity`` column; ``df`` is modified in place.

    Reads with a perfect alignment, and thus a percentIdentity of 100, get
    a Phred score of infinity. Those values are set to 60, a very high
    Phred score, so they can still be binned.

    Args:
        df: DataFrame with a ``percentIdentity`` column.
        path: Prefix prepended to the output file name.
        settings: Passed unchanged to ``Plot.save``.
        palette: Colors handed to ``plot_overlay_histogram``; defaults to
            the plotly default colors repeated five times.

    Returns:
        The ``Plot`` object holding the histogram.
    """
    # log10(0) for perfect alignments is expected and handled just below,
    # so the divide-by-zero warning is only noise here.
    with np.errstate(divide="ignore"):
        df["phredIdentity"] = -10 * np.log10(1 - (df["percentIdentity"] / 100))
    # A single .loc assignment instead of chained indexing: the chained form
    # may write to a temporary copy and leave the infinities in place.
    df.loc[np.isinf(df["phredIdentity"]), "phredIdentity"] = 60

    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    hist_phred = Plot(path=path + "NanoComp_OverlayHistogram_PhredScore.html",
                      title="Histogram of Phred scores")
    hist_phred.html, hist_phred.fig = plot_overlay_histogram(
        df, palette, "phredIdentity", hist_phred.title, bins=20, density=True)
    hist_phred.save(settings)
    return hist_phred
def yield_by_minimal_length_plot(array, name, path, settings, title=None, color="#4CB391"):
    """Plot the cumulative yield obtained when keeping reads of at least a given length.

    The lengths are sorted in descending order and cumulatively summed (in
    gigabases), so each point shows the total yield of all reads at least
    as long as its x value. At most 10000 randomly chosen points are drawn.

    Args:
        array: Read lengths (anything ``np.sort`` accepts).
        name: Unused by this function.
        path: Prefix prepended to the output file name.
        settings: Passed unchanged to ``Plot.save``.
        title: Optional title overriding the default plot title.
        color: Marker color.

    Returns:
        The ``Plot`` object holding the scatter plot.
    """
    df = pd.DataFrame(data={"lengths": np.sort(array)[::-1]})
    df["cumyield_gb"] = df["lengths"].cumsum() / 10**9
    # Sample from the frame's own index: `df` has a fresh 0..n-1 index, so
    # labels taken from `array.index` would not match (and yield NaN points)
    # whenever `array` is a filtered or re-indexed Series.
    idx = np.random.choice(df.index, min(10000, len(df)), replace=False)
    sampled = df.loc[idx]

    yield_by_length = Plot(path=path + "Yield_By_Length.html",
                           title="Yield by length")
    fig = px.scatter(df, x=sampled["lengths"], y=sampled["cumyield_gb"])
    fig.update_traces(marker=dict(color=color))
    fig.update_layout(xaxis_title='Read length',
                      yaxis_title='Cumulative yield for minimal length [Gb]',
                      title=title or yield_by_length.title,
                      title_x=0.5)
    yield_by_length.fig = fig
    yield_by_length.html = yield_by_length.fig.to_html(full_html=False,
                                                       include_plotlyjs='cdn')
    yield_by_length.save(settings)
    return yield_by_length
def spatial_heatmap(array, path, colormap, figformat, title=None):
    """Taking channel information and creating post run channel activity plots.

    Counts how often each value occurs in ``array`` and writes that count
    into the cell(s) of ``layout.template`` where ``layout.structure``
    equals the value. The filled template is drawn as a heatmap.

    Args:
        array: Channel identifiers, one entry per read.
        path: Output path without extension; ".html" is appended.
        colormap: Plotly colorscale for the heatmap.
        figformat: Passed unchanged to ``Plot.save``.
        title: Optional title overriding the default plot title.

    Returns:
        A one-element list with the ``Plot`` object.
    """
    logging.info(
        "Nanoplotter: Creating heatmap of reads per channel using {} reads.".
        format(array.size))
    activity_map = Plot(path=path + ".html",
                        title="Number of reads generated per channel")
    layout = make_layout(maxval=np.amax(array))
    # Series.value_counts() replaces the deprecated top-level pd.value_counts().
    reads_per_channel = pd.Series(array).value_counts()
    for channel, n_reads in reads_per_channel.items():
        layout.template[np.where(layout.structure == channel)] = n_reads
    data = pd.DataFrame(layout.template,
                        index=layout.yticks,
                        columns=layout.xticks)

    fig = go.Figure(
        data=go.Heatmap(z=data.values.tolist(), colorscale=colormap))
    fig.update_layout(xaxis_title='Channel',
                      yaxis_title='Number of reads',
                      title=title or activity_map.title,
                      title_x=0.5)
    activity_map.fig = fig
    activity_map.html = activity_map.fig.to_html(full_html=False,
                                                 include_plotlyjs='cdn')
    activity_map.save(figformat)
    return [activity_map]
def quality_over_time(dfs, path, settings, title=None, color="#4CB391"):
    """Draw a violin plot of basecall quality for each time bin.

    Args:
        dfs: DataFrame providing the ``quals`` (y) and ``timebin`` (x) columns.
        path: Prefix prepended to the output file name.
        settings: Passed unchanged to ``Plot.save``.
        title: Optional title overriding the default plot title.
        color: Fill color of the violins.

    Returns:
        The ``Plot`` object holding the violin plot.
    """
    time_qual = Plot(path=path + "TimeQualityViolinPlot.html",
                     title="Violin plot of quality over time")

    violins = go.Violin(
        x=dfs["timebin"],
        y=dfs["quals"],
        points=False,
        spanmode="hard",
        line_color='black',
        line_width=1.5,
        fillcolor=color,
        opacity=0.8,
    )
    figure = go.Figure()
    figure.add_trace(violins)
    figure.update_layout(
        title=title or time_qual.title,
        title_x=0.5,
        xaxis_title='Interval (hours)',
        yaxis_title='Basecall quality',
    )
    figure.update_xaxes(tickangle=45)

    time_qual.fig = figure
    time_qual.html = figure.to_html(full_html=False, include_plotlyjs='cdn')
    time_qual.save(settings)
    return time_qual
def sequencing_speed_over_time(dfs, path, title, settings, color="#4CB391"):
    """Draw a violin plot of sequencing speed for each time bin.

    Speed is ``lengths / duration``; rows whose ``duration`` equals zero are
    left out so the division is defined.

    Args:
        dfs: DataFrame with ``duration``, ``lengths`` and ``timebin`` columns.
        path: Prefix prepended to the output file name.
        title: Title for the plot; a falsy value selects the default title.
        settings: Passed unchanged to ``Plot.save``.
        color: Fill color of the violins.

    Returns:
        The ``Plot`` object holding the violin plot.
    """
    time_duration = Plot(path=path + "TimeSequencingSpeed_ViolinPlot.html",
                         title="Violin plot of sequencing speed over time")

    has_duration = dfs['duration'] != 0
    timebins = dfs.loc[has_duration, "timebin"]
    speed = dfs.loc[has_duration, "lengths"] / dfs.loc[has_duration, "duration"]

    figure = go.Figure()
    figure.add_trace(
        go.Violin(
            x=timebins,
            y=speed,
            points=False,
            spanmode="hard",
            line_color='black',
            line_width=1.5,
            fillcolor=color,
            opacity=0.8,
        ))
    figure.update_layout(
        title=title or time_duration.title,
        title_x=0.5,
        xaxis_title='Interval (hours)',
        yaxis_title='Sequencing speed (nucleotides/second)',
    )
    figure.update_xaxes(tickangle=45)

    time_duration.fig = figure
    time_duration.html = figure.to_html(full_html=False, include_plotlyjs='cdn')
    time_duration.save(settings)
    return time_duration
def compare_cumulative_yields(df, path, palette=None, title=None):
    """Plot the cumulative yield over time of every dataset in one figure.

    For each value of the ``dataset`` column the read lengths are
    cumulatively summed in ``start_time`` order, reduced to the maximum per
    10-minute bin and expressed in gigabases. Each line is annotated at the
    right edge with its final yield rounded to whole gigabases.

    Args:
        df: DataFrame with ``dataset``, ``lengths`` and ``start_time``
            columns. ``start_time`` is assumed to be timedelta-like, since
            ``total_seconds()`` is called on the resulting index.
        path: Prefix prepended to the output file name.
        palette: One color per dataset; defaults to the plotly default
            colors repeated five times.
        title: Optional title overriding the default plot title.

    Returns:
        A one-element list with the ``Plot`` object.
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info(
        "NanoComp: Creating cumulative yield plots using {} reads.".format(
            len(dfs)))
    cum_yield_gb = Plot(path=path + "NanoComp_CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")
    data = []
    annotations = []
    for sample, color in zip(df["dataset"].unique(), palette):
        # '10min' is the non-deprecated spelling of the '10T' offset alias.
        cumsum = dfs.loc[dfs["dataset"] == sample,
                         "lengths"].cumsum().resample('10min').max() / 1e9
        # Explicit positional access: an integer key on a non-integer index
        # is deprecated and will be treated as a label.
        final_yield = cumsum.iloc[-1]
        data.append(
            go.Scatter(x=cumsum.index.total_seconds() / 3600,
                       y=cumsum,
                       opacity=0.75,
                       name=sample,
                       marker=dict(color=color)))
        annotations.append(
            dict(xref='paper',
                 x=0.99,
                 y=final_yield,
                 xanchor='left',
                 yanchor='middle',
                 text='{}Gb'.format(round(final_yield)),
                 showarrow=False))

    # Build the figure once and render the HTML div from it, instead of
    # spelling out the same data/layout twice.
    cum_yield_gb.fig = go.Figure({
        "data": data,
        "layout": go.Layout(barmode='overlay',
                            title=title or cum_yield_gb.title,
                            xaxis=dict(title="Time (hours)"),
                            yaxis=dict(title="Yield (gigabase)"),
                            annotations=annotations)
    })
    cum_yield_gb.html = plotly.offline.plot(cum_yield_gb.fig,
                                            output_type="div",
                                            show_link=False)
    cum_yield_gb.save()
    return [cum_yield_gb]
def plot_over_time(dfs, path, title, figformat, color="#4CB391"):
    """Plot the number of reads, and active pores if available, per 10 minutes.

    Args:
        dfs: DataFrame with a ``lengths`` column and optionally a
            ``channelIDs`` column. Its index is resampled in 10-minute bins
            and is assumed to be timedelta-like, since ``total_seconds()``
            is called on the resampled index.
        path: Prefix prepended to the output file names.
        title: Title for the plots; a falsy value selects each plot's
            default title.
        figformat: Passed unchanged to ``Plot.save``.
        color: Marker color.

    Returns:
        A list with the reads-over-time ``Plot`` and, when ``channelIDs`` is
        present, the active-pores-over-time ``Plot``.
    """
    num_reads = Plot(path=path + "NumberOfReads_Over_Time.html",
                     title="Number of reads over time")
    # '10min' is the non-deprecated spelling of the '10T' offset alias.
    s = dfs.loc[:, "lengths"].resample('10min').count()
    fig = px.scatter(data_frame=None, x=s.index.total_seconds() / 3600, y=s)
    fig.update_traces(marker=dict(color=color))
    fig.update_layout(xaxis_title='Run time (hours)',
                      yaxis_title='Number of reads per 10 minutes',
                      title=title or num_reads.title,
                      title_x=0.5)
    num_reads.fig = fig
    num_reads.html = num_reads.fig.to_html(full_html=False,
                                           include_plotlyjs='cdn')
    num_reads.save(figformat)
    plots = [num_reads]

    if "channelIDs" in dfs:
        pores_over_time = Plot(path=path + "ActivePores_Over_Time.html",
                               title="Number of active pores over time")
        # Number of distinct channels seen in each 10-minute bin.
        s = dfs.loc[:, "channelIDs"].resample('10min').nunique()
        fig = px.scatter(data_frame=None,
                         x=s.index.total_seconds() / 3600,
                         y=s)
        fig.update_traces(marker=dict(color=color))
        fig.update_layout(xaxis_title='Run time (hours)',
                          yaxis_title='Active pores per 10 minutes',
                          title=title or pores_over_time.title,
                          title_x=0.5)
        pores_over_time.fig = fig
        pores_over_time.html = pores_over_time.fig.to_html(
            full_html=False, include_plotlyjs='cdn')
        pores_over_time.save(figformat)
        plots.append(pores_over_time)
    return plots
def dynamic_histogram(array, name, path, title=None, color="#4CB391"):
    """
    Build a plotly histogram of (at most 10000 sampled values of) array.

    The Plot receives the html code and figure returned by
    plotly_histogram and is then saved.
    """
    file_name = "Dynamic_Histogram_{}.html".format(name.replace(' ', '_'))
    plot_title = title or "Dynamic histogram of {}".format(name)
    dynhist = Plot(path=path + file_name, title=plot_title)

    sample_size = min(len(array), 10000)
    subsample = array.sample(sample_size)
    dynhist.html, dynhist.fig = plotly_histogram(array=subsample,
                                                 color=color,
                                                 title=dynhist.title)
    dynhist.save()
    return dynhist
def cumulative_yield(dfs, path, title, color, figformat):
    """Plot the cumulative yield over time in gigabases and in number of reads.

    Args:
        dfs: DataFrame with a ``lengths`` column. Its index is resampled in
            10-minute bins and is assumed to be timedelta-like, since
            ``total_seconds()`` is called on the resampled index.
        path: Prefix prepended to the output file names.
        title: Title for the plots; a falsy value selects each plot's
            default title.
        color: Marker color.
        figformat: Passed unchanged to ``Plot.save``.

    Returns:
        List ``[cum_yield_gb, cum_yield_reads]`` of ``Plot`` objects.
    """
    cum_yield_gb = Plot(path=path + "CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")
    # '10min' is the non-deprecated spelling of the '10T' offset alias.
    s = dfs.loc[:, "lengths"].cumsum().resample('10min').max() / 1e9
    fig = px.scatter(x=s.index.total_seconds() / 3600, y=s)
    fig.update_traces(marker=dict(color=color))
    fig.update_layout(xaxis_title='Run time (hours)',
                      yaxis_title='Cumulative yield in gigabase',
                      title=title or cum_yield_gb.title,
                      title_x=0.5)
    cum_yield_gb.fig = fig
    cum_yield_gb.html = cum_yield_gb.fig.to_html(full_html=False,
                                                 include_plotlyjs='cdn')
    cum_yield_gb.save(figformat)

    cum_yield_reads = Plot(path=path + "CumulativeYieldPlot_NumberOfReads.html",
                           title="Cumulative yield")
    s = dfs.loc[:, "lengths"].resample('10min').count().cumsum()
    fig = px.scatter(x=s.index.total_seconds() / 3600, y=s)
    fig.update_traces(marker=dict(color=color))
    # Fall back to this plot's own title; the original reused the title of
    # the gigabase plot here.
    fig.update_layout(xaxis_title='Run time (hours)',
                      yaxis_title='Cumulative yield in number of reads',
                      title=title or cum_yield_reads.title,
                      title_x=0.5)
    cum_yield_reads.fig = fig
    cum_yield_reads.html = cum_yield_reads.fig.to_html(full_html=False,
                                                       include_plotlyjs='cdn')
    cum_yield_reads.save(figformat)
    return [cum_yield_gb, cum_yield_reads]
def length_over_time(dfs, path, title, settings, log_length=False, color="#4CB391"):
    """Draw a violin plot of (log) read lengths for each time bin.

    Args:
        dfs: DataFrame with ``timebin`` and ``lengths`` columns, a
            ``log_lengths`` column when ``log_length`` is set, and
            optionally a boolean ``length_filter`` column selecting the
            rows to plot.
        path: Prefix prepended to the output file name.
        title: Title for the plot; a falsy value selects the default title.
        settings: Passed unchanged to ``Plot.save``.
        log_length: Plot ``log_lengths`` and label the y axis with powers of
            ten instead of plotting ``lengths``.
        color: Fill color of the violins.

    Returns:
        The ``Plot`` object holding the violin plot.
    """
    if log_length:
        file_name = "TimeLogLengthViolinPlot.html"
        default_title = "Violin plot of log read lengths over time"
        length_column = "log_lengths"
    else:
        file_name = "TimeLengthViolinPlot.html"
        default_title = "Violin plot of read lengths over time"
        length_column = "lengths"
    time_length = Plot(path=path + file_name, title=default_title)

    # The length_filter column is produced by NanoPlot filtering of too long reads
    shown = dfs[dfs["length_filter"]] if "length_filter" in dfs else dfs

    violins = go.Violin(
        x=shown["timebin"],
        y=shown[length_column],
        points=False,
        spanmode="hard",
        line_color='black',
        line_width=1.5,
        fillcolor=color,
        opacity=0.8,
    )
    figure = go.Figure()
    figure.add_trace(violins)
    figure.update_layout(
        title=title or time_length.title,
        title_x=0.5,
        xaxis_title='Interval (hours)',
        yaxis_title='Read length',
    )

    if log_length:
        # Powers of ten up to ten times the longest read in the unfiltered data.
        upper = 10 * np.amax(dfs["lengths"])
        ticks = [10**i for i in range(10) if 10**i <= upper]
        figure.update_layout(
            yaxis=dict(tickmode='array',
                       tickvals=np.log10(ticks),
                       ticktext=ticks))
    figure.update_yaxes(tickangle=45)

    time_length.fig = figure
    time_length.html = figure.to_html(full_html=False, include_plotlyjs='cdn')
    time_length.save(settings)
    return time_length
def overlay_histogram_identity(df, path, settings, palette=None):
    """Create an overlay histogram of the percentIdentity column.

    The html code and figure come from plot_overlay_histogram, called with
    density=True. Without a palette the plotly default colors, repeated
    five times, are used.
    """
    colors = plotly.colors.DEFAULT_PLOTLY_COLORS * 5 if palette is None else palette
    hist_pid = Plot(
        path=path + "NanoComp_OverlayHistogram_Identity.html",
        title="Histogram of percent reference identity",
    )
    html, figure = plot_overlay_histogram(df,
                                          colors,
                                          "percentIdentity",
                                          hist_pid.title,
                                          density=True)
    hist_pid.html = html
    hist_pid.fig = figure
    hist_pid.save(settings)
    return hist_pid
def overlay_histogram(df, path, settings, palette=None):
    """
    Use plotly to create overlays of length histograms.

    Four plots are made in this order: plain, normalized, log transformed
    and normalized log transformed. Each gets its html code and figure
    from the plotting helper and is saved with the given settings.
    The default palette only has 10 colors, which get recycled up to 5 times.
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5

    # (file name, title, plotting helper, extra keyword arguments)
    recipes = (
        ("NanoComp_OverlayHistogram.html",
         "Histogram of read lengths",
         plot_overlay_histogram,
         {"column": 'lengths'}),
        ("NanoComp_OverlayHistogram_Normalized.html",
         "Normalized histogram of read lengths",
         plot_overlay_histogram,
         {"column": 'lengths', "density": True}),
        ("NanoComp_OverlayLogHistogram.html",
         "Histogram of log transformed read lengths",
         plot_log_histogram,
         {}),
        ("NanoComp_OverlayLogHistogram_Normalized.html",
         "Normalized histogram of log transformed read lengths",
         plot_log_histogram,
         {"density": True}),
    )

    plots = []
    for file_name, plot_title, draw, extra in recipes:
        plot = Plot(path=path + file_name, title=plot_title)
        plot.html, plot.fig = draw(df, palette, title=plot.title, **extra)
        plot.save(settings)
        plots.append(plot)
    return plots
def compare_cumulative_yields(df, path, palette=None, title=None):
    """Plot the cumulative yield over time of every dataset in one figure.

    For each value of the ``dataset`` column the read lengths are
    cumulatively summed in ``start_time`` order, reduced to the maximum per
    10-minute bin and expressed in gigabases.

    Args:
        df: DataFrame with ``dataset``, ``lengths`` and ``start_time``
            columns. ``start_time`` is assumed to be timedelta-like, since
            ``total_seconds()`` is called on the resulting index.
        path: Prefix prepended to the output file name.
        palette: One color per dataset; defaults to the plotly default
            colors repeated five times.
        title: Optional title overriding the default plot title.

    Returns:
        A one-element list with the ``Plot`` object.
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info(
        "Nanoplotter: Creating cumulative yield plots using {} reads.".format(
            len(dfs)))
    cum_yield_gb = Plot(path=path + "NanoComp_CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")
    data = []
    for d, c in zip(df["dataset"].unique(), palette):
        # '10min' is the non-deprecated spelling of the '10T' offset alias.
        s = dfs.loc[dfs["dataset"] == d,
                    "lengths"].cumsum().resample('10min').max() / 1e9
        data.append(
            go.Scatter(x=s.index.total_seconds() / 3600,
                       y=s,
                       opacity=0.75,
                       name=d,
                       marker=dict(color=c)))

    # Build the figure once and render the HTML div from it, instead of
    # spelling out the same data/layout twice.
    cum_yield_gb.fig = go.Figure({
        "data": data,
        "layout": go.Layout(
            barmode='overlay',
            title=title or cum_yield_gb.title,
            xaxis=dict(title="Time (hours)"),
            yaxis=dict(title="Yield (gigabase)"),
        )
    })
    cum_yield_gb.html = plotly.offline.plot(cum_yield_gb.fig,
                                            output_type="div",
                                            show_link=False)
    cum_yield_gb.save()
    return [cum_yield_gb]
def overlay_histogram(df, path, palette=None):
    """
    Use plotly to create overlays of length histograms.

    Four plots are made in this order: plain, normalized, log transformed
    and normalized log transformed. Each gets its html code and figure
    from the plotting helper and is then saved.
    The default palette only has 10 colors, which get recycled up to 5 times.
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5

    # (file name, title, plotting helper, extra keyword arguments)
    recipes = (
        ("NanoComp_OverlayHistogram.html",
         "Histogram of read lengths",
         plot_overlay_histogram,
         {}),
        ("NanoComp_OverlayHistogram_Normalized.html",
         "Normalized histogram of read lengths",
         plot_overlay_histogram,
         {"histnorm": "probability"}),
        ("NanoComp_OverlayLogHistogram.html",
         "Histogram of log transformed read lengths",
         plot_log_histogram,
         {}),
        ("NanoComp_OverlayLogHistogram_Normalized.html",
         "Normalized histogram of log transformed read lengths",
         plot_log_histogram,
         {"histnorm": "probability"}),
    )

    plots = []
    for file_name, plot_title, draw, extra in recipes:
        plot = Plot(path=path + file_name, title=plot_title)
        plot.html, plot.fig = draw(df, palette, title=plot.title, **extra)
        plot.save()
        plots.append(plot)
    return plots
def active_pores_over_time(df, path, palette=None, title=None):
    """Plot the number of active pores over time of every dataset in one figure.

    For each value of the ``dataset`` column, the number of distinct
    ``channelIDs`` per 10-minute bin is drawn as one trace.

    Args:
        df: DataFrame with ``dataset``, ``channelIDs`` and ``start_time``
            columns. ``start_time`` is assumed to be timedelta-like, since
            ``total_seconds()`` is called on the resulting index.
        path: Prefix prepended to the output file name.
        palette: One color per dataset; defaults to the plotly default
            colors repeated five times.
        title: Optional title overriding the default plot title.

    Returns:
        The ``Plot`` object (not wrapped in a list).
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info("NanoComp: Creating active pores plot using {} reads.".format(
        len(dfs)))
    active_pores = Plot(path=path + "NanoComp_ActivePoresOverTime.html",
                        title="Active pores over time")
    data = []
    for sample, color in zip(df["dataset"].unique(), palette):
        # '10min' is the non-deprecated spelling of the '10T' offset alias.
        pores = dfs.loc[dfs["dataset"] == sample,
                        "channelIDs"].resample('10min').nunique()
        data.append(
            go.Scatter(x=pores.index.total_seconds() / 3600,
                       y=pores,
                       opacity=0.75,
                       name=sample,
                       marker=dict(color=color)))

    # Build the figure once and render the HTML div from it, instead of
    # spelling out the same data/layout twice.
    active_pores.fig = go.Figure({
        "data": data,
        "layout": go.Layout(
            barmode='overlay',
            title=title or active_pores.title,
            xaxis=dict(title="Time (hours)"),
            yaxis=dict(title="Active pores (per 10 minutes)"),
        )
    })
    active_pores.html = plotly.offline.plot(active_pores.fig,
                                            output_type="div",
                                            show_link=False)
    active_pores.save()
    return active_pores
def dynamic_histogram(array, name, path, figformat, title=None, color="#4CB391"):
    """
    Build a plotly histogram of (at most 10000 sampled values of) array.

    The file name and default title use name with its first character
    lower-cased; the y axis label says so when the data was downsampled.
    The Plot receives the html code and figure returned by
    plotly_histogram and is saved in the requested format.
    """
    first, rest = name[0].lower(), name[1:]
    file_stem = first + rest.replace(' ', '_')
    lowered_name = first + rest
    dynhist = Plot(path=path + f"Dynamic_Histogram_{file_stem}.html",
                   title="Dynamic histogram of {}".format(lowered_name))

    if len(array) <= 10000:
        ylabel = "Number of reads"
    else:
        ylabel = "Downsampled number of reads"

    subsample = array.sample(min(len(array), 10000))
    dynhist.html, dynhist.fig = plotly_histogram(array=subsample,
                                                 color=color,
                                                 title=title or dynhist.title,
                                                 xlabel=name,
                                                 ylabel=ylabel)
    dynhist.save(figformat)
    return dynhist
def overlay_histogram_phred(df, path, figformat, palette=None):
    """Create an overlay histogram of Phred-scaled reference identity.

    The ``percentIdentity`` column is converted with
    ``-10 * log10(1 - percentIdentity / 100)`` and stored in a new
    ``phredIdentity`` column; ``df`` is modified in place.

    Reads with a percentIdentity of 100 get a Phred score of infinity,
    which cannot be binned. Those values are set to 60, a very high Phred
    score.

    Args:
        df: DataFrame with a ``percentIdentity`` column.
        path: Prefix prepended to the output file name.
        figformat: Passed to ``Plot.save`` as ``figformat``.
        palette: Colors handed to ``plot_overlay_histogram``; defaults to
            the plotly default colors repeated five times.

    Returns:
        The ``Plot`` object holding the histogram.
    """
    # log10(0) for perfect alignments is expected and handled just below,
    # so the divide-by-zero warning is only noise here.
    with np.errstate(divide="ignore"):
        df["phredIdentity"] = -10 * np.log10(1 - (df["percentIdentity"] / 100))
    # Cap infinite scores so perfect alignments do not break the binning.
    df.loc[np.isinf(df["phredIdentity"]), "phredIdentity"] = 60

    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    hist_phred = Plot(path=path + "NanoComp_OverlayHistogram_PhredScore.html",
                      title="Histogram of Phred scores")
    hist_phred.html, hist_phred.fig = plot_overlay_histogram(
        df, palette, "phredIdentity", hist_phred.title, bins=20, density=True)
    hist_phred.save(figformat=figformat)
    return hist_phred
def n50_barplot(df, path, settings, title=None):
    '''
    Create a bar chart comparing the read length N50 of each dataset.

    The N50 is computed with get_N50 on the sorted 'aligned_lengths' of a
    dataset when that column exists, otherwise on its sorted 'lengths'.

    Args:
        df: DataFrame with a 'dataset' column and a 'lengths' and/or
            'aligned_lengths' column.
        path: Prefix prepended to the output file name.
        settings: Passed unchanged to Plot.save.
        title: Optional title overriding the default plot title.

    Returns:
        A one-element list with the Plot object.
    '''
    n50_bar = Plot(path=path + "NanoComp_N50.html",
                   title="Comparing read length N50")
    datasets = df["dataset"].unique()
    if "aligned_lengths" in df:
        length_column = "aligned_lengths"
        # The bars show an N50, not a yield, so the axis is labelled as
        # such (the previous label read 'Total gigabase aligned').
        ylabel = 'Aligned read length N50'
    else:
        length_column = "lengths"
        ylabel = 'Sequenced read length N50'
    n50s = [
        get_N50(np.sort(df.loc[df["dataset"] == d, length_column]))
        for d in datasets
    ]

    n50_bar.fig = go.Figure()
    for dataset, n50 in zip(datasets, n50s):
        n50_bar.fig.add_trace(go.Bar(x=[dataset], y=[n50], name=dataset))
    n50_bar.fig.update_layout(
        title=title or n50_bar.title,
        title_x=0.5,
        yaxis_title=ylabel,
    )
    n50_bar.html = n50_bar.fig.to_html(full_html=False, include_plotlyjs='cdn')
    n50_bar.save(settings)
    return [n50_bar]
def compare_sequencing_speed(df, path, settings, title=None):
    """Plot the median sequencing speed over time of every dataset in one figure.

    Speed is ``lengths / duration`` per read; reads with a duration that is
    not greater than zero are dropped. Per dataset, the median speed of
    each 30-minute bin is drawn as a line.

    Args:
        df: DataFrame with ``dataset``, ``lengths``, ``duration`` and
            ``start_time`` columns. ``start_time`` is assumed to be
            timedelta-like, since ``total_seconds()`` is called on the
            resulting index.
        path: Prefix prepended to the output file name.
        settings: Passed unchanged to ``Plot.save``.
        title: Optional title overriding the default plot title.

    Returns:
        A one-element list with the ``Plot`` object.
    """
    logging.info(
        "NanoComp: creating comparison of sequencing speed over time.")
    seq_speed = Plot(path=path + "NanoComp_sequencing_speed_over_time.html",
                     title="Sequencing speed over time")

    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")
    dfs = dfs.loc[dfs["duration"] > 0]

    palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    data = []
    for sample, color in zip(df["dataset"].unique(), palette):
        reads = dfs.loc[dfs["dataset"] == sample]
        # '30min' is the non-deprecated spelling of the '30T' offset alias.
        seqspeed = (reads["lengths"] /
                    reads["duration"]).resample('30min').median()
        data.append(
            go.Scatter(x=seqspeed.index.total_seconds() / 3600,
                       y=seqspeed,
                       opacity=0.75,
                       name=sample,
                       mode='lines',
                       marker=dict(color=color)))

    seq_speed.fig = go.Figure({"data": data})
    seq_speed.fig.update_layout(
        title=title or seq_speed.title,
        title_x=0.5,
        xaxis_title='Interval (hours)',
        yaxis_title="Sequencing speed (nucleotides/second)")
    seq_speed.html = seq_speed.fig.to_html(full_html=False,
                                           include_plotlyjs='cdn')
    seq_speed.save(settings)
    return [seq_speed]
def scatter(x, y, legacy, names, path, plots, color, colormap, settings,
            stat=None, log=False, minvalx=0, minvaly=0, title=None,
            xmax=None, ymax=None):
    """Create marginalised scatter and KDE plots of x against y with plotly.

    - plots["dot"]: scatterplot with a histogram on both axes
    - plots["kde"]: kernel density plot with histograms on both axes
    - any value equal to 1 in legacy: additionally delegate to
      scatter_legacy, using settings freshly obtained from utils.get_args()

    At most 10000 randomly chosen reads are plotted. Nothing is plotted
    (an empty list is returned) when contains_variance rejects the data.
    With log=True the x axis ticks are labelled with powers of ten.

    Returns the list of Plot objects that were made.
    """
    logging.info(
        f"NanoPlot: Creating {names[0]} vs {names[1]} plots using {x.size} reads."
    )
    if not contains_variance([x, y], names):
        return []

    idx = np.random.choice(x.index, min(10000, len(x)), replace=False)
    maxvalx = xmax or np.amax(x[idx])
    maxvaly = ymax or np.amax(y[idx])

    def log_xaxis():
        # x holds log10 values here: label the axis with powers of ten up
        # to ten times the largest back-transformed value.
        upper = 10 * (10**maxvalx)
        ticks = [10**i for i in range(10) if 10**i <= upper]
        return dict(tickmode='array',
                    tickvals=np.log10(ticks),
                    ticktext=ticks,
                    tickangle=45)

    plots_made = []

    if plots["dot"]:
        if log:
            dot_path = path + "_loglength_dot.html"
            dot_title = (f"{names[0]} vs {names[1]} plot using dots "
                         "after log transformation of read lengths")
        else:
            dot_path = path + "_dot.html"
            dot_title = f"{names[0]} vs {names[1]} plot using dots"
        dot_plot = Plot(path=dot_path, title=dot_title)

        figure = px.scatter(x=x[idx],
                            y=y[idx],
                            marginal_x="histogram",
                            marginal_y="histogram",
                            range_x=[minvalx, maxvalx],
                            range_y=[minvaly, maxvaly])
        figure.update_traces(marker=dict(color=color))
        figure.update_yaxes(rangemode="tozero")
        figure.update_xaxes(rangemode="tozero")
        figure.update_layout(xaxis_title=names[0],
                             yaxis_title=names[1],
                             title=title or dot_plot.title,
                             title_x=0.5)
        if log:
            figure.update_layout(xaxis=log_xaxis())

        dot_plot.fig = figure
        dot_plot.html = figure.to_html(full_html=False, include_plotlyjs='cdn')
        dot_plot.save(settings)
        plots_made.append(dot_plot)

    if plots["kde"]:
        kde_path = path + "_loglength_kde.html" if log else path + "_kde.html"
        kde_plot = Plot(path=kde_path,
                        title=f"{names[0]} vs {names[1]} kde plot")

        rgb = hex_to_rgb_scale_0_1(color)
        figure = ff.create_2d_density(x[idx],
                                      y[idx],
                                      point_size=3,
                                      hist_color=rgb,
                                      point_color=rgb,
                                      colorscale=colormap)
        figure.update_layout(xaxis_title=names[0],
                             yaxis_title=names[1],
                             title=title or kde_plot.title,
                             title_x=0.5,
                             xaxis=dict(tickangle=45))
        if log:
            figure.update_layout(xaxis=log_xaxis())

        kde_plot.fig = figure
        kde_plot.html = figure.to_html(full_html=False, include_plotlyjs='cdn')
        kde_plot.save(settings)
        plots_made.append(kde_plot)

    if 1 in legacy.values():
        # The legacy plots use the settings returned by utils.get_args(),
        # not the ones passed in.
        settings, _ = utils.get_args()
        legacy_plots = scatter_legacy(x=x[idx],
                                      y=y[idx],
                                      names=names,
                                      path=path,
                                      plots=legacy,
                                      color=color,
                                      settings=settings,
                                      stat=stat,
                                      log=log,
                                      minvalx=minvalx,
                                      minvaly=minvaly,
                                      title=title)
        plots_made += legacy_plots
    return plots_made
def length_plots(array, name, path, settings, title=None, n50=None, color="#4CB391"):
    """Create histograms of normal and log transformed read lengths.

    For both a weighted (weights are the lengths themselves, or their
    log10 for the log transformed plot) and a non weighted variant, a
    histogram of the lengths and one of the log10 lengths is made. When
    n50 is given it is marked with a vertical line and an 'N50' label.
    The plot from yield_by_minimal_length_plot is appended last.

    Returns the list of Plot objects.
    """
    logging.info("NanoPlot: Creating length plots for {}.".format(name))
    maxvalx = np.amax(array)
    if n50:
        logging.info(
            "NanoPlot: Using {} reads with read length N50 of {}bp and maximum of {}bp."
            .format(array.size, n50, maxvalx))
    else:
        logging.info(
            f"NanoPlot: Using {array.size} reads maximum of {maxvalx}bp.")

    n_bins = max(round(int(maxvalx) / 500), 10)
    compact_name = name.replace(' ', '')
    # (label, weights, y axis label) of each histogram flavour
    flavours = (
        ('Weighted', array, 'Number of reads'),
        ('Non weighted', None, 'Number of reads'),
    )

    plots = []
    for label, weights, ylabel in flavours:
        file_prefix = path + label.replace(" ", "_")

        # Histogram of the untransformed lengths.
        histogram = Plot(path=file_prefix + "Histogram" + compact_name + ".html",
                         title=f"{label} histogram of read lengths")
        counts, edges = np.histogram(array, bins=n_bins, weights=weights)
        figure = go.Figure()
        figure.add_trace(go.Bar(x=edges[1:], y=counts, marker_color=color))
        if n50:
            figure.add_vline(n50)
            figure.add_annotation(text='N50', x=n50, y=0.95)
            figure.update_annotations(font_size=8)
        figure.update_layout(xaxis_title='Read length',
                             yaxis_title=ylabel,
                             title=title or histogram.title,
                             title_x=0.5)
        histogram.fig = figure
        histogram.html = figure.to_html(full_html=False, include_plotlyjs='cdn')
        histogram.save(settings)

        # Histogram of the log10 transformed lengths.
        log_histogram = Plot(
            path=file_prefix + "LogTransformed_Histogram" + compact_name + ".html",
            title=label + " histogram of read lengths after log transformation")
        log_weights = None if weights is None else np.log10(weights)
        log_counts, log_edges = np.histogram(np.log10(array),
                                             bins=n_bins,
                                             weights=log_weights)
        figure = go.Figure()
        figure.add_trace(
            go.Bar(x=log_edges[1:], y=log_counts, marker_color=color))
        ticks = [10**i for i in range(10) if 10**i <= 10 * maxvalx]
        figure.update_layout(xaxis=dict(tickmode='array',
                                        tickvals=np.log10(ticks),
                                        ticktext=ticks),
                             xaxis_title='Read length',
                             yaxis_title=ylabel,
                             title=title or log_histogram.title,
                             title_x=0.5)
        if n50:
            log_n50 = np.log10(n50)
            figure.add_vline(log_n50)
            figure.add_annotation(text='N50', x=log_n50, y=0.95)
            figure.update_annotations(font_size=8)
        log_histogram.fig = figure
        log_histogram.html = figure.to_html(full_html=False,
                                            include_plotlyjs='cdn')
        log_histogram.save(settings)

        plots.extend([histogram, log_histogram])

    yield_plot = yield_by_minimal_length_plot(array=array,
                                              name=name,
                                              path=path,
                                              title=title,
                                              color=color,
                                              settings=settings)
    plots.append(yield_plot)
    return plots
def violin_or_box_plot(df, y, path, y_name, settings, title=None, plot="violin", log=False):
    """Create a violin, box or ridge plot from the received DataFrame.

    One trace is drawn per value of the 'dataset' column, showing column y.
    Violin and box plots are finished and saved by process_violin_and_box;
    the ridge plot (horizontal one-sided violins) is finished and saved
    here. Any other plot type is logged as an error and exits the program.

    Returns a one-element list with the Plot object.
    """
    comp = Plot(path=f"{path}NanoComp_{y.replace(' ', '_')}_{plot}.html",
                title=f"Comparing {y_name.lower()}")

    if plot not in ('violin', 'box', 'ridge'):
        logging.error(f"Unknown comp plot type {plot}")
        sys.exit(f"Unknown comp plot type {plot}")

    if plot == 'ridge':
        logging.info("NanoComp: Creating ridges plot for {}.".format(y))
        fig = go.Figure()
        for dataset in df["dataset"].unique():
            fig.add_trace(go.Violin(x=df[y][df['dataset'] == dataset],
                                    name=dataset))
        fig.update_traces(orientation='h',
                          side='positive',
                          width=3,
                          points=False)
        fig.update_layout(title=title or comp.title, title_x=0.5)
        comp.fig = fig
        comp.html = comp.fig.to_html(full_html=False, include_plotlyjs='cdn')
        comp.save(settings)
        return [comp]

    # Violin and box plots only differ in the trace type and its options.
    if plot == 'violin':
        logging.info(f"NanoComp: Creating violin plot for {y}.")
        trace_type, trace_options = go.Violin, {"points": False}
    else:
        logging.info("NanoComp: Creating box plot for {}.".format(y))
        trace_type, trace_options = go.Box, {}

    fig = go.Figure()
    for dataset in df["dataset"].unique():
        selected = df["dataset"] == dataset
        fig.add_trace(
            trace_type(x=df["dataset"][selected],
                       y=df[y][selected],
                       name=dataset,
                       **trace_options))
    process_violin_and_box(fig,
                           log=log,
                           plot_obj=comp,
                           title=title,
                           y_name=y_name,
                           ymax=np.amax(df[y]),
                           settings=settings)
    return [comp]