def output_barplot(df, figformat, path, title=None, palette=None):
    """Draw seaborn bar charts comparing datasets by read count and throughput.

    The first chart counts the rows of ``df`` per value of its "dataset"
    column. The second sums "aligned_lengths" (when that column is present in
    ``df``, otherwise "lengths") per dataset and plots the sum divided by 1e9.
    Both figures are saved with ``figformat`` as extension and format.

    Returns the tuple ``(read_count, throughput_bases)`` of Plot objects.
    """
    logging.info(
        "NanoComp: Creating barplots for number of reads and total throughput.")

    # Chart 1: number of reads per dataset.
    read_count = Plot(path=path + "NanoComp_number_of_reads." + figformat,
                      title="Comparing number of reads")
    count_axes = sns.countplot(x="dataset", data=df, palette=palette)
    count_axes.set(ylabel='Number of reads', title=title or read_count.title)
    plt.xticks(rotation=30, ha='center')
    read_count.fig = count_axes.get_figure()
    read_count.save(format=figformat)
    plt.close("all")

    # Chart 2: summed bases per dataset, aligned bases taking precedence.
    throughput_bases = Plot(path=path + "NanoComp_total_throughput." + figformat,
                            title="Comparing throughput in gigabases")
    has_alignment = "aligned_lengths" in df
    length_column = 'aligned_lengths' if has_alignment else 'lengths'
    ylabel = 'Total gigabase aligned' if has_alignment else 'Total gigabase sequenced'
    throughput = df.groupby('dataset')[length_column].sum()
    sum_axes = sns.barplot(x=list(throughput.index),
                           y=throughput / 1e9,
                           palette=palette,
                           order=df["dataset"].unique())
    sum_axes.set(ylabel=ylabel, title=title or throughput_bases.title)
    plt.xticks(rotation=30, ha='center')
    throughput_bases.fig = sum_axes.get_figure()
    throughput_bases.save(format=figformat)
    plt.close("all")

    return read_count, throughput_bases
def cumulative_yield(dfs, path, figformat, title, color):
    """Save seaborn scatter plots of cumulative yield in gigabases and in reads.

    ``dfs`` is resampled on its index, and ``total_seconds()`` of the
    resampled index (divided by 3600) is used as the x axis.

    Returns ``[cum_yield_gb, cum_yield_reads]``.
    """
    def _scatter_and_save(plot_obj, series, ylabel):
        # Draw one series against run time in hours and write it to disk.
        axes = sns.regplot(x=series.index.total_seconds() / 3600,
                           y=series,
                           x_ci=None,
                           fit_reg=False,
                           color=color,
                           scatter_kws={"s": 3})
        axes.set(xlabel='Run time (hours)',
                 ylabel=ylabel,
                 title=title or plot_obj.title)
        plot_obj.fig = axes.get_figure()
        plot_obj.save(format=figformat)
        plt.close("all")

    cum_yield_gb = Plot(path=path + "CumulativeYieldPlot_Gigabases." + figformat,
                        title="Cumulative yield")
    gigabases = dfs.loc[:, "lengths"].cumsum().resample('1T').max() / 1e9
    _scatter_and_save(cum_yield_gb, gigabases, 'Cumulative yield in gigabase')

    cum_yield_reads = Plot(path=path + "CumulativeYieldPlot_NumberOfReads." + figformat,
                           title="Cumulative yield")
    read_totals = dfs.loc[:, "lengths"].resample('10T').count().cumsum()
    _scatter_and_save(cum_yield_reads, read_totals,
                      'Cumulative yield in number of reads')

    return [cum_yield_gb, cum_yield_reads]
def output_barplot(df, path, settings, title=None):
    """Create barplots based on number of reads and total sum of nucleotides sequenced.

    Builds two plotly bar charts with one bar (trace) per dataset:

    * the number of rows of ``df`` per value of the "dataset" column;
    * the summed "aligned_lengths" (if that column is present in ``df``,
      otherwise "lengths") per dataset.

    Both charts iterate the datasets in sorted index order, so each bar is
    labelled with the dataset its value was computed for.

    Returns the tuple ``(read_count, throughput_bases)`` of Plot objects.
    """
    logging.info(
        "NanoComp: Creating barplots for number of reads and total throughput.")

    read_count = Plot(path=path + "NanoComp_number_of_reads.html",
                      title="Comparing number of reads")
    read_count.fig = go.Figure()
    counts = df['dataset'].value_counts(sort=False).sort_index()
    for dataset, count in counts.items():
        read_count.fig.add_trace(go.Bar(x=[dataset], y=[count], name=dataset))
    read_count.fig.update_layout(
        title_text=title or read_count.title,
        title_x=0.5,
        yaxis_title="Number of reads",
    )
    read_count.html = read_count.fig.to_html(full_html=False,
                                             include_plotlyjs='cdn')
    read_count.save(settings)

    throughput_bases = Plot(path=path + "NanoComp_total_throughput.html",
                            title="Comparing throughput in bases")
    if "aligned_lengths" in df:
        throughput = df.groupby('dataset')['aligned_lengths'].sum()
        ylabel = 'Total bases aligned'
    else:
        throughput = df.groupby('dataset')['lengths'].sum()
        ylabel = 'Total bases sequenced'

    throughput_bases.fig = go.Figure()
    # Take label and value from the same Series: groupby orders its result by
    # key, which need not match the order in which datasets appear in df, so
    # pairing the sums with df["dataset"].unique() could mislabel the bars.
    for dataset, sum_dataset in throughput.items():
        throughput_bases.fig.add_trace(
            go.Bar(x=[dataset], y=[sum_dataset], name=dataset))
    throughput_bases.fig.update_layout(
        title=title or throughput_bases.title,
        title_x=0.5,
        yaxis_title=ylabel,
    )
    throughput_bases.html = throughput_bases.fig.to_html(
        full_html=False, include_plotlyjs='cdn')
    throughput_bases.save(settings)

    return read_count, throughput_bases
def spatial_heatmap(array, path, colormap, figformat, title=None):
    """Taking channel information and creating post run channel activity plots.

    Counts how often each value occurs in ``array``, writes those counts into
    the template grid returned by ``make_layout`` at the positions where
    ``layout.structure`` equals the value, and renders the grid as a plotly
    heatmap using ``colormap`` as colorscale.

    Returns a one-element list holding the Plot object.
    """
    logging.info(
        "Nanoplotter: Creating heatmap of reads per channel using {} reads.".
        format(array.size))
    activity_map = Plot(path=path + ".html",
                        title="Number of reads generated per channel")
    layout = make_layout(maxval=np.amax(array))
    # Series.value_counts() replaces the deprecated top-level pd.value_counts().
    reads_per_channel = pd.Series(array).value_counts()
    for channel, n_reads in reads_per_channel.items():
        layout.template[np.where(layout.structure == channel)] = n_reads

    data = pd.DataFrame(layout.template,
                        index=layout.yticks,
                        columns=layout.xticks)

    fig = go.Figure(
        data=go.Heatmap(z=data.values.tolist(), colorscale=colormap))
    fig.update_layout(xaxis_title='Channel',
                      yaxis_title='Number of reads',
                      title=title or activity_map.title,
                      title_x=0.5)

    activity_map.fig = fig
    activity_map.html = activity_map.fig.to_html(full_html=False,
                                                 include_plotlyjs='cdn')
    activity_map.save(figformat)
    return [activity_map]
def quality_over_time(dfs, path, settings, title=None, color="#4CB391"):
    """Save a plotly violin plot of the "quals" column per "timebin" value.

    Returns the Plot object after its ``fig`` and ``html`` attributes are set
    and ``save(settings)`` has been called.
    """
    time_qual = Plot(path=path + "TimeQualityViolinPlot.html",
                     title="Violin plot of quality over time")

    violins = go.Violin(y=dfs["quals"],
                        x=dfs["timebin"],
                        points=False,
                        spanmode="hard",
                        line_color='black',
                        line_width=1.5,
                        fillcolor=color,
                        opacity=0.8)
    figure = go.Figure()
    figure.add_trace(violins)
    figure.update_layout(xaxis_title='Interval (hours)',
                         yaxis_title='Basecall quality',
                         title=title or time_qual.title,
                         title_x=0.5)
    figure.update_xaxes(tickangle=45)

    time_qual.fig = figure
    time_qual.html = figure.to_html(full_html=False, include_plotlyjs='cdn')
    time_qual.save(settings)
    return time_qual
def n50_barplot(df, figformat, path, title=None, palette=None):
    """Save a seaborn bar chart of the read length N50 of every dataset.

    The N50 is computed with ``get_N50`` on the sorted "aligned_lengths"
    column when ``df`` has one, otherwise on the sorted "lengths" column.

    Returns a one-element list holding the Plot object.
    """
    n50_bar = Plot(path=path + "NanoComp_N50." + figformat,
                   title="Comparing read length N50")
    if "aligned_lengths" in df:
        length_column = "aligned_lengths"
        # The bars show an N50, not a throughput; the previous label
        # ('Total gigabase aligned') was copied from the throughput plot.
        ylabel = 'Aligned read length N50'
    else:
        length_column = "lengths"
        ylabel = 'Sequenced read length N50'

    datasets = df["dataset"].unique()
    n50s = [
        get_N50(np.sort(df.loc[df["dataset"] == d, length_column]))
        for d in datasets
    ]
    ax = sns.barplot(x=list(datasets),
                     y=n50s,
                     palette=palette,
                     order=datasets)
    ax.set(ylabel=ylabel, title=title or n50_bar.title)
    plt.xticks(rotation=30, ha='center')
    n50_bar.fig = ax.get_figure()
    n50_bar.save(format=figformat)
    plt.close("all")
    return [n50_bar]
def sequencing_speed_over_time(dfs, path, title, settings, color="#4CB391"):
    """Save a plotly violin plot of "lengths" / "duration" per "timebin".

    Rows whose "duration" equals 0 are left out so the division is defined.
    Returns the Plot object.
    """
    time_duration = Plot(path=path + "TimeSequencingSpeed_ViolinPlot.html",
                         title="Violin plot of sequencing speed over time")

    has_duration = dfs['duration'] != 0
    timed = dfs.loc[has_duration]
    speed = timed["lengths"] / timed["duration"]

    figure = go.Figure()
    figure.add_trace(
        go.Violin(x=timed["timebin"],
                  y=speed,
                  points=False,
                  spanmode="hard",
                  line_color='black',
                  line_width=1.5,
                  fillcolor=color,
                  opacity=0.8))
    figure.update_layout(xaxis_title='Interval (hours)',
                         yaxis_title='Sequencing speed (nucleotides/second)',
                         title=title or time_duration.title,
                         title_x=0.5)
    figure.update_xaxes(tickangle=45)

    time_duration.fig = figure
    time_duration.html = figure.to_html(full_html=False,
                                        include_plotlyjs='cdn')
    time_duration.save(settings)
    return time_duration
def yield_by_minimal_length_plot(array, name, path, settings, title=None, color="#4CB391"):
    """Save a plotly scatter of cumulative yield against minimal read length.

    ``array`` is sorted in descending order and cumulatively summed (divided
    by 10**9). At most 10000 randomly chosen points of that curve are
    plotted. ``name`` is accepted but not used by this function.

    Returns the Plot object.
    """
    df = pd.DataFrame(data={"lengths": np.sort(array)[::-1]})
    df["cumyield_gb"] = df["lengths"].cumsum() / 10**9
    # Sample row labels from df itself. df has a fresh 0..n-1 index, so labels
    # taken from array.index only match when array has a default index, and a
    # plain ndarray has no .index at all.
    idx = np.random.choice(df.index, min(10000, len(df)), replace=False)
    sample = df.loc[idx]

    yield_by_length = Plot(path=path + "Yield_By_Length.html",
                           title="Yield by length")

    fig = px.scatter(df, x=sample["lengths"], y=sample["cumyield_gb"])
    fig.update_traces(marker=dict(color=color))
    fig.update_layout(xaxis_title='Read length',
                      yaxis_title='Cumulative yield for minimal length [Gb]',
                      title=title or yield_by_length.title,
                      title_x=0.5)

    yield_by_length.fig = fig
    yield_by_length.html = yield_by_length.fig.to_html(full_html=False,
                                                       include_plotlyjs='cdn')
    yield_by_length.save(settings)
    return yield_by_length
def spatial_heatmap(array, path, title=None, color="Greens", figformat="png"):
    """Taking channel information and creating post run channel activity plots.

    Counts how often each value occurs in ``array``, writes those counts into
    the template grid returned by ``make_layout`` at the positions where
    ``layout.structure`` equals the value, and draws the grid as a seaborn
    heatmap with ``color`` as colormap.

    Returns a one-element list holding the Plot object.
    """
    logging.info(
        "Nanoplotter: Creating heatmap of reads per channel using {} reads.".
        format(array.size))
    activity_map = Plot(path=path + "." + figformat,
                        title="Number of reads generated per channel")
    layout = make_layout(maxval=np.amax(array))
    # Series.value_counts() replaces the deprecated top-level pd.value_counts().
    reads_per_channel = pd.Series(array).value_counts()
    for channel, n_reads in reads_per_channel.items():
        layout.template[np.where(layout.structure == channel)] = n_reads

    plt.figure()
    ax = sns.heatmap(data=pd.DataFrame(layout.template,
                                       index=layout.yticks,
                                       columns=layout.xticks),
                     xticklabels="auto",
                     yticklabels="auto",
                     square=True,
                     cbar_kws={"orientation": "horizontal"},
                     cmap=color,
                     linewidths=0.20)
    ax.set_title(title or activity_map.title)
    activity_map.fig = ax.get_figure()
    activity_map.save(format=figformat)
    plt.close("all")
    return [activity_map]
def length_over_time(dfs, path, figformat, title, log_length=False, plot_settings={}):
    """Save a seaborn violin plot of read length per "timebin" value.

    With ``log_length`` the "log_lengths" column is plotted and the y ticks
    are relabelled with the matching powers of ten; otherwise "lengths" is
    plotted. When ``dfs`` has a "length_filter" column, only rows selected by
    it are drawn.

    Returns the Plot object.
    """
    time_length = Plot(path=path + "TimeLengthViolinPlot." + figformat,
                       title="Violin plot of read lengths over time")
    sns.set(style="white", **plot_settings)

    length_column = "log_lengths" if log_length else "lengths"
    # "length_filter" is produced by NanoPlot filtering of too long reads
    temp_dfs = dfs[dfs["length_filter"]] if "length_filter" in dfs else dfs

    violin_kwargs = dict(x="timebin",
                         y=length_column,
                         data=temp_dfs,
                         inner=None,
                         cut=0,
                         linewidth=0)
    ax = sns.violinplot(**violin_kwargs)
    ax.set(xlabel='Interval (hours)',
           ylabel="Read length",
           title=title or time_length.title)

    if log_length:
        # Powers of ten up to ten times the longest (unfiltered) read.
        upper = 10 * np.amax(dfs["lengths"])
        ticks = [power for power in (10**i for i in range(10))
                 if not power > upper]
        ax.set(yticks=np.log10(ticks), yticklabels=ticks)

    plt.xticks(rotation=45, ha='center', fontsize=8)
    time_length.fig = ax.get_figure()
    time_length.save(format=figformat)
    plt.close("all")
    return time_length
def compare_cumulative_yields(df, path, palette=None, title=None):
    """Save a plotly line chart of cumulative yield over time for every dataset.

    For each value of the "dataset" column the "lengths" are cumulatively
    summed, resampled to 10 minute bins (maximum per bin) and divided by 1e9.
    Each trace gets an annotation showing its final, rounded value.

    When ``palette`` is None the default plotly colors (repeated five times)
    are used; datasets beyond the length of the palette are not drawn because
    datasets and colors are zipped.

    Returns a one-element list holding the Plot object.
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info(
        "NanoComp: Creating cumulative yield plots using {} reads.".format(
            len(dfs)))
    cum_yield_gb = Plot(path=path + "NanoComp_CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")
    data = []
    annotations = []
    for sample, color in zip(df["dataset"].unique(), palette):
        cumsum = dfs.loc[dfs["dataset"] == sample,
                         "lengths"].cumsum().resample('10T').max() / 1e9
        # .iloc makes the positional lookup explicit; a bare integer key on a
        # time-indexed Series is deprecated in recent pandas.
        final_yield = cumsum.iloc[-1]
        data.append(
            go.Scatter(x=cumsum.index.total_seconds() / 3600,
                       y=cumsum,
                       opacity=0.75,
                       name=sample,
                       marker=dict(color=color)))
        annotations.append(
            dict(xref='paper',
                 x=0.99,
                 y=final_yield,
                 xanchor='left',
                 yanchor='middle',
                 text='{}Gb'.format(round(final_yield)),
                 showarrow=False))

    # One figure description shared by the html div and the Figure object.
    figure_spec = {
        "data": data,
        "layout": go.Layout(barmode='overlay',
                            title=title or cum_yield_gb.title,
                            xaxis=dict(title="Time (hours)"),
                            yaxis=dict(title="Yield (gigabase)"),
                            annotations=annotations)
    }
    cum_yield_gb.html = plotly.offline.plot(figure_spec,
                                            output_type="div",
                                            show_link=False)
    cum_yield_gb.fig = go.Figure(figure_spec)
    cum_yield_gb.save()
    return [cum_yield_gb]
def plot_over_time(dfs, path, title, figformat, color="#4CB391"):
    """Save plotly scatter plots of reads (and active pores) per 10 minutes.

    The number of "lengths" entries per 10 minute bin is always plotted. When
    ``dfs`` has a "channelIDs" column, the number of unique channel IDs per
    bin is plotted as well.

    Returns the list of Plot objects that were created.
    """
    def _render(plot_obj, per_bin, ylabel):
        # Scatter one resampled series against run time in hours and save it.
        figure = px.scatter(data_frame=None,
                            x=per_bin.index.total_seconds() / 3600,
                            y=per_bin)
        figure.update_traces(marker=dict(color=color))
        figure.update_layout(xaxis_title='Run time (hours)',
                             yaxis_title=ylabel,
                             title=title or plot_obj.title,
                             title_x=0.5)
        plot_obj.fig = figure
        plot_obj.html = figure.to_html(full_html=False,
                                       include_plotlyjs='cdn')
        plot_obj.save(figformat)

    num_reads = Plot(path=path + "NumberOfReads_Over_Time.html",
                     title="Number of reads over time")
    _render(num_reads,
            dfs.loc[:, "lengths"].resample('10T').count(),
            'Number of reads per 10 minutes')
    plots = [num_reads]

    if "channelIDs" not in dfs:
        return plots

    pores_over_time = Plot(path=path + "ActivePores_Over_Time.html",
                           title="Number of active pores over time")
    _render(pores_over_time,
            dfs.loc[:, "channelIDs"].resample('10T').nunique(),
            'Active pores per 10 minutes')
    plots.append(pores_over_time)
    return plots
def cumulative_yield(dfs, path, title, color, figformat):
    """Save plotly scatter plots of cumulative yield in gigabases and in reads.

    ``dfs`` is resampled on its index in 10 minute bins, and
    ``total_seconds()`` of the resampled index (divided by 3600) is used as
    the x axis.

    Returns ``[cum_yield_gb, cum_yield_reads]``.
    """
    def _render(plot_obj, series, ylabel):
        # Each figure falls back to the title of its own Plot object.
        fig = px.scatter(x=series.index.total_seconds() / 3600, y=series)
        fig.update_traces(marker=dict(color=color))
        fig.update_layout(xaxis_title='Run time (hours)',
                          yaxis_title=ylabel,
                          title=title or plot_obj.title,
                          title_x=0.5)
        plot_obj.fig = fig
        plot_obj.html = fig.to_html(full_html=False, include_plotlyjs='cdn')
        plot_obj.save(figformat)

    cum_yield_gb = Plot(path=path + "CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")
    gigabases = dfs.loc[:, "lengths"].cumsum().resample('10T').max() / 1e9
    _render(cum_yield_gb, gigabases, 'Cumulative yield in gigabase')

    cum_yield_reads = Plot(path=path + "CumulativeYieldPlot_NumberOfReads.html",
                           title="Cumulative yield")
    read_totals = dfs.loc[:, "lengths"].resample('10T').count().cumsum()
    # Previously this figure read its fallback title from cum_yield_gb.
    _render(cum_yield_reads, read_totals,
            'Cumulative yield in number of reads')

    return [cum_yield_gb, cum_yield_reads]
def length_over_time(dfs, path, title, settings, log_length=False, color="#4CB391"):
    """Save a plotly violin plot of read length per "timebin" value.

    With ``log_length`` the "log_lengths" column is plotted, the output file
    and default title mention the log transformation, and the y ticks are
    relabelled with the matching powers of ten. When ``dfs`` has a
    "length_filter" column, only rows selected by it are drawn.

    Returns the Plot object.
    """
    if log_length:
        file_name = "TimeLogLengthViolinPlot.html"
        default_title = "Violin plot of log read lengths over time"
        length_column = "log_lengths"
    else:
        file_name = "TimeLengthViolinPlot.html"
        default_title = "Violin plot of read lengths over time"
        length_column = "lengths"
    time_length = Plot(path=path + file_name, title=default_title)

    # "length_filter" is produced by NanoPlot filtering of too long reads
    temp_dfs = dfs[dfs["length_filter"]] if "length_filter" in dfs else dfs

    violins = go.Violin(y=temp_dfs[length_column],
                        x=temp_dfs["timebin"],
                        points=False,
                        spanmode="hard",
                        line_color='black',
                        line_width=1.5,
                        fillcolor=color,
                        opacity=0.8)
    figure = go.Figure()
    figure.add_trace(violins)
    figure.update_layout(xaxis_title='Interval (hours)',
                         yaxis_title='Read length',
                         title=title or time_length.title,
                         title_x=0.5)

    if log_length:
        # Powers of ten up to ten times the longest (unfiltered) read.
        upper = 10 * np.amax(dfs["lengths"])
        ticks = [power for power in (10**i for i in range(10))
                 if not power > upper]
        figure.update_layout(yaxis=dict(tickmode='array',
                                        tickvals=np.log10(ticks),
                                        ticktext=ticks))

    # NOTE(review): the other violin-over-time plots rotate the x ticks
    # (update_xaxes); confirm that rotating the y ticks is intended here.
    figure.update_yaxes(tickangle=45)

    time_length.fig = figure
    time_length.html = figure.to_html(full_html=False, include_plotlyjs='cdn')
    time_length.save(settings)
    return time_length
def plot_over_time(dfs, path, figformat, title, color):
    """Save seaborn scatter plots of reads (and active pores) per 10 minutes.

    The number of "lengths" entries per 10 minute bin is always plotted. When
    ``dfs`` has a "channelIDs" column, the number of unique channel IDs per
    bin is plotted as well.

    Returns the list of Plot objects that were created.
    """
    def _draw(plot_obj, per_bin, ylabel):
        # Scatter one resampled series against run time in hours and save it.
        axes = sns.regplot(x=per_bin.index.total_seconds() / 3600,
                           y=per_bin,
                           x_ci=None,
                           fit_reg=False,
                           color=color,
                           scatter_kws={"s": 3})
        axes.set(xlabel='Run time (hours)',
                 ylabel=ylabel,
                 title=title or plot_obj.title)
        plot_obj.fig = axes.get_figure()
        plot_obj.save(format=figformat)
        plt.close("all")

    num_reads = Plot(path=path + "NumberOfReads_Over_Time." + figformat,
                     title="Number of reads over time")
    _draw(num_reads,
          dfs.loc[:, "lengths"].resample('10T').count(),
          'Number of reads per 10 minutes')
    plots = [num_reads]

    if "channelIDs" not in dfs:
        return plots

    pores_over_time = Plot(path=path + "ActivePores_Over_Time." + figformat,
                           title="Number of active pores over time")
    _draw(pores_over_time,
          dfs.loc[:, "channelIDs"].resample('10T').nunique(),
          'Active pores per 10 minutes')
    plots.append(pores_over_time)
    return plots
def compare_cumulative_yields(df, path, palette=None, title=None):
    """Save a plotly line chart of cumulative yield over time for every dataset.

    For each value of the "dataset" column the "lengths" are cumulatively
    summed, resampled to 10 minute bins (maximum per bin) and divided by 1e9.
    When ``palette`` is None the default plotly colors, repeated five times,
    are used.

    Returns a one-element list holding the Plot object.
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info(
        "Nanoplotter: Creating cumulative yield plots using {} reads.".format(
            len(dfs)))
    cum_yield_gb = Plot(path=path + "NanoComp_CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")

    def _trace(dataset, trace_color):
        # Cumulative gigabases of one dataset against run time in hours.
        series = dfs.loc[dfs["dataset"] == dataset,
                         "lengths"].cumsum().resample('10T').max() / 1e9
        return go.Scatter(x=series.index.total_seconds() / 3600,
                          y=series,
                          opacity=0.75,
                          name=dataset,
                          marker=dict(color=trace_color))

    def _layout():
        # A fresh Layout for each consumer, as both receive the same settings.
        return go.Layout(
            barmode='overlay',
            title=title or cum_yield_gb.title,
            xaxis=dict(title="Time (hours)"),
            yaxis=dict(title="Yield (gigabase)"),
        )

    data = [_trace(d, c) for d, c in zip(df["dataset"].unique(), palette)]

    cum_yield_gb.html = plotly.offline.plot({"data": data, "layout": _layout()},
                                            output_type="div",
                                            show_link=False)
    cum_yield_gb.fig = go.Figure({"data": data, "layout": _layout()})
    cum_yield_gb.save()
    return [cum_yield_gb]
def active_pores_over_time(df, path, palette=None, title=None):
    """Save a plotly line chart of unique "channelIDs" per 10 minutes per dataset.

    When ``palette`` is None the default plotly colors, repeated five times,
    are used.

    Returns the Plot object (not wrapped in a list).
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info("NanoComp: Creating active pores plot using {} reads.".format(
        len(dfs)))
    active_pores = Plot(path=path + "NanoComp_ActivePoresOverTime.html",
                        title="Active pores over time")

    data = []
    for sample, color in zip(df["dataset"].unique(), palette):
        of_sample = dfs["dataset"] == sample
        pores = dfs.loc[of_sample, "channelIDs"].resample('10T').nunique()
        hours = pores.index.total_seconds() / 3600
        data.append(
            go.Scatter(x=hours,
                       y=pores,
                       opacity=0.75,
                       name=sample,
                       marker=dict(color=color)))

    # The html div and the Figure object are built from the same description.
    figure_spec = {
        "data": data,
        "layout": go.Layout(
            barmode='overlay',
            title=title or active_pores.title,
            xaxis=dict(title="Time (hours)"),
            yaxis=dict(title="Active pores (per 10 minutes)"),
        )
    }
    active_pores.html = plotly.offline.plot(figure_spec,
                                            output_type="div",
                                            show_link=False)
    active_pores.fig = go.Figure(figure_spec)
    active_pores.save()
    return active_pores
def quality_over_time(dfs, path, figformat, title, plot_settings={}):
    """Save a seaborn violin plot of the "quals" column per "timebin" value.

    ``plot_settings`` is forwarded as keyword arguments to ``sns.set``.
    Returns the Plot object.
    """
    time_qual = Plot(path=path + "TimeQualityViolinPlot." + figformat,
                     title="Violin plot of quality over time")
    sns.set(style="white", **plot_settings)

    violin_kwargs = dict(x="timebin",
                         y="quals",
                         data=dfs,
                         inner=None,
                         cut=0,
                         linewidth=0)
    axes = sns.violinplot(**violin_kwargs)
    axes.set(xlabel='Interval (hours)',
             ylabel="Basecall quality",
             title=title or time_qual.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)

    time_qual.fig = axes.get_figure()
    time_qual.save(format=figformat)
    plt.close("all")
    return time_qual
def sequencing_speed_over_time(dfs, path, figformat, title, plot_settings={}):
    """Save a seaborn violin plot of "lengths" / "duration" per "timebin".

    When ``dfs`` has no "timebin" column, one is added in place using
    ``add_time_bins``. Rows whose "duration" equals 0 are left out of the plot
    so the division cannot produce infinite or undefined speeds.
    ``plot_settings`` is forwarded as keyword arguments to ``sns.set``.

    Returns the Plot object.
    """
    time_duration = Plot(path=path + "TimeSequencingSpeed_ViolinPlot." + figformat,
                         title="Violin plot of sequencing speed over time")
    sns.set(style="white", **plot_settings)
    if "timebin" not in dfs:
        dfs['timebin'] = add_time_bins(dfs)
    # Same zero-duration mask as the plotly implementation of this plot.
    mask = dfs['duration'] != 0
    ax = sns.violinplot(x=dfs.loc[mask, "timebin"],
                        y=dfs.loc[mask, "lengths"] / dfs.loc[mask, "duration"],
                        inner=None,
                        cut=0,
                        linewidth=0)
    ax.set(xlabel='Interval (hours)',
           ylabel="Sequencing speed (nucleotides/second)",
           title=title or time_duration.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    time_duration.fig = ax.get_figure()
    time_duration.save(format=figformat)
    plt.close("all")
    return time_duration
def compare_sequencing_speed(df, figformat, path, title=None, palette=None):
    """Save a seaborn violin plot of sequencing speed per time bin and dataset.

    Speed is "lengths" / "duration"; rows with a "duration" that is not
    greater than 0 are excluded. ``title`` (falling back to the Plot's own
    title) is set on the axes and ``palette`` is passed to seaborn for the
    per-dataset hue colors.

    Returns a one-element list holding the Plot object.
    """
    logging.info(
        "Nanoplotter: creating comparison of sequencing speed over time.")
    seq_speed = Plot(path=path + "NanoComp_sequencing_speed_over_time." + figformat,
                     title="Sequencing speed over time")
    dfs = check_valid_time_and_sort(df, "start_time")
    dfs['timebin'] = add_time_bins(dfs)
    # Drop reads without a positive duration so the division is well defined.
    timed = dfs.loc[dfs["duration"] > 0]
    ax = sns.violinplot(x=timed["timebin"],
                        y=timed["lengths"] / timed["duration"],
                        hue=timed["dataset"],
                        palette=palette,
                        inner=None,
                        cut=0,
                        linewidth=0)
    # The title argument used to be accepted but never applied.
    ax.set(xlabel='Interval (hours)',
           ylabel="Sequencing speed (nucleotides/second)",
           title=title or seq_speed.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    seq_speed.fig = ax.get_figure()
    seq_speed.save(format=figformat)
    plt.close("all")
    return [seq_speed]
def yield_by_minimal_length_plot(array, name, path, title=None, color="#4CB391", figformat="png"):
    """Save a seaborn scatter of cumulative yield against minimal read length.

    ``array`` is sorted in descending order and cumulatively summed (divided
    by 10**9). ``name`` is accepted but not used by this function.

    Returns the Plot object.
    """
    descending = np.sort(array)[::-1]
    df = pd.DataFrame(data={"lengths": descending})
    df["cumyield_gb"] = df["lengths"].cumsum() / 10**9

    yield_by_length = Plot(path=path + "Yield_By_Length." + figformat,
                           title="Yield by length")
    axes = sns.regplot(x='lengths',
                       y="cumyield_gb",
                       data=df,
                       x_ci=None,
                       fit_reg=False,
                       color=color,
                       scatter_kws={"s": 3})
    axes.set(xlabel='Read length',
             ylabel='Cumulative yield for minimal length',
             title=title or yield_by_length.title)

    yield_by_length.fig = axes.get_figure()
    yield_by_length.save(format=figformat)
    plt.close("all")
    return yield_by_length
def n50_barplot(df, path, settings, title=None):
    """Save a plotly bar chart of the read length N50 of every dataset.

    The N50 is computed with ``get_N50`` on the sorted "aligned_lengths"
    column when ``df`` has one, otherwise on the sorted "lengths" column.
    One bar (trace) is added per value of the "dataset" column.

    Returns a one-element list holding the Plot object.
    """
    n50_bar = Plot(path=path + "NanoComp_N50.html",
                   title="Comparing read length N50")
    if "aligned_lengths" in df:
        length_column = "aligned_lengths"
        # The bars show an N50, not a throughput; the previous label
        # ('Total gigabase aligned') was copied from the throughput plot.
        ylabel = 'Aligned read length N50'
    else:
        length_column = "lengths"
        ylabel = 'Sequenced read length N50'

    datasets = df["dataset"].unique()
    n50s = [
        get_N50(np.sort(df.loc[df["dataset"] == d, length_column]))
        for d in datasets
    ]

    n50_bar.fig = go.Figure()
    for dataset, n50 in zip(datasets, n50s):
        n50_bar.fig.add_trace(go.Bar(x=[dataset], y=[n50], name=dataset))

    n50_bar.fig.update_layout(
        title=title or n50_bar.title,
        title_x=0.5,
        yaxis_title=ylabel,
    )

    n50_bar.html = n50_bar.fig.to_html(full_html=False, include_plotlyjs='cdn')
    n50_bar.save(settings)
    return [n50_bar]
def compare_sequencing_speed(df, path, settings, title=None):
    """Save a plotly line chart of median sequencing speed per dataset.

    Speed is "lengths" / "duration" for rows with "duration" > 0, resampled on
    the "start_time" index to the median per 30 minute bin. Colors come from
    the default plotly colors, repeated five times.

    Returns a one-element list holding the Plot object.
    """
    logging.info(
        "NanoComp: creating comparison of sequencing speed over time.")
    seq_speed = Plot(path=path + "NanoComp_sequencing_speed_over_time.html",
                     title="Sequencing speed over time")

    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")
    dfs = dfs.loc[dfs["duration"] > 0]
    palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5

    data = []
    for sample, color in zip(df["dataset"].unique(), palette):
        of_sample = dfs["dataset"] == sample
        lengths = dfs.loc[of_sample, "lengths"]
        durations = dfs.loc[of_sample, "duration"]
        seqspeed = (lengths / durations).resample('30T').median()
        data.append(
            go.Scatter(x=seqspeed.index.total_seconds() / 3600,
                       y=seqspeed,
                       opacity=0.75,
                       name=sample,
                       mode='lines',
                       marker=dict(color=color)))

    figure = go.Figure({"data": data})
    figure.update_layout(title=title or seq_speed.title,
                         title_x=0.5,
                         xaxis_title='Interval (hours)',
                         yaxis_title="Sequencing speed (nucleotides/second)")
    seq_speed.fig = figure
    seq_speed.html = figure.to_html(full_html=False, include_plotlyjs='cdn')
    seq_speed.save(settings)
    return [seq_speed]
def length_plots(array, name, path, settings, title=None, n50=None, color="#4CB391"):
    """Create histogram of normal and log transformed read lengths.

    For a weighted and a non-weighted variant, a histogram of ``array`` and a
    histogram of ``log10(array)`` are saved as plotly bar charts. When ``n50``
    is given, a vertical line and an 'N50' annotation are added. The result of
    ``yield_by_minimal_length_plot`` is appended at the end.

    Returns the list of Plot objects.
    """
    logging.info("NanoPlot: Creating length plots for {}.".format(name))
    maxvalx = np.amax(array)
    if n50:
        logging.info(
            "NanoPlot: Using {} reads with read length N50 of {}bp and maximum of {}bp."
            .format(array.size, n50, maxvalx))
    else:
        logging.info(
            f"NanoPlot: Using {array.size} reads maximum of {maxvalx}bp.")

    plots = []
    # Weighting each read by its length makes the bars sum bases, not reads.
    hist_types = [
        {'weight': array, 'name': 'Weighted', 'ylabel': 'Number of bases'},
        {'weight': None, 'name': 'Non weighted', 'ylabel': 'Number of reads'},
    ]
    # Loop invariants: bin count, log-axis ticks and the file name suffix.
    n_bins = max(round(int(maxvalx) / 500), 10)
    ticks = [10**i for i in range(10) if not 10**i > 10 * maxvalx]
    compact_name = name.replace(' ', '')

    for h_type in hist_types:
        file_prefix = path + h_type["name"].replace(" ", "_")

        histogram = Plot(path=file_prefix + "Histogram" + compact_name + ".html",
                         title=f"{h_type['name']} histogram of read lengths")
        hist, bin_edges = np.histogram(array,
                                       bins=n_bins,
                                       weights=h_type["weight"])
        fig = go.Figure()
        fig.add_trace(go.Bar(x=bin_edges[1:], y=hist, marker_color=color))
        if n50:
            fig.add_vline(n50)
            fig.add_annotation(text='N50', x=n50, y=0.95)
            fig.update_annotations(font_size=8)
        fig.update_layout(xaxis_title='Read length',
                          yaxis_title=h_type["ylabel"],
                          title=title or histogram.title,
                          title_x=0.5)
        histogram.fig = fig
        histogram.html = histogram.fig.to_html(full_html=False,
                                               include_plotlyjs='cdn')
        histogram.save(settings)

        log_histogram = Plot(
            path=file_prefix + "LogTransformed_Histogram" + compact_name + ".html",
            title=h_type["name"] + " histogram of read lengths after log transformation")
        # NOTE(review): the weighted log histogram weights reads by
        # log10(length) rather than by length, so its bars are not plain base
        # counts - confirm this is intended.
        if h_type["weight"] is None:
            log_weights = None
        else:
            log_weights = np.log10(h_type["weight"])
        hist_log, bin_edges_log = np.histogram(np.log10(array),
                                               bins=n_bins,
                                               weights=log_weights)
        fig = go.Figure()
        fig.add_trace(
            go.Bar(x=bin_edges_log[1:], y=hist_log, marker_color=color))
        fig.update_layout(xaxis=dict(tickmode='array',
                                     tickvals=np.log10(ticks),
                                     ticktext=ticks),
                          xaxis_title='Read length',
                          yaxis_title=h_type["ylabel"],
                          title=title or log_histogram.title,
                          title_x=0.5)
        if n50:
            fig.add_vline(np.log10(n50))
            fig.add_annotation(text='N50', x=np.log10(n50), y=0.95)
            fig.update_annotations(font_size=8)
        log_histogram.fig = fig
        log_histogram.html = log_histogram.fig.to_html(full_html=False,
                                                       include_plotlyjs='cdn')
        log_histogram.save(settings)

        plots.extend([histogram, log_histogram])

    plots.append(
        yield_by_minimal_length_plot(array=array,
                                     name=name,
                                     path=path,
                                     title=title,
                                     color=color,
                                     settings=settings))
    return plots
def violin_or_box_plot(df, y, path, y_name, settings, title=None, plot="violin", log=False):
    """Create a violin/boxplot/ridge from the received DataFrame.

    One trace is added per value of the 'dataset' column; the plotted values
    come from column ``y``. Violin and box figures are handed to
    ``process_violin_and_box``; ridge figures are finished and saved here.
    Any other ``plot`` value is logged as an error and ends the program via
    ``sys.exit``.

    Returns a one-element list holding the Plot object.
    """
    comp = Plot(path=f"{path}NanoComp_{y.replace(' ', '_')}_{plot}.html",
                title=f"Comparing {y_name.lower()}")

    def _figure_per_dataset(trace_cls, **trace_kwargs):
        # One vertical trace per dataset, with the dataset name on the x axis.
        figure = go.Figure()
        for dataset in df["dataset"].unique():
            selected = df["dataset"] == dataset
            figure.add_trace(
                trace_cls(x=df["dataset"][selected],
                          y=df[y][selected],
                          name=dataset,
                          **trace_kwargs))
        return figure

    def _hand_over(figure):
        process_violin_and_box(figure,
                               log=log,
                               plot_obj=comp,
                               title=title,
                               y_name=y_name,
                               ymax=np.amax(df[y]),
                               settings=settings)

    if plot == 'violin':
        logging.info(f"NanoComp: Creating violin plot for {y}.")
        _hand_over(_figure_per_dataset(go.Violin, points=False))
    elif plot == 'box':
        logging.info("NanoComp: Creating box plot for {}.".format(y))
        _hand_over(_figure_per_dataset(go.Box))
    elif plot == 'ridge':
        logging.info("NanoComp: Creating ridges plot for {}.".format(y))
        ridge = go.Figure()
        for d in df["dataset"].unique():
            ridge.add_trace(go.Violin(x=df[y][df['dataset'] == d], name=d))
        ridge.update_traces(orientation='h',
                            side='positive',
                            width=3,
                            points=False)
        ridge.update_layout(title=title or comp.title, title_x=0.5)
        comp.fig = ridge
        comp.html = ridge.to_html(full_html=False, include_plotlyjs='cdn')
        comp.save(settings)
    else:
        logging.error(f"Unknown comp plot type {plot}")
        sys.exit(f"Unknown comp plot type {plot}")

    return [comp]
def scatter(x, y, names, path, plots, color="#4CB391", figformat="png", stat=None, log=False, minvalx=0, minvaly=0, title=None, plot_settings={}, xmax=None, ymax=None):
    """Create bivariate plots.

    Create four types of bivariate plots of x vs y, containing marginal summaries
    -A scatter plot with histograms on axes
    -A hexagonal binned plot with histograms on axes
    -A kernel density plot with density curves on axes
    -A pauvre-style plot using code from https://github.com/conchoecia/pauvre

    ``plots`` is indexed with the keys "hex", "dot", "kde" and "pauvre" to
    decide which plots are made. The kde plot uses a random sample of at most
    2000 points. The pauvre plot is only made when ``names`` equals
    ['Read lengths', 'Average read quality'] and ``log`` is False. With
    ``log``, the x ticks are relabelled with powers of ten.

    Returns the list of Plot objects that were made; an empty list when
    ``contains_variance`` rejects the input.
    """
    logging.info("Nanoplotter: Creating {} vs {} plots using statistics from {} reads.".format(
        names[0], names[1], x.size))
    if not contains_variance([x, y], names):
        return []
    sns.set(style="ticks", **plot_settings)
    maxvalx = xmax or np.amax(x)
    maxvaly = ymax or np.amax(y)

    plots_made = []

    def _finish(plot_obj, grid):
        # Shared tail of the hex, dot and kde plots: label, title, save.
        grid.set_axis_labels(names[0], names[1])
        if log:
            plot_obj.title = plot_obj.title + " after log transformation of read lengths"
            ticks = [10**i for i in range(10) if not 10**i > 10 * (10**maxvalx)]
            grid.ax_joint.set_xticks(np.log10(ticks))
            grid.ax_marg_x.set_xticks(np.log10(ticks))
            grid.ax_joint.set_xticklabels(ticks)
        plt.subplots_adjust(top=0.90)
        grid.fig.suptitle(title or "{} vs {} plot".format(names[0], names[1]), fontsize=25)
        plot_obj.fig = grid
        plot_obj.save(format=figformat)
        plots_made.append(plot_obj)

    # NOTE(review): `stat_func` and `shade_lowest` are not accepted by recent
    # seaborn releases - confirm against the seaborn version this file targets.
    if plots["hex"]:
        hex_plot = Plot(
            path=path + "_hex." + figformat,
            title="{} vs {} plot using hexagonal bins".format(names[0], names[1]))
        plot = sns.jointplot(
            x=x,
            y=y,
            kind="hex",
            color=color,
            stat_func=stat,
            space=0,
            xlim=(minvalx, maxvalx),
            ylim=(minvaly, maxvaly),
            height=10)
        _finish(hex_plot, plot)

    sns.set(style="darkgrid", **plot_settings)
    if plots["dot"]:
        dot_plot = Plot(
            path=path + "_dot." + figformat,
            title="{} vs {} plot using dots".format(names[0], names[1]))
        plot = sns.jointplot(
            x=x,
            y=y,
            kind="scatter",
            color=color,
            stat_func=stat,
            xlim=(minvalx, maxvalx),
            ylim=(minvaly, maxvaly),
            space=0,
            height=10,
            joint_kws={"s": 1})
        _finish(dot_plot, plot)

    if plots["kde"]:
        idx = np.random.choice(x.index, min(2000, len(x)), replace=False)
        kde_plot = Plot(
            path=path + "_kde." + figformat,
            title="{} vs {} plot using a kernel density estimation".format(names[0], names[1]))
        plot = sns.jointplot(
            x=x[idx],
            y=y[idx],
            kind="kde",
            # np.inf instead of the np.Inf alias, which NumPy 2.0 removed.
            clip=((0, np.inf), (0, np.inf)),
            xlim=(minvalx, maxvalx),
            ylim=(minvaly, maxvaly),
            space=0,
            color=color,
            stat_func=stat,
            shade_lowest=False,
            height=10)
        _finish(kde_plot, plot)

    if plots["pauvre"] and names == ['Read lengths', 'Average read quality'] and log is False:
        pauvre_plot = Plot(
            path=path + "_pauvre." + figformat,
            title="{} vs {} plot using pauvre-style @conchoecia".format(names[0], names[1]))
        sns.set(style="white", **plot_settings)
        margin_plot(df=pd.DataFrame({"length": x, "meanQual": y}),
                    Y_AXES=False,
                    title=title or "Length vs Quality in Pauvre-style",
                    plot_maxlen=None,
                    plot_minlen=0,
                    plot_maxqual=None,
                    plot_minqual=0,
                    lengthbin=None,
                    qualbin=None,
                    BASENAME="whatever",
                    path=pauvre_plot.path,
                    fileform=[figformat],
                    dpi=600,
                    TRANSPARENT=True,
                    QUIET=True)
        plots_made.append(pauvre_plot)
    plt.close("all")
    return plots_made
def length_plots(array, name, path, title=None, n50=None, color="#4CB391", figformat="png"):
    """Draw read length histograms on a linear and on a log10-transformed axis.

    Both axes are drawn twice: once counting reads and once weighted by the
    values in ``array`` (labelled "Number of bases"). When ``n50`` is truthy a
    vertical line annotated with 'N50' is added to every histogram.

    Returns a list with, per flavour, the histogram and the log-transformed
    histogram Plot objects, followed by whatever
    ``yield_by_minimal_length_plot`` returns.
    """
    logging.info("Nanoplotter: Creating length plots for {}.".format(name))
    maxvalx = np.amax(array)
    if not n50:
        logging.info("Nanoplotter: Using {} reads maximum of {}bp.".format(array.size, maxvalx))
    else:
        logging.info("Nanoplotter: Using {} reads with read length N50 of {}bp and maximum of {}bp."
                     .format(array.size, n50, maxvalx))

    def mark_n50(axes, position):
        # Vertical line at the N50, labelled at the height of the tallest bar.
        plt.axvline(position)
        tallest = np.amax([bar.get_height() for bar in axes.patches])
        plt.annotate('N50', xy=(position, tallest), size=8)

    suffix = name.replace(' ', '') + "." + figformat
    n_bins = max(round(int(maxvalx) / 500), 10)
    # Powers of ten up to one decade above the longest read, used as log-axis ticks.
    log_ticks = [10**i for i in range(10) if not 10**i > 10 * maxvalx]

    plots = []
    flavours = (
        (None, "", "Number of reads"),
        (array, "Weighted ", "Number of bases"),
    )
    for weights, prefix, ylabel in flavours:
        file_prefix = path + prefix.replace(" ", "_")

        # Histogram on the untransformed read lengths.
        histogram = Plot(
            path=file_prefix + "Histogram" + suffix,
            title=prefix + "Histogram of read lengths")
        ax = sns.distplot(
            a=array,
            kde=False,
            hist=True,
            bins=n_bins,
            color=color,
            hist_kws={"weights": weights, "edgecolor": color, "linewidth": 0.2, "alpha": 0.8})
        if n50:
            mark_n50(ax, n50)
        ax.set(xlabel='Read length', ylabel=ylabel, title=title or histogram.title)
        plt.ticklabel_format(style='plain', axis='y')
        histogram.fig = ax.get_figure()
        histogram.save(format=figformat)
        plt.close("all")

        # Histogram on log10 of the read lengths, with ticks relabelled in bp.
        log_histogram = Plot(
            path=file_prefix + "LogTransformed_Histogram" + suffix,
            title=prefix + "Histogram of read lengths after log transformation")
        ax = sns.distplot(
            a=np.log10(array),
            kde=False,
            hist=True,
            color=color,
            hist_kws={"weights": weights, "edgecolor": color, "linewidth": 0.2, "alpha": 0.8})
        ax.set(
            xticks=np.log10(log_ticks),
            xticklabels=log_ticks,
            xlabel='Read length',
            ylabel=ylabel,
            title=title or log_histogram.title)
        if n50:
            mark_n50(ax, np.log10(n50))
        plt.ticklabel_format(style='plain', axis='y')
        log_histogram.fig = ax.get_figure()
        log_histogram.save(format=figformat)
        plt.close("all")

        plots.append(histogram)
        plots.append(log_histogram)

    plots.append(yield_by_minimal_length_plot(array=array,
                                              name=name,
                                              path=path,
                                              title=title,
                                              color=color,
                                              figformat=figformat))
    return plots
def violin_or_box_plot(df, y, figformat, path, y_name, title=None, plot="violin", log=False,
                       palette=None):
    """Compare the column ``y`` across datasets as a violin, box or ridge plot.

    The data is grouped by the 'dataset' column of ``df``. ``plot`` selects
    the plot type ('violin', 'box' or 'ridge'); any other value is logged as
    an error and terminates the program through ``sys.exit``.

    Returns a one-element list holding the Plot object.
    """
    comp = Plot(path=path + "NanoComp_" + y.replace(' ', '_') + '.' + figformat,
                title="Comparing {}".format(y_name.lower()))

    if plot not in ('violin', 'box', 'ridge'):
        logging.error("Unknown comp plot type {}".format(plot))
        sys.exit("Unknown comp plot type {}".format(plot))

    if plot == 'ridge':
        logging.info("NanoComp: Creating ridges plot for {}.".format(y))
        ridge_fig, axes = joypy.joyplot(df,
                                        by="dataset",
                                        column=y,
                                        title=title or comp.title,
                                        x_range=[-0.05, np.amax(df[y])])
        comp.fig = ridge_fig
        bottom_axis = axes[-1]
        if log:
            # Relabel the log10 tick positions with the untransformed values.
            exponents = [float(label.get_text()) for label in bottom_axis.get_xticklabels()]
            bottom_axis.set_xticklabels([10**exponent for exponent in exponents])
        bottom_axis.set_xticklabels(bottom_axis.get_xticklabels(), rotation=30, ha='center')
        comp.save(format=figformat)
    else:
        if plot == 'violin':
            logging.info("NanoComp: Creating violin plot for {}.".format(y))
            ax = sns.violinplot(x="dataset",
                                y=y,
                                data=df,
                                inner=None,
                                cut=0,
                                palette=palette,
                                linewidth=0)
        else:
            logging.info("NanoComp: Creating box plot for {}.".format(y))
            ax = sns.boxplot(x="dataset", y=y, data=df, palette=palette)
        # Shared finishing step for violin and box plots.
        process_violin_and_box(ax=ax,
                               log=log,
                               plot_obj=comp,
                               title=title,
                               y_name=y_name,
                               figformat=figformat,
                               ymax=np.amax(df[y]))

    plt.close("all")
    return [comp]
def scatter_legacy(x, y, names, path, plots, color, settings, stat=None, log=False, minvalx=0,
                   minvaly=0, title=None, xmax=None, ymax=None):
    """Create legacy (seaborn/matplotlib) bivariate plots of x vs y.

    Depending on the flags in ``plots``, up to three joint plots with marginal
    summaries are produced:
    - "hex": a hexagonal binned plot
    - "dot": a scatter plot
    - "kde": a kernel density plot, drawn from a random subsample of at most
      2000 observations (skipped with a message when ``x`` has 2 or fewer)

    Parameters
    ----------
    x, y : data for the horizontal and vertical axis; ``x`` is indexed by label
        (``x.index``) for the kde subsample.
    names : two axis labels, also used in titles.
    path : output prefix; "_legacy" plus a plot specific suffix is appended.
    plots : mapping with the keys "hex", "dot" and "kde".
    color : colour handed to seaborn.
    settings : mapping with at least the key "format"; "webp" and "json" are
        replaced by "png" for the file extension. Passed on to ``Plot.save``.
    stat : forwarded to ``sns.jointplot`` as ``stat_func``.
    log : when true, x is treated as log10 transformed and ticks are relabelled.
    minvalx, minvaly, xmax, ymax : axis limits; a falsy maximum falls back to
        the maximum of the data.
    title : overrides the figure title when given.

    Returns
    -------
    list of the Plot objects created. The list is empty when seaborn or
    matplotlib cannot be imported or when ``contains_variance`` rejects the data.
    """
    try:
        import matplotlib as mpl
        mpl.use('Agg')
        import seaborn as sns
        import matplotlib.pyplot as plt
    except ImportError:
        # sys.stderr is a stream object, not a callable: it has to be written to.
        sys.stderr.write("NanoPlot needs seaborn and matplotlib with --legacy\n")
        return []

    figformat = settings["format"]
    if figformat in ["webp", "json"]:
        figformat = "png"
    logging.info(
        f"NanoPlot: Creating {names[0]} vs {names[1]} legacy plots using {x.size} reads."
    )
    if not contains_variance([x, y], names):
        return []

    sns.set(style="ticks")
    maxvalx = xmax or np.amax(x)
    maxvaly = ymax or np.amax(y)
    plots_made = []
    path = path + "_legacy"

    def new_plot(kind, description):
        """Build the Plot object for one plot kind, honouring the log flag."""
        plot_title = "{} vs {} plot using {}".format(names[0], names[1], description)
        if log:
            return Plot(path=path + "_loglength_" + kind + "." + figformat,
                        title=plot_title + " after log transformation of read lengths")
        return Plot(path=path + "_" + kind + "." + figformat, title=plot_title)

    def finish(plot, plot_obj):
        """Label, title and save a joint plot and register it as created."""
        plot.set_axis_labels(names[0], names[1])
        if log:
            # maxvalx is a log10 value here; tick at every power of ten up to
            # one decade above the largest observation.
            ticks = [10**i for i in range(10) if not 10**i > 10 * (10**maxvalx)]
            plot.ax_joint.set_xticks(np.log10(ticks))
            plot.ax_marg_x.set_xticks(np.log10(ticks))
            plot.ax_joint.set_xticklabels(ticks)
        plt.subplots_adjust(top=0.90)
        plot.fig.suptitle(title or "{} vs {} plot".format(names[0], names[1]), fontsize=25)
        plot_obj.fig = plot
        plot_obj.save(settings)
        plots_made.append(plot_obj)

    if plots["hex"]:
        hex_plot = new_plot("hex", "hexagonal bins")
        plot = sns.jointplot(x=x,
                             y=y,
                             kind="hex",
                             color=color,
                             stat_func=stat,
                             space=0,
                             xlim=(minvalx, maxvalx),
                             ylim=(minvaly, maxvaly),
                             height=10)
        finish(plot, hex_plot)

    sns.set(style="darkgrid")
    if plots["dot"]:
        dot_plot = new_plot("dot", "dots")
        plot = sns.jointplot(x=x,
                             y=y,
                             kind="scatter",
                             color=color,
                             stat_func=stat,
                             xlim=(minvalx, maxvalx),
                             ylim=(minvaly, maxvaly),
                             space=0,
                             height=10,
                             joint_kws={"s": 1})
        finish(plot, dot_plot)

    if plots["kde"]:
        if len(x) > 2:
            # The density estimate is made on a random subsample of the reads.
            idx = np.random.choice(x.index, min(2000, len(x)), replace=False)
            kde_plot = new_plot("kde", "a kernel density estimation")
            plot = sns.jointplot(x=x[idx],
                                 y=y[idx],
                                 kind="kde",
                                 # np.inf: the np.Inf alias was removed in NumPy 2.0
                                 clip=((0, np.inf), (0, np.inf)),
                                 xlim=(minvalx, maxvalx),
                                 ylim=(minvaly, maxvaly),
                                 space=0,
                                 color=color,
                                 stat_func=stat,
                                 shade_lowest=False,
                                 height=10)
            finish(plot, kde_plot)
        else:
            sys.stderr.write(
                "Not enough observations (reads) to create a kde plot.\n")
            logging.info(
                "NanoPlot: Not enough observations (reads) to create a kde plot"
            )

    plt.close("all")
    return plots_made
def scatter(x, y, legacy, names, path, plots, color, colormap, settings, stat=None, log=False,
            minvalx=0, minvaly=0, title=None, xmax=None, ymax=None):
    """Create interactive (plotly) bivariate plots of x vs y.

    All plots use a random subsample of at most 10000 observations.
    - plots["dot"]: scatter plot with marginal histograms on both axes
    - plots["kde"]: 2D density plot, coloured with ``colormap``
    When any value in ``legacy`` equals 1, the settings are re-read through
    ``utils.get_args()`` and ``scatter_legacy`` is called on the same
    subsample, with ``legacy`` as its ``plots`` argument.

    Returns a list of the Plot objects created, or an empty list when
    ``contains_variance`` rejects the data.
    """
    logging.info(
        f"NanoPlot: Creating {names[0]} vs {names[1]} plots using {x.size} reads."
    )
    if not contains_variance([x, y], names):
        return []

    made = []
    sample = np.random.choice(x.index, min(10000, len(x)), replace=False)
    maxvalx = xmax or np.amax(x[sample])
    maxvaly = ymax or np.amax(y[sample])

    def log_xaxis():
        # Only called when log is set: maxvalx is then a log10 value, ticks are
        # placed at every power of ten up to one decade above it.
        ticks = [10**i for i in range(10) if not 10**i > 10 * (10**maxvalx)]
        return dict(tickmode='array',
                    tickvals=np.log10(ticks),
                    ticktext=ticks,
                    tickangle=45)

    if plots["dot"]:
        dot_path = path + "_dot.html"
        dot_title = f"{names[0]} vs {names[1]} plot using dots"
        if log:
            dot_path = path + "_loglength_dot.html"
            dot_title += " after log transformation of read lengths"
        dot_plot = Plot(path=dot_path, title=dot_title)

        fig = px.scatter(x=x[sample],
                         y=y[sample],
                         marginal_x="histogram",
                         marginal_y="histogram",
                         range_x=[minvalx, maxvalx],
                         range_y=[minvaly, maxvaly])
        fig.update_traces(marker=dict(color=color))
        fig.update_yaxes(rangemode="tozero")
        fig.update_xaxes(rangemode="tozero")
        fig.update_layout(xaxis_title=names[0],
                          yaxis_title=names[1],
                          title=title or dot_plot.title,
                          title_x=0.5)
        if log:
            fig.update_layout(xaxis=log_xaxis())

        dot_plot.fig = fig
        dot_plot.html = dot_plot.fig.to_html(full_html=False, include_plotlyjs='cdn')
        dot_plot.save(settings)
        made.append(dot_plot)

    if plots["kde"]:
        if log:
            kde_path = path + "_loglength_kde.html"
        else:
            kde_path = path + "_kde.html"
        kde_plot = Plot(path=kde_path, title=f"{names[0]} vs {names[1]} kde plot")

        col = hex_to_rgb_scale_0_1(color)
        fig = ff.create_2d_density(x[sample],
                                   y[sample],
                                   point_size=3,
                                   hist_color=col,
                                   point_color=col,
                                   colorscale=colormap)
        fig.update_layout(xaxis_title=names[0],
                          yaxis_title=names[1],
                          title=title or kde_plot.title,
                          title_x=0.5,
                          xaxis=dict(tickangle=45))
        if log:
            fig.update_layout(xaxis=log_xaxis())

        kde_plot.fig = fig
        kde_plot.html = kde_plot.fig.to_html(full_html=False, include_plotlyjs='cdn')
        kde_plot.save(settings)
        made.append(kde_plot)

    if any(flag == 1 for flag in legacy.values()):
        # The settings passed in are replaced by freshly parsed ones before
        # delegating to the legacy plots.
        settings, _ = utils.get_args()
        made.extend(scatter_legacy(x=x[sample],
                                   y=y[sample],
                                   names=names,
                                   path=path,
                                   plots=legacy,
                                   color=color,
                                   settings=settings,
                                   stat=stat,
                                   log=log,
                                   minvalx=minvalx,
                                   minvaly=minvaly,
                                   title=title))
    return made