Example #1
0
def output_barplot(df, figformat, path, title=None, palette=None):
    """Draw per-dataset bar charts of read count and summed read length.

    The first figure counts the rows of ``df`` per ``dataset``. The second
    sums ``aligned_lengths`` when that column is present (``lengths``
    otherwise) per dataset and plots the sums divided by 1e9. Each figure is
    saved through its ``Plot`` wrapper using ``figformat``.

    Returns:
        Tuple ``(read_count, throughput_bases)`` of ``Plot`` objects.
    """
    logging.info(
        "NanoComp: Creating barplots for number of reads and total throughput.")

    # --- number of reads per dataset ---------------------------------------
    read_count = Plot(path=path + "NanoComp_number_of_reads." + figformat,
                      title="Comparing number of reads")
    count_axes = sns.countplot(x="dataset", data=df, palette=palette)
    count_axes.set(ylabel='Number of reads', title=title or read_count.title)
    plt.xticks(rotation=30, ha='center')
    read_count.fig = count_axes.get_figure()
    read_count.save(format=figformat)
    plt.close("all")

    # --- total bases per dataset, shown in gigabases -----------------------
    throughput_bases = Plot(
        path=path + "NanoComp_total_throughput." + figformat,
        title="Comparing throughput in gigabases")
    has_alignment = "aligned_lengths" in df
    length_column = "aligned_lengths" if has_alignment else "lengths"
    ylabel = ('Total gigabase aligned'
              if has_alignment else 'Total gigabase sequenced')
    per_dataset = df.groupby('dataset')[length_column].sum()
    bar_axes = sns.barplot(x=list(per_dataset.index),
                           y=per_dataset / 1e9,
                           palette=palette,
                           order=df["dataset"].unique())
    bar_axes.set(ylabel=ylabel, title=title or throughput_bases.title)
    plt.xticks(rotation=30, ha='center')
    throughput_bases.fig = bar_axes.get_figure()
    throughput_bases.save(format=figformat)
    plt.close("all")

    return read_count, throughput_bases
Example #2
0
def cumulative_yield(dfs, path, figformat, title, color):
    """Plot cumulative yield over run time, in gigabases and in reads.

    Args:
        dfs: DataFrame with a ``lengths`` column. Its index must support
            ``resample`` and ``.total_seconds()`` (i.e. be timedelta-like).
        path: Prefix to which the output file names are appended.
        figformat: File extension / format passed to ``Plot.save``.
        title: Optional title; each plot's default title is used when falsy.
        color: Marker colour handed to ``sns.regplot``.

    Returns:
        List ``[cum_yield_gb, cum_yield_reads]`` of ``Plot`` objects.
    """

    def _scatter_and_save(plot, series, ylabel):
        # Scatter only (no regression fit); x axis is run time in hours.
        ax = sns.regplot(x=series.index.total_seconds() / 3600,
                         y=series,
                         x_ci=None,
                         fit_reg=False,
                         color=color,
                         scatter_kws={"s": 3})
        ax.set(xlabel='Run time (hours)',
               ylabel=ylabel,
               title=title or plot.title)
        plot.fig = ax.get_figure()
        plot.save(format=figformat)
        plt.close("all")

    cum_yield_gb = Plot(path=path + "CumulativeYieldPlot_Gigabases." +
                        figformat,
                        title="Cumulative yield")
    # 'min' replaces the deprecated 'T' minute alias (same binning).
    gigabases = dfs.loc[:, "lengths"].cumsum().resample('1min').max() / 1e9
    _scatter_and_save(cum_yield_gb, gigabases, 'Cumulative yield in gigabase')

    cum_yield_reads = Plot(path=path + "CumulativeYieldPlot_NumberOfReads." +
                           figformat,
                           title="Cumulative yield")
    read_totals = dfs.loc[:, "lengths"].resample('10min').count().cumsum()
    _scatter_and_save(cum_yield_reads, read_totals,
                      'Cumulative yield in number of reads')

    return [cum_yield_gb, cum_yield_reads]
Example #3
0
def output_barplot(df, path, settings, title=None):
    """Create barplots based on number of reads and total sum of nucleotides sequenced.

    Two plotly bar charts are built, each with one trace per dataset: the
    number of rows per ``dataset`` and the per-dataset sum of
    ``aligned_lengths`` (when that column exists) or ``lengths``.

    Args:
        df: DataFrame with a ``dataset`` column and a ``lengths`` and/or
            ``aligned_lengths`` column.
        path: Prefix to which the output file names are appended.
        settings: Passed unchanged to ``Plot.save``.
        title: Optional title; each plot's default title is used when falsy.

    Returns:
        Tuple ``(read_count, throughput_bases)`` of ``Plot`` objects.
    """
    logging.info(
        "NanoComp: Creating barplots for number of reads and total throughput."
    )
    read_count = Plot(path=path + "NanoComp_number_of_reads.html",
                      title="Comparing number of reads")

    read_count.fig = go.Figure()

    counts = df['dataset'].value_counts(sort=False).sort_index()
    for dataset, count in counts.items():
        read_count.fig.add_trace(go.Bar(x=[dataset], y=[count], name=dataset))

    read_count.fig.update_layout(
        title_text=title or read_count.title,
        title_x=0.5,
        yaxis_title="Number of reads",
    )

    read_count.html = read_count.fig.to_html(full_html=False,
                                             include_plotlyjs='cdn')
    read_count.save(settings)

    throughput_bases = Plot(path=path + "NanoComp_total_throughput.html",
                            title="Comparing throughput in bases")
    if "aligned_lengths" in df:
        throughput = df.groupby('dataset')['aligned_lengths'].sum()
        ylabel = 'Total bases aligned'
    else:
        throughput = df.groupby('dataset')['lengths'].sum()
        ylabel = 'Total bases sequenced'

    throughput_bases.fig = go.Figure()
    # Take label and value from the same groupby result. Pairing the sums with
    # df["dataset"].unique() would mislabel bars whenever the order in which
    # datasets appear differs from the sorted group order. Iterating the
    # groupby result also gives the same trace order as the read-count figure.
    for dataset, total_bases in throughput.items():
        throughput_bases.fig.add_trace(
            go.Bar(x=[dataset], y=[total_bases], name=dataset))

    throughput_bases.fig.update_layout(
        title=title or throughput_bases.title,
        title_x=0.5,
        yaxis_title=ylabel,
    )

    throughput_bases.html = throughput_bases.fig.to_html(
        full_html=False, include_plotlyjs='cdn')
    throughput_bases.save(settings)

    return read_count, throughput_bases
Example #4
0
def spatial_heatmap(array, path, colormap, figformat, title=None):
    """Taking channel information and creating post run channel activity plots.

    Counts how often each value occurs in ``array`` and writes that count
    into every cell of the layout (from ``make_layout``) whose ``structure``
    entry equals the value, then renders the grid as a plotly heatmap.

    Args:
        array: Array-like of channel identifiers, one entry per read.
        path: Output path without extension; ``".html"`` is appended.
        colormap: Colour scale handed to ``go.Heatmap``.
        figformat: Passed unchanged to ``Plot.save``.
        title: Optional title; the plot's default title is used when falsy.

    Returns:
        Single-element list containing the ``Plot`` object.
    """
    logging.info(
        "Nanoplotter: Creating heatmap of reads per channel using {} reads.".
        format(array.size))

    activity_map = Plot(path=path + ".html",
                        title="Number of reads generated per channel")

    layout = make_layout(maxval=np.amax(array))
    # Series.value_counts() replaces the deprecated top-level pd.value_counts.
    reads_per_channel = pd.Series(array).value_counts()

    for channel, n_reads in reads_per_channel.items():
        layout.template[np.where(layout.structure == channel)] = n_reads

    data = pd.DataFrame(layout.template,
                        index=layout.yticks,
                        columns=layout.xticks)

    fig = go.Figure(
        data=go.Heatmap(z=data.values.tolist(), colorscale=colormap))
    fig.update_layout(xaxis_title='Channel',
                      yaxis_title='Number of reads',
                      title=title or activity_map.title,
                      title_x=0.5)

    activity_map.fig = fig
    activity_map.html = activity_map.fig.to_html(full_html=False,
                                                 include_plotlyjs='cdn')
    activity_map.save(figformat)
    return [activity_map]
Example #5
0
def quality_over_time(dfs, path, settings, title=None, color="#4CB391"):
    """Build a plotly violin plot of ``quals`` grouped by ``timebin``.

    The figure and its HTML fragment are attached to a ``Plot`` object, which
    is saved with ``settings`` and returned.
    """
    time_qual = Plot(path=path + "TimeQualityViolinPlot.html",
                     title="Violin plot of quality over time")

    violins = go.Violin(x=dfs["timebin"],
                        y=dfs["quals"],
                        points=False,
                        spanmode="hard",
                        line_color='black',
                        line_width=1.5,
                        fillcolor=color,
                        opacity=0.8)

    figure = go.Figure()
    figure.add_trace(violins)
    figure.update_layout(title=title or time_qual.title,
                         title_x=0.5,
                         xaxis_title='Interval (hours)',
                         yaxis_title='Basecall quality')
    figure.update_xaxes(tickangle=45)

    time_qual.fig = figure
    time_qual.html = figure.to_html(full_html=False, include_plotlyjs='cdn')
    time_qual.save(settings)

    return time_qual
Example #6
0
def n50_barplot(df, figformat, path, title=None, palette=None):
    """Draw a bar chart of the read length N50 of every dataset.

    The N50 is computed with ``get_N50`` on the sorted ``aligned_lengths`` of
    each dataset when that column exists, otherwise on ``lengths``.

    Args:
        df: DataFrame with a ``dataset`` column and a ``lengths`` and/or
            ``aligned_lengths`` column.
        figformat: File extension / format passed to ``Plot.save``.
        path: Prefix to which the output file name is appended.
        title: Optional title; the plot's default title is used when falsy.
        palette: Palette handed to ``sns.barplot``.

    Returns:
        Single-element list containing the ``Plot`` object.
    """
    n50_bar = Plot(path=path + "NanoComp_N50." + figformat,
                   title="Comparing read length N50")
    if "aligned_lengths" in df:
        length_column = "aligned_lengths"
        # The bars show an N50, not a throughput; the previous label
        # ('Total gigabase aligned') was copied from the throughput plot.
        ylabel = 'Aligned read length N50'
    else:
        length_column = "lengths"
        ylabel = 'Sequenced read length N50'

    datasets = df["dataset"].unique()
    n50s = [
        get_N50(np.sort(df.loc[df["dataset"] == d, length_column]))
        for d in datasets
    ]
    ax = sns.barplot(x=list(datasets),
                     y=n50s,
                     palette=palette,
                     order=datasets)
    ax.set(ylabel=ylabel, title=title or n50_bar.title)
    plt.xticks(rotation=30, ha='center')
    n50_bar.fig = ax.get_figure()
    n50_bar.save(format=figformat)
    plt.close("all")
    return [n50_bar]
Example #7
0
def sequencing_speed_over_time(dfs, path, title, settings, color="#4CB391"):
    """Build a plotly violin plot of ``lengths / duration`` per ``timebin``.

    Rows whose ``duration`` equals zero are left out before dividing. The
    figure and its HTML fragment are attached to a ``Plot`` object, which is
    saved with ``settings`` and returned.
    """
    time_duration = Plot(path=path + "TimeSequencingSpeed_ViolinPlot.html",
                         title="Violin plot of sequencing speed over time")

    # Skip reads with a zero duration so the division below stays finite.
    timed_reads = dfs.loc[dfs['duration'] != 0]
    speed = timed_reads["lengths"] / timed_reads["duration"]

    violins = go.Violin(x=timed_reads["timebin"],
                        y=speed,
                        points=False,
                        spanmode="hard",
                        line_color='black',
                        line_width=1.5,
                        fillcolor=color,
                        opacity=0.8)

    figure = go.Figure()
    figure.add_trace(violins)
    figure.update_layout(title=title or time_duration.title,
                         title_x=0.5,
                         xaxis_title='Interval (hours)',
                         yaxis_title='Sequencing speed (nucleotides/second)')
    figure.update_xaxes(tickangle=45)

    time_duration.fig = figure
    time_duration.html = figure.to_html(full_html=False,
                                        include_plotlyjs='cdn')
    time_duration.save(settings)

    return time_duration
Example #8
0
def yield_by_minimal_length_plot(array,
                                 name,
                                 path,
                                 settings,
                                 title=None,
                                 color="#4CB391"):
    """Scatter the cumulative yield (in Gb) against minimal read length.

    Read lengths are sorted in descending order and cumulatively summed, so
    each point gives the yield obtained from reads at least that long. At
    most 10000 randomly chosen points are drawn.

    Args:
        array: Array-like of read lengths.
        name: Unused by this function.
        path: Prefix to which ``"Yield_By_Length.html"`` is appended.
        settings: Passed unchanged to ``Plot.save``.
        title: Optional title; the plot's default title is used when falsy.
        color: Marker colour.

    Returns:
        The ``Plot`` object holding the figure and its HTML fragment.
    """
    df = pd.DataFrame(data={"lengths": np.sort(array)[::-1]})
    df["cumyield_gb"] = df["lengths"].cumsum() / 10**9

    # Sample row labels from df itself. df has a fresh default index, so
    # labels drawn from array.index only line up when the input happens to
    # carry a default integer index (and plain arrays have no index at all).
    idx = np.random.choice(df.index, min(10000, len(df)), replace=False)
    sampled = df.loc[idx]

    yield_by_length = Plot(path=path + "Yield_By_Length.html",
                           title="Yield by length")

    fig = px.scatter(df,
                     x=sampled["lengths"],
                     y=sampled["cumyield_gb"])
    fig.update_traces(marker=dict(color=color))
    fig.update_layout(xaxis_title='Read length',
                      yaxis_title='Cumulative yield for minimal length [Gb]',
                      title=title or yield_by_length.title,
                      title_x=0.5)

    yield_by_length.fig = fig
    yield_by_length.html = yield_by_length.fig.to_html(full_html=False,
                                                       include_plotlyjs='cdn')
    yield_by_length.save(settings)

    return yield_by_length
Example #9
0
def spatial_heatmap(array, path, title=None, color="Greens", figformat="png"):
    """Taking channel information and creating post run channel activity plots.

    Counts how often each value occurs in ``array`` and writes that count
    into every cell of the layout (from ``make_layout``) whose ``structure``
    entry equals the value, then renders the grid as a seaborn heatmap.

    Args:
        array: Array-like of channel identifiers, one entry per read.
        path: Output path without extension; ``"." + figformat`` is appended.
        title: Optional title; the plot's default title is used when falsy.
        color: Colormap handed to ``sns.heatmap``.
        figformat: File extension / format passed to ``Plot.save``.

    Returns:
        Single-element list containing the ``Plot`` object.
    """
    logging.info(
        "Nanoplotter: Creating heatmap of reads per channel using {} reads.".
        format(array.size))
    activity_map = Plot(path=path + "." + figformat,
                        title="Number of reads generated per channel")
    layout = make_layout(maxval=np.amax(array))
    # Series.value_counts() replaces the deprecated top-level pd.value_counts.
    reads_per_channel = pd.Series(array).value_counts()
    for channel, n_reads in reads_per_channel.items():
        layout.template[np.where(layout.structure == channel)] = n_reads
    plt.figure()
    ax = sns.heatmap(data=pd.DataFrame(layout.template,
                                       index=layout.yticks,
                                       columns=layout.xticks),
                     xticklabels="auto",
                     yticklabels="auto",
                     square=True,
                     cbar_kws={"orientation": "horizontal"},
                     cmap=color,
                     linewidths=0.20)
    ax.set_title(title or activity_map.title)
    activity_map.fig = ax.get_figure()
    activity_map.save(format=figformat)
    plt.close("all")
    return [activity_map]
Example #10
0
def length_over_time(dfs, path, figformat, title, log_length=False, plot_settings={}):
    """Draw a seaborn violin plot of read length per ``timebin``.

    With ``log_length`` the ``log_lengths`` column is plotted and the y ticks
    are relabelled with the matching powers of ten. When ``dfs`` has a
    boolean ``length_filter`` column, only rows where it is true are drawn.
    The saved ``Plot`` object is returned.
    """
    time_length = Plot(path=path + "TimeLengthViolinPlot." + figformat,
                       title="Violin plot of read lengths over time")
    sns.set(style="white", **plot_settings)

    length_column = "log_lengths" if log_length else "lengths"

    # "length_filter" is produced by NanoPlot filtering of too long reads
    plotted = dfs[dfs["length_filter"]] if "length_filter" in dfs else dfs

    axes = sns.violinplot(data=plotted,
                          x="timebin",
                          y=length_column,
                          inner=None,
                          cut=0,
                          linewidth=0)
    axes.set(title=title or time_length.title,
             xlabel='Interval (hours)',
             ylabel="Read length")

    if log_length:
        # Powers of ten up to ten times the longest (unfiltered) read.
        upper_bound = 10 * np.amax(dfs["lengths"])
        ticks = [
            power for power in (10**exponent for exponent in range(10))
            if not power > upper_bound
        ]
        axes.set(yticks=np.log10(ticks), yticklabels=ticks)

    plt.xticks(rotation=45, ha='center', fontsize=8)
    time_length.fig = axes.get_figure()
    time_length.save(format=figformat)
    plt.close("all")
    return time_length
Example #11
0
def compare_cumulative_yields(df, path, palette=None, title=None):
    """Plot the cumulative yield over time of every dataset in one figure.

    For each dataset the ``lengths`` are cumulatively summed, binned per
    10 minutes (maximum per bin) and expressed in gigabases. Every trace gets
    an annotation showing its final yield, rounded to whole gigabases.

    Args:
        df: DataFrame with ``dataset``, ``lengths`` and ``start_time``
            columns; it is first passed through ``check_valid_time_and_sort``.
        path: Prefix to which the output file name is appended.
        palette: Sequence of colours, one per dataset. Defaults to the plotly
            default colours repeated five times.
        title: Optional title; the plot's default title is used when falsy.

    Returns:
        Single-element list containing the ``Plot`` object.
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info(
        "NanoComp: Creating cumulative yield plots using {} reads.".format(
            len(dfs)))
    cum_yield_gb = Plot(path=path +
                        "NanoComp_CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")
    data = []
    annotations = []
    for sample, color in zip(df["dataset"].unique(), palette):
        # 'min' replaces the deprecated 'T' minute alias (same binning).
        cumsum = dfs.loc[dfs["dataset"] == sample,
                         "lengths"].cumsum().resample('10min').max() / 1e9
        # Explicit positional access: the index is time-based, so cumsum[-1]
        # only worked through the deprecated integer-position fallback.
        final_yield = cumsum.iloc[-1]
        data.append(
            go.Scatter(x=cumsum.index.total_seconds() / 3600,
                       y=cumsum,
                       opacity=0.75,
                       name=sample,
                       marker=dict(color=color)))
        annotations.append(
            dict(xref='paper',
                 x=0.99,
                 y=final_yield,
                 xanchor='left',
                 yanchor='middle',
                 text='{}Gb'.format(round(final_yield)),
                 showarrow=False))

    # Build the figure once and render the HTML fragment from it.
    fig = go.Figure(data=data,
                    layout=go.Layout(barmode='overlay',
                                     title=title or cum_yield_gb.title,
                                     xaxis=dict(title="Time (hours)"),
                                     yaxis=dict(title="Yield (gigabase)"),
                                     annotations=annotations))
    cum_yield_gb.html = plotly.offline.plot(fig,
                                            output_type="div",
                                            show_link=False)
    cum_yield_gb.fig = fig
    cum_yield_gb.save()
    return [cum_yield_gb]
Example #12
0
def plot_over_time(dfs, path, title, figformat, color="#4CB391"):
    """Plot reads (and, if available, active pores) per 10 minutes of run time.

    Args:
        dfs: DataFrame with a ``lengths`` column and optionally a
            ``channelIDs`` column. Its index must support ``resample`` and
            ``.total_seconds()`` (i.e. be timedelta-like).
        path: Prefix to which the output file names are appended.
        title: Optional title; each plot's default title is used when falsy.
        figformat: Passed unchanged to ``Plot.save``.
        color: Marker colour.

    Returns:
        List of ``Plot`` objects: the read-count plot, followed by the
        active-pores plot when ``channelIDs`` is present in ``dfs``.
    """

    def _scatter_and_save(plot, series, ylabel):
        # x axis is run time in hours; one point per 10-minute bin.
        fig = px.scatter(
            data_frame=None,
            x=series.index.total_seconds() / 3600,
            y=series)
        fig.update_traces(marker=dict(color=color))
        fig.update_layout(xaxis_title='Run time (hours)',
                          yaxis_title=ylabel,
                          title=title or plot.title,
                          title_x=0.5)
        plot.fig = fig
        plot.html = plot.fig.to_html(full_html=False, include_plotlyjs='cdn')
        plot.save(figformat)

    num_reads = Plot(path=path + "NumberOfReads_Over_Time.html",
                     title="Number of reads over time")
    # 'min' replaces the deprecated 'T' minute alias (same binning).
    reads_per_bin = dfs.loc[:, "lengths"].resample('10min').count()
    _scatter_and_save(num_reads, reads_per_bin,
                      'Number of reads per 10 minutes')

    plots = [num_reads]

    if "channelIDs" in dfs:
        pores_over_time = Plot(path=path + "ActivePores_Over_Time.html",
                               title="Number of active pores over time")
        pores_per_bin = dfs.loc[:, "channelIDs"].resample('10min').nunique()
        _scatter_and_save(pores_over_time, pores_per_bin,
                          'Active pores per 10 minutes')
        plots.append(pores_over_time)

    return plots
Example #13
0
def cumulative_yield(dfs, path, title, color, figformat):
    """Plot cumulative yield over run time, in gigabases and in reads.

    Args:
        dfs: DataFrame with a ``lengths`` column. Its index must support
            ``resample`` and ``.total_seconds()`` (i.e. be timedelta-like).
        path: Prefix to which the output file names are appended.
        title: Optional title; each plot's default title is used when falsy.
        color: Marker colour.
        figformat: Passed unchanged to ``Plot.save``.

    Returns:
        List ``[cum_yield_gb, cum_yield_reads]`` of ``Plot`` objects.
    """

    def _scatter_and_save(plot, series, ylabel):
        # x axis is run time in hours; one point per 10-minute bin.
        fig = px.scatter(
            x=series.index.total_seconds() / 3600,
            y=series)
        fig.update_traces(marker=dict(color=color))
        fig.update_layout(xaxis_title='Run time (hours)',
                          yaxis_title=ylabel,
                          # Each plot falls back to its own default title.
                          title=title or plot.title,
                          title_x=0.5)
        plot.fig = fig
        plot.html = plot.fig.to_html(full_html=False, include_plotlyjs='cdn')
        plot.save(figformat)

    cum_yield_gb = Plot(path=path + "CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")
    # 'min' replaces the deprecated 'T' minute alias (same binning).
    gigabases = dfs.loc[:, "lengths"].cumsum().resample('10min').max() / 1e9
    _scatter_and_save(cum_yield_gb, gigabases, 'Cumulative yield in gigabase')

    cum_yield_reads = Plot(path=path + "CumulativeYieldPlot_NumberOfReads.html",
                           title="Cumulative yield")
    read_totals = dfs.loc[:, "lengths"].resample('10min').count().cumsum()
    _scatter_and_save(cum_yield_reads, read_totals,
                      'Cumulative yield in number of reads')

    return [cum_yield_gb, cum_yield_reads]
Example #14
0
def length_over_time(dfs,
                     path,
                     title,
                     settings,
                     log_length=False,
                     color="#4CB391"):
    """Build a plotly violin plot of read length per ``timebin``.

    With ``log_length`` the ``log_lengths`` column is plotted, the output
    file and default title change accordingly, and the y ticks are relabelled
    with the matching powers of ten. When ``dfs`` has a boolean
    ``length_filter`` column, only rows where it is true are drawn. The saved
    ``Plot`` object is returned.
    """
    if log_length:
        file_name = "TimeLogLengthViolinPlot.html"
        default_title = "Violin plot of log read lengths over time"
        length_column = "log_lengths"
    else:
        file_name = "TimeLengthViolinPlot.html"
        default_title = "Violin plot of read lengths over time"
        length_column = "lengths"
    time_length = Plot(path=path + file_name, title=default_title)

    # "length_filter" is produced by NanoPlot filtering of too long reads
    plotted = dfs[dfs["length_filter"]] if "length_filter" in dfs else dfs

    violins = go.Violin(x=plotted["timebin"],
                        y=plotted[length_column],
                        points=False,
                        spanmode="hard",
                        line_color='black',
                        line_width=1.5,
                        fillcolor=color,
                        opacity=0.8)

    figure = go.Figure()
    figure.add_trace(violins)
    figure.update_layout(title=title or time_length.title,
                         title_x=0.5,
                         xaxis_title='Interval (hours)',
                         yaxis_title='Read length')

    if log_length:
        # Powers of ten up to ten times the longest (unfiltered) read.
        upper_bound = 10 * np.amax(dfs["lengths"])
        ticks = [
            power for power in (10**exponent for exponent in range(10))
            if not power > upper_bound
        ]
        figure.update_layout(yaxis=dict(
            tickmode='array', tickvals=np.log10(ticks), ticktext=ticks))

    # NOTE(review): sibling time plots rotate the x-axis ticks instead;
    # confirm that rotating the y-axis ticks is intended here.
    figure.update_yaxes(tickangle=45)

    time_length.fig = figure
    time_length.html = figure.to_html(full_html=False, include_plotlyjs='cdn')
    time_length.save(settings)

    return time_length
Example #15
0
def plot_over_time(dfs, path, figformat, title, color):
    """Plot reads (and, if available, active pores) per 10 minutes of run time.

    Args:
        dfs: DataFrame with a ``lengths`` column and optionally a
            ``channelIDs`` column. Its index must support ``resample`` and
            ``.total_seconds()`` (i.e. be timedelta-like).
        path: Prefix to which the output file names are appended.
        figformat: File extension / format passed to ``Plot.save``.
        title: Optional title; each plot's default title is used when falsy.
        color: Marker colour handed to ``sns.regplot``.

    Returns:
        List of ``Plot`` objects: the read-count plot, followed by the
        active-pores plot when ``channelIDs`` is present in ``dfs``.
    """

    def _scatter_and_save(plot, series, ylabel):
        # Scatter only (no regression fit); x axis is run time in hours.
        ax = sns.regplot(x=series.index.total_seconds() / 3600,
                         y=series,
                         x_ci=None,
                         fit_reg=False,
                         color=color,
                         scatter_kws={"s": 3})
        ax.set(xlabel='Run time (hours)',
               ylabel=ylabel,
               title=title or plot.title)
        plot.fig = ax.get_figure()
        plot.save(format=figformat)
        plt.close("all")

    num_reads = Plot(path=path + "NumberOfReads_Over_Time." + figformat,
                     title="Number of reads over time")
    # 'min' replaces the deprecated 'T' minute alias (same binning).
    reads_per_bin = dfs.loc[:, "lengths"].resample('10min').count()
    _scatter_and_save(num_reads, reads_per_bin,
                      'Number of reads per 10 minutes')
    plots = [num_reads]

    if "channelIDs" in dfs:
        pores_over_time = Plot(path=path + "ActivePores_Over_Time." +
                               figformat,
                               title="Number of active pores over time")
        pores_per_bin = dfs.loc[:, "channelIDs"].resample('10min').nunique()
        _scatter_and_save(pores_over_time, pores_per_bin,
                          'Active pores per 10 minutes')
        plots.append(pores_over_time)
    return plots
Example #16
0
def compare_cumulative_yields(df, path, palette=None, title=None):
    """Plot the cumulative yield over time of every dataset in one figure.

    For each dataset the ``lengths`` are cumulatively summed, binned per
    10 minutes (maximum per bin) and expressed in gigabases.

    Args:
        df: DataFrame with ``dataset``, ``lengths`` and ``start_time``
            columns; it is first passed through ``check_valid_time_and_sort``.
        path: Prefix to which the output file name is appended.
        palette: Sequence of colours, one per dataset. Defaults to the plotly
            default colours repeated five times.
        title: Optional title; the plot's default title is used when falsy.

    Returns:
        Single-element list containing the ``Plot`` object.
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info(
        "Nanoplotter: Creating cumulative yield plots using {} reads.".format(
            len(dfs)))
    cum_yield_gb = Plot(path=path +
                        "NanoComp_CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")
    data = []
    for sample, color in zip(df["dataset"].unique(), palette):
        # 'min' replaces the deprecated 'T' minute alias (same binning).
        cumsum = dfs.loc[dfs["dataset"] == sample,
                         "lengths"].cumsum().resample('10min').max() / 1e9
        data.append(
            go.Scatter(x=cumsum.index.total_seconds() / 3600,
                       y=cumsum,
                       opacity=0.75,
                       name=sample,
                       marker=dict(color=color)))

    # Build the figure once and render the HTML fragment from it.
    fig = go.Figure(data=data,
                    layout=go.Layout(
                        barmode='overlay',
                        title=title or cum_yield_gb.title,
                        xaxis=dict(title="Time (hours)"),
                        yaxis=dict(title="Yield (gigabase)"),
                    ))
    cum_yield_gb.html = plotly.offline.plot(fig,
                                            output_type="div",
                                            show_link=False)
    cum_yield_gb.fig = fig
    cum_yield_gb.save()
    return [cum_yield_gb]
Example #17
0
def active_pores_over_time(df, path, palette=None, title=None):
    """Plot the number of distinct ``channelIDs`` per 10 minutes per dataset.

    Args:
        df: DataFrame with ``dataset``, ``channelIDs`` and ``start_time``
            columns; it is first passed through ``check_valid_time_and_sort``.
        path: Prefix to which the output file name is appended.
        palette: Sequence of colours, one per dataset. Defaults to the plotly
            default colours repeated five times.
        title: Optional title; the plot's default title is used when falsy.

    Returns:
        The ``Plot`` object (not wrapped in a list).
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info("NanoComp: Creating active pores plot using {} reads.".format(
        len(dfs)))
    active_pores = Plot(path=path + "NanoComp_ActivePoresOverTime.html",
                        title="Active pores over time")
    data = []
    for sample, color in zip(df["dataset"].unique(), palette):
        # 'min' replaces the deprecated 'T' minute alias (same binning).
        pores = dfs.loc[dfs["dataset"] == sample,
                        "channelIDs"].resample('10min').nunique()
        data.append(
            go.Scatter(x=pores.index.total_seconds() / 3600,
                       y=pores,
                       opacity=0.75,
                       name=sample,
                       marker=dict(color=color)))

    # Build the figure once and render the HTML fragment from it.
    fig = go.Figure(data=data,
                    layout=go.Layout(
                        barmode='overlay',
                        title=title or active_pores.title,
                        xaxis=dict(title="Time (hours)"),
                        yaxis=dict(title="Active pores (per 10 minutes)"),
                    ))
    active_pores.html = plotly.offline.plot(fig,
                                            output_type="div",
                                            show_link=False)
    active_pores.fig = fig
    active_pores.save()
    return active_pores
Example #18
0
def quality_over_time(dfs, path, figformat, title, plot_settings={}):
    """Draw a seaborn violin plot of ``quals`` per ``timebin``.

    ``plot_settings`` is forwarded to ``sns.set`` together with the white
    style. The figure is saved in ``figformat`` and the ``Plot`` returned.
    """
    time_qual = Plot(path=path + "TimeQualityViolinPlot." + figformat,
                     title="Violin plot of quality over time")
    sns.set(style="white", **plot_settings)

    violin_options = dict(x="timebin",
                          y="quals",
                          data=dfs,
                          inner=None,
                          cut=0,
                          linewidth=0)
    axes = sns.violinplot(**violin_options)
    axes.set(title=title or time_qual.title,
             xlabel='Interval (hours)',
             ylabel="Basecall quality")
    plt.xticks(rotation=45, ha='center', fontsize=8)

    time_qual.fig = axes.get_figure()
    time_qual.save(format=figformat)
    plt.close("all")
    return time_qual
Example #19
0
def sequencing_speed_over_time(dfs, path, figformat, title, plot_settings={}):
    """Draw a seaborn violin plot of ``lengths / duration`` per ``timebin``.

    A ``timebin`` column is added to ``dfs`` (in place) via ``add_time_bins``
    when it is missing. Rows with a zero ``duration`` are excluded from the
    plot so that no infinite speeds are drawn.

    Args:
        dfs: DataFrame with ``lengths`` and ``duration`` columns.
        path: Prefix to which the output file name is appended.
        figformat: File extension / format passed to ``Plot.save``.
        title: Optional title; the plot's default title is used when falsy.
        plot_settings: Extra keyword arguments forwarded to ``sns.set``.

    Returns:
        The ``Plot`` object.
    """
    time_duration = Plot(path=path + "TimeSequencingSpeed_ViolinPlot." + figformat,
                         title="Violin plot of sequencing speed over time")
    sns.set(style="white", **plot_settings)
    if "timebin" not in dfs:
        dfs['timebin'] = add_time_bins(dfs)

    # A zero duration would make the speed infinite; drop those reads.
    mask = dfs["duration"] != 0
    ax = sns.violinplot(x=dfs.loc[mask, "timebin"],
                        y=dfs.loc[mask, "lengths"] / dfs.loc[mask, "duration"],
                        inner=None,
                        cut=0,
                        linewidth=0)
    ax.set(xlabel='Interval (hours)',
           ylabel="Sequencing speed (nucleotides/second)",
           title=title or time_duration.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    time_duration.fig = ax.get_figure()
    time_duration.save(format=figformat)
    plt.close("all")
    return time_duration
Example #20
0
def compare_sequencing_speed(df, figformat, path, title=None, palette=None):
    """Compare sequencing speed over time between datasets as violins.

    Speed is ``lengths / duration``; reads are grouped by the time bins from
    ``add_time_bins`` and coloured per ``dataset``. Rows whose ``duration``
    is not positive are excluded so that no infinite speeds are drawn.

    Args:
        df: DataFrame with ``dataset``, ``lengths``, ``duration`` and
            ``start_time`` columns; it is first passed through
            ``check_valid_time_and_sort``.
        figformat: File extension / format passed to ``Plot.save``.
        path: Prefix to which the output file name is appended.
        title: Optional title; the plot's default title is used when falsy.
        palette: Palette handed to ``sns.violinplot`` for the hue levels.

    Returns:
        Single-element list containing the ``Plot`` object.
    """
    logging.info(
        "Nanoplotter: creating comparison of sequencing speed over time.")
    seq_speed = Plot(path=path + "NanoComp_sequencing_speed_over_time." +
                     figformat,
                     title="Sequencing speed over time")
    dfs = check_valid_time_and_sort(df, "start_time")
    dfs['timebin'] = add_time_bins(dfs)

    # Non-positive durations give infinite or negative speeds; leave them out.
    timed = dfs.loc[dfs["duration"] > 0]
    ax = sns.violinplot(x=timed["timebin"],
                        y=timed["lengths"] / timed["duration"],
                        hue=timed["dataset"],
                        palette=palette,
                        inner=None,
                        cut=0,
                        linewidth=0)
    # title and palette were accepted but silently ignored before.
    ax.set(xlabel='Interval (hours)',
           ylabel="Sequencing speed (nucleotides/second)",
           title=title or seq_speed.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    seq_speed.fig = ax.get_figure()
    seq_speed.save(format=figformat)
    plt.close("all")
    return [seq_speed]
def yield_by_minimal_length_plot(array, name, path,
                                 title=None, color="#4CB391", figformat="png"):
    """Scatter the cumulative yield against minimal read length.

    Lengths are sorted in descending order and cumulatively summed (divided
    by 10**9), so each point gives the yield from reads at least that long.
    ``name`` is not used. The saved ``Plot`` object is returned.
    """
    descending_lengths = np.sort(array)[::-1]
    table = pd.DataFrame(data={"lengths": descending_lengths})
    table["cumyield_gb"] = table["lengths"].cumsum() / 10**9

    yield_by_length = Plot(path=path + "Yield_By_Length." + figformat,
                           title="Yield by length")

    # Scatter only: the regression fit is switched off.
    axes = sns.regplot(data=table,
                       x='lengths',
                       y="cumyield_gb",
                       x_ci=None,
                       fit_reg=False,
                       color=color,
                       scatter_kws={"s": 3})
    axes.set(title=title or yield_by_length.title,
             xlabel='Read length',
             ylabel='Cumulative yield for minimal length')

    yield_by_length.fig = axes.get_figure()
    yield_by_length.save(format=figformat)
    plt.close("all")
    return yield_by_length
Example #22
0
def n50_barplot(df, path, settings, title=None):
    """Create a plotly bar chart of the read length N50 of every dataset.

    The N50 is computed with ``get_N50`` on the sorted ``aligned_lengths`` of
    each dataset when that column exists, otherwise on ``lengths``. One bar
    trace is added per dataset, in order of first appearance in ``df``.

    Args:
        df: DataFrame with a ``dataset`` column and a ``lengths`` and/or
            ``aligned_lengths`` column.
        path: Prefix to which ``"NanoComp_N50.html"`` is appended.
        settings: Passed unchanged to ``Plot.save``.
        title: Optional title; the plot's default title is used when falsy.

    Returns:
        Single-element list containing the ``Plot`` object.
    """
    n50_bar = Plot(path=path + "NanoComp_N50.html",
                   title="Comparing read length N50")
    if "aligned_lengths" in df:
        length_column = "aligned_lengths"
        # The bars show an N50, not a throughput; the previous label
        # ('Total gigabase aligned') was copied from the throughput plot.
        ylabel = 'Aligned read length N50'
    else:
        length_column = "lengths"
        ylabel = 'Sequenced read length N50'

    datasets = df["dataset"].unique()
    n50s = [
        get_N50(np.sort(df.loc[df["dataset"] == d, length_column]))
        for d in datasets
    ]

    n50_bar.fig = go.Figure()

    for dataset, n50 in zip(datasets, n50s):
        n50_bar.fig.add_trace(go.Bar(x=[dataset], y=[n50], name=dataset))

    n50_bar.fig.update_layout(
        title=title or n50_bar.title,
        title_x=0.5,
        yaxis_title=ylabel,
    )

    n50_bar.html = n50_bar.fig.to_html(full_html=False, include_plotlyjs='cdn')
    n50_bar.save(settings)
    return [n50_bar]
Example #23
0
def compare_sequencing_speed(df, path, settings, title=None):
    """Compare the median sequencing speed over time between datasets.

    Speed is ``lengths / duration`` for reads with a positive ``duration``;
    for every dataset the median per 30 minutes is drawn as a line.

    Args:
        df: DataFrame with ``dataset``, ``lengths``, ``duration`` and
            ``start_time`` columns; it is first passed through
            ``check_valid_time_and_sort``.
        path: Prefix to which the output file name is appended.
        settings: Passed unchanged to ``Plot.save``.
        title: Optional title; the plot's default title is used when falsy.

    Returns:
        Single-element list containing the ``Plot`` object.
    """
    logging.info(
        "NanoComp: creating comparison of sequencing speed over time.")
    seq_speed = Plot(path=path + "NanoComp_sequencing_speed_over_time.html",
                     title="Sequencing speed over time")

    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")
    dfs = dfs.loc[dfs["duration"] > 0]

    palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5

    data = []
    for sample, color in zip(df["dataset"].unique(), palette):
        reads = dfs.loc[dfs["dataset"] == sample]
        # 'min' replaces the deprecated 'T' minute alias (same binning).
        seqspeed = (reads["lengths"] /
                    reads["duration"]).resample('30min').median()
        data.append(
            go.Scatter(x=seqspeed.index.total_seconds() / 3600,
                       y=seqspeed,
                       opacity=0.75,
                       name=sample,
                       mode='lines',
                       marker=dict(color=color)))

    seq_speed.fig = go.Figure({"data": data})

    seq_speed.fig.update_layout(
        title=title or seq_speed.title,
        title_x=0.5,
        xaxis_title='Interval (hours)',
        yaxis_title="Sequencing speed (nucleotides/second)")

    seq_speed.html = seq_speed.fig.to_html(full_html=False,
                                           include_plotlyjs='cdn')
    seq_speed.save(settings)
    return [seq_speed]
Example #24
0
def length_plots(array,
                 name,
                 path,
                 settings,
                 title=None,
                 n50=None,
                 color="#4CB391"):
    """Create histograms of raw and log10-transformed read lengths.

    For a length-weighted and an unweighted variant, a histogram of ``array``
    and one of ``log10(array)`` are computed with ``np.histogram``, drawn as
    plotly bar charts and saved. The plot returned by
    ``yield_by_minimal_length_plot`` is appended last.

    Args:
        array: Read lengths. Its maximum and its log10 are taken, so it is
            expected to be non-empty and positive.
        name: Label used in log messages and (spaces removed) in file names.
        path: Prefix to which the output file names are appended.
        settings: Passed unchanged to ``Plot.save`` and
            ``yield_by_minimal_length_plot``.
        title: Optional title; falls back to each plot's default title.
        n50: Optional N50 value; when truthy it is marked with a vertical line.
        color: Bar colour.

    Returns:
        List of ``Plot`` objects: per variant the histogram and the
        log-transformed histogram, followed by the yield plot.
    """
    logging.info("NanoPlot:  Creating length plots for {}.".format(name))
    maxvalx = np.amax(array)
    if n50:
        logging.info(
            "NanoPlot: Using {} reads with read length N50 of {}bp and maximum of {}bp."
            .format(array.size, n50, maxvalx))
    else:
        logging.info(
            f"NanoPlot:  Using {array.size} reads maximum of {maxvalx}bp.")

    # Values shared by every histogram are computed once.
    n_bins = max(round(int(maxvalx) / 500), 10)
    log_lengths = np.log10(array)
    log_ticks = [10**i for i in range(10) if not 10**i > 10 * maxvalx]
    file_suffix = name.replace(' ', '')

    hist_types = [{
        'weight': array,
        'name': 'Weighted',
        # Weighting every read by its length makes the bars count bases.
        'ylabel': 'Number of bases'
    }, {
        'weight': None,
        'name': 'Non weighted',
        'ylabel': 'Number of reads'
    }]

    plots = []

    for h_type in hist_types:
        file_prefix = path + h_type["name"].replace(" ", "_")

        histogram = Plot(path=file_prefix + "Histogram" + file_suffix +
                         ".html",
                         title=f"{h_type['name']} histogram of read lengths")

        hist, bin_edges = np.histogram(array,
                                       bins=n_bins,
                                       weights=h_type["weight"])

        fig = go.Figure()
        fig.add_trace(go.Bar(x=bin_edges[1:], y=hist, marker_color=color))

        if n50:
            fig.add_vline(n50)
            # yref="paper" places the label at 95% of the plot height rather
            # than at y=0.95 in data units, which sits on the axis for counts.
            fig.add_annotation(text='N50', x=n50, y=0.95, yref="paper")
            fig.update_annotations(font_size=8)

        fig.update_layout(xaxis_title='Read length',
                          yaxis_title=h_type["ylabel"],
                          title=title or histogram.title,
                          title_x=0.5)

        histogram.fig = fig
        histogram.html = histogram.fig.to_html(full_html=False,
                                               include_plotlyjs='cdn')
        histogram.save(settings)

        log_histogram = Plot(
            path=file_prefix + "LogTransformed_Histogram" + file_suffix +
            ".html",
            title=h_type["name"] +
            " histogram of read lengths after log transformation")

        # Only the x-axis is log-transformed; the weights stay the raw read
        # lengths so that the weighted bars keep representing bases.
        hist_log, bin_edges_log = np.histogram(log_lengths,
                                               bins=n_bins,
                                               weights=h_type["weight"])

        fig = go.Figure()
        fig.add_trace(
            go.Bar(x=bin_edges_log[1:], y=hist_log, marker_color=color))

        fig.update_layout(xaxis=dict(tickmode='array',
                                     tickvals=np.log10(log_ticks),
                                     ticktext=log_ticks),
                          xaxis_title='Read length',
                          yaxis_title=h_type["ylabel"],
                          title=title or log_histogram.title,
                          title_x=0.5)

        if n50:
            fig.add_vline(np.log10(n50))
            fig.add_annotation(text='N50',
                               x=np.log10(n50),
                               y=0.95,
                               yref="paper")
            fig.update_annotations(font_size=8)

        log_histogram.fig = fig
        log_histogram.html = log_histogram.fig.to_html(full_html=False,
                                                       include_plotlyjs='cdn')
        log_histogram.save(settings)

        plots.extend([histogram, log_histogram])

    plots.append(
        yield_by_minimal_length_plot(array=array,
                                     name=name,
                                     path=path,
                                     title=title,
                                     color=color,
                                     settings=settings))

    return plots
Example #25
0
def violin_or_box_plot(df,
                       y,
                       path,
                       y_name,
                       settings,
                       title=None,
                       plot="violin",
                       log=False):
    """Build a plotly violin, box or ridge plot comparing datasets.

    One trace is added per unique value of the 'dataset' column of ``df``;
    the plotted values come from column ``y``. Violin and box figures are
    handed to ``process_violin_and_box``; ridge figures are laid out and
    saved here. An unknown ``plot`` value is logged and terminates the
    program through ``sys.exit``.

    Returns a one-element list with the ``Plot`` object.
    """
    comp = Plot(path=f"{path}NanoComp_{y.replace(' ', '_')}_{plot}.html",
                title=f"Comparing {y_name.lower()}")

    if plot in ('violin', 'box'):
        logging.info(f"NanoComp: Creating {plot} plot for {y}.")

        # Violin traces suppress the individual points, box traces do not.
        if plot == 'violin':
            make_trace, trace_options = go.Violin, {"points": False}
        else:
            make_trace, trace_options = go.Box, {}

        fig = go.Figure()
        for dataset in df["dataset"].unique():
            selected = df["dataset"] == dataset
            fig.add_trace(
                make_trace(x=df["dataset"][selected],
                           y=df[y][selected],
                           name=dataset,
                           **trace_options))

        process_violin_and_box(fig,
                               log=log,
                               plot_obj=comp,
                               title=title,
                               y_name=y_name,
                               ymax=np.amax(df[y]),
                               settings=settings)

    elif plot == 'ridge':
        logging.info(f"NanoComp: Creating ridges plot for {y}.")

        fig = go.Figure()
        for dataset in df["dataset"].unique():
            values = df[y][df['dataset'] == dataset]
            fig.add_trace(go.Violin(x=values, name=dataset))

        # Horizontal, one-sided violins give the ridge appearance.
        fig.update_traces(orientation='h',
                          side='positive',
                          width=3,
                          points=False)
        fig.update_layout(title=title or comp.title, title_x=0.5)

        comp.fig = fig
        comp.html = comp.fig.to_html(full_html=False, include_plotlyjs='cdn')
        comp.save(settings)

    else:
        message = f"Unknown comp plot type {plot}"
        logging.error(message)
        sys.exit(message)

    return [comp]
def scatter(x, y, names, path, plots, color="#4CB391", figformat="png",
            stat=None, log=False, minvalx=0, minvaly=0, title=None,
            plot_settings=None, xmax=None, ymax=None):
    """Create bivariate plots.

    Create four types of bivariate plots of x vs y, containing marginal summaries
    -A scatter plot with histograms on axes
    -A hexagonal binned plot with histograms on axes
    -A kernel density plot with density curves on axes
    -A pauvre-style plot using code from https://github.com/conchoecia/pauvre

    Args:
        x, y: Values to plot; ``x.size`` and ``x.index`` are used, and the
            kde plot indexes both with a sample of ``x.index``.
        names: Two axis labels, ``names[0]`` for x and ``names[1]`` for y.
        path: Prefix to which "_hex.", "_dot.", "_kde." or "_pauvre." and
            ``figformat`` are appended.
        plots: Mapping with "hex", "dot", "kde" and "pauvre" entries that
            switch the individual plots on.
        color: Colour passed to seaborn.
        figformat: Output format, used in file names and when saving.
        stat: Passed to ``sns.jointplot`` as ``stat_func``.
        log: When true, x is treated as log10-transformed: tick labels show
            powers of ten and the plot titles get a suffix.
        minvalx, minvaly: Lower axis limits.
        title: Optional figure title overriding the default.
        plot_settings: Optional keyword arguments forwarded to ``sns.set``.
        xmax, ymax: Optional upper axis limits; default to the data maximum.

    Returns:
        List of the ``Plot`` objects created; empty when
        ``contains_variance`` rejects the input.
    """
    # Avoid a shared mutable default argument.
    if plot_settings is None:
        plot_settings = {}

    logging.info("Nanoplotter: Creating {} vs {} plots using statistics from {} reads.".format(
        names[0], names[1], x.size))
    if not contains_variance([x, y], names):
        return []
    sns.set(style="ticks", **plot_settings)
    maxvalx = xmax or np.amax(x)
    maxvaly = ymax or np.amax(y)

    plots_made = []

    def finish(grid, plot_obj):
        """Label, title and save a joint grid, then register its Plot."""
        grid.set_axis_labels(names[0], names[1])
        if log:
            plot_obj.title = plot_obj.title + " after log transformation of read lengths"
            # x holds log10 values: place ticks at powers of ten and label
            # them with the untransformed numbers.
            ticks = [10**i for i in range(10) if not 10**i > 10 * (10**maxvalx)]
            grid.ax_joint.set_xticks(np.log10(ticks))
            grid.ax_marg_x.set_xticks(np.log10(ticks))
            grid.ax_joint.set_xticklabels(ticks)
        plt.subplots_adjust(top=0.90)
        grid.fig.suptitle(title or "{} vs {} plot".format(names[0], names[1]), fontsize=25)
        plot_obj.fig = grid
        plot_obj.save(format=figformat)
        plots_made.append(plot_obj)

    if plots["hex"]:
        hex_plot = Plot(
            path=path + "_hex." + figformat,
            title="{} vs {} plot using hexagonal bins".format(names[0], names[1]))
        grid = sns.jointplot(
            x=x,
            y=y,
            kind="hex",
            color=color,
            stat_func=stat,
            space=0,
            xlim=(minvalx, maxvalx),
            ylim=(minvaly, maxvaly),
            height=10)
        finish(grid, hex_plot)

    sns.set(style="darkgrid", **plot_settings)
    if plots["dot"]:
        dot_plot = Plot(
            path=path + "_dot." + figformat,
            title="{} vs {} plot using dots".format(names[0], names[1]))
        grid = sns.jointplot(
            x=x,
            y=y,
            kind="scatter",
            color=color,
            stat_func=stat,
            xlim=(minvalx, maxvalx),
            ylim=(minvaly, maxvaly),
            space=0,
            height=10,
            joint_kws={"s": 1})
        finish(grid, dot_plot)

    if plots["kde"]:
        # The density estimate is computed on at most 2000 random reads.
        idx = np.random.choice(x.index, min(2000, len(x)), replace=False)
        kde_plot = Plot(
            path=path + "_kde." + figformat,
            title="{} vs {} plot using a kernel density estimation".format(names[0], names[1]))
        grid = sns.jointplot(
            x=x[idx],
            y=y[idx],
            kind="kde",
            # np.inf instead of np.Inf: the capitalised alias is gone in NumPy 2.
            clip=((0, np.inf), (0, np.inf)),
            xlim=(minvalx, maxvalx),
            ylim=(minvaly, maxvaly),
            space=0,
            color=color,
            stat_func=stat,
            shade_lowest=False,
            height=10)
        finish(grid, kde_plot)

    if plots["pauvre"] and names == ['Read lengths', 'Average read quality'] and log is False:
        pauvre_plot = Plot(
            path=path + "_pauvre." + figformat,
            title="{} vs {} plot using pauvre-style @conchoecia".format(names[0], names[1]))
        sns.set(style="white", **plot_settings)
        margin_plot(df=pd.DataFrame({"length": x, "meanQual": y}),
                    Y_AXES=False,
                    title=title or "Length vs Quality in Pauvre-style",
                    plot_maxlen=None,
                    plot_minlen=0,
                    plot_maxqual=None,
                    plot_minqual=0,
                    lengthbin=None,
                    qualbin=None,
                    BASENAME="whatever",
                    path=pauvre_plot.path,
                    fileform=[figformat],
                    dpi=600,
                    TRANSPARENT=True,
                    QUIET=True)
        plots_made.append(pauvre_plot)
    plt.close("all")
    return plots_made
def length_plots(array, name, path, title=None, n50=None, color="#4CB391", figformat="png"):
    """Draw seaborn histograms of read lengths, raw and log10-transformed.

    Two variants are produced, unweighted (reads) and weighted by ``array``
    (bases), each as a plain and a log-transformed histogram. When ``n50`` is
    truthy it is marked by a vertical line labelled at the tallest bar. The
    plot from ``yield_by_minimal_length_plot`` is appended at the end.

    Returns the list of ``Plot`` objects.
    """
    logging.info("Nanoplotter: Creating length plots for {}.".format(name))
    maxvalx = np.amax(array)
    if n50:
        logging.info("Nanoplotter: Using {} reads with read length N50 of {}bp and maximum of {}bp."
                     .format(array.size, n50, maxvalx))
    else:
        logging.info("Nanoplotter: Using {} reads maximum of {}bp.".format(array.size, maxvalx))

    compact_name = name.replace(' ', '')
    # (weights, title/file prefix, y-axis label) for each histogram variant.
    variants = (
        (None, "", "Number of reads"),
        (array, "Weighted ", "Number of bases"),
    )

    plots = []
    for weights, prefix, ylabel in variants:
        file_prefix = path + prefix.replace(" ", "_")

        # --- histogram on the original scale ---
        histogram = Plot(
            path=file_prefix + "Histogram" + compact_name + "." + figformat,
            title=prefix + "Histogram of read lengths")
        ax = sns.distplot(
            a=array,
            kde=False,
            hist=True,
            bins=max(round(int(maxvalx) / 500), 10),
            color=color,
            hist_kws={"weights": weights,
                      "edgecolor": color,
                      "linewidth": 0.2,
                      "alpha": 0.8})
        if n50:
            plt.axvline(n50)
            tallest = np.amax([bar.get_height() for bar in ax.patches])
            plt.annotate('N50', xy=(n50, tallest), size=8)
        ax.set(
            xlabel='Read length',
            ylabel=ylabel,
            title=title or histogram.title)
        plt.ticklabel_format(style='plain', axis='y')
        histogram.fig = ax.get_figure()
        histogram.save(format=figformat)
        plt.close("all")

        # --- histogram after log10 transformation of the lengths ---
        log_histogram = Plot(
            path=file_prefix + "LogTransformed_Histogram" + compact_name + "." + figformat,
            title=prefix + "Histogram of read lengths after log transformation")
        ax = sns.distplot(
            a=np.log10(array),
            kde=False,
            hist=True,
            color=color,
            hist_kws={"weights": weights,
                      "edgecolor": color,
                      "linewidth": 0.2,
                      "alpha": 0.8})
        ticks = [10**i for i in range(10) if not 10**i > 10 * maxvalx]
        ax.set(
            xticks=np.log10(ticks),
            xticklabels=ticks,
            xlabel='Read length',
            ylabel=ylabel,
            title=title or log_histogram.title)
        if n50:
            plt.axvline(np.log10(n50))
            tallest = np.amax([bar.get_height() for bar in ax.patches])
            plt.annotate('N50', xy=(np.log10(n50), tallest), size=8)
        plt.ticklabel_format(style='plain', axis='y')
        log_histogram.fig = ax.get_figure()
        log_histogram.save(format=figformat)
        plt.close("all")

        plots.extend([histogram, log_histogram])

    yield_plot = yield_by_minimal_length_plot(array=array,
                                              name=name,
                                              path=path,
                                              title=title,
                                              color=color,
                                              figformat=figformat)
    plots.append(yield_plot)
    return plots
Example #28
0
def violin_or_box_plot(df,
                       y,
                       figformat,
                       path,
                       y_name,
                       title=None,
                       plot="violin",
                       log=False,
                       palette=None):
    """Draw a seaborn violin plot, seaborn box plot or joypy ridge plot.

    Datasets are separated by the 'dataset' column of ``df`` and the plotted
    values are taken from column ``y``. Violin and box axes are passed on to
    ``process_violin_and_box``; the ridge plot is saved here. An unknown
    ``plot`` value is logged and ends the program via ``sys.exit``.

    Returns a one-element list with the ``Plot`` object.
    """
    comp = Plot(path=path + "NanoComp_" + y.replace(' ', '_') + '.' +
                figformat,
                title="Comparing {}".format(y_name.lower()))

    if plot in ('violin', 'box'):
        logging.info("NanoComp: Creating {} plot for {}.".format(plot, y))
        if plot == 'violin':
            ax = sns.violinplot(x="dataset",
                                y=y,
                                data=df,
                                inner=None,
                                cut=0,
                                palette=palette,
                                linewidth=0)
        else:
            ax = sns.boxplot(x="dataset", y=y, data=df, palette=palette)
        process_violin_and_box(ax=ax,
                               log=log,
                               plot_obj=comp,
                               title=title,
                               y_name=y_name,
                               figformat=figformat,
                               ymax=np.amax(df[y]))
    elif plot == 'ridge':
        logging.info("NanoComp: Creating ridges plot for {}.".format(y))
        comp.fig, axes = joypy.joyplot(df,
                                       by="dataset",
                                       column=y,
                                       title=title or comp.title,
                                       x_range=[-0.05, np.amax(df[y])])
        bottom_axis = axes[-1]
        if log:
            # Tick labels hold log10 values; show them as plain numbers.
            exponents = [
                float(label.get_text())
                for label in bottom_axis.get_xticklabels()
            ]
            bottom_axis.set_xticklabels([10**e for e in exponents])
        bottom_axis.set_xticklabels(bottom_axis.get_xticklabels(),
                                    rotation=30,
                                    ha='center')
        comp.save(format=figformat)
    else:
        message = "Unknown comp plot type {}".format(plot)
        logging.error(message)
        sys.exit(message)

    plt.close("all")
    return [comp]
Example #29
0
def scatter_legacy(x,
                   y,
                   names,
                   path,
                   plots,
                   color,
                   settings,
                   stat=None,
                   log=False,
                   minvalx=0,
                   minvaly=0,
                   title=None,
                   xmax=None,
                   ymax=None):
    """Create bivariate plots.

    Create bivariate plots of x vs y with seaborn, containing marginal summaries
    -A scatter plot with histograms on axes
    -A hexagonal binned plot with histograms on axes
    -A kernel density plot with density curves on axes

    Args:
        x, y: Values to plot; ``x.size`` and ``x.index`` are used, and the
            kde plot indexes both with a sample of ``x.index``.
        names: Two axis labels, ``names[0]`` for x and ``names[1]`` for y.
        path: Output prefix; "_legacy", the plot kind and the format are
            appended.
        plots: Mapping with "hex", "dot" and "kde" entries that switch the
            individual plots on.
        color: Colour passed to seaborn.
        settings: Mapping whose "format" entry selects the file format
            ("webp" and "json" fall back to "png"); also passed to
            ``Plot.save``.
        stat: Passed to ``sns.jointplot`` as ``stat_func``.
        log: When true, x is treated as log10-transformed: tick labels show
            powers of ten and file names/titles reflect the transformation.
        minvalx, minvaly: Lower axis limits.
        title: Optional figure title overriding the default.
        xmax, ymax: Optional upper axis limits; default to the data maximum.

    Returns:
        List of the ``Plot`` objects created; empty when seaborn/matplotlib
        cannot be imported or ``contains_variance`` rejects the input.
    """
    try:
        import matplotlib as mpl
        mpl.use('Agg')
        import seaborn as sns
        import matplotlib.pyplot as plt
    except ImportError:
        # sys.stderr is a stream, not a callable: write the message to it.
        sys.stderr.write(
            "NanoPlot needs seaborn and matplotlib with --legacy\n")
        return []
    figformat = settings["format"]
    if figformat in ["webp", "json"]:
        figformat = "png"

    logging.info(
        f"NanoPlot: Creating {names[0]} vs {names[1]} legacy plots using {x.size} reads."
    )
    if not contains_variance([x, y], names):
        return []
    sns.set(style="ticks")
    maxvalx = xmax or np.amax(x)
    maxvaly = ymax or np.amax(y)

    plots_made = []
    path = path + "_legacy"

    def new_plot(kind, description):
        """Return the Plot for one plot kind, named according to `log`."""
        plot_title = f"{names[0]} vs {names[1]} plot using {description}"
        if log:
            return Plot(path=path + "_loglength_" + kind + "." + figformat,
                        title=plot_title +
                        " after log transformation of read lengths")
        return Plot(path=path + "_" + kind + "." + figformat,
                    title=plot_title)

    def finish(grid, plot_obj):
        """Label, title and save a joint grid, then register its Plot."""
        grid.set_axis_labels(names[0], names[1])
        if log:
            # x holds log10 values: place ticks at powers of ten and label
            # them with the untransformed numbers.
            ticks = [
                10**i for i in range(10) if not 10**i > 10 * (10**maxvalx)
            ]
            grid.ax_joint.set_xticks(np.log10(ticks))
            grid.ax_marg_x.set_xticks(np.log10(ticks))
            grid.ax_joint.set_xticklabels(ticks)
        plt.subplots_adjust(top=0.90)
        grid.fig.suptitle(title or f"{names[0]} vs {names[1]} plot",
                          fontsize=25)
        plot_obj.fig = grid
        plot_obj.save(settings)
        plots_made.append(plot_obj)

    if plots["hex"]:
        hex_plot = new_plot("hex", "hexagonal bins")
        grid = sns.jointplot(x=x,
                             y=y,
                             kind="hex",
                             color=color,
                             stat_func=stat,
                             space=0,
                             xlim=(minvalx, maxvalx),
                             ylim=(minvaly, maxvaly),
                             height=10)
        finish(grid, hex_plot)

    sns.set(style="darkgrid")
    if plots["dot"]:
        dot_plot = new_plot("dot", "dots")
        grid = sns.jointplot(x=x,
                             y=y,
                             kind="scatter",
                             color=color,
                             stat_func=stat,
                             xlim=(minvalx, maxvalx),
                             ylim=(minvaly, maxvaly),
                             space=0,
                             height=10,
                             joint_kws={"s": 1})
        finish(grid, dot_plot)

    if plots["kde"]:
        if len(x) > 2:
            # The density estimate is computed on at most 2000 random reads.
            idx = np.random.choice(x.index, min(2000, len(x)), replace=False)
            kde_plot = new_plot("kde", "a kernel density estimation")
            grid = sns.jointplot(x=x[idx],
                                 y=y[idx],
                                 kind="kde",
                                 # np.Inf no longer exists in NumPy 2.
                                 clip=((0, np.inf), (0, np.inf)),
                                 xlim=(minvalx, maxvalx),
                                 ylim=(minvaly, maxvaly),
                                 space=0,
                                 color=color,
                                 stat_func=stat,
                                 shade_lowest=False,
                                 height=10)
            finish(grid, kde_plot)
        else:
            sys.stderr.write(
                "Not enough observations (reads) to create a kde plot.\n")
            logging.info(
                "NanoPlot: Not enough observations (reads) to create a kde plot"
            )
    plt.close("all")
    return plots_made
Example #30
0
def scatter(x,
            y,
            legacy,
            names,
            path,
            plots,
            color,
            colormap,
            settings,
            stat=None,
            log=False,
            minvalx=0,
            minvaly=0,
            title=None,
            xmax=None,
            ymax=None):
    """Create plotly bivariate plots of x versus y.

    A random sample of at most 10000 reads is plotted:
    - ``plots["dot"]``: scatter plot with marginal histograms
    - ``plots["kde"]``: 2D density plot from ``ff.create_2d_density``
    When any value in ``legacy`` equals 1, ``scatter_legacy`` is called on
    the same sample and its plots are added to the result.

    Returns the list of ``Plot`` objects created; empty when
    ``contains_variance`` rejects the input.
    """
    logging.info(
        f"NanoPlot: Creating {names[0]} vs {names[1]} plots using {x.size} reads."
    )
    if not contains_variance([x, y], names):
        return []

    idx = np.random.choice(x.index, min(10000, len(x)), replace=False)
    x_sample = x[idx]
    y_sample = y[idx]
    maxvalx = xmax or np.amax(x_sample)
    maxvaly = ymax or np.amax(y_sample)

    plots_made = []

    def log_xaxis():
        """Return x-axis settings showing powers of ten at log10 positions."""
        ticks = [10**i for i in range(10) if not 10**i > 10 * (10**maxvalx)]
        return dict(tickmode='array',
                    tickvals=np.log10(ticks),
                    ticktext=ticks,
                    tickangle=45)

    def store(plot_obj, figure):
        """Attach the figure and its HTML to the Plot, save and register it."""
        plot_obj.fig = figure
        plot_obj.html = plot_obj.fig.to_html(full_html=False,
                                             include_plotlyjs='cdn')
        plot_obj.save(settings)
        plots_made.append(plot_obj)

    if plots["dot"]:
        dot_title = f"{names[0]} vs {names[1]} plot using dots"
        if log:
            dot_plot = Plot(path=path + "_loglength_dot.html",
                            title=dot_title +
                            " after log transformation of read lengths")
        else:
            dot_plot = Plot(path=path + "_dot.html", title=dot_title)

        fig = px.scatter(x=x_sample,
                         y=y_sample,
                         marginal_x="histogram",
                         marginal_y="histogram",
                         range_x=[minvalx, maxvalx],
                         range_y=[minvaly, maxvaly])
        fig.update_traces(marker=dict(color=color))
        fig.update_yaxes(rangemode="tozero")
        fig.update_xaxes(rangemode="tozero")
        fig.update_layout(xaxis_title=names[0],
                          yaxis_title=names[1],
                          title=title or dot_plot.title,
                          title_x=0.5)
        if log:
            fig.update_layout(xaxis=log_xaxis())

        store(dot_plot, fig)

    if plots["kde"]:
        kde_suffix = "_loglength_kde.html" if log else "_kde.html"
        kde_plot = Plot(path=path + kde_suffix,
                        title=f"{names[0]} vs {names[1]} kde plot")

        col = hex_to_rgb_scale_0_1(color)
        fig = ff.create_2d_density(x_sample,
                                   y_sample,
                                   point_size=3,
                                   hist_color=col,
                                   point_color=col,
                                   colorscale=colormap)
        fig.update_layout(xaxis_title=names[0],
                          yaxis_title=names[1],
                          title=title or kde_plot.title,
                          title_x=0.5,
                          xaxis=dict(tickangle=45))
        if log:
            fig.update_layout(xaxis=log_xaxis())

        store(kde_plot, fig)

    if 1 in legacy.values():
        # The legacy plots use the settings returned by utils.get_args(),
        # not the ``settings`` argument of this function.
        legacy_settings, _ = utils.get_args()
        plots_made += scatter_legacy(x=x_sample,
                                     y=y_sample,
                                     names=names,
                                     path=path,
                                     plots=legacy,
                                     color=color,
                                     settings=legacy_settings,
                                     stat=stat,
                                     log=log,
                                     minvalx=minvalx,
                                     minvaly=minvaly,
                                     title=title)
    return plots_made