def scatter(files, tracksheets, ttypes, colors):
    """Scatter the significant samples: mutation count vs. proportional increase.

    The table returned by ``_load(files)`` is left-merged with
    ``tracksheets`` and every row gets a ``ttype`` looked up in ``ttypes``
    through its ``tumor_name``. Only rows with ``qvals_snr < 0.1`` and
    ``snr > 8`` are drawn, one marker per row, coloured by
    ``colors[ttype]``.

    Args:
        files: forwarded unchanged to ``_load``.
        tracksheets: table merged onto the loaded data (``how='left'``).
        ttypes: mapping from ``tumor_name`` to a tumor type.
        colors: mapping from tumor type to a matplotlib colour.
    """
    nuceriod_plt.config_params(12)

    merged = _load(files).merge(tracksheets, how='left')
    merged['ttype'] = merged['tumor_name'].map(ttypes)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 4))
    ax.set_xscale('log')

    # Significance filter: low q-value together with a high SNR.
    is_significant = (merged['qvals_snr'] < 0.1) & (merged['snr'] > 8)
    for _, sample in merged[is_significant].iterrows():
        ax.scatter(sample['muts'],
                   sample['prop_increase_in'],
                   s=14,
                   color=colors[sample['ttype']])

    ax.set_title('Significant')
    ax.set_ylabel('Proportion of increase minor in')
    ax.set_xlabel('Number of mutations (log)')
    # Dashed reference line at "no increase".
    ax.hlines(0, 0, 1000000, linestyle='--', color='grey', alpha=0.6)

    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.set_ylim(-0.25, 0.25)
def rotational_bars(rot_high_files, rot_low_files):
    """Draw one low/high SNR bar pair per signature, stacked vertically.

    Both inputs are loaded with ``increase.load_d`` and tagged with a
    ``rot`` column (``'high'`` / ``'low'``). The combined table is grouped
    by ``name`` and each group gets its own subplot with two bars (low
    first, then high), coloured with ``COLORS[name]``.

    Args:
        rot_high_files: forwarded to ``increase.load_d``; rows tagged 'high'.
        rot_low_files: forwarded to ``increase.load_d``; rows tagged 'low'.

    Raises:
        IndexError: if a group has no row for one of the two ``rot`` values.
        KeyError: if a group name is missing from ``COLORS``.
    """
    nuceriod_plt.config_params(11)

    df_high = increase.load_d(rot_high_files)
    df_high['rot'] = 'high'

    df_low = increase.load_d(rot_low_files)
    df_low['rot'] = 'low'

    df = pd.concat([df_high, df_low])

    # Group once and reuse the result for both the subplot count and the loop.
    groups = list(df.groupby(by='name'))

    # squeeze=False always yields a 2-D array of Axes, so a single group
    # (nrows=1) can be indexed the same way as several.
    fig, axs = plt.subplots(nrows=len(groups),
                            ncols=1,
                            figsize=(1.75, 8),
                            squeeze=False)
    order = ['low', 'high']
    xvals = list(range(len(order)))

    for ax, (sig, data) in zip(axs[:, 0], groups):
        yvals = [data[data['rot'] == rot]['snr'].iloc[0] for rot in order]
        ax.bar(xvals,
               yvals,
               color=[COLORS[sig]] * len(order),
               label=['low', 'high'])
        ax.set_xticks([0, 1])
        ax.set_xticklabels(('low', 'high'), fontsize=11)
        ax.set_ylabel('SNR')
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
    plt.tight_layout()
def compare(files_deconstructsigs, files_sigfit):
    """Scatter signed log2(SNR) from deconstructSigs (x) against sigfit (y).

    Both inputs are loaded with ``increase.load_d`` and tagged in a
    ``control`` column. Only names that have exactly two rows in the
    combined table are plotted. The sign of each point follows the sign of
    ``cross_validation_max`` in the first row of the group (negative ->
    mirrored onto the negative half of both axes).

    A least-squares line of the sigfit values on the deconstructSigs values
    (in the plotted, signed-log2 space) is drawn, and the Pearson
    correlation of the raw SNR values is printed on the figure.

    Args:
        files_deconstructsigs: forwarded to ``increase.load_d``.
        files_sigfit: forwarded to ``increase.load_d``.
    """
    nuceriod_plt.config_params(11)

    df_deconstructsigs = increase.load_d(files_deconstructsigs)
    df_deconstructsigs['control'] = 'deconstructsigs'

    df_sigfit = increase.load_d(files_sigfit)
    df_sigfit['control'] = 'sigfit'

    toplot = pd.concat([df_deconstructsigs, df_sigfit])

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(3, 2))

    raw_decon, raw_sigfit = [], []  # untransformed SNR, for Pearson's R
    x_decon, y_sigfit = [], []      # signed log2(SNR), as plotted

    for name, data in toplot.groupby(by='name'):
        if len(data) != 2:
            continue
        # Each variable now holds the method its name (and its axis label)
        # says; the two filters were previously swapped.
        snr_decon = data[data['control'] == 'deconstructsigs']['snr'].iloc[0]
        snr_sigfit = data[data['control'] == 'sigfit']['snr'].iloc[0]
        raw_decon.append(snr_decon)
        raw_sigfit.append(snr_sigfit)

        sign = -1 if data.iloc[0]['cross_validation_max'] < 0 else 1
        # log2 (not natural log) so that the power-of-two tick labels below
        # describe the plotted values.
        x = sign * np.log2(snr_decon)
        y = sign * np.log2(snr_sigfit)
        ax.scatter(x, y, c=COLORS[name])
        x_decon.append(x)
        y_sigfit.append(y)

    # A tick at signed-log2 value t corresponds to an SNR of 2**|t|.
    ticks = np.arange(-6, 8, 2)
    tick_labels = [str(2 ** abs(int(t))) for t in ticks]
    ax.set_xticks(ticks)
    ax.set_xticklabels(tick_labels)
    ax.set_yticks(ticks)
    ax.set_yticklabels(tick_labels)

    # Both statistics need at least two points.
    if len(x_decon) >= 2:
        # Regress y (sigfit) on x (deconstructSigs) so the fitted line lives
        # in the same coordinates as the scatter.
        slope, intercept = stats.linregress(x_decon, y_sigfit)[:2]
        ax.plot(ticks, slope * ticks + intercept)

        R, pval = stats.pearsonr(raw_sigfit, raw_decon)
        ax.text(-4, 3,
                'R = {}\npval = {}'.format(round(R, 3), round(pval, 3)))

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.set_ylabel('SNR (Sigfit)')
    ax.set_xlabel('SNR (Deconstructsig)')

    plt.tight_layout()
Example #4
0
def autocorrelation(original_file, simulated_files):
    """Plot the motif autocorrelation of one chunk against randomized chunks.

    Every input is a JSON file providing ``pair_count`` (a sequence),
    ``motif_count`` and ``chunk_len``. The signal is
    ``(pair_count / chunk_len) / (motif_count / chunk_len) ** 2``.

    Drawn on a single axes, always skipping the first four positions of
    each signal:
      * the raw observed signal,
      * the observed signal after ``spectral.mean_smooth(signal, 3)``,
      * the quadratic fit of the smoothed signal,
      * one grey smoothed curve per file in ``simulated_files``.

    Side effect: sets ``plt.rcParams['savefig.facecolor']`` to the figure's
    face colour.

    Args:
        original_file: path of the JSON file with the observed counts.
        simulated_files: iterable of paths of JSON files with the same keys.
    """
    nucperiod_plt.config_params(font_size=14)

    with open(original_file, 'rt') as f:
        observed = json.load(f)

    pair_count_array = np.array(observed['pair_count'])
    motif_count = observed['motif_count']
    len_chunk = observed['chunk_len']

    # define the signal
    signal = (pair_count_array / len_chunk) / (motif_count / len_chunk) ** 2

    # define figure
    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    ax.set_title('motif autocorrelation')

    # raw signal
    ax.plot(range(4, len(signal)), signal[4:], alpha=0.5, label='raw')

    # 3-smoothing
    signal = spectral.mean_smooth(signal, 3)
    ax.plot(range(4, len(signal)), signal[4:], linewidth=3.0,
            label='3-smoothed')

    # quadratic least-squares trend of the smoothed signal
    initial_values_dict = {'a_0': np.mean(signal[4:]), 'a_1': 0., 'a_2': 0.}
    params, obj_func = non_linear.create_quadratic_model(initial_values_dict)
    x = np.arange(len(signal[4:]))
    non_linear_fitter = non_linear.NonLinearFitting(obj_func, params, x,
                                                    signal[4:])
    _, predicted = non_linear_fitter.least_squares()
    ax.plot(range(4, len(signal)), predicted, '--', label='quadratic trend')

    for index, path in enumerate(simulated_files):
        with open(path, 'rt') as f:
            random_chunk = json.load(f)

        pc = np.array(random_chunk['pair_count'])
        mc = random_chunk['motif_count']
        len_random_chunk = random_chunk['chunk_len']

        # Normalise with the randomized chunk's own length in both terms;
        # the numerator previously used the observed chunk's length.
        random_signal = (pc / len_random_chunk) / (mc / len_random_chunk) ** 2
        random_signal = spectral.mean_smooth(random_signal, 3)

        # Only the first randomized curve gets a legend entry.
        label = 'randomized' if index == 0 else None
        ax.plot(range(4, len(random_signal)), random_signal[4:],
                linewidth=3.0, alpha=0.3, color='grey', label=label)

    ax.legend()
    plt.rcParams['savefig.facecolor'] = fig.get_facecolor()
Example #5
0
def compare(cohorts_5mer, cohorts_3mer, cohorts_linker, tumors=None):
    """Plot log2(SNR) per tumor type for the 3-mer, linker and 5-mer runs.

    Each input is loaded with ``increase.load_d`` and tagged in a
    ``control`` column (``'mer5'``, ``'mer3'``, ``'linker'``). Rows are
    annotated with ``ttype = TTYPES[name]`` and optionally restricted to
    ``tumors``. One x position is used per tumor type; at each position
    three markers are drawn (red: 3-mers, green: linker, orange: 5-mers).

    Args:
        cohorts_5mer: forwarded to ``increase.load_d``.
        cohorts_3mer: forwarded to ``increase.load_d``.
        cohorts_linker: forwarded to ``increase.load_d``.
        tumors: optional collection of ``name`` values to keep.

    Raises:
        IndexError: if a tumor type lacks a row for one of the three runs.
    """
    nuceriod_plt.config_params(14)

    df_5mer = increase.load_d(cohorts_5mer)
    df_5mer['control'] = 'mer5'
    df_3mer = increase.load_d(cohorts_3mer)
    df_3mer['control'] = 'mer3'
    df_linker = increase.load_d(cohorts_linker)
    df_linker['control'] = 'linker'

    df = pd.concat([df_5mer, df_3mer, df_linker])

    df['increase_in'] = df['observed_in'] - df['expected_in']
    df['prop_increase_in'] = df['increase_in'] / df['expected_in']

    df['ttype'] = df['name'].map(TTYPES)
    if tumors is not None:
        df = df[df['name'].isin(tumors)]

    toplot = df.sort_values(by='prop_increase_in', ascending=True)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))

    red_patch = mpatches.Patch(color='red', label='3-mers')
    green_patch = mpatches.Patch(color='green',
                                 label='no nucleosomes in context')
    orange_patch = mpatches.Patch(color='orange', label='5-mers')

    # (control tag, marker colour), in drawing order.
    series = (('mer3', 'red'), ('linker', 'green'), ('mer5', 'orange'))

    labels = []
    by_ttype = toplot.sort_values(by='snr', ascending=True).groupby(
        by='ttype', sort=False)
    for ix, (ttype, data) in enumerate(by_ttype):
        # Look up all three values before drawing any of them.
        snrs = [data[data['control'] == control]['snr'].iloc[0]
                for control, _ in series]
        for snr, (_, colour) in zip(snrs, series):
            ax.scatter(ix, math.log2(snr), color=colour, s=15, alpha=0.8)
        labels.append(ttype)

    # Derive the tick positions from the labels instead of the loop
    # variable, which is undefined when there are no groups.
    plt.xticks(list(range(len(labels))), labels, rotation=90)
    tick = [2, 4, 6, 8]
    plt.yticks(tick, [str(2**t) for t in tick])
    plt.ylabel('log2(SNR)')
    plt.legend(handles=[red_patch, green_patch, orange_patch])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    plt.tight_layout()
Example #6
0
def plot_bars(snr_high, snr_low):
    """Draw a tiny two-bar chart: the 'low' value first, then the 'high' one.

    Args:
        snr_high: height of the bar labelled 'high'.
        snr_low: height of the bar labelled 'low'.
    """
    nucperiod_plt.config_params()

    fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(0.9, 1.1))

    positions = [0, 1]
    heights = [snr_low, snr_high]
    bar_colour = '#afdde9ff'
    axs.bar(positions,
            heights,
            label=['low', 'high'],
            color=[bar_colour, bar_colour])

    axs.set_xticks(positions)
    axs.set_xticklabels(['low', 'high'], fontsize=7)
    # NOTE(review): 'SNR' is attached to the x axis although the bar heights
    # carry the SNR values; sapiens_bars labels the y axis instead - confirm.
    axs.set_xlabel('SNR', fontsize=7)
    for side in ('right', 'top'):
        axs.spines[side].set_visible(False)
    fig.tight_layout()
Example #7
0
def rotational(cohorts_high, cohorts_low, tumors=None):
    """Bar chart of log2(SNR) for the low and high sets, per tumor type.

    Both inputs are loaded with ``increase.load_d`` and tagged in a
    ``control`` column (``'high'`` / ``'low'``). Rows are annotated with
    ``ttype = TTYPES[name]`` and optionally restricted to ``tumors``.
    Tumor types that are not keys of ``COLORS`` are skipped. Each remaining
    tumor type contributes two adjacent bars (low, then high) and one tick
    label centred between them.

    Args:
        cohorts_high: forwarded to ``increase.load_d``.
        cohorts_low: forwarded to ``increase.load_d``.
        tumors: optional collection of ``name`` values to keep.

    Raises:
        IndexError: if a plotted tumor type lacks a 'low' or 'high' row.
    """
    nuceriod_plt.config_params(14)

    df_high = increase.load_d(cohorts_high)
    df_high['control'] = 'high'
    df_low = increase.load_d(cohorts_low)
    df_low['control'] = 'low'

    df = pd.concat([df_high, df_low])

    df['increase_in'] = df['observed_in'] - df['expected_in']
    df['prop_increase_in'] = df['increase_in'] / df['expected_in']

    df['ttype'] = df['name'].map(TTYPES)
    if tumors is not None:
        df = df[df['name'].isin(tumors)]

    toplot = df.sort_values(by='prop_increase_in', ascending=True)

    order = ['low', 'high']
    yvals = []
    colors = []
    labels = []
    for sig, data in toplot.sort_values(by='snr').groupby(by='ttype',
                                                          sort=False):
        if sig not in COLORS:
            continue
        labels.append('{}'.format(sig))
        for control in order:
            val = data[data['control'] == control]['snr'].iloc[0]
            yvals.append(math.log2(val))
            colors.append(COLORS[sig])
    xvals = list(range(len(yvals)))

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 1.5))
    # No per-bar labels are passed: there is one label per *pair* of bars,
    # which Axes.bar rejects when it expects one label per bar, and no
    # legend is drawn anyway.
    ax.bar(xvals, yvals, color=colors)
    ax.set_ylabel('SNR')
    # One tick between the two bars of each tumor type; the tick count now
    # follows the data instead of assuming exactly 27 tumor types.
    plt.xticks(np.arange(0.5, len(order) * len(labels), len(order)),
               labels,
               rotation=90,
               fontsize=13)
    tick = [2, 4, 6, 8]
    plt.yticks(tick, [str(2**t) for t in tick])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
def sapiens_bars(snr_high, snr_medium, snr_low):
    """Draw a three-bar chart of SNR values, ordered low, medium, high.

    Args:
        snr_high: height of the third bar.
        snr_medium: height of the second bar.
        snr_low: height of the first bar.
    """
    nucperiod_plt.config_params(7)

    fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(1.3, 1.5))

    positions = [0, 1, 2]
    heights = [snr_low, snr_medium, snr_high]
    bar_colour = '#afdde9ff'

    axs.bar(positions,
            heights,
            label=['low', 'medium', 'high'],
            color=[bar_colour] * 3)

    axs.set_xticks(positions)
    # NOTE(review): the tick labels ('very-low', 'low', 'high') differ from
    # the bar labels ('low', 'medium', 'high') - confirm which naming is
    # intended.
    axs.set_xticklabels(('very-low', 'low', 'high'), rotation=90)
    axs.set_ylabel('SNR')
    for side in ('right', 'top'):
        axs.spines[side].set_visible(False)
    fig.tight_layout()
Example #9
0
def scatter(df, feature1, feature2, selected_organisms=None, **kwargs):
    """Scatter ``feature2`` against ``feature1`` for every organism group.

    Rows of ``df`` are split by their ``org_type`` column and drawn in a
    fixed colour per group. Points with ``qval_power_enrichment < 0.01``
    and a positive ``feature1`` value get an additional dark-red ring.
    Index values of ``df`` listed in ``selected_organisms`` are annotated.

    Side effect: sets ``plt.rcParams['savefig.facecolor']`` to the figure's
    face colour.

    Args:
        df: table with ``org_type``, ``qval_power_enrichment`` and the two
            feature columns.
        feature1: column plotted on the x axis.
        feature2: column plotted on the y axis.
        selected_organisms: index values of ``df`` to annotate (default:
            none).
        **kwargs: ``period`` is required (used in the labels and title).
            ``xmin``/``xmax`` and ``ymin``/``ymax`` are optional axis
            limits; an axis is left untouched when neither of its limits
            is given.

    Raises:
        KeyError: if ``period`` is not passed.
    """
    if selected_organisms is None:
        selected_organisms = []
    period = kwargs['period']

    nucperiod_plt.config_params(font_size=24)
    fig, axes = plt.subplots(1, 1, figsize=(12, 14))
    phyllum = ['protists', 'fungi', 'plants', 'vertebrates', 'insects',
               'nematodes', 'deuterostomes']
    color_label = dict(zip(phyllum, ['blue', 'grey', 'green', 'pink', 'black',
                                     'cyan', 'orange']))

    # Empty proxy artist that only feeds the legend. Its ring width is fixed
    # instead of being borrowed from the first group's data, where the
    # first entry could be 0 and leave the legend marker invisible.
    axes.scatter([], [], s=300, linewidths=2,
                 edgecolor='#8b0000ff', color='None', label='q-value < 0.01')

    for org_type in phyllum:
        ds = df[df['org_type'] == org_type]
        xs = ds[feature1].values
        ys = ds[feature2].values
        # Ring width 2 for significant, positive points; 0 hides the ring.
        ring_widths = [2 if q < 1e-2 and x > 0 else 0.
                       for q, x in zip(ds['qval_power_enrichment'].values, xs)]
        axes.scatter(xs, ys, s=350, linewidths=ring_widths,
                     edgecolor='#8b0000ff', color='None')
        axes.scatter(xs, ys, s=150, linewidths=1,
                     edgecolor='black', color=color_label[org_type],
                     label=org_type)

    for txt in df.index.values:
        if txt in selected_organisms:
            axes.annotate(txt, (df.loc[txt, feature1] + 0.02,
                                df.loc[txt, feature2] + 0.02))

    # Raw strings keep the backslash of the LaTeX command without relying on
    # an invalid escape sequence.
    axes.set_ylabel(
        r'Proportion of 1 Mb chunks with MP at {0} $\pm$ 0.5 bp (ratio)'
        .format(period))
    axes.set_xlabel(
        r'Power Enrichment at {0} $\pm$ 0.5 bp (odds ratio)'.format(period))
    # Power enrichment: power significance at {0} $\pm$ 0.5 bp compared to other periods (odds ratio)
    axes.vlines(0, 0, 1, colors='red', linestyles='dashed', alpha=0.5)
    axes.legend(loc=(1.03, 0.712))
    axes.set_xticks([-3, -2, -1, 0, 1, 2, 3])
    axes.set_xticklabels([0.001, 0.01, 0.1, 1, 10, 100, 1000])
    axes.set_title('Period = {0} bp'.format(period))

    # Limits are optional; a missing bound is passed as None, which
    # matplotlib treats as "leave unchanged".
    if 'xmin' in kwargs or 'xmax' in kwargs:
        axes.set_xlim(kwargs.get('xmin'), kwargs.get('xmax'))
    if 'ymin' in kwargs or 'ymax' in kwargs:
        axes.set_ylim(kwargs.get('ymin'), kwargs.get('ymax'))

    plt.rcParams['savefig.facecolor'] = fig.get_facecolor()
Example #10
0
def zoomin(cohorts, tumors):
    """Two-panel plot of peak period vs. signed log2(SNR) per sample.

    Rows come from ``_load(cohorts, tumors)`` and are drawn in order of
    decreasing ``qvals_snr``. Rows with a positive ``cross_validation_max``
    go to the upper panel at ``log2(snr)``; rows with a negative one go to
    the lower panel at ``-log2(snr)``; all other rows are not drawn. Rows
    with ``qvals_snr < 0.05`` and ``snr > 8`` get a dark-red ring and a
    text label with their ``ttype``.

    Args:
        cohorts: forwarded to ``_load``.
        tumors: forwarded to ``_load``.
    """
    nuceriod_plt.config_params(14)

    toplot = _load(cohorts, tumors)

    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(8, 7), sharex=True)
    upper, lower = ax[0], ax[1]

    toplot.sort_values(by='qvals_snr', ascending=False, inplace=True)

    for _, sample in toplot.iterrows():
        direction = sample['cross_validation_max']
        if direction < 0:
            panel, sign = lower, -1
        elif direction > 0:
            panel, sign = upper, 1
        else:
            continue

        snr = sample['snr']
        highlighted = sample['qvals_snr'] < 0.05 and snr > 8
        ring_colour = 'darkred' if highlighted else None
        height = sign * np.log2(snr)

        # Large white disc whose edge forms the significance ring.
        panel.scatter(sample['peak'],
                      height,
                      s=200,
                      linewidth=2,
                      edgecolor=ring_colour,
                      color='white',
                      label=sample['name'],
                      alpha=1)
        # Smaller disc in the tumor-type colour, drawn on top.
        panel.scatter(sample['peak'],
                      height,
                      s=80,
                      edgecolor='grey',
                      linewidth=0.5,
                      color=COLORS[sample['ttype']],
                      label=sample['name'],
                      alpha=1)

        if highlighted:
            panel.text(sample['peak'] + 1, height - 0.15, sample['ttype'])

    plt.xlabel('Period')

    period_ticks = list(range(8, 22, 2))
    upper.set_xticks(period_ticks)
    lower.set_xticks(period_ticks)

    # Ticks sit at log2 values; labels show the corresponding 2**|tick|.
    upper_ticks = list(range(2, 10, 2))
    lower_ticks = list(range(-8, 0, 2))
    upper.set_yticks(upper_ticks)
    lower.set_yticks(lower_ticks)
    upper.set_yticklabels([str(2 ** tick) for tick in upper_ticks])
    lower.set_yticklabels([str(2 ** abs(tick)) for tick in lower_ticks])

    for side in ('right', 'top'):
        upper.spines[side].set_visible(False)

    lower.xaxis.set_ticks_position('top')
    for side in ('bottom', 'right'):
        lower.spines[side].set_visible(False)

    upper.set_ylim(1.5, 10)
    lower.set_ylim(-10, -1.5)

    plt.tight_layout()
def zoomout(files):
    """Two-panel plot of peak period vs. signed log2(SNR), wide period range.

    Rows come from ``_load(files)`` and are restricted to
    ``nmuts_whole_nucleosome > 10000``. Rows with a positive
    ``cross_validation_max`` go to the upper panel at ``log2(snr)``; rows
    with a negative one go to the lower panel at ``-log2(snr)``; all other
    rows are not drawn. Rows with ``qvals_snr < 0.05`` and ``snr > 8`` get
    a dark-red ring and a text label with their ``outname``.

    Args:
        files: forwarded to ``_load``.
    """
    nuceriod_plt.config_params(14)

    samples = _load(files)
    samples = samples[samples['nmuts_whole_nucleosome'] > 10000]

    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(8, 7), sharex=True)
    upper, lower = ax[0], ax[1]

    for _, sample in samples.iterrows():
        direction = sample['cross_validation_max']
        if direction > 0:
            panel, sign = upper, 1
        elif direction < 0:
            panel, sign = lower, -1
        else:
            continue

        snr = sample['snr']
        highlighted = sample['qvals_snr'] < 0.05 and snr > 8
        ring_colour = 'darkred' if highlighted else None
        height = sign * np.log2(snr)

        # Large white disc whose edge forms the significance ring.
        panel.scatter(sample['peak'],
                      height,
                      s=200,
                      linewidth=2,
                      edgecolor=ring_colour,
                      color='white',
                      label=sample['name'],
                      alpha=1)
        # Smaller coloured disc drawn on top.
        panel.scatter(sample['peak'],
                      height,
                      s=80,
                      edgecolor='grey',
                      linewidth=0.5,
                      color=COLORS[sample['name']],
                      label=sample['name'],
                      alpha=1)

        if highlighted:
            panel.text(sample['peak'] + 10, height - 0.15, sample['outname'])

    plt.ylabel('log2(SNR)')
    plt.xlabel('Period')

    period_ticks = list(range(50, 260, 20))
    upper.set_xticks(period_ticks)
    lower.set_xticks(period_ticks)

    # Ticks sit at log2 values; labels show the corresponding 2**|tick|.
    upper_ticks = list(range(2, 10, 2))
    lower_ticks = list(range(-8, 0, 2))
    upper.set_yticks(upper_ticks)
    lower.set_yticks(lower_ticks)
    upper.set_yticklabels([str(2 ** tick) for tick in upper_ticks])
    lower.set_yticklabels([str(2 ** abs(tick)) for tick in lower_ticks])

    for side in ('right', 'top'):
        upper.spines[side].set_visible(False)

    lower.xaxis.set_ticks_position('top')
    for side in ('bottom', 'right'):
        lower.spines[side].set_visible(False)

    upper.set_ylim(1.5, 10)
    lower.set_ylim(-10, -1.5)

    plt.tight_layout()
def sigmoid(files, tracksheets, ttypes, colors):
    """One panel per tumor type with its samples sorted by relative increase.

    The table from ``_load(files)`` is left-merged with ``tracksheets`` and
    annotated with ``ttype = ttypes[tumor_name]``. Tumor types with more
    than 10 rows each get a panel; panels are ordered by the median of
    ``prop_increase_in``. Within a panel, samples are drawn in ascending
    ``prop_increase_in``; samples with ``qvals_snr < 0.1`` and ``snr > 8``
    are dark red and opaque, the others grey and translucent. A green
    segment marks the median, and the sample count and percentage of
    significant samples are written in the panel.

    Args:
        files: forwarded to ``_load``.
        tracksheets: table merged onto the loaded data (``how='left'``).
        ttypes: mapping from ``tumor_name`` to tumor type.
        colors: mapping from tumor type to a matplotlib colour.
    """
    nuceriod_plt.config_params(12)

    df = _load(files)
    toplot = df.merge(tracksheets, how='left')

    toplot['ttype'] = toplot['tumor_name'].map(ttypes)

    ttype_median = {}
    ttype_vals = {}
    prop_significant = {}
    for ttype, data in toplot.groupby(by='ttype'):
        if len(data) <= 10:
            continue
        d = data.sort_values(by='prop_increase_in', ascending=True)
        ttype_median[ttype] = d['prop_increase_in'].median()
        is_significant = ((d['qvals_snr'] < 0.1) & (d['snr'] > 8)).tolist()
        ttype_vals[ttype] = {
            'Prop': d['prop_increase_in'].tolist(),
            'col': ['darkred' if flag else 'grey' for flag in is_significant],
        }
        prop_significant[ttype] = 100 * sum(is_significant) / len(data)

    # squeeze=False keeps the result indexable when a single tumor type
    # qualifies (plt.subplots would otherwise return a bare Axes).
    fig, axes_grid = plt.subplots(nrows=1,
                                  ncols=len(ttype_median),
                                  figsize=(17, 3.5),
                                  sharey=True,
                                  squeeze=False)
    ax = axes_grid[0]

    ax[0].set_ylabel('Relative increase in mutation rate')
    ax[0].yaxis.set_ticks(np.arange(-0.3, 0.3, 0.1))

    for index, t in enumerate(sorted(ttype_median, key=ttype_median.get)):
        panel = ax[index]
        props = ttype_vals[t]['Prop']
        cols = ttype_vals[t]['col']
        count = len(props)

        for position, (val, col) in enumerate(zip(props, cols)):
            panel.scatter(position,
                          val,
                          color=col,
                          s=10,
                          lw=0,
                          alpha=1 if col == 'darkred' else 0.3)

        # Median marker spanning the central 40% of the panel's samples.
        panel.hlines(ttype_median[t],
                     count / 2 - count * 0.4 / 2,
                     count / 2 + count * 0.4 / 2,
                     lw=2,
                     color='darkgreen')
        for side in ('top', 'right', 'bottom', 'left'):
            panel.spines[side].set_visible(False)
        panel.set_xlabel(t, rotation=90)
        panel.xaxis.set_ticks_position('none')

        if index > 0:
            panel.yaxis.set_ticks_position('none')

        # Blank out the x tick labels while keeping their number.
        labels = [item.get_text() for item in panel.get_xticklabels()]
        panel.set_xticklabels([''] * len(labels))
        # NOTE(review): the text's x position is the median value itself,
        # not a sample index - confirm this placement is intended.
        panel.text(
            ttype_median[t], 0.22,
            'n={}\n{}%'.format(count, round(prop_significant[t], 1)))
        panel.set_ylim(-0.2, 0.3)

        # Colour strip for the tumor type, drawn outside the axes area.
        # NOTE(review): both `lw` and `linewidth` are passed with different
        # values - confirm which one is intended.
        panel.add_patch(
            plt.Rectangle((0, -0.8),
                          count,
                          0.03,
                          color=colors[t],
                          lw=1,
                          clip_on=False,
                          linewidth=0))

    red_dot = mlines.Line2D([], [],
                            color='darkred',
                            marker='o',
                            linestyle='None',
                            markersize=5,
                            label='Significant sample')

    grey_dot = mlines.Line2D([], [],
                             color='grey',
                             marker='o',
                             linestyle='None',
                             markersize=5,
                             label='Non significant sample')

    median_line = mlines.Line2D([], [],
                                color='darkgreen',
                                marker='_',
                                linestyle='None',
                                lw=40,
                                markersize=10,
                                label='Median')

    plt.legend(handles=[red_dot, grey_dot, median_line],
               bbox_to_anchor=[1.1, 1.1])
Example #13
0
def plot_single(table, title):
    """Draw a 2x4 summary figure from a tab-separated per-chunk table.

    Columns 0-2 show boxplots per period (6 to 19) of, respectively,
    ``snr_<p>p``, ``fold_power_increase_<p>p`` and
    ``fold_snr_increase_<p>p``:
      * top row: all rows with ``5.5 < maxp < 19.5``;
      * bottom row: for each period ``p`` only rows with
        ``p - 0.5 < maxp < p + 0.5``, overlaid with a line showing a tenth
        of the number of such rows.
    Column 3 shows, per period, how many rows have ``logpval_power_<p>p``
    (top) or ``logpval_snr_<p>p`` (bottom) equal to 2, with the period-10
    bar in red and a dashed line at the mean.

    Side effect: sets ``plt.rcParams['savefig.facecolor']`` to the figure's
    face colour.

    Args:
        table: path (or buffer) of the tab-separated table, read with
            ``pd.read_csv``.
        title: figure super-title.
    """
    nucperiod_plt.config_params(font_size=10)

    ylim_top = 60
    periods = list(range(6, 20))
    # Matplotlib places the k-th box at x = k (1-based).
    box_positions = range(1, len(periods) + 1)

    table = pd.read_csv(table, sep='\t')

    fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(20, 12))

    maxp = table['maxp']
    in_window = table[(maxp < 19.5) & (maxp > 5.5)]
    peaking = [table[(maxp < p + 0.5) & (maxp > p - 0.5)] for p in periods]
    scaled_counts = [len(chunk) / 10 for chunk in peaking]

    # (column template, y label, top title, bottom title, clamp y axis?)
    metrics = [
        ('snr_{0}p', 'SNR',
         'SNR for all 1 Mb chunks',
         'SNR for 1 Mb chunks\n peaking at each period',
         False),
        ('fold_power_increase_{0}p', 'Fold Power Increase',
         'Fold power increase for all 1 Mb chunks',
         'Fold Power Increase for 1 Mb chunks\n peaking at each period',
         True),
        ('fold_snr_increase_{0}p', 'Fold SNR Increase',
         'Fold SNR Increase for all 1 Mb chunks',
         'Fold SNR Increase for 1 Mb chunks\n peaking at each period',
         True),
    ]

    for col, (template, ylabel, title_all, title_peaking, clamp) in \
            enumerate(metrics):
        top = ax[0, col]
        top.boxplot([in_window[template.format(p)].values for p in periods])
        top.set_xticklabels(periods)
        if clamp:
            top.set_ylim(-3, ylim_top)
        top.set_xlabel('Period (bp)')
        top.set_ylabel(ylabel)
        top.set_title(title_all)

        bottom = ax[1, col]
        # Pass the list of per-period arrays as-is: the groups have
        # different lengths, and wrapping them in np.array either fails on
        # the ragged input or reinterprets equal-length groups column-wise.
        bottom.boxplot([chunk[template.format(p)].values
                        for p, chunk in zip(periods, peaking)])
        bottom.plot(box_positions, scaled_counts, label='x10 no. chunks')
        bottom.set_xticklabels(periods)
        if clamp:
            bottom.set_ylim(-3, ylim_top)
        bottom.set_xlabel('Period (bp)')
        bottom.set_ylabel(ylabel)
        bottom.set_title(title_peaking)
        bottom.legend()

    # (column template, y label, panel title) for the enrichment bar charts.
    enrichments = [
        ('logpval_power_{0}p', 'No. chunks significantly high in power',
         'Power Enrichment'),
        ('logpval_snr_{0}p', 'No. chunks significantly high in SNR',
         'SNR Enrichment'),
    ]

    for row, (template, ylabel, panel_title) in enumerate(enrichments):
        panel = ax[row, 3]
        discoveries = [len(table[table[template.format(p)] == 2])
                       for p in periods]
        bars = panel.bar(periods, discoveries)
        bars[periods.index(10)].set_color('r')
        panel.hlines(np.mean(np.array(discoveries)), 5.6, 19.4,
                     linestyles='dashed', colors='grey')
        panel.set_xticks(periods)
        panel.set_xlabel('Period (bp)')
        panel.set_ylabel(ylabel)
        panel.set_title(panel_title)

    fig.suptitle(title)
    plt.rcParams['savefig.facecolor'] = fig.get_facecolor()
Example #14
0
def spectrum(original_file, simulated_files):
    """Plot the periodogram of one chunk's signal against randomized chunks.

    Every input is a JSON file providing ``pair_count`` (a sequence),
    ``motif_count`` and ``chunk_len``. For each file the signal
    ``(pair_count / chunk_len) / (motif_count / chunk_len) ** 2`` is
    smoothed with ``spectral.mean_smooth(signal, 3)``; its first four
    positions are dropped and a quadratic least-squares fit is subtracted.
    The detrended signals are handed to ``dtft_spectrum_plot`` on a shared
    axes: randomized ones in grey first, the observed one last.

    Side effect: sets ``plt.rcParams['savefig.facecolor']`` to the figure's
    face colour.

    Args:
        original_file: path of the JSON file with the observed counts.
        simulated_files: iterable of paths of JSON files with the same keys.
    """
    nucperiod_plt.config_params(font_size=14)

    with open(original_file, 'rt') as f:
        observed = json.load(f)

    pair_count_array = np.array(observed['pair_count'])
    motif_count = observed['motif_count']
    len_chunk = observed['chunk_len']

    # define the signal
    signal = (pair_count_array / len_chunk) / (motif_count / len_chunk) ** 2

    # define figure
    fig_spec, ax_spec = plt.subplots(1, 1, figsize=(10, 5))
    ax_spec.set_xlabel('period (bp)')
    ax_spec.set_ylabel('power')

    # 3-smoothing
    signal = spectral.mean_smooth(signal, 3)

    # detrended smooth autocorrelation
    initial_values_dict = {'a_0': np.mean(signal[4:]), 'a_1': 0., 'a_2': 0.}
    params, obj_func = non_linear.create_quadratic_model(initial_values_dict)
    x = np.arange(len(signal[4:]))
    non_linear_fitter = non_linear.NonLinearFitting(obj_func, params, x,
                                                    signal[4:])
    _, predicted = non_linear_fitter.least_squares()
    signal_detrended = signal[4:] - predicted

    for index, path in enumerate(simulated_files):
        with open(path, 'rt') as f:
            random_chunk = json.load(f)

        pc = np.array(random_chunk['pair_count'])
        mc = random_chunk['motif_count']
        len_random_chunk = random_chunk['chunk_len']

        # Normalise with the randomized chunk's own length in both terms;
        # the numerator previously used the observed chunk's length.
        random_signal = (pc / len_random_chunk) / (mc / len_random_chunk) ** 2
        random_signal = spectral.mean_smooth(random_signal, 3)

        # detrended smooth autocorrelation
        initial_values_dict = {'a_0': np.mean(random_signal[4:]),
                               'a_1': 0., 'a_2': 0.}
        params, obj_func = non_linear.create_quadratic_model(
            initial_values_dict)
        x = np.arange(len(random_signal[4:]))
        non_linear_fitter = non_linear.NonLinearFitting(obj_func, params, x,
                                                        random_signal[4:])
        _, predicted = non_linear_fitter.least_squares()
        random_signal_detrended = random_signal[4:] - predicted

        # DTFT; only the first randomized curve gets a legend entry.
        label = 'randomized' if index == 0 else None
        dtft_spectrum_plot(random_signal_detrended, ax_spec,
                           title='periodogram', norm=False, color='grey',
                           alpha=0.5, label=label)

    dtft_spectrum_plot(signal_detrended, ax_spec, title='periodogram',
                       norm=False, label='observed')
    ax_spec.legend()
    plt.rcParams['savefig.facecolor'] = fig_spec.get_facecolor()