Example #1
0
def plot():
    """Plot an overview of the numeric columns of the module-level ``data``.

    Produces, in order: a scatter matrix (without ``start``/``end``),
    ``transcription`` against ``start`` and against ``distance``, a
    ``plot_box`` call on the rows with positive distance, a square heat map of
    transcription summed over 10,000 bins of ``start``, a Spearman correlation
    heat map and a transcription histogram.  Finally calls
    ``save_all_figs()``.

    The global ``data`` is only read; all filtering happens on a local frame.
    """
    print('Plotando...')
    # ###### PLOT ALL NUMERIC!
    # Use a separate local name: assigning to ``data`` anywhere in this
    # function would make it local, and the first access would then raise
    # UnboundLocalError.
    numeric = (
        data.dropna()
        .drop(columns='same_strand')
        .infer_objects()
        .select_dtypes(exclude=['object'])
    )

    pd.plotting.scatter_matrix(numeric.drop(columns=['end', 'start']))
    numeric.plot.scatter('start', 'transcription', alpha=.2)
    numeric.plot.scatter('distance', 'transcription', alpha=.2)
    plot_box(numeric[numeric.distance > 0], 'distance', 'transcription',
             bins=50)

    # Sum transcription over equal-width bins of the start coordinate.
    genome_map = numeric.sort_values('start')[['start', 'transcription']]
    genome_map['start'] = pd.cut(genome_map.start, int(1e4))
    # observed=False keeps empty bins, so the number of rows equals the number
    # of bins and the square reshape below cannot come up short.
    genome_map = genome_map.groupby('start', observed=False).sum()
    side = int(len(genome_map.transcription) ** .5)
    plt.figure()
    plt.pcolor(genome_map.transcription.values.reshape(side, side))
    plt.colorbar()
    plt.figure()

    plt.pcolor(numeric.corr(method='spearman'))
    plt.figure()
    numeric.transcription.hist()

    save_all_figs()
Example #2
0
def plot(counts):
    """Print summary fractions of ``counts`` and draw its bar histograms.

    ``counts`` must provide ``value_counts()`` (presumably a pandas Series of
    repetition counts per sequence -- confirm against the caller).  Two
    figures are drawn, one with a linear and one with a logarithmic y axis,
    and ``save_all_figs()`` is called once at the end.
    """
    hist = counts.value_counts()
    print(hist)
    soma = hist.sum()

    # value_counts() orders by frequency, not by value, so both fractions are
    # selected by label.  A positional slice such as hist[0:2] would return
    # the two most frequent values instead of the entries for 0 and 1.
    zero_reps = hist.get(0, 0)
    up_to_one_rep = hist[hist.index <= 1].sum()
    print('soma:', soma, 'fração de 0 repetições:',
          zero_reps/soma, 'fração com até 1 rep.', up_to_one_rep/soma)

    for log in (False, True):

        print(f'Construindo histograma, log={log}')
        plt.figure(figsize=(11, 4.8))

        plt.bar(hist.index, hist, log=log)

        plt.xlabel('Número de repetições')
        plt.ylabel('Número de sequências')
        plt.title('Quantidade de cópias repetidas')

        # NOTE: the figure is only written to disk by save_all_figs() below.
        print("Histograma salvo.")

    save_all_figs()
Example #3
0
def main():
    """Run ``compare`` on the subsets of each analysed column.

    Reads the module-level table ``d`` and the flag ``show_flag``.  When
    ``show_flag`` is truthy the figures are shown with ``plt.show()``;
    otherwise they are passed to ``save_all_figs()``.
    """
    columns = (
        'transcription', 'complement_transcription',
        'gene_transcription', 'gene_complement_transcription',
        'correlation', 'complement_correlation',
        'motherlength', 'repetitions',
    )

    for column in columns:
        u.print_header(column)
        # Rows with no value in this column are left out; zeros are kept.
        valid = d.dropna(subset=[column])

        if 'gene' in column:
            # Keep only the closest copy to each neighbor gene: after sorting
            # by distance, drop_duplicates retains the first row per gene.
            valid = valid.sort_values('distance').drop_duplicates(
                subset='neighbor_gene')

        compare(get_subsets(valid, column), column)

    if not show_flag:
        save_all_figs()
    else:
        plt.show()
Example #4
0
#                     olap_mask = (max(mask.start - hstart, 0),
#                                  min(mask.end - hstart, 1000))
#                     for i in range(*olap_mask):
#                         mask_count[i] += 1
#                     # plt.fill_between(olap_mask, *axesy, alpha=.1)
#                     count += 1
#
#             print(irow, '/', n_heads, 'plotted:', count)
#
#     except KeyboardInterrupt:
#         pass
#
#     return mask_count

# NORMALIZE
# Find the peak once: calling max() inside the comprehension rescans the whole
# list for every element.  ``default`` keeps an empty list a no-op, as before.
peak_count = max(complete_counts, default=1)
complete_counts = [c / peak_count for c in complete_counts]
#mask_count = [c/max(mask_count) for c in mask_count]

# Single line plot of the normalized read-count profile.
plt.figure(figsize=(9, 4.8))
plt.plot(complete_counts, label='Read count')
#plt.plot(mask_count, label='Masked count')
plt.title('Perfil geral de transcrição das sequências sonda')
plt.xlabel('Distância ao início da sonda (pb)')
plt.ylabel('Contagem de reads normalizada')

if show_flag:
    plt.show()

# Figures are saved whether or not they were shown.
save_all_figs()
print('Pronto.')
Example #5
0

def main(func):
    """Box-plot gene values for genes with and without a neighboring head.

    ``func`` is applied to the module-level ``counts`` and its result, minus
    the first entry, is narrowed to labels that start with ``'Smp'`` and do
    not end with ``'complement'``.  Three groups are then compared with
    ``u.multibox_compare``: the unique ``gene_transcription`` values from
    ``head_data``, the genes not listed in ``head_data.neighbor_gene``, and
    all genes.
    """
    aggregated = func(counts).iloc[1:]

    smp_mask = aggregated.index.str.startswith('Smp')
    genes_data = aggregated[smp_mask]
    complement_mask = genes_data.index.str.endswith('complement')
    genes_data = genes_data.iloc[~complement_mask]

    # Genes that appear as the neighbor of at least one head.
    neighbor_genes = head_data.neighbor_gene.dropna().unique()
    with_perere = head_data.gene_transcription.dropna().drop_duplicates()
    lone_genes = genes_data.drop(neighbor_genes).dropna()

    # Number of repeated entries in the neighbor_gene column.
    print(head_data.neighbor_gene.duplicated().sum())

    groups = (with_perere, lone_genes, genes_data)
    labels = ('Com Perere-3 vizinho', 'Sem Perere-3 vizinho', 'Total')
    plt.figure(figsize=(6, 10))
    u.multibox_compare(groups, labels, margin=3)


if __name__ == '__main__':
    # One figure per aggregation of the counts table: sum, then max.
    for func in (pd.DataFrame.sum, pd.DataFrame.max):
        # Pass the aggregation itself.  main() evaluates func(counts), so
        # handing it ``counts`` would try to call the counts table.
        main(func)
    u.save_all_figs()
Example #6
0
def main():
    """Compare neighbor-gene correlation of downstream and upstream heads.

    Loads the annotation table, prints the Spearman correlations between
    ``transcription``, ``correlation`` and ``distance``, drops the rows marked
    as overlapping (``relative_position == 'olap'``), splits the remaining
    rows into a downstream and an upstream group, and draws:

    * one box plot per distance threshold, annotated with the p-value of
      ``mannwhitneyu`` between the two groups;
    * the p-value, and the number of rows in each group, as a function of the
      distance threshold.

    All figures are then passed to ``save_all_figs()``.
    """
    head_data = pd.read_table(pardir/'genome_annotation/all_together_now.tsv')

    # Dropping NaNs matters: filters such as ``relative_position != 'olap'``
    # evaluate to True for NaN and would let incomplete rows through.
    # This keeps only heads with neighbor genes.
    head_data = head_data.dropna().reset_index(drop=True)

    # ################# CORRELATION ########################
    print("TABELA DE CORRELAÇÕES DE SPEARMAN")
    print(head_data[['transcription', 'correlation', 'distance']].corr(method='spearman'))

    # #################### SPLITS ##########################

    # Drop overlapped.
    head_data = head_data[head_data.relative_position != 'olap']

    # Downstream rows: '+' strand with relative position 'dir', or '-' strand
    # with relative position 'esq'.  Every other row counts as upstream.
    # (An earlier variant restricted this split to rows with same_strand set.)
    plus_dir = (head_data.strand == '+') & (head_data.relative_position == 'dir')
    minus_esq = (head_data.strand == '-') & (head_data.relative_position == 'esq')
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    downstream = pd.concat([head_data[plus_dir], head_data[minus_esq]])

    upstream = head_data.drop(downstream.index)

    # ################# Mann-Whitney U #####################

    # The last threshold is the largest distance present, i.e. no cut-off.
    thresholds = (1e3, 1e4, 2e4,
                  pd.concat([upstream, downstream]).distance.max())
    n_panels = len(thresholds)

    fig, axs = plt.subplots(1, n_panels, figsize=(11, 4.8))
    for ax in axs:
        ax.get_xaxis().set_ticks([])
        ax.get_yaxis().set_ticks([])

    # Frameless axes spanning the whole figure; it only carries the y label
    # shared by all panels.
    ax = fig.add_subplot(111, frameon=False)
    ax.grid(False)
    plt.tick_params(labelcolor='none',
                    top=False, bottom=False, left=False, right=False)
    plt.ylabel('Correlação transcricional com o gene vizinho')

    for i, thresh in enumerate(thresholds):
        # Data selection: correlation values of rows within the threshold.
        a, b = [p.loc[p.distance <= thresh].correlation
                for p in [downstream, upstream]]

        pvalue = mannwhitneyu(a, b).pvalue
        plabel = f'p-valor:\n{pvalue:.5f}'
        label = f"Distância < {thresh:.0f}"

        print(label, pvalue, 'Medians:', a.median(), b.median())

        fig.add_subplot(1, n_panels, i + 1, frameon=False)
        plt.title(label)

        boxplot([a, b])
        plt.annotate(plabel, (.5, .2), xycoords='axes fraction', ha='center')
        plt.xticks([0, 1], labels=['Downstream', 'Upstream'])

    plt.tight_layout()

    # ======================= P VS. DIS ==========================

    plt.figure(figsize=(11, 4.8))
    plt.subplot(211)

    xdistances = range(100, int(upstream.distance.max()), 100)
    ypvalues = []
    updataamounts = []
    downdataamounts = []
    # Select each group once per threshold and reuse the selection for both
    # the p-value and the row counts.
    for thresh in xdistances:
        down_near = downstream.loc[downstream.distance <= thresh]
        up_near = upstream.loc[upstream.distance <= thresh]
        ypvalues.append(
            mannwhitneyu(down_near.correlation, up_near.correlation).pvalue)
        downdataamounts.append(len(down_near))
        updataamounts.append(len(up_near))

    plt.plot(xdistances, ypvalues)
    plt.semilogx()
    plt.ylabel('p-valor')

    # ----------------------------------
    plt.subplot(212)
    plt.plot(xdistances, downdataamounts, label="Downstream")
    plt.plot(xdistances, updataamounts, label="Upstream")
    plt.legend()

    plt.semilogx()
    plt.xlabel('Distância ao vizinho (pb)')
    plt.ylabel('Quantidade de pontos')

    # ===========================================
    save_all_figs()