Ejemplo n.º 1
0
def main(subsets, corr_transcr):
    """Draw pairwise and joint box-plot comparisons of ``subsets``.

    The first figure is a square grid with one row and one column per
    subset.  Only the cells strictly above the diagonal are drawn, each
    through ``box_compare`` on the corresponding pair, so every unordered
    pair of distinct subsets is compared exactly once.  The second figure
    passes all subsets together to ``multibox_compare``.

    The grid is sized from ``len(subsets)``; it was previously fixed at
    4x4, which only worked for exactly four subsets.  The values returned
    by ``box_compare`` are no longer stored, because the matrix they were
    collected into was never read.

    Args:
        subsets: Mapping from label to a data object that supports
            ``len()`` and ``dropna()``.
        corr_transcr: Text used as both title and y-axis label of the
            second figure.
    """
    size = len(subsets)
    plt.figure(figsize=(12, 12))

    # Row-major cell numbers of the strict upper triangle.  Cell 0 sits on
    # the diagonal, so filtering on "!= 0" only removes the zeros that
    # ``np.triu`` wrote below and on the diagonal.
    tri_ind = np.triu(np.arange(size * size).reshape(size, size), k=1)
    tri_ind = tri_ind[tri_ind != 0]
    print(tri_ind)
    upper_cells = set(tri_ind.tolist())

    n = 0
    for label_i, subset_i in subsets.items():
        for label_j, subset_j in subsets.items():
            if n in upper_cells:
                # Subplot numbers are 1-based, cell numbers are 0-based.
                plt.subplot(size, size, n + 1)
                # Lengths are reported before missing values are dropped.
                print('lengths:', len(subset_i), len(subset_j))
                box_compare(subset_i.dropna(),
                            subset_j.dropna(),
                            labels=(label_i, label_j))
            n += 1

    plt.tight_layout()

    # All subsets side by side in a single, separate figure.
    plt.figure(figsize=(4, 8))
    plt.title(corr_transcr)
    plt.ylabel(corr_transcr)
    multibox_compare(
        *zip(*[(df.dropna(), key) for key, df in subsets.items()]))
Ejemplo n.º 2
0
def main(func):
    """Box-plot gene values grouped by whether the gene is a head neighbor.

    ``func`` is applied to the module-level ``counts``; the first row of
    its result is discarded, and only rows whose label starts with 'Smp'
    and does not end with 'complement' are kept.  Three groups are then
    handed to ``u.multibox_compare``:

    * the distinct non-missing values of ``head_data.gene_transcription``;
    * the kept rows minus those labelled by a value of
      ``head_data.neighbor_gene``, with missing values dropped;
    * all kept rows.

    The number of duplicated entries in ``head_data.neighbor_gene`` is
    printed before plotting.

    Args:
        func: Callable applied to ``counts``; its result must support
            ``.iloc`` and have a string index.
    """
    aggregated = func(counts).iloc[1:]

    is_gene = aggregated.index.str.startswith('Smp')
    gene_rows = aggregated[is_gene]
    is_complement = gene_rows.index.str.endswith('complement')
    genes_data = gene_rows.iloc[~is_complement]

    with_perere = head_data.gene_transcription.dropna().drop_duplicates()

    neighbor_labels = head_data.neighbor_gene.dropna().unique()
    lone_genes = genes_data.drop(neighbor_labels).dropna()

    print(head_data.neighbor_gene.duplicated().sum())

    plt.figure(figsize=(6, 10))
    groups = (with_perere, lone_genes, genes_data)
    group_labels = ('Com Perere-3 vizinho', 'Sem Perere-3 vizinho', 'Total')
    u.multibox_compare(groups, group_labels, margin=3)
Ejemplo n.º 3
0
def compare(subsets, corr_transcr):
    """Box-plot all subsets together and print a report for each pair.

    A new figure titled and y-labelled with ``corr_transcr`` is filled by
    ``multibox_compare``, which receives the subsets (missing values
    dropped) and their labels.  Its result is iterated as a mapping from
    a pair of labels to a value; for each entry the pair, the
    ``describe()`` summary of both original subsets and the value are
    printed.

    Args:
        subsets: Mapping from label to a data object supporting
            ``dropna()`` and ``describe()``.
        corr_transcr: Text used as figure title and y-axis label.
    """
    plt.figure(figsize=(4, 8))
    plt.title(corr_transcr)
    plt.ylabel(corr_transcr)

    cleaned = [(df.dropna(), key) for key, df in subsets.items()]
    results = multibox_compare(*zip(*cleaned))

    for pair, value in results.items():
        first, second = pair
        report = (pair,
                  subsets[first].describe(),
                  subsets[second].describe(),
                  value)
        print(*report, sep='\n\t')
Ejemplo n.º 4
0
# Load the RPKM table of every '<N>..._counted_reads' directory, keyed by the
# integer that precedes 'bp_' in the directory name.
dirs = u.pardir.glob('[0-9]*_counted_reads')
counts = {}

for reads_dir in dirs:
    head_length = int(reads_dir.stem.split('bp_')[0])
    print('Reading', head_length)
    table_path = reads_dir/'parsed_data/heads_rpkm.tsv'
    counts[head_length] = pd.read_table(table_path)

# Re-create the dict with its keys in ascending order.
counts = dict(sorted(counts.items()))

for transcr in ('transcription', 'complement_transcription'):
    u.print_header(transcr)

    # One column of summary statistics per key, computed on non-zero values.
    df = pd.DataFrame()
    for headlen, table in counts.items():
        values = table[transcr]
        df[headlen] = values[values != 0].describe()
    df.columns.name = 'headlen'
    df = df.T
    print(df)
    df.to_csv(f'multiheadlen_{transcr}.tsv', sep='\t')

    if not u.args.plot:
        continue

    # Unlike the statistics above, the plot is given every value, zeros
    # included.
    plt.figure(figsize=(4,10))
    plotted = [table[transcr] for table in counts.values()]
    u.multibox_compare(plotted, labels=counts.keys())
    plt.savefig(f'multiheadlen/{transcr}.png')
    plt.show()

Ejemplo n.º 5
0
    # Fold every '*.tsv' file found directly inside `d` into counts[dkey].
    # NOTE(review): the enclosing loop and the initial value of counts[dkey]
    # are outside this excerpt -- presumably a DataFrame with a 'count'
    # column indexed by id; confirm against the code above.
    for c in d.glob('*.tsv'):
        # Column names are supplied explicitly and 'id' becomes the index.
        # `fill_value=0` is passed so that an id missing from one of the two
        # operands does not turn the sum into a missing value.
        counts[dkey] = counts[dkey].add(pd.read_table(c,
                                                      names=['id', 'count'],
                                                      index_col='id'),
                                        fill_value=0)

    # Keep only the rows whose id starts with 'head'.
    counts[dkey] = counts[dkey][counts[dkey].index.str.startswith('head')]
    #counts[dkey] = counts[dkey][~counts[dkey].index.str.endswith('complement')]

# Re-create the dict with its keys in ascending order.
counts = dict(sorted(counts.items()))

if u.args.plot:
    plt.figure(figsize=(5, 10))
    # Only the non-zero entries of each table's 'count' column are plotted.
    nonzero_counts = [
        table.loc[table['count'] != 0, 'count'] for table in counts.values()
    ]
    u.multibox_compare(nonzero_counts, labels=counts.keys())
    plt.savefig('multiheadlen/counts.png')
    plt.show()

# Summary statistics of the non-zero counts: one column per key while the
# frame is being built, transposed afterwards so that each key is a row.
counts_df = pd.DataFrame()

for headlen, table in counts.items():
    nonzero_rows = table[table['count'] != 0]
    counts_df[headlen] = nonzero_rows['count'].describe()

counts_df = counts_df.T.rename_axis('headlen')
Ejemplo n.º 6
0
# description: Compares genes' to gene complements' transcription in each reads library.
# in: pardir/'counted_reads/aggregated.tsv'
# plot:
import sys
import utils as u
sys.path.append(str(u.scripts_dir))
from correlate_heads_to_near_genes import out_rpkm_by_lib as rpkm_by_lib

import pandas as pd
import matplotlib.pyplot as plt

# Table indexed by library; only the columns whose name contains 'Smp' are
# kept.
d = pd.read_table(rpkm_by_lib, index_col='library')
d = d.loc[:, d.columns.str.contains('Smp')]

# Split the columns into those ending in '_complement' and the rest, then
# transpose both parts so that each library becomes a column.
is_complement = d.columns.str.endswith('_complement')
comp = d.loc[:, is_complement]
direct = d.drop(comp.columns, axis=1).T
comp = comp.T

# One subplot per library, laid out on a single row.
n_libs = comp.shape[1]
plt.figure(figsize=(n_libs * 3, 5))

for position, lib in enumerate(comp.columns, start=1):
    plt.subplot(1, n_libs, position)
    plt.title(lib)
    plt.ylabel('RPKM')
    u.multibox_compare([direct[lib], comp[lib]],
                       labels=('Genes', 'Complementares'))

plt.tight_layout()
u.save_all_figs()