Esempio n. 1
0
def new_TE_distribution():
    """Compute log2 translational-efficiency (TE) estimates and plot them.

    Each TE estimate is log2(RPF) - log2(mRNA) taken column-wise from the
    table returned by ``filtered_low_counts(0)``. Several RPF/mRNA pairings
    are computed:

    * ``ingolia_1`` / ``ingolia_2``: each Ingolia replicate separately.
    * ``ingolia_both``: the two Ingolia replicates summed before the log.
    * ``weinberg_RiboZero`` / ``weinberg_Dynabeads`` /
      ``weinberg_Unselected``: Weinberg RPF against each mRNA column.
    * ``artificial`` / ``artificial2``: Weinberg RPF against the Dynabeads
      column divided (respectively multiplied) by the CDS length.

    Side effects: draws mean-centered step histograms for a subset of the
    estimates on the current matplotlib axes, calls
    ``explore_UTRs.scatter_with_hists_colors`` for RiboZero vs. Unselected,
    and prints the Pearson and Spearman correlation results for that pair.

    Returns:
        dict mapping each estimate name listed above to its array of log2
        TE values (not mean-centered).
    """
    # NOTE(review): presumably `fields` maps column names to column indices
    # of the 2-D array `filtered`, and the argument 0 is a count threshold --
    # confirm against filtered_low_counts.
    fields, filtered = filtered_low_counts(0)

    def column(field_name):
        # Look up one named column of the filtered table.
        return filtered[:, fields[field_name]]

    def log2_ratio(numerator, denominator):
        # log2(numerator / denominator), written as a difference of logs.
        return np.log2(numerator) - np.log2(denominator)

    def centered(values):
        # Shift values so that their mean is zero.
        return values - np.mean(values)

    weinberg_RPF = column('weinberg:RPF')
    weinberg_Dynabeads = column('weinberg:Dynabeads')
    # Cast explicitly to float so the division below can never be an
    # integer division, whatever dtype the table holds.
    CDS_lengths = np.asarray(column('CDS_length'), dtype=float)

    TEs = {
        'ingolia_1': log2_ratio(column('ingolia:RPF_1'), column('ingolia:mRNA_1')),
        'ingolia_2': log2_ratio(column('ingolia:RPF_2'), column('ingolia:mRNA_2')),
        'ingolia_both': log2_ratio(column('ingolia:RPF_1') + column('ingolia:RPF_2'),
                                   column('ingolia:mRNA_1') + column('ingolia:mRNA_2')),
        'weinberg_RiboZero': log2_ratio(weinberg_RPF, column('weinberg:RiboZero')),
        'weinberg_Dynabeads': log2_ratio(weinberg_RPF, weinberg_Dynabeads),
        'weinberg_Unselected': log2_ratio(weinberg_RPF, column('weinberg:Unselected')),
        'artificial': log2_ratio(weinberg_RPF, weinberg_Dynabeads / CDS_lengths),
        'artificial2': log2_ratio(weinberg_RPF, weinberg_Dynabeads * CDS_lengths),
    }

    # The per-replicate Ingolia estimates are returned but not plotted.
    plotted_names = [
        'weinberg_RiboZero',
        'weinberg_Dynabeads',
        'weinberg_Unselected',
        'ingolia_both',
        'artificial',
        'artificial2',
    ]
    for name in plotted_names:
        plt.hist(centered(TEs[name]), histtype='step', bins=100, range=(-4, 4), label=name)

    plt.legend()
    plt.xlabel('log2(RPF RPKM / mRNA RPKM)')
    plt.ylabel('Number of genes')

    # NOTE(review): the string arguments look like x label, y label and
    # title, in that order -- confirm against explore_UTRs.
    explore_UTRs.scatter_with_hists_colors(centered(TEs['weinberg_RiboZero']),
                                           centered(TEs['weinberg_Unselected']),
                                           'weinberg_RiboZero',
                                           'weinberg_Unselected',
                                           'Joint distribution of TEs',
                                          )

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print(scipy.stats.pearsonr(TEs['weinberg_RiboZero'], TEs['weinberg_Unselected']))
    print(scipy.stats.spearmanr(TEs['weinberg_RiboZero'], TEs['weinberg_Unselected']))

    return TEs
Esempio n. 2
0
def mRNA_RPKM_length_bias(gtf_fn=None, genome_dir=None):
    """Scatter coding-sequence length against a between-dataset log2 ratio.

    For every name in the module-level ``gene_names``, the coding sequence
    is fetched and its length recorded. The lengths are plotted against
    ``log2(arrays['weinberg']['RPF']) - log2(arrays['ingolia']['RPF'])``
    via ``explore_UTRs.scatter_with_hists_colors``; the y-range is then
    clamped to [-7, 7] and the current figure resized to 12 x 8 inches.

    Args:
        gtf_fn: Path to the GTF annotation file. Defaults to the original
            hard-coded S. cerevisiae EF4 ``genes.gtf`` location.
        genome_dir: Path to the genome directory. Defaults to the original
            hard-coded S. cerevisiae EF4 genome location.

    NOTE(review): despite the function name, the quantity currently plotted
    is an RPF comparison, not an mRNA one. An mRNA comparison (Weinberg
    Dynabeads vs. Unselected) was previously left here as commented-out
    code. The axis label below describes what is actually plotted; confirm
    which comparison is intended.
    """
    if gtf_fn is None:
        gtf_fn = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    if genome_dir is None:
        genome_dir = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/EF4/genome'

    fetch_coding_sequence = gtf.make_coding_sequence_fetcher(gtf_fn, genome_dir)
    lengths = np.asarray([len(fetch_coding_sequence(name)) for name in gene_names])

    # log2(Weinberg RPF / Ingolia RPF), written as a difference of logs.
    # NOTE(review): assumes both arrays hold RPKM values ordered like
    # gene_names -- confirm where `arrays` is built.
    log2_RPF_ratio = np.log2(arrays['weinberg']['RPF']) - np.log2(arrays['ingolia']['RPF'])

    explore_UTRs.scatter_with_hists_colors(lengths,
                                           log2_RPF_ratio,
                                           'coding sequence length',
                                           'log2(Weinberg RPF RPKM / Ingolia RPF RPKM)',
                                           '',
                                          )

    plt.ylim(-7, 7)
    plt.gcf().set_size_inches(12, 8)