def degree_hist(dg_txt=None):
    """Tabulate and plot how many partners each RNA has, per RRI type.

    Args:
        dg_txt: path of the DG table handed to ``read_dg_txt``; when None a
            hard-coded default path is used.

    For each of the RRI types 'inter' and 'intra' a tab-separated table is
    written next to ``dg_txt`` ('.txt' replaced by '.<type>.degree.txt') with
    the columns: id, transcript type, number of recorded partners, number of
    distinct partners, comma-joined distinct partners.  The distinct-partner
    counts (all ids, mRNA only, lncRNA only) are then handed to
    ``gj.cumulate_dist_plot`` and saved under the same name with '.pdf'.
    """
    if dg_txt is None:
        dg_txt = '/Share/home/zhangqf7/gongjing/zebrafish/data/paris/shi-zp-5-rep-combine/27-DG.txt'
    dg_dict = read_dg_txt(dg_txt)

    trans_dict = loadTransGtfBed2()

    # RRI_dict[RRI type][id] -> list of partner ids (duplicates kept).
    RRI_dict = nested_dict(2, list)
    for dg in dg_dict.values():
        left, right = dg['lchr'], dg['rchr']
        partners_by_id = RRI_dict[dg['RRI_type']]
        partners_by_id[left].append(right)
        if left != right:
            # Two different ends: record the interaction in both directions.
            partners_by_id[right].append(left)

    for rri_type in ['inter', 'intra']:
        savefn = dg_txt.replace('.txt', '.%s.degree.txt' % (rri_type))
        all_degree, mrna_degree, lncrna_degree = [], [], []

        with open(savefn, 'w') as SAVEFN:
            for tx_id, partner_ls in RRI_dict[rri_type].items():
                uniq_partners = set(partner_ls)
                degree = len(uniq_partners)
                tx_type = trans_dict[tx_id]['type']

                row = [tx_id, tx_type, len(partner_ls), degree,
                       ','.join(list(uniq_partners))]
                SAVEFN.write('\t'.join(map(str, row)) + '\n')

                all_degree.append(degree)
                if tx_type == 'mRNA':
                    mrna_degree.append(degree)
                elif tx_type == 'lncRNA':
                    lncrna_degree.append(degree)

        degree_ls_ls = [all_degree, mrna_degree, lncrna_degree]
        label_ls = [
            '%s, mean=%.2f' % (group, np.mean(degrees))
            for group, degrees in zip(['all', 'mRNA', 'lncRNA'], degree_ls_ls)
        ]
        gj.cumulate_dist_plot(ls_ls=degree_ls_ls,
                              ls_ls_label=label_ls,
                              bins=40,
                              title='degree distribution',
                              ax=None,
                              savefn=savefn.replace('.txt', '.pdf'),
                              xlabel='log2(# of interacting partners)',
                              ylabel=None,
                              add_vline=None,
                              add_hline=None,
                              log2transform=1,
                              xlim=None,
                              ylim=None)
# Example no. 2
# 0
def read_len_dist_all(
    savefn='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/read_len_dist_all.png'
):
    """Plot read-length statistics of every trimmed library into one figure.

    Args:
        savefn: path of the figure to write.

    The libraries come from ``library_info()['lib']['trimmed']``, a mapping
    of sample name -> path prefix.  For each sample ``<prefix>.len.txt``
    (columns: number of reads, read length) and ``<prefix>.trimlog`` are
    read.  The figure has three stacked panels sharing the x axis:
    read-count curves, cumulative read-length distribution, and cumulative
    distribution of the non-zero cut lengths from the trim log.
    """
    gj.printFuncRun('read_len_dist_all')
    gj.printFuncArgs()

    trimmed_dict = library_info()['lib']['trimmed']
    print(trimmed_dict)

    _, ax = plt.subplots(3, 1, sharex=True, figsize=(14, 16))
    gj.sns_color_ls()  # return value unused; call kept so behaviour is unchanged

    trimlog_columns = [
        'seq_name', 'sample_name', 'survive_len', 'survive_start',
        'survive_end', 'cut_len'
    ]
    sample_ls = []
    read_len_ls_ls = []
    read_cut_len_ls_ls = []
    for sample, prefix in trimmed_dict.items():
        sample_ls.append(sample)
        print('%s %s' % (sample, prefix))

        len_hist = pd.read_csv(prefix + '.len.txt', sep=r'\s+', header=None)
        len_hist.columns = ['# of reads', 'read length']
        len_hist.plot(ax=ax[0], x='read length', y='# of reads', label=sample)

        trimlog = pd.read_csv(prefix + '.trimlog', header=None, sep=r'\s+')
        trimlog.columns = trimlog_columns
        cut_len = trimlog['cut_len']
        cut_len_ls = list(cut_len[cut_len > 0])

        # Expand the (length, count) histogram into one entry per read.
        per_read_len = gj.ls_ls_flat([
            [length] * count
            for length, count in zip(len_hist['read length'],
                                     len_hist['# of reads'])
        ])
        read_len_ls_ls.append(per_read_len)
        read_cut_len_ls_ls.append(cut_len_ls)

    # Panel 1: read lengths, panel 2: cut lengths; otherwise identical calls.
    for values, panel in ((read_len_ls_ls, ax[1]), (read_cut_len_ls_ls, ax[2])):
        gj.cumulate_dist_plot(values,
                              ls_ls_label=sample_ls,
                              bins=40,
                              title=None,
                              ax=panel,
                              savefn=None,
                              xlabel='Length',
                              ylabel=None,
                              add_vline=None,
                              add_hline=None,
                              log2transform=0)

    plt.tight_layout()
    plt.savefig(savefn)
    plt.close()
    gj.printFuncRun('read_len_dist_all')
# Example no. 3
# 0
def read_len_dist(
    fq='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/CHe-XC-M1K_S3_L005_R1_001.trimmed.fastq'
):
    """Compute and plot the read-length distribution of one trimmed FASTQ.

    Args:
        fq: path of the FASTQ file.  ``<fq>.trimlog`` must already exist.

    Side effects:
        * writes ``<fq>.len.txt`` (read count and read length per line),
          produced by an awk/sort/uniq shell pipeline over every 4th line
          (offset 2) of ``fq``;
        * writes ``<fq>.len.png`` with two panels sharing the x axis: the
          read-count curve with scatter points on top, and the cumulative
          distributions of read lengths and of the non-zero cut lengths
          from the trim log.
    """
    gj.printFuncRun('read_len_dist')
    gj.printFuncArgs()

    # Local import so the block works on both Python 2 and Python 3.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    fq_len_txt = fq + '.len.txt'
    # Both paths are shell-quoted: the command runs through the shell, so an
    # unquoted path with spaces or metacharacters would break or alter it.
    # The doubled % is a literal % for awk.
    cmd = (
        "awk '{if(NR%%4==2) print length($1)}' %s| sort|uniq -c|sort -k2,2n > %s "
        % (quote(fq), quote(fq_len_txt)))
    subprocess.call(cmd, shell=True)

    df = pd.read_csv(fq_len_txt, sep=r'\s+', header=None)
    df.columns = ['# of reads', 'read length']
    print(df[['read length', '# of reads']])

    _, ax = plt.subplots(2, 1, sharex=True)
    df.plot(ax=ax[0], x='read length', y='# of reads')
    df.plot(kind='scatter', ax=ax[0], x='read length', y='# of reads')

    df_trimlog = pd.read_csv(fq + '.trimlog', header=None, sep=r'\s+')
    df_trimlog.columns = [
        'seq_name', 'sample_name', 'survive_len', 'survive_start',
        'survive_end', 'cut_len'
    ]
    # Only reads that actually lost bases contribute to the cut-length curve.
    df_trimlog = df_trimlog[df_trimlog['cut_len'] > 0]
    cut_len_ls = list(df_trimlog['cut_len'])

    # Expand the (length, count) histogram into one entry per read.
    read_len_ls = gj.ls_ls_flat([
        [length] * count
        for length, count in zip(df['read length'], df['# of reads'])
    ])
    gj.cumulate_dist_plot(
        [read_len_ls, cut_len_ls],
        ls_ls_label=['kethoxal read length', 'kethoxal read cut length'],
        bins=40,
        title=None,
        ax=ax[1],
        savefn=None,
        xlabel='Length',
        ylabel=None,
        add_vline=None,
        add_hline=None,
        log2transform=0)

    plt.tight_layout()
    plt.savefig(fq + '.len.png')
    plt.close()

    gj.printFuncRun('read_len_dist')
# Example no. 4
# 0
def main():
    """Run the kethoxal-vs-icSHAPE comparison and plot its distribution.

    Calls ``compare_corr`` on the no-treatment kethoxal output, plots the
    second-to-last element of its return value as a cumulative distribution
    (presumably the per-transcript correlations -- confirm against
    ``compare_corr``), then calls ``tx_corr_scatter``.
    """
    notreat_out = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/16-11-14_7_library_total_Kethoxal_remove/kethoxalseq_noTreat.out'
    corr_txt = '/Share2/home/zhangqf5/gongjing/Kethoxal_RNA_structure/Keth-seq/results/F2b.kethoxal-notreat.icshape.corr.txt'
    corr_pdf = '/Share2/home/zhangqf5/gongjing/Kethoxal_RNA_structure/Keth-seq/results/F2b.kethoxal_icshape_corr.pdf'

    notreat = compare_corr(out1=notreat_out,
                           savefn=corr_txt,
                           label='kethoxalseq:icSHAPE')

    gj.cumulate_dist_plot(ls_ls=[notreat[-2]],
                          ls_ls_label=['kethoxal vs icshape'],
                          bins=40,
                          title=None,
                          ax=None,
                          savefn=corr_pdf,
                          xlabel=None,
                          ylabel=None,
                          add_vline=None,
                          add_hline=None,
                          log2transform=0)

    tx_corr_scatter()
from glob import glob

# Script section.  Command line: <input table> [<input table> ...] <output figure>
# Column 4 of every tab-separated, header-less input table becomes one curve
# of a cumulative distribution plot, labelled with the input's file name.
file_ls, savefn = sys.argv[1:-1], sys.argv[-1]
print(' '.join(map(str, ["input: ", file_ls])))
print(' '.join(map(str, ["output: ", savefn])))

ls_ls, ls_ls_label = [], []
for sample_fn in file_ls:
    # Only the literal 'n/a' counts as missing; rows containing it are dropped.
    df = pd.read_csv(sample_fn,
                     header=None,
                     sep='\t',
                     keep_default_na=False,
                     na_values=['n/a'])
    df = df.dropna(axis=0, how='any')
    ls_ls.append(list(df[4]))
    ls_ls_label.append(sample_fn.rsplit('/', 1)[-1])

gj.cumulate_dist_plot(
    ls_ls=ls_ls,
    ls_ls_label=ls_ls_label,
    bins=40,
    title=None,
    ax=None,
    savefn=savefn,
    xlabel=None,
    ylabel='',
    add_vline=[0.6, 0.7],
    add_hline=[0, 0.3, 0.4, 1],
    log2transform=0,
    xlim=[-0.05, 1.05],
    ylim=[-0.05, 1.05],
)
def TE_rep_corr(TE1, TE2, savefn, label1='control', label2='RK33'):
    """Compare translation-efficiency (TE) tables of two replicates.

    Args:
        TE1, TE2: paths of tab-separated tables with a header row.  Each must
            contain the columns 'transcript', 'TE(<label>)' and
            'log2(TE(<label>))' for both labels.
        savefn: path of the merged table to write; the scatter plots are
            saved under the same name with '.txt' replaced by '.<label>.pdf'.
        label1, label2: names of the two conditions used in the column names.

    Outputs:
        * ``TE1 + '.cumulate.png'``: cumulative distributions of
          log2(TE) for both labels in replicate 1.
        * ``TE1 + '.cumulate.mean.pdf'``: the same for log2 of the replicate
          mean TE, on transcripts present in both tables; the p-value
          returned by ``gj.ks_2samp`` for the two labels is printed.
        * ``savefn``: the merged table including the mean columns.
        * one replicate-1 vs replicate-2 scatter plot per label.
    """
    df_TE1 = pd.read_csv(TE1, header=0, sep='\t')
    df_TE2 = pd.read_csv(TE2, header=0, sep='\t')

    # This plot is saved under TE1's name, so it is drawn from replicate 1
    # (the original read both columns from df_TE2).
    ls_ls = [
        list(df_TE1['log2(TE(%s))' % (label1)]),
        list(df_TE1['log2(TE(%s))' % (label2)])
    ]
    gj.cumulate_dist_plot(ls_ls=ls_ls,
                          ls_ls_label=[label1, label2],
                          bins=40000,
                          title=None,
                          ax=None,
                          savefn=TE1 + '.cumulate.png',
                          xlabel=None,
                          ylabel=None,
                          add_vline=None,
                          add_hline=None,
                          log2transform=0,
                          xlim=None,
                          ylim=None)

    # Inner join: shared non-key columns get the suffixes _x (TE1) and _y (TE2).
    df_merge = df_TE1.merge(df_TE2, on='transcript', how='inner')

    for label in (label1, label2):
        mean_col = 'mean(TE(%s))' % (label)
        df_merge[mean_col] = (df_merge['TE(%s)_x' % (label)] +
                              df_merge['TE(%s)_y' % (label)]) / 2.0
    for label in (label1, label2):
        df_merge['log2(mean(TE(%s)))' % (label)] = np.log2(
            df_merge['mean(TE(%s))' % (label)])

    ls_ls = [
        list(df_merge['log2(mean(TE(%s)))' % (label1)]),
        list(df_merge['log2(mean(TE(%s)))' % (label2)])
    ]
    p = gj.ks_2samp(ls_ls[0], ls_ls[1])
    print("pvalue: %s" % (p))
    gj.cumulate_dist_plot(ls_ls=ls_ls,
                          ls_ls_label=[label1, label2],
                          bins=40000,
                          title=None,
                          ax=None,
                          savefn=TE1 + '.cumulate.mean.pdf',
                          xlabel=None,
                          ylabel=None,
                          add_vline=None,
                          add_hline=None,
                          log2transform=0,
                          xlim=[-5, 5],
                          ylim=None)

    df_merge.to_csv(savefn, header=True, index=False, sep='\t')

    _plot_TE_rep_scatter(df_merge, label1, savefn)
    _plot_TE_rep_scatter(df_merge, label2, savefn)


def _plot_TE_rep_scatter(df_merge, label, savefn):
    """Scatter replicate-1 vs replicate-2 log2(TE) of one label into a PDF."""
    x_col = 'log2(TE(%s))_x' % (label)
    y_col = 'log2(TE(%s))_y' % (label)

    fig, ax = plt.subplots(figsize=(8, 8))
    df_merge.plot(kind='scatter', x=x_col, y=y_col, ax=ax)
    r, p = stats.pearsonr(df_merge[x_col], df_merge[y_col])
    plt.title("r: %s, p:%s" % (r, p))

    # Use one common range for both axes so the diagonal is the identity line.
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    ax.set_ylim(lims)

    plt.tight_layout()
    plt.savefig(savefn.replace('.txt', '.%s.pdf' % (label)))
    plt.close()
def plot_gini_compare(save_dir=None):
    """Merge per-condition Gini tables and compare in-vivo vs in-vitro mRNA.

    Args:
        save_dir: directory holding the input tables; when None a hard-coded
            default directory is used.

    Input files are the '.txt' files in ``save_dir`` whose second
    dot-separated name field is one of 'T{0,1,2}t{0,20,200}'.  The condition
    is the third '_'-separated token of the first dot-separated field.  The
    tables are expected to provide the columns 'tx', 'null_pct_cutoff',
    'vivo' and 'vitro' (needed by the melt below).

    Outputs, all inside ``save_dir``:
        * 'vivo_vitro_gini_all.txt': all tables concatenated, with the added
          columns 'condition' and 'T_t_cutoff';
        * 'vivo_vitro_gini_all.select.png': box plot of Gini for
          'mRNA,vivo' vs 'mRNA,vitro' at null_pct_cutoff 0.4 and 'T1t20';
        * 'vivo_vitro_gini_all.select2.png': cumulative distributions of the
          same two groups.
    The two-sample KS p-value of the first two groups is printed.
    """
    if save_dir is None:
        save_dir = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/result/16-08-08_16_library_invivo_invitro/gini'
    T_t_ls = ['T%st%s' % (i, j) for i in [0, 1, 2] for j in [0, 20, 200]]
    fn_ls = [
        fn for fn in os.listdir(save_dir)
        if fn.endswith('.txt') and fn.split('.')[1] in T_t_ls
    ]
    print(fn_ls)

    df_ls = []
    for fn in fn_ls:
        fn_path = save_dir + '/' + fn
        print("process: %s" % (fn_path))
        df = pd.read_csv(fn_path, header=0, sep='\t')
        name_fields = fn.split('.')
        df['condition'] = name_fields[0].split('_')[2]
        df['T_t_cutoff'] = name_fields[1]

        print(df.head())

        df_ls.append(df)
    df_all = pd.concat(df_ls)
    savefn = save_dir + '/' + 'vivo_vitro_gini_all.txt'
    df_all.to_csv(savefn, index=False, header=True, sep='\t')
    print(df_all.head())

    # Long format: one row per (transcript, vivo|vitro) with its Gini value.
    # This definition had been commented out although the selection below
    # needs it, which raised a NameError.
    df_all_melt = pd.melt(
        df_all,
        id_vars=['tx', 'null_pct_cutoff', 'condition', 'T_t_cutoff'],
        value_vars=['vivo', 'vitro'],
        var_name='vivo/vitro',
        value_name='Gini')
    # Relabel to '<condition>,<vivo|vitro>', the form the selection matches.
    df_all_melt['condition'] = [
        '%s,%s' % (cond, where)
        for cond, where in zip(df_all_melt['condition'],
                               df_all_melt['vivo/vitro'])
    ]

    df_select = df_all_melt[(df_all_melt['null_pct_cutoff'] == 0.4)
                            & (df_all_melt['T_t_cutoff'] == 'T1t20')
                            & df_all_melt['condition'].isin(
                                ['mRNA,vivo', 'mRNA,vitro'])]
    print(df_select)

    fig, ax = plt.subplots(figsize=(4, 6))
    sns.boxplot(x='condition', y='Gini', data=df_select, ax=ax)
    # Rotate before tight_layout so the layout accounts for the rotated labels.
    ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=45)
    plt.tight_layout()
    plt.savefig(savefn.replace('.txt', '.select.png'))
    plt.close()

    condition_ls = []
    condition_gini_ls = []
    for condition in df_select['condition'].value_counts().keys():
        print(condition)
        condition_ls.append(condition)
        condition_gini_ls.append(
            list(df_select[df_select['condition'] == condition]['Gini']))

    # Computed outside the loop so the figure never lands on the '.txt' table
    # path when the selection is empty.
    gj.cumulate_dist_plot(ls_ls=condition_gini_ls,
                          ls_ls_label=condition_ls,
                          bins=40,
                          title=None,
                          ax=None,
                          savefn=savefn.replace('.txt', '.select2.png'),
                          xlabel=None,
                          ylabel=None,
                          add_vline=None,
                          add_hline=None,
                          log2transform=0)

    if len(condition_gini_ls) < 2:
        print("skip KS test: need 2 conditions, found %s" %
              (len(condition_gini_ls)))
        return
    _, pval = stats.ks_2samp(condition_gini_ls[0], condition_gini_ls[1])
    print(pval)