def degree_hist(dg_txt=None):
    """Write per-transcript interaction degrees and plot their distribution.

    The duplex-group file is loaded with ``read_dg_txt`` and every record
    contributes its two ends (``lchr`` / ``rchr``) as interaction partners of
    each other, grouped by the record's ``RRI_type``.  For the ``'inter'`` and
    ``'intra'`` types a tab-separated table is written next to ``dg_txt``
    (``*.inter.degree.txt`` / ``*.intra.degree.txt``) and a cumulative
    distribution of the number of distinct partners is plotted to the
    matching ``.pdf``.

    Args:
        dg_txt: path of the duplex-group text file; a hard-coded default is
            used when None.
    """
    if dg_txt is None:
        dg_txt = '/Share/home/zhangqf7/gongjing/zebrafish/data/paris/shi-zp-5-rep-combine/27-DG.txt'
    duplex_groups = read_dg_txt(dg_txt)
    transcripts = loadTransGtfBed2()

    # RRI type -> transcript -> list of partner transcripts (with repeats).
    partners_by_type = nested_dict(2, list)
    for group in duplex_groups.values():
        left, right = group['lchr'], group['rchr']
        partners = partners_by_type[group['RRI_type']]
        partners[left].append(right)
        if left != right:
            # Two different transcripts: record the reverse direction too.
            partners[right].append(left)

    for rri_type in ['inter', 'intra']:
        savefn = dg_txt.replace('.txt', '.%s.degree.txt' % (rri_type))
        all_degrees, mrna_degrees, lncrna_degrees = [], [], []
        with open(savefn, 'w') as out:
            for tx, partner_ls in partners_by_type[rri_type].items():
                tx_type = transcripts[tx]['type']
                uniq_partners = set(partner_ls)
                degree = len(uniq_partners)
                # Columns: transcript, type, # interactions, # distinct
                # partners, comma-joined distinct partners.
                fields = [tx, tx_type, len(partner_ls), degree,
                          ','.join(uniq_partners)]
                out.write('\t'.join(str(field) for field in fields) + '\n')
                all_degrees.append(degree)
                if tx_type == 'mRNA':
                    mrna_degrees.append(degree)
                elif tx_type == 'lncRNA':
                    lncrna_degrees.append(degree)

        degree_ls_ls = [all_degrees, mrna_degrees, lncrna_degrees]
        labels = [
            '%s, mean=%.2f' % (name, np.mean(degrees))
            for name, degrees in zip(['all', 'mRNA', 'lncRNA'], degree_ls_ls)
        ]
        gj.cumulate_dist_plot(ls_ls=degree_ls_ls,
                              ls_ls_label=labels,
                              bins=40,
                              title='degree distribution',
                              ax=None,
                              savefn=savefn.replace('.txt', '.pdf'),
                              xlabel='log2(# of interacting partners)',
                              ylabel=None,
                              add_vline=None,
                              add_hline=None,
                              log2transform=1,
                              xlim=None,
                              ylim=None)
def read_len_dist_all(
        savefn='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/read_len_dist_all.png'
):
    """Plot read-length statistics of all trimmed libraries in one figure.

    For every entry of ``library_info()['lib']['trimmed']`` (sample name ->
    file prefix) two files are read: ``<prefix>.len.txt`` (read count per
    read length) and ``<prefix>.trimlog``.  The figure has three stacked
    panels sharing the x axis: read counts per length, the cumulative
    distribution of read lengths, and the cumulative distribution of the
    positive ``cut_len`` values from the trim log.

    Args:
        savefn: path the figure is saved to.
    """
    gj.printFuncRun('read_len_dist_all')
    gj.printFuncArgs()

    trimmed_dict = library_info()['lib']['trimmed']
    print(trimmed_dict)

    _, (ax_counts, ax_len, ax_cut) = plt.subplots(3, 1, sharex=True,
                                                  figsize=(14, 16))
    # Return value is unused; the call is kept in case it has side effects.
    gj.sns_color_ls()

    sample_ls = []
    read_len_ls_ls = []
    read_cut_len_ls_ls = []
    for sample, prefix in trimmed_dict.items():
        sample_ls.append(sample)
        print('%s %s' % (sample, prefix))

        len_df = pd.read_csv(prefix + '.len.txt', sep=r'\s+', header=None)
        len_df.columns = ['# of reads', 'read length']
        len_df.plot(ax=ax_counts, x='read length', y='# of reads',
                    label=sample)

        trimlog_df = pd.read_csv(prefix + '.trimlog', header=None, sep=r'\s+')
        trimlog_df.columns = [
            'seq_name', 'sample_name', 'survive_len', 'survive_start',
            'survive_end', 'cut_len'
        ]
        # Only reads that actually lost bases contribute to the cut panel.
        cut_len = trimlog_df['cut_len']
        read_cut_len_ls_ls.append(list(cut_len[cut_len > 0]))

        # Expand the (length, count) table into one value per read.
        per_length = [
            [length] * count
            for length, count in zip(len_df['read length'],
                                     len_df['# of reads'])
        ]
        read_len_ls_ls.append(gj.ls_ls_flat(per_length))

    gj.cumulate_dist_plot(read_len_ls_ls,
                          ls_ls_label=sample_ls,
                          bins=40,
                          title=None,
                          ax=ax_len,
                          savefn=None,
                          xlabel='Length',
                          ylabel=None,
                          add_vline=None,
                          add_hline=None,
                          log2transform=0)
    gj.cumulate_dist_plot(read_cut_len_ls_ls,
                          ls_ls_label=sample_ls,
                          bins=40,
                          title=None,
                          ax=ax_cut,
                          savefn=None,
                          xlabel='Length',
                          ylabel=None,
                          add_vline=None,
                          add_hline=None,
                          log2transform=0)
    plt.tight_layout()
    plt.savefig(savefn)
    plt.close()

    gj.printFuncRun('read_len_dist_all')
def read_len_dist(
        fq='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/CHe-XC-M1K_S3_L005_R1_001.trimmed.fastq'
):
    """Tabulate and plot the read-length distribution of one FASTQ file.

    A shell pipeline writes ``<fq>.len.txt`` holding, for line 2 of every
    4-line group of ``fq`` (the sequence line of a 4-line FASTQ record), the
    number of reads per length.  That table and ``<fq>.trimlog`` are then
    plotted into ``<fq>.len.png``: the top panel shows read counts per length
    (line plus scatter), the bottom panel the cumulative distributions of
    read lengths and of positive ``cut_len`` values.

    Args:
        fq: path of the FASTQ file; ``<fq>.trimlog`` must exist alongside it.
    """
    gj.printFuncRun('read_len_dist')
    gj.printFuncArgs()

    fq_len_txt = fq + '.len.txt'
    # "%%" becomes a literal "%" (awk modulo) after string formatting.
    cmd = "awk '{if(NR%%4==2) print length($1)}' %s| sort|uniq -c|sort -k2,2n > %s " % (
        fq, fq_len_txt)
    subprocess.call([cmd], shell=True)

    len_df = pd.read_csv(fq_len_txt, sep=r'\s+', header=None)
    len_df.columns = ['# of reads', 'read length']
    print(len_df[['read length', '# of reads']])

    _, (ax_counts, ax_cdf) = plt.subplots(2, 1, sharex=True)
    len_df.plot(ax=ax_counts, x='read length', y='# of reads')
    len_df.plot(kind='scatter', ax=ax_counts, x='read length',
                y='# of reads')

    trimlog_df = pd.read_csv(fq + '.trimlog', header=None, sep=r'\s+')
    trimlog_df.columns = [
        'seq_name', 'sample_name', 'survive_len', 'survive_start',
        'survive_end', 'cut_len'
    ]
    # Keep only reads that actually had bases removed.
    cut_len = trimlog_df['cut_len']
    cut_len_ls = list(cut_len[cut_len > 0])

    # Expand the (length, count) table into one value per read.
    per_length = [
        [length] * count
        for length, count in zip(len_df['read length'], len_df['# of reads'])
    ]
    read_len_ls = gj.ls_ls_flat(per_length)

    gj.cumulate_dist_plot(
        [read_len_ls, cut_len_ls],
        ls_ls_label=['kethoxal read length', 'kethoxal read cut length'],
        bins=40,
        title=None,
        ax=ax_cdf,
        savefn=None,
        xlabel='Length',
        ylabel=None,
        add_vline=None,
        add_hline=None,
        log2transform=0)
    plt.tight_layout()
    plt.savefig(fq + '.len.png')
    plt.close()

    gj.printFuncRun('read_len_dist')
def main():
    """Run the kethoxal-vs-icSHAPE correlation analysis and its plots.

    ``compare_corr`` is run on the no-treatment kethoxal-seq output, the
    second-to-last element of its result is drawn as a cumulative
    distribution, and ``tx_corr_scatter`` is called afterwards.
    """
    kethoxal_out = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/16-11-14_7_library_total_Kethoxal_remove/kethoxalseq_noTreat.out'
    corr_txt = '/Share2/home/zhangqf5/gongjing/Kethoxal_RNA_structure/Keth-seq/results/F2b.kethoxal-notreat.icshape.corr.txt'
    corr_pdf = '/Share2/home/zhangqf5/gongjing/Kethoxal_RNA_structure/Keth-seq/results/F2b.kethoxal_icshape_corr.pdf'

    notreat = compare_corr(out1=kethoxal_out,
                           savefn=corr_txt,
                           label='kethoxalseq:icSHAPE')

    # NOTE(review): notreat[-2] is presumably the list of per-transcript
    # correlation values -- confirm against compare_corr's return value.
    gj.cumulate_dist_plot(ls_ls=[notreat[-2]],
                          ls_ls_label=['kethoxal vs icshape'],
                          bins=40,
                          title=None,
                          ax=None,
                          savefn=corr_pdf,
                          xlabel=None,
                          ylabel=None,
                          add_vline=None,
                          add_hline=None,
                          log2transform=0)

    tx_corr_scatter()
from glob import glob

# Command line: <input table> [<input table> ...] <output figure>
# Every input is a header-less, tab-separated table; its column 4 (0-based)
# becomes one curve of a cumulative distribution plot labelled with the
# input's file name.
file_ls = sys.argv[1:-1]
savefn = sys.argv[-1]
print(' '.join(["input: ", str(file_ls)]))
print(' '.join(["output: ", str(savefn)]))

ls_ls = []
ls_ls_label = []
for sample_fn in file_ls:
    # Only the literal 'n/a' is treated as missing; rows containing it are
    # dropped before the values are collected.
    df = pd.read_csv(sample_fn,
                     header=None,
                     sep='\t',
                     keep_default_na=False,
                     na_values=['n/a'])
    df = df.dropna(axis=0, how='any')
    ls_ls.append(list(df[4]))
    ls_ls_label.append(sample_fn.rsplit('/', 1)[-1])

gj.cumulate_dist_plot(ls_ls=ls_ls,
                      ls_ls_label=ls_ls_label,
                      bins=40,
                      title=None,
                      ax=None,
                      savefn=savefn,
                      xlabel=None,
                      ylabel='',
                      add_vline=[0.6, 0.7],
                      add_hline=[0, 0.3, 0.4, 1],
                      log2transform=0,
                      xlim=[-0.05, 1.05],
                      ylim=[-0.05, 1.05])
def TE_rep_corr(TE1, TE2, savefn, label1='control', label2='RK33'):
    """Compare translation-efficiency (TE) tables of two replicates.

    Both inputs are tab-separated tables with a header.  The function
      1. plots the cumulative distributions of ``log2(TE(label))`` for both
         labels to ``TE1 + '.cumulate.png'``;
      2. inner-joins the replicates on ``transcript``, averages ``TE(label)``
         over the replicates, log2-transforms the means, prints the value
         returned by ``gj.ks_2samp`` for the two labels and plots their
         cumulative distributions to ``TE1 + '.cumulate.mean.pdf'``;
      3. writes the merged table to ``savefn``;
      4. for each label draws a replicate-vs-replicate scatter plot of
         ``log2(TE(label))`` with the Pearson r / p in the title and an
         identity line, saved to ``savefn`` with ``.txt`` replaced by
         ``.<label>.pdf``.

    Args:
        TE1: path of the first replicate's TE table.
        TE2: path of the second replicate's TE table.
        savefn: output path of the merged table (expected to end in .txt).
        label1: first condition name used inside the column names.
        label2: second condition name used inside the column names.
    """
    df_TE1 = pd.read_csv(TE1, header=0, sep='\t')
    df_TE2 = pd.read_csv(TE2, header=0, sep='\t')
    labels = [label1, label2]

    # This figure is never drawn on explicitly; it is kept because the
    # plotting helper below is called with ax=None.
    fig, ax = plt.subplots()
    # NOTE(review): both curves are taken from the second replicate although
    # the figure is saved under the TE1 name -- confirm this is intended.
    ls_ls = [list(df_TE2['log2(TE(%s))' % (label)]) for label in labels]
    gj.cumulate_dist_plot(ls_ls=ls_ls,
                          ls_ls_label=labels,
                          bins=40000,
                          title=None,
                          ax=None,
                          savefn=TE1 + '.cumulate.png',
                          xlabel=None,
                          ylabel=None,
                          add_vline=None,
                          add_hline=None,
                          log2transform=0,
                          xlim=None,
                          ylim=None)

    # Columns present in both tables get the pandas suffixes _x (TE1) and
    # _y (TE2) after the merge.
    df_merge = df_TE1.merge(df_TE2, on='transcript', how='inner')
    # Two separate passes keep the column order of the written table:
    # both mean columns first, then both log2 columns.
    for label in labels:
        replicate_sum = (df_merge['TE(%s)_x' % (label)] +
                         df_merge['TE(%s)_y' % (label)])
        df_merge['mean(TE(%s))' % (label)] = replicate_sum / 2.0
    for label in labels:
        df_merge['log2(mean(TE(%s)))' % (label)] = np.log2(
            df_merge['mean(TE(%s))' % (label)])

    ls_ls = [list(df_merge['log2(mean(TE(%s)))' % (label)])
             for label in labels]
    p = gj.ks_2samp(ls_ls[0], ls_ls[1])
    print("pvalue: %s" % (p))
    gj.cumulate_dist_plot(ls_ls=ls_ls,
                          ls_ls_label=labels,
                          bins=40000,
                          title=None,
                          ax=None,
                          savefn=TE1 + '.cumulate.mean.pdf',
                          xlabel=None,
                          ylabel=None,
                          add_vline=None,
                          add_hline=None,
                          log2transform=0,
                          xlim=[-5, 5],
                          ylim=None)

    df_merge.to_csv(savefn, header=True, index=False, sep='\t')

    # One replicate-vs-replicate scatter plot per label.
    for label in labels:
        x_col = 'log2(TE(%s))_x' % (label)
        y_col = 'log2(TE(%s))_y' % (label)

        fig, ax = plt.subplots(figsize=(8, 8))
        df_merge.plot(kind='scatter', x=x_col, y=y_col, ax=ax)
        r, p = stats.pearsonr(df_merge[x_col], df_merge[y_col])
        plt.title("r: %s, p:%s" % (r, p))

        # Use one common range for both axes and draw the identity line.
        axis_ranges = [ax.get_xlim(), ax.get_ylim()]
        lims = [np.min(axis_ranges), np.max(axis_ranges)]
        ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
        ax.set_aspect('equal')
        ax.set_xlim(lims)
        ax.set_ylim(lims)

        plt.tight_layout()
        plt.savefig(savefn.replace('.txt', '.%s.pdf' % (label)))
        plt.close()
def plot_gini_compare(save_dir=None):
    """Combine per-condition Gini tables and compare in vivo vs in vitro.

    Every ``*.txt`` file in ``save_dir`` whose second dot-separated name part
    is one of ``T{0,1,2}t{0,20,200}`` is read as a tab-separated table with a
    header.  The third ``_``-separated token of the first name part is stored
    as ``condition`` and the second name part as ``T_t_cutoff``.  All tables
    are concatenated and written to ``vivo_vitro_gini_all.txt`` in
    ``save_dir``.

    The combined table is then melted to long format (one row per transcript
    and vivo/vitro value) and restricted to ``null_pct_cutoff == 0.4``,
    ``T_t_cutoff == 'T1t20'`` and the mRNA conditions.  For that selection a
    box plot (``*.select.png``) and a cumulative distribution plot
    (``*.select2.png``) are saved, and the two-sample KS p-value between the
    two groups is printed.

    The long-format table used to be built only inside commented-out code,
    so the selection step failed with NameError; building it is restored
    here.  The facet-grid overview plots that were also commented out in the
    original remain disabled.

    Args:
        save_dir: directory holding the per-condition Gini tables; a
            hard-coded default is used when None.
    """
    if save_dir is None:
        save_dir = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/result/16-08-08_16_library_invivo_invitro/gini'

    T_t_ls = ['T%st%s' % (i, j) for i in [0, 1, 2] for j in [0, 20, 200]]
    fn_ls = [
        fn for fn in os.listdir(save_dir)
        if fn.endswith('.txt') and fn.split('.')[1] in T_t_ls
    ]
    print(fn_ls)

    df_ls = []
    for fn in fn_ls:
        fn_path = save_dir + '/' + fn
        print("process: %s" % (fn_path))
        df = pd.read_csv(fn_path, header=0, sep='\t')
        name_parts = fn.split('.')
        df['condition'] = name_parts[0].split('_')[2]
        df['T_t_cutoff'] = name_parts[1]
        print(df.head())
        df_ls.append(df)

    df_all = pd.concat(df_ls)
    savefn = save_dir + '/' + 'vivo_vitro_gini_all.txt'
    df_all.to_csv(savefn, index=False, header=True, sep='\t')
    print(df_all.head())

    # Long format: one row per (transcript, vivo|vitro) with the value in
    # 'Gini'.  Column names are taken from the previously commented-out
    # code -- assumes the input tables provide 'tx', 'null_pct_cutoff',
    # 'vivo' and 'vitro'; TODO confirm against the files.
    df_all_melt = pd.melt(
        df_all,
        id_vars=['tx', 'null_pct_cutoff', 'condition', 'T_t_cutoff'],
        value_vars=['vivo', 'vitro'],
        var_name='vivo/vitro',
        value_name='Gini')
    # The selection below expects conditions such as 'mRNA,vivo'.
    df_all_melt['condition'] = [
        '%s,%s' % (cond, source)
        for cond, source in zip(df_all_melt['condition'],
                                df_all_melt['vivo/vitro'])
    ]

    df_select = df_all_melt[
        (df_all_melt['null_pct_cutoff'] == 0.4)
        & (df_all_melt['T_t_cutoff'] == 'T1t20')
        & df_all_melt['condition'].isin(['mRNA,vivo', 'mRNA,vitro'])]
    print(df_select)

    fig, ax = plt.subplots(figsize=(4, 6))
    sns.boxplot(x='condition', y='Gini', data=df_select, ax=ax)
    plt.tight_layout()
    ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=45)
    plt.savefig(savefn.replace('.txt', '.select.png'))
    plt.close()

    # One list of Gini values per condition, most frequent condition first.
    condition_ls = []
    condition_gini_ls = []
    for cond in df_select['condition'].value_counts().keys():
        print(cond)
        condition_ls.append(cond)
        condition_gini_ls.append(
            list(df_select[df_select['condition'] == cond]['Gini']))

    gj.cumulate_dist_plot(ls_ls=condition_gini_ls,
                          ls_ls_label=condition_ls,
                          bins=40,
                          title=None,
                          ax=None,
                          savefn=savefn.replace('.txt', '.select2.png'),
                          xlabel=None,
                          ylabel=None,
                          add_vline=None,
                          add_hline=None,
                          log2transform=0)

    # The KS test needs both groups; report instead of failing with
    # IndexError when one of them is missing from the selection.
    if len(condition_gini_ls) < 2:
        print("KS test skipped: found %s of 2 conditions" %
              (len(condition_gini_ls)))
        return
    stat, pval = stats.ks_2samp(condition_gini_ls[0], condition_gini_ls[1])
    print(pval)