def read_fa(fa='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10_transcriptome.fa'):
    """Open the FASTA file ``fa`` with ``Fasta`` and return the resulting object.

    Each record is keyed by the part of its header that precedes the first
    tab character. The first three keys are printed as a quick sanity check.
    """
    gj.printFuncRun('read_fa')
    gj.printFuncArgs()

    def first_tab_field(key):
        # Keep only the text before the first tab of the FASTA header.
        return key.split("\t")[0]

    records = Fasta(fa, key_fn=first_tab_field)
    leading_keys = records.keys()[0:3]
    print(leading_keys)

    gj.printFuncRun('read_fa')
    return records
def RT_combine(rt1=None, rt2=None, rt_comb=None):
    """Merge two RT replicate files with icSHAPE's combineRTreplicates.pl.

    rt1, rt2 -- input RT files, passed to the script as ``-i rt1:rt2``.
    rt_comb  -- output path, passed as ``-o``.

    The exit status of the script is not checked.
    """
    gj.printFuncRun('RT_combine')
    gj.printFuncArgs()

    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/combineRTreplicates.pl'
    command = "%s -i %s:%s -o %s" % (script, rt1, rt2, rt_comb)
    subprocess.call([command], shell=True)

    gj.printFuncRun('RT_combine')
def mapping_PE(fastq1=None, fastq2=None, mapper='bowtie2', index_dir=None):
    """Map a paired-end library with bowtie2: rRNA first, then transcriptome.

    fastq1, fastq2 -- mate files; output names are derived from ``fastq1``
                      by replacing 'paired.clip.fastq' / 'fastq'.
    mapper, index_dir -- accepted but not used by this function.

    Step 1 aligns both mates to the rRNA index and asks bowtie2 to write the
    pairs that did not align concordantly to ``--un-conc``. Step 2 aligns the
    '.rRNAUnmap.1.fastq' / '.rRNAUnmap.2.fastq' files to the mm10
    transcriptome index.
    NOTE(review): step 2 relies on bowtie2 inserting '.1' / '.2' before the
    extension of the --un-conc path -- confirm against the bowtie2 manual.
    """
    gj.printFuncRun('mapping_PE')
    gj.printFuncArgs()

    transcriptome_sam = fastq1.replace('paired.clip.fastq', 'sam')
    rrna_sam = fastq1.replace('paired.clip.fastq', 'rRNA.sam')

    # Pass 1: map to rRNA, keep the non-concordant pairs for pass 2.
    rrna_index = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/rRNA/index/rrna_uniq'
    unmapped_template = fastq1.replace('fastq', 'rRNAUnmap.fastq')
    rrna_command = "bowtie2 -p 12 -1 %s -2 %s -x %s -S %s --non-deterministic --time --un-conc %s --no-unal" % (
        fastq1, fastq2, rrna_index, rrna_sam, unmapped_template)
    subprocess.call([rrna_command], shell=True)

    # Pass 2: map the rRNA-depleted mates to the transcriptome.
    transcriptome_index = '/Share/home/zhangqf/database/GenomeAnnotation/INDEX/Bowtie2/mm10/Gencode_transcriptome/whole_transcriptome/mm10'
    # Both mate names are intentionally derived from fastq1, like the template.
    unmapped_mate1 = fastq1.replace('fastq', 'rRNAUnmap.1.fastq')
    unmapped_mate2 = fastq1.replace('fastq', 'rRNAUnmap.2.fastq')
    transcriptome_command = "bowtie2 -p 12 -1 %s -2 %s -x %s -S %s --non-deterministic --time --no-unal" % (
        unmapped_mate1, unmapped_mate2, transcriptome_index, transcriptome_sam)
    subprocess.call([transcriptome_command], shell=True)

    gj.printFuncRun('mapping_PE')
def remove_adapter_PE_new(fastq1=None, fastq2=None):
    """Adapter-trim a paired-end library with Trimmomatic, then clip 13 nt.

    fastq1, fastq2 -- raw mate files. All output names are derived from them
                      by replacing 'fastq' (e.g. '*.paired.fastq',
                      '*.unpaired.fastq', '*.paired.clip.fastq'); the trim
                      log name replaces '_R1_001.fastq' in ``fastq1``.

    Steps:
      1. Trimmomatic PE writes paired / unpaired files for each mate.
      2. cutadapt ``-u 13`` is run on the paired R1 file and ``-u -13`` on
         the paired R2 file, producing the '*.paired.clip.fastq' files.

    The cutadapt input used to be ``fastq.replace('.fastq', 'trimmed.fastq')``,
    a path no step of this function creates (and which lacks the '.'
    separator). The Trimmomatic paired outputs are used instead, matching the
    names the clip outputs are derived from.

    Exit statuses of the external tools are not checked.
    """
    gj.printFuncRun('remove_adapter_PE_new')
    gj.printFuncArgs()

    adapter_fa = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/data/adapter/kethoxal_PE.fa'
    trimmomatic = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/bin/trimmomatic-0.30.jar'

    R1_paired = fastq1.replace('fastq', 'paired.fastq')
    R1_unpaired = fastq1.replace('fastq', 'unpaired.fastq')
    R2_paired = fastq2.replace('fastq', 'paired.fastq')
    R2_unpaired = fastq2.replace('fastq', 'unpaired.fastq')
    trimlog = fastq1.replace('_R1_001.fastq', '.trim.log')

    trim_command = "java -jar %s PE -threads 32 -phred33 -trimlog %s %s %s %s %s %s %s ILLUMINACLIP:%s:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:33" % (
        trimmomatic, trimlog, fastq1, fastq2, R1_paired, R1_unpaired,
        R2_paired, R2_unpaired, adapter_fa)
    subprocess.call([trim_command], shell=True)

    clip_fastq1 = R1_paired.replace('fastq', 'clip.fastq')
    clip_fastq2 = R2_paired.replace('fastq', 'clip.fastq')

    # Clip the surviving pairs: 13 nt from the start of R1, 13 nt from the
    # end of R2 (cutadapt -u with a positive / negative length).
    subprocess.call(
        ["cutadapt -u 13 -o %s %s" % (clip_fastq1, R1_paired)], shell=True)
    subprocess.call(
        ["cutadapt -u -13 -o %s %s" % (clip_fastq2, R2_paired)], shell=True)

    gj.printFuncRun('remove_adapter_PE_new')
def read_tmp_out(tmp_out=None,file_str=None,sample=None):
    """Count, per base, the positions with a non-zero enrichment score.

    tmp_out  -- tab-separated file; column 0 is the transcript id, column 1
                its length, and columns 4+ hold comma-separated values whose
                first element is the score (or "NULL").
    file_str -- output path; one ``base<TAB>count`` line is written per base.
    sample   -- accepted but not used.

    The reference comes from ``read_fa()`` with its default path. The process
    exits if a transcript length differs from the reference length. Bases are
    upper-cased before counting.
    """
    gj.printFuncRun('read_tmp_out')
    gj.printFuncArgs()

    fa_dict = read_fa()
    tx_base_pos_dict = nested_dict(2, list)  # {tx:{'A':[pos1,pos2],'T':[]}}
    base_enrich_dict = nested_dict(1, int)

    with open(tmp_out, 'r') as TMP_OUT:
        for raw_line in TMP_OUT:
            record = raw_line.strip()
            if not record or record.startswith('#'):
                continue
            fields = record.split('\t')
            transcript_id = fields[0]
            transcript_len = int(fields[1])
            if transcript_len != len(fa_dict[transcript_id]):
                message = "transcirpt length not conistent with reference: %s, tmp_out len: %s, reference len: %s" % (
                    transcript_id, transcript_len, len(fa_dict[transcript_id]))
                print(message)
                sys.exit()
            for position, score_field in enumerate(fields[4:]):
                score = score_field.split(',')[0]
                # Skip positions without a score or with a score of zero.
                if score == "NULL" or float(score) == 0:
                    continue
                base = fa_dict[transcript_id][position]
                tx_base_pos_dict[transcript_id][base].append(position)
                base_enrich_dict[base.upper()] += 1

    print(base_enrich_dict)

    TXT = open(file_str, 'w')
    for base, count in base_enrich_dict.items():
        TXT.write(base + '\t' + str(count) + '\n')
    TXT.close()

    gj.printFuncRun('read_tmp_out')
def read_bed(bed=None, fa=None):
    """Tally the reference base just before the start of each '+' BED record.

    bed -- tab-separated file with transcript id, start, end in columns 0-2
           and strand in column 5.
    fa  -- FASTA path handed to ``read_fa``.

    For '+' records the base at index ``start - 1`` of the transcript is
    counted; '-' records are skipped; any other strand value aborts the
    process. Counts are printed and written as ``base<TAB>count`` lines to
    the path obtained by replacing 'bed' with 'base.txt' in ``bed``.
    """
    gj.printFuncRun('read_bed')
    gj.printFuncArgs()

    base_dict = nested_dict(1, int)
    fa_dict = read_fa(fa)

    with open(bed, 'r') as BED:
        for line_no, raw_line in enumerate(BED):
            record = raw_line.strip()
            if not record or record.startswith('#'):
                continue
            # Progress message every million input lines.
            if line_no % 1000000 == 0:
                print("process: %s" % (line_no))
            fields = record.split('\t')
            tx_id = fields[0]
            tx_start = int(fields[1])
            tx_end = int(fields[2])  # parsed (and so validated) but not used
            strand = fields[5]
            if strand == "-":
                # Minus-strand records are ignored.
                continue
            if strand != "+":
                print("unknown strand: %s" % (strand))
                sys.exit()
            base = fa_dict[tx_id][tx_start - 1]
            base_dict[base] += 1

    print(base_dict)

    bed_base_txt = bed.replace('bed', 'base.txt')
    with open(bed_base_txt, 'w') as TXT:
        for base, count in base_dict.items():
            TXT.write(base + '\t' + str(count) + '\n')

    gj.printFuncRun('read_bed')
def inner_distance(sam=None, output_prefix=None):
    """Run ``inner_distance.py`` on ``sam`` against the mm10 bed12 annotation.

    sam           -- alignment file passed as ``-i``.
    output_prefix -- prefix passed as ``-o``.

    The exit status of the command is not checked.
    """
    gj.printFuncRun('inner_distance')
    gj.printFuncArgs()

    annotation = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10.transCoor.bed12'
    command = "inner_distance.py -i %s -o %s -r %s" % (sam, output_prefix, annotation)
    subprocess.call([command], shell=True)

    gj.printFuncRun('inner_distance')
def read_fa(fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'):
    """Load the FASTA file ``fa`` into a plain dict and return it.

    The file is opened with ``Fasta`` (records keyed by the header text
    before the first tab); each record is then re-keyed by the first
    whitespace-separated token of that key and stored as ``record[0:]``.
    The first three keys are printed as a quick sanity check.
    """
    gj.printFuncRun('read_fa')
    gj.printFuncArgs()

    def first_tab_field(key):
        # Keep only the text before the first tab of the FASTA header.
        return key.split("\t")[0]

    raw_records = Fasta(fa, key_fn=first_tab_field)

    fa_dict = {}
    for name, record in raw_records.items():
        fa_dict[name.split()[0]] = record[0:]

    leading_keys = fa_dict.keys()[0:3]
    print(leading_keys)

    gj.printFuncRun('read_fa')
    return fa_dict
def read_len_dist_all(
        savefn='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/read_len_dist_all.png'
):
    """Plot read-length and cut-length distributions for every trimmed library.

    Libraries come from ``library_info()['lib']['trimmed']`` (sample name ->
    path). For each path, '<path>.len.txt' (count, read length) and
    '<path>.trimlog' are read. The figure has three stacked panels: read
    counts per length, the cumulative read-length distribution, and the
    cumulative distribution of positive cut lengths. It is saved to
    ``savefn``.
    """
    gj.printFuncRun('read_len_dist_all')
    gj.printFuncArgs()

    library_info_dict = library_info()
    trimmed_dict = library_info_dict['lib']['trimmed']
    print(trimmed_dict)

    read_len_ls_ls = []
    read_cut_len_ls_ls = []
    fig, ax = plt.subplots(3, 1, sharex=True, figsize=(14, 16))
    color_ls = gj.sns_color_ls()  # result unused; call kept as in the original
    sample_ls = []

    trimlog_columns = [
        'seq_name', 'sample_name', 'survive_len', 'survive_start',
        'survive_end', 'cut_len'
    ]

    for sample, fq_path in trimmed_dict.items():
        sample_ls.append(sample)
        print("%s %s" % (sample, fq_path))

        length_table = pd.read_csv(fq_path + '.len.txt', sep=r'\s+', header=None)
        length_table.columns = ['# of reads', 'read length']
        length_table.plot(ax=ax[0], x='read length', y='# of reads', label=sample)

        trim_table = pd.read_csv(fq_path + '.trimlog', header=None, sep=r'\s+')
        trim_table.columns = trimlog_columns
        trimmed_only = trim_table[trim_table['cut_len'] > 0]
        cut_len_ls = list(trimmed_only['cut_len'])

        # Expand the (length, count) table back into one entry per read.
        repeated = [
            [read_len] * read_count
            for read_len, read_count in zip(length_table['read length'],
                                            length_table['# of reads'])
        ]
        read_len_ls_ls.append(gj.ls_ls_flat(repeated))
        read_cut_len_ls_ls.append(cut_len_ls)

    gj.cumulate_dist_plot(read_len_ls_ls,
                          ls_ls_label=sample_ls,
                          bins=40,
                          title=None,
                          ax=ax[1],
                          savefn=None,
                          xlabel='Length',
                          ylabel=None,
                          add_vline=None,
                          add_hline=None,
                          log2transform=0)
    gj.cumulate_dist_plot(read_cut_len_ls_ls,
                          ls_ls_label=sample_ls,
                          bins=40,
                          title=None,
                          ax=ax[2],
                          savefn=None,
                          xlabel='Length',
                          ylabel=None,
                          add_vline=None,
                          add_hline=None,
                          log2transform=0)

    plt.tight_layout()
    plt.savefig(savefn)
    plt.close()

    gj.printFuncRun('read_len_dist_all')
def read_clean_map_rt(fastq):
    """Run the single-end chain: collapse, trim, map, RPKM, RT.

    Each step receives the path returned by the previous one; the RPKM and
    RT steps both work on the SAM path returned by ``mapping``. Nothing is
    returned.
    """
    gj.printFuncRun('read_clean_map_rt')
    gj.printFuncArgs()

    deduplicated = read_collapse(fastq=fastq)
    adapter_free = remove_adapter(fastq=deduplicated)
    alignments = mapping(fastq=adapter_free)

    # RPKM is computed before RT, as in the original call order.
    rpkm_cal(sam=alignments)
    RT_cal(sam=alignments)

    gj.printFuncRun('read_clean_map_rt')
def rpkm_cal(sam=None):
    """Run icSHAPE's estimateRPKM.pl on ``sam`` and return the output path.

    The output path is ``sam`` with every occurrence of 'sam' replaced by
    'rpkm'. The exit status of the script is not checked.
    """
    gj.printFuncRun('rpkm_cal')
    gj.printFuncArgs()

    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/estimateRPKM.pl'
    rpkm_path = sam.replace('sam', 'rpkm')
    command = "%s -i %s -o %s" % (script, sam, rpkm_path)
    subprocess.call([command], shell=True)

    gj.printFuncRun('rpkm_cal')
    return rpkm_path
def get_dir_fastq(dir='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/16-03-25_library_of_concentration'):
    """Return the paths of the fastq files found directly inside ``dir``.

    A file qualifies when its name ends with 'fastq' and does not contain
    'Undetermined'. The matching file names are printed; the returned paths
    are built as ``dir + '/' + name``.
    """
    gj.printFuncRun('get_dir_fastq')
    gj.printFuncArgs()

    selected_names = []
    for entry in os.listdir(dir):
        if entry.endswith('fastq') and 'Undetermined' not in entry:
            selected_names.append(entry)
    print(selected_names)

    fastq_paths = [dir + '/' + entry for entry in selected_names]

    gj.printFuncRun('get_dir_fastq')
    return fastq_paths
def FPKM_count(bam=None, output_prefix=None, rRNA=0):
    """Run ``FPKM_count.py`` on ``bam`` with an mm10 bed12 annotation.

    bam           -- alignment file passed as ``-i``.
    output_prefix -- prefix passed as ``-o``.
    rRNA          -- when truthy, the rRNA bed12 is used instead of the
                     transcript-coordinate bed12.

    The exit status of the command is not checked.
    """
    gj.printFuncRun('FPKM_count')
    gj.printFuncArgs()

    transcript_bed12 = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10.transCoor.bed12'
    rrna_bed12 = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10.rRNA.bed12'
    annotation = rrna_bed12 if rRNA else transcript_bed12

    command = "FPKM_count.py -i %s -o %s -r %s" % (bam, output_prefix, annotation)
    subprocess.call([command], shell=True)

    gj.printFuncRun('FPKM_count')
def RT_cal(sam=None):
    """Run icSHAPE's calcRT.pl on ``sam`` and return the RT output path.

    Both the output path and the RPKM input path (``-r``) are derived from
    ``sam`` by replacing every 'sam' with 'rt' / 'rpkm'. The exit status of
    the script is not checked.
    """
    gj.printFuncRun('RT_cal')
    gj.printFuncArgs()

    rt_path = sam.replace('sam', 'rt')
    rpkm_path = sam.replace('sam', 'rpkm')
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/calcRT.pl'
    command = "%s -i %s -o %s -r %s -c 1" % (script, sam, rt_path, rpkm_path)
    subprocess.call([command], shell=True)

    gj.printFuncRun('RT_cal')
    return rt_path
def read_collapse(fastq=None):
    """Run icSHAPE's readCollapse.pl on ``fastq`` and return the output path.

    The collapsed fastq (``-o``) is named by replacing 'fastq' with
    'rmdup.fastq'; the ``-f`` file by replacing 'fastq' with 'fa'. The exit
    status of the script is not checked.
    """
    gj.printFuncRun('read_collapse')
    gj.printFuncArgs()

    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/readCollapse.pl'
    collapsed_path = fastq.replace('fastq', 'rmdup.fastq')
    freq_fa_path = fastq.replace('fastq', 'fa')
    command = "%s -U %s -o %s -f %s" % (script, fastq, collapsed_path, freq_fa_path)
    subprocess.call([command], shell=True)

    gj.printFuncRun('read_collapse')
    return collapsed_path
def RT_normalize(rt=None):
    """Run icSHAPE's normalizeRTfile.pl on ``rt``.

    The output name is ``rt`` with '.rt' replaced by '.normalized.rt' and
    then 'RT' replaced by 'normalized.RT'. The script is called with
    ``-m mean:vigintile2 -d 32 -l 32``. Nothing is returned and the exit
    status is not checked.
    """
    gj.printFuncRun('RT_normalize')
    gj.printFuncArgs()

    renamed = rt.replace('.rt', '.normalized.rt')
    normalized_path = renamed.replace('RT', 'normalized.RT')
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/normalizeRTfile.pl'
    command = "%s -i %s -o %s -m mean:vigintile2 -d 32 -l 32" % (script, rt, normalized_path)
    subprocess.call([command], shell=True)

    gj.printFuncRun('RT_normalize')
def calc_enrich(f_normalized_rt=None, b_normalized_rt=None, icshape_tmp_out=None, x=0.25):
    """Run icSHAPE's calcEnrich.pl on a pair of normalized RT files.

    f_normalized_rt -- passed as ``-f``.
    b_normalized_rt -- passed as ``-b``.
    icshape_tmp_out -- output path passed as ``-o``.
    x               -- value passed as ``-x``.

    The script is always called with ``-w factor5:scaling1``; its exit
    status is not checked.
    """
    gj.printFuncRun('calc_enrich')
    gj.printFuncArgs()

    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/calcEnrich.pl'
    command = "%s -f %s -b %s -o %s -w factor5:scaling1 -x %s" % (
        script, f_normalized_rt, b_normalized_rt, icshape_tmp_out, x)
    subprocess.call([command], shell=True)

    gj.printFuncRun('calc_enrich')
def read_rpkm_txt(txt, min_val=-1):
    """Parse a tab-separated RPKM table.

    txt     -- input path; blank lines and lines starting with '#' are
               skipped.
    min_val -- accepted but not used.

    Returns ``(val_dict, gene_ls)``: a mapping from column 0 to the float in
    column 4, and the list of column-0 values in file order.
    """
    gj.printFuncRun('read_rpkm_txt')
    gj.printFuncArgs()

    val_dict = nested_dict()
    gene_ls = []
    with open(txt, 'r') as TXT:
        for raw_line in TXT:
            record = raw_line.strip()
            if record and not record.startswith('#'):
                fields = record.split('\t')
                gene_id = fields[0]
                val_dict[gene_id] = float(fields[4])
                gene_ls.append(gene_id)

    gj.printFuncRun('read_rpkm_txt')
    return val_dict, gene_ls
def RT_correlation(rt1=None, rt2=None, rt_corr=None, coverage_cutoff=0, background_base_density=0):
    """Run icSHAPE's correlationRT.pl and redirect its stdout to ``rt_corr``.

    rt1, rt2                -- RT files passed as ``-1`` and ``-2``.
    coverage_cutoff         -- value passed as ``-T``.
    background_base_density -- value passed as ``-b``.

    The exit status of the script is not checked.
    """
    gj.printFuncRun('RT_correlation')
    gj.printFuncArgs()

    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/correlationRT.pl'
    command = "%s -1 %s -2 %s -T %s -b %s > %s" % (
        script, rt1, rt2, coverage_cutoff, background_base_density, rt_corr)
    subprocess.call([command], shell=True)

    gj.printFuncRun('RT_correlation')
def read_len_dist(
        fq='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/CHe-XC-M1K_S3_L005_R1_001.trimmed.fastq'
):
    """Plot the read-length and cut-length distributions of one fastq file.

    A shell pipeline (awk | sort | uniq -c | sort) writes the number of
    reads per length to '<fq>.len.txt'. That table and '<fq>.trimlog' are
    then read with pandas. The top panel shows read counts per length (line
    plus scatter); the bottom panel shows the cumulative distributions of
    read lengths and of positive cut lengths. The figure is saved to
    '<fq>.len.png'.
    """
    gj.printFuncRun('read_len_dist')
    gj.printFuncArgs()

    fq_len_txt = fq + '.len.txt'
    # '%%' is a literal '%' for awk once the Python formatting is applied.
    count_command = "awk '{if(NR%%4==2) print length($1)}' %s| sort|uniq -c|sort -k2,2n > %s " % (
        fq, fq_len_txt)
    subprocess.call([count_command], shell=True)

    length_table = pd.read_csv(fq_len_txt, sep=r'\s+', header=None)
    length_table.columns = ['# of reads', 'read length']
    print(length_table[['read length', '# of reads']])

    fig, ax = plt.subplots(2, 1, sharex=True)
    length_table.plot(ax=ax[0], x='read length', y='# of reads')
    length_table.plot(kind='scatter', ax=ax[0], x='read length', y='# of reads')

    trim_table = pd.read_csv(fq + '.trimlog', header=None, sep=r'\s+')
    trim_table.columns = [
        'seq_name', 'sample_name', 'survive_len', 'survive_start',
        'survive_end', 'cut_len'
    ]
    trimmed_only = trim_table[trim_table['cut_len'] > 0]
    cut_len_ls = list(trimmed_only['cut_len'])

    # Expand the (length, count) table back into one entry per read.
    repeated = [
        [read_len] * read_count
        for read_len, read_count in zip(length_table['read length'],
                                        length_table['# of reads'])
    ]
    read_len_ls = gj.ls_ls_flat(repeated)

    gj.cumulate_dist_plot(
        [read_len_ls, cut_len_ls],
        ls_ls_label=['kethoxal read length', 'kethoxal read cut length'],
        bins=40,
        title=None,
        ax=ax[1],
        savefn=None,
        xlabel='Length',
        ylabel=None,
        add_vline=None,
        add_hline=None,
        log2transform=0)

    plt.tight_layout()
    plt.savefig(fq + '.len.png')
    plt.close()

    gj.printFuncRun('read_len_dist')
def run_fastq(fq):
    """Run the icshape single-end pipeline on ``fq``, logging to '<fq>.log'.

    While the pipeline runs, ``sys.stdout`` and ``sys.stderr`` are pointed
    at the log file. Intermediate names are derived from ``fq`` after
    'trimmed.fastq' has been replaced by 'fastq'.

    Steps: read_collapse -> remove_adapter -> mapping, then rpkm_cal and
    RT_cal on both the transcriptome SAM and the rRNA SAM.

    Both streams are restored to the objects they were bound to on entry and
    the log file is closed in a ``finally`` block. Previously only stdout was
    restored, leaving stderr bound to the closed log file, and an exception
    in any step left both streams redirected.
    """
    run_fastq_log = fq + '.log'
    gj.printFuncRun('run_fastq')
    gj.printFuncArgs()

    saved_stdout = sys.stdout
    saved_stderr = sys.stderr
    LOG = open(run_fastq_log, 'w')
    try:
        sys.stdout = LOG
        sys.stderr = LOG

        fq = fq.replace('trimmed.fastq', 'fastq')
        collapse_fq = fq.replace('fastq', 'rmdup.fastq')
        trimmed_fastq = fq.replace('fastq', 'trimmed.fastq')
        map_sam = fq.replace('fastq', 'sam')
        map_rRNA_sam = fq.replace('fastq', 'rRNA.sam')

        # for raw data
        icshape.read_collapse(fastq=fq)
        icshape.remove_adapter(fastq=collapse_fq)
        icshape.mapping(fastq=trimmed_fastq)
        icshape.rpkm_cal(sam=map_sam)
        icshape.RT_cal(sam=map_sam)
        icshape.rpkm_cal(sam=map_rRNA_sam)
        icshape.RT_cal(sam=map_rRNA_sam)

        # for clean data
        # icshape.mapping(fastq=trimmed_fastq)
        # icshape.rpkm_cal(sam=map_sam)
        # icshape.RT_cal(sam=map_sam)
        # icshape.rpkm_cal(sam=map_rRNA_sam)
        # icshape.RT_cal(sam=map_rRNA_sam)

        # for Rfam
        # map_rfam_sam = fq.replace('fastq','rfam.sam')
        # rfam_sam_rpkm = fq.replace('fastq','rfam.rpkm')
        # index = '/Share2/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/Rfam/Parsed_Structure/human.dot.dedup.fa'
        # icshape.map_rfam(fastq=trimmed_fastq, map_sam=map_rfam_sam, index=index)
        # icshape.rpkm_cal(sam=map_rfam_sam)
        # icshape.RT_cal(sam=map_rfam_sam)

        # Still redirected here, so this marker lands in the log file.
        gj.printFuncRun('run_fastq')
    finally:
        # Restore the streams before closing the file they point at.
        sys.stdout = saved_stdout
        sys.stderr = saved_stderr
        LOG.close()
def filter_enrich(icshape_tmp_out=None, average_coverage=2, background_base_density=200, skip_leading=5, skilp_tailing=30):
    """Run icSHAPE's filterEnrich.pl on ``icshape_tmp_out``.

    average_coverage        -- value passed as ``-T``.
    background_base_density -- value passed as ``-t``.
    skip_leading            -- value passed as ``-s``.
    skilp_tailing           -- value passed as ``-e``.

    The output name is ``icshape_tmp_out`` with '.tmp' replaced by
    '.T<average_coverage>t<background_base_density>'. The exit status of the
    script is not checked.
    """
    gj.printFuncRun('filter_enrich')
    gj.printFuncArgs()

    cutoff_tag = '.T%st%s' % (average_coverage, background_base_density)
    filtered_path = icshape_tmp_out.replace('.tmp', cutoff_tag)
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/filterEnrich.pl'
    command = "perl %s -i %s -o %s -T %s -t %s -s %s -e %s" % (
        script, icshape_tmp_out, filtered_path, average_coverage,
        background_base_density, skip_leading, skilp_tailing)
    subprocess.call([command], shell=True)

    gj.printFuncRun('filter_enrich')
def remove_adapter_PE(fastq1=None, fastq2=None):
    """Run icSHAPE's trimming.pl on each mate file separately.

    Each output name replaces 'rmdup.fastq' with 'trimmed.fastq' in the
    corresponding input. Both runs use the TruSeq3-PE adapter file and the
    options ``-l 13 -t 0 -c phred33 -m 50``. Exit statuses are not checked.
    """
    gj.printFuncRun('remove_adapter_PE')
    gj.printFuncArgs()

    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/trimming.pl'
    # Derive both output names before launching anything.
    jobs = [
        (fastq1, fastq1.replace('rmdup.fastq', 'trimmed.fastq')),
        (fastq2, fastq2.replace('rmdup.fastq', 'trimmed.fastq')),
    ]
    adapters = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/N02381_CY_80-65347437_DNASEQ/PF_data/170705-X1B/TruSeq3-PE.fa'

    for source_fq, target_fq in jobs:
        command = "%s -U %s -o %s -l 13 -t 0 -c phred33 -a %s -m 50" % (
            script, source_fq, target_fq, adapters)
        subprocess.call([command], shell=True)

    gj.printFuncRun('remove_adapter_PE')
def corr_plot(fn_str=None, label_str=None, savefn=None, intersect='common'):
    """Write a log2(RPKM) table for several samples and plot their correlation.

    fn_str    -- ':'-separated RPKM files, each parsed by ``read_rpkm_txt``.
    label_str -- ':'-separated column labels, one per file.
    savefn    -- path of the table to write; the plot goes to
                 ``savefn + '.png'``.
    intersect -- 'common' keeps genes returned by ``gj.ls_ls_common`` over
                 the per-file gene lists, 'union' those returned by
                 ``gj.ls_ls_union``.

    A gene absent from a sample is written as log2(0.001).

    Raises ValueError for any other ``intersect`` value. Previously this
    surfaced as a NameError on ``genes`` only after every input file had
    been read.
    """
    gj.printFuncRun('corr_plot')
    gj.printFuncArgs()

    if intersect not in ('common', 'union'):
        raise ValueError(
            "intersect must be 'common' or 'union', got %r" % (intersect,))

    fn_ls = fn_str.split(':')
    label_ls = label_str.split(':')

    gene_ls_ls = []
    rpkm_dict_ls = []
    for fn in fn_ls:
        rpkm_dict, gene_ls = read_rpkm_txt(fn)
        rpkm_dict_ls.append(rpkm_dict)
        gene_ls_ls.append(gene_ls)

    if intersect == 'common':
        genes = gj.ls_ls_common(ls_ls=gene_ls_ls, return_ls=1)
    else:
        genes = gj.ls_ls_union(ls_ls=gene_ls_ls, return_ls=1)

    # Placeholder for genes a sample does not contain.
    missing_log2_rpkm = np.log2(0.001)

    # 'with' guarantees the table is closed even if a value fails to parse.
    with open(savefn, 'w') as SAVEFN:
        SAVEFN.write('#gene' + '\t' + '\t'.join(label_ls) + '\n')
        for gene in genes:
            gene_rpkm_ls = [
                np.log2(float(sample_rpkm_dict[gene]))
                if gene in sample_rpkm_dict else missing_log2_rpkm
                for sample_rpkm_dict in rpkm_dict_ls
            ]
            SAVEFN.write(gene + '\t' + '\t'.join(map(str, gene_rpkm_ls)) + '\n')

    df = pd.read_csv(savefn, sep='\t', header=0)
    gj.df_corr_matrix_plot(df[label_ls],
                           savefn=savefn + '.png',
                           size=4,
                           rot=30,
                           share_x_y=1,
                           hue=None,
                           diag='kde')

    gj.printFuncRun('corr_plot')
def remove_adapter(fastq=None, trimmed_fastq=None):
    """Run icSHAPE's trimming.pl on ``fastq`` and return the output path.

    fastq         -- input file passed as ``-U``.
    trimmed_fastq -- output path; when None it is ``fastq`` with
                     'rmdup.fastq' replaced by 'trimmed.fastq'.

    The script is called with the kethoxal adapter file and the options
    ``-l 13 -t 0 -c phred33 -m 0``; its exit status is not checked.
    """
    gj.printFuncRun('remove_adapter')
    gj.printFuncArgs()

    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/trimming.pl'
    if trimmed_fastq is None:
        trimmed_fastq = fastq.replace('rmdup.fastq', 'trimmed.fastq')
    adapters = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/data/adapter/kethoxal.fa'

    command = "%s -U %s -o %s -l 13 -t 0 -c phred33 -a %s -m 0" % (
        script, fastq, trimmed_fastq, adapters)
    subprocess.call([command], shell=True)

    # The original carried a disabled loop (inside a bare string literal)
    # that re-ran the trimming with -m 50 and -m 25; it never executed.

    gj.printFuncRun('remove_adapter')
    return trimmed_fastq
def read_collapse_PE(fastq1=None, fastq2=None):
    """Run icSHAPE's readCollapse.pl on each mate file separately.

    For each input, the ``-o`` output replaces 'fastq' with 'rmdup.fastq'
    and the ``-f`` output replaces 'fastq' with 'fa'. Nothing is returned
    and exit statuses are not checked.
    """
    gj.printFuncRun('read_collapse_PE')
    gj.printFuncArgs()

    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/readCollapse.pl'
    # Derive every output name before launching anything.
    jobs = [
        (fastq1, fastq1.replace('fastq', 'rmdup.fastq'), fastq1.replace('fastq', 'fa')),
        (fastq2, fastq2.replace('fastq', 'rmdup.fastq'), fastq2.replace('fastq', 'fa')),
    ]

    for source_fq, collapsed_fq, freq_fa in jobs:
        command = "%s -U %s -o %s -f %s" % (script, source_fq, collapsed_fq, freq_fa)
        subprocess.call([command], shell=True)

    gj.printFuncRun('read_collapse_PE')
def read_icshape_out(out=None, pureID=1):
    """Parse a tab-separated icSHAPE output file into a nested dict.

    out    -- input path; blank lines and lines starting with '#' are
              skipped.
    pureID -- when truthy, the transcript id is cut at its first '.'.

    Returns ``{tx_id: {'tx_id', 'rpkm', 'reactivity_ls'}}`` where ``rpkm`` is
    a float, or the string '*' when column 2 is '*', and ``reactivity_ls``
    is the list of raw strings from column 3 onwards.
    """
    gj.printFuncRun('read_icshape_out')
    gj.printFuncArgs()

    out_dict = nested_dict()
    with open(out, 'r') as OUT:
        for raw_line in OUT:
            record = raw_line.strip()
            if not record or record.startswith('#'):
                continue
            fields = record.split('\t')

            tx_id = fields[0]
            if pureID:
                tx_id = tx_id.split('.')[0]

            length = int(fields[1])  # parsed (and so validated) but not stored

            if fields[2] != '*':
                rpkm = float(fields[2])
            else:
                rpkm = fields[2]

            entry = out_dict[tx_id]
            entry['tx_id'] = tx_id
            entry['rpkm'] = rpkm
            entry['reactivity_ls'] = fields[3:]

    gj.printFuncRun('read_icshape_out')
    return out_dict
def mapping(fastq=None, mapper='bowtie2', index_dir=None):
    """Map a single-end fastq to the rRNA index with bowtie2.

    fastq             -- input file; output names are derived from it.
    mapper, index_dir -- accepted but not used by this function.

    Reads that fail to align are written (``--un``) to the path obtained by
    replacing 'fastq' with 'rRNAUnmap.fastq'. The returned path is the
    transcriptome SAM name ('trimmed.fastq' -> 'sam'); the bowtie2 call that
    would create that file is commented out below, so this function itself
    does not write it.
    """
    gj.printFuncRun('mapping')
    gj.printFuncArgs()

    transcriptome_sam = fastq.replace('trimmed.fastq', 'sam')
    rrna_sam = fastq.replace('trimmed.fastq', 'rRNA.sam')

    #index = '/Share/home/zhangqf/database/GenomeAnnotation/INDEX/Bowtie2/mm_rRNA/mm_rRNA'
    rrna_index = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/rRNA/index/rrna_uniq'
    unmapped_fastq = fastq.replace('fastq', 'rRNAUnmap.fastq')
    command = "bowtie2 -U %s -S %s -x %s --non-deterministic --time --un %s" % (
        fastq, rrna_sam, rrna_index, unmapped_fastq)
    subprocess.call([command], shell=True)

    #index = '/Share/home/zhangqf/database/GenomeAnnotation/INDEX/Bowtie2/mm10/Gencode_transcriptome/whole_transcriptome/mm10'
    #subprocess.call(["bowtie2 -U %s -S %s -x %s --non-deterministic --time"%(rRNA_unmap_fastq, map_sam, index)],shell=True)

    gj.printFuncRun('mapping')
    return transcriptome_sam
def read_pair_len_dist(fastq1=None, fastq2=None, savefn=None):
    """Draw a joint plot of mate-1 versus mate-2 read lengths.

    fastq1, fastq2 -- fastq files; the sequence line (second of every four
                      lines) is measured after stripping whitespace.
    savefn         -- path handed to ``gj.df_sns_jointplot``.

    Any argument left as None falls back to a hard-coded PE100 path. The two
    length lists are paired into one DataFrame with columns 'read1' and
    'read2'.
    """
    gj.printFuncRun('read_pair_len_dist')

    # Defaults are resolved before the arguments are printed.
    if fastq1 is None:
        fastq1 = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/CHe-XC-M1K_S3_L005_R1_001.paired.fastqT'
    if fastq2 is None:
        fastq2 = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/CHe-XC-M1K_S3_L005_R2_001.paired.fastqT'
    if savefn is None:
        savefn = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/CHe-XC-M1K_S3_L005.paired.fastq.len.png'
    gj.printFuncArgs()

    with open(fastq1, 'r') as FQ1:
        mate1_lengths = [
            len(text.strip()) for row, text in enumerate(FQ1) if row % 4 == 1
        ]

    with open(fastq2, 'r') as FQ2:
        mate2_lengths = [
            len(text.strip()) for row, text in enumerate(FQ2) if row % 4 == 1
        ]

    df = pd.DataFrame({'read1': mate1_lengths, 'read2': mate2_lengths})
    print(df.head())

    gj.df_sns_jointplot(col_str_x='read1',
                        col_str_y='read2',
                        savefn=savefn,
                        df=df,
                        list1='list1',
                        list2='list2',
                        xlim=None,
                        ylim=None,
                        x_y_lim_same=1,
                        title_str='',
                        title_suptitle='right',
                        use_scale_x_y_lim=0,
                        color=None,
                        xlabel=None,
                        ylabel=None)

    gj.printFuncRun('read_pair_len_dist')
def read_tmp_out(tmp_out=None,file_str=None,fa=None):
    """Count, per base, the positions with a non-zero enrichment score.

    tmp_out  -- tab-separated file; column 0 is the transcript id, column 1
                its length, and columns 4+ hold comma-separated values whose
                first element is the score (or "NULL").
    file_str -- output path; defaults to ``tmp_out`` with ".out" replaced by
                ".bass_count.txt". One ``base<TAB>count`` line is written
                per base.
    fa       -- FASTA path handed to ``read_fa``.

    The process exits if a transcript length differs from the reference
    length. Bases are counted exactly as they appear in the reference.
    """
    if file_str is None:
        file_str = tmp_out.replace(".out", ".bass_count.txt")
    gj.printFuncRun('read_tmp_out')
    gj.printFuncArgs()

    fa_dict = read_fa(fa)
    tx_base_pos_dict = nested_dict(2, list)  # {tx:{'A':[pos1,pos2],'T':[]}}
    base_enrich_dict = nested_dict(1, int)

    with open(tmp_out, 'r') as TMP_OUT:
        for raw_line in TMP_OUT:
            record = raw_line.strip()
            if not record or record.startswith('#'):
                continue
            fields = record.split('\t')
            transcript_id = fields[0]
            transcript_len = int(fields[1])
            if transcript_len != len(fa_dict[transcript_id]):
                message = "transcirpt length not conistent with reference: %s, tmp_out len: %s, reference len: %s" % (
                    transcript_id, transcript_len, len(fa_dict[transcript_id]))
                print(message)
                sys.exit()
            for position, score_field in enumerate(fields[4:]):
                score = score_field.split(',')[0]
                # Skip positions without a score or with a score of zero.
                if score == "NULL" or float(score) == 0:
                    continue
                base = fa_dict[transcript_id][position]
                tx_base_pos_dict[transcript_id][base].append(position)
                base_enrich_dict[base] += 1

    print(base_enrich_dict)

    TXT = open(file_str, 'w')
    for base, count in base_enrich_dict.items():
        TXT.write(base + '\t' + str(count) + '\n')
    TXT.close()

    gj.printFuncRun('read_tmp_out')
def compare_structure_gini(out1=None, out2=None, condition1=None, condition2=None, save_dir=None, T=2, t=200):
    """Compare per-transcript gini values between two icSHAPE output files.

    out1, out2             -- icSHAPE '.out' files; default to the in-vivo /
                              in-vitro kethoxal files tagged with ``T``, ``t``.
    condition1, condition2 -- labels; default to the file base name up to the
                              first '.', with '_kethoxal' removed.
    save_dir               -- output directory (hard-coded default).

    A Venn plot of the two transcript sets is saved, then for each null
    fraction cut-off in [0.2, 0.4, 0.6, 0.8, 0.9, 1] ``gj.gini`` is evaluated
    on both reactivity lists of every shared transcript. Transcripts whose
    two values are both >= 0 are written to '<out1 name>-<out2 name>.txt'
    together with the fraction of 'NULL' entries in each list.

    NOTE(review): the source was collapsed onto one line; this layout assumes
    the null-fraction computation and the row write sit inside the
    ``>= 0`` check -- confirm against the original file.
    """
    if out1 is None:
        out1 = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/16-08-08_16_library_invivo_invitro/in_vivo_mRNA_kethoxal.T%st%s.out' % (
            T, t)
    if out2 is None:
        out2 = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/16-08-08_16_library_invivo_invitro/in_vitro_mRNA_kethoxal.T%st%s.out' % (
            T, t)
    if condition1 is None:
        condition1 = out1.split('/')[-1].split('.')[0].replace('_kethoxal', '')
    if condition2 is None:
        condition2 = out2.split('/')[-1].split('.')[0].replace('_kethoxal', '')
    if save_dir is None:
        save_dir = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/result/16-08-08_16_library_invivo_invitro/gini'
    gj.printFuncRun('compare_structure_gini')
    gj.printFuncArgs()

    out_dict1 = read_icshape_out(out1)
    out_dict2 = read_icshape_out(out2)

    out1_name = out1.split('/')[-1]
    out2_name = out2.split('/')[-1]

    overlap_savefn = save_dir + '/' + '%s_%s.%s.txOverlap.png' % (
        condition1, condition2, out1_name.split('.')[-2])
    gj.venn3plot(mode='string',
                 subsets_ls=[set(out_dict1.keys()),
                             set(out_dict2.keys())],
                 labels_ls=[condition1, condition2],
                 title_str=None,
                 save_fn=overlap_savefn,
                 axis=None)

    overlap_tx = set(out_dict1.keys()) & set(out_dict2.keys())
    null_pct_ls = [0.2, 0.4, 0.6, 0.8, 0.9, 1]
    gini_savefn = save_dir + '/' + '%s-%s.txt' % (out1_name, out2_name)
    header_ls = [
        'tx', 'null_pct_cutoff', 'vivo', 'vitro', 'null_pct1', 'null_pct2'
    ]

    with open(gini_savefn, 'w') as SAVEFN:
        SAVEFN.write('\t'.join(header_ls) + '\n')
        for null_pct in null_pct_ls:
            # All values for file 1 are computed before those for file 2,
            # over the same (unmodified) set, so the lists stay aligned.
            out1_gini_ls = []
            for tx in overlap_tx:
                out1_gini_ls.append(
                    gj.gini(out_dict1[tx]['reactivity_ls'],
                            mode='gini',
                            null_pct=null_pct))
            out2_gini_ls = []
            for tx in overlap_tx:
                out2_gini_ls.append(
                    gj.gini(out_dict2[tx]['reactivity_ls'],
                            mode='gini',
                            null_pct=null_pct))

            out1_gini_ls_filter = []
            out2_gini_ls_filter = []
            overlap_tx_filter = []
            for gini1, gini2, tx in zip(out1_gini_ls, out2_gini_ls, overlap_tx):
                if not (float(gini1) >= 0 and float(gini2) >= 0):
                    continue
                out1_gini_ls_filter.append(gini1)
                out2_gini_ls_filter.append(gini2)
                overlap_tx_filter.append(tx)

                reactivity1 = out_dict1[tx]['reactivity_ls']
                reactivity2 = out_dict2[tx]['reactivity_ls']
                out1_tx_null_pct = reactivity1.count('NULL') / float(len(reactivity1))
                out2_tx_null_pct = reactivity2.count('NULL') / float(len(reactivity2))

                row = [
                    tx, null_pct, gini1, gini2, out1_tx_null_pct,
                    out2_tx_null_pct
                ]
                SAVEFN.write('\t'.join(map(str, row)) + '\n')

    gj.printFuncRun('compare_structure_gini')