コード例 #1
0
def read_fa(fa='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10_transcriptome.fa'):
    """Open a FASTA file with ``Fasta`` and return the resulting object.

    The key function handed to ``Fasta`` keeps only the header text that
    precedes the first tab.  The first three keys are printed as a quick
    sanity check.

    :param fa: path of the FASTA file to load.
    :return: the ``Fasta`` object built from ``fa``.
    """
    gj.printFuncRun('read_fa')
    gj.printFuncArgs()
    sequences = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    leading_keys = sequences.keys()[0:3]
    print(leading_keys)
    gj.printFuncRun('read_fa')
    return sequences
コード例 #2
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def RT_combine(rt1=None, rt2=None, rt_comb=None):
    """Run ``combineRTreplicates.pl`` on two RT files.

    The shell command is ``<script> -i rt1:rt2 -o rt_comb``.  The exit
    status of the script is not inspected.

    :param rt1: first input RT file.
    :param rt2: second input RT file.
    :param rt_comb: output path passed to ``-o``.
    """
    gj.printFuncRun('RT_combine')
    gj.printFuncArgs()
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/combineRTreplicates.pl'
    command = "%s -i %s:%s -o %s" % (script, rt1, rt2, rt_comb)
    subprocess.call([command], shell=True)
    gj.printFuncRun('RT_combine')
コード例 #3
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def mapping_PE(fastq1=None, fastq2=None, mapper='bowtie2', index_dir=None):
    """Run two paired-end bowtie2 alignments: rRNA first, then transcriptome.

    All output names are derived from ``fastq1`` by substring replacement.
    ``mapper`` and ``index_dir`` are accepted but not used; both indexes are
    hard-coded.  Exit statuses are not checked and nothing is returned.

    :param fastq1: mate-1 FASTQ (name expected to contain ``paired.clip.fastq``).
    :param fastq2: mate-2 FASTQ.
    """
    gj.printFuncRun('mapping_PE')
    gj.printFuncArgs()
    transcriptome_sam = fastq1.replace('paired.clip.fastq', 'sam')
    rRNA_sam = fastq1.replace('paired.clip.fastq', 'rRNA.sam')

    # Pass 1: align against the rRNA index; pairs that do not align
    # concordantly are written via --un-conc.
    rRNA_index = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/rRNA/index/rrna_uniq'
    unaligned_template = fastq1.replace('fastq', 'rRNAUnmap.fastq')
    rRNA_command = (
        "bowtie2 -p 12 -1 %s -2 %s -x %s -S %s --non-deterministic --time --un-conc %s --no-unal"
        % (fastq1, fastq2, rRNA_index, rRNA_sam, unaligned_template))
    subprocess.call([rRNA_command], shell=True)

    # Pass 2: align the rRNA-unmapped mates against the transcriptome.
    # NOTE(review): presumably bowtie2 expands the --un-conc path above into
    # the ".1"/".2" file names read here -- confirm against the bowtie2 manual.
    transcriptome_index = '/Share/home/zhangqf/database/GenomeAnnotation/INDEX/Bowtie2/mm10/Gencode_transcriptome/whole_transcriptome/mm10'
    unaligned_mate1 = fastq1.replace('fastq', 'rRNAUnmap.1.fastq')
    unaligned_mate2 = fastq1.replace('fastq', 'rRNAUnmap.2.fastq')
    transcriptome_command = (
        "bowtie2 -p 12 -1 %s -2 %s -x %s -S %s --non-deterministic --time --no-unal"
        % (unaligned_mate1, unaligned_mate2, transcriptome_index,
           transcriptome_sam))
    subprocess.call([transcriptome_command], shell=True)

    gj.printFuncRun('mapping_PE')
コード例 #4
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def remove_adapter_PE_new(fastq1=None, fastq2=None):
    """Adapter-trim a read pair with Trimmomatic, then clip 13 bases with cutadapt.

    Step 1 runs Trimmomatic in PE mode and writes, next to each input,
    ``*.paired.fastq`` and ``*.unpaired.fastq`` plus a trim log named from
    ``fastq1`` (``_R1_001.fastq`` -> ``.trim.log``).

    Step 2 runs cutadapt on the two *paired* outputs (``-u 13`` for mate 1,
    ``-u -13`` for mate 2) and writes ``*.paired.clip.fastq``.

    Fix: cutadapt previously read ``fastq.replace('.fastq', 'trimmed.fastq')``,
    a name with the dot dropped that nothing in this function produces, so the
    clipping step had no input.  It now reads the Trimmomatic paired outputs,
    from which the clip output names were already derived.

    Exit statuses of the external tools are not checked.

    :param fastq1: mate-1 FASTQ path.
    :param fastq2: mate-2 FASTQ path.
    """
    gj.printFuncRun('remove_adapter_PE_new')
    gj.printFuncArgs()
    adapter_fa = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/data/adapter/kethoxal_PE.fa'
    trimmomatic = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/bin/trimmomatic-0.30.jar'
    R1_paired = fastq1.replace('fastq', 'paired.fastq')
    R1_unpaired = fastq1.replace('fastq', 'unpaired.fastq')
    R2_paired = fastq2.replace('fastq', 'paired.fastq')
    R2_unpaired = fastq2.replace('fastq', 'unpaired.fastq')
    trimlog = fastq1.replace('_R1_001.fastq', '.trim.log')
    subprocess.call([
        "java -jar %s PE -threads 32 -phred33 -trimlog %s %s %s %s %s %s %s ILLUMINACLIP:%s:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:33"
        % (trimmomatic, trimlog, fastq1, fastq2, R1_paired, R1_unpaired,
           R2_paired, R2_unpaired, adapter_fa)
    ],
                    shell=True)

    clip_fastq1 = R1_paired.replace('fastq', 'clip.fastq')
    clip_fastq2 = R2_paired.replace('fastq', 'clip.fastq')
    # cutadapt consumes what Trimmomatic just wrote.
    subprocess.call(
        ["cutadapt -u 13 -o %s %s" % (clip_fastq1, R1_paired)],
        shell=True)
    subprocess.call(
        ["cutadapt -u -13 -o %s %s" % (clip_fastq2, R2_paired)],
        shell=True)

    gj.printFuncRun('remove_adapter_PE_new')
コード例 #5
0
ファイル: atcg_stats.py プロジェクト: Tsinghua-gongjing/test
def read_tmp_out(tmp_out=None,file_str=None,sample=None):
    """Count, per reference base, the positions with a non-zero score.

    Each non-comment line of ``tmp_out`` is tab-separated: column 0 is the
    transcript id, column 1 its length, and columns 4 onward hold one field
    per position.  The text before the first comma of each field is the
    score.  Positions whose score is neither ``"NULL"`` nor zero are counted
    under the upper-cased reference base taken from ``read_fa()``.

    The counts are printed and written to ``file_str`` as ``base<TAB>count``.

    Changes from the previous version:
    * a length mismatch with the reference now exits with status 1 (the bare
      ``sys.exit()`` reported success to the calling shell);
    * the output file is written inside ``with`` so it is closed on error;
    * the per-transcript position dictionary, which was filled but never
      read, is no longer built.

    :param tmp_out: path of the input table.
    :param file_str: path of the count table to write.
    :param sample: accepted for interface compatibility; not used.
    """
    gj.printFuncRun('read_tmp_out')
    gj.printFuncArgs()
    fa_dict = read_fa()
    base_enrich_dict = nested_dict(1, int)
    with open(tmp_out, 'r') as TMP_OUT:
        for line in TMP_OUT:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            arr = line.split('\t')
            transcript_id = arr[0]
            transcript_len = int(arr[1])
            reference_seq = fa_dict[transcript_id]
            if transcript_len != len(reference_seq):
                print("transcirpt length not conistent with reference: %s, tmp_out len: %s, reference len: %s"%(transcript_id, transcript_len, len(reference_seq)))
                # Non-zero status so callers can detect the failure.
                sys.exit(1)
            for n, base_enrichment_score in enumerate(arr[4:]):
                score = base_enrichment_score.split(',')[0]
                if score != "NULL" and float(score) != 0:
                    base_enrich_dict[reference_seq[n].upper()] += 1
    print(base_enrich_dict)

    with open(file_str, 'w') as TXT:
        for base, count in base_enrich_dict.items():
            TXT.write(base + '\t' + str(count) + '\n')

    gj.printFuncRun('read_tmp_out')
コード例 #6
0
def read_bed(bed=None, fa=None):
    """Count the reference base at ``start - 1`` for every plus-strand BED row.

    Rows are tab-separated; column 0 is the transcript id, columns 1 and 2
    the start and end, column 5 the strand.  Minus-strand rows are skipped.
    The counts are printed and written to ``bed.replace('bed', 'base.txt')``
    as ``base<TAB>count``.

    Change from the previous version: an unknown strand now exits with
    status 1 (the bare ``sys.exit()`` reported success to the calling shell).

    NOTE(review): a row with start == 0 indexes ``[-1]`` and therefore counts
    the last base of the transcript -- confirm the intended coordinate
    convention.
    NOTE(review): ``bed.replace`` rewrites every occurrence of ``'bed'`` in
    the path, including directory names.

    :param bed: path of the BED file.
    :param fa: FASTA path forwarded to ``read_fa``.
    """
    gj.printFuncRun('read_bed')
    gj.printFuncArgs()
    base_dict = nested_dict(1, int)
    fa_dict = read_fa(fa)
    with open(bed, 'r') as BED:
        for n, line in enumerate(BED):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            if n % 1000000 == 0:
                print("process: %s"%(n))
            arr = line.split('\t')
            tx_id = arr[0]
            tx_start = int(arr[1])
            # Not used below; kept so a malformed end column still raises.
            tx_end = int(arr[2])
            strand = arr[5]
            if strand == "+":
                base = fa_dict[tx_id][tx_start-1]
            elif strand == "-":
                continue
            else:
                print("unknown strand: %s"%(strand))
                # Non-zero status so callers can detect the failure.
                sys.exit(1)
            base_dict[base] += 1
    print(base_dict)

    bed_base_txt = bed.replace('bed','base.txt')
    with open(bed_base_txt,'w') as TXT:
        for base, count in base_dict.items():
            TXT.write(base + '\t' + str(count) + '\n')

    gj.printFuncRun('read_bed')
コード例 #7
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def inner_distance(sam=None, output_prefix=None):
    """Run ``inner_distance.py`` on ``sam`` with a fixed bed12 reference.

    The shell command is ``inner_distance.py -i sam -o output_prefix -r bed12``.
    The exit status is not checked.

    :param sam: alignment file passed to ``-i``.
    :param output_prefix: prefix passed to ``-o``.
    """
    gj.printFuncRun('inner_distance')
    gj.printFuncArgs()
    reference_bed12 = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10.transCoor.bed12'
    command = "inner_distance.py -i %s -o %s -r %s" % (sam, output_prefix,
                                                       reference_bed12)
    subprocess.call([command], shell=True)
    gj.printFuncRun('inner_distance')
コード例 #8
0
ファイル: atcg_stats.py プロジェクト: Tsinghua-gongjing/test
def read_fa(fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'):
    """Load a FASTA file into a plain dict keyed by the first header token.

    ``Fasta`` is opened with a key function that keeps the header text before
    the first tab.  Each key is then cut at its first whitespace and mapped
    to the ``[0:]`` slice of its record.  The first three keys are printed.

    :param fa: path of the FASTA file to load.
    :return: dict mapping the shortened id to the sliced record.
    """
    gj.printFuncRun('read_fa')
    gj.printFuncArgs()
    raw_records = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    fa_dict = {}
    for header, record in raw_records.items():
        short_id = header.split()[0]
        fa_dict[short_id] = record[0:]
    print(fa_dict.keys()[0:3])
    gj.printFuncRun('read_fa')
    return fa_dict
コード例 #9
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def read_len_dist_all(
    savefn='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/read_len_dist_all.png'
):
    """Plot read-length and cut-length distributions for every trimmed library.

    Libraries come from ``library_info()['lib']['trimmed']`` (sample name ->
    path).  For each one, ``<path>.len.txt`` and ``<path>.trimlog`` are read.
    The figure has three stacked panels: per-sample length counts, cumulative
    read lengths, and cumulative cut lengths (rows with ``cut_len > 0``).

    :param savefn: path of the image to write.
    """
    gj.printFuncRun('read_len_dist_all')
    gj.printFuncArgs()
    trimmed_dict = library_info()['lib']['trimmed']
    print(trimmed_dict)
    all_read_lengths = []
    all_cut_lengths = []
    _, axes = plt.subplots(3, 1, sharex=True, figsize=(14, 16))
    gj.sns_color_ls()
    sample_names = []
    trimlog_columns = [
        'seq_name', 'sample_name', 'survive_len', 'survive_start',
        'survive_end', 'cut_len'
    ]
    for sample, fq_path in trimmed_dict.items():
        sample_names.append(sample)
        print('%s %s' % (sample, fq_path))
        length_table = pd.read_csv(fq_path + '.len.txt',
                                   sep=r'\s+',
                                   header=None)
        length_table.columns = ['# of reads', 'read length']
        length_table.plot(ax=axes[0],
                          x='read length',
                          y='# of reads',
                          label=sample)
        trim_table = pd.read_csv(fq_path + '.trimlog',
                                 header=None,
                                 sep=r'\s+')
        trim_table.columns = trimlog_columns
        trim_table = trim_table[trim_table['cut_len'] > 0]
        cut_lengths = list(trim_table['cut_len'])
        # Expand the (length, count) table into one entry per read.
        repeated = [[read_len] * read_count
                    for read_len, read_count in zip(
                        length_table['read length'],
                        length_table['# of reads'])]
        all_read_lengths.append(gj.ls_ls_flat(repeated))
        all_cut_lengths.append(cut_lengths)
    for panel, series in ((axes[1], all_read_lengths),
                          (axes[2], all_cut_lengths)):
        gj.cumulate_dist_plot(series,
                              ls_ls_label=sample_names,
                              bins=40,
                              title=None,
                              ax=panel,
                              savefn=None,
                              xlabel='Length',
                              ylabel=None,
                              add_vline=None,
                              add_hline=None,
                              log2transform=0)
    plt.tight_layout()
    plt.savefig(savefn)
    plt.close()
    gj.printFuncRun('read_len_dist_all')
コード例 #10
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def read_clean_map_rt(fastq):
    """Chain collapse -> adapter removal -> mapping -> RPKM -> RT for one FASTQ.

    Each step receives the path returned by the previous one; the RPKM and RT
    steps both run on the SAM path returned by ``mapping``.  Nothing is
    returned.

    :param fastq: path of the raw FASTQ file.
    """
    gj.printFuncRun('read_clean_map_rt')
    gj.printFuncArgs()
    deduplicated_fq = read_collapse(fastq=fastq)
    adapter_free_fq = remove_adapter(fastq=deduplicated_fq)
    aligned_sam = mapping(fastq=adapter_free_fq)
    rpkm_cal(sam=aligned_sam)
    RT_cal(sam=aligned_sam)
    gj.printFuncRun('read_clean_map_rt')
コード例 #11
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def rpkm_cal(sam=None):
    """Run ``estimateRPKM.pl`` on a SAM file and return the output path.

    The output name is ``sam`` with every occurrence of ``'sam'`` replaced by
    ``'rpkm'``.  The exit status of the script is not checked.

    :param sam: input SAM path.
    :return: the derived RPKM path.
    """
    gj.printFuncRun('rpkm_cal')
    gj.printFuncArgs()
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/estimateRPKM.pl'
    rpkm_path = sam.replace('sam', 'rpkm')
    command = "%s -i %s -o %s" % (script, sam, rpkm_path)
    subprocess.call([command], shell=True)
    gj.printFuncRun('rpkm_cal')
    return rpkm_path
コード例 #12
0
def get_dir_fastq(dir='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/16-03-25_library_of_concentration'):
    """List the FASTQ files directly inside ``dir``.

    A file qualifies when its name ends with ``fastq`` and does not contain
    ``Undetermined``.  The bare names are printed; the returned paths are
    ``dir + '/' + name``.

    :param dir: directory to scan (not recursive).
    :return: list of FASTQ paths.
    """
    gj.printFuncRun('get_dir_fastq')
    gj.printFuncArgs()
    fastq_names = []
    for name in os.listdir(dir):
        if name.endswith('fastq') and 'Undetermined' not in name:
            fastq_names.append(name)
    print(fastq_names)
    fastq_paths = [dir + '/' + name for name in fastq_names]
    gj.printFuncRun('get_dir_fastq')
    return fastq_paths
コード例 #13
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def FPKM_count(bam=None, output_prefix=None, rRNA=0):
    """Run ``FPKM_count.py`` on ``bam`` against a fixed bed12 annotation.

    A truthy ``rRNA`` selects the rRNA bed12 file instead of the
    transcriptome one.  The exit status is not checked.

    :param bam: alignment file passed to ``-i``.
    :param output_prefix: prefix passed to ``-o``.
    :param rRNA: truthy to use the rRNA annotation.
    """
    gj.printFuncRun('FPKM_count')
    gj.printFuncArgs()
    if rRNA:
        annotation = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10.rRNA.bed12'
    else:
        annotation = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10.transCoor.bed12'
    command = "FPKM_count.py -i %s -o %s -r %s" % (bam, output_prefix,
                                                   annotation)
    subprocess.call([command], shell=True)
    gj.printFuncRun('FPKM_count')
コード例 #14
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def RT_cal(sam=None):
    """Run ``calcRT.pl`` on a SAM file and return the output path.

    Both the output (``-o``) and the RPKM input (``-r``) are named by
    replacing every ``'sam'`` in ``sam`` with ``'rt'`` / ``'rpkm'``.  The
    exit status of the script is not checked.

    :param sam: input SAM path.
    :return: the derived RT path.
    """
    gj.printFuncRun('RT_cal')
    gj.printFuncArgs()
    rt_path = sam.replace('sam', 'rt')
    rpkm_path = sam.replace('sam', 'rpkm')
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/calcRT.pl'
    command = "%s -i %s -o %s -r %s -c 1" % (script, sam, rt_path, rpkm_path)
    subprocess.call([command], shell=True)
    gj.printFuncRun('RT_cal')
    return rt_path
コード例 #15
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def RT_normalize(rt=None):
    """Run ``normalizeRTfile.pl`` on an RT file.

    The output name is built in two passes over ``rt``: ``'.rt'`` becomes
    ``'.normalized.rt'``, then ``'RT'`` becomes ``'normalized.RT'``.  The
    exit status is not checked and the output path is not returned.

    :param rt: input RT path.
    """
    gj.printFuncRun('RT_normalize')
    gj.printFuncArgs()
    output_path = rt.replace('.rt', '.normalized.rt')
    output_path = output_path.replace('RT', 'normalized.RT')
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/normalizeRTfile.pl'
    command = "%s -i %s -o %s -m mean:vigintile2 -d 32 -l 32" % (
        script, rt, output_path)
    subprocess.call([command], shell=True)
    gj.printFuncRun('RT_normalize')
コード例 #16
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def read_collapse(fastq=None):
    """Run ``readCollapse.pl`` on a FASTQ file and return the ``-o`` path.

    ``'fastq'`` in the input name is replaced by ``'rmdup.fastq'`` for the
    ``-o`` output and by ``'fa'`` for the ``-f`` output.  The exit status is
    not checked.

    :param fastq: input FASTQ path.
    :return: the derived ``rmdup.fastq`` path.
    """
    gj.printFuncRun('read_collapse')
    gj.printFuncArgs()
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/readCollapse.pl'
    collapsed_path = fastq.replace('fastq', 'rmdup.fastq')
    frequency_fa = fastq.replace('fastq', 'fa')
    command = "%s -U %s -o %s -f %s" % (script, fastq, collapsed_path,
                                        frequency_fa)
    subprocess.call([command], shell=True)
    gj.printFuncRun('read_collapse')
    return collapsed_path
コード例 #17
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def calc_enrich(f_normalized_rt=None,
                b_normalized_rt=None,
                icshape_tmp_out=None,
                x=0.25):
    """Run ``calcEnrich.pl`` on a pair of normalized RT files.

    The arguments map onto the script's ``-f``, ``-b``, ``-o`` and ``-x``
    options; ``-w factor5:scaling1`` is fixed.  The exit status is not
    checked.

    :param f_normalized_rt: file passed to ``-f``.
    :param b_normalized_rt: file passed to ``-b``.
    :param icshape_tmp_out: output path passed to ``-o``.
    :param x: value passed to ``-x``.
    """
    gj.printFuncRun('calc_enrich')
    gj.printFuncArgs()
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/calcEnrich.pl'
    command = "%s -f %s -b %s -o %s -w factor5:scaling1 -x %s" % (
        script, f_normalized_rt, b_normalized_rt, icshape_tmp_out, x)
    subprocess.call([command], shell=True)
    gj.printFuncRun('calc_enrich')
コード例 #18
0
def read_rpkm_txt(txt, min_val=-1):
    """Read column 0 (id) and column 4 (value) from a tab-separated table.

    Blank lines and lines starting with ``#`` are skipped.

    :param txt: path of the table.
    :param min_val: accepted but not used.
    :return: ``(val_dict, gene_ls)`` -- id -> float value, and the ids in
        file order.
    """
    gj.printFuncRun('read_rpkm_txt')
    gj.printFuncArgs()
    val_dict = nested_dict()
    gene_ls = []
    with open(txt, 'r') as TXT:
        for raw_line in TXT:
            record = raw_line.strip()
            if record and not record.startswith('#'):
                fields = record.split('\t')
                gene = fields[0]
                val_dict[gene] = float(fields[4])
                gene_ls.append(gene)
    gj.printFuncRun('read_rpkm_txt')
    return val_dict, gene_ls
コード例 #19
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def RT_correlation(rt1=None,
                   rt2=None,
                   rt_corr=None,
                   coverage_cutoff=0,
                   background_base_density=0):
    """Run ``correlationRT.pl`` on two RT files, redirecting stdout to a file.

    The arguments map onto the script's ``-1``, ``-2``, ``-T`` and ``-b``
    options; the shell redirects the script's standard output to ``rt_corr``.
    The exit status is not checked.

    :param rt1: file passed to ``-1``.
    :param rt2: file passed to ``-2``.
    :param rt_corr: file receiving the script's standard output.
    :param coverage_cutoff: value passed to ``-T``.
    :param background_base_density: value passed to ``-b``.
    """
    gj.printFuncRun('RT_correlation')
    gj.printFuncArgs()
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/correlationRT.pl'
    command = "%s -1 %s -2 %s -T %s -b %s > %s" % (
        script, rt1, rt2, coverage_cutoff, background_base_density, rt_corr)
    subprocess.call([command], shell=True)
    gj.printFuncRun('RT_correlation')
コード例 #20
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def read_len_dist(
    fq='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/CHe-XC-M1K_S3_L005_R1_001.trimmed.fastq'
):
    """Tabulate and plot the read-length distribution of one FASTQ file.

    An awk/sort/uniq pipeline writes ``<fq>.len.txt`` (count, length).  The
    table is plotted as a line and a scatter in the top panel; the bottom
    panel shows cumulative distributions of read lengths and of the
    ``cut_len`` values (> 0) read from ``<fq>.trimlog``.  The figure is saved
    as ``<fq>.len.png``.

    :param fq: path of the FASTQ file.
    """
    gj.printFuncRun('read_len_dist')
    gj.printFuncArgs()
    fq_len_txt = fq + '.len.txt'
    # "%%" yields a literal "%" for awk after Python's %-formatting.
    count_command = (
        "awk '{if(NR%%4==2) print length($1)}' %s| sort|uniq -c|sort -k2,2n > %s "
        % (fq, fq_len_txt))
    subprocess.call([count_command], shell=True)
    length_table = pd.read_csv(fq_len_txt, sep=r'\s+', header=None)
    length_table.columns = ['# of reads', 'read length']
    print(length_table[['read length', '# of reads']])
    _, axes = plt.subplots(2, 1, sharex=True)
    length_table.plot(ax=axes[0], x='read length', y='# of reads')
    length_table.plot(kind='scatter',
                      ax=axes[0],
                      x='read length',
                      y='# of reads')

    trim_table = pd.read_csv(fq + '.trimlog', header=None, sep=r'\s+')
    trim_table.columns = [
        'seq_name', 'sample_name', 'survive_len', 'survive_start',
        'survive_end', 'cut_len'
    ]
    trim_table = trim_table[trim_table['cut_len'] > 0]
    cut_lengths = list(trim_table['cut_len'])

    # Expand the (length, count) table into one entry per read.
    repeated = [[read_len] * read_count
                for read_len, read_count in zip(length_table['read length'],
                                                length_table['# of reads'])]
    read_lengths = gj.ls_ls_flat(repeated)
    gj.cumulate_dist_plot(
        [read_lengths, cut_lengths],
        ls_ls_label=['kethoxal read length', 'kethoxal read cut length'],
        bins=40,
        title=None,
        ax=axes[1],
        savefn=None,
        xlabel='Length',
        ylabel=None,
        add_vline=None,
        add_hline=None,
        log2transform=0)

    plt.tight_layout()
    plt.savefig(fq + '.len.png')
    plt.close()

    gj.printFuncRun('read_len_dist')
コード例 #21
0
def run_fastq(fq):
    """Run the icshape single-end steps for one FASTQ, logging to ``<fq>.log``.

    ``sys.stdout`` and ``sys.stderr`` are pointed at the log file while the
    steps run: collapse, adapter removal, mapping, then RPKM and RT on both
    the main SAM and the rRNA SAM.  Intermediate names are derived from
    ``fq`` after stripping a ``trimmed.`` infix.

    Changes from the previous version:
    * ``sys.stderr`` is restored as well; it used to stay bound to the log
      file after that file had been closed;
    * both streams are restored and the log closed in ``finally``, so a
      failing step no longer leaves the process redirected;
    * the streams are restored to what they were on entry rather than
      unconditionally to ``sys.__stdout__``.

    NOTE(review): only Python-level writes are redirected; output of child
    processes started by the icshape steps presumably bypasses this log --
    confirm.

    :param fq: path of the FASTQ file (raw or ``*.trimmed.fastq``).
    """
    run_fastq_log = fq+'.log'
    gj.printFuncRun('run_fastq')
    gj.printFuncArgs()

    fq = fq.replace('trimmed.fastq','fastq')
    collapse_fq = fq.replace('fastq','rmdup.fastq')
    trimmed_fastq = fq.replace('fastq','trimmed.fastq')
    map_sam = fq.replace('fastq','sam')
    map_rRNA_sam = fq.replace('fastq','rRNA.sam')

    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    LOG = open(run_fastq_log,'w')
    try:
        sys.stdout = LOG
        sys.stderr = LOG

        icshape.read_collapse(fastq=fq)
        icshape.remove_adapter(fastq=collapse_fq)
        icshape.mapping(fastq=trimmed_fastq)
        icshape.rpkm_cal(sam=map_sam)
        icshape.RT_cal(sam=map_sam)
        icshape.rpkm_cal(sam=map_rRNA_sam)
        icshape.RT_cal(sam=map_rRNA_sam)

        gj.printFuncRun('run_fastq')
    finally:
        # Restore before closing so nothing can write to a closed file.
        sys.stdout = saved_stdout
        sys.stderr = saved_stderr
        LOG.close()
コード例 #22
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def filter_enrich(icshape_tmp_out=None,
                  average_coverage=2,
                  background_base_density=200,
                  skip_leading=5,
                  skilp_tailing=30):
    """Run ``filterEnrich.pl`` on an enrichment file.

    The output name replaces ``'.tmp'`` in ``icshape_tmp_out`` with
    ``'.T<average_coverage>t<background_base_density>'``.  The remaining
    arguments map onto the script's ``-T``, ``-t``, ``-s`` and ``-e``
    options.  The exit status is not checked.

    :param icshape_tmp_out: input path passed to ``-i``.
    :param average_coverage: value passed to ``-T``.
    :param background_base_density: value passed to ``-t``.
    :param skip_leading: value passed to ``-s``.
    :param skilp_tailing: value passed to ``-e``.
    """
    gj.printFuncRun('filter_enrich')
    gj.printFuncArgs()
    cutoff_tag = '.T%st%s' % (average_coverage, background_base_density)
    filtered_out = icshape_tmp_out.replace('.tmp', cutoff_tag)
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/filterEnrich.pl'
    command = "perl %s -i %s -o %s -T %s -t %s -s %s -e %s" % (
        script, icshape_tmp_out, filtered_out, average_coverage,
        background_base_density, skip_leading, skilp_tailing)
    subprocess.call([command], shell=True)
    gj.printFuncRun('filter_enrich')
コード例 #23
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def remove_adapter_PE(fastq1=None, fastq2=None):
    """Run ``trimming.pl`` once per mate with the TruSeq3-PE adapter file.

    Each output name replaces ``'rmdup.fastq'`` in the input name with
    ``'trimmed.fastq'``.  Both output names are derived before either
    command runs.  Exit statuses are not checked.

    :param fastq1: mate-1 FASTQ path.
    :param fastq2: mate-2 FASTQ path.
    """
    gj.printFuncRun('remove_adapter_PE')
    gj.printFuncArgs()
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/trimming.pl'
    mate_jobs = [(mate, mate.replace('rmdup.fastq', 'trimmed.fastq'))
                 for mate in (fastq1, fastq2)]
    adapter_fa = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/N02381_CY_80-65347437_DNASEQ/PF_data/170705-X1B/TruSeq3-PE.fa'
    for source_fq, trimmed_fq in mate_jobs:
        command = "%s -U %s -o %s -l 13 -t 0 -c phred33 -a %s -m 50" % (
            script, source_fq, trimmed_fq, adapter_fa)
        subprocess.call([command], shell=True)
    gj.printFuncRun('remove_adapter_PE')
コード例 #24
0
def corr_plot(fn_str=None, label_str=None, savefn=None, intersect='common'):
    """Write a gene x sample log2 table and plot its correlation matrix.

    ``fn_str`` and ``label_str`` are colon-separated lists of input files
    and of their column labels.  Each file is read with ``read_rpkm_txt``.
    Genes are combined with ``gj.ls_ls_common`` or ``gj.ls_ls_union``.  For a
    gene absent from a sample, ``log2(0.001)`` is written.  The table goes to
    ``savefn`` and the plot to ``savefn + '.png'``.

    Changes from the previous version:
    * an unsupported ``intersect`` raises ``ValueError`` up front; it used to
      surface as a ``NameError`` after the output file had been opened;
    * membership uses ``in`` instead of the Python-2-only ``has_key``;
    * the table is written inside ``with`` so the file is closed on error;
    * the unused per-file log2 list is no longer computed.

    :param fn_str: colon-separated input paths.
    :param label_str: colon-separated column labels.
    :param savefn: path of the table to write.
    :param intersect: ``'common'`` or ``'union'``.
    :raises ValueError: if ``intersect`` is neither of the two values.
    """
    gj.printFuncRun('corr_plot')
    gj.printFuncArgs()
    if intersect not in ('common', 'union'):
        raise ValueError(
            "intersect must be 'common' or 'union', got %r" % (intersect,))
    fn_ls = fn_str.split(':')
    label_ls = label_str.split(':')
    gene_ls_ls = []
    rpkm_dict_ls = []
    for fn in fn_ls:
        rpkm_dict, gene_ls = read_rpkm_txt(fn)
        rpkm_dict_ls.append(rpkm_dict)
        gene_ls_ls.append(gene_ls)

    if intersect == 'common':
        genes = gj.ls_ls_common(ls_ls=gene_ls_ls, return_ls=1)
    else:
        genes = gj.ls_ls_union(ls_ls=gene_ls_ls, return_ls=1)

    # Placeholder written for genes that a sample does not contain.
    missing_value = np.log2(0.001)
    with open(savefn, 'w') as SAVEFN:
        SAVEFN.write('#gene' + '\t' + '\t'.join(label_ls) + '\n')
        for gene in genes:
            gene_rpkm_ls = [
                np.log2(float(sample_rpkm_dict[gene]))
                if gene in sample_rpkm_dict else missing_value
                for sample_rpkm_dict in rpkm_dict_ls
            ]
            SAVEFN.write(gene + '\t' + '\t'.join(map(str, gene_rpkm_ls)) +
                         '\n')

    df = pd.read_csv(savefn, sep='\t', header=0)
    gj.df_corr_matrix_plot(df[label_ls],
                           savefn=savefn + '.png',
                           size=4,
                           rot=30,
                           share_x_y=1,
                           hue=None,
                           diag='kde')

    gj.printFuncRun('corr_plot')
コード例 #25
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def remove_adapter(fastq=None, trimmed_fastq=None):
    """Run ``trimming.pl`` with the kethoxal adapter file; return the output path.

    When ``trimmed_fastq`` is not given it is derived from ``fastq`` by
    replacing ``'rmdup.fastq'`` with ``'trimmed.fastq'``.  The exit status is
    not checked.

    :param fastq: input FASTQ path.
    :param trimmed_fastq: optional explicit output path.
    :return: the output path passed to ``-o``.
    """
    gj.printFuncRun('remove_adapter')
    gj.printFuncArgs()
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/trimming.pl'
    if trimmed_fastq is None:
        trimmed_fastq = fastq.replace('rmdup.fastq', 'trimmed.fastq')
    adapter_fa = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/data/adapter/kethoxal.fa'
    command = "%s -U %s -o %s -l 13 -t 0 -c phred33 -a %s -m 0" % (
        script, fastq, trimmed_fastq, adapter_fa)
    subprocess.call([command], shell=True)
    gj.printFuncRun('remove_adapter')
    return trimmed_fastq
コード例 #26
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def read_collapse_PE(fastq1=None, fastq2=None):
    """Run ``readCollapse.pl`` once per mate.

    For each mate, ``'fastq'`` in its name is replaced by ``'rmdup.fastq'``
    for the ``-o`` output and by ``'fa'`` for the ``-f`` output.  All output
    names are derived before either command runs.  Exit statuses are not
    checked and nothing is returned.

    :param fastq1: mate-1 FASTQ path.
    :param fastq2: mate-2 FASTQ path.
    """
    gj.printFuncRun('read_collapse_PE')
    gj.printFuncArgs()
    script = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/scripts/icSHAPE-master/scripts/readCollapse.pl'
    mate_jobs = [(mate, mate.replace('fastq', 'rmdup.fastq'),
                  mate.replace('fastq', 'fa')) for mate in (fastq1, fastq2)]
    for source_fq, collapsed_fq, frequency_fa in mate_jobs:
        command = "%s -U %s -o %s -f %s" % (script, source_fq, collapsed_fq,
                                            frequency_fa)
        subprocess.call([command], shell=True)
    gj.printFuncRun('read_collapse_PE')
コード例 #27
0
def read_icshape_out(out=None, pureID=1):
    """Parse a tab-separated icSHAPE output file into a nested dict.

    Per non-comment line: column 0 is the transcript id, column 1 the length
    (parsed as int but not stored), column 2 the RPKM (``'*'`` is kept as a
    string, anything else becomes float), columns 3+ the reactivity fields
    (kept as strings).

    :param out: path of the file.
    :param pureID: when truthy, cut the id at its first ``'.'``.
    :return: dict of id -> ``{'tx_id', 'rpkm', 'reactivity_ls'}``.
    """
    gj.printFuncRun('read_icshape_out')
    gj.printFuncArgs()
    out_dict = nested_dict()
    with open(out, 'r') as OUT:
        for raw_line in OUT:
            record = raw_line.strip()
            if not record or record.startswith('#'):
                continue
            fields = record.split('\t')
            tx_id = fields[0].split('.')[0] if pureID else fields[0]
            # Not stored; a non-integer length column still raises here.
            length = int(fields[1])
            if fields[2] == '*':
                rpkm = fields[2]
            else:
                rpkm = float(fields[2])
            for key, value in (('tx_id', tx_id), ('rpkm', rpkm),
                               ('reactivity_ls', fields[3:])):
                out_dict[tx_id][key] = value
    gj.printFuncRun('read_icshape_out')
    return out_dict
コード例 #28
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def mapping(fastq=None, mapper='bowtie2', index_dir=None):
    """Align single-end reads to the rRNA index with bowtie2.

    The alignment is written to the ``rRNA.sam`` name derived from ``fastq``
    and unaligned reads to the ``rRNAUnmap.fastq`` name (``--un``).
    ``mapper`` and ``index_dir`` are accepted but not used.  The exit status
    is not checked.

    NOTE(review): the returned ``*.sam`` path is only derived here; no
    command in this function writes it (the transcriptome alignment is
    disabled).

    :param fastq: input FASTQ (name expected to contain ``trimmed.fastq``).
    :return: ``fastq`` with ``'trimmed.fastq'`` replaced by ``'sam'``.
    """
    gj.printFuncRun('mapping')
    gj.printFuncArgs()
    transcriptome_sam = fastq.replace('trimmed.fastq', 'sam')
    rRNA_sam = fastq.replace('trimmed.fastq', 'rRNA.sam')

    rRNA_index = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/rRNA/index/rrna_uniq'
    unaligned_fastq = fastq.replace('fastq', 'rRNAUnmap.fastq')
    command = "bowtie2 -U %s -S %s -x %s --non-deterministic --time --un %s" % (
        fastq, rRNA_sam, rRNA_index, unaligned_fastq)
    subprocess.call([command], shell=True)

    gj.printFuncRun('mapping')
    return transcriptome_sam
コード例 #29
0
ファイル: icshape.py プロジェクト: XingyangQian/KS_test
def read_pair_len_dist(fastq1=None, fastq2=None, savefn=None):
    """Joint-plot mate-1 versus mate-2 read lengths of a paired FASTQ set.

    The length of every sequence line (line index ``% 4 == 1``) is collected
    from each file, the two lists are paired row-wise in a DataFrame, and
    ``gj.df_sns_jointplot`` draws the plot to ``savefn``.  Missing arguments
    fall back to hard-coded paths.

    :param fastq1: mate-1 FASTQ path.
    :param fastq2: mate-2 FASTQ path.
    :param savefn: path of the image to write.
    """
    gj.printFuncRun('read_pair_len_dist')
    if fastq1 is None:
        fastq1 = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/CHe-XC-M1K_S3_L005_R1_001.paired.fastqT'
    if fastq2 is None:
        fastq2 = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/CHe-XC-M1K_S3_L005_R2_001.paired.fastqT'
    if savefn is None:
        savefn = '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/PE100/CHe-XC-M1K_S3_L005.paired.fastq.len.png'
    gj.printFuncArgs()
    with open(fastq1, 'r') as FQ1:
        mate1_lengths = [
            len(text.strip()) for index, text in enumerate(FQ1)
            if index % 4 == 1
        ]
    with open(fastq2, 'r') as FQ2:
        mate2_lengths = [
            len(text.strip()) for index, text in enumerate(FQ2)
            if index % 4 == 1
        ]
    length_frame = pd.DataFrame({
        'read1': mate1_lengths,
        'read2': mate2_lengths
    })
    print(length_frame.head())
    gj.df_sns_jointplot(col_str_x='read1',
                        col_str_y='read2',
                        savefn=savefn,
                        df=length_frame,
                        list1='list1',
                        list2='list2',
                        xlim=None,
                        ylim=None,
                        x_y_lim_same=1,
                        title_str='',
                        title_suptitle='right',
                        use_scale_x_y_lim=0,
                        color=None,
                        xlabel=None,
                        ylabel=None)

    gj.printFuncRun('read_pair_len_dist')
コード例 #30
0
ファイル: G_stop_ratio.py プロジェクト: XingyangQian/KS_test
def read_tmp_out(tmp_out=None,file_str=None,fa=None):
    """Count, per reference base, the positions with a non-zero score.

    Each non-comment line of ``tmp_out`` is tab-separated: column 0 is the
    transcript id, column 1 its length, and columns 4 onward hold one field
    per position.  The text before the first comma of each field is the
    score.  Positions whose score is neither ``"NULL"`` nor zero are counted
    under the reference base at that position (case is kept as stored).

    The counts are printed and written to ``file_str`` as ``base<TAB>count``.
    ``file_str`` defaults to ``tmp_out`` with ``".out"`` replaced by
    ``".bass_count.txt"``.

    Changes from the previous version:
    * ``fa=None`` now lets ``read_fa`` use its own default instead of being
      forwarded as ``None``;
    * a length mismatch with the reference exits with status 1 (the bare
      ``sys.exit()`` reported success to the calling shell);
    * the output file is written inside ``with`` so it is closed on error;
    * the per-transcript position dictionary, which was filled but never
      read, is no longer built.

    :param tmp_out: path of the input table.
    :param file_str: optional path of the count table to write.
    :param fa: optional FASTA path forwarded to ``read_fa``.
    """
    file_str = file_str if file_str is not None else tmp_out.replace(".out", ".bass_count.txt")
    gj.printFuncRun('read_tmp_out')
    gj.printFuncArgs()
    fa_dict = read_fa(fa) if fa is not None else read_fa()
    base_enrich_dict = nested_dict(1, int)
    with open(tmp_out, 'r') as TMP_OUT:
        for line in TMP_OUT:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            arr = line.split('\t')
            transcript_id = arr[0]
            transcript_len = int(arr[1])
            reference_seq = fa_dict[transcript_id]
            if transcript_len != len(reference_seq):
                print("transcirpt length not conistent with reference: %s, tmp_out len: %s, reference len: %s"%(transcript_id, transcript_len, len(reference_seq)))
                # Non-zero status so callers can detect the failure.
                sys.exit(1)
            for n, base_enrichment_score in enumerate(arr[4:]):
                score = base_enrichment_score.split(',')[0]
                if score != "NULL" and float(score) != 0:
                    base_enrich_dict[reference_seq[n]] += 1
    print(base_enrich_dict)

    with open(file_str, 'w') as TXT:
        for base, count in base_enrich_dict.items():
            TXT.write(base + '\t' + str(count) + '\n')

    gj.printFuncRun('read_tmp_out')
def opening_base_compare_stats_bar_plot():
    """Summarise base composition of structure differences and test enrichment.

    Two hard-coded ``*.base.txt`` files (egg vs 1cell, sphere vs shield) are
    read with ``read_compare`` into (difference, base) lists.  Only the
    egg/1cell comparison is plotted and tested:

    * per-base counts and ratios, for all positions and for positions with
      ``difference >= 0.25``, are drawn as two bar plots and saved to a
      hard-coded PDF;
    * for each of A/T/C/G a 2x2 Fisher exact test compares the base against
      all other bases, above-cutoff versus the remainder.

    Takes no arguments and returns nothing; results are printed.
    """
    egg_cell1 = '/Share/home/zhangqf7/gongjing/zebrafish/result/structure_change_compare/egg_1cell.base.txt'
    sphere_shield = '/Share/home/zhangqf7/gongjing/zebrafish/result/structure_change_compare/sphere_shield.base.txt'
    egg_cell1_distance_ls, egg_cell1_base_ls = read_compare(egg_cell1)
    sphere_shield_distance_ls, sphere_shield_base_ls = read_compare(
        sphere_shield)
    # Each of these prints the return value of gj.printFuncRun as well.
    print gj.printFuncRun('creat df_egg_cell1')
    df_egg_cell1 = pd.DataFrame({'difference': egg_cell1_distance_ls,
                       'sample': '1cell-egg', 'base': egg_cell1_base_ls})
    print gj.printFuncRun('finsh creat df_egg_cell1')
    print gj.printFuncRun('creat df_sphere_shield')
    df_sphere_shield = pd.DataFrame({'difference': sphere_shield_distance_ls,
                       'sample': 'shield-sphere', 'base': sphere_shield_base_ls})
    print gj.printFuncRun('finish creat df_sphere_shield')
    # print gj.printFuncRun('concat')
    # df = pd.concat([df_egg_cell1, df_sphere_shield], axis=0)
    # print gj.printFuncRun('concat')
    # print df.head()

    # Positions at or above this difference form the "above" subsets.
    diff_cutoff = 0.25
    df_egg_cell1_above = df_egg_cell1[df_egg_cell1['difference']>=diff_cutoff]
    df_sphere_shield_above = df_sphere_shield[df_sphere_shield['difference']>=diff_cutoff]
    print 'egg-1cell', 'all', df_egg_cell1['base'].value_counts(), df_egg_cell1_above['base'].value_counts()
    print "sphere-shield", 'all', df_sphere_shield['base'].value_counts(), df_sphere_shield_above['base'].value_counts()
    egg_1cell_all = df_egg_cell1['base'].value_counts().to_dict()
    egg_1cell_above = df_egg_cell1_above['base'].value_counts().to_dict()
    # The two sphere/shield dicts are computed but not used below.
    sphere_shield_all = df_sphere_shield['base'].value_counts().to_dict()
    sphere_shield_above = df_sphere_shield_above['base'].value_counts().to_dict()

    base_ls, sample_ls, value_ls = [],[],[]
    # for d,label in zip([egg_1cell_all, egg_1cell_above, sphere_shield_all, sphere_shield_above], ['egg_1cell_all', 'egg_1cell_above', 'sphere_shield_all', 'sphere_shield_above']):
    for d,label in zip([egg_1cell_all, egg_1cell_above], ['egg_1cell_all', 'egg_1cell_above']):
        # Drops 'N' from the dict in place, so the Fisher tests below also
        # see the totals without 'N'.
        if 'N' in d: d.pop('N')
        for b in ['A', 'T', 'C', 'G']:
            base_ls.append(b)
            sample_ls.append(label)
            # NOTE(review): d[b] raises KeyError when a base has no
            # positions in this subset.
            value = d[b]
            value_ls.append(value)

            value_ratio = d[b] / float(sum(d.values()))
            base_ls.append(b)
            sample_ls.append(label+'\n(ratio)')
            value_ls.append(value_ratio)
    df_stat = pd.DataFrame({'base':base_ls, 'sample':sample_ls, 'value':value_ls})
    print df_stat

    # Counts and ratios share one column and are split by value
    # (>= 1 top panel, < 1 bottom panel).
    # NOTE(review): a ratio of exactly 1.0 lands in the top (count) panel.
    fig,ax = plt.subplots(2,1)
    sns.barplot(x='sample', y='value', hue='base', data=df_stat[df_stat['value']>=1], hue_order=['A', 'T', 'C', 'G'], ax=ax[0])
    sns.barplot(x='sample', y='value', hue='base', data=df_stat[df_stat['value']<1], hue_order=['A', 'T', 'C', 'G'], ax=ax[1])
    plt.tight_layout()
    savefn = '/Share/home/zhangqf7/gongjing/zebrafish/result/structure_change_compare/base_num_ratio.onlyegg_1cell.pdf'
    plt.savefig(savefn)
    plt.close()

    print "all",egg_1cell_all,"above",egg_1cell_above
    for b in ['A', 'T', 'C', 'G']:
        # 2x2 table: rows = above cutoff / not above, columns = base b / other.
        b_above = egg_1cell_above[b]
        non_b_above = sum(egg_1cell_above.values()) - b_above
        b_stable = egg_1cell_all[b] - b_above
        non_b_stable = sum(egg_1cell_all.values()) - b_above - non_b_above - b_stable
        print b_above, non_b_above, b_stable, non_b_stable
        oddsratio, pvalue = stats.fisher_exact([[b_above, non_b_above], [b_stable, non_b_stable]])
        print b, oddsratio, pvalue
コード例 #32
0
def compare_structure_gini(out1=None,
                           out2=None,
                           condition1=None,
                           condition2=None,
                           save_dir=None,
                           T=2,
                           t=200):
    """Compare per-transcript ``gj.gini`` values between two icSHAPE outputs.

    Both files are read with ``read_icshape_out``.  A Venn plot of the two
    transcript-id sets is saved, then for every cutoff in
    ``[0.2, 0.4, 0.6, 0.8, 0.9, 1]`` ``gj.gini`` is evaluated on each shared
    transcript in both files.  Transcripts whose two values are both >= 0 get
    one row in the output table: id, cutoff, the two values, and the fraction
    of ``'NULL'`` reactivities in each file.

    Changes from the previous version:
    * the table is written inside ``with`` so the file is closed on error;
    * each transcript's NULL fractions are computed once and reused across
      cutoffs;
    * the filtered lists, which were filled but never read, are gone.

    :param out1: first icSHAPE output; defaults to the in-vivo file for T/t.
    :param out2: second icSHAPE output; defaults to the in-vitro file for T/t.
    :param condition1: label for ``out1``; derived from its file name if None.
    :param condition2: label for ``out2``; derived from its file name if None.
    :param save_dir: directory receiving the plot and the table.
    :param T: value substituted into the default file names.
    :param t: value substituted into the default file names.
    """
    out1 = out1 if out1 is not None else '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/16-08-08_16_library_invivo_invitro/in_vivo_mRNA_kethoxal.T%st%s.out' % (
        T, t)
    out2 = out2 if out2 is not None else '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/16-08-08_16_library_invivo_invitro/in_vitro_mRNA_kethoxal.T%st%s.out' % (
        T, t)
    condition1 = condition1 if condition1 is not None else out1.split(
        '/')[-1].split('.')[0].replace('_kethoxal', '')
    condition2 = condition2 if condition2 is not None else out2.split(
        '/')[-1].split('.')[0].replace('_kethoxal', '')
    save_dir = save_dir if save_dir is not None else '/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/result/16-08-08_16_library_invivo_invitro/gini'
    gj.printFuncRun('compare_structure_gini')
    gj.printFuncArgs()

    out_dict1 = read_icshape_out(out1)
    out_dict2 = read_icshape_out(out2)

    overlap_savefn = save_dir + '/' + '%s_%s.%s.txOverlap.png' % (
        condition1, condition2, out1.split('/')[-1].split('.')[-2])
    gj.venn3plot(mode='string',
                 subsets_ls=[set(out_dict1.keys()),
                             set(out_dict2.keys())],
                 labels_ls=[condition1, condition2],
                 title_str=None,
                 save_fn=overlap_savefn,
                 axis=None)

    overlap_tx = set(out_dict1.keys()) & set(out_dict2.keys())
    null_pct_ls = [0.2, 0.4, 0.6, 0.8, 0.9, 1]
    gini_savefn = save_dir + '/' + '%s-%s.txt' % (out1.split('/')[-1],
                                                  out2.split('/')[-1])
    header_ls = [
        'tx', 'null_pct_cutoff', 'vivo', 'vitro', 'null_pct1', 'null_pct2'
    ]

    def null_fraction(reactivity_ls):
        # Share of entries that are the literal string 'NULL'.
        return reactivity_ls.count('NULL') / float(len(reactivity_ls))

    # tx -> (fraction in out1, fraction in out2); filled on first use because
    # the fractions do not depend on the cutoff.
    null_fraction_cache = {}

    with open(gini_savefn, 'w') as SAVEFN:
        SAVEFN.write('\t'.join(header_ls) + '\n')
        for null_pct in null_pct_ls:
            out1_gini_ls = [
                gj.gini(out_dict1[tx]['reactivity_ls'],
                        mode='gini',
                        null_pct=null_pct) for tx in overlap_tx
            ]
            out2_gini_ls = [
                gj.gini(out_dict2[tx]['reactivity_ls'],
                        mode='gini',
                        null_pct=null_pct) for tx in overlap_tx
            ]

            for gini1, gini2, tx in zip(out1_gini_ls, out2_gini_ls,
                                        overlap_tx):
                if not (float(gini1) >= 0 and float(gini2) >= 0):
                    continue
                if tx not in null_fraction_cache:
                    null_fraction_cache[tx] = (
                        null_fraction(out_dict1[tx]['reactivity_ls']),
                        null_fraction(out_dict2[tx]['reactivity_ls']))
                out1_tx_null_pct, out2_tx_null_pct = null_fraction_cache[tx]
                SAVEFN.write('\t'.join(
                    map(str, [
                        tx, null_pct, gini1, gini2, out1_tx_null_pct,
                        out2_tx_null_pct
                    ])) + '\n')

    gj.printFuncRun('compare_structure_gini')