def run_cuffquant_ERCC_k(self):
    """Write and launch the per-sample cuffquant jobs (ERCC, known annotation).

    Creates ``self.cuffquant_ercc_k`` when missing, builds a shell wrapper
    around ``$cflk_dir/cuffquant`` plus a work list holding one ``sh`` call
    per entry of ``self['samp']``, registers both through
    ``m_jobs.running_jobs`` and starts them with ``running_multi(cpu=8)``.
    """
    script_path = "%s/s08.1.cuffquant_k.ERCC.sh" % (self.script_dir)
    work_path = "%s/s08.1.cuffquant_k.ERCC_work.sh" % (self.script_dir)
    cufflinks_home = self['sftw_name'].cflk_dir

    if not os.path.isdir(self.cuffquant_ercc_k):
        os.mkdir(self.cuffquant_ercc_k)

    script_text = """
cflk_dir=$1
in_bam=$2
gtf_file=$3
out_dir=$4

$cflk_dir/cuffquant \\
    -p 8 -u \\
    -o $out_dir \\
    $gtf_file \\
    $in_bam
"""

    # One wrapper invocation per sample; the trailing " \n" is part of the
    # work-file format and is kept as-is.
    commands = []
    for samp in self['samp']:
        short_id = self['samp_info']['samp_brief'][samp]
        bam_path = "%s/%s/accepted_hits.bam" % (self.tophat, short_id)
        quant_dir = "%s/%s" % (self.cuffquant_ercc_k, short_id)
        commands.append(
            "sh %s %s %s %s %s \n" % (
                script_path, cufflinks_home, bam_path,
                self['infile']['anno_file'], quant_dir))

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file("".join(commands))
    job.running_multi(cpu=8)
def merge_novo_known_GTF(self):
    """Merge the known annotation with the filtered novel GTF.

    The generated shell script prefixes novel ``XLOC_``/``TCONS_`` ids with
    ``novo``, concatenates ``chr`` records of both GTFs through
    ``bedtools sort`` and then appends the ``ERCC``/``RGC`` records of the
    known annotation to produce a second, spike-in aware file.

    Side effect: stores the two output paths under
    ``self['infile']['anno_file_merge']`` and
    ``self['infile']['anno_file_merge_ERCC']`` before the job is started
    with ``running_multi(cpu=1)``.
    """
    script_path = "%s/Generate_Transcriptome.sh" % (self.script_dir)
    work_path = "%s/Generate_Transcriptome_work.sh" % (self.script_dir)

    script_text = """
known_GTF=$1
unknown_GTF=$2
merge_GTF=$3
merge_ERCC_GTF=$4

sed 's/XLOC_/novoXLOC_/g' $unknown_GTF | sed 's/TCONS_/novoTCONS_/g' >$unknown_GTF.tmp
grep -P "^chr" $known_GTF | cat /dev/stdin $unknown_GTF.tmp | bedtools sort -i /dev/stdin | grep -P "^chr" >$merge_GTF
rm $unknown_GTF.tmp
grep -P "^ERCC|^RGC" $known_GTF | cat $merge_GTF /dev/stdin >$merge_ERCC_GTF
"""

    annotation_gtf = self['infile']['anno_file']
    novel_gtf = "%s/novo_lnc_raw.combined.FPKM0.5_rep0.25.multiExon.gtf" % (
        self.data_dir)
    merged_gtf = "%s/all.exon.sort.gtf" % (self.data_dir)
    merged_ercc_gtf = "%s/all.exon.sort.ERCC.gtf" % (self.data_dir)

    # Publish the merged annotation paths for later steps.
    self['infile']['anno_file_merge'] = merged_gtf
    self['infile']['anno_file_merge_ERCC'] = merged_ercc_gtf

    command = "sh %s %s %s %s %s" % (
        script_path, annotation_gtf, novel_gtf, merged_gtf, merged_ercc_gtf)

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file(command)
    job.running_multi(cpu=1)
def run_trim(self):
    """Prepare the read-trimming wrapper script and its per-sample work list.

    For every entry of ``self['sample']`` one line is generated that calls
    ``bin/step1.trim.py`` (resolved under the current working directory)
    with the two cleaned FASTQ files, the trim output directory and the
    sample's brief name as prefix.

    NOTE(review): the job scripts are only loaded, ``running_multi`` is
    never called here - confirm that this is intentional.
    """
    project_root = os.path.abspath('./')
    clean_dir = self['dir_name']['clean_data']
    trimmed_dir = self['dir_name']['trim_data']

    scripts_root = "%s/scripts" % (project_root)
    tools_root = "%s/bin" % (project_root)

    script_path = "%s/s01.trim.sh" % (scripts_root)
    work_path = "%s/s01.trim_work.sh" % (scripts_root)
    trim_tool = "%s/step1.trim.py" % (tools_root)

    script_text = """
py_trim=$1
in_fq1=$2
in_fq2=$3
out_dir=$4
out_prefix=$5

python $py_trim $in_fq1 $in_fq2 $out_dir $out_prefix
"""

    commands = []
    for samp in self['sample']:
        short_id = self['sam_info']['samp_brief'][samp]
        mate1 = "%s/%s/1.cln.fq.gz" % (clean_dir, samp)
        mate2 = "%s/%s/2.cln.fq.gz" % (clean_dir, samp)
        target_dir = "%s" % (trimmed_dir)
        commands.append(
            "sh %s %s %s %s %s %s\n" % (
                script_path, trim_tool, mate1, mate2, target_dir, short_id))

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file("".join(commands))
def run_cuffquant(self):
    """Prepare the per-sample cuffquant wrapper script and work list.

    Creates ``self.cuffquant`` when missing and emits one ``sh`` line per
    entry of ``self['samp']`` passing the TopHat BAM, the annotation file
    and the per-sample output directory.

    NOTE(review): the cuffquant executable path is hard-coded in the shell
    template, and the job is only loaded (no ``running_multi`` call) -
    confirm both are intentional.
    """
    script_path = "%s/s08.cuffquant.sh" % (self.script_dir)
    work_path = "%s/s08.cuffquant_work.sh" % (self.script_dir)

    if not os.path.isdir(self.cuffquant):
        os.mkdir(self.cuffquant)

    script_text = """
in_bam=$1
gtf_file=$2
out_dir=$3

/data/Analysis/huboqiang/software/cufflinks-2.2.1.Linux_x86_64/cuffquant \\
    -p 8 -u \\
    -o $out_dir \\
    $gtf_file \\
    $in_bam
"""

    commands = []
    for samp in self['samp']:
        short_id = self['samp_info']['samp_brief'][samp]
        bam_path = "%s/%s/accepted_hits.bam" % (self.tophat, short_id)
        quant_dir = "%s/%s" % (self.cuffquant, short_id)
        commands.append(
            "sh %s %s %s %s \n" % (
                script_path, bam_path,
                self['infile']['anno_file'], quant_dir))

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file("".join(commands))
def run_cuffcomp_novo_trans(self):
    """Run cuffcompare once over the ``transcripts.gtf`` of every sample.

    The wrapper receives the cufflinks directory and the output prefix
    (``<data_dir>/novo_lnc_raw``) as its first two arguments; after two
    ``shift`` calls the remaining arguments (one GTF per entry of
    ``self['samp']``) are forwarded to ``cuffcompare`` via ``$@``.
    The single job is started with ``running_multi(cpu=8)``.
    """
    script_path = "%s/s06.1.cuffcompare_novo.sh" % (self.script_dir)
    work_path = "%s/s06.1.cuffcompare_novo_work.sh" % (self.script_dir)
    cufflinks_home = self['sftw_name'].cflk_dir

    # The template ends with a continuation backslash, exactly as before.
    script_text = """
cflk_dir=$1
out_prefix=$2
shift
shift

$cflk_dir/cuffcompare \\
    -o $out_prefix \\
    -T $@ \\
"""

    prefix = "%s/novo_lnc_raw" % (self.data_dir)
    sample_gtfs = []
    for samp in self['samp']:
        short_id = self['samp_info']['samp_brief'][samp]
        sample_gtfs.append(
            "%s/%s/transcripts.gtf" % (self.cufflink_u, short_id))

    command = "sh %s %s %s %s" % (
        script_path, cufflinks_home, prefix, " ".join(sample_gtfs))

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file(command)
    job.running_multi(cpu=8)
def run_cufflinks_u(self):
    """Write and launch per-sample cufflinks jobs on the genome-only BAMs.

    Each work line passes the cufflinks directory, the sample's
    ``accepted_hits.genome.sort.bam``, the annotation file and the output
    directory under ``self.cufflink_u``. The shell template accepts the
    annotation as ``$3`` but does not hand it to ``cufflinks``.
    Jobs are started with ``running_multi(cpu=8)``.
    """
    script_path = "%s/s05.cufflinks_GenomeMapped.sh" % (self.script_dir)
    work_path = "%s/s05.cufflinks_GenomeMapped_work.sh" % (
        self.script_dir)
    cufflinks_home = self['sftw_name'].cflk_dir

    script_text = """
cflk_dir=$1
in_bam=$2
gtf_file=$3
out_dir=$4

$cflk_dir/cufflinks \\
    -p 8 -u \\
    -o $out_dir \\
    $in_bam
"""

    commands = []
    for samp in self['samp']:
        short_id = self['samp_info']['samp_brief'][samp]
        bam_path = "%s/%s/accepted_hits.genome.sort.bam" % (
            self.tophat, short_id)
        assembly_dir = "%s/%s" % (self.cufflink_u, short_id)
        commands.append(
            "sh %s %s %s %s %s \n" % (
                script_path, cufflinks_home, bam_path,
                self['infile']['anno_file'], assembly_dir))

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file("".join(commands))
    job.running_multi(cpu=8)
def run_cuffnorm_ERCC_k(self):
    """Run a single cuffnorm job across all samples (ERCC, known annotation).

    Sample labels (comma-joined brief names) and the matching
    ``abundances.cxb`` paths under ``self.cuffquant_ercc_k`` (space-joined)
    are baked directly into the shell template; only the cufflinks
    directory is passed as a runtime argument. Creates
    ``self.cuffnorm_ercc_k`` when missing and starts the job with
    ``running_multi(cpu=8)``.
    """
    script_path = "%s/s09.1.cuffnorm.ERCC_k.sh" % (self.script_dir)
    work_path = "%s/s09.1.cuffnorm.ERCC_k_work.sh" % (self.script_dir)

    if not os.path.isdir(self.cuffnorm_ercc_k):
        os.mkdir(self.cuffnorm_ercc_k)

    cufflinks_home = self['sftw_name'].cflk_dir

    labels = [self['samp_info']['samp_brief'][samp] for samp in self['samp']]
    cxb_paths = [
        "%s/%s/abundances.cxb" % (self.cuffquant_ercc_k, label)
        for label in labels
    ]
    label_arg = ",".join(labels)
    cxb_arg = " ".join(cxb_paths)

    script_text = """
cflk_dir=$1

$cflk_dir/cuffnorm \\
    -p 8 -o %s -L %s \\
    %s \\
    %s
""" % (self.cuffnorm_ercc_k, label_arg,
       self['infile']['anno_file'], cxb_arg)

    command = "sh %s %s" % (script_path, cufflinks_home)

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file(command)
    job.running_multi(cpu=8)
def __get_HTS_clean_split(self):
    """Split the merged, cleaned gene table into three tables by id pattern.

    The generated script writes, under ``self.HTS``:
    rows matching neither ``^NONHSAG`` nor ``XLOC_`` (refseq table),
    the header plus ``^NONHSAG`` rows (NONCODE table) and
    the header plus ``^XLOC`` rows (NSMB table).
    The single job is started with ``running_multi(cpu=1)``.
    """
    script_path = "%s/p.HTSeq_split.sh" % (self.script_dir)
    work_path = "%s/p.HTSeq_split_work.sh" % (self.script_dir)

    script_text = """
infile=$1
out_Refseq=$2
out_NONCODE=$3
out_NSMB=$4

grep -v -P '^NONHSAG|XLOC_' $infile >$out_Refseq
head -n 1 $infile >$out_NONCODE && grep -P '^NONHSAG' $infile >>$out_NONCODE
head -n 1 $infile >$out_NSMB && grep -P '^XLOC' $infile >>$out_NSMB
"""

    merged_table = "%s/merge.dexseq_clean.gene.xls" % (self.HTS)
    refseq_table = "%s/merge.dexseq_clean_refseq.gene.xls" % (self.HTS)
    noncode_table = "%s/merge.dexseq_clean_NONCODE.gene.xls" % (self.HTS)
    nsmb_table = "%s/merge.dexseq_clean_NSMB.gene.xls" % (self.HTS)

    # The trailing space in the command line is preserved.
    command = "sh %s %s %s %s %s " % (
        script_path, merged_table, refseq_table, noncode_table, nsmb_table)

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file(command)
    job.running_multi(cpu=1)
def run_cuffcomp_novo_trans(self):
    """Compare all per-sample cufflinks assemblies with one cuffcompare call.

    Arguments one and two of the wrapper are the cufflinks directory and
    the output prefix ``<data_dir>/novo_lnc_raw``; they are shifted away so
    that ``$@`` contains only the ``transcripts.gtf`` files, one per entry
    of ``self['samp']``. Started with ``running_multi(cpu=8)``.
    """
    wrapper = "%s/s06.1.cuffcompare_novo.sh" % (self.script_dir)
    wrapper_work = "%s/s06.1.cuffcompare_novo_work.sh" % (self.script_dir)
    cuff_dir = self['sftw_name'].cflk_dir

    # Trailing continuation backslash on the last line is kept verbatim.
    wrapper_text = """
cflk_dir=$1
out_prefix=$2
shift
shift

$cflk_dir/cuffcompare \\
    -o $out_prefix \\
    -T $@ \\
"""

    result_prefix = "%s/novo_lnc_raw" % (self.data_dir)
    gtf_inputs = " ".join(
        "%s/%s/transcripts.gtf" % (
            self.cufflink_u, self['samp_info']['samp_brief'][samp])
        for samp in self['samp']
    )
    work_line = "sh %s %s %s %s" % (
        wrapper, cuff_dir, result_prefix, gtf_inputs)

    runner = m_jobs.running_jobs(wrapper, wrapper_work)
    runner.load_sh_file(wrapper_text)
    runner.load_sh_work_file(work_line)
    runner.running_multi(cpu=8)
def merge_novo_known_GTF(self):
    """Build the combined known + novel transcriptome GTF files.

    The shell script renames novel ``XLOC_``/``TCONS_`` ids to
    ``novoXLOC_``/``novoTCONS_``, merges ``chr`` lines of both annotations
    through ``bedtools sort`` and writes a second file that additionally
    carries the ``ERCC``/``RGC`` lines of the known annotation.

    Side effect: records both output paths in ``self['infile']`` under
    ``anno_file_merge`` and ``anno_file_merge_ERCC``. The job is started
    with ``running_multi(cpu=1)``.
    """
    wrapper = "%s/Generate_Transcriptome.sh" % (self.script_dir)
    wrapper_work = "%s/Generate_Transcriptome_work.sh" % (self.script_dir)

    wrapper_text = """
known_GTF=$1
unknown_GTF=$2
merge_GTF=$3
merge_ERCC_GTF=$4

sed 's/XLOC_/novoXLOC_/g' $unknown_GTF | sed 's/TCONS_/novoTCONS_/g' >$unknown_GTF.tmp
grep -P "^chr" $known_GTF | cat /dev/stdin $unknown_GTF.tmp | bedtools sort -i /dev/stdin | grep -P "^chr" >$merge_GTF
rm $unknown_GTF.tmp
grep -P "^ERCC|^RGC" $known_GTF | cat $merge_GTF /dev/stdin >$merge_ERCC_GTF
"""

    known = self['infile']['anno_file']
    novel = "%s/novo_lnc_raw.combined.FPKM0.5_rep0.25.multiExon.gtf" % (
        self.data_dir)
    merged = "%s/all.exon.sort.gtf" % (self.data_dir)
    merged_with_ercc = "%s/all.exon.sort.ERCC.gtf" % (self.data_dir)

    self['infile']['anno_file_merge'] = merged
    self['infile']['anno_file_merge_ERCC'] = merged_with_ercc

    work_line = "sh %s %s %s %s %s" % (
        wrapper, known, novel, merged, merged_with_ercc)

    runner = m_jobs.running_jobs(wrapper, wrapper_work)
    runner.load_sh_file(wrapper_text)
    runner.load_sh_work_file(work_line)
    runner.running_multi(cpu=1)
def run_cufflinks_u(self):
    """Prepare per-sample cufflinks scripts for the genome-only BAMs.

    One work line per entry of ``self['samp']`` passes the sample's
    ``accepted_hits.genome.sort.bam``, the annotation file and the output
    directory under ``self.cufflink_u``. The annotation is accepted as
    ``$2`` by the template but not forwarded to ``cufflinks``.

    NOTE(review): the cufflinks executable path is hard-coded and the job
    is only loaded (no ``running_multi`` call) - confirm both are intended.
    """
    wrapper = "%s/s05.cufflinks_GenomeMapped.sh" % (self.script_dir)
    wrapper_work = "%s/s05.cufflinks_GenomeMapped_work.sh" % (
        self.script_dir)

    wrapper_text = """
in_bam=$1
gtf_file=$2
out_dir=$3

/data/Analysis/huboqiang/software/cufflinks-2.2.1.Linux_x86_64/cufflinks \\
    -p 8 -u \\
    -o $out_dir \\
    $in_bam
"""

    work_lines = []
    for samp in self['samp']:
        label = self['samp_info']['samp_brief'][samp]
        genome_bam = "%s/%s/accepted_hits.genome.sort.bam" % (
            self.tophat, label)
        result_dir = "%s/%s" % (self.cufflink_u, label)
        work_lines.append(
            "sh %s %s %s %s \n" % (
                wrapper, genome_bam,
                self['infile']['anno_file'], result_dir))

    runner = m_jobs.running_jobs(wrapper, wrapper_work)
    runner.load_sh_file(wrapper_text)
    runner.load_sh_work_file("".join(work_lines))
def run_cuffnorm_ERCC_k(self):
    """Normalise all cuffquant (ERCC, known) results with one cuffnorm call.

    The output directory, the comma-separated sample labels, the
    annotation file and the space-separated ``abundances.cxb`` paths are
    substituted into the shell template; the cufflinks directory is the
    only runtime argument. Creates ``self.cuffnorm_ercc_k`` when missing
    and starts the job with ``running_multi(cpu=8)``.
    """
    wrapper = "%s/s09.1.cuffnorm.ERCC_k.sh" % (self.script_dir)
    wrapper_work = "%s/s09.1.cuffnorm.ERCC_k_work.sh" % (self.script_dir)

    if not os.path.isdir(self.cuffnorm_ercc_k):
        os.mkdir(self.cuffnorm_ercc_k)

    cuff_dir = self['sftw_name'].cflk_dir

    sample_labels = []
    abundance_files = []
    for samp in self['samp']:
        label = self['samp_info']['samp_brief'][samp]
        sample_labels.append(label)
        abundance_files.append(
            "%s/%s/abundances.cxb" % (self.cuffquant_ercc_k, label))

    wrapper_text = """
cflk_dir=$1

$cflk_dir/cuffnorm \\
    -p 8 -o %s -L %s \\
    %s \\
    %s
""" % (
        self.cuffnorm_ercc_k,
        ",".join(sample_labels),
        self['infile']['anno_file'],
        " ".join(abundance_files),
    )

    work_line = "sh %s %s" % (wrapper, cuff_dir)

    runner = m_jobs.running_jobs(wrapper, wrapper_work)
    runner.load_sh_file(wrapper_text)
    runner.load_sh_work_file(work_line)
    runner.running_multi(cpu=8)
def run_cuffquant_ERCC_k(self):
    """Quantify every sample with cuffquant against the annotation file.

    Ensures ``self.cuffquant_ercc_k`` exists, writes a wrapper around
    ``$cflk_dir/cuffquant`` and a work list with one line per entry of
    ``self['samp']`` (TopHat ``accepted_hits.bam`` in, per-sample directory
    out), then starts the jobs with ``running_multi(cpu=8)``.
    """
    wrapper = "%s/s08.1.cuffquant_k.ERCC.sh" % (self.script_dir)
    wrapper_work = "%s/s08.1.cuffquant_k.ERCC_work.sh" % (self.script_dir)
    cuff_dir = self['sftw_name'].cflk_dir

    if not os.path.isdir(self.cuffquant_ercc_k):
        os.mkdir(self.cuffquant_ercc_k)

    wrapper_text = """
cflk_dir=$1
in_bam=$2
gtf_file=$3
out_dir=$4

$cflk_dir/cuffquant \\
    -p 8 -u \\
    -o $out_dir \\
    $gtf_file \\
    $in_bam
"""

    work_lines = []
    for samp in self['samp']:
        label = self['samp_info']['samp_brief'][samp]
        aligned_bam = "%s/%s/accepted_hits.bam" % (self.tophat, label)
        result_dir = "%s/%s" % (self.cuffquant_ercc_k, label)
        work_lines.append(
            "sh %s %s %s %s %s \n" % (
                wrapper, cuff_dir, aligned_bam,
                self['infile']['anno_file'], result_dir))

    runner = m_jobs.running_jobs(wrapper, wrapper_work)
    runner.load_sh_file(wrapper_text)
    runner.load_sh_work_file("".join(work_lines))
    runner.running_multi(cpu=8)
def SRA2fastq(self):
    """Convert every sample's ``.sra`` archive into gzipped paired FASTQ files.

    The wrapper calls the configured fastq-dump executable with
    ``--split-files --gzip`` and then renames ``<samp>_1.fastq.gz`` /
    ``<samp>_2.fastq.gz`` to ``<samp>.1.fq.gz`` / ``<samp>.2.fq.gz``.
    Creates the FASTQ directory when missing; one work line per entry of
    ``self['sample']``; started with ``running_multi(cpu=8)``.
    """
    project_root = os.path.abspath('./')
    sra_dir = self['dir_name']['raw_data']
    fastq_dir = self['dir_name']['fastq_data']

    if not os.path.isdir(fastq_dir):
        os.mkdir(fastq_dir)

    dump_exe = self['sftw_name'].fastqDump
    wrapper = "%s/scripts/s01.SRA2Fastq.sh" % (project_root)
    wrapper_work = "%s/scripts/s01.SRA2Fastq_work.sh" % (project_root)

    wrapper_text = """
samp_name=$1
fqDump=$2
raw_dir=$3
fq_dir=$4

$fqDump --split-files --gzip --outdir $fq_dir/${samp_name} $raw_dir/${samp_name}.sra

mv $fq_dir/${samp_name}/${samp_name}_1.fastq.gz $fq_dir/${samp_name}/${samp_name}.1.fq.gz && \\
mv $fq_dir/${samp_name}/${samp_name}_2.fastq.gz $fq_dir/${samp_name}/${samp_name}.2.fq.gz
"""

    # The leading space of each work line is preserved.
    work_lines = [
        " sh %s %s %s %s %s\n" % (
            wrapper, samp_name, dump_exe, sra_dir, fastq_dir)
        for samp_name in self['sample']
    ]

    runner = m_jobs.running_jobs(wrapper, wrapper_work)
    runner.load_sh_file(wrapper_text)
    runner.load_sh_work_file("".join(work_lines))
    runner.running_multi(cpu=8)
def __get_HTS_clean_split(self):
    """Split the merged gene tables by id pattern and filter the Neo table.

    Under ``self.HTS`` the generated script writes:
    rows matching neither ``^NONHSAG`` nor ``XLOC_`` (refseq table),
    header + ``^NONHSAG`` rows (NONCODE table),
    header + ``^XLOC`` rows (NSMB table), and
    header + every row of the NeoRaw table that ``grep -w`` matches for an
    id listed in column 1 of the ``.genlen`` file under ``self.data_dir``
    (NeoPass table). The job is started with ``running_multi(cpu=1)``.
    """
    wrapper = "%s/p.HTSeq_split.sh" % (self.script_dir)
    wrapper_work = "%s/p.HTSeq_split_work.sh" % (self.script_dir)

    # self.data_dir is substituted into the template; everything else is
    # passed as positional shell arguments.
    wrapper_text = """
infile=$1
out_Refseq=$2
out_NONCODE=$3
out_NSMB=$4
inNeo=$5
outNeo=$6

grep -v -P '^NONHSAG|XLOC_' $infile >$out_Refseq
head -n 1 $infile >$out_NONCODE && grep -P '^NONHSAG' $infile >>$out_NONCODE
head -n 1 $infile >$out_NSMB && grep -P '^XLOC' $infile >>$out_NSMB

head -n 1 $inNeo >$outNeo
for i in `cut -f 1 %s/novo_lnc_raw.combined.FPKM0.5_rep0.25.multiExon.genlen | uniq`;do grep -w $i $inNeo ;done >>$outNeo
""" % (self.data_dir)

    clean_table = "%s/merge.dexseq_clean.gene.xls" % (self.HTS)
    refseq_table = "%s/merge.dexseq_clean_refseq.gene.xls" % (self.HTS)
    noncode_table = "%s/merge.dexseq_clean_NONCODE.gene.xls" % (self.HTS)
    nsmb_table = "%s/merge.dexseq_clean_NSMB.gene.xls" % (self.HTS)
    neo_raw_table = "%s/merge.dexseq_NeoRaw.gene.xls" % (self.HTS)
    neo_pass_table = "%s/merge.dexseq_NeoPass.gene.xls" % (self.HTS)

    work_line = "sh %s %s %s %s %s %s %s" % (
        wrapper, clean_table, refseq_table, noncode_table, nsmb_table,
        neo_raw_table, neo_pass_table)

    runner = m_jobs.running_jobs(wrapper, wrapper_work)
    runner.load_sh_file(wrapper_text)
    runner.load_sh_work_file(work_line)
    runner.running_multi(cpu=1)
def run_cufflinks_u(self):
    """Assemble transcripts per sample with cufflinks on genome-only BAMs.

    For each entry of ``self['samp']`` the work list carries the cufflinks
    directory, ``<tophat>/<brief>/accepted_hits.genome.sort.bam``, the
    annotation file and ``<cufflink_u>/<brief>``. The template reads the
    annotation as ``$3`` without passing it on to ``cufflinks``.
    Started with ``running_multi(cpu=8)``.
    """
    wrapper = "%s/s05.cufflinks_GenomeMapped.sh" % (self.script_dir)
    wrapper_work = "%s/s05.cufflinks_GenomeMapped_work.sh" % (
        self.script_dir)
    cuff_dir = self['sftw_name'].cflk_dir

    wrapper_text = """
cflk_dir=$1
in_bam=$2
gtf_file=$3
out_dir=$4

$cflk_dir/cufflinks \\
    -p 8 -u \\
    -o $out_dir \\
    $in_bam
"""

    work_lines = []
    for samp in self['samp']:
        label = self['samp_info']['samp_brief'][samp]
        genome_bam = "%s/%s/accepted_hits.genome.sort.bam" % (
            self.tophat, label)
        result_dir = "%s/%s" % (self.cufflink_u, label)
        work_lines.append(
            "sh %s %s %s %s %s \n" % (
                wrapper, cuff_dir, genome_bam,
                self['infile']['anno_file'], result_dir))

    runner = m_jobs.running_jobs(wrapper, wrapper_work)
    runner.load_sh_file(wrapper_text)
    runner.load_sh_work_file("".join(work_lines))
    runner.running_multi(cpu=8)
def run_hisat_mannual(self):
    """Align every sample with HISAT in two passes and sort the result.

    Pass one discards alignments (``-S /dev/null``) and only records novel
    splice sites; pass two re-aligns with those splice sites, pipes the
    SAM stream through two awk filters (the first rewrites the ``XS`` tag
    from the FLAG bits, the second keeps records whose SEQ and QUAL fields
    have equal length), converts to BAM with ``samtools view -q 1`` and
    sorts it.

    Creates the HISAT output directory and one sub-directory per sample
    brief name. One work line per entry of ``self['sample']``; started
    with ``running_multi(cpu=6)``.
    """
    project_root = os.path.abspath('./')
    fastq_dir = self['dir_name']['fastq_data']
    align_root = self['dir_name']['hisat_mannual_dir']

    if not os.path.isdir(align_root):
        os.mkdir(align_root)

    scripts_root = "%s/scripts" % (project_root)
    hisat_exe = self['sftw_name'].hisat
    samtools_bin = self['sftw_name'].samtools

    wrapper = "%s/s02.4.hisatMannual.sh" % (scripts_root)
    wrapper_work = "%s/s02.4.hisatMannual_work.sh" % (scripts_root)

    # The awk programs are kept on one line exactly as written.
    wrapper_text = """
hisat=$1
fq_dir=$2
samp_name=$3
brief_name=$4
hisat_dir=$5
genome=$6
splice_file=$7
samtools_exe=$8

$hisat -p 8 -x $genome --phred64 \\
    -1 $fq_dir/$samp_name/$samp_name.1.fq.gz \\
    -2 $fq_dir/$samp_name/$samp_name.2.fq.gz \\
    -S /dev/null \\
    --novel-splicesite-outfile $splice_file \\
    2>$hisat_dir/$brief_name/log && \\
$hisat -p 8 -x $genome --phred64 \\
    -1 $fq_dir/$samp_name/$samp_name.1.fq.gz \\
    -2 $fq_dir/$samp_name/$samp_name.2.fq.gz \\
    -S /dev/stdout \\
    --novel-splicesite-infile $splice_file \\
    2>$hisat_dir/$brief_name/log.2 |\\
awk '{if($1 ~ /^@/) print $0; else{ for(i=1;i<=NF;i++) if($i!~/^XS/) printf("%s\\t",$i);else XS0=$i; XS1=((and($2, 0x10) && and($2, 0x40)) || (and($2,0x80) && !and($2,0x10)))?"XS:A:+":"XS:A:-"; print XS1 } }' | awk '{if(length($10)==length($11)){print $0}}' | $samtools_exe view -Sb -q 1 - >$hisat_dir/$brief_name/accepted_hits.raw.bam &&\\
$samtools_exe sort -m 2000000000 $hisat_dir/$brief_name/accepted_hits.raw.bam $hisat_dir/$brief_name/accepted_hits
"""

    work_lines = []
    for samp in self['sample']:
        label = self['sam_info']['samp_brief'][samp]

        # The shell script redirects logs into this directory, so it has
        # to exist before the job starts.
        sample_dir = "%s/%s" % (align_root, label)
        if not os.path.isdir(sample_dir):
            os.mkdir(sample_dir)

        index_prefix = "%s.hisat" % (self['infile']['genome_file'])
        splice_sites = "%s/%s.spliceSites" % (align_root, label)
        work_lines.append(
            "sh %s %s %s %s %s %s %s %s %s\n" % (
                wrapper, hisat_exe, fastq_dir, samp, label,
                align_root, index_prefix, splice_sites, samtools_bin))

    runner = m_jobs.running_jobs(wrapper, wrapper_work)
    runner.load_sh_file(wrapper_text)
    runner.load_sh_work_file("".join(work_lines))
    runner.running_multi(cpu=6)
def run_CIRCexplorer(self):
    """Prepare the per-sample CIRCexplorer paired-end check scripts.

    The CIRCexplorer call itself is commented out in the shell template;
    only ``CIRCexplorer_PE_check.py`` is executed, reading the raw TopHat
    BAM and ``<out>/<samp>/CIRCexplorer_circ.txt``. Creates
    ``self.CIRCexplorer`` when missing. The job scripts are loaded but not
    started in this method.

    NOTE(review): iterates ``self['samp']`` but looks brief names up in
    ``self['sam_info']`` (other ``self['samp']`` loops in this file use
    ``self['samp_info']``) - confirm the key. Both script paths are
    hard-coded absolute paths.
    """
    script_path = "%s/s10.CIRCexplorer.sh" % (self.script_dir)
    work_path = "%s/s10.CIRCexplorer_work.sh" % (self.script_dir)

    if not os.path.isdir(self.CIRCexplorer):
        os.mkdir(self.CIRCexplorer)

    explorer_py = "/data/Analysis/huboqiang/software/CIRCexplorer/CIRCexplorer_PE.py"
    pe_check_py = "/datc/huboqiang/cir_dyj_V2/bin/CIRCexplorer_PE_check.py"

    script_text = """
py_CIRCexplorer=$1
in_bam=$2
genome=$3
ref_file=$4
out_file=$5
samp=$6
py_CIRCexplorer_PE_check=$7
in_raw_bam=$8

[ ! -d $out_file/$samp ] && mkdir -p $out_file/$samp

#python $py_CIRCexplorer \\
#    -f $in_bam \\
#    -g $genome \\
#    -r $ref_file \\
#    --tmp \\
#    -o $out_file/$samp/CIRCexplorer

python $py_CIRCexplorer_PE_check \\
    --raw_bam $in_raw_bam \\
    --out_prefix $out_file/$samp/CIRCexplorer_circ_PE \\
    $out_file/$samp/CIRCexplorer_circ.txt
"""

    commands = []
    for samp in self['samp']:
        short_id = self['sam_info']['samp_brief'][samp]
        fusion_bam = "%s/%s/accepted_hits.bam" % (
            self.tophat_fusion, short_id)
        genome_fa = self['infile']['genome_file']
        ref_table = self['infile']['ref_file']
        result_root = self.CIRCexplorer
        raw_bam = "%s/%s/accepted_hits.bam" % (self.tophat, short_id)
        commands.append(
            "sh %s %s %s %s %s %s %s %s %s \n" % (
                script_path, explorer_py, fusion_bam, genome_fa, ref_table,
                result_root, short_id, pe_check_py, raw_bam))

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file("".join(commands))
def run_HTSeq_known(self):
    """Count reads on the known annotation and extract the unassigned reads.

    Per sample the generated script:
    saves the BAM header, name-sorts the TopHat BAM and converts it to SAM,
    runs the configured counting script (``-s no -f sam -a 10``) writing
    an annotated SAM plus ``<samp>.dexseq.txt``,
    splits the count table into a cleaned table and an ERCC/RGC table,
    collects ``__no_feature`` reads (excluding ``chrM``) into
    ``accepted_hits.genome.sort.bam`` and removes the intermediates.

    One work line per entry of ``self['samp']``; started with
    ``running_multi(cpu=8)``.
    """
    script_path = "%s/s04.HTSeq_known.sh" % (self.script_dir)
    work_path = "%s/s04.HTSeq_known_work.sh" % (self.script_dir)

    python_bin = self['sftw_name'].py
    samtools_bin = self['sftw_name'].samtools
    counter_script = self['sftw_name'].deseq

    script_text = """
py_exe=$1
samtools_exe=$2
deseq_exe=$3
tophat_dir=$4
samp_name=$5
HTS_k_dir=$6
known_GTF=$7

$samtools_exe view -H $tophat_dir/$samp_name/accepted_hits.bam > $tophat_dir/$samp_name/accepted_hits.header.sam
$samtools_exe sort -n -m 200000000 $tophat_dir/$samp_name/accepted_hits.bam $tophat_dir/$samp_name/accepted_hits.sort_name
$samtools_exe view -o $tophat_dir/$samp_name/accepted_hits.sort_name.sam $tophat_dir/$samp_name/accepted_hits.sort_name.bam
[ ! -d $HTS_k_dir/$samp_name ] && mkdir -p $HTS_k_dir/$samp_name
$py_exe $deseq_exe \\
    -s no -f sam -a 10 \\
    -o $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam \\
    $tophat_dir/$samp_name/accepted_hits.sort_name.sam $known_GTF >$HTS_k_dir/$samp_name/$samp_name.dexseq.txt && \\
grep -v -P '^ERCC-|^RGC-|MIR|SNORD|Mir|Snord' $HTS_k_dir/$samp_name/$samp_name.dexseq.txt > $HTS_k_dir/$samp_name/$samp_name.dexseq_clean.txt && \\
grep -P '^ERCC-|^RGC-' $HTS_k_dir/$samp_name/$samp_name.dexseq.txt > $HTS_k_dir/$samp_name/$samp_name.dexseq_ERCC_RGCPloyA.txt && \\
grep "__no_feature" $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam | grep -v chrM | \\
    cat $tophat_dir/$samp_name/accepted_hits.header.sam /dev/stdin | \\
    $samtools_exe view -Sb /dev/stdin >$tophat_dir/$samp_name/accepted_hits.genome.bam && \\
$samtools_exe sort -m 200000000 $tophat_dir/$samp_name/accepted_hits.genome.bam $tophat_dir/$samp_name/accepted_hits.genome.sort
rm $tophat_dir/$samp_name/accepted_hits.sort_name.sam $tophat_dir/$samp_name/accepted_hits.sort_name.bam $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam $tophat_dir/$samp_name/accepted_hits.genome.bam
"""

    commands = []
    for samp in self['samp']:
        short_id = self['samp_info']['samp_brief'][samp]
        annotation_gtf = self['infile']['anno_file']
        commands.append(
            "sh %s %s %s %s %s %s %s %s\n" % (
                script_path, python_bin, samtools_bin, counter_script,
                self.tophat, short_id, self.HTS_k, annotation_gtf))

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file("".join(commands))
    job.running_multi(cpu=8)
def run_tophat(self):
    """Align the cleaned reads of every sample with TopHat.

    Creates the TopHat output directory when missing. For each entry of
    ``self['sample']`` the work line passes read 1 implicitly (via the
    clean-data directory and sample name) and, when the sample's
    ``data_type`` is ``"PE"``, the path of read 2 as the last argument;
    otherwise the last argument is empty. Started with
    ``running_multi(cpu=6)``.

    NOTE(review): the ``--transcriptome-index`` path is hard-coded in the
    shell template.
    """
    project_root = os.path.abspath('./')
    clean_dir = self['dir_name']['clean_data']
    align_root = self['dir_name']['tophat_dir']

    if not os.path.isdir(align_root):
        os.mkdir(align_root)

    scripts_root = "%s/scripts" % (project_root)
    tophat_exe = self['sftw_name'].tophat

    script_path = "%s/s02.tophat.sh" % (scripts_root)
    work_path = "%s/s02.tophat_work.sh" % (scripts_root)

    script_text = """
tophat_py=$1
cln_dir=$2
samp_name=$3
brief_name=$4
tophat_dir=$5
genome=$6
gtf_file=$7
PE2=$8

$tophat_py \\
    -p 8 -G $gtf_file \\
    --library-type fr-unstranded \\
    --transcriptome-index /datc/huboqiang/cir_dyj_V2/Database/refseqGene.ERCC_RGCPloyA.exon.sort \\
    -o $tophat_dir/$brief_name \\
    $genome \\
    $cln_dir/$samp_name/1.cln.fq.gz $PE2
"""

    commands = []
    for samp in self['sample']:
        short_id = self['sam_info']['samp_brief'][samp]
        # Single-end samples leave the mate-2 argument empty.
        if self['sam_info']['data_type'][samp] == "PE":
            mate2 = "%s/%s/2.cln.fq.gz" % (clean_dir, samp)
        else:
            mate2 = ""
        commands.append(
            "sh %s %s %s %s %s %s %s %s %s\n" % (
                script_path, tophat_exe, clean_dir, samp, short_id,
                align_root, self['infile']['genome_file'],
                self['infile']['anno_file'], mate2))

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file("".join(commands))
    job.running_multi(cpu=6)
def run_CIRCexplorer(self):
    """Write the per-sample scripts for the CIRCexplorer paired-end check.

    Only ``CIRCexplorer_PE_check.py`` runs; the CIRCexplorer invocation is
    present in the template as shell comments. ``self.CIRCexplorer`` is
    created when missing. The scripts are loaded into the job object but
    this method does not start them.

    NOTE(review): the loop runs over ``self['samp']`` while brief names
    are read from ``self['sam_info']`` - verify against the class's keys.
    The two Python script locations are hard-coded.
    """
    wrapper = "%s/s10.CIRCexplorer.sh" % (self.script_dir)
    wrapper_work = "%s/s10.CIRCexplorer_work.sh" % (self.script_dir)

    if not os.path.isdir(self.CIRCexplorer):
        os.mkdir(self.CIRCexplorer)

    circ_py = "/data/Analysis/huboqiang/software/CIRCexplorer/CIRCexplorer_PE.py"
    circ_check_py = "/datc/huboqiang/cir_dyj_V2/bin/CIRCexplorer_PE_check.py"

    wrapper_text = """
py_CIRCexplorer=$1
in_bam=$2
genome=$3
ref_file=$4
out_file=$5
samp=$6
py_CIRCexplorer_PE_check=$7
in_raw_bam=$8

[ ! -d $out_file/$samp ] && mkdir -p $out_file/$samp

#python $py_CIRCexplorer \\
#    -f $in_bam \\
#    -g $genome \\
#    -r $ref_file \\
#    --tmp \\
#    -o $out_file/$samp/CIRCexplorer

python $py_CIRCexplorer_PE_check \\
    --raw_bam $in_raw_bam \\
    --out_prefix $out_file/$samp/CIRCexplorer_circ_PE \\
    $out_file/$samp/CIRCexplorer_circ.txt
"""

    work_lines = []
    for samp in self['samp']:
        label = self['sam_info']['samp_brief'][samp]
        arguments = (
            wrapper,
            circ_py,
            "%s/%s/accepted_hits.bam" % (self.tophat_fusion, label),
            self['infile']['genome_file'],
            self['infile']['ref_file'],
            self.CIRCexplorer,
            label,
            circ_check_py,
            "%s/%s/accepted_hits.bam" % (self.tophat, label),
        )
        work_lines.append("sh %s %s %s %s %s %s %s %s %s \n" % arguments)

    runner = m_jobs.running_jobs(wrapper, wrapper_work)
    runner.load_sh_file(wrapper_text)
    runner.load_sh_work_file("".join(work_lines))
def run_HTSeq_known(self):
    """Count reads on the known annotation; keep the annotated SAM as BAM.

    Per sample the generated script name-sorts the TopHat BAM, runs the
    configured counting script (``-s no -f sam -a 10``), splits the count
    table into a cleaned table and an ERCC/RGC table, writes
    ``__no_feature`` reads (excluding ``chrM``) to
    ``accepted_hits.genome.sort.bam``, converts the annotated SAM into
    ``accepted_hits.sort_name.gene.bam`` and finally removes the
    intermediate SAM/BAM files.

    One work line per entry of ``self['samp']``; started with
    ``running_multi(cpu=8)``.
    """
    wrapper = "%s/s03.HTSeq_known.sh" % (self.script_dir)
    wrapper_work = "%s/s03.HTSeq_known_work.sh" % (self.script_dir)

    py_bin = self['sftw_name'].py
    sam_bin = self['sftw_name'].samtools
    count_script = self['sftw_name'].deseq

    wrapper_text = """
py_exe=$1
samtools_exe=$2
deseq_exe=$3
tophat_dir=$4
samp_name=$5
HTS_k_dir=$6
known_GTF=$7

$samtools_exe view -H $tophat_dir/$samp_name/accepted_hits.bam > $tophat_dir/$samp_name/accepted_hits.header.sam
$samtools_exe sort -n -m 200000000 $tophat_dir/$samp_name/accepted_hits.bam $tophat_dir/$samp_name/accepted_hits.sort_name
$samtools_exe view -o $tophat_dir/$samp_name/accepted_hits.sort_name.sam $tophat_dir/$samp_name/accepted_hits.sort_name.bam
[ ! -d $HTS_k_dir/$samp_name ] && mkdir -p $HTS_k_dir/$samp_name
$py_exe $deseq_exe \\
    -s no -f sam -a 10 \\
    -o $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam \\
    $tophat_dir/$samp_name/accepted_hits.sort_name.sam $known_GTF >$HTS_k_dir/$samp_name/$samp_name.dexseq.txt && \\
grep -v -P '^ERCC-|^RGC-|MIR|SNORD|Mir|Snord' $HTS_k_dir/$samp_name/$samp_name.dexseq.txt > $HTS_k_dir/$samp_name/$samp_name.dexseq_clean.txt && \\
grep -P '^ERCC-|^RGC-' $HTS_k_dir/$samp_name/$samp_name.dexseq.txt > $HTS_k_dir/$samp_name/$samp_name.dexseq_ERCC_RGCPloyA.txt && \\
grep "__no_feature" $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam | grep -v chrM | \\
    cat $tophat_dir/$samp_name/accepted_hits.header.sam /dev/stdin | \\
    $samtools_exe view -Sb /dev/stdin >$tophat_dir/$samp_name/accepted_hits.genome.bam && \\
$samtools_exe view -Sb $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam >$tophat_dir/$samp_name/accepted_hits.sort_name.gene.bam && \\
$samtools_exe sort -m 200000000 $tophat_dir/$samp_name/accepted_hits.genome.bam $tophat_dir/$samp_name/accepted_hits.genome.sort
rm $tophat_dir/$samp_name/accepted_hits.sort_name.sam $tophat_dir/$samp_name/accepted_hits.sort_name.bam $tophat_dir/$samp_name/accepted_hits.genome.bam $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam
"""

    work_lines = []
    for samp in self['samp']:
        label = self['samp_info']['samp_brief'][samp]
        gtf_path = self['infile']['anno_file']
        work_lines.append(
            "sh %s %s %s %s %s %s %s %s\n" % (
                wrapper, py_bin, sam_bin, count_script,
                self.tophat, label, self.HTS_k, gtf_path))

    runner = m_jobs.running_jobs(wrapper, wrapper_work)
    runner.load_sh_file(wrapper_text)
    runner.load_sh_work_file("".join(work_lines))
    runner.running_multi(cpu=8)
def run_repeat_count(self):
    """Count uniquely mapped reads overlapping the repeat-masker intervals.

    Per sample the generated script keeps mapped reads (``-F 0x0004``)
    that carry ``NH:i:1`` and do not mention the ERCC/RGC spike-ins,
    writes them as a four-column BED file, intersects that file with the
    ``rmsk_bed`` input (``bedtools intersect -sorted -loj``) and pipes the
    result into ``Repeat_Intersect2Count.py``.

    Creates ``self.repeatCount`` and one sub-directory per sample brief
    name. One work line per entry of ``self['samp']``; started with
    ``running_multi(cpu=8)``.
    """
    script_path = "%s/s13.Repeat_Count.sh" % (self.script_dir)
    work_path = "%s/s13.Repeat_Count_work.sh" % (self.script_dir)
    counter_py = "%s/Repeat_Intersect2Count.py" % (self.bin_dir)

    samtools_bin = self['sftw_name'].samtools
    bedtools_bin = self['sftw_name'].bedtools
    python_bin = self['sftw_name'].py

    if not os.path.isdir(self.repeatCount):
        os.mkdir(self.repeatCount)

    script_text = """
samtools_exe=$1
bedtools_exe=$2
py_exe=$3
in_bam=$4
gtf_bed=$5
py_Repeat_Intersect2Count=$6
out_dir=$7

$samtools_exe view -F 0x0004 $in_bam | \\
    grep -v ERCC-00* | grep -v RGC-CRE| \\
    grep -v RGC-GFP | grep -v RGC-mRFP |grep "\\bNH:i:1\\b" | \\
    awk '{OFS="\\t"; print $3,$4,$4+length($10),$1 }' >${out_dir}/repeat_result.bed

$bedtools_exe intersect -sorted -loj -a $gtf_bed -b ${out_dir}/repeat_result.bed | \\
    $py_exe $py_Repeat_Intersect2Count /dev/stdin >${out_dir}/repeat_count.bed
"""

    commands = []
    for samp in self['samp']:
        short_id = self['samp_info']['samp_brief'][samp]
        bam_path = "%s/%s/accepted_hits.bam" % (self.tophat, short_id)
        sample_dir = "%s/%s" % (self.repeatCount, short_id)
        commands.append(
            "sh %s %s %s %s %s %s %s %s\n" % (
                script_path, samtools_bin, bedtools_bin, python_bin,
                bam_path, self['infile']['rmsk_bed'], counter_py,
                sample_dir))
        # The shell script redirects into this directory.
        if not os.path.isdir(sample_dir):
            os.mkdir(sample_dir)

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file("".join(commands))
    job.running_multi(cpu=8)
def run_repeat_count(self):
    """Count uniquely mapped reads overlapping the repeat-masker intervals.

    Per sample the generated script keeps mapped reads (``-F 0x0004``)
    that carry ``NH:i:1`` and do not mention the ERCC/RGC spike-ins,
    writes them as a four-column BED file, sorts that file, intersects it
    with the ``rmsk_bed`` input (``bedtools intersect -sorted -loj``) and
    pipes the result into ``Repeat_Intersect2Count.py``.

    Creates ``self.repeatCount`` and one sub-directory per sample brief
    name. One work line per entry of ``self['samp']``; started with
    ``running_multi(cpu=8)``.
    """
    sh_file = "%s/s09.Repeat_Count.sh" % (self.script_dir)
    sh_work_file = "%s/s09.Repeat_Count_work.sh" % (self.script_dir)
    py_Repeat_Intersect2Count = "%s/Repeat_Intersect2Count.py" % (
        self.bin_dir)

    samtools_exe = self['sftw_name'].samtools
    bedtools_exe = self['sftw_name'].bedtools
    py_exe = self['sftw_name'].py

    if not os.path.isdir(self.repeatCount):
        os.mkdir(self.repeatCount)

    # Two fixes relative to the previous template:
    #  * `sort` prints to stdout unless redirected, so without the `>`
    #    repeat_result.sort.bed was never written and the following
    #    `bedtools intersect -b` had no input file.
    #  * `grep NH:i:1` also matched NH:i:10..NH:i:19 (multi-mapped reads);
    #    the tag is now anchored with word boundaries.
    # The template is not %-formatted, so the literal "10%" is safe.
    sh_info = """
samtools_exe=$1
bedtools_exe=$2
py_exe=$3
in_bam=$4
gtf_bed=$5
py_Repeat_Intersect2Count=$6
out_dir=$7

$samtools_exe view -F 0x0004 $in_bam | \\
    grep -v ERCC-00* | grep -v RGC-CRE| \\
    grep -v RGC-GFP | grep -v RGC-mRFP |grep "\\bNH:i:1\\b" | \\
    awk '{OFS="\\t"; print $3,$4,$4+length($10),$1 }' >${out_dir}/repeat_result.bed

sort -S 10% -k1V -k2n -k3n ${out_dir}/repeat_result.bed >${out_dir}/repeat_result.sort.bed

$bedtools_exe intersect -sorted -loj -a $gtf_bed -b ${out_dir}/repeat_result.sort.bed | \\
    $py_exe $py_Repeat_Intersect2Count /dev/stdin >${out_dir}/repeat_count.bed
"""

    work_lines = []
    for samp in self['samp']:
        brief_name = self['samp_info']['samp_brief'][samp]
        in_bam = "%s/%s/accepted_hits.bam" % (self.tophat, brief_name)
        out_dir = "%s/%s" % (self.repeatCount, brief_name)
        work_lines.append(
            "sh %s %s %s %s %s %s %s %s\n" % (
                sh_file, samtools_exe, bedtools_exe, py_exe, in_bam,
                self['infile']['rmsk_bed'], py_Repeat_Intersect2Count,
                out_dir))
        # The shell script redirects into this directory.
        if not os.path.isdir(out_dir):
            os.mkdir(out_dir)

    my_job = m_jobs.running_jobs(sh_file, sh_work_file)
    my_job.load_sh_file(sh_info)
    my_job.load_sh_work_file("".join(work_lines))
    my_job.running_multi(cpu=8)
    # Alternative cluster submission kept for reference:
    # my_job.running_SGE(vf="500m", maxjob=100)
def run_tophat(self):
    """Generate and start the TopHat alignment of each sample's clean reads.

    The TopHat output directory is created when missing. Work lines are
    built for every entry of ``self['sample']``; samples whose
    ``data_type`` equals ``"PE"`` get the mate-2 FASTQ path as the final
    argument, all others an empty string. Started with
    ``running_multi(cpu=6)``.

    NOTE(review): ``--transcriptome-index`` points at a hard-coded path
    inside the shell template.
    """
    here = os.path.abspath('./')
    cleaned_root = self['dir_name']['clean_data']
    tophat_root = self['dir_name']['tophat_dir']

    if not os.path.isdir(tophat_root):
        os.mkdir(tophat_root)

    scripts_dir = "%s/scripts" % (here)
    tophat_bin = self['sftw_name'].tophat

    wrapper = "%s/s02.tophat.sh" % (scripts_dir)
    wrapper_work = "%s/s02.tophat_work.sh" % (scripts_dir)

    wrapper_text = """
tophat_py=$1
cln_dir=$2
samp_name=$3
brief_name=$4
tophat_dir=$5
genome=$6
gtf_file=$7
PE2=$8

$tophat_py \\
    -p 8 -G $gtf_file \\
    --library-type fr-unstranded \\
    --transcriptome-index /datc/huboqiang/cir_dyj_V2/Database/refseqGene.ERCC_RGCPloyA.exon.sort \\
    -o $tophat_dir/$brief_name \\
    $genome \\
    $cln_dir/$samp_name/1.cln.fq.gz $PE2
"""

    work_lines = []
    for samp in self['sample']:
        label = self['sam_info']['samp_brief'][samp]
        is_paired = self['sam_info']['data_type'][samp] == "PE"
        second_mate = (
            "%s/%s/2.cln.fq.gz" % (cleaned_root, samp) if is_paired else "")
        work_lines.append(
            "sh %s %s %s %s %s %s %s %s %s\n" % (
                wrapper, tophat_bin, cleaned_root, samp, label,
                tophat_root, self['infile']['genome_file'],
                self['infile']['anno_file'], second_mate))

    runner = m_jobs.running_jobs(wrapper, wrapper_work)
    runner.load_sh_file(wrapper_text)
    runner.load_sh_work_file("".join(work_lines))
    runner.running_multi(cpu=6)
def run_tophat_mannual(self):
    """Align read 1 of every sample with TopHat using relaxed edit distances.

    The template runs TopHat with ``--read-edit-dist 3``,
    ``--read-realign-edit-dist 3`` and ``--phred64-quals`` on
    ``<fq_dir>/<samp>/<samp>.1.fq.gz`` only. The annotation file and the
    samtools executable are passed as ``$7``/``$8`` but are not used by
    the template. Creates the output directory when missing; one work line
    per entry of ``self['sample']``; started with ``running_multi(cpu=6)``.
    """
    project_root = os.path.abspath('./')
    fastq_dir = self['dir_name']['fastq_data']
    align_root = self['dir_name']['tophat_mannual_dir']

    if not os.path.isdir(align_root):
        os.mkdir(align_root)

    scripts_root = "%s/scripts" % (project_root)
    tophat_exe = self['sftw_name'].tophat
    samtools_bin = self['sftw_name'].samtools

    script_path = "%s/s02.2.tophatMannual.sh" % (scripts_root)
    work_path = "%s/s02.2.tophatMannual_work.sh" % (scripts_root)

    script_text = """
tophat_py=$1
fq_dir=$2
samp_name=$3
brief_name=$4
tophat_dir=$5
genome=$6
gtf_file=$7
samtools_exe=$8

$tophat_py \\
    -p 8 \\
    --read-edit-dist 3 \\
    --read-realign-edit-dist 3 \\
    --phred64-quals \\
    -o $tophat_dir/$brief_name \\
    $genome \\
    $fq_dir/$samp_name/$samp_name.1.fq.gz
"""

    commands = []
    for samp in self['sample']:
        short_id = self['sam_info']['samp_brief'][samp]
        commands.append(
            "sh %s %s %s %s %s %s %s %s %s\n" % (
                script_path, tophat_exe, fastq_dir, samp, short_id,
                align_root, self['infile']['genome_file'],
                self['infile']['anno_file'], samtools_bin))

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file("".join(commands))
    job.running_multi(cpu=6)
def run_QC(self):
    """Run the Perl QC script on the raw reads of every sample.

    Creates the clean-data directory and one sub-directory per sample.
    Each work line calls ``bin/QC.pl`` (under the current working
    directory) through the configured Perl executable with the raw and
    clean directories, the sample name and ``--end`` set to ``1`` when the
    sample's ``data_type`` is ``"SE"`` and ``2`` otherwise.
    Started with ``running_multi(cpu=8)``.
    """
    project_root = os.path.abspath('./')
    raw_root = self['dir_name']['raw_data']
    clean_root = self['dir_name']['clean_data']

    if not os.path.isdir(clean_root):
        os.mkdir(clean_root)

    perl_exe = self['sftw_name'].pl
    qc_script = "%s/bin/QC.pl" % (project_root)
    script_path = "%s/scripts/QC.sh" % (project_root)
    work_path = "%s/scripts/QC_work.sh" % (project_root)

    script_text = """
pl_exe=$1
pl_QC=$2
in_dir=$3
out_dir=$4
samp=$5
data_type=$6

$pl_exe $pl_QC --indir $in_dir --outdir $out_dir --sample $samp --end $data_type
"""

    commands = []
    for samp in self['sample']:
        sample_dir = "%s/%s" % (clean_root, samp)
        if not os.path.isdir(sample_dir):
            os.mkdir(sample_dir)

        # 1 = single-end, 2 = everything else.
        end_count = 1 if self['sam_info']['data_type'][samp] == "SE" else 2

        # The leading space of each work line is preserved.
        commands.append(
            " sh %s %s %s %s %s %s %d\n" % (
                script_path, perl_exe, qc_script, raw_root, clean_root,
                samp, end_count))

    job = m_jobs.running_jobs(script_path, work_path)
    job.load_sh_file(script_text)
    job.load_sh_work_file("".join(commands))
    job.running_multi(cpu=8)
def run_cuffnorm_ERCC(self, stage):
    """Run cuffnorm separately for the TopHat and HISAT samples and merge.

    Args:
        stage: sequence with one entry per sample of ``self['samp']`` (same
            order). Entries equal to ``"Tophat"`` or ``"Hisat"`` select
            which cuffnorm call a sample belongs to; samples with any
            other value are left out of both calls.

    The generated script writes ``<cuffnorm_ercc>.Tophat`` and
    ``<cuffnorm_ercc>.Hisat``, then combines both ``genes.fpkm_table``
    files with ``self.mrg_py`` and reorders the columns with awk into
    ``<cuffnorm_ercc>/genes.fpkm_table``. Creates ``self.cuffnorm_ercc``
    when missing; started with ``running_multi(cpu=8)``.
    """
    sh_file = "%s/s08.1.cuffnorm.ERCC.sh" % (self.script_dir)
    sh_work_file = "%s/s08.1.cuffnorm.ERCC_work.sh" % (self.script_dir)

    if not os.path.isdir(self.cuffnorm_ercc):
        os.mkdir(self.cuffnorm_ercc)

    cflk_dir = self['sftw_name'].cflk_dir

    l_brief = []
    l_cxb = []
    for samp in self['samp']:
        brief_name = self['samp_info']['samp_brief'][samp]
        l_brief.append(brief_name)
        l_cxb.append(
            "%s/%s/abundances.cxb" % (self.cuffquant_ercc, brief_name))

    # dtype=str replaces the Python-2-only alias dtype="string": it is the
    # same byte-string dtype on Python 2 and a text dtype on Python 3, so
    # the comparisons against "Tophat"/"Hisat" below work on both.
    l_brief = np.array(l_brief, dtype=str)
    l_cxb = np.array(l_cxb, dtype=str)
    np_stage = np.array(stage, dtype=str)

    # Boolean masks selecting the samples of each aligner.
    is_tophat = np_stage == "Tophat"
    is_hisat = np_stage == "Hisat"

    sh_info = """
cflk_dir=$1

$cflk_dir/cuffnorm \\
    -p 8 -o %s.Tophat -L %s \\
    %s \\
    %s

$cflk_dir/cuffnorm \\
    -p 8 -o %s.Hisat -L %s \\
    %s \\
    %s

python %s %s.Tophat/genes.fpkm_table %s.Hisat/genes.fpkm_table | awk '{OFS="\\t";print $1,$2,$4,$3,$5}' >%s/genes.fpkm_table
""" % (
        self.cuffnorm_ercc,
        ",".join(l_brief[is_tophat]),
        self['infile']['anno_file_merge_ERCC'],
        " ".join(l_cxb[is_tophat]),
        self.cuffnorm_ercc,
        ",".join(l_brief[is_hisat]),
        self['infile']['anno_file_merge_ERCC'],
        " ".join(l_cxb[is_hisat]),
        self.mrg_py,
        self.cuffnorm_ercc,
        self.cuffnorm_ercc,
        self.cuffnorm_ercc,
    )

    sh_work = "sh %s %s" % (sh_file, cflk_dir)

    my_job = m_jobs.running_jobs(sh_file, sh_work_file)
    my_job.load_sh_file(sh_info)
    my_job.load_sh_work_file(sh_work)
    my_job.running_multi(cpu=8)
def run_HTSeq_unknown(self):
    """Build and launch the per-sample counting jobs for novel transcripts.

    For every sample in ``self['samp']`` the generated script name-sorts
    ``accepted_hits.genome.sort.bam`` under ``self.tophat``, converts it
    to SAM and runs ``deseq_exe`` (from ``self['sftw_name']``) against
    ``novo_lnc_raw.combined.gtf`` in ``self.data_dir``, writing
    ``<sample>.dexseq_NeoRaw.txt`` under ``self.HTS_u``.
    """
    sh_file = "%s/s07.HTSeq_unknown.sh" % (self.script_dir)
    sh_work_file = "%s/s07.HTSeq_unknown_work.sh" % (self.script_dir)

    py_exe = self['sftw_name'].py
    samtools_exe = self['sftw_name'].samtools
    deseq_exe = self['sftw_name'].deseq

    if not os.path.isdir(self.HTS_u):
        os.mkdir(self.HTS_u)

    sh_info = """
py_exe=$1
samtools_exe=$2
deseq_exe=$3
tophat_dir=$4
samp_name=$5
HTS_u_dir=$6
unknown_GTF=$7

$samtools_exe view -H $tophat_dir/$samp_name/accepted_hits.genome.sort.bam > $tophat_dir/$samp_name/accepted_hits.header.sam
$samtools_exe sort -n -m 200000000 $tophat_dir/$samp_name/accepted_hits.genome.sort.bam $tophat_dir/$samp_name/accepted_hits.genome.sort_name
$samtools_exe view -o $tophat_dir/$samp_name/accepted_hits.genome.sort_name.sam $tophat_dir/$samp_name/accepted_hits.genome.sort_name.bam
[ ! -d $HTS_u_dir/$samp_name ] && mkdir -p $HTS_u_dir/$samp_name
$py_exe $deseq_exe \\
    -s no -f sam -a 10 \\
    $tophat_dir/$samp_name/accepted_hits.genome.sort_name.sam $unknown_GTF >$HTS_u_dir/$samp_name/$samp_name.dexseq_NeoRaw.txt
    """

    # The annotation is the same for every sample, so build it once.
    unknown_GTF = "%s/novo_lnc_raw.combined.gtf" % (self.data_dir)

    work_lines = []
    for samp in self['samp']:
        samp_name = self['samp_info']['samp_brief'][samp]
        work_lines.append("sh %s %s %s %s %s %s %s %s\n" % (
            sh_file, py_exe, samtools_exe, deseq_exe,
            self.tophat, samp_name, self.HTS_u, unknown_GTF))
    sh_work = "".join(work_lines)

    my_job = m_jobs.running_jobs(sh_file, sh_work_file)
    my_job.load_sh_file(sh_info)
    my_job.load_sh_work_file(sh_work)
    my_job.running_multi(cpu=8)
def run_HTSeq_known(self):
    """Build the per-sample htseq-count jobs against the known annotation.

    For every sample in ``self['samp']`` the generated script name-sorts
    ``accepted_hits.bam``, counts reads against
    ``self['infile']['anno_file']``, splits the count table into a
    "clean" part and an ERCC/RGC part, and rebuilds a genome BAM from the
    reads htseq-count tagged ``__no_feature`` (chrM excluded).

    The template and work lines are only loaded into the ``m_jobs``
    object; no run method is invoked in this function.
    """
    sh_file = "%s/s04.HTSeq_known.sh" % (self.script_dir)
    sh_work_file = "%s/s04.HTSeq_known_work.sh" % (self.script_dir)

    py_deseq = "/data/Analysis/huboqiang/bin/htseq-count"

    sh_info = """
tophat_dir=$1
samp_name=$2
py_deseq=$3
HTS_k_dir=$4
known_GTF=$5

samtools view -H $tophat_dir/$samp_name/accepted_hits.bam > $tophat_dir/$samp_name/accepted_hits.header.sam
samtools sort -n -m 200000000 $tophat_dir/$samp_name/accepted_hits.bam $tophat_dir/$samp_name/accepted_hits.sort_name
samtools view -o $tophat_dir/$samp_name/accepted_hits.sort_name.sam $tophat_dir/$samp_name/accepted_hits.sort_name.bam
[ ! -d $HTS_k_dir/$samp_name ] && mkdir -p $HTS_k_dir/$samp_name
python $py_deseq \\
    -s no -f sam -a 10 \\
    -o $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam \\
    $tophat_dir/$samp_name/accepted_hits.sort_name.sam $known_GTF >$HTS_k_dir/$samp_name/$samp_name.dexseq.txt && \\
grep -v -P '^ERCC-|^RGC-|MIR|SNORD|Mir|Snord' $HTS_k_dir/$samp_name/$samp_name.dexseq.txt > $HTS_k_dir/$samp_name/$samp_name.dexseq_clean.txt && \\
grep -P '^ERCC-|^RGC-' $HTS_k_dir/$samp_name/$samp_name.dexseq.txt > $HTS_k_dir/$samp_name/$samp_name.dexseq_ERCC_RGCPloyA.txt && \\
grep "__no_feature" $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam | grep -v chrM | \\
    cat $tophat_dir/$samp_name/accepted_hits.header.sam /dev/stdin | \\
    samtools view -Sb /dev/stdin >$tophat_dir/$samp_name/accepted_hits.genome.bam && \\
samtools sort -m 200000000 $tophat_dir/$samp_name/accepted_hits.genome.bam $tophat_dir/$samp_name/accepted_hits.genome.sort
rm $tophat_dir/$samp_name/accepted_hits.sort_name.sam $tophat_dir/$samp_name/accepted_hits.sort_name.bam $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam $tophat_dir/$samp_name/accepted_hits.genome.bam
    """

    # Same annotation for every sample: look it up once.
    known_GTF = self['infile']['anno_file']

    work_lines = []
    for samp in self['samp']:
        samp_name = self['samp_info']['samp_brief'][samp]
        work_lines.append("sh %s %s %s %s %s %s\n" % (
            sh_file, self.tophat, samp_name,
            py_deseq, self.HTS_k, known_GTF))
    sh_work = "".join(work_lines)

    my_job = m_jobs.running_jobs(sh_file, sh_work_file)
    my_job.load_sh_file(sh_info)
    my_job.load_sh_work_file(sh_work)
def run_QC(self):
    """Set up output directories and run the QC script for every sample.

    One work line is emitted per entry of ``self['sample']``; each line
    calls ``QC.sh`` with the Perl interpreter (``self['sftw_name'].pl``),
    ``./bin/QC.pl``, the raw and clean data directories, the sample name
    and the ``--end`` value.
    """
    home_dir = os.path.abspath('./')
    dir_name = self['dir_name']
    raw_dir = dir_name['raw_data']
    cln_dir = dir_name['clean_data']

    pl_exe = self['sftw_name'].pl
    pl_QC = "%s/bin/QC.pl" % (home_dir)
    sh_file = "%s/scripts/QC.sh" % (home_dir)
    sh_work_file = "%s/scripts/QC_work.sh" % (home_dir)

    sh_info = """
pl_exe=$1
pl_QC=$2
in_dir=$3
out_dir=$4
samp=$5
data_type=$6

$pl_exe $pl_QC --indir $in_dir --outdir $out_dir --sample $samp --end $data_type
    """

    if not os.path.isdir(cln_dir):
        os.mkdir(cln_dir)

    pieces = []
    for samp in self['sample']:
        per_sample_dir = "%s/%s" % (cln_dir, samp)
        if not os.path.isdir(per_sample_dir):
            os.mkdir(per_sample_dir)

        # --end defaults to 2 and drops to 1 only for "SE" samples.
        data_type = 2
        if self['sam_info']['data_type'][samp] == "SE":
            data_type = 1

        pieces.append(" sh %s %s %s %s %s %s %d\n" % (
            sh_file, pl_exe, pl_QC, raw_dir, cln_dir, samp, data_type))
    sh_work = "".join(pieces)

    my_job = m_jobs.running_jobs(sh_file, sh_work_file)
    my_job.load_sh_file(sh_info)
    my_job.load_sh_work_file(sh_work)
    my_job.running_multi(cpu=8)
def run_HTSeq_unknown(self):
    """Count reads against the novel-transcript GTF, one job per sample.

    Executables (``py``, ``samtools``, ``deseq``) are read from
    ``self['sftw_name']``.  Each work line passes the tophat directory,
    the sample's brief name, the ``self.HTS_u`` output directory and
    ``<data_dir>/novo_lnc_raw.combined.gtf`` to the generated script.
    """
    script_dir = self.script_dir
    sh_file = "%s/s07.HTSeq_unknown.sh" % (script_dir)
    sh_work_file = "%s/s07.HTSeq_unknown_work.sh" % (script_dir)

    sftw = self['sftw_name']
    py_exe = sftw.py
    samtools_exe = sftw.samtools
    deseq_exe = sftw.deseq

    if not os.path.isdir(self.HTS_u):
        os.mkdir(self.HTS_u)

    sh_info = """
py_exe=$1
samtools_exe=$2
deseq_exe=$3
tophat_dir=$4
samp_name=$5
HTS_u_dir=$6
unknown_GTF=$7

$samtools_exe view -H $tophat_dir/$samp_name/accepted_hits.genome.sort.bam > $tophat_dir/$samp_name/accepted_hits.header.sam
$samtools_exe sort -n -m 200000000 $tophat_dir/$samp_name/accepted_hits.genome.sort.bam $tophat_dir/$samp_name/accepted_hits.genome.sort_name
$samtools_exe view -o $tophat_dir/$samp_name/accepted_hits.genome.sort_name.sam $tophat_dir/$samp_name/accepted_hits.genome.sort_name.bam
[ ! -d $HTS_u_dir/$samp_name ] && mkdir -p $HTS_u_dir/$samp_name
$py_exe $deseq_exe \\
    -s no -f sam -a 10 \\
    $tophat_dir/$samp_name/accepted_hits.genome.sort_name.sam $unknown_GTF >$HTS_u_dir/$samp_name/$samp_name.dexseq_NeoRaw.txt
    """

    # Constant across samples, so it is computed outside the loop.
    unknown_GTF = "%s/novo_lnc_raw.combined.gtf" % (self.data_dir)
    brief_of = self['samp_info']['samp_brief']

    sh_work = "".join(
        "sh %s %s %s %s %s %s %s %s\n" % (
            sh_file, py_exe, samtools_exe, deseq_exe,
            self.tophat, brief_of[samp], self.HTS_u, unknown_GTF)
        for samp in self['samp']
    )

    my_job = m_jobs.running_jobs(sh_file, sh_work_file)
    my_job.load_sh_file(sh_info)
    my_job.load_sh_work_file(sh_work)
    my_job.running_multi(cpu=8)
def makeGTF_withoutERCC(self):
    """Create the job that keeps only the ``chr`` records of the known GTF.

    The output name is the annotation path with its last three
    dot-separated components dropped and ``.sort.gtf`` appended; it is
    stored in ``self['infile']['anno_file_remove_ERCC']``.
    """
    sh_info = """
known_GTF=$1
remove_ERCC_GTF=$2

grep -P "^chr" $known_GTF >$remove_ERCC_GTF
    """

    script_dir = self.script_dir
    sh_file = "%s/RemoveERCC.sh" % (script_dir)
    sh_work_file = "%s/RemoveERCC_work.sh" % (script_dir)

    anno_path = self['infile']['anno_file']
    # Strip the trailing three ".xxx" pieces of the annotation file name.
    stem = ".".join(anno_path.split(".")[:-3])
    filtered_gtf = "%s.sort.gtf" % (stem)
    self['infile']['anno_file_remove_ERCC'] = filtered_gtf

    sh_work = "sh %s %s %s" % (sh_file, anno_path, filtered_gtf)

    job = m_jobs.running_jobs(sh_file, sh_work_file)
    job.load_sh_file(sh_info)
    job.load_sh_work_file(sh_work)
    job.running_multi(cpu=1)
def run_cuffcomp_novo_trans(self):
    """Build the cuffcompare job that merges per-sample novel transcripts.

    A single work line passes the output prefix
    (``<data_dir>/novo_lnc_raw``) followed by every sample's
    ``transcripts.gtf`` under ``self.cufflink_u``; the script shifts off
    the prefix and forwards the remaining arguments to cuffcompare.

    The template and work line are only loaded into the ``m_jobs``
    object; no run method is invoked in this function.
    """
    sh_file = "%s/s06.1.cuffcompare_novo.sh" % (self.script_dir)
    sh_work_file = "%s/s06.1.cuffcompare_novo_work.sh" % (self.script_dir)

    sh_info = """
out_prefix=$1
shift

/data/Analysis/huboqiang/software/cufflinks-2.2.1.Linux_x86_64/cuffcompare \\
    -o $out_prefix \\
    -T $@ \\
    """

    out_prefix = "%s/novo_lnc_raw" % (self.data_dir)
    brief_of = self['samp_info']['samp_brief']
    l_in_samp = [
        "%s/%s/transcripts.gtf" % (self.cufflink_u, brief_of[samp])
        for samp in self['samp']
    ]
    sh_work = "sh %s %s %s" % (sh_file, out_prefix, " ".join(l_in_samp))

    my_job = m_jobs.running_jobs(sh_file, sh_work_file)
    my_job.load_sh_file(sh_info)
    my_job.load_sh_work_file(sh_work)
def makeGTF_withoutERCC(self):
    """Write and run the script that drops non-``chr`` lines from the GTF.

    Reads ``self['infile']['anno_file']`` and registers the filtered
    file's path as ``self['infile']['anno_file_remove_ERCC']``.
    """
    infile = self['infile']

    sh_file = "%s/RemoveERCC.sh" % (self.script_dir)
    sh_work_file = "%s/RemoveERCC_work.sh" % (self.script_dir)

    source_gtf = infile['anno_file']
    # Output path: input minus its last three dot-delimited fields,
    # plus ".sort.gtf".
    name_fields = source_gtf.split(".")
    target_gtf = "%s.sort.gtf" % (".".join(name_fields[:-3]))
    infile['anno_file_remove_ERCC'] = target_gtf

    sh_info = """
known_GTF=$1
remove_ERCC_GTF=$2

grep -P "^chr" $known_GTF >$remove_ERCC_GTF
    """
    sh_work = "sh %s %s %s" % (sh_file, source_gtf, target_gtf)

    runner = m_jobs.running_jobs(sh_file, sh_work_file)
    runner.load_sh_file(sh_info)
    runner.load_sh_work_file(sh_work)
    runner.running_multi(cpu=1)
def SRA2fastq(self):
    """Build and launch the per-sample SRA -> FASTQ conversion jobs.

    For each entry of ``self['sample']`` the generated script runs the
    fastq-dump executable (``self['sftw_name'].fastqDump``) on
    ``<raw_dir>/<sample>.sra`` and then runs ``qual_cvt.py`` from
    ``self['bin_dir']`` on the first read file, writing
    ``<fq_dir>/<sample>/<sample>.1.fq.gz``.
    """
    home_dir = os.path.abspath('./')
    raw_dir = self['dir_name']['raw_data']
    fq_dir = self['dir_name']['fastq_data']
    # NOTE(review): only fq_dir itself is created here; the script writes
    # into fq_dir/<sample>/ -- confirm qual_cvt.py creates that directory.
    if not os.path.isdir(fq_dir):
        os.mkdir(fq_dir)

    fqDump = self['sftw_name'].fastqDump
    python_exe = self['sftw_name'].py
    fq_cvt_py = "%s/qual_cvt.py" % (self['bin_dir'])

    sh_file = "%s/scripts/s01.SRA2Fastq.sh" % (home_dir)
    sh_work_file = "%s/scripts/s01.SRA2Fastq_work.sh" % (home_dir)

    sh_info = """
samp_name=$1
fqDump=$2
raw_dir=$3
fq_dir=$4
fq_cvt_py=$5
python_exe=$6

$fqDump --split-files --gzip --outdir $raw_dir/${samp_name} $raw_dir/${samp_name}.sra

#mv $fq_dir/${samp_name}/${samp_name}_1.fastq.gz $fq_dir/${samp_name}/${samp_name}.1.fq.gz
$python_exe $fq_cvt_py $raw_dir/${samp_name}/${samp_name}_1.fastq.gz $fq_dir/${samp_name}/${samp_name}.1.fq.gz 59 64
    """

    sh_work = "".join(
        " sh %s %s %s %s %s %s %s\n" % (
            sh_file, samp_name, fqDump, raw_dir, fq_dir,
            fq_cvt_py, python_exe)
        for samp_name in self['sample']
    )

    my_job = m_jobs.running_jobs(sh_file, sh_work_file)
    my_job.load_sh_file(sh_info)
    my_job.load_sh_work_file(sh_work)
    my_job.running_multi(cpu=8)
def run_tophat_fusion(self):
    """Build and submit the per-sample tophat fusion-search jobs.

    For each entry of ``self['sample']`` the generated script converts
    ``unmapped.bam`` in the sample's tophat directory to a gzipped FASTQ
    and runs tophat with ``--fusion-search`` on it against
    ``self['infile']['genome_file']``, writing to the fusion directory.
    Jobs are submitted through ``running_SGE(vf="7g", maxjob=5)``.
    """
    home_dir = os.path.abspath('./')
    tophat_dir = self['dir_name']['tophat_dir']
    fusion_dir = self['dir_name']['tophat_fusion']
    script_dir = "%s/scripts" % (home_dir)

    sh_file = "%s/s03.tophat_fusion.sh" % (script_dir)
    sh_work_file = "%s/s03.tophat_fusion_work.sh" % (script_dir)

    sh_info = """
tophat_dir=$1
brief_name=$2
fusion_dir=$3
genome=$4

/data/Analysis/huboqiang/software/bedtools-2.17.0/bin/bedtools bamtofastq -i $tophat_dir/$brief_name/unmapped.bam -fq /dev/stdout | gzip - >$tophat_dir/$brief_name/unmapped.fq.gz

/data/Analysis/huboqiang/software/tophat-2.0.12.Linux_x86_64/tophat \\
    --fusion-search --keep-fasta-order --bowtie1 \\
    --no-coverage-search -p 8 \\
    -o $fusion_dir/$brief_name \\
    $genome \\
    $tophat_dir/$brief_name/unmapped.fq.gz
    """

    # Same reference for every sample: look it up once.
    genome_file = self['infile']['genome_file']

    work_lines = []
    for samp in self['sample']:
        brief_name = self['sam_info']['samp_brief'][samp]
        work_lines.append("sh %s %s %s %s %s \n" % (
            sh_file, tophat_dir, brief_name, fusion_dir, genome_file))
    sh_work = "".join(work_lines)

    my_job = m_jobs.running_jobs(sh_file, sh_work_file)
    my_job.load_sh_file(sh_info)
    my_job.load_sh_work_file(sh_work)
    my_job.running_SGE(vf="7g", maxjob=5)
def run_HTSeq_unknown(self):
    """Build the per-sample htseq-count jobs for the novel-transcript GTF.

    Uses the ``samtools`` and ``python`` found on the job's PATH and the
    htseq-count script at a fixed path.  Each work line passes the tophat
    directory, the sample's brief name, the htseq-count path, the
    ``self.HTS_u`` output directory and
    ``<data_dir>/novo_lnc_raw.combined.gtf``.

    The template and work lines are only loaded into the ``m_jobs``
    object; no run method is invoked in this function.
    """
    sh_file = "%s/s07.HTSeq_unknown.sh" % (self.script_dir)
    sh_work_file = "%s/s07.HTSeq_unknown_work.sh" % (self.script_dir)

    py_deseq = "/data/Analysis/huboqiang/bin/htseq-count"

    if not os.path.isdir(self.HTS_u):
        os.mkdir(self.HTS_u)

    sh_info = """
tophat_dir=$1
samp_name=$2
py_deseq=$3
HTS_u_dir=$4
unknown_GTF=$5

samtools view -H $tophat_dir/$samp_name/accepted_hits.genome.sort.bam > $tophat_dir/$samp_name/accepted_hits.header.sam
samtools sort -n -m 200000000 $tophat_dir/$samp_name/accepted_hits.genome.sort.bam $tophat_dir/$samp_name/accepted_hits.genome.sort_name
samtools view -o $tophat_dir/$samp_name/accepted_hits.genome.sort_name.sam $tophat_dir/$samp_name/accepted_hits.genome.sort_name.bam
[ ! -d $HTS_u_dir/$samp_name ] && mkdir -p $HTS_u_dir/$samp_name
python $py_deseq \\
    -s no -f sam -a 10 \\
    $tophat_dir/$samp_name/accepted_hits.genome.sort_name.sam $unknown_GTF >$HTS_u_dir/$samp_name/$samp_name.dexseq_NeoRaw.txt
    """

    # Same annotation for every sample, so build the path once.
    unknown_GTF = "%s/novo_lnc_raw.combined.gtf" % (self.data_dir)

    work_lines = []
    for samp in self['samp']:
        samp_name = self['samp_info']['samp_brief'][samp]
        work_lines.append("sh %s %s %s %s %s %s\n" % (
            sh_file, self.tophat, samp_name,
            py_deseq, self.HTS_u, unknown_GTF))
    sh_work = "".join(work_lines)

    my_job = m_jobs.running_jobs(sh_file, sh_work_file)
    my_job.load_sh_file(sh_info)
    my_job.load_sh_work_file(sh_work)
def run_tophat(self):
    """Build the per-sample tophat alignment jobs.

    For each entry of ``self['sample']`` the work line passes the
    trimmed-data directory, the sample's brief name, the tophat output
    directory, ``self['infile']['genome_file']`` and
    ``self['infile']['anno_file']`` to the generated script, which aligns
    ``TRIMED_<brief>.1/.2.clean.fq.gz``.

    The template and work lines are only loaded into the ``m_jobs``
    object; no run method is invoked in this function.
    """
    home_dir = os.path.abspath('./')
    trim_dir = self['dir_name']['trim_data']
    tophat_dir = self['dir_name']['tophat_dir']
    script_dir = "%s/scripts" % (home_dir)

    sh_file = "%s/s02.tophat.sh" % (script_dir)
    sh_work_file = "%s/s02.tophat_work.sh" % (script_dir)

    sh_info = """
trim_dir=$1
brief_name=$2
tophat_dir=$3
genome=$4
gtf_file=$5

/data/Analysis/huboqiang/software/tophat-2.0.12.Linux_x86_64/tophat \\
    -a 6 --microexon-search -m 2 \\
    -p 8 -G $gtf_file \\
    --library-type fr-unstranded \\
    --transcriptome-index /datc/huboqiang/cir_dyj_V2/Database/refseqGene.ERCC_RGCPloyA.exon.sort \\
    -o $tophat_dir/$brief_name \\
    $genome \\
    $trim_dir/TRIMED_${brief_name}.1.clean.fq.gz \\
    $trim_dir/TRIMED_${brief_name}.2.clean.fq.gz
    """

    # Reference inputs are identical for every sample: look them up once.
    genome_file = self['infile']['genome_file']
    anno_file = self['infile']['anno_file']

    work_lines = []
    for samp in self['sample']:
        brief_name = self['sam_info']['samp_brief'][samp]
        work_lines.append("sh %s %s %s %s %s %s\n" % (
            sh_file, trim_dir, brief_name, tophat_dir,
            genome_file, anno_file))
    sh_work = "".join(work_lines)

    my_job = m_jobs.running_jobs(sh_file, sh_work_file)
    my_job.load_sh_file(sh_info)
    my_job.load_sh_work_file(sh_work)
def __get_HTS_clean_split(self):
    """Split the merged clean gene table into three files by ID pattern.

    The generated script reads ``merge.dexseq_clean.gene.xls`` under
    ``self.HTS`` and writes three tables next to it: rows matching
    neither ``^NONHSAG`` nor ``XLOC_`` (refseq), rows matching
    ``^NONHSAG`` (NONCODE) and rows matching ``^XLOC`` (NSMB).  The
    latter two start with the first line of the input file.
    """
    hts_dir = self.HTS
    sh_file = "%s/p.HTSeq_split.sh" % (self.script_dir)
    sh_work_file = "%s/p.HTSeq_split_work.sh" % (self.script_dir)

    merged_table = "%s/merge.dexseq_clean.gene.xls" % (hts_dir)
    refseq_table = "%s/merge.dexseq_clean_refseq.gene.xls" % (hts_dir)
    noncode_table = "%s/merge.dexseq_clean_NONCODE.gene.xls" % (hts_dir)
    nsmb_table = "%s/merge.dexseq_clean_NSMB.gene.xls" % (hts_dir)

    sh_info = """
infile=$1
out_Refseq=$2
out_NONCODE=$3
out_NSMB=$4

grep -v -P '^NONHSAG|XLOC_' $infile >$out_Refseq
head -n 1 $infile >$out_NONCODE && grep -P '^NONHSAG' $infile >>$out_NONCODE
head -n 1 $infile >$out_NSMB && grep -P '^XLOC' $infile >>$out_NSMB
    """

    script_args = (
        sh_file, merged_table, refseq_table, noncode_table, nsmb_table)
    sh_work = "sh %s %s %s %s %s " % script_args

    job = m_jobs.running_jobs(sh_file, sh_work_file)
    job.load_sh_file(sh_info)
    job.load_sh_work_file(sh_work)
    job.running_multi(cpu=1)