def run_peakanno(project, peak_call, slurm=False, job_file=None):
    """ Will run the peak annotation on the peak called regions """
    root_dir = conf.get('root_dir', '')
    proj_dir = os.path.join(root_dir, project)
    samples = map(str,
                  glob(os.path.join(proj_dir, "*", "*{}*".format(peak_call))))
    TSS_cmd = conf.get('anno_TSS', '')
    NDG_cmd = conf.get('anno_NDG', '')
    sbatch_template = (
        '#!/bin/bash -l\n'
        '#SBATCH -A b2012025\n'
        '#SBATCH -J {nm}_peakanno\n'
        '#SBATCH -p core -n 1 \n'
        '#SBATCH -t 3:00:00\n'
        '#SBATCH -o {peaks_dir}/{nm}_peakanno.stdout\n'
        '#SBATCH -e {peaks_dir}/{nm}_peakanno.stderr\n'
        '#SBATCH --mail-type=FAIL\n'
        '#SBATCH --mail-user=\'[email protected]\'\n\n')
    template_peakanno = (
        '\n## Running peak-annotations\n'
        'for bed in $(ls --color=never {peaks_dir}/*narrowPeak);do\n'
        'cut -f1-6 $bed > {peaks_dir}/{nm}_annotate \n'
        '' + TSS_cmd + '\n'
        '' + NDG_cmd + '\n'
        'python ' + col_match.__file__ + ' {anno_dir}/{nm}_annotate.tss {anno_dir}/{nm}_annotate.ndg {anno_dir}/{nm}_merged "merge"\n'
        'rm {peaks_dir}/{nm}_annotate\n'
        'done\n')
    for sam in samples:
        sam_dir = os.path.split(sam)[0]
        nm = os.path.basename(sam_dir)
        annotate_dir = os.path.join(sam_dir, "peakannotate")
        if not os.path.exists(annotate_dir):
            os.makedirs(annotate_dir)
        if job_file:
            # append the annotation block to an existing job script
            with open(job_file, 'a') as jb_fl:
                jb_fl.write(
                    template_peakanno.format(peaks_dir=sam,
                                             nm=nm,
                                             anno_dir=annotate_dir))
            return
        if slurm:
            # write and submit a standalone sbatch job per peak directory
            job_file = os.path.join(sam_dir, "scripts",
                                    "{}_peakannotate.sh".format(nm))
            template_anno = sbatch_template + template_peakanno
            with open(job_file, 'w') as jb_fl:
                jb_fl.write(
                    template_anno.format(peaks_dir=sam,
                                         nm=nm,
                                         anno_dir=annotate_dir))
            subprocess.check_call(['sbatch', job_file])
            job_file = None

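# The TSS/nearest-downstream-gene annotation commands are read from the
# config ('anno_TSS' / 'anno_NDG') and spliced into the job script above, so
# they may use the same {peaks_dir}, {nm} and {anno_dir} placeholders.  A
# purely illustrative sketch follows: the script names are hypothetical, only
# the output file names are fixed, since col_match reads
# {anno_dir}/{nm}_annotate.tss and {anno_dir}/{nm}_annotate.ndg.
_EXAMPLE_PEAKANNO_CONF = {
    'anno_TSS': ('python annotate_tss.py {peaks_dir}/{nm}_annotate '
                 '> {anno_dir}/{nm}_annotate.tss'),
    'anno_NDG': ('python annotate_ndg.py {peaks_dir}/{nm}_annotate '
                 '> {anno_dir}/{nm}_annotate.ndg'),
}
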
def bamcov(project, genefile, input_file, mode):
    """Will run the postqc"""
    root_dir = conf.get('root_dir', '')
    proj_dir = os.path.join(root_dir, project)
    if mode == "scale":
        assign_mode = conf.get('computematrix_scale', '')
    else:
        assign_mode = conf.get('computematrix_TSS', '')
    sbatch_template = ('#!/bin/bash -l\n'
                       '#SBATCH -A b2012025\n'
                       '#SBATCH -J {name}_postqc\n'
                       '#SBATCH -p core -n 3 \n'
                       '#SBATCH -t 4:00:00\n'
                       '#SBATCH -e ' + proj_dir + '/{sample}/scripts/{name}_postqc.stderr\n'
                       '#SBATCH -o ' + proj_dir + '/{sample}/scripts/{name}_postqc.stdout\n'
                       '#SBATCH --mail-type=FAIL\n'
                       '#SBATCH --mail-user=\'[email protected]\'\n\n'
                       'module load bioinfo-tools\n'
                       'module load deepTools/2.2.3\n'
                       #'module load ngsplot/2.61\n\n'
                       )
    template = ('bamCompare -b1 {treatment} -b2 {control} --binSize 25 --ratio log2 --scaleFactorsMethod "readCount" -o {postqc_dir}/{treat}_Vs_{ctrl}_log2ratio_readcount.bw --normalizeUsingRPKM\n'
                '' + assign_mode + '\n'
                'plotHeatmap -m {postqc_dir}/matrix.mat.gz -out {postqc_dir}/{treat}_Vs_{ctrl}_heatmap_v2.png --heatmapHeight 25 --heatmapWidth 3 --whatToShow \'heatmap and colorbar\' --sortUsing max\n')
    bed_file = genefile
    pk_file = open(input_file, 'r')
    pk_file.next()
    for ln in iter(pk_file):
        ln = ln.strip()
        ln = ln.split('\t')
        treat = ln[0]
        ctrl = ln[1]
        postqc_dir = os.path.join(proj_dir, treat, "deepTools")
        if not os.path.exists(postqc_dir):
            os.mkdir(postqc_dir)
        treat_fl = glob("{}/{}/alignment_*/bam_files/{}*sorted_rmdup_v1.bam".format(
            proj_dir, treat, treat))
        control_fl = glob("{}/{}/alignment_*/bam_files/{}*sorted_rmdup_v1.bam".format(
            proj_dir, ctrl, ctrl))
        # one job script per treatment/control BAM combination
        for sam in treat_fl:
            suf_s = os.path.basename(sam)
            suf_s = suf_s.replace("_sorted_rmdup_v1.bam", "")
            for con in control_fl:
                con_c = os.path.basename(con)
                con_c = con_c.replace("_sorted_rmdup_v1.bam", "")
                name = "{}_Vs_{}".format(suf_s, con_c)
                job_file = os.path.join(
                    proj_dir, treat, "{}/{}_{}.sh".format("scripts", name, "postqc"))
                template_pc = sbatch_template + template
                with open(job_file, 'w') as jb_fl:
                    jb_fl.write(
                        template_pc.format(sample=treat,
                                           treat=suf_s,
                                           ctrl=con_c,
                                           name=name,
                                           treatment=sam,
                                           control=con,
                                           bed_file=bed_file,
                                           postqc_dir=postqc_dir))

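# The computeMatrix step between bamCompare and plotHeatmap comes from the
# config ('computematrix_scale' / 'computematrix_TSS').  The deepTools
# sub-commands below exist, but the exact options are deployment-specific, so
# treat this as an illustrative sketch; the placeholders are the ones filled
# in by template_pc.format() in bamcov() above.
_EXAMPLE_COMPUTEMATRIX_CONF = {
    'computematrix_scale': (
        'computeMatrix scale-regions '
        '-S {postqc_dir}/{treat}_Vs_{ctrl}_log2ratio_readcount.bw '
        '-R {bed_file} -o {postqc_dir}/matrix.mat.gz'),
    'computematrix_TSS': (
        'computeMatrix reference-point --referencePoint TSS '
        '-S {postqc_dir}/{treat}_Vs_{ctrl}_log2ratio_readcount.bw '
        '-R {bed_file} -o {postqc_dir}/matrix.mat.gz'),
}
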
def run_qc(project):
    """Will run the QC"""
    root_dir = conf.get('root_dir', '')
    proj_dir = os.path.join(root_dir, project)
    fastqc_sbatch_template = ('#!/bin/bash -l\n'
                              '#SBATCH -A b2012025\n'
                              '#SBATCH -J {sam}_fastqc\n'
                              '#SBATCH -p core -n 2 \n'
                              '#SBATCH -t 2:00:00\n'
                              '#SBATCH -e {sam_dir}/scripts/{sam}_fastqc.stderr\n'
                              '#SBATCH -o {sam_dir}/scripts/{sam}_fastqc.stdout\n'
                              '#SBATCH --mail-type=FAIL\n'
                              '#SBATCH --mail-user=\'[email protected]\'\n\n'
                              'module load bioinfo-tools\n'
                              'module load FastQC/0.11.5\n'
                              'cd ' + proj_dir + '\n'
                              'fastqc -o {fastqc_dir} -f fastq {fq_files}\n')
    samples = find_samples(proj_dir, file_type="fastq")
    for sam in samples.keys():
        fq_fls = samples[sam]
        sam_dir = os.path.join(proj_dir, sam)
        src_dir = os.path.join(sam_dir, 'scripts')
        if not os.path.exists(src_dir):
            os.mkdir(src_dir)
        fastqc_dir = os.path.join(sam_dir, 'fastqc')
        if not os.path.exists(fastqc_dir):
            os.mkdir(fastqc_dir)
        job_file = os.path.join(src_dir, "{}_fastqc.sh".format(sam))
        with open(job_file, 'w') as jb_fl:
            jb_fl.write(
                fastqc_sbatch_template.format(sam=sam,
                                              sam_dir=sam_dir,
                                              fastqc_dir=fastqc_dir,
                                              fq_files=" ".join(fq_fls)))
        subprocess.check_call(['sbatch', job_file])

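# find_samples() is defined elsewhere in this package; from its use above it
# is expected to return a mapping of sample name -> list of FASTQ paths under
# the sample's Rawdata directory, e.g. (paths illustrative only):
#
#     {'sampleA': ['<proj_dir>/sampleA/Rawdata/sampleA_L001_R1.fastq'],
#      'sampleB': ['<proj_dir>/sampleB/Rawdata/sampleB_L001_R1.fastq',
#                  '<proj_dir>/sampleB/Rawdata/sampleB_L002_R1.fastq']}
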
def run_denovo(project, peak_call, slurm=False, job_file=None):
    """ Will run the de-novo motif analysis """
    root_dir = conf.get('root_dir', '')
    proj_dir = os.path.join(root_dir, project)
    sample = map(str,
                 glob(os.path.join(proj_dir, "*", "*{}*".format(peak_call), "*xls")))
    motif_r = os.path.join(os.path.dirname(utils.__file__), "motifanalysis.r")
    sbatch_template = ('#!/bin/bash -l\n'
                       '#SBATCH -A b2012025\n'
                       '#SBATCH -J {nm}_motifanalysis\n'
                       '#SBATCH -p core -n 1 \n'
                       '#SBATCH -t 3:00:00\n'
                       '#SBATCH --mail-type=FAIL\n'
                       '#SBATCH --mail-user=\'[email protected]\'\n\n')
    template_denovo = ('\n## Running de-novo motif analysis\n'
                       'module load bioinfo-tools\n'
                       'module load MEMEsuite/4.11.1\n'
                       'Rscript ' + motif_r + ' {ip_fl} {op_dir} {op_fl}\n')
    for xls in sample:
        nm = os.path.basename(xls).replace(".xls", "")
        op_dir = os.path.join(os.path.dirname(os.path.dirname(xls)), "motif")
        if not os.path.exists(op_dir):
            os.mkdir(op_dir)
        op_fl = nm + "_seq.fa"
        job_file = os.path.join(os.path.dirname(os.path.dirname(xls)),
                                "scripts", "{}_denovo.sh".format(nm))
        template = sbatch_template + template_denovo
        with open(job_file, 'w') as jb_fl:
            jb_fl.write(template.format(ip_fl=xls, op_dir=op_dir, op_fl=op_fl, nm=nm))

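# Illustrative usage: after peak calling has produced
# <sample>/<peak_call>_*/<name>_peaks.xls files, this writes one
# <sample>/scripts/<name>_denovo.sh per xls (the scripts are written but not
# submitted automatically).  The project name below is a placeholder:
#
#     run_denovo('J.Doe_16_01', 'macs2')
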
def run_htcuff(project, aligner, sample=None):
    """Will run cufflinks and htseq-count"""
    root_dir = conf.get('root_dir', '')
    proj_dir = os.path.join(root_dir, project)
    align_template = (
        '#!/bin/bash -l\n'
        '#SBATCH -A b2012025\n'
        '#SBATCH -J {sam}_htcuff\n'
        '#SBATCH -p core -n 1 \n'
        '#SBATCH -t 10:00:00\n'
        '#SBATCH -e {sam_dir}/scripts/{sam}_htcuff.stderr\n'
        '#SBATCH -o {sam_dir}/scripts/{sam}_htcuff.stdout\n'
        '#SBATCH --mail-type=FAIL\n'
        '#SBATCH --mail-user=\'[email protected]\'\n\n'
        'module load bioinfo-tools\n'
        'module load samtools/1.3\n'
        'module load cufflinks/2.2.1\n'
        'module load htseq/0.6.1\n\n'
        'genome_fl=\"/pica/data/uppnex/igenomes/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf\"\n'
        'for bam in $(ls --color=never {align_dir}/*_sorted.bam);do\n'
        'nm=$(basename ${{bam}})\n'
        'nm=${{nm/.bam/}}\n'
        'htseq-count -s reverse -q -f bam ${{bam}} ${{genome_fl}} > {ht_dir}/${{nm}}_counts.txt\n'
        'cufflinks -p 8 --library-type fr-firststrand -G ${{genome_fl}} -o {cuff_dir}/${{nm}}_cufflinks ${{bam}}\n'
        'done\n\n')
    if sample:
        if os.path.isdir(os.path.join(proj_dir, sample)):
            samples = [sample]
        else:
            raise SystemExit(
                "Given sample {} is not found in project directory {}".format(
                    sample, proj_dir))
    else:
        samples = find_samples(proj_dir)
    for sam in samples:
        sam_dir = os.path.join(proj_dir, sam)
        src_dir = os.path.join(sam_dir, 'scripts')
        align_dir = os.path.join(sam_dir, "alignment_{}".format(aligner),
                                 "bam_files")
        ht_dir = os.path.join(sam_dir, 'htseq')
        if not os.path.exists(ht_dir):
            os.mkdir(ht_dir)
        cuff_dir = os.path.join(sam_dir, 'cufflinks')
        if not os.path.exists(cuff_dir):
            os.mkdir(cuff_dir)
        job_file = os.path.join(src_dir, "{}_{}.sh".format(sam, "htcuff"))
        with open(job_file, 'w') as jb_fl:
            jb_fl.write(
                align_template.format(sam=sam,
                                      sam_dir=sam_dir,
                                      align_dir=align_dir,
                                      ht_dir=ht_dir,
                                      cuff_dir=cuff_dir))
        subprocess.check_call(['sbatch', job_file])

def run_b2b(project, aligner, sample=None, slurm=False, job_file=None):
    """ Will run the bam to bed file conversion """
    root_dir = conf.get('root_dir', '')
    proj_dir = os.path.join(root_dir, project)
    if sample:
        if os.path.isdir(os.path.join(proj_dir, sample)):
            samples = [sample]
        else:
            raise SystemExit(
                "Given sample {} is not found in project directory {}".format(
                    sample, proj_dir))
    else:
        samples = find_samples(proj_dir)
    for sam in samples:
        sbatch_template = ('#!/bin/bash -l\n'
                           '#SBATCH -A b2012025\n'
                           '#SBATCH -J {sam}_bam2bed\n'
                           '#SBATCH -p core -n 1 \n'
                           '#SBATCH -t 3:00:00\n'
                           '#SBATCH --mail-type=FAIL\n'
                           '#SBATCH --mail-user=\'[email protected]\'\n\n')
        template_b2b = (
            '## run bam to bed\n'
            'module load BEDTools/2.11.2\n'
            'for bam in $(ls --color=never {sam_dir}/alignment_{aligner}/bam_files/*sorted_rmdup.bam);do\n'
            'bed_fl=${{bam/.bam/.bed}}\n'
            'bed_fl=${{bed_fl/bam_files/bedfiles}}\n'
            'bed_uniq_fl=${{bed_fl/.bed/_uniq.bed}}\n'
            'bamToBed -i ${{bam}} > ${{bed_fl}}\n'
            'awk -F\\\\t -v \'OFS=\\t\' \'{{print $1,$2,$3,".",$5,$6}}\' ${{bed_fl}} | sort -u > ${{bed_uniq_fl}}\n'
            'rm ${{bed_fl}}\n'
            'done\n')
        sam_dir = os.path.join(proj_dir, sam)
        bed_dir = os.path.join(sam_dir, "alignment_{}".format(aligner),
                               "bedfiles")
        if not os.path.exists(bed_dir):
            #pdb.set_trace()
            os.mkdir(bed_dir)
        if job_file:
            with open(job_file, 'a') as jb_fl:
                jb_fl.write(template_b2b.format(sam_dir=sam_dir, aligner=aligner))
            return
        if slurm:
            job_file = os.path.join(sam_dir, "scripts",
                                    "{}_{}_bamTobed.sh".format(sam, aligner))
            template_b2b = sbatch_template + template_b2b
            with open(job_file, 'w') as jb_fl:
                jb_fl.write(
                    template_b2b.format(sam=sam, sam_dir=sam_dir, aligner=aligner))
            subprocess.check_call(['sbatch', job_file])
            job_file = None

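# run_b2b() is used in two ways (see also run_align() further down): with
# job_file= it appends the bam-to-bed block to an existing job script and
# returns, otherwise with slurm=True it writes and submits one standalone
# sbatch script per sample.  Illustrative calls (names are placeholders):
#
#     run_b2b('J.Doe_16_01', 'bowtie2', slurm=True)
#     run_b2b('J.Doe_16_01', 'bowtie2', sample='sampleA',
#             job_file='<proj_dir>/sampleA/scripts/sampleA_bowtie2.sh')
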
def run_align(project, aligner):
    """Will run the preferred-alignment"""
    root_dir = conf.get('root_dir', '')
    proj_dir = os.path.join(root_dir, project)
    if aligner == "bwa":
        align_module = 'module load bwa/0.7.12\n'
        align_index = '/pica/data/uppnex/igenomes/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/genome.fa'
        align_block = ('bwa aln {align_index} ${{run}} > {align_dir}/${{nm}}.sai\n'
                       'bwa samse {align_index} {align_dir}/${{nm}}.sai ${{run}} | samtools view -Sb - > {align_dir}/${{nm}}.bam\n'
                       'rm {align_dir}/${{nm}}.sai\n')
    elif aligner == "bowtie2":
        align_module = 'module load bowtie2/2.2.6\n'
        align_index = "/pica/data/uppnex/igenomes/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/genome"
        align_block = ('bowtie2 -t -p 8 -k2 --very-sensitive -x {align_index} -q ${{run}} -S {align_dir}/${{nm}}.sam 2> {align_dir}/${{nm}}_bowtie2.log\n'
                       'samtools view -bS -o {align_dir}/${{nm}}.bam {align_dir}/${{nm}}.sam\n'
                       'rm {align_dir}/${{nm}}.sam\n')
    else:
        raise SystemExit("Please mention the type of aligner - bwa or bowtie2")
    align_template = ('#!/bin/bash -l\n'
                      '#SBATCH -A b2012025\n'
                      '#SBATCH -J {sam}_align\n'
                      '#SBATCH -p core -n 2 \n'
                      '#SBATCH -t 10:00:00\n'
                      '#SBATCH --mail-type=FAIL\n'
                      '#SBATCH --mail-user=\'[email protected]\'\n\n'
                      'module load bioinfo-tools\n'
                      'module load samtools/1.3\n'
                      '' + align_module + ''
                      'cd ' + proj_dir + '\n'
                      'if [[ $(ls {sam}/Rawdata/*gz | wc -l) -gt 0 ]]; then gzip -d {sam}/Rawdata/*gz; fi\n'
                      'for run in {fq_files};do\n'
                      'nm=$(basename ${{run}})\n'
                      'nm=${{nm/.fastq/}}\n'
                      '' + align_block + ''
                      'done\n')
    samples = find_samples(proj_dir)
    for sam in samples.keys():
        fq_fls = samples[sam]
        sam_dir = os.path.join(proj_dir, sam)
        src_dir = os.path.join(sam_dir, 'scripts')
        if not os.path.exists(src_dir):
            os.mkdir(src_dir)
        align_dir = os.path.join(sam_dir, aligner)
        if not os.path.exists(align_dir):
            os.mkdir(align_dir)
        job_file = os.path.join(src_dir, "{}_{}.sh".format(sam, aligner))
        with open(job_file, 'w') as jb_fl:
            jb_fl.write(
                align_template.format(sam=sam,
                                      sam_dir=sam_dir,
                                      align_dir=align_dir,
                                      proj_dir=proj_dir,
                                      align_index=align_index,
                                      fq_files=" ".join(fq_fls)))

def run_dr(project, input_file):
    """ Will run the idr analysis to check for biological replicate consistency """
    root_dir = conf.get('root_dir', '')
    proj_dir = os.path.join(root_dir, project)
    # NOTE: the blacklist BED and genome table are taken from the config here;
    # the key names are placeholders, the defaults are the file names used in
    # the original command lines.
    blacklist_bed = conf.get('mm10_blacklist', 'mm10_blacklisted-regions.bed')
    genome_table = conf.get('mm10_genome_table', 'mm10.genome')
    sbatch_template = (
        '#!/bin/bash -l\n'
        '#SBATCH -A b2012025\n'
        '#SBATCH -J {name}_idr\n'
        '#SBATCH -p core -n 1 \n'
        '#SBATCH -t 5:00:00\n'
        '#SBATCH -o ' + proj_dir + '/{rep1}_Vs_{rep2}/scripts/{name}_idr.stdout\n'
        '#SBATCH -e ' + proj_dir + '/{rep1}_Vs_{rep2}/scripts/{name}_idr.stderr\n'
        '#SBATCH --mail-type=FAIL\n'
        '#SBATCH --mail-user=\'[email protected]\'\n\n'
        'module load bioinfo-tools\n'
        'sort -k 8,8nr {rep1_dir}/*.narrowPeak > {rep1_dir}/tmp.regionPeak\n'
        'intersectBed -a {rep1_dir}/tmp.regionPeak -b {blacklist_bed} -v > {rep1_dir}/cleanedpeaks.regionPeak\n'
        'sort -k 8,8nr {rep2_dir}/*.narrowPeak > {rep2_dir}/tmp.regionPeak\n'
        'intersectBed -a {rep2_dir}/tmp.regionPeak -b {blacklist_bed} -v > {rep2_dir}/cleanedpeaks.regionPeak\n'
        '# IDR consistency analysis on the blacklist-cleaned peaks of the two replicates\n'
        'Rscript batch-consistency-analysis.r {rep1_dir}/cleanedpeaks.regionPeak {rep2_dir}/cleanedpeaks.regionPeak -1 {name}_idr_op 0 F p.value {genome_table}\n')
    pk_file = open(input_file, 'r')
    pk_file.next()
    for ln in iter(pk_file):
        ln = ln.strip()
        ln = ln.split('\t')
        rep1 = ln[0]
        rep2 = ln[1]
        name = "{}_Vs_{}".format(rep1, rep2)
        rep1_dir = ''.join(glob("{}/{}/macs2_*".format(proj_dir, rep1)))
        rep2_dir = ''.join(glob("{}/{}/macs2_*".format(proj_dir, rep2)))
        scripts_dir = os.path.join(proj_dir, name, "scripts")
        if not os.path.exists(scripts_dir):
            os.makedirs(scripts_dir)
        job_fl = os.path.join(scripts_dir, "{}_idr.sh".format(name))
        with open(job_fl, 'w') as jb_fl:
            jb_fl.write(
                sbatch_template.format(name=name,
                                       rep1=rep1,
                                       rep2=rep2,
                                       rep1_dir=rep1_dir,
                                       rep2_dir=rep2_dir,
                                       blacklist_bed=blacklist_bed,
                                       genome_table=genome_table))
        subprocess.check_call(['sbatch', job_fl])

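# The replicate sheet read by run_dr() is a plain-text file with a header
# line (skipped above) and one tab-separated pair of sample names per line;
# macs2_* peak directories must already exist for both replicates.
# Example (sample names illustrative):
#
#     rep1    rep2
#     KO_H3K4me3_1    KO_H3K4me3_2
#     WT_H3K4me3_1    WT_H3K4me3_2
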
def run_peakcall(project, input_file, mode, peak_call, peakannotate):
    """ Will run the preferred peak-calling software """
    root_dir = conf.get('root_dir', '')
    proj_dir = os.path.join(root_dir, project)
    load_module = 'module load MACS/2.1.0\n'
    sbatch_template = (
        '#!/bin/bash -l\n'
        '#SBATCH -A b2012025\n'
        '#SBATCH -J {name}_peakcall\n'
        '#SBATCH -p core -n 1 \n'
        '#SBATCH -t 5:00:00\n'
        '#SBATCH -o ' + proj_dir + '/{treat}/scripts/{name}_peakcall.stdout\n'
        '#SBATCH -e ' + proj_dir + '/{treat}/scripts/{name}_peakcall.stderr\n'
        '#SBATCH --mail-type=FAIL\n'
        '#SBATCH --mail-user=\'[email protected]\'\n\n'
        'module load bioinfo-tools\n')
    if mode == "TF":
        if peak_call == "macs2":
            macs2_cmd = conf.get('macs2_TF', '')
            template = ('## Running Peak-calling for TF-ChIP data\n'
                        '' + load_module + ''
                        '' + macs2_cmd + '')
        else:
            raise SystemExit("Please mention the type of peak caller - macs2")
    elif mode == "HM":
        if peak_call == "macs2":
            macs2_cmd = conf.get('macs2_HM', '')
            template = ('# Running macs2 peak-calling for HM data\n'
                        '' + load_module + ''
                        '' + macs2_cmd + '')
        elif peak_call == "danpos2":
            # danpos2 is run from a local checkout, so the job changes into it first
            danpos_path = "cd /home/ashwini/softwares/danpos-2.2.2"
            danpos_cmd = conf.get('danpos2_dpeak', '')
            template = ('# Running danpos2 peakcalling for HM data\n'
                        '' + danpos_path + '\n'
                        '' + danpos_cmd + '')
        else:
            raise SystemExit(
                "Please mention the type of peak caller (macs2/danpos2)")
    else:
        raise SystemExit("Please mention the type of mode - either TF or HM")
    pk_file = open(input_file, 'r')
    pk_file.next()
    for ln in iter(pk_file):
        ln = ln.strip()
        ln = ln.split('\t')
        treat = ln[0]
        ctrl = ln[1]
        treat_fl = glob("{}/{}/alignment_*/bedfiles/{}*rmdup_uniq.bed".format(
            proj_dir, treat, treat))
        control_fl = glob("{}/{}/alignment_*/bedfiles/{}*rmdup_uniq.bed".format(
            proj_dir, ctrl, ctrl))
        peaks_dir = os.path.join(proj_dir, treat,
                                 "{}_{}".format(peak_call, mode))
        if not os.path.exists(peaks_dir):
            os.makedirs(peaks_dir)
        # one job per treatment/control BED combination
        for sam in treat_fl:
            suf_s = os.path.basename(sam)
            suf_s = suf_s.replace("_sorted_rmdup_uniq.bed", "")
            for con in control_fl:
                con_c = os.path.basename(con)
                con_c = con_c.replace("_sorted_rmdup_uniq.bed", "")
                name = "{}_Vs_{}".format(suf_s, con_c)
                job_fl = os.path.join(proj_dir, treat, "scripts",
                                      "{}_peakcall.sh".format(name))
                template_pc = sbatch_template + template
                with open(job_fl, 'w') as jb_fl:
                    jb_fl.write(
                        template_pc.format(name=name,
                                           treat=treat,
                                           treatment=sam,
                                           control=con,
                                           peaks_dir=peaks_dir))
                if peakannotate:
                    # append the annotation block to the same job script
                    run_peakanno(project=project,
                                 peak_call=peak_call,
                                 slurm=True,
                                 job_file=job_fl)
                subprocess.check_call(['sbatch', job_fl])

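# The sample sheet read by run_peakcall() (and by bamcov()) has a header line
# followed by one tab-separated treatment/control pair per line, e.g.
# (sample names illustrative):
#
#     treatment    control
#     KO_H3K4me3    KO_input
#
# The actual peak-calling command comes from the config; an illustrative
# sketch of a 'macs2_TF' entry using the placeholders filled in by
# template_pc.format() above (the options are deployment-specific):
_EXAMPLE_PEAKCALL_CONF = {
    'macs2_TF': ('macs2 callpeak -t {treatment} -c {control} -f BED -g mm '
                 '-n {name} --outdir {peaks_dir}\n'),
}
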
def run_align(project, aligner, genome, sample, bam_to_bed):
    """Will run the preferred-alignment"""
    root_dir = conf.get('root_dir', '')
    proj_dir = os.path.join(root_dir, project)
    bed_dir = ''
    if aligner == "bwa":
        align_module = 'module load bwa/0.7.12\n'
        align_index = conf['genome_index'][genome][aligner]
        align_block = (
            'bwa aln {align_index} ${{fq}} > {align_dir}/${{nam}}.sai\n'
            'bwa samse {align_index} {align_dir}/${{nam}}.sai ${{fq}} | samtools view -Sb - > {align_dir}/${{nam}}.bam\n'
            'rm {align_dir}/${{nam}}.sai\n')
    elif aligner == "bowtie2":
        align_module = 'module load bowtie2/2.2.6\n'
        align_index = conf['genome_index'][genome][aligner]
        align_block = (
            'bowtie2 -t -p 8 -k2 --very-sensitive -x {align_index} -q ${{fq}} -S {align_dir}/${{nam}}.sam 2> {align_dir}/${{nam}}_bowtie2.log\n\n'
            'samtools view -bS -o {align_dir}/${{nam}}.bam {align_dir}/${{nam}}.sam\n\n'
            'rm {align_dir}/${{nam}}.sam\n\n')
    elif aligner == "bowtie":
        align_module = 'module load bowtie/1.1.2\n'
        align_index = conf['genome_index'][genome][aligner]
        align_block = (
            'bowtie -q -m 1 -v 3 --best --strata {align_index} ${{fq}} -S {align_dir}/${{nam}}.sam 2>{align_dir}/${{nam}}_bowtie.log\n\n'
            'samtools view -bS -o {align_dir}/${{nam}}.bam {align_dir}/${{nam}}.sam\n\n')
    elif aligner == "STAR":
        align_module = 'module load star/2.3.1o\n'
        align_index = conf['genome_index'][genome][aligner]
        align_block = (
            "STAR --genomeDir {align_index} --readFilesIn ${{fq}} --outFilterIntronMotifs RemoveNoncanonical --outFileNamePrefix {align_dir}/${{nam}} --outSAMmode Full --runThreadN 8 --outFilterType BySJout --alignSJDBoverhangMin 1 --outFilterMismatchNmax 5\n\n")
    elif aligner == "tophat":
        align_module = ('module load tophat/2.0.12\n'
                        'module load bowtie2/2.2.6\n')
        align_index = conf['genome_index'][genome][aligner]
        align_gtf = "/pica/data/uppnex/igenomes/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf"
        align_block = (
            "tophat -o {align_dir}/${{nam}} -G " + align_gtf +
            " -p 8 --library-type fr-firststrand --solexa1.3-quals {align_index} ${{fq}} \n\n"
            "mv {align_dir}/${{nam}}/accepted_hits.bam {align_dir}/${{nam}}.bam\n")
    else:
        raise SystemExit(
            "Please mention the type of aligner - bwa/bowtie/bowtie2/STAR/tophat")
    align_template = (
        '#!/bin/bash -l\n'
        '#SBATCH -A b2012025\n'
        '#SBATCH -J {sam}_align\n'
        '#SBATCH -p core -n 4 \n'
        '#SBATCH -t 10:00:00\n'
        '#SBATCH -e {sam_dir}/scripts/{sam}_align.stderr\n'
        '#SBATCH -o {sam_dir}/scripts/{sam}_align.stdout\n'
        '#SBATCH --mail-type=FAIL\n'
        '#SBATCH --mail-user=\'[email protected]\'\n\n'
        'module load bioinfo-tools\n'
        '' + align_module + ''
        'module load samtools/0.1.19\n'
        'if [[ $(ls --color=never {sam_dir}/Rawdata/*.gz | wc -l) -gt 0 ]]; then gzip -d {sam_dir}/Rawdata/*.gz; fi\n'
        'if [[ $(ls --color=never {sam_dir}/Rawdata/*zip | wc -l) -gt 0 ]]; then unzip {sam_dir}/Rawdata/*zip; fi\n'
        'for fq in $(ls --color=never {sam_dir}/Rawdata/*.fastq);do\n'
        'nm=$(basename ${{fq}})\n'
        'nm=${{nm/_*/}}\n'
        'nam="{sam}_"${{nm}}\n\n'
        '' + align_block + ''
        #'samtools view -H {align_dir}/${{nam}}.bam | sed -e \'s/SN:\([0-9XY]\)/SN:chr\\1/\' -e \'s/SN:M/SN:chrM/\' | samtools reheader - {align_dir}/${{nam}}.bam > {align_dir}/${{nam}}_v1.bam\n\n'
        #'mv {align_dir}/${{nam}}_v1.bam {align_dir}/${{nam}}.bam\n\n'
        'samtools sort {align_dir}/${{nam}}.bam {align_dir}/${{nam}}_sorted\n\n'
        'java -jar /pica/sw/apps/bioinfo/picard/1.92/milou/MarkDuplicates.jar INPUT={align_dir}/${{nam}}_sorted.bam OUTPUT={align_dir}/${{nam}}_sorted_rmdup.bam METRICS_FILE={align_dir}/${{nam}}_picardmetrics.txt REMOVE_DUPLICATES=True\n\n'
        'samtools index {align_dir}/${{nam}}_sorted_rmdup.bam\n\n'
        'samtools index {align_dir}/${{nam}}_sorted.bam\n\n'
        '[ -e {align_dir}/${{nam}}_sorted.bam ] && rm {align_dir}/${{nam}}.bam\n\n'
        '[ -e {align_dir}/${{nam}}.sam ] && rm {align_dir}/${{nam}}.sam\n\n'
        'done\n')
    if sample:
        if os.path.isdir(os.path.join(proj_dir, sample)):
            samples = [sample]
        else:
            raise SystemExit(
                "Given sample {} is not found in project directory {}".format(
                    sample, proj_dir))
    else:
        samples = find_samples(proj_dir)
    for sam in samples:
        sam_dir = os.path.join(proj_dir, sam)
        src_dir = os.path.join(sam_dir, 'scripts')
        if not os.path.exists(src_dir):
            os.mkdir(src_dir)
        align_dir = os.path.join(sam_dir, "alignment_{}".format(aligner),
                                 "bam_files")
        if not os.path.exists(align_dir):
            os.makedirs(align_dir)
        job_file = os.path.join(src_dir, "{}_{}.sh".format(sam, aligner))
        with open(job_file, 'w') as jb_fl:
            jb_fl.write(
                align_template.format(sam=sam,
                                      sam_dir=sam_dir,
                                      align_dir=align_dir,
                                      align_index=align_index))
        if bam_to_bed:
            # append the bam-to-bed block to the same job script
            run_b2b(project=project,
                    aligner=aligner,
                    slurm=True,
                    sample=sam,
                    job_file=job_file)
        subprocess.check_call(['sbatch', job_file])

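# conf['genome_index'] is expected to map genome name -> aligner -> index
# path.  An illustrative sketch (the two paths are the ones hard-coded in the
# older run_align() above; the 'mm10' key name is an assumption):
_EXAMPLE_GENOME_INDEX = {
    'mm10': {
        'bwa': '/pica/data/uppnex/igenomes/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/genome.fa',
        'bowtie2': '/pica/data/uppnex/igenomes/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/genome',
    },
}
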