def trimmomatic_pe(fastq1, fastq2, crop_length, out_dir_R1, out_dir_R2,
                   nth=1, java_heap=None):
    """Crop a FASTQ pair to a fixed length by running Trimmomatic in PE mode.

    The command applies ``MINLEN:<crop_length>`` followed by
    ``CROP:<crop_length>``.

    Args:
        fastq1: Path of the R1 input FASTQ.
        fastq2: Path of the R2 input FASTQ.
        crop_length: Value used for both MINLEN and CROP; also embedded in
            the output file names.
        out_dir_R1: Directory for the cropped R1 output.
        out_dir_R2: Directory for the cropped R2 output.
        nth: Value passed to ``-threads``.
        java_heap: Suffix for ``-Xmx``; ``-Xmx6G`` is used when None.

    Returns:
        Tuple ``(cropped_R1, cropped_R2)`` with the output paths.
    """
    kept = []
    scratch = []
    for fastq, out_dir in ((fastq1, out_dir_R1), (fastq2, out_dir_R2)):
        stem = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq)))
        cropped = '{}.crop_{}bp.fastq.gz'.format(stem, crop_length)
        kept.append(cropped)
        # Second output slot of each mate in the PE command; removed below.
        scratch.append('{}.tmp'.format(cropped))

    heap_opt = '-Xmx6G' if java_heap is None else '-Xmx{}'.format(java_heap)

    command = (
        'java -XX:ParallelGCThreads=1 {heap} -jar {jar} PE -threads {nth} '
        '{in1} {in2} {out1} {tmp1} {out2} {tmp2} MINLEN:{length} CROP:{length}'
    ).format(
        heap=heap_opt,
        jar=locate_trimmomatic(),
        nth=nth,
        in1=fastq1,
        in2=fastq2,
        out1=kept[0],
        tmp1=scratch[0],
        out2=kept[1],
        tmp2=scratch[1],
        length=crop_length,
    )
    run_shell_cmd(command)
    rm_f(scratch)
    return kept[0], kept[1]
def trimmomatic_pe(fastq1, fastq2, crop_length, crop_length_tol,
                   out_dir_R1, out_dir_R2, nth=1, java_heap=None):
    """Crop a FASTQ pair with Trimmomatic (PE mode), allowing a tolerance.

    The command applies ``MINLEN:<crop_length - |crop_length_tol|>`` followed
    by ``CROP:<crop_length>``.

    Args:
        fastq1: Path of the R1 input FASTQ.
        fastq2: Path of the R2 input FASTQ.
        crop_length: Value passed to CROP.
        crop_length_tol: Tolerance; its absolute value is subtracted from
            ``crop_length`` to obtain the MINLEN value.
        out_dir_R1: Directory for the cropped R1 output.
        out_dir_R2: Directory for the cropped R2 output.
        nth: Value passed to ``-threads``.
        java_heap: Suffix for ``-Xmx``; ``-Xmx6G`` is used when None.

    Returns:
        Tuple ``(cropped_R1, cropped_R2)`` with the output paths.
    """
    stems = [
        os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq)))
        for fastq, out_dir in ((fastq1, out_dir_R1), (fastq2, out_dir_R2))
    ]

    tolerance = abs(crop_length_tol)
    shortest_kept = crop_length - tolerance

    kept = [
        '{}.crop_{}-{}bp.fastq.gz'.format(stem, crop_length, tolerance)
        for stem in stems
    ]
    # Second output slot of each mate in the PE command; removed afterwards.
    scratch = ['{}.tmp'.format(path) for path in kept]

    if java_heap is None:
        heap_opt = '-Xmx6G'
    else:
        heap_opt = '-Xmx{}'.format(java_heap)

    template = (
        'java -XX:ParallelGCThreads=1 {} -jar {} PE -threads {} '
        '{} {} {} {} {} {} '
        'MINLEN:{} CROP:{}'
    )
    command = template.format(
        heap_opt, locate_trimmomatic(), nth,
        fastq1, fastq2, kept[0], scratch[0], kept[1], scratch[1],
        shortest_kept, crop_length,
    )
    run_shell_cmd(command)
    rm_f(scratch)
    return kept[0], kept[1]
def trimmomatic_se(fastq1, crop_length, crop_length_tol, out_dir, nth=1,
                   java_heap=None):
    """Crop a single-end FASTQ with Trimmomatic (SE mode), with a tolerance.

    The command applies ``MINLEN:<crop_length - |crop_length_tol|>`` followed
    by ``CROP:<crop_length>``.

    Args:
        fastq1: Path of the input FASTQ.
        crop_length: Value passed to CROP.
        crop_length_tol: Tolerance; its absolute value is subtracted from
            ``crop_length`` to obtain the MINLEN value.
        out_dir: Directory for the cropped output.
        nth: Value passed to ``-threads``.
        java_heap: Suffix for ``-Xmx``; ``-Xmx6G`` is used when None.

    Returns:
        Path of the cropped FASTQ.
    """
    stem = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    tolerance = abs(crop_length_tol)
    shortest_kept = crop_length - tolerance
    output = '{}.crop_{}-{}bp.fastq.gz'.format(stem, crop_length, tolerance)

    heap_opt = '-Xmx6G' if java_heap is None else '-Xmx{}'.format(java_heap)

    template = (
        'java -XX:ParallelGCThreads=1 {} -jar {} SE -threads {} '
        '{} {} MINLEN:{} CROP:{}'
    )
    run_shell_cmd(template.format(
        heap_opt, locate_trimmomatic(), nth,
        fastq1, output, shortest_kept, crop_length,
    ))
    return output
def bowtie2_pe(fastq1, fastq2, ref_index_prefix, multimapping, nth, out_dir):
    """Align a FASTQ pair with bowtie2 and pipe the result into a sorted BAM.

    bowtie2's stderr is redirected to an alignment log, which is then
    ``cat``-ed through ``run_shell_cmd``.

    Args:
        fastq1: Path of the R1 FASTQ; its stripped basename names the outputs.
        fastq2: Path of the R2 FASTQ.
        ref_index_prefix: Value passed to bowtie2 ``-x``.
        multimapping: When truthy, ``-k <multimapping + 1>`` is added.
        nth: Value passed to ``--threads``.
        out_dir: Directory for the BAM and the log.

    Returns:
        Tuple ``(bam, align_log)``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    bam = prefix + '.bam'
    align_log = prefix + '.align.log'

    if multimapping:
        multi_opt = '-k {}'.format(multimapping + 1)
    else:
        multi_opt = ''

    pipeline = (
        'bowtie2 {multi} -X2000 --mm --threads {nth} -x {ref} '
        '-1 {r1} -2 {r2} 2>{log} | '
        'samtools view -Su /dev/stdin | '
        'samtools sort /dev/stdin -o {bam} -T {tmp}'
    ).format(
        multi=multi_opt,
        nth=nth,
        ref=ref_index_prefix,
        r1=fastq1,
        r2=fastq2,
        log=align_log,
        bam=bam,
        tmp=prefix,
    )
    run_shell_cmd(pipeline)

    # Echo the captured bowtie2 log through the shell helper.
    run_shell_cmd('cat {}'.format(align_log))
    return bam, align_log
def make_read_length_file(fastq, out_dir):
    """Write the read length reported by ``get_read_length`` to a text file.

    Args:
        fastq: Path of the FASTQ to inspect.
        out_dir: Directory that receives ``<basename>.read_length.txt``.

    Returns:
        Path of the written text file.
    """
    stem = os.path.basename(strip_ext_fastq(fastq))
    txt = os.path.join(out_dir, stem) + '.read_length.txt'
    length_text = str(get_read_length(fastq))
    with open(txt, 'w') as handle:
        handle.write(length_text)
    return txt
def bwa_aln(fastq, ref_index_prefix, nth, out_dir):
    """Run ``bwa aln -q 5 -l 32 -k 2`` on one FASTQ and return the .sai path.

    Args:
        fastq: Path of the input FASTQ.
        ref_index_prefix: BWA index prefix given to ``bwa aln``.
        nth: Value passed to ``-t``.
        out_dir: Directory that receives ``<basename>.sai``.

    Returns:
        Path of the .sai file that the command's stdout is redirected to.
    """
    stem = os.path.basename(strip_ext_fastq(fastq))
    sai = os.path.join(out_dir, stem) + '.sai'
    template = 'bwa aln -q 5 -l 32 -k 2 -t {} {} {} > {}'
    run_shell_cmd(template.format(nth, ref_index_prefix, fastq, sai))
    return sai
def create_job_file_pe(samplefile1, samplefile2, out_dir):
    """Write a bash job script (with #BSUB directives) for footprinting.

    The script loads modules, defines tool/reference shell variables, and
    runs rgt-hint footprinting, rgt-hint tracks, rgt-motifanalysis matching
    and annotationToNetwork.pl against files under ``<out_dir>/<sample>``.

    Args:
        samplefile1: R1 FASTQ path; its stripped basename is the sample name.
        samplefile2: R2 FASTQ path; only passed through strip_ext_fastq.
        out_dir: Directory under which ``<sample>`` paths are formed.

    Returns:
        Path of the written script, ``<out_dir>/<sample>.sh``.
    """
    sample = os.path.basename(strip_ext_fastq(samplefile1))
    # The R2 name is still passed through strip_ext_fastq, although nothing
    # below uses the result.
    os.path.basename(strip_ext_fastq(samplefile2))
    prefix = os.path.join(out_dir, sample)

    header = (
        '#!/bin/bash\n'
        '#BSUB -P QFATACseq\n'
        '#BSUB -J {job}_QFATACseq\n'
        '#BSUB -oo {base}/ATACseqJ1log.out\n'
        '#BSUB -eo {base}/ATACseqJ1log.err\n'
        '#BSUB -n 1\n'
        '#BSUB -N [email protected]\n'
    ).format(job=sample, base=prefix)

    # Section 3: Footprinting -- modules and environment required by the job.
    modules = ''.join([
        'module load R/3.6.3 \n',
        'module load conda3/5.1.0 \n',
        'source activate /research/rgs01/project_space/yu3grp/software_JY/yu3grp/conda_env/yulab_env_2.7 \n',
        'export RGTDATA=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/yulab_databases/rgtdata \n',
    ])

    # Shell variables the job body refers to ($hint, $motifanalysis, ...).
    tool_paths = ''.join([
        'hint=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/conda_env/yulab_env_2.7/bin/rgt-hint \n',
        'motifanalysis=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/conda_env/yulab_env_2.7/bin/rgt-motifanalysis \n',
        'genomeDir=/research/projects/yu3grp/scRNASeq/yu3grp/qpan/Database/References/mm10/Gencode/Bowtie2/mm10 \n',
        'blacklist=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/yulab_databases/ENCODE_blacklist/mm10-blacklist.v2.bed \n',
        'mm10=/research/rgs01/project_space/yu3grp/scRNASeq/yu3grp/qpan/Database/References/mm10/Gencode/GRCm38.primary_assembly.genome.fa.size \n',
    ])

    log.info("Foot Printing......")
    shifted_bam = (
        prefix
        + '/02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree'
        + '.sortedByName.shifted.sortedByPos.bam'
    )
    peaks = prefix + '/03_NucleosomeFree_peaks.narrowPeak'

    # Fallback noted by the author: if this script does not work, add
    # 'conda deactivate \n' to the body and drop the "$" in front of Rscript.
    # NOTE(review): no command below creates 04_footPrint_mpbs.annotate.txt,
    # which the perl step reads -- confirm where that file comes from.
    steps = (
        '$hint footprinting --atac-seq --paired-end --organism=mm10 '
        '--output-location={p} --output-prefix=04_footPrint {bam} {peaks}\n'
        '$hint tracks --bc --bigWig --organism=mm10 {bam} {peaks} '
        '--output-prefix={p}/footprintingTracks\n'
        '$motifanalysis matching --motif-dbs $RGTDATA/motifs/transfac_mouse '
        '--input-files {p}/04_footPrint.bed --output-location={p} --organism=mm10\n'
        'perl /research/rgs01/project_space/yu3grp/software_JY/yu3grp/git_repo/ATACseq_pipeline/scripts/annotationToNetwork.pl '
        '{p}/04_footPrint_mpbs.annotate.txt {p}/04_networkATACseq.txt\n'
    ).format(p=prefix, bam=shifted_bam, peaks=peaks)

    jobfile = prefix + ".sh"
    script_text = ''.join([header, modules, '\n', tool_paths, '\n', steps, '\n'])
    with open(jobfile, "w") as script:
        script.write(script_text)
    return jobfile
def trim_adapter_pe(fastq1, fastq2, adapter1, adapter2, adapter_for_all,
                    cutadapt_param, out_dir):
    """Trim adapters from a FASTQ pair with cutadapt, or copy the inputs.

    cutadapt runs only when both ``adapter1`` and ``adapter2`` are truthy;
    otherwise each FASTQ is passed to ``copy_f_to_dir``.

    Args:
        fastq1: Path of the R1 FASTQ.
        fastq2: Path of the R2 FASTQ.
        adapter1: Adapter for R1 (cutadapt ``-a``).
        adapter2: Adapter for R2 (cutadapt ``-A``).
        adapter_for_all: When truthy, replaces both adapters in the command.
        cutadapt_param: Extra text inserted right after ``cutadapt``.
        out_dir: Output directory.

    Returns:
        Two-element list of output paths ``[R1, R2]``.
    """
    if not (adapter1 and adapter2):
        copied1 = copy_f_to_dir(fastq1, out_dir)
        copied2 = copy_f_to_dir(fastq2, out_dir)
        return [copied1, copied2]

    outputs = [
        '{}.trim.fastq.gz'.format(
            os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq))))
        for fastq in (fastq1, fastq2)
    ]
    fwd_adapter = adapter_for_all or adapter1
    rev_adapter = adapter_for_all or adapter2

    command = 'cutadapt {param} -a {fwd} -A {rev} {r1} {r2} -o {o1} -p {o2}'.format(
        param=cutadapt_param,
        fwd=fwd_adapter,
        rev=rev_adapter,
        r1=fastq1,
        r2=fastq2,
        o1=outputs[0],
        o2=outputs[1],
    )
    run_shell_cmd(command)
    return outputs
def merge_fastqs(fastqs, end, out_dir):
    """Merge FASTQs into ``<out_dir>/<end>/<first basename>.merged.fastq.gz``.

    With more than one input the files are concatenated through
    ``zcat -f ... | gzip -nc``; with a single input the result of
    ``hard_link(fastqs[0], merged)`` is returned instead.

    Args:
        fastqs: List of FASTQ paths; the first one names the output.
        end: Sub-directory name appended to ``out_dir``.
        out_dir: Parent output directory.

    Returns:
        Path of the merged file, or whatever ``hard_link`` returns.
    """
    target_dir = os.path.join(out_dir, end)
    mkdir_p(target_dir)
    stem = os.path.basename(strip_ext_fastq(fastqs[0]))
    merged = os.path.join(target_dir, stem) + '.merged.fastq.gz'

    if len(fastqs) <= 1:
        return hard_link(fastqs[0], merged)

    inputs = ' '.join(fastqs)
    run_shell_cmd('zcat -f {src} | gzip -nc > {dst}'.format(src=inputs, dst=merged))
    return merged
def bowtie2_se(fastq, ref_index_prefix, multimapping, nth, out_dir):
    """Align a single-end FASTQ with bowtie2 and pipe it into a sorted BAM.

    Args:
        fastq: Path of the input FASTQ (bowtie2 ``-U``).
        ref_index_prefix: Value passed to bowtie2 ``-x``.
        multimapping: When truthy, ``-k <multimapping + 1>`` is added.
        nth: Value passed to ``--threads``.
        out_dir: Directory that receives ``<basename>.bam``.

    Returns:
        Path of the BAM written by ``samtools sort``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq)))
    bam = prefix + '.bam'

    if multimapping:
        multi_opt = '-k {}'.format(multimapping + 1)
    else:
        multi_opt = ''

    pipeline = (
        'bowtie2 {multi} --mm --threads {nth} -x {ref} -U {reads} '
        '| samtools view -Su /dev/stdin '
        '| samtools sort /dev/stdin -o {bam} -T {tmp}'
    ).format(
        multi=multi_opt,
        nth=nth,
        ref=ref_index_prefix,
        reads=fastq,
        bam=bam,
        tmp=prefix,
    )
    run_shell_cmd(pipeline)
    return bam
def trim_adapter_se(fastq, adapter, adapter_for_all, cutadapt_param, out_dir):
    """Trim an adapter from one FASTQ with cutadapt, or copy the input.

    cutadapt runs only when ``adapter`` is truthy; otherwise the FASTQ is
    passed to ``copy_f_to_dir``.

    Args:
        fastq: Path of the input FASTQ.
        adapter: Adapter sequence (cutadapt ``-a``).
        adapter_for_all: When truthy, used in place of ``adapter``.
        cutadapt_param: Extra text inserted right after ``cutadapt``.
        out_dir: Output directory.

    Returns:
        Path of the trimmed FASTQ, or the result of ``copy_f_to_dir``.
    """
    if not adapter:
        return copy_f_to_dir(fastq, out_dir)

    stem = os.path.basename(strip_ext_fastq(fastq))
    trimmed = os.path.join(out_dir, stem) + '.trim.fastq.gz'
    chosen_adapter = adapter_for_all or adapter

    command = 'cutadapt {param} -a {adapter} {reads} | gzip -nc > {out}'.format(
        param=cutadapt_param,
        adapter=chosen_adapter,
        reads=fastq,
        out=trimmed,
    )
    run_shell_cmd(command)
    return trimmed
def bwa_se(fastq, ref_index_prefix, nth, out_dir):
    """Align a single-end FASTQ with ``bwa aln`` + ``bwa samse`` into a BAM.

    The intermediate .sai file produced by ``bwa_aln`` is removed after the
    sorted BAM has been written.

    Args:
        fastq: Path of the input FASTQ.
        ref_index_prefix: BWA index prefix.
        nth: Thread count forwarded to ``bwa_aln``.
        out_dir: Directory that receives ``<basename>.bam``.

    Returns:
        Path of the BAM written by ``samtools sort``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq)))
    bam = prefix + '.bam'
    sai = bwa_aln(fastq, ref_index_prefix, nth, out_dir)

    pipeline = (
        'bwa samse {ref} {sai} {reads} | '
        'samtools view -Su - | samtools sort - -o {bam} -T {tmp}'
    ).format(ref=ref_index_prefix, sai=sai, reads=fastq, bam=bam, tmp=prefix)
    run_shell_cmd(pipeline)

    rm_f(sai)
    return bam
def trim_adapter_pe(fastq1, fastq2, adapter1, adapter2, adapter_for_all,
                    cutadapt_param, out_dir):
    """Trim adapters from a FASTQ pair with cutadapt, or hard-link the inputs.

    cutadapt runs only when both ``adapter1`` and ``adapter2`` are truthy;
    otherwise each input is hard-linked into ``out_dir`` under its own
    basename.

    Args:
        fastq1: Path of the R1 FASTQ.
        fastq2: Path of the R2 FASTQ.
        adapter1: Adapter for R1 (cutadapt ``-a``).
        adapter2: Adapter for R2 (cutadapt ``-A``).
        adapter_for_all: When truthy, replaces both adapters in the command.
        cutadapt_param: Extra text inserted right after ``cutadapt``.
        out_dir: Output directory.

    Returns:
        Two-element list of output paths ``[R1, R2]``.
    """
    sources = [fastq1, fastq2]

    if not (adapter1 and adapter2):
        # Nothing to trim: expose the inputs in out_dir via hard links.
        links = [
            os.path.join(out_dir, os.path.basename(src)) for src in sources
        ]
        for src, dst in zip(sources, links):
            os.link(src, dst)
        return links

    outputs = [
        '{}.trim.fastq.gz'.format(
            os.path.join(out_dir, os.path.basename(strip_ext_fastq(src))))
        for src in sources
    ]
    fwd_adapter = adapter_for_all or adapter1
    rev_adapter = adapter_for_all or adapter2

    command = 'cutadapt {param} -a {fwd} -A {rev} {r1} {r2} -o {o1} -p {o2}'.format(
        param=cutadapt_param,
        fwd=fwd_adapter,
        rev=rev_adapter,
        r1=fastq1,
        r2=fastq2,
        o1=outputs[0],
        o2=outputs[1],
    )
    run_shell_cmd(command)
    return outputs
def merge_fastqs(fastqs, end, out_dir):
    """Make a merged FASTQ under ``$out_dir/<end>`` (e.g. R1, R2).

    With more than one input the files are concatenated through
    ``zcat -f ... | gzip -nc``; with a single input the result of
    ``copy_f_to_f(fastqs[0], merged)`` is returned instead.

    Args:
        fastqs: List of FASTQ paths; the first one names the output.
        end: Sub-directory name appended to ``out_dir``.
        out_dir: Parent output directory.

    Returns:
        Path of the merged file, or whatever ``copy_f_to_f`` returns.
    """
    target_dir = os.path.join(out_dir, end)
    mkdir_p(target_dir)
    stem = os.path.basename(strip_ext_fastq(fastqs[0]))
    merged = os.path.join(target_dir, stem) + '.merged.fastq.gz'

    if len(fastqs) <= 1:
        return copy_f_to_f(fastqs[0], merged)

    inputs = ' '.join(fastqs)
    run_shell_cmd('zcat -f {src} | gzip -nc > {dst}'.format(src=inputs, dst=merged))
    return merged
def trim_fastq(fastq, trim_bp, out_dir):
    """Trim a FASTQ with trimfastq.py and gzip the result.

    After trimming, the output is scanned for the text
    'sequences shorter than desired length'; if it occurs, ``copy_f_to_f``
    is called with the original FASTQ and the trimmed path.

    Args:
        fastq: Path of the input FASTQ.
        trim_bp: Length argument forwarded to trimfastq.py.
        out_dir: Directory that receives ``<basename>.trim_<trim_bp>bp.fastq.gz``.

    Returns:
        Path of the trimmed FASTQ.
    """
    stem = os.path.basename(strip_ext_fastq(fastq))
    trimmed = '{}.trim_{}bp.fastq.gz'.format(os.path.join(out_dir, stem), trim_bp)

    run_shell_cmd(
        'python $(which trimfastq.py) {src} {bp} | gzip -nc > {dst}'.format(
            src=fastq, bp=trim_bp, dst=trimmed))

    # Count lines carrying the "too short" message; "|| true" keeps the
    # pipeline from failing when grep finds nothing.
    count_cmd = (
        "zcat -f {gz} | (grep 'sequences shorter than desired length' "
        "|| true) | wc -l"
    ).format(gz=trimmed)
    too_short_hits = int(run_shell_cmd(count_cmd))
    if too_short_hits > 0:
        copy_f_to_f(fastq, trimmed)
    return trimmed
def trimmomatic_se(fastq1, crop_length, out_dir, nth=1, java_heap=None):
    """Crop a single-end FASTQ to a fixed length with Trimmomatic (SE mode).

    The command applies ``MINLEN:<crop_length>`` followed by
    ``CROP:<crop_length>``.

    Args:
        fastq1: Path of the input FASTQ.
        crop_length: Value used for both MINLEN and CROP; also embedded in
            the output file name.
        out_dir: Directory for the cropped output.
        nth: Value passed to ``-threads``.
        java_heap: Suffix for ``-Xmx``; ``-Xmx6G`` is used when None.

    Returns:
        Path of the cropped FASTQ.
    """
    stem = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    output = '{stem}.crop_{length}bp.fastq.gz'.format(stem=stem, length=crop_length)

    heap_opt = '-Xmx6G' if java_heap is None else '-Xmx{}'.format(java_heap)

    command = (
        'java -XX:ParallelGCThreads=1 {heap} -jar {jar} SE -threads {nth} '
        '{reads} {out} MINLEN:{length} CROP:{length}'
    ).format(
        heap=heap_opt,
        jar=locate_trimmomatic(),
        nth=nth,
        reads=fastq1,
        out=output,
        length=crop_length,
    )
    run_shell_cmd(command)
    return output
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, use_bwa_mem_for_pe, out_dir):
    """Align a FASTQ pair with BWA and return a sorted BAM.

    ``bwa mem`` is used when ``use_bwa_mem_for_pe`` is truthy and the read
    length of R1 is at least 70; otherwise ``bwa aln`` runs on each mate and
    ``bwa sampe`` pairs them. Read names whose CIGAR-derived length differs
    from the sequence length are collected and filtered out before sorting.

    Args:
        fastq1: Path of the R1 FASTQ; its stripped basename names the outputs.
        fastq2: Path of the R2 FASTQ.
        ref_index_prefix: BWA index prefix.
        nth: Thread count for ``bwa mem`` / ``bwa_aln``.
        use_bwa_mem_for_pe: Enables the ``bwa mem`` branch.
        out_dir: Output directory.

    Returns:
        Path of the sorted BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    # The ".sam" file holds gzip-compressed SAM text (see "gzip -nc" below).
    sam = prefix + '.sam'
    badcigar = prefix + '.badReads'
    bam = prefix + '.bam'

    read_len = get_read_length(fastq1)
    if use_bwa_mem_for_pe and read_len >= 70:
        align_cmd = 'bwa mem -M -t {nth} {ref} {r1} {r2} | gzip -nc > {sam}'.format(
            nth=nth, ref=ref_index_prefix, r1=fastq1, r2=fastq2, sam=sam)
        temp_files = [sam]
    else:
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)
        align_cmd = (
            'bwa sampe {ref} {s1} {s2} {r1} {r2} | gzip -nc > {sam}'
        ).format(ref=ref_index_prefix, s1=sai1, s2=sai2,
                 r1=fastq1, r2=fastq2, sam=sam)
        temp_files = [sai1, sai2, sam]
    run_shell_cmd(align_cmd)

    # awk sums the CIGAR operation lengths (deletions removed first) and
    # prints the read name when the sum differs from the sequence length.
    # Braces are doubled because the text goes through str.format.
    find_bad_cmd = (
        'zcat -f {sam} | '
        'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
        '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
        'n = split(cigar,vals,"[A-Z]"); s = 0; '
        'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
        'if (s!=seqlen) print $1"\\t"; }}\' | '
        'sort | uniq > {bad}'
    ).format(sam=sam, bad=badcigar)
    run_shell_cmd(find_bad_cmd)

    # Remove bad CIGAR read pairs, if any were found.
    if get_num_lines(badcigar) > 0:
        sort_cmd = (
            'zcat -f {sam} | grep -v -F -f {bad} | '
            'samtools view -Su - | samtools sort - -o {bam} -T {tmp}'
        ).format(sam=sam, bad=badcigar, bam=bam, tmp=prefix)
    else:
        sort_cmd = (
            'samtools view -Su {sam} | samtools sort - -o {bam} -T {tmp}'
        ).format(sam=sam, bam=bam, tmp=prefix)
    run_shell_cmd(sort_cmd)

    rm_f(temp_files)
    return bam
def trimmomatic_se(fastq1, crop_length, crop_length_tol, phred_score_format,
                   out_dir, nth=1, java_heap=None):
    """Crop a single-end FASTQ with Trimmomatic (SE mode).

    The command applies ``MINLEN:<crop_length - |crop_length_tol|>`` followed
    by ``CROP:<crop_length>`` and optionally a ``-phred33``/``-phred64`` flag.

    Args:
        fastq1: Path of the input FASTQ.
        crop_length: Value passed to CROP.
        crop_length_tol: Tolerance; its absolute value is subtracted from
            ``crop_length`` to obtain the MINLEN value.
        phred_score_format: 'auto', 'phred33' or 'phred64' (case-insensitive);
            'auto' adds no flag.
        out_dir: Directory for the cropped output.
        nth: Value passed to ``-threads``.
        java_heap: Suffix for ``-Xmx``; ``-Xmx6G`` is used when None.

    Returns:
        Path of the cropped FASTQ.

    Raises:
        ValueError: If ``phred_score_format`` is not one of the three values.
    """
    stem = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    tolerance = abs(crop_length_tol)
    shortest_kept = crop_length - tolerance
    output = '{}.crop_{}-{}bp.fastq.gz'.format(stem, crop_length, tolerance)

    heap_opt = '-Xmx6G' if java_heap is None else '-Xmx{}'.format(java_heap)

    phred_flags = {
        'auto': '',
        'phred33': '-phred33',
        'phred64': '-phred64',
    }
    phred_flag = phred_flags.get(phred_score_format.lower())
    if phred_flag is None:
        raise ValueError('Wrong phred_score_format!')

    template = (
        'java -XX:ParallelGCThreads=1 {} -jar {} SE -threads {} {} '
        '{} {} MINLEN:{} CROP:{}'
    )
    command = template.format(
        heap_opt, locate_trimmomatic(), nth, phred_flag,
        fastq1, output, shortest_kept, crop_length,
    )
    run_shell_cmd(command)
    return output
def bowtie2_se(fastq, ref_index_prefix, multimapping, nth, mem_gb, out_dir):
    """Align a single-end FASTQ with bowtie2, then sort via ``samtools_sort``.

    The unsorted BAM written by the bowtie2 pipeline is removed once
    ``samtools_sort`` has returned.

    Args:
        fastq: Path of the input FASTQ (bowtie2 ``-U``).
        ref_index_prefix: Value passed to bowtie2 ``-x``.
        multimapping: When truthy, ``-k <multimapping + 1>`` is added.
        nth: Thread count for bowtie2; also forwarded to ``samtools_sort``.
        mem_gb: Forwarded to ``samtools_sort``.
        out_dir: Output directory; also forwarded to ``samtools_sort``.

    Returns:
        Whatever ``samtools_sort`` returns (the sorted BAM).
    """
    stem = os.path.basename(strip_ext_fastq(fastq))
    unsorted_bam = os.path.join(out_dir, stem) + '.bam'

    if multimapping:
        multi_opt = '-k {mm}'.format(mm=multimapping + 1)
    else:
        multi_opt = ''

    template = (
        'bowtie2 {} --mm --threads {} -x {} '
        '-U {} | samtools view -1 -S /dev/stdin > {}'
    )
    run_shell_cmd(template.format(
        multi_opt, nth, ref_index_prefix, fastq, unsorted_bam))

    sorted_bam = samtools_sort(unsorted_bam, nth, mem_gb, out_dir)
    rm_f(unsorted_bam)
    return sorted_bam
def bwa_se(fastq, ref_index_prefix, nth, mem_gb, out_dir):
    """Align a single-end FASTQ with ``bwa aln`` + ``bwa samse``, then sort.

    The intermediate .sai file and the unsorted BAM are removed once they
    are no longer needed.

    Args:
        fastq: Path of the input FASTQ.
        ref_index_prefix: BWA index prefix.
        nth: Thread count forwarded to ``bwa_aln``, to
            ``get_samtools_res_param('view', ...)`` and to ``samtools_sort``.
        mem_gb: Forwarded to ``samtools_sort``.
        out_dir: Directory for the intermediate and final files.

    Returns:
        Whatever ``samtools_sort`` returns (the sorted BAM).
    """
    basename = os.path.basename(strip_ext_fastq(fastq))
    prefix = os.path.join(out_dir, basename)
    tmp_bam = '{}.bam'.format(prefix)

    sai = bwa_aln(fastq, ref_index_prefix, nth, out_dir)

    run_shell_cmd(
        'bwa samse {ref} {sai} {fastq} | '
        'samtools view -bS /dev/stdin {res_param} > {tmp_bam}'.format(
            ref=ref_index_prefix,
            sai=sai,
            fastq=fastq,
            res_param=get_samtools_res_param('view', nth=nth),
            tmp_bam=tmp_bam,
        ))
    rm_f(sai)

    # Pass out_dir through so the sorted BAM follows the requested output
    # directory, matching the bowtie2 wrappers in this file.
    # NOTE(review): assumes samtools_sort's 4th positional parameter is the
    # output directory, as its other call sites use it -- confirm.
    bam = samtools_sort(tmp_bam, nth, mem_gb, out_dir)
    rm_f(tmp_bam)

    return bam
def bowtie2_pe(fastq1, fastq2, ref_index_prefix, multimapping, local, nth,
               mem_gb, out_dir):
    """Align a FASTQ pair with bowtie2, then sort via ``samtools_sort``.

    The unsorted BAM written by the bowtie2 pipeline is removed once
    ``samtools_sort`` has returned.

    Args:
        fastq1: Path of the R1 FASTQ; its stripped basename names the BAM.
        fastq2: Path of the R2 FASTQ.
        ref_index_prefix: Value passed to bowtie2 ``-x``.
        multimapping: When truthy, ``-k <multimapping + 1>`` is added.
        local: When truthy, ``--local`` is added.
        nth: Thread count for bowtie2; also forwarded to ``samtools_sort``.
        mem_gb: Forwarded to ``samtools_sort``.
        out_dir: Output directory; also forwarded to ``samtools_sort``.

    Returns:
        Whatever ``samtools_sort`` returns (the sorted BAM).
    """
    stem = os.path.basename(strip_ext_fastq(fastq1))
    unsorted_bam = os.path.join(out_dir, stem) + '.bam'

    if multimapping:
        multi_opt = '-k {mm}'.format(mm=multimapping + 1)
    else:
        multi_opt = ''
    # The trailing space is part of the option text in the command line.
    mode_opt = '--local ' if local else ''

    template = (
        'bowtie2 {} -X2000 {} --mm --threads {} -x {} '
        '-1 {} -2 {} | samtools view -1 -S /dev/stdin > {}'
    )
    run_shell_cmd(template.format(
        multi_opt, mode_opt, nth, ref_index_prefix,
        fastq1, fastq2, unsorted_bam))

    sorted_bam = samtools_sort(unsorted_bam, nth, mem_gb, out_dir)
    rm_f(unsorted_bam)
    return sorted_bam
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, mem_gb, use_bwa_mem_for_pe,
           bwa_mem_read_len_limit, rescue_reads_for_bwa_mem, out_dir):
    """Align a FASTQ pair with BWA and return a sorted BAM.

    ``bwa mem`` is used when ``use_bwa_mem_for_pe`` is truthy and the read
    length of R1 reaches ``bwa_mem_read_len_limit``; otherwise ``bwa aln``
    runs on each mate and ``bwa sampe`` pairs them. Read names whose
    CIGAR-derived length differs from the sequence length are collected and
    filtered out before sorting.

    Args:
        fastq1: Path of the R1 FASTQ; its stripped basename names the outputs.
        fastq2: Path of the R2 FASTQ.
        ref_index_prefix: BWA index prefix.
        nth: Thread count for BWA and for the samtools resource parameters.
        mem_gb: Forwarded to ``get_samtools_res_param('sort', ...)``.
        use_bwa_mem_for_pe: Enables the ``bwa mem`` branch.
        bwa_mem_read_len_limit: Minimum R1 read length for ``bwa mem``.
        rescue_reads_for_bwa_mem: When truthy, ``-P`` is added to ``bwa mem``.
        out_dir: Output directory.

    Returns:
        Path of the sorted BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    # The ".sam" file holds gzip-compressed SAM text (see "gzip -nc" below).
    sam = prefix + '.sam'
    badcigar = prefix + '.badReads'
    bam = prefix + '.bam'

    read_len = get_read_length(fastq1)
    log.info('Guessed read length of R1 FASTQ: {read_len}'.format(
        read_len=read_len))

    if use_bwa_mem_for_pe and read_len >= bwa_mem_read_len_limit:
        log.info('Use bwa mem.')
        rescue_opt = '-P' if rescue_reads_for_bwa_mem else ''
        align_cmd = (
            'bwa mem -M {} -t {} {} {} {} | gzip -nc > {}'
        ).format(rescue_opt, nth, ref_index_prefix, fastq1, fastq2, sam)
        temp_files = [sam]
    else:
        log.info('Use bwa aln for each (R1 and R2) and then bwa sampe.')
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)
        align_cmd = (
            'bwa sampe {ref} {s1} {s2} {r1} {r2} | gzip -nc > {sam}'
        ).format(ref=ref_index_prefix, s1=sai1, s2=sai2,
                 r1=fastq1, r2=fastq2, sam=sam)
        temp_files = [sai1, sai2, sam]
    run_shell_cmd(align_cmd)

    # awk sums the CIGAR operation lengths (deletions removed first) and
    # prints the read name when the sum differs from the sequence length.
    # Braces are doubled because the text goes through str.format.
    find_bad_cmd = (
        'zcat -f {sam} | '
        'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
        '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
        'n = split(cigar,vals,"[A-Z]"); s = 0; '
        'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
        'if (s!=seqlen) print $1"\\t"; }}\' | '
        'sort | uniq > {bad}'
    ).format(sam=sam, bad=badcigar)
    run_shell_cmd(find_bad_cmd)

    # Remove bad CIGAR read pairs, if any were found.
    if get_num_lines(badcigar) > 0:
        reader = (
            'zcat -f {sam} | grep -v -F -f {badcigar} | '
            'samtools view -Su /dev/stdin'
        )
    else:
        reader = 'samtools view -Su {sam}'
    sorter = ' | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
    run_shell_cmd((reader + sorter).format(
        sam=sam,
        badcigar=badcigar,
        bam=bam,
        prefix=prefix,
        res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
    ))

    rm_f(temp_files)
    return bam
def create_job_file_pe(samplefile1, samplefile2, adapters, out_dir, trim_reads):
    """Write a bash job script (with #BSUB directives) for one paired sample.

    The generated script runs, in order: trimming (trimmomatic_run.py),
    FastQC, bowtie2 alignment, Picard read-group/duplicate marking,
    samtools/bedtools filtering, fragment-size splitting plus Tn5 shifting,
    and RGT footprinting/motif matching/annotation.

    Args:
        samplefile1: R1 FASTQ path; its stripped basename is the sample name.
        samplefile2: R2 FASTQ path.
        adapters: Value passed to ``--adapters`` of trimmomatic_run.py.
        out_dir: Directory under which ``<sample>`` paths are formed.
        trim_reads: Value passed to ``--trim-reads`` of trimmomatic_run.py.

    Returns:
        Path of the written script, ``<out_dir>/<sample>.sh``.
    """
    basename1 = os.path.basename(strip_ext_fastq(samplefile1))
    basename2 = os.path.basename(strip_ext_fastq(samplefile2))
    prefix = os.path.join(out_dir, basename1)
    # Per-sample working directory with trailing slash, as used in file names.
    d = prefix + '/'

    job_header = (
        '#!/bin/bash\n'
        '#BSUB -P QFATACseq\n'
        '#BSUB -J {job}_QFATACseq\n'
        '#BSUB -oo {base}/ATACseqJ1log.out\n'
        '#BSUB -eo {base}/ATACseqJ1log.err\n'
        '#BSUB -n 1\n'
        '#BSUB -N [email protected]\n'
    ).format(job=basename1, base=prefix)

    # Load all the required modules for the analysis.
    module1 = (
        'module load trimmomatic/0.36\n'
        'module load conda3/5.1.0\n'
        'module load R/3.6.3\n'
        'source activate /research/rgs01/project_space/yu3grp/software_JY/yu3grp/conda_env/yulab_env_2.7\n'
    )

    # Shell variables the job body refers to ($fastqc, $samtools, ...).
    apps1 = (
        'fastqc=/research/rgs01/applications/hpcf/apps/fastqc/install/0.11.5/fastqc\n'
        'Bowtie2=/research/rgs01/applications/hpcf/apps/bowtie/install/2.2.9/bin/bowtie2\n'
        'samtools=/research/rgs01/applications/hpcf/apps/samtools/install/1.2/bin/samtools\n'
        'java=/research/rgs01/applications/hpcf/apps/java/jdk1.8.0_66/bin/java\n'
        'picard=/research/rgs01/applications/hpcf/apps/picard/install/2.16.0/picard.jar\n'
        'bedtools=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/yulab_apps/apps/bedtools2/bin/bedtools\n'
        'macs2=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/conda_env/yulab_env_2.7/bin/macs2\n'
        'genomeDir=/research/projects/yu3grp/scRNASeq/yu3grp/qpan/Database/References/mm10/Gencode/Bowtie2/mm10\n'
        'blacklist=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/yulab_databases/ENCODE_blacklist/mm10-blacklist.v2.bed\n'
        'mm10=/research/rgs01/project_space/yu3grp/scRNASeq/yu3grp/qpan/Database/References/mm10/Gencode/GRCm38.primary_assembly.genome.fa.size\n'
    )

    trimmed1 = prefix + '/' + basename1 + '.trim.paired.fastq.gz'
    trimmed2 = prefix + '/' + basename2 + '.trim.paired.fastq.gz'

    # Write the job body that runs each wrapper for the sample.
    log.info("Trimming with trimmomatic...")
    job_body1 = (
        'python /home/xzhen/pipelines/atac_pipeline/bin/trimmomatic_run.py '
        '--fastqs {} {} --nth 8 --adapters {} --out-dir {} --paired-end '
        '--trim-reads {}\n'
    ).format(samplefile1, samplefile2, adapters, prefix, trim_reads)

    log.info("Doing Fastqc...")
    job_body2 = (
        '$fastqc {r1} -o {out}\n'
        '$fastqc {r2} -o {out}\n'
    ).format(r1=trimmed1, r2=trimmed2, out=prefix)

    log.info("Mapping paired-end files to reference genome...")
    job_body3 = (
        '$Bowtie2 -x $genomeDir --very-sensitive --phred33 -X 2000 --no-mixed '
        '--no-discordant -p 8 -1 {r1} -2 {r2} | $samtools view -bS - > '
        '{d}tmp.aligned.bam'
    ).format(r1=trimmed1, r2=trimmed2, d=d)

    log.info("PICARD is processing......")
    job_body4 = (
        '$java -jar $picard AddOrReplaceReadGroups I= {d}tmp.aligned.bam O= {d}tmp.RGadded.bam SO=coordinate RGID={s} RGLB={s} RGPL=ATAC RGPU=Illumina RGSM={s}\n'
        '$java -jar $picard MarkDuplicates I= {d}tmp.RGadded.bam O= {d}02_alignment.raw.bam M= {d}tmp.dupMark.txt'
    ).format(d=d, s=basename1)

    log.info("Filtration is processing...")
    job_body5 = (
        '$samtools view -b -F 780 -q 1 {d}02_alignment.raw.bam > {d}02_alignment.mapped.bam && $samtools flagstat {d}02_alignment.mapped.bam > {d}02_alignment.mapped.bam.txt\n'
        '$samtools view -b -F 1024 {d}02_alignment.mapped.bam > {d}02_alignment.mapped_rmdup.bam && $samtools flagstat {d}02_alignment.mapped_rmdup.bam > {d}02_alignment.mapped_rmdup.bam.txt\n'
        '$bedtools intersect -v -abam {d}02_alignment.mapped_rmdup.bam -b $blacklist > {d}02_alignment.mapped_rmdup_rmBLK.bam\n'
        '$samtools index {d}02_alignment.mapped_rmdup_rmBLK.bam\n'
        '$samtools view -b {d}02_alignment.mapped_rmdup_rmBLK.bam chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chrX chrY > {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.bam\n'
    ).format(d=d)

    log.info("Tn5 shift...")
    # All eight commands belong to this stage. The bamtobed step reads the
    # name-sorted nucleosome-free BAM, so the fragment-size split and the
    # "sort -n" that produce it must be emitted first rather than dropped.
    # Literal awk braces are doubled because the text goes through
    # str.format.
    job_body6 = (
        "$samtools view -h {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.bam | awk 'substr($0,1,1)==\"@\" || ($9<=146 && $9>=-146)' | $samtools view -bS - > {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.bam\n"
        "$samtools view -h {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.bam | awk 'substr($0,1,1)==\"@\" || ($9>146 && $9<=307) || ($9<-146 && $9>=-307)' | $samtools view -bS - > {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoMono.bam\n"
        "$samtools view -h {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.bam | awk 'substr($0,1,1)==\"@\" || ($9>307) || ($9<-307)' | $samtools view -bS - > {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoTwoplus.bam\n"
        "$samtools sort -n {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.bam {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName\n"
        "$bedtools bamtobed -i {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.bam | "
        'awk -v OFS="\\t" \'{{if($6=="+"){{print $1,$2+4,$3+4,$4,$5,$6}}else if($6=="-"){{print $1,$2-5,$3-5,$4,$5,$6}}}}\' > '
        '{d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.bed\n'
        "$bedtools bedtobam -ubam -i {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.bed -g $mm10 > {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.bam\n"
        "$samtools sort {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.bam {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.sortedByPos\n"
        "$samtools index {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.sortedByPos.bam\n"
    ).format(d=d)

    log.info("Footprinting......")
    # NOTE(review): the BAM and narrowPeak arguments of the two rgt-hint
    # commands are relative paths (no sample directory), and nothing in this
    # script creates 03_NucleosomeFree_peaks.narrowPeak -- confirm the job's
    # working directory and where the peak file comes from.
    job_body7 = (
        'conda deactivate\n'
        'source activate atac \n'
        'rgt-hint footprinting --atac-seq --paired-end --organism=mm10 --output-location={p} --output-prefix=04_footPrint 02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.sortedByPos.bam 03_NucleosomeFree_peaks.narrowPeak\n'
        'rgt-hint tracks --bc --bigWig --organism=mm10 02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.sortedByPos.bam 03_NucleosomeFree_peaks.narrowPeak --output-prefix={p}/footprintingTracks\n'
        'rgt-motifanalysis matching --motif-dbs /research/rgs01/project_space/yu3grp/software_JY/yu3grp/git_repo/ATACseq_pipeline/data/rgtdata/motifs/transfac_mouse --input-files {p}/04_footPrint.bed --output-location={p} --organism=mm10\n'
        'Rscript /research/rgs01/project_space/yu3grp/software_JY/yu3grp/git_repo/ATACseq_pipeline/scripts/peakAnnotation.R {p}/04_footPrint_mpbs.bed mm10\n'
        'perl /research/rgs01/project_space/yu3grp/software_JY/yu3grp/git_repo/ATACseq_pipeline/scripts/annotationToNetwork.pl {p}/04_footPrint_mpbs.annotate.txt {p}/04_networkATACseq.txt\n'
    ).format(p=prefix)

    jobfile = prefix + ".sh"
    sections = [
        job_header + module1,
        apps1,
        job_body1,
        job_body2,
        job_body3,
        job_body4,
        job_body5,
        job_body6,
        job_body7,
    ]
    with open(jobfile, "w") as new_file:
        # Sections are separated by a newline; the file ends with one too.
        new_file.write('\n'.join(sections) + '\n')
    return jobfile