コード例 #1
0
def trimmomatic_pe(fastq1,
                   fastq2,
                   crop_length,
                   out_dir_R1,
                   out_dir_R2,
                   nth=1,
                   java_heap=None):
    """Crop a FASTQ pair by running Trimmomatic in ``PE`` mode.

    Args:
        fastq1: Path of the R1 FASTQ.
        fastq2: Path of the R2 FASTQ.
        crop_length: Number used for both ``MINLEN:`` and ``CROP:``.
        out_dir_R1: Directory that receives the cropped R1 file.
        out_dir_R2: Directory that receives the cropped R2 file.
        nth: Value given to ``-threads``.
        java_heap: Text appended to ``-Xmx``; ``-Xmx6G`` is used when None.

    Returns:
        Tuple ``(cropped_R1, cropped_R2)`` holding the two output paths.
    """
    out_stem_R1 = os.path.join(out_dir_R1,
                               os.path.basename(strip_ext_fastq(fastq1)))
    out_stem_R2 = os.path.join(out_dir_R2,
                               os.path.basename(strip_ext_fastq(fastq2)))

    name_tmpl = '{stem}.crop_{bp}bp.fastq.gz'
    cropped_R1 = name_tmpl.format(stem=out_stem_R1, bp=crop_length)
    cropped_R2 = name_tmpl.format(stem=out_stem_R2, bp=crop_length)

    # The PE command line has two output slots per mate; the second slot of
    # each (Trimmomatic's unpaired output) goes to a throwaway .tmp path that
    # is removed after the run.
    unpaired_R1 = cropped_R1 + '.tmp'
    unpaired_R2 = cropped_R2 + '.tmp'

    heap_opt = '-Xmx6G' if java_heap is None else '-Xmx{}'.format(java_heap)

    cmd = ('java -XX:ParallelGCThreads=1 {heap} -jar {jar} PE '
           '-threads {nth} {in1} {in2} {out1} {unp1} {out2} {unp2} '
           'MINLEN:{length} CROP:{length}').format(
               heap=heap_opt,
               jar=locate_trimmomatic(),
               nth=nth,
               in1=fastq1,
               in2=fastq2,
               out1=cropped_R1,
               unp1=unpaired_R1,
               out2=cropped_R2,
               unp2=unpaired_R2,
               length=crop_length)
    run_shell_cmd(cmd)
    rm_f([unpaired_R1, unpaired_R2])

    return cropped_R1, cropped_R2
コード例 #2
0
def trimmomatic_pe(fastq1,
                   fastq2,
                   crop_length,
                   crop_length_tol,
                   out_dir_R1,
                   out_dir_R2,
                   nth=1,
                   java_heap=None):
    """Crop a FASTQ pair with Trimmomatic ``PE``, allowing a length tolerance.

    ``MINLEN`` is set to ``crop_length - abs(crop_length_tol)`` and ``CROP``
    to ``crop_length``.

    Args:
        fastq1: Path of the R1 FASTQ.
        fastq2: Path of the R2 FASTQ.
        crop_length: Value for ``CROP:``.
        crop_length_tol: Tolerance; its absolute value is used.
        out_dir_R1: Directory that receives the cropped R1 file.
        out_dir_R2: Directory that receives the cropped R2 file.
        nth: Value given to ``-threads``.
        java_heap: Text appended to ``-Xmx``; ``-Xmx6G`` is used when None.

    Returns:
        Tuple ``(cropped_R1, cropped_R2)`` holding the two output paths.
    """
    stems = [
        os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq)))
        for fastq, out_dir in ((fastq1, out_dir_R1), (fastq2, out_dir_R2))
    ]

    tolerance = abs(crop_length_tol)
    shortest_kept = crop_length - tolerance

    cropped_R1, cropped_R2 = (
        '{}.crop_{}-{}bp.fastq.gz'.format(stem, crop_length, tolerance)
        for stem in stems)
    # Extra PE output slots are written to .tmp files and discarded below.
    discard_R1 = cropped_R1 + '.tmp'
    discard_R2 = cropped_R2 + '.tmp'

    heap_opt = '-Xmx6G' if java_heap is None else '-Xmx{}'.format(java_heap)

    cmd = ('java -XX:ParallelGCThreads=1 {} -jar {} PE -threads {} '
           '{} {} {} {} {} {} '
           'MINLEN:{} CROP:{}').format(heap_opt, locate_trimmomatic(), nth,
                                       fastq1, fastq2, cropped_R1, discard_R1,
                                       cropped_R2, discard_R2, shortest_kept,
                                       crop_length)
    run_shell_cmd(cmd)
    rm_f([discard_R1, discard_R2])

    return cropped_R1, cropped_R2
コード例 #3
0
def trimmomatic_se(fastq1,
                   crop_length,
                   crop_length_tol,
                   out_dir,
                   nth=1,
                   java_heap=None):
    """Crop one FASTQ with Trimmomatic ``SE``, allowing a length tolerance.

    ``MINLEN`` is set to ``crop_length - abs(crop_length_tol)`` and ``CROP``
    to ``crop_length``.

    Args:
        fastq1: Path of the input FASTQ.
        crop_length: Value for ``CROP:``.
        crop_length_tol: Tolerance; its absolute value is used.
        out_dir: Directory that receives the cropped file.
        nth: Value given to ``-threads``.
        java_heap: Text appended to ``-Xmx``; ``-Xmx6G`` is used when None.

    Returns:
        Path of the cropped FASTQ.
    """
    stem = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    tolerance = abs(crop_length_tol)
    shortest_kept = crop_length - tolerance
    cropped = '{}.crop_{}-{}bp.fastq.gz'.format(stem, crop_length, tolerance)

    heap_opt = '-Xmx6G' if java_heap is None else '-Xmx{}'.format(java_heap)

    cmd = ('java -XX:ParallelGCThreads=1 {} -jar {} SE -threads {} '
           '{} {} MINLEN:{} CROP:{}').format(heap_opt, locate_trimmomatic(),
                                             nth, fastq1, cropped,
                                             shortest_kept, crop_length)
    run_shell_cmd(cmd)

    return cropped
コード例 #4
0
def bowtie2_pe(fastq1, fastq2, ref_index_prefix,
               multimapping, nth, out_dir):
    """Align a FASTQ pair with bowtie2 and pipe the result into a sorted BAM.

    bowtie2's stderr is redirected to ``<prefix>.align.log``; that log is
    then printed with ``cat`` through a second shell command.

    Args:
        fastq1: Path of the R1 FASTQ; its stripped basename names the outputs.
        fastq2: Path of the R2 FASTQ.
        ref_index_prefix: Value given to ``-x``.
        multimapping: When truthy, ``-k <multimapping + 1>`` is added.
        nth: Value given to ``--threads``.
        out_dir: Directory that receives the BAM and the log.

    Returns:
        Tuple ``(bam, align_log)`` of output paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    bam = prefix + '.bam'
    align_log = prefix + '.align.log'

    k_opt = '-k {}'.format(multimapping + 1) if multimapping else ''

    align_cmd = ('bowtie2 {k_opt} -X2000 --mm --threads {nth} -x {ref} '
                 '-1 {fq1} -2 {fq2} 2>{log} | '
                 'samtools view -Su /dev/stdin | '
                 'samtools sort /dev/stdin -o {bam} -T {tmp}').format(
                     k_opt=k_opt,
                     nth=nth,
                     ref=ref_index_prefix,
                     fq1=fastq1,
                     fq2=fastq2,
                     log=align_log,
                     bam=bam,
                     tmp=prefix)
    run_shell_cmd(align_cmd)

    # Echo the alignment log through the shell helper.
    run_shell_cmd('cat {}'.format(align_log))
    return bam, align_log
コード例 #5
0
def make_read_length_file(fastq, out_dir):
    """Write the read length reported by ``get_read_length`` to a text file.

    Args:
        fastq: Path of the FASTQ to inspect.
        out_dir: Directory that receives ``<basename>.read_length.txt``.

    Returns:
        Path of the written text file.
    """
    stem = os.path.basename(strip_ext_fastq(fastq))
    txt = '{}.read_length.txt'.format(os.path.join(out_dir, stem))
    length_text = str(get_read_length(fastq))
    with open(txt, 'w') as handle:
        handle.write(length_text)
    return txt
コード例 #6
0
def bwa_aln(fastq, ref_index_prefix, nth, out_dir):
    """Run ``bwa aln`` on one FASTQ and redirect its output to a .sai file.

    Args:
        fastq: Path of the input FASTQ.
        ref_index_prefix: Reference index prefix given to bwa.
        nth: Value given to ``-t``.
        out_dir: Directory that receives ``<basename>.sai``.

    Returns:
        Path of the .sai file.
    """
    stem = os.path.basename(strip_ext_fastq(fastq))
    sai = os.path.join(out_dir, stem) + '.sai'

    template = 'bwa aln -q 5 -l 32 -k 2 -t {} {} {} > {}'
    run_shell_cmd(template.format(nth, ref_index_prefix, fastq, sai))
    return sai
コード例 #7
0
def create_job_file_pe(samplefile1, samplefile2, out_dir):
    """Write the bash job script for the footprinting step of one sample.

    The script consists of a ``#BSUB`` header, module/environment setup,
    shell variables pointing at tools and reference files, and the
    rgt-hint / rgt-motifanalysis / annotationToNetwork.pl commands. Every
    sample path is derived from ``<out_dir>/<R1 basename without its FASTQ
    extension>``.

    Args:
        samplefile1: R1 FASTQ path; names the job and the sample prefix.
        samplefile2: R2 FASTQ path. Not used by this step; kept so the
            signature stays compatible with existing callers.
        out_dir: Directory the sample prefix is created under.

    Returns:
        Path of the written script, ``<prefix>.sh``.
    """
    basename1 = os.path.basename(strip_ext_fastq(samplefile1))
    prefix = os.path.join(out_dir, basename1)

    # Named placeholders keep the templates readable and remove the need to
    # keep a long positional argument list in sync with the placeholders.
    job_header = (
        '#!/bin/bash\n'
        '#BSUB -P QFATACseq\n'
        '#BSUB -J {name}_QFATACseq\n'
        '#BSUB -oo {p}/ATACseqJ1log.out\n'
        '#BSUB -eo {p}/ATACseqJ1log.err\n'
        '#BSUB -n 1\n'
        '#BSUB -N [email protected]\n'
    ).format(name=basename1, p=prefix)

    ########## Section 3: Footprinting
    ### Load all the modules required for the analysis:
    module1 = 'module load R/3.6.3 \n'
    module1 += 'module load conda3/5.1.0 \n'
    module1 += 'source activate /research/rgs01/project_space/yu3grp/software_JY/yu3grp/conda_env/yulab_env_2.7 \n'
    module1 += 'export RGTDATA=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/yulab_databases/rgtdata \n'

    # Shell variables referenced as $hint, $motifanalysis, ... in the job body.
    apps1 = 'hint=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/conda_env/yulab_env_2.7/bin/rgt-hint \n'
    apps1 += 'motifanalysis=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/conda_env/yulab_env_2.7/bin/rgt-motifanalysis \n'
    apps1 += 'genomeDir=/research/projects/yu3grp/scRNASeq/yu3grp/qpan/Database/References/mm10/Gencode/Bowtie2/mm10 \n'
    apps1 += 'blacklist=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/yulab_databases/ENCODE_blacklist/mm10-blacklist.v2.bed \n'
    apps1 += 'mm10=/research/rgs01/project_space/yu3grp/scRNASeq/yu3grp/qpan/Database/References/mm10/Gencode/GRCm38.primary_assembly.genome.fa.size \n'

    log.info("Foot Printing......")
    job_body8 = '$hint footprinting --atac-seq --paired-end --organism=mm10 --output-location={p} --output-prefix=04_footPrint {p}/02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.sortedByPos.bam {p}/03_NucleosomeFree_peaks.narrowPeak\n'
    job_body8 += '$hint tracks --bc --bigWig --organism=mm10 {p}/02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.sortedByPos.bam {p}/03_NucleosomeFree_peaks.narrowPeak --output-prefix={p}/footprintingTracks\n'
    job_body8 += '$motifanalysis matching --motif-dbs $RGTDATA/motifs/transfac_mouse --input-files {p}/04_footPrint.bed --output-location={p} --organism=mm10\n'
    # NOTE(review): the original author's fallback, should the generated
    # script fail, was to add 'conda deactivate \n' to the job body and drop
    # the "$" prefix for Rscript; the mpbs annotation file was reported
    # missing for an unknown reason. Nothing in this script creates
    # 04_footPrint_mpbs.annotate.txt, so it must already exist when the perl
    # step runs - confirm against the upstream steps.
    job_body8 += 'perl /research/rgs01/project_space/yu3grp/software_JY/yu3grp/git_repo/ATACseq_pipeline/scripts/annotationToNetwork.pl {p}/04_footPrint_mpbs.annotate.txt {p}/04_networkATACseq.txt\n'
    job_body8 = job_body8.format(p=prefix)

    jobfile = prefix + ".sh"
    with open(jobfile, "w") as new_file:
        new_file.write(job_header + module1 + '\n' + apps1 + '\n' + job_body8 +
                       '\n')
    return jobfile
コード例 #8
0
def trim_adapter_pe(fastq1, fastq2, adapter1, adapter2, adapter_for_all,
                    cutadapt_param, out_dir):
    """Trim adapters from a FASTQ pair with cutadapt, or copy the inputs.

    cutadapt runs only when both ``adapter1`` and ``adapter2`` are truthy;
    otherwise both FASTQs are passed to ``copy_f_to_dir`` unchanged.

    Args:
        fastq1: Path of the R1 FASTQ.
        fastq2: Path of the R2 FASTQ.
        adapter1: Adapter for R1 (``-a``).
        adapter2: Adapter for R2 (``-A``).
        adapter_for_all: When truthy, replaces both adapters on the command
            line.
        cutadapt_param: Extra text placed right after ``cutadapt``.
        out_dir: Output directory.

    Returns:
        Two-element list with the R1 and R2 output paths.
    """
    if not (adapter1 and adapter2):
        return [copy_f_to_dir(fastq1, out_dir),
                copy_f_to_dir(fastq2, out_dir)]

    stem1 = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    stem2 = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq2)))
    trimmed1 = stem1 + '.trim.fastq.gz'
    trimmed2 = stem2 + '.trim.fastq.gz'

    cmd = 'cutadapt {param} -a {a1} -A {a2} {in1} {in2} -o {out1} -p {out2}'
    run_shell_cmd(cmd.format(param=cutadapt_param,
                             a1=adapter_for_all or adapter1,
                             a2=adapter_for_all or adapter2,
                             in1=fastq1,
                             in2=fastq2,
                             out1=trimmed1,
                             out2=trimmed2))
    return [trimmed1, trimmed2]
コード例 #9
0
def merge_fastqs(fastqs, end, out_dir):
    """Merge FASTQs into ``<out_dir>/<end>/<first basename>.merged.fastq.gz``.

    Several inputs are concatenated with ``zcat -f ... | gzip -nc``; a single
    input is handed to ``hard_link`` instead.

    Args:
        fastqs: List of FASTQ paths; the first one names the output.
        end: Sub-directory name appended to ``out_dir``.
        out_dir: Parent output directory.

    Returns:
        The merged path, or whatever ``hard_link`` returns for one input.
    """
    target_dir = os.path.join(out_dir, end)
    mkdir_p(target_dir)
    stem = os.path.basename(strip_ext_fastq(fastqs[0]))
    merged = os.path.join(target_dir, stem) + '.merged.fastq.gz'

    if len(fastqs) <= 1:
        return hard_link(fastqs[0], merged)

    run_shell_cmd('zcat -f {inputs} | gzip -nc > {out}'.format(
        inputs=' '.join(fastqs), out=merged))
    return merged
コード例 #10
0
def bowtie2_se(fastq, ref_index_prefix, multimapping, nth, out_dir):
    """Align one FASTQ with bowtie2 and pipe the result into a sorted BAM.

    Args:
        fastq: Path of the input FASTQ; its stripped basename names the BAM.
        ref_index_prefix: Value given to ``-x``.
        multimapping: When truthy, ``-k <multimapping + 1>`` is added.
        nth: Value given to ``--threads``.
        out_dir: Directory that receives the BAM.

    Returns:
        Path of the BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq)))
    bam = prefix + '.bam'

    k_opt = '-k {}'.format(multimapping + 1) if multimapping else ''

    cmd = ('bowtie2 {k_opt} --mm --threads {nth} -x {ref} -U {fq} '
           '| samtools view -Su /dev/stdin '
           '| samtools sort /dev/stdin -o {bam} -T {tmp}').format(
               k_opt=k_opt,
               nth=nth,
               ref=ref_index_prefix,
               fq=fastq,
               bam=bam,
               tmp=prefix)
    run_shell_cmd(cmd)

    return bam
コード例 #11
0
def trim_adapter_se(fastq, adapter, adapter_for_all, cutadapt_param, out_dir):
    """Trim the adapter from one FASTQ with cutadapt, or copy the input.

    cutadapt runs only when ``adapter`` is truthy; otherwise the FASTQ is
    passed to ``copy_f_to_dir`` unchanged.

    Args:
        fastq: Path of the input FASTQ.
        adapter: Adapter for ``-a``.
        adapter_for_all: When truthy, used on the command line instead of
            ``adapter``.
        cutadapt_param: Extra text placed right after ``cutadapt``.
        out_dir: Output directory.

    Returns:
        Path of the trimmed file, or the result of ``copy_f_to_dir``.
    """
    if not adapter:
        return copy_f_to_dir(fastq, out_dir)

    stem = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq)))
    trimmed = stem + '.trim.fastq.gz'

    cmd = 'cutadapt {param} -a {adapter} {fq} | gzip -nc > {out}'
    run_shell_cmd(cmd.format(param=cutadapt_param,
                             adapter=adapter_for_all or adapter,
                             fq=fastq,
                             out=trimmed))
    return trimmed
コード例 #12
0
def bwa_se(fastq, ref_index_prefix, nth, out_dir):
    """Align one FASTQ with ``bwa aln`` + ``bwa samse`` into a sorted BAM.

    Args:
        fastq: Path of the input FASTQ; its stripped basename names the BAM.
        ref_index_prefix: Reference index prefix given to bwa.
        nth: Thread count forwarded to ``bwa_aln``.
        out_dir: Directory that receives the BAM (and the temporary .sai).

    Returns:
        Path of the BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq)))
    bam = prefix + '.bam'

    sai = bwa_aln(fastq, ref_index_prefix, nth, out_dir)

    cmd = ('bwa samse {ref} {sai} {fq} | '
           'samtools view -Su - | samtools sort - -o {bam} -T {tmp}').format(
               ref=ref_index_prefix, sai=sai, fq=fastq, bam=bam, tmp=prefix)
    run_shell_cmd(cmd)

    # The .sai file is only an intermediate of the two-step alignment.
    rm_f(sai)
    return bam
コード例 #13
0
def trim_adapter_pe(fastq1, fastq2, adapter1, adapter2, adapter_for_all,
                    cutadapt_param, out_dir):
    """Trim adapters from a FASTQ pair with cutadapt, or hard-link the inputs.

    cutadapt runs only when both ``adapter1`` and ``adapter2`` are truthy;
    otherwise each input is hard-linked into ``out_dir`` under its own
    basename.

    Args:
        fastq1: Path of the R1 FASTQ.
        fastq2: Path of the R2 FASTQ.
        adapter1: Adapter for R1 (``-a``).
        adapter2: Adapter for R2 (``-A``).
        adapter_for_all: When truthy, replaces both adapters on the command
            line.
        cutadapt_param: Extra text placed right after ``cutadapt``.
        out_dir: Output directory.

    Returns:
        Two-element list with the R1 and R2 output paths.
    """
    inputs = (fastq1, fastq2)

    if not (adapter1 and adapter2):
        # No adapters given: expose the untouched inputs via hard links.
        links = [os.path.join(out_dir, os.path.basename(fq)) for fq in inputs]
        for source, link in zip(inputs, links):
            os.link(source, link)
        return links

    trimmed1, trimmed2 = [
        '{}.trim.fastq.gz'.format(
            os.path.join(out_dir, os.path.basename(strip_ext_fastq(fq))))
        for fq in inputs
    ]

    cmd = 'cutadapt {param} -a {a1} -A {a2} {in1} {in2} -o {out1} -p {out2}'
    run_shell_cmd(cmd.format(param=cutadapt_param,
                             a1=adapter_for_all or adapter1,
                             a2=adapter_for_all or adapter2,
                             in1=fastq1,
                             in2=fastq2,
                             out1=trimmed1,
                             out2=trimmed2))
    return [trimmed1, trimmed2]
コード例 #14
0
def merge_fastqs(fastqs, end, out_dir):
    """Make a merged FASTQ under ``$out_dir/<end>`` (e.g. R1 or R2).

    Several inputs are concatenated with ``zcat -f ... | gzip -nc``; a single
    input is handed to ``copy_f_to_f`` instead.

    Args:
        fastqs: List of FASTQ paths; the first one names the output.
        end: Sub-directory name appended to ``out_dir``.
        out_dir: Parent output directory.

    Returns:
        The merged path, or whatever ``copy_f_to_f`` returns for one input.
    """
    target_dir = os.path.join(out_dir, end)
    mkdir_p(target_dir)
    first_stem = os.path.basename(strip_ext_fastq(fastqs[0]))
    merged = os.path.join(target_dir, first_stem) + '.merged.fastq.gz'

    if len(fastqs) <= 1:
        return copy_f_to_f(fastqs[0], merged)

    joined_inputs = ' '.join(fastqs)
    run_shell_cmd('zcat -f {} | gzip -nc > {}'.format(joined_inputs, merged))
    return merged
コード例 #15
0
def trim_fastq(fastq, trim_bp, out_dir):
    """Trim a FASTQ with ``trimfastq.py`` and gzip the result.

    After trimming, the output is scanned for the text
    'sequences shorter than desired length'; if any line matches, the
    original FASTQ is copied over the output with ``copy_f_to_f``.

    Args:
        fastq: Path of the input FASTQ.
        trim_bp: Length argument passed to ``trimfastq.py``.
        out_dir: Directory that receives ``<basename>.trim_<trim_bp>bp.fastq.gz``.

    Returns:
        Path of the trimmed (or copied) FASTQ.
    """
    stem = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq)))
    trimmed = '{stem}.trim_{bp}bp.fastq.gz'.format(stem=stem, bp=trim_bp)

    run_shell_cmd('python $(which trimfastq.py) {src} {bp} | gzip -nc > {dst}'
                  .format(src=fastq, bp=trim_bp, dst=trimmed))

    # Count marker lines in the output; "|| true" keeps the pipeline from
    # failing when grep finds nothing.
    # presumably run_shell_cmd returns the command's stdout - verify.
    count_cmd = ("zcat -f {} | (grep 'sequences shorter than desired length' "
                 "|| true) | wc -l").format(trimmed)
    marker_lines = int(run_shell_cmd(count_cmd))
    if marker_lines > 0:
        copy_f_to_f(fastq, trimmed)

    return trimmed
コード例 #16
0
def trimmomatic_se(fastq1, crop_length, out_dir, nth=1, java_heap=None):
    """Crop one FASTQ by running Trimmomatic in ``SE`` mode.

    Args:
        fastq1: Path of the input FASTQ.
        crop_length: Number used for both ``MINLEN:`` and ``CROP:``.
        out_dir: Directory that receives the cropped file.
        nth: Value given to ``-threads``.
        java_heap: Text appended to ``-Xmx``; ``-Xmx6G`` is used when None.

    Returns:
        Path of the cropped FASTQ.
    """
    stem = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    cropped = '{stem}.crop_{bp}bp.fastq.gz'.format(stem=stem, bp=crop_length)

    heap_opt = '-Xmx6G' if java_heap is None else '-Xmx{}'.format(java_heap)

    cmd = ('java -XX:ParallelGCThreads=1 {heap} -jar {jar} SE '
           '-threads {nth} {src} {dst} '
           'MINLEN:{length} CROP:{length}').format(heap=heap_opt,
                                                   jar=locate_trimmomatic(),
                                                   nth=nth,
                                                   src=fastq1,
                                                   dst=cropped,
                                                   length=crop_length)
    run_shell_cmd(cmd)

    return cropped
コード例 #17
0
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, use_bwa_mem_for_pe, out_dir):
    """Align a FASTQ pair with bwa and write a sorted BAM.

    ``bwa mem`` is used when ``use_bwa_mem_for_pe`` is truthy and the read
    length reported for ``fastq1`` is at least 70; otherwise each mate goes
    through ``bwa aln`` followed by ``bwa sampe``. Read names whose CIGAR
    lengths disagree with the sequence length are collected in
    ``<prefix>.badReads`` and filtered out before sorting.

    Args:
        fastq1: Path of the R1 FASTQ; its stripped basename names the outputs.
        fastq2: Path of the R2 FASTQ.
        ref_index_prefix: Reference index prefix given to bwa.
        nth: Thread count.
        use_bwa_mem_for_pe: Enables the ``bwa mem`` branch.
        out_dir: Output directory.

    Returns:
        Path of the sorted BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    sam = prefix + '.sam'  # gzip-compressed despite the extension
    badcigar = prefix + '.badReads'
    bam = prefix + '.bam'

    read_len = get_read_length(fastq1)
    if use_bwa_mem_for_pe and read_len >= 70:
        align_cmd = 'bwa mem -M -t {nth} {ref} {fq1} {fq2} | gzip -nc > {sam}'
        align_cmd = align_cmd.format(nth=nth, ref=ref_index_prefix,
                                     fq1=fastq1, fq2=fastq2, sam=sam)
        temp_files = [sam]
    else:
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)

        align_cmd = ('bwa sampe {ref} {sai1} {sai2} {fq1} {fq2} '
                     '| gzip -nc > {sam}').format(ref=ref_index_prefix,
                                                  sai1=sai1, sai2=sai2,
                                                  fq1=fastq1, fq2=fastq2,
                                                  sam=sam)
        temp_files = [sai1, sai2, sam]
    run_shell_cmd(align_cmd)

    # For every non-header record with a CIGAR other than "*": drop the
    # deletion ops, sum the remaining op lengths and print the read name
    # when that sum differs from the length of the sequence column.
    find_bad_cigar = (
        'zcat -f {sam} | '
        'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
        '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
        'n = split(cigar,vals,"[A-Z]"); s = 0; '
        'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
        'if (s!=seqlen) print $1"\\t"; }}\' | '
        'sort | uniq > {badcigar}'
    ).format(sam=sam, badcigar=badcigar)
    run_shell_cmd(find_bad_cigar)

    # Remove bad CIGAR read pairs
    if get_num_lines(badcigar) > 0:
        sort_cmd = ('zcat -f {sam} | grep -v -F -f {bad} | '
                    'samtools view -Su - | '
                    'samtools sort - -o {bam} -T {tmp}').format(
                        sam=sam, bad=badcigar, bam=bam, tmp=prefix)
    else:
        sort_cmd = ('samtools view -Su {sam} | '
                    'samtools sort - -o {bam} -T {tmp}').format(
                        sam=sam, bam=bam, tmp=prefix)
    run_shell_cmd(sort_cmd)

    rm_f(temp_files)
    return bam
コード例 #18
0
def trimmomatic_se(fastq1,
                   crop_length,
                   crop_length_tol,
                   phred_score_format,
                   out_dir,
                   nth=1,
                   java_heap=None):
    """Crop one FASTQ with Trimmomatic ``SE`` and an optional phred flag.

    ``MINLEN`` is set to ``crop_length - abs(crop_length_tol)`` and ``CROP``
    to ``crop_length``.

    Args:
        fastq1: Path of the input FASTQ.
        crop_length: Value for ``CROP:``.
        crop_length_tol: Tolerance; its absolute value is used.
        phred_score_format: ``'auto'`` (no flag), ``'phred33'`` or
            ``'phred64'``, case-insensitive.
        out_dir: Directory that receives the cropped file.
        nth: Value given to ``-threads``.
        java_heap: Text appended to ``-Xmx``; ``-Xmx6G`` is used when None.

    Returns:
        Path of the cropped FASTQ.

    Raises:
        ValueError: If ``phred_score_format`` is not one of the values above.
    """
    stem = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    tolerance = abs(crop_length_tol)
    shortest_kept = crop_length - tolerance
    cropped = '{}.crop_{}-{}bp.fastq.gz'.format(stem, crop_length, tolerance)

    heap_opt = '-Xmx6G' if java_heap is None else '-Xmx{}'.format(java_heap)

    # Map the accepted format names onto the Trimmomatic command-line flag.
    phred_flags = {
        'auto': '',
        'phred33': '-phred33',
        'phred64': '-phred64',
    }
    phred_flag = phred_flags.get(phred_score_format.lower())
    if phred_flag is None:
        raise ValueError('Wrong phred_score_format!')

    template = ('java -XX:ParallelGCThreads=1 {} -jar {} SE -threads {} {} '
                '{} {} MINLEN:{} CROP:{}')
    run_shell_cmd(template.format(heap_opt, locate_trimmomatic(), nth,
                                  phred_flag, fastq1, cropped, shortest_kept,
                                  crop_length))

    return cropped
コード例 #19
0
def bowtie2_se(fastq, ref_index_prefix, multimapping, nth, mem_gb, out_dir):
    """Align one FASTQ with bowtie2, then sort the BAM via ``samtools_sort``.

    Args:
        fastq: Path of the input FASTQ; its stripped basename names the
            intermediate BAM.
        ref_index_prefix: Value given to ``-x``.
        multimapping: When truthy, ``-k <multimapping + 1>`` is added.
        nth: Thread count for bowtie2 and ``samtools_sort``.
        mem_gb: Memory setting forwarded to ``samtools_sort``.
        out_dir: Output directory.

    Returns:
        The value returned by ``samtools_sort``.
    """
    stem = os.path.basename(strip_ext_fastq(fastq))
    unsorted_bam = os.path.join(out_dir, stem) + '.bam'

    k_opt = '-k {}'.format(multimapping + 1) if multimapping else ''

    cmd = ('bowtie2 {} --mm --threads {} -x {} '
           '-U {} | samtools view -1 -S /dev/stdin > {}').format(
               k_opt, nth, ref_index_prefix, fastq, unsorted_bam)
    run_shell_cmd(cmd)

    bam = samtools_sort(unsorted_bam, nth, mem_gb, out_dir)
    # The unsorted BAM is only an intermediate.
    rm_f(unsorted_bam)

    return bam
コード例 #20
0
def bwa_se(fastq, ref_index_prefix, nth, mem_gb, out_dir):
    """Align one FASTQ with ``bwa aln`` + ``bwa samse`` and sort the BAM.

    Args:
        fastq: Path of the input FASTQ; its stripped basename names the
            intermediate BAM.
        ref_index_prefix: Reference index prefix given to bwa.
        nth: Thread count for ``bwa_aln``, the samtools resource parameters
            and ``samtools_sort``.
        mem_gb: Memory setting forwarded to ``samtools_sort``.
        out_dir: Output directory.

    Returns:
        The value returned by ``samtools_sort``.
    """
    basename = os.path.basename(strip_ext_fastq(fastq))
    prefix = os.path.join(out_dir, basename)
    tmp_bam = '{}.bam'.format(prefix)

    sai = bwa_aln(fastq, ref_index_prefix, nth, out_dir)

    run_shell_cmd(
        'bwa samse {ref} {sai} {fastq} | '
        'samtools view -bS /dev/stdin {res_param} > {tmp_bam}'.format(
            ref=ref_index_prefix,
            sai=sai,
            fastq=fastq,
            res_param=get_samtools_res_param('view', nth=nth),
            tmp_bam=tmp_bam,
        ))
    # The .sai file is only an intermediate of the two-step alignment.
    rm_f(sai)

    # Forward out_dir, as the bowtie2_se / bowtie2_pe wrappers do, so the
    # sorted BAM is directed to the requested output directory rather than
    # to samtools_sort's default location.
    bam = samtools_sort(tmp_bam, nth, mem_gb, out_dir)
    rm_f(tmp_bam)

    return bam
コード例 #21
0
def bowtie2_pe(fastq1, fastq2, ref_index_prefix, multimapping, local, nth,
               mem_gb, out_dir):
    """Align a FASTQ pair with bowtie2, then sort the BAM via ``samtools_sort``.

    Args:
        fastq1: Path of the R1 FASTQ; its stripped basename names the
            intermediate BAM.
        fastq2: Path of the R2 FASTQ.
        ref_index_prefix: Value given to ``-x``.
        multimapping: When truthy, ``-k <multimapping + 1>`` is added.
        local: When truthy, ``--local`` is added.
        nth: Thread count for bowtie2 and ``samtools_sort``.
        mem_gb: Memory setting forwarded to ``samtools_sort``.
        out_dir: Output directory.

    Returns:
        The value returned by ``samtools_sort``.
    """
    stem = os.path.basename(strip_ext_fastq(fastq1))
    unsorted_bam = os.path.join(out_dir, stem) + '.bam'

    k_opt = '-k {}'.format(multimapping + 1) if multimapping else ''
    mode_opt = '--local ' if local else ''

    cmd = ('bowtie2 {} -X2000 {} --mm --threads {} -x {} '
           '-1 {} -2 {} | samtools view -1 -S /dev/stdin > {}').format(
               k_opt, mode_opt, nth, ref_index_prefix, fastq1, fastq2,
               unsorted_bam)
    run_shell_cmd(cmd)

    bam = samtools_sort(unsorted_bam, nth, mem_gb, out_dir)
    # The unsorted BAM is only an intermediate.
    rm_f(unsorted_bam)

    return bam
コード例 #22
0
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, mem_gb, use_bwa_mem_for_pe,
           bwa_mem_read_len_limit, rescue_reads_for_bwa_mem, out_dir):
    """Align a FASTQ pair with bwa and write a sorted BAM.

    ``bwa mem`` is used when ``use_bwa_mem_for_pe`` is truthy and the read
    length reported for ``fastq1`` is at least ``bwa_mem_read_len_limit``;
    otherwise each mate goes through ``bwa aln`` followed by ``bwa sampe``.
    Read names whose CIGAR lengths disagree with the sequence length are
    collected in ``<prefix>.badReads`` and filtered out before sorting.

    Args:
        fastq1: Path of the R1 FASTQ; its stripped basename names the outputs.
        fastq2: Path of the R2 FASTQ.
        ref_index_prefix: Reference index prefix given to bwa.
        nth: Thread count.
        mem_gb: Memory setting used for the samtools sort resource parameters.
        use_bwa_mem_for_pe: Enables the ``bwa mem`` branch.
        bwa_mem_read_len_limit: Minimum read length for the ``bwa mem`` branch.
        rescue_reads_for_bwa_mem: When truthy, ``-P`` is added to ``bwa mem``.
        out_dir: Output directory.

    Returns:
        Path of the sorted BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    sam = prefix + '.sam'  # gzip-compressed despite the extension
    badcigar = prefix + '.badReads'
    bam = prefix + '.bam'

    read_len = get_read_length(fastq1)
    log.info('Guessed read length of R1 FASTQ: {read_len}'.format(
        read_len=read_len))

    if use_bwa_mem_for_pe and read_len >= bwa_mem_read_len_limit:
        log.info('Use bwa mem.')

        rescue_opt = '-P' if rescue_reads_for_bwa_mem else ''
        align_cmd = ('bwa mem -M {} -t {} {} {} {} '
                     '| gzip -nc > {}').format(rescue_opt, nth,
                                               ref_index_prefix, fastq1,
                                               fastq2, sam)
        temp_files = [sam]
    else:
        log.info('Use bwa aln for each (R1 and R2) and then bwa sampe.')
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)

        align_cmd = ('bwa sampe {ref} {sai1} {sai2} {fq1} {fq2} '
                     '| gzip -nc > {sam}').format(ref=ref_index_prefix,
                                                  sai1=sai1, sai2=sai2,
                                                  fq1=fastq1, fq2=fastq2,
                                                  sam=sam)
        temp_files = [sai1, sai2, sam]
    run_shell_cmd(align_cmd)

    # For every non-header record with a CIGAR other than "*": drop the
    # deletion ops, sum the remaining op lengths and print the read name
    # when that sum differs from the length of the sequence column.
    find_bad_cigar = (
        'zcat -f {sam} | '
        'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
        '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
        'n = split(cigar,vals,"[A-Z]"); s = 0; '
        'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
        'if (s!=seqlen) print $1"\\t"; }}\' | '
        'sort | uniq > {badcigar}'
    ).format(sam=sam, badcigar=badcigar)
    run_shell_cmd(find_bad_cigar)

    # Remove bad CIGAR read pairs
    has_bad_reads = get_num_lines(badcigar) > 0
    res_param = get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)
    if has_bad_reads:
        sort_cmd = (
            'zcat -f {sam} | grep -v -F -f {badcigar} | '
            'samtools view -Su /dev/stdin | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
        ).format(sam=sam, badcigar=badcigar, bam=bam, prefix=prefix,
                 res_param=res_param)
    else:
        sort_cmd = (
            'samtools view -Su {sam} | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
        ).format(sam=sam, bam=bam, prefix=prefix, res_param=res_param)
    run_shell_cmd(sort_cmd)

    rm_f(temp_files)
    return bam
コード例 #23
0
def create_job_file_pe(samplefile1, samplefile2, adapters, out_dir,
                       trim_reads):
    """Write the bash job script that processes one paired-end sample.

    The script consists of a ``#BSUB`` header, module/environment setup,
    shell variables pointing at tools and reference files, and one command
    group per step: trimming, FastQC, Bowtie2 mapping, Picard, filtering,
    fragment-size selection with the Tn5 shift, and footprinting. Sample
    files live under ``<out_dir>/<R1 basename without its FASTQ extension>/``.

    Args:
        samplefile1: R1 FASTQ path; names the job and the sample directory.
        samplefile2: R2 FASTQ path.
        adapters: Value passed to ``--adapters`` of the trimming wrapper.
        out_dir: Directory the sample prefix is created under.
        trim_reads: Value passed to ``--trim-reads`` of the trimming wrapper.

    Returns:
        Path of the written script, ``<prefix>.sh``.
    """
    basename1 = os.path.basename(strip_ext_fastq(samplefile1))
    basename2 = os.path.basename(strip_ext_fastq(samplefile2))
    prefix = os.path.join(out_dir, basename1)
    # Sample directory with a trailing slash, as embedded in the commands.
    work_dir = prefix + '/'
    trimmed1 = work_dir + basename1 + '.trim.paired.fastq.gz'
    trimmed2 = work_dir + basename2 + '.trim.paired.fastq.gz'

    job_header = (
        '#!/bin/bash\n'
        '#BSUB -P QFATACseq\n'
        '#BSUB -J {name}_QFATACseq\n'
        '#BSUB -oo {p}/ATACseqJ1log.out\n'
        '#BSUB -eo {p}/ATACseqJ1log.err\n'
        '#BSUB -n 1\n'
        '#BSUB -N [email protected]\n'
    ).format(name=basename1, p=prefix)

    ### Load all the modules required for the analysis:
    module1 = 'module load trimmomatic/0.36\n'
    module1 += 'module load conda3/5.1.0\n'
    module1 += 'module load R/3.6.3\n'
    module1 += 'source activate /research/rgs01/project_space/yu3grp/software_JY/yu3grp/conda_env/yulab_env_2.7\n'

    # Shell variables referenced as $fastqc, $samtools, ... in the job body.
    apps1 = 'fastqc=/research/rgs01/applications/hpcf/apps/fastqc/install/0.11.5/fastqc\n'
    apps1 += 'Bowtie2=/research/rgs01/applications/hpcf/apps/bowtie/install/2.2.9/bin/bowtie2\n'
    apps1 += 'samtools=/research/rgs01/applications/hpcf/apps/samtools/install/1.2/bin/samtools\n'
    apps1 += 'java=/research/rgs01/applications/hpcf/apps/java/jdk1.8.0_66/bin/java\n'
    apps1 += 'picard=/research/rgs01/applications/hpcf/apps/picard/install/2.16.0/picard.jar\n'
    apps1 += 'bedtools=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/yulab_apps/apps/bedtools2/bin/bedtools\n'
    apps1 += 'macs2=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/conda_env/yulab_env_2.7/bin/macs2\n'
    apps1 += 'genomeDir=/research/projects/yu3grp/scRNASeq/yu3grp/qpan/Database/References/mm10/Gencode/Bowtie2/mm10\n'
    apps1 += 'blacklist=/research/rgs01/project_space/yu3grp/software_JY/yu3grp/yulab_databases/ENCODE_blacklist/mm10-blacklist.v2.bed\n'
    apps1 += 'mm10=/research/rgs01/project_space/yu3grp/scRNASeq/yu3grp/qpan/Database/References/mm10/Gencode/GRCm38.primary_assembly.genome.fa.size\n'

    ### Write job body to run each wrapper for the sample:

    log.info("Trimming with trimmomatic...")
    job_body1 = 'python /home/xzhen/pipelines/atac_pipeline/bin/trimmomatic_run.py --fastqs {} {} --nth 8 --adapters {} --out-dir {} --paired-end --trim-reads {}\n'
    job_body1 = job_body1.format(samplefile1, samplefile2, adapters, prefix,
                                 trim_reads)

    log.info("Doing Fastqc...")
    job_body2 = '$fastqc {fq1} -o {p}\n'
    job_body2 += '$fastqc {fq2} -o {p}\n'
    job_body2 = job_body2.format(fq1=trimmed1, fq2=trimmed2, p=prefix)

    log.info("Mapping paired-end files to reference genome...")
    job_body3 = '$Bowtie2 -x $genomeDir --very-sensitive --phred33 -X 2000 --no-mixed --no-discordant -p 8 -1 {fq1} -2 {fq2} | $samtools view -bS - > {d}tmp.aligned.bam'
    job_body3 = job_body3.format(fq1=trimmed1, fq2=trimmed2, d=work_dir)

    log.info("PICARD is processing......")

    job_body4 = '$java -jar $picard AddOrReplaceReadGroups I= {d}tmp.aligned.bam O= {d}tmp.RGadded.bam SO=coordinate RGID={name} RGLB={name} RGPL=ATAC RGPU=Illumina RGSM={name}\n'
    job_body4 += '$java -jar $picard MarkDuplicates I= {d}tmp.RGadded.bam O= {d}02_alignment.raw.bam M= {d}tmp.dupMark.txt'
    job_body4 = job_body4.format(d=work_dir, name=basename1)

    log.info("Filtration is processing...")
    job_body5 = '$samtools view -b -F 780 -q 1 {d}02_alignment.raw.bam > {d}02_alignment.mapped.bam && $samtools flagstat {d}02_alignment.mapped.bam > {d}02_alignment.mapped.bam.txt\n'
    job_body5 += '$samtools view -b -F 1024 {d}02_alignment.mapped.bam > {d}02_alignment.mapped_rmdup.bam && $samtools flagstat {d}02_alignment.mapped_rmdup.bam > {d}02_alignment.mapped_rmdup.bam.txt\n'
    job_body5 += '$bedtools intersect -v -abam {d}02_alignment.mapped_rmdup.bam -b $blacklist > {d}02_alignment.mapped_rmdup_rmBLK.bam\n'
    job_body5 += '$samtools index {d}02_alignment.mapped_rmdup_rmBLK.bam\n'
    job_body5 += '$samtools view -b {d}02_alignment.mapped_rmdup_rmBLK.bam chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chrX chrY > {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.bam\n'
    job_body5 = job_body5.format(d=work_dir)

    log.info("Tn5 shift...")
    # Split reads by the value of SAM column 9 into three size classes.
    job_body6 = "$samtools view -h {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.bam | awk \'substr($0,1,1)==\"@\" || ($9<=146 && $9>=-146)\' | $samtools view -bS - > {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.bam\n"
    job_body6 += "$samtools view -h {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.bam | awk \'substr($0,1,1)==\"@\" || ($9>146 && $9<=307) || ($9<-146 && $9>=-307)\' | $samtools view -bS - > {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoMono.bam\n"
    job_body6 += "$samtools view -h {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.bam | awk \'substr($0,1,1)==\"@\" || ($9>307) || ($9<-307)\' | $samtools view -bS - > {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoTwoplus.bam\n"
    job_body6 += "$samtools sort -n {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.bam {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName\n"
    # Append (+=) rather than assign: the commands above create the
    # name-sorted BAM that bamtobed reads, so they must stay in the script.
    job_body6 += "$bedtools bamtobed -i {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.bam | "
    # Literal awk braces must be doubled ({{ ... }}) because the template
    # goes through str.format.
    job_body6 += 'awk -v OFS="\\t" \'{{if($6==\"+\"){{print $1,$2+4,$3+4,$4,$5,$6}}else if($6==\"-\"){{print $1,$2-5,$3-5,$4,$5,$6}}}}\' > '
    job_body6 += '{d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.bed\n'
    job_body6 += "$bedtools bedtobam -ubam -i {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.bed -g $mm10 > {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.bam\n"
    job_body6 += "$samtools sort {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.bam {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.sortedByPos\n"
    job_body6 += "$samtools index {d}02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.sortedByPos.bam\n"
    job_body6 = job_body6.format(d=work_dir)

    log.info("Footprinting......")
    # NOTE(review): the BAM and narrowPeak arguments of the two rgt-hint
    # commands are relative paths (no sample prefix), and nothing in this
    # script creates 03_NucleosomeFree_peaks.narrowPeak - confirm the working
    # directory and the peak-calling step with the pipeline owner.
    job_body7 = 'conda deactivate\n'
    job_body7 += 'source activate atac \n'
    job_body7 += 'rgt-hint footprinting --atac-seq --paired-end --organism=mm10 --output-location={p} --output-prefix=04_footPrint 02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.sortedByPos.bam 03_NucleosomeFree_peaks.narrowPeak\n'
    job_body7 += 'rgt-hint tracks --bc --bigWig --organism=mm10 02_alignment.mapped_rmdup_rmBLK_mitoFree.nucleoFree.sortedByName.shifted.sortedByPos.bam 03_NucleosomeFree_peaks.narrowPeak --output-prefix={p}/footprintingTracks\n'
    job_body7 += 'rgt-motifanalysis matching --motif-dbs /research/rgs01/project_space/yu3grp/software_JY/yu3grp/git_repo/ATACseq_pipeline/data/rgtdata/motifs/transfac_mouse --input-files {p}/04_footPrint.bed --output-location={p} --organism=mm10\n'
    job_body7 += 'Rscript /research/rgs01/project_space/yu3grp/software_JY/yu3grp/git_repo/ATACseq_pipeline/scripts/peakAnnotation.R {p}/04_footPrint_mpbs.bed mm10\n'
    job_body7 += 'perl /research/rgs01/project_space/yu3grp/software_JY/yu3grp/git_repo/ATACseq_pipeline/scripts/annotationToNetwork.pl {p}/04_footPrint_mpbs.annotate.txt {p}/04_networkATACseq.txt\n'
    job_body7 = job_body7.format(p=prefix)

    # The header runs straight into the module section; every later section
    # is separated from the previous one by a newline.
    sections = [module1, apps1, job_body1, job_body2, job_body3, job_body4,
                job_body5, job_body6, job_body7]

    jobfile = prefix + ".sh"
    with open(jobfile, "w") as new_file:
        new_file.write(job_header + '\n'.join(sections) + '\n')
    return jobfile