def make_read_length_file(fastq, out_dir):
    """Store the read length of a FASTQ in a small text file.

    The output file is placed in ``out_dir`` and is named after the FASTQ
    (as returned by ``strip_ext_fastq``) with a ``.read_length.txt`` suffix.
    Its only content is the value returned by ``get_read_length(fastq)``
    converted with ``str``.

    Args:
        fastq: Path of the FASTQ file to inspect.
        out_dir: Directory in which the text file is created.

    Returns:
        Path of the text file that was written.
    """
    stem = os.path.basename(strip_ext_fastq(fastq))
    out_txt = '{}.read_length.txt'.format(os.path.join(out_dir, stem))

    # Determine the length before opening the output so that a failure
    # here does not leave an empty file behind.
    length = get_read_length(fastq)
    with open(out_txt, 'w') as handle:
        handle.write(str(length))

    return out_txt
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, use_bwa_mem_for_pe, out_dir):
    """Align a pair of FASTQs with bwa and return the resulting BAM path.

    Steps, each run through ``run_shell_cmd``:

    1. Alignment. ``bwa mem`` is used when ``use_bwa_mem_for_pe`` is truthy
       and the read length of ``fastq1`` is at least 70; otherwise each
       FASTQ goes through ``bwa_aln`` and the two results are combined with
       ``bwa sampe``. The output is piped through ``gzip -nc`` into
       ``<prefix>.sam``.
    2. Bad-CIGAR detection. An awk script sums the numeric parts of each
       CIGAR (after dropping ``<n>D`` operations) and writes the read name
       to ``<prefix>.badReads`` when that sum differs from the length of
       column 10.
    3. Conversion. Listed reads, if any, are removed with ``grep -v -F -f``
       before ``samtools view`` / ``samtools sort`` produce
       ``<prefix>.bam``.

    ``<prefix>`` is ``out_dir`` joined with the basename of ``fastq1`` as
    returned by ``strip_ext_fastq``. The intermediate SAM (and the .sai
    files in the aln/sampe path) are passed to ``rm_f`` at the end.

    Args:
        fastq1: Path of the R1 FASTQ.
        fastq2: Path of the R2 FASTQ.
        ref_index_prefix: Prefix of the bwa index passed to bwa.
        nth: Thread count passed to ``bwa mem -t`` and to ``bwa_aln``.
        use_bwa_mem_for_pe: Allow ``bwa mem`` when reads are long enough.
        out_dir: Directory that receives all output files.

    Returns:
        Path of the BAM file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    sam = '{}.sam'.format(prefix)
    badcigar = '{}.badReads'.format(prefix)
    bam = '{}.bam'.format(prefix)

    read_len = get_read_length(fastq1)
    if use_bwa_mem_for_pe and read_len >= 70:
        align_cmd = 'bwa mem -M -t {} {} {} {} | gzip -nc > {}'.format(
            nth, ref_index_prefix, fastq1, fastq2, sam)
        temp_files = [sam]
    else:
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)
        align_cmd = 'bwa sampe {} {} {} {} {} | gzip -nc > {}'.format(
            ref_index_prefix, sai1, sai2, fastq1, fastq2, sam)
        temp_files = [sai1, sai2, sam]
    run_shell_cmd(align_cmd)

    # Collect names of reads whose CIGAR length disagrees with the sequence
    # length. The trailing tab printed after the name is part of the
    # pattern later handed to grep.
    find_bad_cigar_cmd = (
        'zcat -f {} | '
        'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
        '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
        'n = split(cigar,vals,"[A-Z]"); s = 0; '
        'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
        'if (s!=seqlen) print $1"\\t"; }}\' | '
        'sort | uniq > {}'
    ).format(sam, badcigar)
    run_shell_cmd(find_bad_cigar_cmd)

    # Remove bad CIGAR read pairs (only when some were found), then sort.
    if get_num_lines(badcigar) > 0:
        view_stage = (
            'zcat -f {} | grep -v -F -f {} | samtools view -Su - | '
        ).format(sam, badcigar)
    else:
        view_stage = 'samtools view -Su {} | '.format(sam)
    sort_stage = 'samtools sort - -o {} -T {}'.format(bam, prefix)
    run_shell_cmd(view_stage + sort_stage)

    rm_f(temp_files)
    return bam
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, mem_gb, use_bwa_mem_for_pe,
           bwa_mem_read_len_limit, rescue_reads_for_bwa_mem, out_dir):
    """Align a pair of FASTQs with bwa and return the resulting BAM path.

    Steps, each run through ``run_shell_cmd``:

    1. Alignment. ``bwa mem`` is used when ``use_bwa_mem_for_pe`` is truthy
       and the read length of ``fastq1`` is at least
       ``bwa_mem_read_len_limit`` (``-P`` is added when
       ``rescue_reads_for_bwa_mem`` is truthy); otherwise each FASTQ goes
       through ``bwa_aln`` and the two results are combined with
       ``bwa sampe``. The output is piped through ``gzip -nc`` into
       ``<prefix>.sam``.
    2. Bad-CIGAR detection. An awk script sums the numeric parts of each
       CIGAR (after dropping ``<n>D`` operations) and writes the read name
       to ``<prefix>.badReads`` when that sum differs from the length of
       column 10.
    3. Conversion. Listed reads, if any, are removed with ``grep -v -F -f``
       before ``samtools view`` / ``samtools sort`` produce
       ``<prefix>.bam``. Extra sort options come from
       ``get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)``.

    ``<prefix>`` is ``out_dir`` joined with the basename of ``fastq1`` as
    returned by ``strip_ext_fastq``. The intermediate SAM (and the .sai
    files in the aln/sampe path) are passed to ``rm_f`` at the end.

    Args:
        fastq1: Path of the R1 FASTQ.
        fastq2: Path of the R2 FASTQ.
        ref_index_prefix: Prefix of the bwa index passed to bwa.
        nth: Thread count for bwa and for the samtools resource parameters.
        mem_gb: Memory figure forwarded to ``get_samtools_res_param``.
        use_bwa_mem_for_pe: Allow ``bwa mem`` when reads are long enough.
        bwa_mem_read_len_limit: Minimum read length for using ``bwa mem``.
        rescue_reads_for_bwa_mem: Add ``-P`` to the ``bwa mem`` command.
        out_dir: Directory that receives all output files.

    Returns:
        Path of the BAM file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    sam = '{}.sam'.format(prefix)
    badcigar = '{}.badReads'.format(prefix)
    bam = '{}.bam'.format(prefix)

    read_len = get_read_length(fastq1)
    log.info('Guessed read length of R1 FASTQ: {read_len}'.format(
        read_len=read_len))

    if use_bwa_mem_for_pe and read_len >= bwa_mem_read_len_limit:
        log.info('Use bwa mem.')
        extra_param = '-P' if rescue_reads_for_bwa_mem else ''
        align_cmd = (
            'bwa mem -M {extra_param} -t {nth} {ref_index_prefix} '
            '{fastq1} {fastq2} | gzip -nc > {sam}'
        ).format(
            extra_param=extra_param,
            nth=nth,
            ref_index_prefix=ref_index_prefix,
            fastq1=fastq1,
            fastq2=fastq2,
            sam=sam,
        )
        temp_files = [sam]
    else:
        log.info('Use bwa aln for each (R1 and R2) and then bwa sampe.')
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)
        align_cmd = 'bwa sampe {} {} {} {} {} | gzip -nc > {}'.format(
            ref_index_prefix, sai1, sai2, fastq1, fastq2, sam)
        temp_files = [sai1, sai2, sam]
    run_shell_cmd(align_cmd)

    # Collect names of reads whose CIGAR length disagrees with the sequence
    # length. The trailing tab printed after the name is part of the
    # pattern later handed to grep.
    find_bad_cigar_cmd = (
        'zcat -f {} | '
        'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
        '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
        'n = split(cigar,vals,"[A-Z]"); s = 0; '
        'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
        'if (s!=seqlen) print $1"\\t"; }}\' | '
        'sort | uniq > {}'
    ).format(sam, badcigar)
    run_shell_cmd(find_bad_cigar_cmd)

    # Remove bad CIGAR read pairs (only when some were found), then sort.
    has_bad_reads = get_num_lines(badcigar) > 0
    res_param = get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)
    if has_bad_reads:
        view_stage = (
            'zcat -f {sam} | grep -v -F -f {badcigar} | '
            'samtools view -Su /dev/stdin'
        ).format(sam=sam, badcigar=badcigar)
    else:
        view_stage = 'samtools view -Su {sam}'.format(sam=sam)
    sort_stage = (
        ' | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
    ).format(bam=bam, prefix=prefix, res_param=res_param)
    run_shell_cmd(view_stage + sort_stage)

    rm_f(temp_files)
    return bam