import os
import subprocess
import traceback

import pysam

# NOTE: 'log', 'utils', and 'mapping' (used further below) are package-local modules;
# the plain-import form here is assumed from how they are referenced in this file.
import log
import utils


def checking(args, params, filenames):
    log.logger.debug('started.')
    try:
        read_num_limit = params.quick_check_read_num
        # classification thresholds: fraction of analyzed unmapped reads that map to HHV-6
        no_hhv_threshold = 3 / 1000000
        need_check_threshold = 20 / 1000000
        dr_threshold = 150 / 1000000
        n_false = 0
        n_need_check = 0
        n_dr = 0
        n_full = 0
        finalfile = open(filenames.final_result, 'w')
        finalfile.write('#file\tnum_unmapped_read_analyzed\tnum_read_mapped_to_HHV6\tHHV6_exists?\n')
        for f in filenames.fpaths:
            if args.file_type == 'rb':
                infile = pysam.AlignmentFile(f, 'rb', check_sq=False)
            elif args.file_type == 'rc':
                infile = pysam.AlignmentFile(f, 'rc', reference_filename=args.fa)
            # retrieve unmapped reads, skipping reads that contain telomeric repeats
            n = 0
            with open(filenames.unmapped, 'w') as outfile:
                tmp = []
                for read in infile.fetch('*', until_eof=True):
                    if read.is_unmapped:
                        if 'TAACCC' not in read.query_sequence and 'GGGTTA' not in read.query_sequence:
                            if read.is_read1:
                                header = '@%s/1' % read.query_name
                            else:
                                header = '@%s/2' % read.query_name
                            tmp.append('%s\n%s\n+\n%s\n' % (header, read.query_sequence, read.qual))
                            n += 1
                            if len(tmp) == 100_000:
                                outfile.write(''.join(tmp))
                                tmp = []
                            if n == read_num_limit:
                                break
                if len(tmp) >= 1:
                    outfile.write(''.join(tmp))
                outfile.flush()
                os.fdatasync(outfile.fileno())
            infile.close()
            if n == 0:
                log.logger.info('No unmapped reads found in %s. Will continue anyway.' % f)
                finalfile.write('%s\t%d\tNA\tNA\n' % (f, n))
                utils.gzip_or_del(args, params, filenames.unmapped)
                continue
            elif n < read_num_limit:
                log.logger.warning('Only %d unmapped reads were found in %s. Will continue anyway.' % (n, f))
            # map the retrieved reads to the viral reference with hisat2
            cmd = 'hisat2 --mp %s -t -x %s -p %d -U %s --no-spliced-alignment > %s' % (
                params.hisat2_mismatch_penalties, args.vrefindex, args.p,
                filenames.unmapped, filenames.mapped_sam)
            out = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE)
            log.logger.debug('\n' + '\n'.join([l.decode() for l in out.stderr.splitlines()]))
            if out.returncode != 0:
                log.logger.error('Error occurred during mapping.')
                exit(1)
            utils.gzip_or_del(args, params, filenames.unmapped)
            # count reads that mapped over their full length (CIGAR is a single full-length match, e.g. 150M)
            mapped_n = 0
            with open(filenames.mapped_sam) as infile:
                for line in infile:
                    if not line.startswith('@'):
                        ls = line.split()
                        if ls[5] != '*':
                            readlen = len(ls[9])
                            if ls[5] == '%dM' % readlen:
                                mapped_n += 1
            # classify the sample by the fraction of analyzed reads that mapped to HHV-6
            mapped_ratio = mapped_n / n
            if mapped_ratio < no_hhv_threshold:
                judge = 'False'
                n_false += 1
            elif mapped_ratio < need_check_threshold:
                judge = 'Need_further_check'
                n_need_check += 1
            elif mapped_ratio < dr_threshold:
                judge = 'likely_solo-DR'
                n_dr += 1
            else:
                judge = 'likely_Full-length'
                n_full += 1
            finalfile.write('%s\t%d\t%d\t%s\n' % (f, n, mapped_n, judge))
            utils.gzip_or_del(args, params, filenames.mapped_sam)
        finalfile.flush()
        os.fdatasync(finalfile.fileno())
        finalfile.close()
        log.logger.info(
            '\n\n\033[34mQuick check result:\n\n'
            '  No HHV-6           = %d\n'
            '  Need check         = %d\n'
            '  Likely solo-DR     = %d\n'
            '  Likely Full-length = %d\033[0m\n\n'
            '  \033[31mCaveat: this is an estimate intended for screening only; it is not a conclusive result.\033[0m\n'
            % (n_false, n_need_check, n_dr, n_full))
    except:
        log.logger.error('\n' + traceback.format_exc())
        exit(1)
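# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the pipeline): the classification logic
# used in checking() above, isolated as a pure function. The function name and
# the example numbers in the comment are hypothetical; the thresholds are the
# ones defined in checking().
def _classify_mapped_ratio(mapped_n, analyzed_n):
    ratio = mapped_n / analyzed_n
    if ratio < 3 / 1000000:
        return 'False'                # essentially no reads map to HHV-6
    elif ratio < 20 / 1000000:
        return 'Need_further_check'
    elif ratio < 150 / 1000000:
        return 'likely_solo-DR'       # signal consistent with a solo-DR integration
    else:
        return 'likely_Full-length'   # signal consistent with a full-length integration

# e.g. 30 perfectly matching reads out of 1,000,000 analyzed unmapped reads gives
# a ratio of 30/1e6, which falls between 20/1e6 and 150/1e6 -> 'likely_solo-DR'.
# ---------------------------------------------------------------------------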
elif args.fastqin is True:
    log.logger.info('Unmapped read retrieval skipped. Read1=%s, read2=%s.' % (args.fq1, args.fq2))
    if args.single is False:
        filenames.unmapped_merged_1 = args.fq1
        filenames.unmapped_merged_2 = args.fq2
    else:
        filenames.unmapped_merged_1 = args.fq1

# 1. mapping
import mapping
log.logger.info('Mapping of unmapped reads started.')
mapping.map_to_viruses(args, params, filenames)
if args.alignmentin is True:
    utils.gzip_or_del(args, params, filenames.unmapped_merged_1)
    utils.gzip_or_del(args, params, filenames.unmapped_merged_2)

if (args.ONT_bamin is False and mapping.read_mapped is True) or args.ONT_bamin is True:
    if args.ONT_bamin is True:
        import mapping
        filenames.mapped_to_virus_bam = args.ONT_bam
    if args.remove_chr_with_no_read is True:
        log.logger.info('Removing chrs without reads.')
        mapping.remove_chrs_no_read(args, params, filenames, hhv6a_refid, hhv6b_refid)
    log.logger.info('BAM to bedgraph conversion started.')
    mapping.bam_to_bedgraph(args, params, filenames)
    # 2. identify high coverage viruses
def retrieve_unmapped_reads(args, params, filenames):
    log.logger.debug('started.')
    try:
        # reserve one thread for output handling when more than two threads are available
        if args.p <= 2:
            thread_n = args.p
        elif args.p >= 3:
            thread_n = args.p - 1
        # retrieve discordant reads, default:
        # keep pairs in which both mates are unmapped (-f 12) and drop proper-pair/
        # secondary/QC-fail/duplicate/supplementary records (-F 3842)
        if args.use_mate_mapped is False and args.all_discordant is False:
            if args.b is not None:
                pysam.view('-@', '%d' % thread_n, '-f', '12', '-F', '3842', '-b',
                           '-o', filenames.discordant_bam, args.b, catch_stdout=False)
            elif args.c is not None:
                pysam.view('-@', '%d' % thread_n, '-f', '12', '-F', '3842', '-b',
                           '-o', filenames.discordant_bam, '--reference', args.fa, args.c,
                           catch_stdout=False)
            pysam.fastq('-@', '%d' % thread_n, '-N', '-0', '/dev/null',
                        '-1', filenames.unmapped_merged_pre1,
                        '-2', filenames.unmapped_merged_pre2,
                        '-s', '/dev/null', filenames.discordant_bam)
            if args.keep is False:
                os.remove(filenames.discordant_bam)
        # retrieve discordant reads, non-default:
        # start from all paired reads (-f 1), then split by mode below
        else:
            if args.b is not None:
                pysam.view('-@', '%d' % thread_n, '-f', '1', '-F', '3842', '-b',
                           '-o', filenames.discordant_bam, args.b, catch_stdout=False)
            elif args.c is not None:
                pysam.view('-@', '%d' % thread_n, '-f', '1', '-F', '3842', '-b',
                           '-o', filenames.discordant_bam, '--reference', args.fa, args.c,
                           catch_stdout=False)
            pysam.sort('-@', '%d' % thread_n, '-n', '-O', 'BAM',
                       '-o', filenames.discordant_sort_bam, filenames.discordant_bam)
            if args.keep is False:
                os.remove(filenames.discordant_bam)
            if args.all_discordant is True:
                # use every discordant pair as-is
                pysam.fastq('-@', '%d' % thread_n, '-N', '-0', '/dev/null',
                            '-1', filenames.unmapped_merged_pre1,
                            '-2', filenames.unmapped_merged_pre2,
                            '-s', '/dev/null', filenames.discordant_sort_bam)
            else:
                # pairs in which both mates are unmapped
                pysam.fastq('-@', '%d' % thread_n, '-f', '12', '-F', '3328', '-N',
                            '-0', '/dev/null',
                            '-1', filenames.unmapped_1, '-2', filenames.unmapped_2,
                            '-s', '/dev/null', filenames.discordant_sort_bam)
                if args.use_mate_mapped is True:
                    # additionally keep pairs in which exactly one mate is unmapped:
                    # -f 8/-F 3332 = read mapped, mate unmapped; -f 4/-F 3336 = read unmapped, mate mapped
                    pysam.view('-@', '%d' % thread_n, '-f', '8', '-F', '3332', '-b',
                               '-o', filenames.unmapped_bam_3, filenames.discordant_sort_bam,
                               catch_stdout=False)
                    pysam.view('-@', '%d' % thread_n, '-f', '4', '-F', '3336', '-b',
                               '-o', filenames.unmapped_bam_4, filenames.discordant_sort_bam,
                               catch_stdout=False)
                    pysam.merge('-@', '%d' % thread_n, '-f', filenames.unmapped_bam_34,
                                filenames.unmapped_bam_3, filenames.unmapped_bam_4)
                    pysam.sort('-@', '%d' % thread_n, '-n', '-O', 'BAM',
                               '-o', filenames.unmapped_sorted_34, filenames.unmapped_bam_34)
                    pysam.fastq('-@', '%d' % thread_n, '-N', '-0', '/dev/null',
                                '-1', filenames.unmapped_3, '-2', filenames.unmapped_4,
                                '-s', '/dev/null', filenames.unmapped_sorted_34)
                # concatenate fastq
                with open(filenames.unmapped_merged_pre1, 'w') as outfile:
                    for f in [filenames.unmapped_1, filenames.unmapped_3]:
                        if os.path.exists(f) is True:
                            with open(f) as infile:
                                for line in infile:
                                    outfile.write(line)
                            utils.gzip_or_del(args, params, f)
                with open(filenames.unmapped_merged_pre2, 'w') as outfile:
                    for f in [filenames.unmapped_2, filenames.unmapped_4]:
                        if os.path.exists(f) is True:
                            with open(f) as infile:
                                for line in infile:
                                    outfile.write(line)
                            utils.gzip_or_del(args, params, f)
        # remove read pairs in which either mate is shorter than min_seq_len
        infile1 = open(filenames.unmapped_merged_pre1)
        infile2 = open(filenames.unmapped_merged_pre2)
        outfile1 = open(filenames.unmapped_merged_1, 'w')
        outfile2 = open(filenames.unmapped_merged_2, 'w')
        min_seq_len = params.min_seq_len
        tmp1, tmp2 = [], []
        for line1, line2 in zip(infile1, infile2):
            tmp1.append(line1)
            tmp2.append(line2)
            if len(tmp1) == 4:  # one full FASTQ record buffered from each file
                seqlen1 = len(tmp1[1].strip())
                seqlen2 = len(tmp2[1].strip())
                if seqlen1 >= min_seq_len and seqlen2 >= min_seq_len:
                    outfile1.write(''.join(tmp1))
                    outfile2.write(''.join(tmp2))
                tmp1, tmp2 = [], []
        infile1.close()
        infile2.close()
        outfile1.close()
        outfile2.close()
        utils.gzip_or_del(args, params, filenames.unmapped_merged_pre1)
        utils.gzip_or_del(args, params, filenames.unmapped_merged_pre2)
        if args.keep is False:
            if os.path.exists(filenames.discordant_sort_bam) is True:
                os.remove(filenames.discordant_sort_bam)
            if args.use_mate_mapped is True:
                os.remove(filenames.unmapped_bam_3)
                os.remove(filenames.unmapped_bam_4)
                os.remove(filenames.unmapped_bam_34)
                os.remove(filenames.unmapped_sorted_34)
    except:
        log.logger.error('\n' + traceback.format_exc())
        exit(1)
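# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the pipeline): how the numeric -f/-F
# values passed to pysam.view()/pysam.fastq() above decompose into SAM FLAG
# bits. Bit values follow the SAM specification; the helper below is
# hypothetical and exists only to document the filters.
_SAM_FLAG_BITS = {
    1: 'PAIRED', 2: 'PROPER_PAIR', 4: 'UNMAP', 8: 'MUNMAP',
    256: 'SECONDARY', 512: 'QCFAIL', 1024: 'DUP', 2048: 'SUPPLEMENTARY',
}

def _describe_sam_flag(value):
    """Return the names of the SAM FLAG bits set in `value` (illustration only)."""
    return [name for bit, name in sorted(_SAM_FLAG_BITS.items()) if value & bit]

# -f 12   -> UNMAP + MUNMAP (4 + 8): both mates unmapped
# -F 3842 -> PROPER_PAIR + SECONDARY + QCFAIL + DUP + SUPPLEMENTARY
#            (2 + 256 + 512 + 1024 + 2048): drop proper pairs and non-primary records
# -F 3328 -> SECONDARY + DUP + SUPPLEMENTARY (256 + 1024 + 2048)
# -f 8 with -F 3332 (3328 + UNMAP)  -> read mapped, mate unmapped
# -f 4 with -F 3336 (3328 + MUNMAP) -> read unmapped, mate mapped
# ---------------------------------------------------------------------------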