def checking(args, params, filenames):
    """Screen each input alignment file for reads that map to the HHV-6 index.

    For every path in ``filenames.fpaths`` the function:

    1. writes up to ``params.quick_check_read_num`` unmapped reads whose
       sequence contains neither ``TAACCC`` nor ``GGGTTA`` to
       ``filenames.unmapped`` as FASTQ;
    2. maps that FASTQ with ``hisat2`` against ``args.vrefindex`` into
       ``filenames.mapped_sam``;
    3. counts alignments whose CIGAR is exactly ``<read length>M``;
    4. appends one verdict line to ``filenames.final_result`` based on the
       ratio of counted alignments to analysed reads.

    Args:
        args: Parsed options. Reads ``file_type`` ('rb' or 'rc'), ``fa``
            (used as reference when ``file_type`` is 'rc'), ``vrefindex``
            and ``p`` (passed to ``hisat2 -p``).
        params: Run parameters. Reads ``quick_check_read_num`` and
            ``hisat2_mismatch_penalties``.
        filenames: Path container. Reads ``final_result``, ``fpaths``,
            ``unmapped`` and ``mapped_sam``.

    Raises:
        SystemExit: With status 1 when mapping fails or any other error
            occurs (the traceback is logged first).
    """
    log.logger.debug('started.')
    try:
        read_num_limit = params.quick_check_read_num
        # Verdict cut-offs on (counted alignments) / (reads analysed).
        no_hhv_threshold = 3 / 1000000
        need_check_threshold = 20 / 1000000
        dr_threshold = 150 / 1000000
        n_false = 0
        n_need_check = 0
        n_dr = 0
        n_full = 0

        # The context manager guarantees the report is closed on every path.
        with open(filenames.final_result, 'w') as finalfile:
            finalfile.write(
                '#file\tnum_unmapped_read_analyzed\tnum_read_mapped_to_HHV6\tHHV6_exists?\n'
            )
            for f in filenames.fpaths:
                if args.file_type == 'rb':
                    infile = pysam.AlignmentFile(f, 'rb', check_sq=False)
                elif args.file_type == 'rc':
                    infile = pysam.AlignmentFile(f,
                                                 'rc',
                                                 reference_filename=args.fa)
                else:
                    # Fail with a clear message instead of an unbound (or
                    # stale) `infile` further down.
                    raise ValueError(
                        "Unsupported file_type %r; expected 'rb' or 'rc'." %
                        (args.file_type, ))
                n = 0
                try:
                    with open(filenames.unmapped, 'w') as outfile:
                        tmp = []
                        for read in infile.fetch('*', until_eof=True):
                            if read.is_unmapped:
                                seq = read.query_sequence
                                # query_sequence is None when the record
                                # stores no bases; such reads are skipped.
                                if (seq is not None and 'TAACCC' not in seq
                                        and 'GGGTTA' not in seq):
                                    if read.is_read1 is True:
                                        header = '@%s/1' % read.query_name
                                    else:
                                        header = '@%s/2' % read.query_name
                                    tmp.append('%s\n%s\n+\n%s\n' %
                                               (header, seq, read.qual))
                                    n += 1
                            # Write in batches to bound memory use.
                            if len(tmp) == 100_000:
                                outfile.write(''.join(tmp))
                                tmp = []
                            if n == read_num_limit:
                                break
                        if len(tmp) >= 1:
                            outfile.write(''.join(tmp))
                        # Make sure the FASTQ is on disk before hisat2 reads it.
                        outfile.flush()
                        os.fdatasync(outfile.fileno())
                finally:
                    infile.close()
                if n == 0:
                    # Only the file name is interpolated here; passing
                    # (n, f) to a single %s raised TypeError.
                    log.logger.info(
                        'No unmapped reads found in %s. Will continue anyway.'
                        % (f, ))
                    finalfile.write('%s\t%d\tNA\tNA\n' % (f, n))
                    utils.gzip_or_del(args, params, filenames.unmapped)
                    continue
                elif n < read_num_limit:
                    log.logger.warning(
                        'Only %d unmapped reads were found in %s. Will continue anyway.'
                        % (n, f))
                # mapping
                cmd = 'hisat2 --mp %s -t -x %s -p %d -U %s --no-spliced-alignment > %s' % (
                    params.hisat2_mismatch_penalties, args.vrefindex, args.p,
                    filenames.unmapped, filenames.mapped_sam)
                out = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE)
                log.logger.debug(
                    '\n' +
                    '\n'.join([l.decode() for l in out.stderr.splitlines()]))
                if not out.returncode == 0:
                    log.logger.error('Error occurred during mapping.')
                    exit(1)
                utils.gzip_or_del(args, params, filenames.unmapped)
                # count mapped: only alignments whose CIGAR is one match run
                # covering the whole read (SAM column 6 vs. length of column 10)
                mapped_n = 0
                with open(filenames.mapped_sam) as samfile:
                    for line in samfile:
                        if line.startswith('@'):
                            continue
                        ls = line.split()
                        if not ls[5] == '*' and ls[5] == '%dM' % len(ls[9]):
                            mapped_n += 1
                # n > 0 here because the n == 0 case continued above.
                mapped_ratio = mapped_n / n
                if mapped_ratio < no_hhv_threshold:
                    judge = 'False'
                    n_false += 1
                elif mapped_ratio < need_check_threshold:
                    judge = 'Need_further_check'
                    n_need_check += 1
                elif mapped_ratio < dr_threshold:
                    judge = 'likely_solo-DR'
                    n_dr += 1
                else:
                    judge = 'likely_Full-length'
                    n_full += 1
                finalfile.write('%s\t%d\t%d\t%s\n' %
                                (f, n, mapped_n, judge))
            # mapped_sam is only written by the hisat2 step, so it does not
            # exist when every input was skipped (or fpaths was empty).
            if os.path.exists(filenames.mapped_sam):
                utils.gzip_or_del(args, params, filenames.mapped_sam)
            finalfile.flush()
            os.fdatasync(finalfile.fileno())
        log.logger.info(
            '\n\n\033[34mQuick check result:\n\n  No HHV-6 = %d\n  Need check = %d\n  Likely solo-DR = %d\n  Likely Full-length = %d\033[0m\n\n  \033[31mCaveats: This result is estimation and only for a screening purpose. This is not a conclusive result.\033[0m\n'
            % (n_false, n_need_check, n_dr, n_full))

    # `Exception` (not a bare except) lets the SystemExit raised by the
    # mapping-failure exit(1) above pass through without a second traceback.
    except Exception:
        log.logger.error('\n' + traceback.format_exc())
        exit(1)
# Example no. 2
# 0
    elif args.fastqin is True:
        log.logger.info(
            'Unmapped read retrieval skipped. Read1=%s, read2=%s.' %
            (args.fq1, args.fq2))
        if args.single is False:
            filenames.unmapped_merged_1 = args.fq1
            filenames.unmapped_merged_2 = args.fq2
        else:
            filenames.unmapped_merged_1 = args.fq1

    # 1. mapping
    import mapping
    log.logger.info('Mapping of unmapped reads started.')
    mapping.map_to_viruses(args, params, filenames)
    if args.alignmentin is True:
        utils.gzip_or_del(args, params, filenames.unmapped_merged_1)
        utils.gzip_or_del(args, params, filenames.unmapped_merged_2)

if (args.ONT_bamin is False
        and mapping.read_mapped is True) or args.ONT_bamin is True:
    if args.ONT_bamin is True:
        import mapping
        filenames.mapped_to_virus_bam = args.ONT_bam
    if args.remove_chr_with_no_read is True:
        log.logger.info('Removing chrs without reads.')
        mapping.remove_chrs_no_read(args, params, filenames, hhv6a_refid,
                                    hhv6b_refid)
    log.logger.info('BAM to bedgraph conversion started.')
    mapping.bam_to_bedgraph(args, params, filenames)

    # 2. identify high coverage viruses
# Example no. 3
# 0
def retrieve_unmapped_reads(args, params, filenames):
    """Extract read pairs from the input alignment as length-filtered FASTQ.

    The selection mode depends on ``args.use_mate_mapped`` and
    ``args.all_discordant``:

    * both False: pairs in which the read and its mate are unmapped
      (``samtools view -f 12``);
    * ``all_discordant`` True: every paired read kept by ``-f 1 -F 3842``;
    * otherwise: the both-unmapped pairs and, when ``use_mate_mapped`` is
      True, also pairs with exactly one unmapped end.

    The resulting FASTQ pair is then filtered so that only pairs in which
    both sequences have at least ``params.min_seq_len`` bases are written
    to ``filenames.unmapped_merged_1`` / ``filenames.unmapped_merged_2``.

    Args:
        args: Parsed options. Reads ``p``, ``b``, ``c``, ``fa``, ``keep``,
            ``use_mate_mapped`` and ``all_discordant``.
        params: Run parameters. Reads ``min_seq_len``; also forwarded to
            ``utils.gzip_or_del``.
        filenames: Path container for the input-derived intermediate and
            output files.

    Raises:
        SystemExit: With status 1 after logging the traceback of any error.
    """
    log.logger.debug('started.')
    try:
        # Use every thread when p <= 2, otherwise p - 1.
        thread_n = args.p if args.p <= 2 else args.p - 1
        threads = '%d' % thread_n
        default_mode = (args.use_mate_mapped is False
                        and args.all_discordant is False)

        # Trailing arguments of the initial `view`: args.b is passed alone,
        # args.c is passed together with --reference args.fa.
        if args.b is not None:
            view_input = [args.b]
        elif args.c is not None:
            view_input = ['--reference', args.fa, args.c]
        else:
            view_input = None

        # -f 12: read and mate unmapped; -f 1: any paired read.
        # -F 3842 drops proper-pair (2), secondary (256), QC-fail (512),
        # duplicate (1024) and supplementary (2048) records.
        if view_input is not None:
            pysam.view('-@',
                       threads,
                       '-f',
                       '12' if default_mode else '1',
                       '-F',
                       '3842',
                       '-b',
                       '-o',
                       filenames.discordant_bam,
                       *view_input,
                       catch_stdout=False)

        if default_mode:
            # retrieve discordant reads, default
            pysam.fastq('-@', threads, '-N', '-0', '/dev/null', '-1',
                        filenames.unmapped_merged_pre1, '-2',
                        filenames.unmapped_merged_pre2, '-s', '/dev/null',
                        filenames.discordant_bam)
            if args.keep is False:
                os.remove(filenames.discordant_bam)
        else:
            # retrieve discordant reads, non-default
            pysam.sort('-@', threads, '-n', '-O', 'BAM', '-o',
                       filenames.discordant_sort_bam, filenames.discordant_bam)
            if args.keep is False:
                os.remove(filenames.discordant_bam)
            if args.all_discordant is True:
                pysam.fastq('-@', threads, '-N', '-0', '/dev/null', '-1',
                            filenames.unmapped_merged_pre1, '-2',
                            filenames.unmapped_merged_pre2, '-s', '/dev/null',
                            filenames.discordant_sort_bam)
            else:
                # Both ends unmapped; -F 3328 drops secondary, duplicate and
                # supplementary records.
                pysam.fastq('-@', threads, '-f', '12', '-F', '3328', '-N',
                            '-0', '/dev/null', '-1', filenames.unmapped_1,
                            '-2', filenames.unmapped_2, '-s', '/dev/null',
                            filenames.discordant_sort_bam)
                if args.use_mate_mapped is True:
                    # -f 8 -F 3332: mate unmapped, read itself mapped.
                    pysam.view('-@',
                               threads,
                               '-f',
                               '8',
                               '-F',
                               '3332',
                               '-b',
                               '-o',
                               filenames.unmapped_bam_3,
                               filenames.discordant_sort_bam,
                               catch_stdout=False)
                    # -f 4 -F 3336: read unmapped, mate mapped.
                    pysam.view('-@',
                               threads,
                               '-f',
                               '4',
                               '-F',
                               '3336',
                               '-b',
                               '-o',
                               filenames.unmapped_bam_4,
                               filenames.discordant_sort_bam,
                               catch_stdout=False)
                    pysam.merge('-@', threads, '-f',
                                filenames.unmapped_bam_34,
                                filenames.unmapped_bam_3,
                                filenames.unmapped_bam_4)
                    pysam.sort('-@', threads, '-n', '-O', 'BAM', '-o',
                               filenames.unmapped_sorted_34,
                               filenames.unmapped_bam_34)
                    pysam.fastq('-@', threads, '-N', '-0', '/dev/null', '-1',
                                filenames.unmapped_3, '-2',
                                filenames.unmapped_4, '-s', '/dev/null',
                                filenames.unmapped_sorted_34)
                # concatenate fastq: read-1 parts, then read-2 parts
                merge_plan = (
                    (filenames.unmapped_merged_pre1,
                     (filenames.unmapped_1, filenames.unmapped_3)),
                    (filenames.unmapped_merged_pre2,
                     (filenames.unmapped_2, filenames.unmapped_4)),
                )
                for merged_path, part_paths in merge_plan:
                    with open(merged_path, 'w') as outfile:
                        for part in part_paths:
                            # The *_3 / *_4 parts exist only when the
                            # use_mate_mapped step above produced them.
                            if os.path.exists(part) is True:
                                with open(part) as infile:
                                    outfile.writelines(infile)
                                utils.gzip_or_del(args, params, part)

        # remove short reads: keep a pair only if both sequences reach
        # min_seq_len; files are consumed four lines (one record) at a time.
        min_seq_len = params.min_seq_len
        with open(filenames.unmapped_merged_pre1) as infile1, \
                open(filenames.unmapped_merged_pre2) as infile2, \
                open(filenames.unmapped_merged_1, 'w') as outfile1, \
                open(filenames.unmapped_merged_2, 'w') as outfile2:
            tmp1, tmp2 = [], []
            for line1, line2 in zip(infile1, infile2):
                tmp1.append(line1)
                tmp2.append(line2)
                if len(tmp1) == 4:
                    seqlen1 = len(tmp1[1].strip())
                    seqlen2 = len(tmp2[1].strip())
                    if seqlen1 >= min_seq_len and seqlen2 >= min_seq_len:
                        outfile1.write(''.join(tmp1))
                        outfile2.write(''.join(tmp2))
                    tmp1, tmp2 = [], []
        utils.gzip_or_del(args, params, filenames.unmapped_merged_pre1)
        utils.gzip_or_del(args, params, filenames.unmapped_merged_pre2)

        if args.keep is False:
            leftovers = [filenames.discordant_sort_bam]
            if args.use_mate_mapped is True:
                leftovers.extend([
                    filenames.unmapped_bam_3,
                    filenames.unmapped_bam_4,
                    filenames.unmapped_bam_34,
                    filenames.unmapped_sorted_34,
                ])
            # The mate-mapped intermediates are not created when
            # all_discordant is also True, so each removal is guarded.
            for path in leftovers:
                if os.path.exists(path) is True:
                    os.remove(path)

    except Exception:
        log.logger.error('\n' + traceback.format_exc())
        exit(1)