Example 1
def Run(args):
    tree = bed_tree_from(bed_file_path=args.bed_fn)

    logging.info("Counting the number of Truth Variants in %s ..." % args.tensor_var_fn)
    v = 0
    d = {}
    f = subprocess_popen(shlex.split("gzip -fdc %s" % (args.tensor_var_fn)))
    for row in f.stdout:
        row = row.strip().split()
        ctgName = row[0]
        pos = int(row[1])
        key = "-".join([ctgName, str(pos)])
        v += 1
        d[key] = 1
    f.stdout.close()
    f.wait()

    logging.info("%d Truth Variants" % v)
    t = v * args.amp
    logging.info("%d non-variants to be picked" % t)

    logging.info("Counting the number of usable non-variants in %s ..." % args.tensor_can_fn)
    c = 0
    f = subprocess_popen(shlex.split("gzip -fdc %s" % (args.tensor_can_fn)))
    for row in f.stdout:
        row = row.strip().split()
        ctgName = row[0]
        pos = int(row[1])
        if args.bed_fn is not None:
            if not is_region_in(tree, ctgName, pos):
                continue
        key = "-".join([ctgName, str(pos)])
        if key in d:
            continue
        c += 1
    f.stdout.close()
    f.wait()
    logging.info("%d usable non-variant" % c)

    r = float(t) / c
    r = r if r <= 1 else 1
    logging.info("%.2f of all non-variants are selected" % r)

    o1 = 0
    o2 = 0
    output_fpo = open(args.output_fn, "wb")
    output_fh = subprocess_popen(shlex.split("gzip -c"), stdin=PIPE, stdout=output_fpo)
    f = subprocess_popen(shlex.split("gzip -fdc %s" % (args.tensor_var_fn)))
    for row in f.stdout:
        row = row.strip()
        output_fh.stdin.write(row)
        output_fh.stdin.write("\n")
        o1 += 1
    f.stdout.close()
    f.wait()
    f = subprocess_popen(shlex.split("gzip -fdc %s" % (args.tensor_can_fn)))
    for row in f.stdout:
        rawRow = row.strip()
        row = rawRow.split()
        ctgName = row[0]
        pos = int(row[1])
        if args.bed_fn is not None:
            if not is_region_in(tree, ctgName, pos):
                continue
        key = "-".join([ctgName, str(pos)])
        if key in d:
            continue
        if random() < r:
            output_fh.stdin.write(rawRow)
            output_fh.stdin.write("\n")
            o2 += 1
    f.stdout.close()
    f.wait()
    output_fh.stdin.close()
    output_fh.wait()
    output_fpo.close()
    logging.info("%.2f/%.2f Truth Variants/Non-variants outputed" % (o1, o2))
Example 2
def CreateTensorPileup(args):
    """
    Create pileup tensor for pileup model training or calling.
    Use slide window to scan the whole candidate regions, keep all candidates over specific minimum allelic frequency
    and minimum depth, use samtools mpileup to store pileup info for pileup tensor generation. Only scan candidate
    regions once, we could directly get all variant candidates directly.
    """
    ctg_start = args.ctgStart
    ctg_end = args.ctgEnd
    fasta_file_path = args.ref_fn
    ctg_name = args.ctgName
    samtools_execute_command = args.samtools
    bam_file_path = args.bam_fn
    chunk_id = args.chunk_id - 1 if args.chunk_id else None  # 1-base to 0-base
    chunk_num = args.chunk_num
    tensor_can_output_path = args.tensor_can_fn
    minimum_af_for_candidate = args.min_af
    minimum_snp_af_for_candidate = args.snp_min_af
    minimum_indel_af_for_candidate = args.indel_min_af
    min_coverage = args.minCoverage
    platform = args.platform
    confident_bed_fn = args.bed_fn
    is_confident_bed_file_given = confident_bed_fn is not None
    alt_fn = args.indel_fn
    extend_bed = args.extend_bed
    is_extend_bed_file_given = extend_bed is not None
    min_mapping_quality = args.minMQ
    min_base_quality = args.minBQ
    fast_mode = args.fast_mode
    vcf_fn = args.vcf_fn
    is_known_vcf_file_provided = vcf_fn is not None
    call_snp_only = args.call_snp_only

    global test_pos
    test_pos = None

    # 1-based regions [start, end] (start and end inclusive)
    ref_regions = []
    reads_regions = []
    known_variants_set = set()
    tree, bed_start, bed_end = bed_tree_from(bed_file_path=extend_bed,
                                             contig_name=ctg_name,
                                             return_bed_region=True)

    fai_fn = file_path_from(fasta_file_path,
                            suffix=".fai",
                            exit_on_not_found=True,
                            sep='.')
    if not is_confident_bed_file_given and chunk_id is not None:
        contig_length = 0
        with open(fai_fn, 'r') as fai_fp:
            for row in fai_fp:
                columns = row.strip().split("\t")

                contig_name = columns[0]
                if contig_name != ctg_name:
                    continue
                contig_length = int(columns[1])
        chunk_size = contig_length // chunk_num + 1 if contig_length % chunk_num else contig_length // chunk_num
        ctg_start = chunk_size * chunk_id  # 0-base to 1-base
        ctg_end = ctg_start + chunk_size

    if is_confident_bed_file_given and chunk_id is not None:
        chunk_size = (bed_end - bed_start) // chunk_num + 1 if (
            bed_end - bed_start) % chunk_num else (bed_end -
                                                   bed_start) // chunk_num
        ctg_start = bed_start + 1 + chunk_size * chunk_id  # 0-base to 1-base
        ctg_end = ctg_start + chunk_size

    if is_known_vcf_file_provided and chunk_id is not None:
        known_variants_list = vcf_candidates_from(vcf_fn=vcf_fn,
                                                  contig_name=ctg_name)
        total_variants_size = len(known_variants_list)
        chunk_variants_size = total_variants_size // chunk_num if total_variants_size % chunk_num == 0 else total_variants_size // chunk_num + 1
        chunk_start_pos = chunk_id * chunk_variants_size
        known_variants_set = set(
            known_variants_list[chunk_start_pos:chunk_start_pos +
                                chunk_variants_size])
        if len(known_variants_set) == 0:
            return
        ctg_start, ctg_end = min(known_variants_set), max(known_variants_set)

    is_ctg_name_given = ctg_name is not None
    is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None
    if is_ctg_range_given:
        extend_start = ctg_start - no_of_positions
        extend_end = ctg_end + no_of_positions
        reads_regions.append(
            region_from(ctg_name=ctg_name,
                        ctg_start=extend_start,
                        ctg_end=extend_end))
        reference_start, reference_end = ctg_start - param.expandReferenceRegion, ctg_end + param.expandReferenceRegion
        reference_start = 1 if reference_start < 1 else reference_start
        ref_regions.append(
            region_from(ctg_name=ctg_name,
                        ctg_start=reference_start,
                        ctg_end=reference_end))
    elif is_ctg_name_given:
        reads_regions.append(region_from(ctg_name=ctg_name))
        ref_regions.append(region_from(ctg_name=ctg_name))
        reference_start = 1

    reference_sequence = reference_sequence_from(
        samtools_execute_command=samtools_execute_command,
        fasta_file_path=fasta_file_path,
        regions=ref_regions)

    if reference_sequence is None or len(reference_sequence) == 0:
        sys.exit(
            log_error(
                "[ERROR] Failed to load reference sequence from file ({}).".
                format(fasta_file_path)))

    if is_confident_bed_file_given and ctg_name not in tree:
        sys.exit(
            log_error("[ERROR] ctg_name {} not exists in bed file({}).".format(
                ctg_name, confident_bed_fn)))

    # samtools mpileup options
    # reverse-del: deletions on the forward/reverse strand are marked as '*'/'#'
    min_base_quality = 0 if args.gvcf else min_base_quality
    max_depth = param.max_depth_dict[
        args.platform] if args.platform else args.max_depth
    mq_option = ' --min-MQ {}'.format(min_mapping_quality)
    bq_option = ' --min-BQ {}'.format(min_base_quality)
    flags_option = ' --excl-flags {}'.format(param.SAMTOOLS_VIEW_FILTER_FLAG)
    max_depth_option = ' --max-depth {}'.format(max_depth)
    bed_option = ' -l {}'.format(
        extend_bed) if is_extend_bed_file_given else ""
    gvcf_option = ' -a' if args.gvcf else ""
    samtools_mpileup_process = subprocess_popen(
        shlex.split("{} mpileup  {} -r {} --reverse-del".format(
            samtools_execute_command,
            bam_file_path,
            " ".join(reads_regions),
        ) + mq_option + bq_option + bed_option + flags_option +
                    max_depth_option + gvcf_option))

    if tensor_can_output_path != "PIPE":
        tensor_can_fpo = open(tensor_can_output_path, "wb")
        tensor_can_fp = subprocess_popen(shlex.split("{} -c".format(
            param.zstd)),
                                         stdin=PIPE,
                                         stdout=tensor_can_fpo)
    else:
        tensor_can_fp = TensorStdout(sys.stdout)

    # whether to save all alternative allele information (debug mode only)
    if alt_fn:
        alt_fp = open(alt_fn, 'w')

    pos_offset = 0
    pre_pos = -1
    tensor = [[]] * sliding_window_size
    candidate_position = []
    all_alt_dict = {}
    depth_dict = {}
    af_dict = {}

    # to generate a GVCF, whole-genome statistics need to be recorded
    if args.gvcf:
        nonVariantCaller = variantInfoCalculator(
            gvcfWritePath=args.temp_file_dir,
            ref_path=args.ref_fn,
            bp_resolution=args.bp_resolution,
            ctgName=ctg_name,
            sample_name='.'.join(
                [args.sampleName, ctg_name,
                 str(ctg_start),
                 str(ctg_end)]),
            p_err=args.base_err,
            gq_bin_size=args.gq_bin_size)

    confident_bed_tree = bed_tree_from(bed_file_path=confident_bed_fn,
                                       contig_name=ctg_name,
                                       bed_ctg_start=extend_start,
                                       bed_ctg_end=extend_end)

    empty_pileup_flag = True
    for row in samtools_mpileup_process.stdout:
        empty_pileup_flag = False
        columns = row.strip().split('\t', maxsplit=5)
        pos = int(columns[1])
        pileup_bases = columns[4]
        reference_base = reference_sequence[pos - reference_start].upper()
        valid_reference_flag = True
        within_flag = True
        if args.gvcf:
            if not valid_reference_flag:
                nonVariantCaller.make_gvcf_online({}, push_current=True)
            if ctg_start is not None and ctg_end is not None:
                within_flag = ctg_start <= pos < ctg_end
            elif ctg_start is not None and ctg_end is None:
                within_flag = pos >= ctg_start
            elif ctg_start is None and ctg_end is not None:
                within_flag = pos <= ctg_end
            else:
                within_flag = True
            if columns[3] == '0' and within_flag and valid_reference_flag:
                cur_site_info = {
                    'chr': columns[0],
                    'pos': pos,
                    'ref': reference_base,
                    'n_total': 0,
                    'n_ref': 0
                }
                nonVariantCaller.make_gvcf_online(cur_site_info)
                continue

        # when a new region starts, clear the sliding-window cache to avoid unnecessary memory usage
        if pre_pos + 1 != pos:
            pos_offset = 0
            tensor = [[]] * sliding_window_size
            candidate_position = []
        pre_pos = pos

        # a condition to skip tensor creation for some positions while still
        # returning the allele summary (allele count)
        pileup_tensor, alt_dict, af, depth, pass_af, pileup_list, max_del_length = generate_tensor(
            pos=pos,
            pileup_bases=pileup_bases,
            reference_sequence=reference_sequence,
            reference_start=reference_start,
            reference_base=reference_base,
            minimum_af_for_candidate=minimum_af_for_candidate,
            minimum_snp_af_for_candidate=minimum_snp_af_for_candidate,
            minimum_indel_af_for_candidate=minimum_indel_af_for_candidate,
            platform=platform,
            fast_mode=fast_mode,
            call_snp_only=call_snp_only)
        if args.gvcf and within_flag and valid_reference_flag:
            cur_n_total = 0
            cur_n_ref = 0
            for _key, _value in pileup_list:
                if (_key == reference_base):
                    cur_n_ref = _value
                cur_n_total += _value

            cur_site_info = {
                'chr': columns[0],
                'pos': pos,
                'ref': reference_base,
                'n_total': cur_n_total,
                'n_ref': cur_n_ref
            }
            nonVariantCaller.make_gvcf_online(cur_site_info)

        pass_confident_bed = not is_confident_bed_file_given or is_region_in(
            tree=confident_bed_tree,
            contig_name=ctg_name,
            region_start=pos - 1,
            region_end=pos + max_del_length + 1)  # 0-based
        if (pass_confident_bed and reference_base in 'ACGT' and
            (pass_af and depth >= min_coverage)
                and not is_known_vcf_file_provided) or (
                    is_known_vcf_file_provided and pos in known_variants_set):
            candidate_position.append(pos)
            all_alt_dict[pos] = alt_dict
            depth_dict[pos] = depth
            af_dict[pos] = af
        tensor[pos_offset] = pileup_tensor

        # emit the pileup tensor for a candidate position once flanking_base_num bp of downstream context has been collected
        pos_offset = (pos_offset + 1) % sliding_window_size
        if len(candidate_position
               ) and pos - candidate_position[0] == flanking_base_num:
            center = candidate_position.pop(0)
            has_empty_tensor = sum([True for item in tensor if not len(item)])
            if not has_empty_tensor:
                depth = depth_dict[center]
                ref_seq = reference_sequence[center - (flanking_base_num) -
                                             reference_start:center +
                                             flanking_base_num + 1 -
                                             reference_start]
                concat_tensor = tensor[pos_offset:] + tensor[0:pos_offset]

                alt_info = str(depth) + '-' + ' '.join([
                    ' '.join([item[0], str(item[1])])
                    for item in list(all_alt_dict[center].items())
                ])
                l = "%s\t%d\t%s\t%s\t%s" % (
                    ctg_name, center, ref_seq, " ".join(
                        " ".join("%d" % x for x in innerlist)
                        for innerlist in concat_tensor), alt_info)
                tensor_can_fp.stdin.write(l)
                tensor_can_fp.stdin.write("\n")
                if alt_fn:
                    alt_info = ' '.join([
                        ' '.join([item[0], str(item[1])])
                        for item in list(all_alt_dict[center].items())
                    ])
                    alt_fp.write('\t'.join([
                        ctg_name + ' ' + str(center),
                        str(depth), alt_info,
                        str(af_dict[center])
                    ]) + '\n')
                del all_alt_dict[center], depth_dict[center], af_dict[center]

    if args.gvcf and len(nonVariantCaller.current_block) != 0:
        nonVariantCaller.write_to_gvcf_batch(nonVariantCaller.current_block,
                                             nonVariantCaller.cur_min_DP,
                                             nonVariantCaller.cur_raw_gq)

    if args.gvcf and empty_pileup_flag:
        nonVariantCaller.write_empty_pileup(ctg_name, ctg_start, ctg_end)
    if args.gvcf:
        nonVariantCaller.close_vcf_writer()

    samtools_mpileup_process.stdout.close()
    samtools_mpileup_process.wait()

    if tensor_can_output_path != "PIPE":
        tensor_can_fp.stdin.close()
        tensor_can_fp.wait()
        tensor_can_fpo.close()

    if alt_fn:
        alt_fp.close()
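When a chunk_id is given and no confident BED file is provided, the contig is partitioned into chunk_num windows of size ceil(contig_length / chunk_num), and chunk_id selects one slice. A small sketch of that arithmetic (the helper name is illustrative):

def chunk_window(contig_length, chunk_num, chunk_id):
    # chunk_size is the ceiling of contig_length / chunk_num
    chunk_size = contig_length // chunk_num + 1 if contig_length % chunk_num else contig_length // chunk_num
    ctg_start = chunk_size * chunk_id
    ctg_end = ctg_start + chunk_size
    return ctg_start, ctg_end

# e.g. chunk_window(10_000_000, 4, 2) == (5000000, 7500000)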
Example 3
def Run(args):
    basedir = os.path.dirname(__file__)

    callVarBamBin = basedir + "/../clair.py callVarBam"
    pypyBin = executable_command_string_from(args.pypy, exit_on_not_found=True)
    samtoolsBin = executable_command_string_from(args.samtools, exit_on_not_found=True)

    chkpnt_fn = file_path_from(args.chkpnt_fn, suffix=".meta", exit_on_not_found=True)
    bam_fn = file_path_from(args.bam_fn, exit_on_not_found=True)
    ref_fn = file_path_from(args.ref_fn, exit_on_not_found=True)
    fai_fn = file_path_from(args.ref_fn + ".fai", exit_on_not_found=True)
    bed_fn = file_path_from(args.bed_fn)
    vcf_fn = file_path_from(args.vcf_fn)

    output_prefix = args.output_prefix
    af_threshold = args.threshold

    tree = bed_tree_from(bed_file_path=bed_fn)

    minCoverage = args.minCoverage
    sampleName = args.sampleName
    delay = args.delay
    threads = args.tensorflowThreads
    qual = args.qual
    is_include_all_contigs = args.includingAllContigs
    region_chunk_size = args.refChunkSize

    stop_consider_left_edge = command_option_from(args.stop_consider_left_edge, 'stop_consider_left_edge')
    log_path = command_option_from(args.log_path, 'log_path', option_value=args.log_path)
    pysam_for_all_indel_bases = command_option_from(args.pysam_for_all_indel_bases, 'pysam_for_all_indel_bases')
    haploid_mode = command_option_from(args.haploid, 'haploid')
    output_for_ensemble = command_option_from(args.output_for_ensemble, 'output_for_ensemble')
    debug = command_option_from(args.debug, 'debug')
    qual = command_option_from(args.qual, 'qual', option_value=args.qual)
    fast_plotting = command_option_from(args.fast_plotting, 'fast_plotting')

    call_var_bam_command_options = [
        ExecuteCommand('python3', callVarBamBin),
        CommandOption('chkpnt_fn', chkpnt_fn),
        CommandOption('ref_fn', ref_fn),
        CommandOption('bam_fn', bam_fn),
        CommandOption('threshold', af_threshold),
        CommandOption('minCoverage', minCoverage),
        CommandOption('pypy', pypyBin),
        CommandOption('samtools', samtoolsBin),
        CommandOption('delay', delay),
        CommandOption('threads', threads),
        CommandOption('sampleName', sampleName),
        # optional command options
        CommandOption('vcf_fn', vcf_fn) if vcf_fn is not None else None,
        qual,
        stop_consider_left_edge,
        debug,
        pysam_for_all_indel_bases,
        haploid_mode,
        output_for_ensemble,
    ]

    activation_only_command_options = [
        CommandOptionWithNoValue('activation_only'),
        log_path,
        CommandOption('max_plot', args.max_plot),
        CommandOption('parallel_level', args.parallel_level),
        CommandOption('workers', args.workers),
        fast_plotting,
    ] if args.activation_only else []

    is_bed_file_provided = bed_fn is not None
    command_string = command_string_from(call_var_bam_command_options + activation_only_command_options)

    with open(fai_fn, 'r') as fai_fp:
        for row in fai_fp:
            columns = row.strip().split("\t")

            contig_name = columns[0]
            if not is_include_all_contigs and str(contig_name) not in major_contigs:
                continue

            region_start, region_end = 0, 0
            contig_length = int(columns[1])
            while region_end < contig_length:
                region_start = region_end
                region_end = region_start + region_chunk_size
                if region_end > contig_length:
                    region_end = contig_length
                output_fn = "%s.%s_%d_%d.vcf" % (output_prefix, contig_name, region_start, region_end)

                is_region_in_bed = is_bed_file_provided and is_region_in(tree, contig_name, region_start, region_end)
                need_output_command = not is_bed_file_provided or is_region_in_bed
                if not need_output_command:
                    continue

                additional_command_options = [
                    CommandOption('ctgName', contig_name),
                    CommandOption('ctgStart', region_start),
                    CommandOption('ctgEnd', region_end),
                    CommandOption('call_fn', output_fn),
                    CommandOption('bed_fn', bed_fn) if is_region_in_bed else None
                ]
                print(command_string + " " + command_string_from(additional_command_options))
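The command assembly above relies on command_string_from() dropping optional entries that are None (for example vcf_fn when no VCF is given, or bed_fn outside BED regions). A minimal sketch of that assumed behaviour; this is not the source implementation:

def command_string_from(command_options):
    # join the string form of each option, silently skipping None placeholders
    return " ".join(str(option) for option in command_options if option is not None)

# e.g. command_string_from(["python3 clair.py callVarBam", "--minCoverage 4", None])
# -> "python3 clair.py callVarBam --minCoverage 4"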
Example 4
def make_candidates(args):

    gen4Training = args.gen4Training
    variant_file_path = args.var_fn
    bed_file_path = args.bed_fn
    fasta_file_path = args.ref_fn
    ctg_name = args.ctgName
    ctg_start = args.ctgStart
    ctg_end = args.ctgEnd
    output_probability = args.outputProb
    samtools_execute_command = args.samtools
    minimum_depth_for_candidate = args.minCoverage
    minimum_af_for_candidate = args.threshold
    minimum_mapping_quality = args.minMQ
    bam_file_path = args.bam_fn
    candidate_output_path = args.can_fn
    is_using_stdout_for_output_candidate = candidate_output_path == "PIPE"

    is_building_training_dataset = gen4Training == True
    is_variant_file_given = variant_file_path is not None
    is_bed_file_given = bed_file_path is not None
    is_ctg_name_given = ctg_name is not None
    is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None

    if is_building_training_dataset:
        # minimum_depth_for_candidate = 0
        minimum_af_for_candidate = 0

    # preparation for candidates near variants
    need_consider_candidates_near_variant = is_building_training_dataset and is_variant_file_given
    variants_map = variants_map_from(
        variant_file_path) if need_consider_candidates_near_variant else {}
    non_variants_map = non_variants_map_near_variants_from(variants_map)
    no_of_candidates_near_variant = 0
    no_of_candidates_outside_variant = 0

    # update output probabilities for candidates near variants
    # original: (7000000.0 * 2.0 / 3000000000)
    ratio_of_candidates_near_variant_to_candidates_outside_variant = 1.0
    output_probability_near_variant = (
        3500000.0 *
        ratio_of_candidates_near_variant_to_candidates_outside_variant *
        RATIO_OF_NON_VARIANT_TO_VARIANT / 14000000)
    output_probability_outside_variant = 3500000.0 * RATIO_OF_NON_VARIANT_TO_VARIANT / (
        3000000000 - 14000000)

    if not isfile("{}.fai".format(fasta_file_path)):
        print("Fasta index {}.fai doesn't exist.".format(fasta_file_path),
              file=sys.stderr)
        sys.exit(1)

    # 1-based regions [start, end] (start and end inclusive)
    regions = []
    reference_start, reference_end = None, None
    if is_ctg_range_given:
        reference_start, reference_end = ctg_start - param.expandReferenceRegion, ctg_end + param.expandReferenceRegion
        reference_start = 1 if reference_start < 1 else reference_start
        regions.append(
            region_from(ctg_name=ctg_name,
                        ctg_start=reference_start,
                        ctg_end=reference_end))
    elif is_ctg_name_given:
        regions.append(region_from(ctg_name=ctg_name))

    reference_sequence = reference_sequence_from(
        samtools_execute_command=samtools_execute_command,
        fasta_file_path=fasta_file_path,
        regions=regions)
    if reference_sequence is None or len(reference_sequence) == 0:
        print(
            "[ERROR] Failed to load reference seqeunce from file ({}).".format(
                fasta_file_path),
            file=sys.stderr)
        sys.exit(1)

    tree = bed_tree_from(bed_file_path=bed_file_path)
    if is_bed_file_given and ctg_name not in tree:
        print("[ERROR] ctg_name({}) not exists in bed file({}).".format(
            ctg_name, bed_file_path),
              file=sys.stderr)
        sys.exit(1)

    samtools_view_process = subprocess_popen(
        shlex.split("{} view -F {} {} {}".format(
            samtools_execute_command, param.SAMTOOLS_VIEW_FILTER_FLAG,
            bam_file_path, " ".join(regions))))

    if is_using_stdout_for_output_candidate:
        can_fp = CandidateStdout(sys.stdout)
    else:
        can_fpo = open(candidate_output_path, "wb")
        can_fp = subprocess_popen(shlex.split("gzip -c"),
                                  stdin=PIPE,
                                  stdout=can_fpo)

    pileup = defaultdict(lambda: {
        "A": 0,
        "C": 0,
        "G": 0,
        "T": 0,
        "I": 0,
        "D": 0,
        "N": 0
    })
    POS = 0
    number_of_reads_processed = 0

    while True:
        row = samtools_view_process.stdout.readline()
        is_finish_reading_output = row == '' and samtools_view_process.poll(
        ) is not None

        if row:
            columns = row.strip().split()
            if columns[0][0] == "@":
                continue

            RNAME = columns[2]
            if RNAME != ctg_name:
                continue

            POS = int(
                columns[3]
            ) - 1  # switch from 1-base to 0-base to match sequence index
            MAPQ = int(columns[4])
            CIGAR = columns[5]
            SEQ = columns[9].upper(
            )  # uppercase for SEQ (regexp is \*|[A-Za-z=.]+)

            reference_position = POS
            query_position = 0

            if MAPQ < minimum_mapping_quality:
                continue
            if CIGAR == "*" or is_too_many_soft_clipped_bases_for_a_read_from(
                    CIGAR):
                continue

            number_of_reads_processed += 1

            advance = 0
            for c in str(CIGAR):
                if c.isdigit():
                    advance = advance * 10 + int(c)
                    continue

                if c == "S":
                    query_position += advance

                elif c == "M" or c == "=" or c == "X":
                    for _ in range(advance):
                        base = evc_base_from(SEQ[query_position])
                        pileup[reference_position][base] += 1

                        # these CIGAR operations consume both query and reference
                        reference_position += 1
                        query_position += 1

                elif c == "I":
                    pileup[reference_position - 1]["I"] += 1

                    # insertion consumes query
                    query_position += advance

                elif c == "D":
                    pileup[reference_position - 1]["D"] += 1

                    # deletion consumes reference
                    reference_position += advance

                # reset advance
                advance = 0

        positions = [x for x in pileup.keys() if x < POS
                     ] if not is_finish_reading_output else list(pileup.keys())
        positions.sort()
        for zero_based_position in positions:
            base_count = depth = reference_base = temp_key = None

            # ctg and bed checking (region [ctg_start, ctg_end] is 1-based, inclusive start and end positions)
            pass_ctg = not is_ctg_range_given or ctg_start <= zero_based_position + 1 <= ctg_end
            pass_bed = not is_bed_file_given or is_region_in(
                tree, ctg_name, zero_based_position)
            if not pass_bed or not pass_ctg:
                continue

            # output probability checking
            pass_output_probability = True
            if is_building_training_dataset and is_variant_file_given:
                temp_key = ctg_name + ":" + str(zero_based_position + 1)
                pass_output_probability = (temp_key not in variants_map and (
                    (temp_key in non_variants_map and
                     random.uniform(0, 1) <= output_probability_near_variant)
                    or (temp_key not in non_variants_map and random.uniform(
                        0, 1) <= output_probability_outside_variant)))
            elif is_building_training_dataset:
                pass_output_probability = random.uniform(
                    0, 1) <= output_probability
            if not pass_output_probability:
                continue

            # for depth checking and af checking
            try:
                reference_base = evc_base_from(reference_sequence[
                    zero_based_position -
                    (0 if reference_start is None else (reference_start - 1))])
                position_dict = pileup[zero_based_position]
            except:
                continue

            # depth checking
            base_count = list(position_dict.items())
            depth = sum(
                x[1]
                for x in base_count) - position_dict["I"] - position_dict["D"]
            if depth < minimum_depth_for_candidate:
                continue

            # af checking
            denominator = depth if depth > 0 else 1
            base_count.sort(
                key=lambda x: -x[1])  # sort base_count descendingly
            pass_af = (base_count[0][0] != reference_base
                       or (float(base_count[1][1]) / denominator) >=
                       minimum_af_for_candidate)
            if not pass_af:
                continue

            # output 1-based candidate
            if temp_key is not None and temp_key in non_variants_map:
                no_of_candidates_near_variant += 1
            elif temp_key is not None and temp_key not in non_variants_map:
                no_of_candidates_outside_variant += 1

            output = [ctg_name, zero_based_position + 1, reference_base, depth]
            output.extend(["%s %d" % x for x in base_count])
            output = " ".join([str(x) for x in output]) + "\n"

            can_fp.stdin.write(output)

        for zero_based_position in positions:
            del pileup[zero_based_position]

        if is_finish_reading_output:
            break

    if need_consider_candidates_near_variant:
        print("# of candidates near variant: ", no_of_candidates_near_variant)
        print("# of candidates outside variant: ",
              no_of_candidates_outside_variant)

    samtools_view_process.stdout.close()
    samtools_view_process.wait()

    if not is_using_stdout_for_output_candidate:
        can_fp.stdin.close()
        can_fp.wait()
        can_fpo.close()

    if number_of_reads_processed == 0:
        print(
            "No read has been process, either the genome region you specified has no read cover, or please check the correctness of your BAM input (%s)."
            % (bam_file_path),
            file=sys.stderr)
        sys.exit(0)
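The per-read pileup update above is a standard CIGAR walk: digits accumulate a run length, and each operation letter decides whether the reference position, the query position, or both advance. A self-contained sketch with a worked example (the helper name is illustrative):

def walk_cigar(cigar, read_start):
    """Yield (op, length, ref_pos, query_pos) before applying each operation."""
    ref_pos, query_pos, advance = read_start, 0, 0
    for c in cigar:
        if c.isdigit():
            advance = advance * 10 + int(c)
            continue
        yield c, advance, ref_pos, query_pos
        if c in "M=X":       # match/mismatch consumes both reference and query
            ref_pos += advance
            query_pos += advance
        elif c in "IS":      # insertion/soft clip consumes query only
            query_pos += advance
        elif c in "DN":      # deletion/skip consumes reference only
            ref_pos += advance
        advance = 0

# list(walk_cigar("3M1I2M", 100))
# -> [('M', 3, 100, 0), ('I', 1, 103, 3), ('M', 2, 103, 4)]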
Example 5
def reads_realignment(args):
    bed_file_path = args.full_aln_regions
    extend_bed = args.extend_bed
    fasta_file_path = args.ref_fn
    ctg_name = args.ctgName
    ctg_start = args.ctgStart
    ctg_end = args.ctgEnd
    chunk_id = args.chunk_id - 1 if args.chunk_id else None  # 1-base to 0-base
    chunk_num = args.chunk_num
    samtools_execute_command = args.samtools
    bam_file_path = args.bam_fn
    minMQ = args.minMQ
    min_coverage = args.minCoverage
    is_bed_file_given = bed_file_path is not None
    is_ctg_name_given = ctg_name is not None
    read_fn = args.read_fn

    global test_pos
    test_pos = None
    if is_bed_file_given:
        candidate_file_path_process = subprocess_popen(
            shlex.split("gzip -fdc %s" % (bed_file_path)))
        candidate_file_path_output = candidate_file_path_process.stdout

        ctg_start, ctg_end = float('inf'), 0
        for row in candidate_file_path_output:
            row = row.rstrip().split('\t')
            if row[0] != ctg_name: continue
            position = int(row[1]) + 1
            end = int(row[2]) + 1
            ctg_start = min(position, ctg_start)
            ctg_end = max(end, ctg_end)

        candidate_file_path_output.close()
        candidate_file_path_process.wait()

    if chunk_id is not None:
        fai_fn = file_path_from(fasta_file_path,
                                suffix=".fai",
                                exit_on_not_found=True,
                                sep='.')
        contig_length = 0
        with open(fai_fn, 'r') as fai_fp:
            for row in fai_fp:
                columns = row.strip().split("\t")

                contig_name = columns[0]
                if contig_name != ctg_name:
                    continue
                contig_length = int(columns[1])
        chunk_size = contig_length // chunk_num + 1 if contig_length % chunk_num else contig_length // chunk_num
        ctg_start = chunk_size * chunk_id  # 0-base to 1-base
        ctg_end = ctg_start + chunk_size

    is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None

    # 1-based regions [start, end] (start and end inclusive)
    ref_regions = []
    reads_regions = []
    reference_start, reference_end = None, None

    if is_ctg_range_given:
        extend_start = ctg_start - max_window_size
        extend_end = ctg_end + max_window_size
        reads_regions.append(
            region_from(ctg_name=ctg_name,
                        ctg_start=extend_start,
                        ctg_end=extend_end))
        reference_start, reference_end = ctg_start - param.expandReferenceRegion, ctg_end + param.expandReferenceRegion
        reference_start = 1 if reference_start < 1 else reference_start
        ref_regions.append(
            region_from(ctg_name=ctg_name,
                        ctg_start=reference_start,
                        ctg_end=reference_end))
    elif is_ctg_name_given:
        reads_regions.append(region_from(ctg_name=ctg_name))
        ref_regions.append(region_from(ctg_name=ctg_name))
        reference_start = 1

    reference_sequence = reference_sequence_from(
        samtools_execute_command=samtools_execute_command,
        fasta_file_path=fasta_file_path,
        regions=ref_regions)
    if reference_sequence is None or len(reference_sequence) == 0:
        sys.exit(
            "[ERROR] Failed to load reference sequence from file ({}).".format(
                fasta_file_path))

    tree = bed_tree_from(bed_file_path=bed_file_path)
    if is_bed_file_given and ctg_name not in tree:
        sys.exit("[ERROR] ctg_name({}) not exists in bed file({}).".format(
            ctg_name, bed_file_path))

    bed_option = ' -L {}'.format(extend_bed) if extend_bed else ""
    bed_option = ' -L {}'.format(
        bed_file_path) if is_bed_file_given else bed_option
    mq_option = ' -q {}'.format(minMQ) if minMQ > 0 else ""
    samtools_view_command = "{} view -h {} {}".format(
        samtools_execute_command, bam_file_path,
        " ".join(reads_regions)) + mq_option + bed_option
    samtools_view_process = subprocess_popen(
        shlex.split(samtools_view_command))

    if read_fn and read_fn == 'PIPE':
        save_file_fp = TensorStdout(sys.stdout)
    elif read_fn:
        save_file_fp = subprocess_popen(shlex.split(
            "{} view -bh - -o {}".format(
                samtools_execute_command,
                read_fn + ('.{}_{}'.format(ctg_start, ctg_end)
                           if is_ctg_range_given and not test_pos else ""))),
                                        stdin=PIPE,
                                        stdout=PIPE)

    reference_start_0_based = 0 if reference_start is None else (
        reference_start - 1)

    header = []
    add_header = False
    aligned_reads = defaultdict()
    pileup = defaultdict(lambda: {"X": 0})
    samtools_view_generator = samtools_view_generator_from(
        samtools_view_process=samtools_view_process,
        aligned_reads=aligned_reads,
        pileup=pileup,
        ctg_name=ctg_name,
        reference_sequence=reference_sequence,
        reference_start_0_based=reference_start_0_based,
        header=header)
    pre_aligned_reads = defaultdict()

    while True:
        chunk_start, chunk_end = next(samtools_view_generator)
        if chunk_start is None:
            break
        if not add_header:
            save_file_fp.stdin.write(''.join(header))
            add_header = True

        variant_allele_list = [[position, pileup[position]["X"]]
                               for position in list(pileup.keys())]
        candidate_position_list = [
            (position, support_allele_count)
            for position, support_allele_count in variant_allele_list
            if support_allele_count >= min_coverage
            and position >= chunk_start - region_expansion_in_bp -
            1 and position <= chunk_end + region_expansion_in_bp - 1
        ]
        candidate_position_list.sort(key=(lambda x: x[0]))

        if not len(aligned_reads) or not len(candidate_position_list):
            continue
        if len(pre_aligned_reads):  # update the read in previous chunk
            for read_name, read in pre_aligned_reads.items():
                aligned_reads[read_name] = read

        region_dict = {}
        split_region_size = max_window_size
        region_tree = IntervalTree()
        for split_idx in range((chunk_end - chunk_start) // split_region_size):
            split_start = chunk_start + split_idx * split_region_size - region_expansion_in_bp - 1
            split_end = split_start + split_region_size + region_expansion_in_bp * 2 + 1
            region_dict[(split_start, split_end)] = []
            region_tree.addi(split_start, split_end)
        for candidate_position in candidate_position_list:
            for region in region_tree.at(candidate_position[0]):
                region_dict[(region.begin,
                             region.end)].append(candidate_position[0])

        for key, split_candidate_position_list in region_dict.items():
            start_pos, end_pos = None, None
            windows = []
            read_windows_dict = {}
            for pos in split_candidate_position_list:
                if start_pos is None:
                    start_pos = pos
                    end_pos = pos

                elif pos > end_pos + 2 * min_windows_distance:
                    temp_window = (start_pos - min_windows_distance,
                                   end_pos + min_windows_distance)
                    windows.append(temp_window)
                    read_windows_dict[temp_window] = []

                    start_pos = pos
                    end_pos = pos
                else:
                    end_pos = pos

            if start_pos is not None:
                temp_window = (start_pos - min_windows_distance,
                               end_pos + min_windows_distance)
                windows.append(temp_window)
                read_windows_dict[temp_window] = []
            if not len(windows): continue
            windows = sorted(windows, key=lambda x: x[0])
            max_window_end = max([item[1] for item in windows])
            # find, for each read, the window it overlaps the most
            for read_name, read in aligned_reads.items():
                if read.read_start > max_window_end: continue
                argmax_window_idx = find_max_overlap_index(
                    (read.read_start, read.read_end), windows)
                if argmax_window_idx is not None:
                    read_windows_dict[windows[argmax_window_idx]].append(
                        read_name)

            # realignment
            for window in windows:
                start_pos, end_pos = window
                if end_pos - start_pos > max_window_size:  # or (window not in need_align_windows_set):
                    continue

                ref_start = start_pos - reference_start_0_based
                ref_end = end_pos - reference_start_0_based
                ref = reference_sequence[ref_start:ref_end]
                reads = []
                low_base_quality_pos_list = []
                # pypy binding with ctypes for DBG building
                for read_name in read_windows_dict[window]:
                    read = aligned_reads[read_name]
                    if (
                            not read.graph_mq
                    ) or read.read_start > end_pos or read.read_end < start_pos:
                        continue
                    reads.append(read.seq)
                    low_base_quality_pos_list.append(' '.join([
                        str(bq_idx)
                        for bq_idx, item in enumerate(read.base_quality)
                        if int(item) < 15
                    ]))
                totoal_read_num = len(reads)
                c_ref = byte(ref)
                read_list1 = ctypes.c_char_p(byte(','.join(reads)))
                low_base_quality_pos_array = ctypes.c_char_p(
                    byte(','.join(low_base_quality_pos_list)))

                dbg.get_consensus.restype = ctypes.POINTER(DBGPointer)
                dbg.get_consensus.argtypes = [
                    ctypes.c_char_p, ctypes.c_char_p, ctypes.c_char_p,
                    ctypes.c_int
                ]

                dbg_p = dbg.get_consensus(ctypes.c_char_p(c_ref), read_list1,
                                          low_base_quality_pos_array,
                                          totoal_read_num)

                c_consensus, consensus_size = dbg_p.contents.consensus, dbg_p.contents.consensus_size
                consensus = [
                    item.decode() for item in c_consensus[:consensus_size]
                ]

                if len(consensus) == 0 or len(
                        consensus) == 1 and consensus[0] == ref or len(
                            read_windows_dict[window]) == 0:
                    continue
                min_read_start = min([
                    aligned_reads[item].read_start
                    for item in read_windows_dict[window]
                ])
                max_read_end = max([
                    aligned_reads[item].read_end
                    for item in read_windows_dict[window]
                ])
                tmp_ref_start = max(
                    0,
                    min(min_read_start, start_pos) - expand_align_ref_region)
                tmp_ref_end = max(max_read_end,
                                  end_pos) + expand_align_ref_region

                ref_prefix = get_reference_seq(reference_sequence,
                                               tmp_ref_start, start_pos,
                                               reference_start_0_based)
                ref_center = get_reference_seq(reference_sequence, start_pos,
                                               end_pos,
                                               reference_start_0_based)
                if tmp_ref_end < end_pos:
                    continue
                ref_suffix = get_reference_seq(reference_sequence, end_pos,
                                               tmp_ref_end,
                                               reference_start_0_based)
                ref_seq = ref_prefix + ref_center + ref_suffix

                # pypy binding with ctypes for realignment
                read_name_list = []
                totoal_read_num = min(max_region_reads_num,
                                      len(read_windows_dict[window]))
                seq_list = (ctypes.c_char_p * totoal_read_num)()
                position_list = (ctypes.c_int * totoal_read_num)()
                cigars_list = (ctypes.c_char_p * totoal_read_num)()

                for read_idx, read_name in enumerate(
                        read_windows_dict[window]):
                    read = aligned_reads[read_name]
                    if read_idx >= totoal_read_num: break
                    seq_list[read_idx] = byte(read.seq.upper())
                    position_list[read_idx] = read.read_start
                    cigars_list[read_idx] = byte(read.cigar)
                    read_name_list.append(read_name)
                haplotypes_list = [
                    ref_prefix + cons + ref_suffix for cons in consensus
                ]
                haplotypes = ' '.join(haplotypes_list)

                realigner.realign_reads.restype = ctypes.POINTER(StructPointer)
                realigner.realign_reads.argtypes = [
                    ctypes.c_char_p * totoal_read_num,
                    ctypes.c_int * totoal_read_num,
                    ctypes.c_char_p * totoal_read_num, ctypes.c_char_p,
                    ctypes.c_char_p, ctypes.c_int, ctypes.c_int, ctypes.c_int,
                    ctypes.c_int
                ]

                realigner_p = realigner.realign_reads(
                    seq_list, position_list, cigars_list,
                    ctypes.c_char_p(byte(ref_seq)),
                    ctypes.c_char_p(byte(haplotypes)), tmp_ref_start,
                    len(ref_prefix), len(ref_suffix), totoal_read_num)

                realign_positions, realign_cigars = realigner_p.contents.position, realigner_p.contents.cigar_string
                read_position_list = realign_positions[:totoal_read_num]
                read_cigar_list = [
                    item.decode() for item in realign_cigars[:totoal_read_num]
                ]

                if len(read_name_list):
                    for read_id, read_name in enumerate(read_name_list):
                        if read_cigar_list[read_id] == "" or (
                                aligned_reads[read_name].cigar
                                == read_cigar_list[read_id]
                                and aligned_reads[read_name].read_start
                                == read_position_list[read_id]):
                            continue
                        # update cigar and read start position
                        aligned_reads[read_name].test_pos = test_pos
                        realignment_start = read_position_list[read_id]
                        realignment_cigar = read_cigar_list[read_id].replace(
                            'X', 'M')
                        if realignment_cigar == aligned_reads[
                                read_name].cigar and realignment_start == aligned_reads[
                                    read_name].read_start:
                            continue
                        aligned_reads[read_name].set_realignment_info(
                            split_start, read_cigar_list[read_id],
                            read_position_list[read_id])

                realigner.free_memory.restype = ctypes.POINTER(ctypes.c_void_p)
                realigner.free_memory.argtypes = [
                    ctypes.POINTER(StructPointer), ctypes.c_int
                ]
                realigner.free_memory(realigner_p, totoal_read_num)
        # # realignment end

        if read_fn:
            sorted_key = sorted([(key, item.best_pos)
                                 for key, item in aligned_reads.items()],
                                key=lambda x: x[1])
            for read_name, read_start in sorted_key:
                read = aligned_reads[read_name]
                if read_start < chunk_start - region_expansion_in_bp - max_window_size:  # a safe distance before flushing reads
                    phasing_info = 'HP:i:{}'.format(
                        read.phasing) if read.phasing else ""
                    read_str = '\t'.join([
                        read_name, read.flag, ctg_name,
                        str(read_start + 1),
                        str(read.mapping_quality), read.best_cigar, read.RNEXT,
                        read.PNEXT, read.TLEN, read.seq, read.raw_base_quality,
                        phasing_info
                    ])
                    save_file_fp.stdin.write(read_str + '\n')
                    del aligned_reads[read_name]
                for pile_pos in list(pileup.keys()):
                    if pile_pos < chunk_start - region_expansion_in_bp - max_window_size:
                        del pileup[pile_pos]

    if read_fn and aligned_reads:
        sorted_key = sorted([(key, item.best_pos)
                             for key, item in aligned_reads.items()],
                            key=lambda x: x[1])
        for read_name, read_start in sorted_key:
            read = aligned_reads[read_name]
            phasing_info = 'HP:i:{}'.format(
                read.phasing) if read.phasing else ""
            read_str = '\t'.join([
                read_name, read.flag, ctg_name,
                str(read_start + 1),
                str(read.mapping_quality), read.best_cigar, read.RNEXT,
                read.PNEXT, read.TLEN, read.seq, read.raw_base_quality,
                phasing_info
            ])
            save_file_fp.stdin.write(read_str + '\n')
            del aligned_reads[read_name]
        if read_fn != 'PIPE':
            save_file_fp.stdin.close()
            save_file_fp.wait()
    samtools_view_process.stdout.close()
    samtools_view_process.wait()

    if test_pos:
        save_file_fp = subprocess_popen(shlex.split("samtools index {}".format(
            read_fn + ('.{}_{}'.format(ctg_start, ctg_end)
                       if is_ctg_range_given and not test_pos else ""))),
                                        stdin=PIPE,
                                        stdout=PIPE)
        save_file_fp.stdin.close()
        save_file_fp.wait()
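Candidate positions inside each split region are merged into realignment windows: consecutive candidates closer than 2 * min_windows_distance share a window, and every window is padded by min_windows_distance on both sides. A standalone sketch of that merging step (the helper name is illustrative):

def merge_candidates_into_windows(candidate_positions, min_windows_distance):
    windows = []
    start_pos = end_pos = None
    for pos in sorted(candidate_positions):
        if start_pos is None:
            start_pos = end_pos = pos
        elif pos > end_pos + 2 * min_windows_distance:
            # too far from the current window: close it and start a new one
            windows.append((start_pos - min_windows_distance, end_pos + min_windows_distance))
            start_pos = end_pos = pos
        else:
            end_pos = pos
    if start_pos is not None:
        windows.append((start_pos - min_windows_distance, end_pos + min_windows_distance))
    return windows

# merge_candidates_into_windows([100, 105, 400], 10) -> [(90, 115), (390, 410)]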
Example 6
def get_training_array(tensor_fn,
                       var_fn,
                       bed_fn,
                       shuffle=True,
                       is_allow_duplicate_chr_pos=False):
    tree = bed_tree_from(bed_file_path=bed_fn)
    is_tree_empty = len(tree.keys()) == 0

    Y = variant_map_from(var_fn, tree, is_tree_empty)

    X = {}
    f = subprocess_popen(shlex.split("gzip -fdc %s" % (tensor_fn)))
    total = 0
    mat = np.empty(input_tensor_size, dtype=np.float32)
    for row in f.stdout:
        chrom, coord, seq, mat = unpack_a_tensor_record(*(row.split()))
        if not (is_tree_empty or is_region_in(tree, chrom, int(coord))):
            continue
        seq = seq.upper()
        if seq[param.flankingBaseNum] not in BASIC_BASES:
            continue
        key = chrom + ":" + coord

        x = np.reshape(mat, (no_of_positions, matrix_row, matrix_num))
        for i in range(1, matrix_num):
            x[:, :, i] -= x[:, :, 0]

        if key not in X:
            X[key] = np.copy(x)
        elif is_allow_duplicate_chr_pos:
            new_key = ""
            for character in PREFIX_CHAR_STR:
                tmp_key = character + key
                if tmp_key not in X:
                    new_key = tmp_key
                    break
            if len(new_key) > 0:
                X[new_key] = np.copy(x)

        is_reference = key not in Y
        if is_reference:
            Y[key] = output_labels_from_reference(
                BASE2ACGT[seq[param.flankingBaseNum]])

        total += 1
        if total % 100000 == 0:
            print("Processed %d tensors" % total, file=sys.stderr)
    f.stdout.close()
    f.wait()

    # print "[INFO] size of X: {}, size of Y: {}".format(len(X), len(Y))

    all_chr_pos = sorted(X.keys())
    if shuffle:
        np.random.shuffle(all_chr_pos)

    X_compressed, Y_compressed, pos_compressed = [], [], []
    X_array, Y_array, pos_array = [], [], []
    count = 0
    total = 0
    for key in all_chr_pos:
        total += 1

        X_array.append(X[key])
        del X[key]

        if key in Y:
            Y_array.append(Y[key])
            pos_array.append(key)
            if not is_allow_duplicate_chr_pos:
                del Y[key]
        elif is_allow_duplicate_chr_pos:
            tmp_key = key[1:]
            Y_array.append(Y[tmp_key])
            pos_array.append(tmp_key)

        count += 1
        if count == param.bloscBlockSize:
            X_compressed.append(blosc_pack_array(np.array(X_array)))
            Y_compressed.append(blosc_pack_array(np.array(Y_array)))
            pos_compressed.append(blosc_pack_array(np.array(pos_array)))
            X_array, Y_array, pos_array = [], [], []
            count = 0

        if total % 50000 == 0:
            print("Compressed %d/%d tensor" % (total, len(all_chr_pos)),
                  file=sys.stderr)

    if count > 0:
        X_compressed.append(blosc_pack_array(np.array(X_array)))
        Y_compressed.append(blosc_pack_array(np.array(Y_array)))
        pos_compressed.append(blosc_pack_array(np.array(pos_array)))

    return total, X_compressed, Y_compressed, pos_compressed
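Each unpacked tensor is reshaped to (no_of_positions, matrix_row, matrix_num) and every channel after the first is re-expressed as a difference from channel 0, before the tensors are packed in blocks of param.bloscBlockSize. A small numpy sketch of the per-tensor step (shape arguments are placeholders for the values used above):

import numpy as np

def normalise_tensor(flat_mat, no_of_positions, matrix_row, matrix_num):
    x = np.reshape(np.asarray(flat_mat, dtype=np.float32),
                   (no_of_positions, matrix_row, matrix_num))
    for i in range(1, matrix_num):
        x[:, :, i] -= x[:, :, 0]   # express channel i relative to the reference channel
    return x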
Example 7
def MergeVcf_illumina(args):
    # region-based VCF merge for Illumina, as read realignment can shift candidate variants or cause them to be missed.
    bed_fn_prefix = args.bed_fn_prefix
    output_fn = args.output_fn
    full_alignment_vcf_fn = args.full_alignment_vcf_fn
    pileup_vcf_fn = args.pileup_vcf_fn  # true vcf var
    contig_name = args.ctgName
    QUAL = args.qual
    bed_fn = None
    if not os.path.exists(bed_fn_prefix):
        exit(
            log_error("[ERROR] Input directory: {} does not exist!".format(
                bed_fn_prefix)))

    all_files = os.listdir(bed_fn_prefix)
    all_files = [
        item for item in all_files if item.startswith(contig_name + '.')
    ]
    if len(all_files) != 0:
        bed_fn = os.path.join(bed_fn_prefix,
                              "full_aln_regions_{}".format(contig_name))
        with open(bed_fn, 'w') as output_file:
            for file in all_files:
                with open(os.path.join(bed_fn_prefix, file)) as f:
                    output_file.write(f.read())

    is_haploid_precise_mode_enabled = args.haploid_precise
    is_haploid_sensitive_mode_enabled = args.haploid_sensitive
    print_ref = args.print_ref_calls

    tree = bed_tree_from(bed_file_path=bed_fn,
                         padding=param.no_of_positions,
                         contig_name=contig_name)
    unzip_process = subprocess_popen(
        shlex.split("gzip -fdc %s" % (pileup_vcf_fn)))
    output_dict = {}
    header = []
    pileup_count = 0
    for row in unzip_process.stdout:
        if row[0] == '#':
            header.append(row)
            continue
        columns = row.strip().split()
        ctg_name = columns[0]
        if contig_name is not None and ctg_name != contig_name:
            continue
        pos = int(columns[1])
        qual = float(columns[5])
        pass_bed = is_region_in(tree, ctg_name, pos)
        ref_base, alt_base = columns[3], columns[4]
        is_reference = (alt_base == "." or ref_base == alt_base)
        if is_haploid_precise_mode_enabled:
            row = update_haploid_precise_genotype(columns)
        if is_haploid_sensitive_mode_enabled:
            row = update_haploid_sensitive_genotype(columns)

        if not pass_bed:
            if not is_reference:
                row = MarkLowQual(row, QUAL, qual)
                output_dict[pos] = row
                pileup_count += 1
            elif print_ref:
                output_dict[pos] = row
                pileup_count += 1

    unzip_process.stdout.close()
    unzip_process.wait()

    realigned_vcf_unzip_process = subprocess_popen(
        shlex.split("gzip -fdc %s" % (full_alignment_vcf_fn)))
    realigned_read_num = 0
    for row in realigned_vcf_unzip_process.stdout:
        if row[0] == '#':
            continue
        columns = row.strip().split()
        ctg_name = columns[0]
        if contig_name is not None and ctg_name != contig_name:
            continue

        pos = int(columns[1])
        qual = float(columns[5])
        ref_base, alt_base = columns[3], columns[4]
        is_reference = (alt_base == "." or ref_base == alt_base)

        if is_haploid_precise_mode_enabled:
            row = update_haploid_precise_genotype(columns)
        if is_haploid_sensitive_mode_enabled:
            row = update_haploid_sensitive_genotype(columns)

        if is_region_in(tree, ctg_name, pos):
            if not is_reference:
                row = MarkLowQual(row, QUAL, qual)
                output_dict[pos] = row
                realigned_read_num += 1
            elif print_ref:
                output_dict[pos] = row
                realigned_read_num += 1

    logging.info('[INFO] Pileup variant positions processed in {}: {}'.format(
        contig_name, pileup_count))
    logging.info(
        '[INFO] Realigned variant positions processed in {}: {}'.format(
            contig_name, realigned_read_num))
    realigned_vcf_unzip_process.stdout.close()
    realigned_vcf_unzip_process.wait()

    with open(output_fn, 'w') as output_file:
        output_list = header + [
            output_dict[pos] for pos in sorted(output_dict.keys())
        ]
        output_file.write(''.join(output_list))
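The merge policy above is positional: pileup calls are kept only outside the realigned regions, full-alignment calls only inside them, and the combined records are written sorted by position under the pileup header. A minimal sketch of that policy (names are illustrative, not the source API):

def merge_calls(pileup_rows, full_alignment_rows, in_realigned_region):
    # both inputs are iterables of (pos, vcf_row); in_realigned_region(pos) -> bool
    merged = {}
    for pos, row in pileup_rows:
        if not in_realigned_region(pos):
            merged[pos] = row
    for pos, row in full_alignment_rows:
        if in_realigned_region(pos):
            merged[pos] = row
    return [merged[pos] for pos in sorted(merged)]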
Example 8
def CheckEnvs(args):
    basedir = os.path.dirname(__file__)
    bam_fn = file_path_from(args.bam_fn, exit_on_not_found=True)
    ref_fn = file_path_from(args.ref_fn, exit_on_not_found=True)
    fai_fn = file_path_from(args.ref_fn, suffix=".fai", exit_on_not_found=True, sep='.')
    bai_fn = file_path_from(args.bam_fn, suffix=".bai", sep='.')
    csi_fn = file_path_from(args.bam_fn, suffix=".csi", sep='.')
    if bai_fn is None and csi_fn is None:
        sys.exit(log_error("[ERROR] Neither BAM index file {} nor {} found".format(args.bam_fn + '.bai', args.bam_fn + '.csi')))
    bed_fn = file_path_from(args.bed_fn)
    vcf_fn = file_path_from(args.vcf_fn)
    tree = bed_tree_from(bed_file_path=bed_fn)

    # create temp file folder
    output_fn_prefix = args.output_fn_prefix
    output_fn_prefix = folder_path_from(output_fn_prefix, create_not_found=True)
    log_path = folder_path_from(os.path.join(output_fn_prefix, 'log'), create_not_found=True)
    tmp_file_path = folder_path_from(os.path.join(output_fn_prefix, 'tmp'), create_not_found=True)
    split_bed_path = folder_path_from(os.path.join(tmp_file_path, 'split_beds'),
                                      create_not_found=True) if bed_fn or vcf_fn else None
    pileup_vcf_path = folder_path_from(os.path.join(tmp_file_path, 'pileup_output'), create_not_found=True)
    merge_vcf_path = folder_path_from(os.path.join(tmp_file_path, 'merge_output'), create_not_found=True)
    phase_output_path = folder_path_from(os.path.join(tmp_file_path, 'phase_output'), create_not_found=True)
    gvcf_temp_output_path = folder_path_from(os.path.join(tmp_file_path, 'gvcf_tmp_output'), create_not_found=True)
    full_alignment_output_path = folder_path_from(os.path.join(tmp_file_path, 'full_alignment_output'),
                                                  create_not_found=True)
    phase_vcf_path = folder_path_from(os.path.join(phase_output_path, 'phase_vcf'), create_not_found=True)
    phase_bam_path = folder_path_from(os.path.join(phase_output_path, 'phase_bam'), create_not_found=True)
    candidate_bed_path = folder_path_from(os.path.join(full_alignment_output_path, 'candidate_bed'),
                                          create_not_found=True)

    # environment parameters
    pypy = args.pypy
    samtools = args.samtools
    whatshap = args.whatshap
    parallel = args.parallel
    qual = args.qual
    var_pct_full = args.var_pct_full
    ref_pct_full = args.ref_pct_full
    snp_min_af = args.snp_min_af
    indel_min_af = args.indel_min_af
    min_contig_size = args.min_contig_size
    sample_name = args.sampleName
    contig_name_list = os.path.join(tmp_file_path, 'CONTIGS')
    chunk_list = os.path.join(tmp_file_path, 'CHUNK_LIST')

    legal_range_from(param_name="qual", x=qual, min_num=0, exit_out_of_range=True)
    legal_range_from(param_name="var_pct_full", x=var_pct_full, min_num=0, max_num=1, exit_out_of_range=True)
    legal_range_from(param_name="ref_pct_full", x=ref_pct_full, min_num=0, max_num=1, exit_out_of_range=True)
    legal_range_from(param_name="snp_min_af", x=snp_min_af, min_num=0, max_num=1, exit_out_of_range=True)
    legal_range_from(param_name="indel_min_af", x=indel_min_af, min_num=0, max_num=1, exit_out_of_range=True)
    if ref_pct_full > 0.3:
        print(log_warning(
            "[WARNING] For efficiency, we use a maximum 30% reference candidates for full-alignment calling"))
    tool_version = {
        'python': LooseVersion(sys.version.split()[0]),
        'pypy': check_version(tool=pypy, pos=0, is_pypy=True),
        'samtools': check_version(tool=samtools, pos=1),
        'whatshap': check_version(tool=whatshap, pos=1),
        'parallel': check_version(tool=parallel, pos=2),
    }
    check_tools_version(tool_version, required_tool_version)

    is_include_all_contigs = args.include_all_ctgs
    is_bed_file_provided = bed_fn is not None
    is_known_vcf_file_provided = vcf_fn is not None

    if is_known_vcf_file_provided and is_bed_file_provided:
        sys.exit(log_error("[ERROR] Please provide either --vcf_fn or --bed_fn only"))

    if is_known_vcf_file_provided:
        known_vcf_contig_set = split_extend_vcf(vcf_fn=vcf_fn, output_fn=split_bed_path)

    ctg_name_list = args.ctg_name
    is_ctg_name_list_provided = ctg_name_list is not None and ctg_name_list != "EMPTY"
    contig_set = set(ctg_name_list.split(',')) if is_ctg_name_list_provided else set()

    if is_ctg_name_list_provided and is_bed_file_provided:
        print(log_warning("[WARNING] both --ctg_name and --bed_fn provided, will only proceed contigs in intersection"))

    if is_ctg_name_list_provided and is_known_vcf_file_provided:
        print(log_warning("[WARNING] both --ctg_name and --vcf_fn provided, will only proceed contigs in intersection"))

    if is_ctg_name_list_provided:

        contig_set = contig_set.intersection(
            set(tree.keys())) if is_bed_file_provided else contig_set

        contig_set = contig_set.intersection(
            known_vcf_contig_set) if is_known_vcf_file_provided else contig_set
    else:
        contig_set = contig_set.union(
            set(tree.keys())) if is_bed_file_provided else contig_set

        contig_set = contig_set.union(
            known_vcf_contig_set) if is_known_vcf_file_provided else contig_set

    # if the chunks implied by the default chunk number would be too small (or too large), warn later so the user can increase (or decrease) the chunk number
    default_chunk_num = args.chunk_num
    DEFAULT_CHUNK_SIZE = args.chunk_size
    contig_length_list = []
    contig_chunk_num = {}

    with open(fai_fn, 'r') as fai_fp:
        for row in fai_fp:
            columns = row.strip().split("\t")
            contig_name, contig_length = columns[0], int(columns[1])
            if not is_include_all_contigs and (
            not (is_bed_file_provided or is_ctg_name_list_provided or is_known_vcf_file_provided)) and str(
                    contig_name) not in major_contigs:
                continue

            if is_bed_file_provided and contig_name not in tree:
                continue
            if is_ctg_name_list_provided and contig_name not in contig_set:
                continue
            if is_known_vcf_file_provided and contig_name not in contig_set:
                continue

            if min_contig_size > 0 and contig_length < min_contig_size:
                print(log_warning(
                    "[WARNING] {} contig length {} is smaller than minimum contig size {}, will skip it!".format(contig_name, contig_length, min_contig_size)))
                if contig_name in contig_set:
                    contig_set.remove(contig_name)
                continue
            contig_set.add(contig_name)
            contig_length_list.append(contig_length)
            chunk_num = int(
                contig_length / float(DEFAULT_CHUNK_SIZE)) + 1 if contig_length % DEFAULT_CHUNK_SIZE else int(
                contig_length / float(DEFAULT_CHUNK_SIZE))
            contig_chunk_num[contig_name] = max(chunk_num, 1)

    if default_chunk_num > 0:
        min_chunk_length = min(contig_length_list) / float(default_chunk_num)
        max_chunk_length = max(contig_length_list) / float(default_chunk_num)

    contigs_order = major_contigs_order + list(contig_set)

    sorted_contig_list = sorted(list(contig_set), key=lambda x: contigs_order.index(x))

    found_contig = True
    if not len(contig_set):
        if is_bed_file_provided:
            all_contig_in_bed = ' '.join(list(tree.keys()))
            print(log_warning("[WARNING] No contig intersection found by --bed_fn, contigs in BED {}: {}".format(bed_fn, all_contig_in_bed)))
        if is_known_vcf_file_provided:
            all_contig_in_vcf = ' '.join(list(known_vcf_contig_set))
            print(log_warning("[WARNING] No contig intersection found by --vcf_fn, contigs in VCF {}: {}".format(vcf_fn, all_contig_in_vcf)))
        if is_ctg_name_list_provided:
            all_contig_in_ctg_name = ' '.join(ctg_name_list.split(','))
            print(log_warning("[WARNING] No contig intersection found by --ctg_name, contigs in contigs list: {}".format(all_contig_in_ctg_name)))
        found_contig = False
    else:
        for c in sorted_contig_list:
            if c not in contig_chunk_num:
                print(log_warning(("[WARNING] Contig {} given but not found in reference fai file".format(c))))

        # check contig in bam have support reads
        sorted_contig_list, found_contig = check_contig_in_bam(bam_fn=bam_fn, sorted_contig_list=sorted_contig_list,
                                                               samtools=samtools)

    if not found_contig:
        # output header only to merge_output.vcf.gz
        output_fn = os.path.join(output_fn_prefix, "merge_output.vcf")
        output_header(output_fn=output_fn, reference_file_path=ref_fn, sample_name=sample_name)
        compress_index_vcf(output_fn)
        print(log_warning(
            ("[WARNING] No contig intersection found, output header only in {}").format(output_fn + ".gz")))
        with open(contig_name_list, 'w') as output_file:
            output_file.write("")
        return

    print('[INFO] Call variant in contigs: {}'.format(' '.join(sorted_contig_list)))
    print('[INFO] Chunk number for each contig: {}'.format(
        ' '.join([str(contig_chunk_num[c]) for c in sorted_contig_list])))

    if default_chunk_num > 0 and max_chunk_length > MAX_CHUNK_LENGTH:
        print(log_warning(
            '[WARNING] Current maximum chunk size {} is larger than default maximum chunk size {}, you may set a larger chunk_num by setting --chunk_num=$ for better parallelism.'.format(
                max_chunk_length, MAX_CHUNK_LENGTH)))

    elif default_chunk_num > 0 and min_chunk_length < MIN_CHUNK_LENGTH:
        print(log_warning(
            '[WARNING] Current minimum chunk size {} is smaller than default minimum chunk size {}, you may set a smaller chunk_num by setting --chunk_num=$.'.format(
                min_chunk_length, MIN_CHUNK_LENGTH)))

    if default_chunk_num == 0 and max(contig_length_list) < DEFAULT_CHUNK_SIZE / 5:
        print(log_warning(
            '[WARNING] Current maximum contig length {} is much smaller than default chunk size {}, you may set a smaller chunk size by setting --chunk_size=$ for better parallelism.'.format(
                max(contig_length_list), DEFAULT_CHUNK_SIZE)))

    if is_bed_file_provided:
        split_extend_bed(bed_fn=bed_fn, output_fn=split_bed_path, contig_set=contig_set)

    with open(contig_name_list, 'w') as output_file:
        output_file.write('\n'.join(sorted_contig_list))

    with open(chunk_list, 'w') as output_file:
        for contig_name in sorted_contig_list:
            chunk_num = contig_chunk_num[contig_name]
            for chunk_id in range(1, chunk_num + 1):
                output_file.write(contig_name + ' ' + str(chunk_id) + ' ' + str(chunk_num) + '\n')
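
CheckEnvs finishes by splitting every selected contig into chunks of roughly --chunk_size bases (ceiling division, at least one chunk per contig) and writing one "contig chunk_id chunk_num" line per chunk to the CHUNK_LIST file. Below is a small sketch of that bookkeeping, using hypothetical contig lengths.

# Sketch of the per-contig chunk bookkeeping used above (ceiling division,
# minimum of one chunk per contig). The contig lengths below are hypothetical.
def chunk_lines(contig_lengths, chunk_size=5000000):
    lines = []
    for contig, length in contig_lengths.items():
        chunk_num = max(1, -(-length // chunk_size))  # ceiling division
        for chunk_id in range(1, chunk_num + 1):
            lines.append("{} {} {}".format(contig, chunk_id, chunk_num))
    return lines

if __name__ == "__main__":
    print("\n".join(chunk_lines({"chr20": 64444167, "chrM": 16569})))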
Example no. 9
0
def get_training_array(tensor_fn, var_fn, bed_fn, bin_fn, shuffle=True, is_allow_duplicate_chr_pos=True, chunk_id=None,
                       chunk_num=None, platform='ont', pileup=False, maximum_non_variant_ratio=None, candidate_details_fn_prefix=None):

    """
    Generate training array for training. here pytables with blosc:lz4hc are used for extreme fast compression and decompression,
    which can meet the requirement of gpu utilization. lz4hc decompression allows speed up training array decompression 4~5x compared
    with tensorflow tfrecord file format, current gpu utilization could reach over 85% with only 10G memory.
    tensor_fn: string format tensor acquired from CreateTensorPileup or CreateTensorFullAlign, include contig name position, tensor matrix, alternative information.
    var_fn: simplified variant(vcf) format from GetTruths, which include contig name, position, reference base, alternative base, genotype.
    bin_fn: pytables format output bin file name.
    shuffle: whether apply index shuffling when generating training data, default True, which would promote robustness.
    is_allow_duplicate_chr_pos: whether allow duplicate positions when training, if there exists downsampled data, lower depth will add a random prefix character.
    chunk_id: specific chunk id works with total chunk_num for parallel execution. Here will merge all tensor file with sampe prefix.
    chunk_num: total chunk number for parallel execution. Each chunk refer to a smaller reference regions.
    platform: platform for tensor shape, ont give a larger maximum depth compared with pb and illumina.
    pileup: whether in pileup mode. Define two calling mode, pileup or full alignment.
    maximum_non_variant_ratio: define a maximum non variant ratio for training, we always expect use more non variant data, while it would greatly increase training
    time, especially in ont data, here we usually use 1:1 or 1:2 for variant candidate: non variant candidate.
    candidate_details_fn_prefix: a counter to calculate total variant and non variant from the information in alternative file.
    """

    tree = bed_tree_from(bed_file_path=bed_fn)
    is_tree_empty = len(tree.keys()) == 0
    Y_true_var, miss_variant_set, truth_alt_dict = variant_map_from(var_fn, tree, is_tree_empty)
    Y = copy.deepcopy(Y_true_var)

    global param
    float_type = 'int32'
    if pileup:
        import shared.param_p as param
    else:
        import shared.param_f as param
        float_type = 'int8'

    import tables
    FILTERS = tables.Filters(complib='blosc:lz4hc', complevel=5)
    tensor_shape = param.ont_input_shape if platform == 'ont' else param.input_shape

    subprocess_list = []
    if tensor_fn == 'PIPE':
        subprocess_list.append(sys.stdin)
    elif os.path.exists(tensor_fn):
        subprocess_list.append(subprocess_popen(shlex.split("{} -fdc {}".format(param.zstd, tensor_fn))).stdout)
    # if the file path does not exist, select all files matching the prefix
    else:
        tensor_fn = tensor_fn.split('/')
        directory, file_prefix = '/'.join(tensor_fn[:-1]), tensor_fn[-1]
        all_file_name = []
        for file_name in os.listdir(directory):
            if file_name.startswith(file_prefix + '_') or file_name.startswith(
                    file_prefix + '.'):  # require '_' or '.' after the prefix to avoid matching other files that merely share it
                all_file_name.append(file_name)
        all_file_name = sorted(all_file_name)
        if chunk_id is not None:
            chunk_size = len(all_file_name) // chunk_num if len(all_file_name) % chunk_num == 0 else len(
                all_file_name) // chunk_num + 1
            chunk_start = chunk_size * chunk_id
            chunk_end = chunk_start + chunk_size
            all_file_name = all_file_name[chunk_start:chunk_end]
        if not len(all_file_name):
            print("[INFO] chunk_id exceed total file number, skip chunk", file=sys.stderr)
            return 0
        for file_name in all_file_name:
            subprocess_list.append(
                subprocess_popen(shlex.split("{} -fdc {}".format(param.zstd, os.path.join(directory, file_name)))).stdout)

    tables.set_blosc_max_threads(64)
    int_atom = tables.Atom.from_dtype(np.dtype(float_type))
    string_atom = tables.StringAtom(itemsize=param.no_of_positions + 50)
    long_string_atom = tables.StringAtom(itemsize=5000)  # max alt_info length
    table_file = tables.open_file(bin_fn, mode='w', filters=FILTERS)
    table_file.create_earray(where='/', name='position_matrix', atom=int_atom, shape=[0] + tensor_shape,
                             filters=FILTERS)
    table_file.create_earray(where='/', name='position', atom=string_atom, shape=(0, 1), filters=FILTERS)
    table_file.create_earray(where='/', name='label', atom=int_atom, shape=(0, param.label_size), filters=FILTERS)
    table_file.create_earray(where='/', name='alt_info', atom=long_string_atom, shape=(0, 1), filters=FILTERS)

    table_dict = update_table_dict()

    # use a generator to avoid high memory usage
    bin_reader_generator = partial(bin_reader_generator_from, 
                                   Y_true_var=Y_true_var,
                                   Y=Y,
                                   is_tree_empty=is_tree_empty,
                                   tree=tree,
                                   miss_variant_set=miss_variant_set,
                                   truth_alt_dict=truth_alt_dict,
                                   is_allow_duplicate_chr_pos=is_allow_duplicate_chr_pos,
                                   maximum_non_variant_ratio=maximum_non_variant_ratio)

    total = 0
    total_compressed = 0
    for fin in subprocess_list:
        bin_g = bin_reader_generator(tensor_fn=fin)
        completed = False
        while not completed:
            try:
                X, total, completed = next(bin_g)
            except StopIteration:
                # generator exhausted; any batch it yielded earlier has already been processed
                break

            if X is None or not len(X):
                break
            all_chr_pos = sorted(X.keys())
            if shuffle:
                np.random.shuffle(all_chr_pos)
            for key in all_chr_pos:

                string, alt_info, seq = X[key]
                del X[key]
                label = None
                if key in Y:
                    label = Y[key]
                    pos = key + ':' + seq
                    if not is_allow_duplicate_chr_pos:
                        del Y[key]
                elif is_allow_duplicate_chr_pos:
                    # strip the random prefix character added to duplicated (downsampled) positions
                    tmp_key = key[1:]
                    label = Y.get(tmp_key)
                    pos = tmp_key + ':' + seq
                if label is None:
                    print("[WARNING] no truth label found for {}, skipping".format(key), file=sys.stderr)
                    continue
                total_compressed = write_table_dict(table_dict, string, label, pos, total_compressed, alt_info,
                                                    tensor_shape, pileup)

                if total_compressed % 500 == 0 and total_compressed > 0:
                    table_dict = write_table_file(table_file, table_dict, tensor_shape, param.label_size, float_type)

                if total_compressed % 50000 == 0:
                    print("[INFO] Compressed %d tensor" % (total_compressed), file=sys.stderr)
        fin.close()

    if total_compressed % 500 != 0 and total_compressed > 0:
        table_dict = write_table_file(table_file, table_dict, tensor_shape, param.label_size, float_type)

    table_file.close()
    print("[INFO] Compressed %d/%d tensor" % (total_compressed, total), file=sys.stderr)
Example no. 10
0
def CreateTensorPileup(args):
    """
    Create pileup tensor for pileup model training or calling.
    Use slide window to scan the whole candidate regions, keep all candidates over specific minimum allelic frequency
    and minimum depth, use samtools mpileup to store pileup info for pileup tensor generation. Only scan candidate
    regions once, we could directly get all variant candidates directly.
    """
    ctg_start = args.ctgStart
    ctg_end = args.ctgEnd
    fasta_file_path = args.ref_fn
    ctg_name = args.ctgName
    bam_file_path = args.bam_fn
    chunk_id = args.chunk_id - 1 if args.chunk_id else None  # 1-base to 0-base
    chunk_num = args.chunk_num
    minimum_snp_af_for_candidate = args.snp_min_af
    minimum_indel_af_for_candidate = args.indel_min_af
    min_coverage = args.minCoverage
    min_mapping_quality = args.minMQ
    platform = args.platform

    vcf_fn = file_path_from(args.vcf_fn)
    is_known_vcf_file_provided = vcf_fn is not None
    confident_bed_fn = file_path_from(args.extend_bed)
    is_confident_bed_file_given = confident_bed_fn is not None
    extend_bed = file_path_from(args.extend_bed)
    is_extend_bed_file_given = extend_bed is not None
    fast_mode = args.fast_mode
    call_snp_only = args.call_snp_only
    enable_long_indel = args.enable_long_indel
    # 1-based regions [start, end] (start and end inclusive)
    tree, bed_start, bed_end = bed_tree_from(bed_file_path=extend_bed,
                                             contig_name=ctg_name,
                                             return_bed_region=True)

    fai_fn = file_path_from(fasta_file_path,
                            suffix=".fai",
                            exit_on_not_found=True,
                            sep='.')

    fast_mode = platform == 'ont' and fast_mode
    minimum_snp_af_for_candidate = max(
        minimum_snp_af_for_candidate, param.min_af_dict[platform]
    ) if fast_mode else minimum_snp_af_for_candidate
    min_coverage = max(min_coverage, 4) if fast_mode else min_coverage
    max_indel_length = param.maximum_variant_length_that_need_infer if not enable_long_indel else param.maximum_variant_length_that_need_infer_include_long_indel

    if not is_confident_bed_file_given and chunk_id is not None:
        contig_length = 0
        with open(fai_fn, 'r') as fai_fp:
            for row in fai_fp:
                columns = row.strip().split("\t")

                contig_name = columns[0]
                if contig_name != ctg_name:
                    continue
                contig_length = int(columns[1])
        chunk_size = contig_length // chunk_num + 1 if contig_length % chunk_num else contig_length // chunk_num
        ctg_start = chunk_size * chunk_id  # 0-base to 1-base
        ctg_end = ctg_start + chunk_size

    if is_confident_bed_file_given and chunk_id is not None:
        chunk_size = (bed_end - bed_start) // chunk_num + 1 if (
            bed_end - bed_start) % chunk_num else (bed_end -
                                                   bed_start) // chunk_num
        ctg_start = bed_start + 1 + chunk_size * chunk_id  # 0-base to 1-base
        ctg_end = ctg_start + chunk_size

    if is_known_vcf_file_provided and chunk_id is not None:
        known_variants_list = vcf_candidates_from(vcf_fn=vcf_fn,
                                                  contig_name=ctg_name)
        total_variants_size = len(known_variants_list)
        chunk_variants_size = total_variants_size // chunk_num if total_variants_size % chunk_num == 0 else total_variants_size // chunk_num + 1
        chunk_start_pos = chunk_id * chunk_variants_size
        known_variants_set = set(
            known_variants_list[chunk_start_pos:chunk_start_pos +
                                chunk_variants_size])
        if len(known_variants_set) == 0:
            return [], [], []
        ctg_start, ctg_end = min(known_variants_set), max(known_variants_set)

    is_ctg_name_given = ctg_name is not None
    is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None
    if is_ctg_range_given:
        ctg_start = max(1, ctg_start)
        extend_start = max(1, ctg_start - no_of_positions)
        extend_end = ctg_end + no_of_positions

    region_str = "{}:{}-{}".format(ctg_name, extend_start, extend_end)
    region = Region.from_string(region_str)

    confident_bed_tree = bed_tree_from(bed_file_path=confident_bed_fn,
                                       contig_name=ctg_name,
                                       bed_ctg_start=extend_start,
                                       bed_ctg_end=extend_end)

    if args.gvcf:
        from preprocess.utils import variantInfoCalculator
        nonVariantCaller = variantInfoCalculator(
            gvcfWritePath=args.temp_file_dir,
            ref_path=args.ref_fn,
            bp_resolution=args.bp_resolution,
            ctgName=ctg_name,
            sample_name='.'.join(
                [args.sampleName, ctg_name,
                 str(ctg_start),
                 str(ctg_end)]),
            p_err=args.base_err,
            gq_bin_size=args.gq_bin_size)

    chunk_result, all_alt_info_list, gvcf_output = pileup_counts_clair3(
        region,
        bam=bam_file_path,
        fasta=fasta_file_path,
        min_depth=min_coverage,
        min_snp_af=minimum_snp_af_for_candidate,
        min_indel_af=minimum_indel_af_for_candidate,
        min_mq=min_mapping_quality,
        max_indel_length=max_indel_length,
        call_snp_only=call_snp_only,
        max_depth=param.max_depth,
        gvcf=args.gvcf)

    # slice each candidate's tensor according to the alternative information
    np_pileup_data, all_position_info, all_alt_info = [], [], []
    for idx, (pos, pos_info, alt_info) in enumerate(all_alt_info_list):
        pos = int(pos)
        pass_confident_bed = not is_confident_bed_file_given or is_region_in(
            tree=confident_bed_tree,
            contig_name=ctg_name,
            region_start=pos - 1,
            region_end=pos + 1)

        pass_vcf_region = not is_known_vcf_file_provided or (
            is_known_vcf_file_provided and pos in known_variants_set)

        if not pass_confident_bed or not pass_vcf_region:
            continue
        start, end = pos - flanking_base_num, pos + flanking_base_num + 1
        for result in chunk_result:
            if start - 1 >= result[1][0][0] and end <= result[1][-1][0]:
                offset = start - result[1][0][0] - 1
                tensor = result[0][offset:offset + no_of_positions]
                # skip truncated windows, mainly caused by missing coverage in the flanking region
                if tensor.shape != (no_of_positions, channel_size):
                    continue
                # skip candidates with an all-zero column (no coverage) at any flanking position
                if np.sum(np.sum(tensor == 0, axis=1) == channel_size) > 0:
                    continue
                np_pileup_data.append(tensor)
                all_position_info.append(pos_info)
                all_alt_info.append(alt_info)
    np_pileup_data = np.array(np_pileup_data, dtype=np.int32)

    if args.gvcf:

        from shared.utils import reference_sequence_from, region_from
        samtools_execute_command = args.samtools
        ref_regions = []
        reference_start, reference_end = ctg_start - param.expandReferenceRegion, ctg_end + param.expandReferenceRegion
        reference_start = 1 if reference_start < 1 else reference_start
        ref_regions.append(
            region_from(ctg_name=ctg_name,
                        ctg_start=reference_start,
                        ctg_end=reference_end))
        reference_sequence = reference_sequence_from(
            samtools_execute_command=samtools_execute_command,
            fasta_file_path=fasta_file_path,
            regions=ref_regions)

        offset = 0 if ctg_start == 1 else 1
        empty_pileup_flag = False
        start = ctg_start - extend_start + offset
        end = ctg_end + 1 - extend_start + offset
        if sum(gvcf_output[1][start:end]) == 0:
            empty_pileup_flag = True
        for pos in range(ctg_start, ctg_end):
            if empty_pileup_flag:
                break
            ref_count = gvcf_output[0][pos - extend_start + offset]
            total_count = gvcf_output[1][pos - extend_start + offset]
            if pos - reference_start >= len(reference_sequence):
                continue
            reference_base = reference_sequence[pos - reference_start]
            if (ref_count == 0 and total_count == 0):
                cur_site_info = {
                    'chr': ctg_name,
                    'pos': pos,
                    'ref': reference_base,
                    'n_total': 0,
                    'n_ref': 0
                }
                nonVariantCaller.make_gvcf_online(cur_site_info)
                continue

            cur_site_info = {
                'chr': ctg_name,
                'pos': pos,
                'ref': reference_base,
                'n_total': total_count,
                'n_ref': ref_count
            }
            nonVariantCaller.make_gvcf_online(cur_site_info)
        if len(nonVariantCaller.current_block) != 0:
            nonVariantCaller.write_to_gvcf_batch(
                nonVariantCaller.current_block, nonVariantCaller.cur_min_DP,
                nonVariantCaller.cur_raw_gq)

        if empty_pileup_flag:
            nonVariantCaller.write_empty_pileup(ctg_name, ctg_start, ctg_end)
        nonVariantCaller.close_vcf_writer()

    return np_pileup_data, all_position_info, all_alt_info
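
For every candidate position, the loop above cuts a window of no_of_positions columns centred on the candidate out of the per-chunk pileup counts, and discards windows that are truncated at a chunk boundary or contain an all-zero (no-coverage) column. Below is a NumPy-only sketch of that slicing and filtering; the window and channel sizes are hypothetical, and the helper is not part of the original code.

# Sketch of the per-candidate window slicing and the all-zero-column filter used above.
# no_of_positions and channel_size defaults are hypothetical placeholders.
import numpy as np

def slice_candidate_window(counts, chunk_start, center, no_of_positions=33, channel_size=18):
    """counts: (chunk_length, channel_size) pileup counts whose first row is 1-based position chunk_start."""
    flank = no_of_positions // 2
    offset = center - flank - chunk_start
    tensor = counts[offset:offset + no_of_positions]
    if tensor.shape != (no_of_positions, channel_size):
        return None  # window truncated at a chunk boundary
    if np.any(np.sum(tensor == 0, axis=1) == channel_size):
        return None  # at least one position in the window has no coverage
    return tensor

if __name__ == "__main__":
    counts = np.random.randint(1, 5, size=(200, 18))
    window = slice_candidate_window(counts, chunk_start=1000, center=1100)
    print(None if window is None else window.shape)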