def output_header(output_fn, reference_file_path, sample_name='SAMPLE'):
    """Write a VCF v4.2 header to ``output_fn``.

    The header consists of the fixed FILTER/INFO/FORMAT meta lines, one
    ``##contig`` line per row of the reference ``.fai`` index (only when
    ``reference_file_path`` is not None) and the ``#CHROM`` column line.

    Args:
        output_fn: path of the VCF file to create; an existing file is truncated.
        reference_file_path: reference FASTA path, or None to omit the
            ``##contig`` lines. The matching ``.fai`` index is resolved with
            ``file_path_from(..., exit_on_not_found=True)``.
        sample_name: name written as the last column of the ``#CHROM`` line.

    Note:
        The ``#CHROM`` line is written WITHOUT a trailing newline, exactly as
        before, so whatever appends records to this file keeps working.
    """
    from textwrap import dedent

    header = dedent("""\
        ##fileformat=VCFv4.2
        ##FILTER=<ID=PASS,Description="All filters passed">
        ##FILTER=<ID=LowQual,Description="Low quality variant">
        ##FILTER=<ID=RefCall,Description="Reference call">
        ##INFO=<ID=P,Number=0,Type=Flag,Description="Result from pileup calling">
        ##INFO=<ID=F,Number=0,Type=Flag,Description="Result from full-alignment calling">
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
        ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
        ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
        ##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
        ##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods rounded to the closest integer">
        ##FORMAT=<ID=AF,Number=1,Type=Float,Description="Estimated allele frequency in the range of [0,1]">"""
                    ) + '\n'

    # The context manager guarantees the output handle is closed (and flushed)
    # even if resolving or parsing the .fai index fails part-way through.
    with open(output_fn, "w") as output_file:
        output_file.write(header)

        if reference_file_path is not None:
            reference_index_file_path = file_path_from(reference_file_path, suffix=".fai", exit_on_not_found=True, sep='.')
            with open(reference_index_file_path, "r") as fai_fp:
                for row in fai_fp:
                    # .fai rows are tab separated: column 0 is the contig name, column 1 its length
                    columns = row.strip().split("\t")
                    contig_name, contig_size = columns[0], columns[1]
                    output_file.write("##contig=<ID=%s,length=%s>" % (contig_name, contig_size) + '\n')

        output_file.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s' % (sample_name))
def select_qual_from_stdin(args):
    """
    Select global quality cut-offs for full-alignment calling from a pileup VCF read on stdin.

    False positive pileup variants and true variants missed by pileup calling would mostly have
    a low quality score (reference quality score for missing variants), so only a proportion of
    the lowest-quality calls is sent to full alignment while high-quality pileup output is kept,
    as full-alignment calling is substantially slower than pileup calling.

    Attributes read from ``args``:
        var_pct_full: proportion (0..1) of lowest-quality variant calls to select.
        ref_pct_full: proportion (0..1) of lowest-quality ``0/0`` reference calls to select;
            falls back to ``var_pct_full`` when it is not set.
        vcf_fn: optional known-variants VCF (genotyping mode).
        qual_fn: output file name, default ``"qual"``.
        output_fn: output directory; when falsy nothing is written to disk.

    Side effects:
        Prints the chosen cut-offs (and warnings when a cut-off list is empty) and, when
        ``args.output_fn`` is set, writes ``"<variant cut-off> <reference cut-off>"`` to
        ``<output_fn>/<qual_fn>``.
    """
    var_pct_full = args.var_pct_full
    qual_fn = args.qual_fn if args.qual_fn is not None else "qual"
    vcf_fn = file_path_from(args.vcf_fn)
    ref_pct_full = args.ref_pct_full if args.ref_pct_full else var_pct_full
    # The default proportions (a maximum of 30% reference candidates for efficiency, and 10% for
    # the ont platform unless a known vcf file is provided) are set directly in run_clair3.sh
    # from v0.1-r5, so no platform specific clamping is applied here.

    # If a known vcf file is provided, use the user-defined proportion as-is (an explicit 0 is
    # honoured). The previous code multiplied args.ref_pct_full directly and raised TypeError
    # when it was None; fall back to the proportion computed above in that case.
    if vcf_fn is not None and args.ref_pct_full is not None:
        ref_pct_full = args.ref_pct_full

    variant_qual_list = []
    ref_qual_list = []
    for row in stdin:
        # skip blank rows (they used to raise IndexError) and VCF header rows
        if not row.strip() or row[0] == '#':
            continue
        columns = row.rstrip().split()
        # VCF column 6 is QUAL, column 10 is the sample field whose first ':' item is GT
        qual, gt_info = columns[5], columns[9]
        genotype = gt_info.split(':')[0]
        if genotype == '0/0':
            ref_qual_list.append(float(qual))
        else:
            variant_qual_list.append(float(qual))

    ref_qual_list.sort()
    variant_qual_list.sort()

    low_variant_qual_list = variant_qual_list[:int(var_pct_full * len(variant_qual_list))]
    if len(low_variant_qual_list) == 0:
        print(log_warning(
            "[WARNING] Cannot find any low-quality 0/1 or 1/1 variant in pileup output using variant quality cut-off proportion: {}, total variants: {}".format(
                var_pct_full, len(variant_qual_list))))
        print(log_warning("[WARNING] Set low variant quality score cut-off to 0.0"))
        var_qual_cut_off = 0.0
    else:
        # the highest quality inside the selected low-quality slice is the cut-off
        var_qual_cut_off = low_variant_qual_list[-1]

    low_ref_qual_list = ref_qual_list[:int(ref_pct_full * len(ref_qual_list))]
    if len(low_ref_qual_list) == 0:
        print(log_warning(
            "[WARNING] Cannot find any low-quality 0/0 reference calls in pileup output using reference quality cut-off proportion: {}, total reference calls: {}".format(
                ref_pct_full, len(ref_qual_list))))
        print(log_warning("[WARNING] Set low reference quality score cut-off to 0.0"))
        ref_qual_cut_off = 0.0
    else:
        ref_qual_cut_off = low_ref_qual_list[-1]

    print ('[INFO] Set variants quality cutoff {}'.format(round(var_qual_cut_off, 0)))
    print ('[INFO] Set reference calls quality cutoff {}'.format(round(ref_qual_cut_off, 0)))

    if args.output_fn:
        with open(os.path.join(args.output_fn, qual_fn), 'w') as output:
            output.write(str(var_qual_cut_off) + ' ' + str(ref_qual_cut_off))
def _print_vcf_header(self):
    """Emit the gVCF-flavoured VCF v4.2 header to ``self.vcf_writer``.

    Writes the fixed meta lines, then one ``##contig`` line per row of the
    reference ``.fai`` index when ``self.reference_file_path`` is not None,
    and finally the ``#CHROM`` column line ending with ``self.sampleName``.
    """
    from textwrap import dedent

    meta_block = dedent("""\
        ##fileformat=VCFv4.2
        ##FILTER=<ID=PASS,Description="All filters passed">
        ##FILTER=<ID=LowQual,Description="Low quality variant">
        ##FILTER=<ID=RefCall,Description="Reference call">
        ##INFO=<ID=P,Number=0,Type=Flag,Description="Result from pileup calling">
        ##INFO=<ID=F,Number=0,Type=Flag,Description="Result from full-alignment calling">
        ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
        ##INFO=<ID=END,Number=1,Type=Integer,Description="End position (for use with symbolic alleles)">
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
        ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
        ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
        ##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
        ##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
        ##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods rounded to the closest integer">
        ##FORMAT=<ID=AF,Number=1,Type=Float,Description="Estimated allele frequency in the range of [0,1]">"""
                        )
    print(meta_block, file=self.vcf_writer)

    if self.reference_file_path is not None:
        fai_path = file_path_from(
            self.reference_file_path, suffix=".fai", exit_on_not_found=True, sep='.')
        with open(fai_path, "r") as fai_handle:
            for fai_row in fai_handle:
                # first two tab separated .fai fields: contig name and contig length
                fields = fai_row.strip().split("\t")
                name, length = fields[0], fields[1]
                contig_line = "##contig=<ID=%s,length=%s>" % (name, length)
                print(contig_line, file=self.vcf_writer)

    column_line = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s' % (self.sampleName)
    print(column_line, file=self.vcf_writer)
def Run(args):
    """Print one ``clair.py callVarBam`` command line per reference chunk.

    Every contig listed in the reference ``.fai`` index is cut into windows of
    ``args.refChunkSize`` bases. Contigs outside ``major_contigs`` are skipped
    unless ``args.includingAllContigs`` is set. When a BED file is given, only
    windows for which ``is_region_in`` reports a hit are printed, and those
    commands also carry the ``bed_fn`` option.
    """
    script_dir = os.path.dirname(__file__)
    call_var_bam_script = script_dir + "/../clair.py callVarBam"

    # resolve executables and input files first; each of these may exit when a path is missing
    pypy_cmd = executable_command_string_from(args.pypy, exit_on_not_found=True)
    samtools_cmd = executable_command_string_from(args.samtools, exit_on_not_found=True)
    checkpoint_path = file_path_from(args.chkpnt_fn, suffix=".meta", exit_on_not_found=True)
    bam_path = file_path_from(args.bam_fn, exit_on_not_found=True)
    reference_path = file_path_from(args.ref_fn, exit_on_not_found=True)
    fai_path = file_path_from(args.ref_fn + ".fai", exit_on_not_found=True)
    bed_path = file_path_from(args.bed_fn)
    known_vcf_path = file_path_from(args.vcf_fn)

    prefix = args.output_prefix
    allele_frequency_threshold = args.threshold
    bed_tree = bed_tree_from(bed_file_path=bed_path)

    min_coverage = args.minCoverage
    sample_name = args.sampleName
    start_delay = args.delay
    tf_threads = args.tensorflowThreads
    include_all_contigs = args.includingAllContigs
    chunk_length = args.refChunkSize

    # flag style options; the helper is expected to yield nothing when the flag is unset
    left_edge_option = command_option_from(args.stop_consider_left_edge, 'stop_consider_left_edge')
    log_path_option = command_option_from(args.log_path, 'log_path', option_value=args.log_path)
    pysam_indel_option = command_option_from(args.pysam_for_all_indel_bases, 'pysam_for_all_indel_bases')
    haploid_option = command_option_from(args.haploid, 'haploid')
    ensemble_option = command_option_from(args.output_for_ensemble, 'output_for_ensemble')
    debug_option = command_option_from(args.debug, 'debug')
    qual_option = command_option_from(args.qual, 'qual', option_value=args.qual)
    fast_plotting_option = command_option_from(args.fast_plotting, 'fast_plotting')

    shared_options = [
        ExecuteCommand('python3', call_var_bam_script),
        CommandOption('chkpnt_fn', checkpoint_path),
        CommandOption('ref_fn', reference_path),
        CommandOption('bam_fn', bam_path),
        CommandOption('threshold', allele_frequency_threshold),
        CommandOption('minCoverage', min_coverage),
        CommandOption('pypy', pypy_cmd),
        CommandOption('samtools', samtools_cmd),
        CommandOption('delay', start_delay),
        CommandOption('threads', tf_threads),
        CommandOption('sampleName', sample_name),
        # optional command options
        CommandOption('vcf_fn', known_vcf_path) if known_vcf_path is not None else None,
        qual_option,
        left_edge_option,
        debug_option,
        pysam_indel_option,
        haploid_option,
        ensemble_option,
    ]

    if args.activation_only:
        activation_options = [
            CommandOptionWithNoValue('activation_only'),
            log_path_option,
            CommandOption('max_plot', args.max_plot),
            CommandOption('parallel_level', args.parallel_level),
            CommandOption('workers', args.workers),
            fast_plotting_option,
        ]
    else:
        activation_options = []

    has_bed = bed_path is not None
    base_command = command_string_from(shared_options + activation_options)

    with open(fai_path, 'r') as fai_handle:
        for fai_row in fai_handle:
            fields = fai_row.strip().split("\t")
            contig = fields[0]
            if not include_all_contigs and str(contig) not in major_contigs:
                continue

            contig_size = int(fields[1])
            chunk_start, chunk_end = 0, 0
            while chunk_end < contig_size:
                chunk_start = chunk_end
                # clamp the last window to the contig end
                chunk_end = min(chunk_start + chunk_length, contig_size)
                call_fn = "%s.%s_%d_%d.vcf" % (prefix, contig, chunk_start, chunk_end)

                region_hits_bed = has_bed and is_region_in(bed_tree, contig, chunk_start, chunk_end)
                if has_bed and not region_hits_bed:
                    continue

                per_region_options = [
                    CommandOption('ctgName', contig),
                    CommandOption('ctgStart', chunk_start),
                    CommandOption('ctgEnd', chunk_end),
                    CommandOption('call_fn', call_fn),
                    CommandOption('bed_fn', bed_path) if region_hits_bed else None
                ]
                print(base_command + " " + command_string_from(per_region_options))
def CreateTensorPileup(args):
    """
    Create pileup tensors for pileup model training or calling.

    A sliding window scans the candidate region once: `samtools mpileup` output is read row by
    row, every position is turned into a pileup tensor by `generate_tensor`, and positions that
    pass the allele-frequency / depth / BED filters (or that are listed in the known-variants
    VCF) are kept as candidates. For each candidate the tensors of the surrounding window are
    written as one tab separated line to `args.tensor_can_fn` (compressed through `param.zstd`),
    or to stdout when that path is "PIPE".

    Side effects visible in this function:
      * resets the module-level `test_pos` to None;
      * optionally writes per-candidate alternative-allele info to `args.indel_fn`;
      * when `args.gvcf` is set, feeds every site to a `variantInfoCalculator`.

    Returns None. Returns early, before any subprocess is started, when a known-variants VCF is
    given together with a chunk id and that chunk contains no variants.
    """
    ctg_start = args.ctgStart
    ctg_end = args.ctgEnd
    fasta_file_path = args.ref_fn
    ctg_name = args.ctgName
    samtools_execute_command = args.samtools
    bam_file_path = args.bam_fn
    # args.chunk_id is 1-based; a falsy value (None or 0) means "no chunking"
    chunk_id = args.chunk_id - 1 if args.chunk_id else None  # 1-base to 0-base
    chunk_num = args.chunk_num
    tensor_can_output_path = args.tensor_can_fn
    minimum_af_for_candidate = args.min_af
    minimum_snp_af_for_candidate = args.snp_min_af
    minimum_indel_af_for_candidate = args.indel_min_af
    min_coverage = args.minCoverage
    platform = args.platform
    confident_bed_fn = args.bed_fn
    is_confident_bed_file_given = confident_bed_fn is not None
    alt_fn = args.indel_fn
    extend_bed = args.extend_bed
    is_extend_bed_file_given = extend_bed is not None
    min_mapping_quality = args.minMQ
    min_base_quality = args.minBQ
    fast_mode = args.fast_mode
    vcf_fn = args.vcf_fn
    is_known_vcf_file_provided = vcf_fn is not None
    call_snp_only = args.call_snp_only

    global test_pos
    test_pos = None

    # 1-based regions [start, end] (start and end inclusive)
    ref_regions = []
    reads_regions = []
    known_variants_set = set()
    tree, bed_start, bed_end = bed_tree_from(bed_file_path=extend_bed,
                                             contig_name=ctg_name,
                                             return_bed_region=True)

    fai_fn = file_path_from(fasta_file_path, suffix=".fai", exit_on_not_found=True, sep='.')
    # No confident BED: split the whole contig (length taken from the .fai index) into chunk_num
    # chunks of ceil(contig_length / chunk_num) bases and select the chunk_id-th one.
    if not is_confident_bed_file_given and chunk_id is not None:
        contig_length = 0
        with open(fai_fn, 'r') as fai_fp:
            for row in fai_fp:
                columns = row.strip().split("\t")

                contig_name = columns[0]
                if contig_name != ctg_name:
                    continue
                contig_length = int(columns[1])
        chunk_size = contig_length // chunk_num + 1 if contig_length % chunk_num else contig_length // chunk_num
        ctg_start = chunk_size * chunk_id  # 0-base to 1-base
        ctg_end = ctg_start + chunk_size

    # Confident BED given: split the [bed_start, bed_end) span reported for the extended BED instead.
    if is_confident_bed_file_given and chunk_id is not None:
        chunk_size = (bed_end - bed_start) // chunk_num + 1 if (
            bed_end - bed_start) % chunk_num else (bed_end - bed_start) // chunk_num
        ctg_start = bed_start + 1 + chunk_size * chunk_id  # 0-base to 1-base
        ctg_end = ctg_start + chunk_size

    # Known-variants VCF given: chunk by number of variants rather than by coordinates; this
    # overrides any ctg_start / ctg_end computed above.
    if is_known_vcf_file_provided and chunk_id is not None:
        known_variants_list = vcf_candidates_from(vcf_fn=vcf_fn, contig_name=ctg_name)
        total_variants_size = len(known_variants_list)
        chunk_variants_size = total_variants_size // chunk_num if total_variants_size % chunk_num == 0 else total_variants_size // chunk_num + 1
        chunk_start_pos = chunk_id * chunk_variants_size
        known_variants_set = set(
            known_variants_list[chunk_start_pos:chunk_start_pos + chunk_variants_size])
        if len(known_variants_set) == 0:
            return
        ctg_start, ctg_end = min(known_variants_set), max(known_variants_set)

    is_ctg_name_given = ctg_name is not None
    is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None
    if is_ctg_range_given:
        # reads are fetched from a region padded by no_of_positions on both sides
        extend_start = ctg_start - no_of_positions
        extend_end = ctg_end + no_of_positions
        reads_regions.append(
            region_from(ctg_name=ctg_name, ctg_start=extend_start, ctg_end=extend_end))
        reference_start, reference_end = ctg_start - param.expandReferenceRegion, ctg_end + param.expandReferenceRegion
        reference_start = 1 if reference_start < 1 else reference_start
        ref_regions.append(
            region_from(ctg_name=ctg_name, ctg_start=reference_start, ctg_end=reference_end))
    elif is_ctg_name_given:
        reads_regions.append(region_from(ctg_name=ctg_name))
        ref_regions.append(region_from(ctg_name=ctg_name))
        reference_start = 1
    # NOTE(review): extend_start / extend_end are only bound in the range branch above but are
    # passed to bed_tree_from() unconditionally further down, and reference_start is unbound when
    # ctg_name is None - both look like latent NameErrors for those call patterns; confirm.

    reference_sequence = reference_sequence_from(
        samtools_execute_command=samtools_execute_command,
        fasta_file_path=fasta_file_path,
        regions=ref_regions)
    if reference_sequence is None or len(reference_sequence) == 0:
        sys.exit(
            log_error(
                "[ERROR] Failed to load reference sequence from file ({}).".
                format(fasta_file_path)))

    if is_confident_bed_file_given and ctg_name not in tree:
        sys.exit(
            log_error("[ERROR] ctg_name {} not exists in bed file({}).".format(
                ctg_name, confident_bed_fn)))

    # samtools mpileup options
    # reverse-del: deletion in forward/reverse strand were marked as '*'/'#'
    # gVCF mode disables the base-quality filter and adds '-a' to the mpileup command
    min_base_quality = 0 if args.gvcf else min_base_quality
    max_depth = param.max_depth_dict[
        args.platform] if args.platform else args.max_depth
    mq_option = ' --min-MQ {}'.format(min_mapping_quality)
    bq_option = ' --min-BQ {}'.format(min_base_quality)
    flags_option = ' --excl-flags {}'.format(param.SAMTOOLS_VIEW_FILTER_FLAG)
    max_depth_option = ' --max-depth {}'.format(max_depth)
    bed_option = ' -l {}'.format(
        extend_bed) if is_extend_bed_file_given else ""
    gvcf_option = ' -a' if args.gvcf else ""
    samtools_mpileup_process = subprocess_popen(
        shlex.split("{} mpileup {} -r {} --reverse-del".format(
            samtools_execute_command,
            bam_file_path,
            " ".join(reads_regions), ) + mq_option + bq_option + bed_option +
                    flags_option + max_depth_option + gvcf_option))

    # Output sink: a compressor subprocess (param.zstd -c) writing to the target file, or a
    # stdout wrapper when the path is the literal "PIPE".
    if tensor_can_output_path != "PIPE":
        tensor_can_fpo = open(tensor_can_output_path, "wb")
        tensor_can_fp = subprocess_popen(shlex.split("{} -c".format(
            param.zstd)), stdin=PIPE, stdout=tensor_can_fpo)
    else:
        tensor_can_fp = TensorStdout(sys.stdout)

    # whether save all alternative information, only for debug mode
    if alt_fn:
        alt_fp = open(alt_fn, 'w')

    pos_offset = 0
    pre_pos = -1
    # Circular buffer of per-position tensors. All slots initially alias ONE empty list; that is
    # harmless here because slots are only ever replaced by assignment, never mutated in place.
    tensor = [[]] * sliding_window_size
    candidate_position = []
    all_alt_dict = {}
    depth_dict = {}
    af_dict = {}

    # to generate gvcf, it is needed to record whole genome statistical information
    if args.gvcf:
        nonVariantCaller = variantInfoCalculator(
            gvcfWritePath=args.temp_file_dir,
            ref_path=args.ref_fn,
            bp_resolution=args.bp_resolution,
            ctgName=ctg_name,
            sample_name='.'.join(
                [args.sampleName, ctg_name,
                 str(ctg_start),
                 str(ctg_end)]),
            p_err=args.base_err,
            gq_bin_size=args.gq_bin_size)

    confident_bed_tree = bed_tree_from(bed_file_path=confident_bed_fn,
                                       contig_name=ctg_name,
                                       bed_ctg_start=extend_start,
                                       bed_ctg_end=extend_end)

    empty_pileup_flag = True
    for row in samtools_mpileup_process.stdout:
        empty_pileup_flag = False
        # columns read below: [0] contig, [1] position, [3] depth, [4] read bases
        columns = row.strip().split('\t', maxsplit=5)
        pos = int(columns[1])
        pileup_bases = columns[4]
        reference_base = reference_sequence[pos - reference_start].upper()
        # valid_reference_flag is never set to False in this function, so the
        # `not valid_reference_flag` branch below is currently unreachable.
        valid_reference_flag = True
        within_flag = True
        if args.gvcf:
            if not valid_reference_flag:
                nonVariantCaller.make_gvcf_online({}, push_current=True)
            # only sites inside the requested range are reported to the gVCF writer
            if ctg_start != None and ctg_end != None:
                within_flag = pos >= ctg_start and pos < ctg_end
            elif ctg_start != None and ctg_end == None:
                within_flag = pos >= ctg_start
            elif ctg_start == None and ctg_end != None:
                within_flag = pos <= ctg_end
            else:
                within_flag = True
            # zero-depth site: record an empty gVCF entry and skip tensor generation
            if columns[3] == '0' and within_flag and valid_reference_flag:
                cur_site_info = {
                    'chr': columns[0],
                    'pos': pos,
                    'ref': reference_base,
                    'n_total': 0,
                    'n_ref': 0
                }
                nonVariantCaller.make_gvcf_online(cur_site_info)
                continue

        # start with a new region, clear all sliding windows cache, avoid memory occupation
        if pre_pos + 1 != pos:
            pos_offset = 0
            tensor = [[]] * sliding_window_size
            candidate_position = []
        pre_pos = pos

        # a condition to skip some positions creating tensor,but return allele summary
        # allele count function
        pileup_tensor, alt_dict, af, depth, pass_af, pileup_list, max_del_length = generate_tensor(
            pos=pos,
            pileup_bases=pileup_bases,
            reference_sequence=reference_sequence,
            reference_start=reference_start,
            reference_base=reference_base,
            minimum_af_for_candidate=minimum_af_for_candidate,
            minimum_snp_af_for_candidate=minimum_snp_af_for_candidate,
            minimum_indel_af_for_candidate=minimum_indel_af_for_candidate,
            platform=platform,
            fast_mode=fast_mode,
            call_snp_only=call_snp_only)
        if args.gvcf and within_flag and valid_reference_flag:
            # total depth and reference-supporting depth from the (key, count) pairs in pileup_list
            cur_n_total = 0
            cur_n_ref = 0
            for _key, _value in pileup_list:
                if (_key == reference_base):
                    cur_n_ref = _value
                cur_n_total += _value

            cur_site_info = {
                'chr': columns[0],
                'pos': pos,
                'ref': reference_base,
                'n_total': cur_n_total,
                'n_ref': cur_n_ref
            }
            nonVariantCaller.make_gvcf_online(cur_site_info)

        pass_confident_bed = not is_confident_bed_file_given or is_region_in(
            tree=confident_bed_tree,
            contig_name=ctg_name,
            region_start=pos - 1,
            region_end=pos + max_del_length + 1)  # 0-based
        # Candidate selection: without a known VCF, require BED membership, an ACGT reference
        # base and the AF / depth thresholds; with a known VCF, take exactly the listed positions.
        if (pass_confident_bed and reference_base in 'ACGT' and
            (pass_af and depth >= min_coverage) and
                not is_known_vcf_file_provided) or (
                    is_known_vcf_file_provided and pos in known_variants_set):
            candidate_position.append(pos)
            all_alt_dict[pos] = alt_dict
            depth_dict[pos] = depth
            af_dict[pos] = af
        tensor[pos_offset] = pileup_tensor

        # save pileup tensor for each candidate position with nearby flanking_base_num bp distance
        pos_offset = (pos_offset + 1) % sliding_window_size
        if len(candidate_position
               ) and pos - candidate_position[0] == flanking_base_num:
            center = candidate_position.pop(0)
            # a candidate is emitted only when every slot of the window holds a non-empty tensor
            has_empty_tensor = sum([True for item in tensor if not len(item)])
            if not has_empty_tensor:
                depth = depth_dict[center]
                ref_seq = reference_sequence[center - (flanking_base_num) -
                                             reference_start:center +
                                             flanking_base_num + 1 -
                                             reference_start]
                # rotate the circular buffer so that the oldest position comes first
                concat_tensor = tensor[pos_offset:] + tensor[0:pos_offset]

                alt_info = str(depth) + '-' + ' '.join([
                    ' '.join([item[0], str(item[1])])
                    for item in list(all_alt_dict[center].items())
                ])
                l = "%s\t%d\t%s\t%s\t%s" % (
                    ctg_name, center, ref_seq, " ".join(
                        " ".join("%d" % x for x in innerlist)
                        for innerlist in concat_tensor), alt_info)
                tensor_can_fp.stdin.write(l)
                tensor_can_fp.stdin.write("\n")
                if alt_fn:
                    alt_info = ' '.join([
                        ' '.join([item[0], str(item[1])])
                        for item in list(all_alt_dict[center].items())
                    ])
                    alt_fp.write('\t'.join([
                        ctg_name + ' ' + str(center),
                        str(depth), alt_info,
                        str(af_dict[center])
                    ]) + '\n')
                # NOTE(review): nesting reconstructed from whitespace-collapsed source; as placed
                # here, bookkeeping for a candidate skipped due to an empty slot is never deleted.
                del all_alt_dict[center], depth_dict[center], af_dict[center]

    # flush the gVCF block that is still open after the last pileup row
    if args.gvcf and len(nonVariantCaller.current_block) != 0:
        nonVariantCaller.write_to_gvcf_batch(nonVariantCaller.current_block,
                                             nonVariantCaller.cur_min_DP,
                                             nonVariantCaller.cur_raw_gq)

    if args.gvcf and empty_pileup_flag:
        nonVariantCaller.write_empty_pileup(ctg_name, ctg_start, ctg_end)
    if args.gvcf:
        nonVariantCaller.close_vcf_writer()

    samtools_mpileup_process.stdout.close()
    samtools_mpileup_process.wait()

    if tensor_can_output_path != "PIPE":
        tensor_can_fp.stdin.close()
        tensor_can_fp.wait()
        tensor_can_fpo.close()

    if alt_fn:
        alt_fp.close()
def reads_realignment(args):
    """
    Locally realign reads around candidate positions and write the result as alignment records.

    Reads are streamed from `samtools view` through `samtools_view_generator_from`, which fills
    `aligned_reads` / `pileup` and yields (chunk_start, chunk_end) pairs until chunk_start is
    None. For each chunk, positions whose pileup "X" count reaches `args.minCoverage` are grouped
    into windows; for every window a consensus is requested from the `dbg` ctypes library and the
    reads are realigned against the resulting haplotypes with the `realigner` ctypes library.
    Reads are written tab separated either to stdout (`args.read_fn == 'PIPE'`) or into a
    `samtools view -bh - -o <read_fn>[.<start>_<end>]` subprocess.

    Side effects: resets the module-level `test_pos` to None and spawns gzip / samtools
    subprocesses. Returns None.

    NOTE(review): this function was reconstructed from whitespace-collapsed source; the nesting
    of a few trailing statements (flagged below) should be checked against the upstream file.
    """
    bed_file_path = args.full_aln_regions
    extend_bed = args.extend_bed
    fasta_file_path = args.ref_fn
    ctg_name = args.ctgName
    ctg_start = args.ctgStart
    ctg_end = args.ctgEnd
    # args.chunk_id is 1-based; a falsy value (None or 0) means "no chunking"
    chunk_id = args.chunk_id - 1 if args.chunk_id else None  # 1-base to 0-base
    chunk_num = args.chunk_num
    samtools_execute_command = args.samtools
    bam_file_path = args.bam_fn
    minMQ = args.minMQ
    min_coverage = args.minCoverage
    is_bed_file_given = bed_file_path is not None
    is_ctg_name_given = ctg_name is not None
    read_fn = args.read_fn

    global test_pos
    test_pos = None
    if is_bed_file_given:
        # The region file is decompressed through `gzip -fdc`; the overall [min start, max end]
        # of the rows on ctg_name (shifted by +1) becomes the working range.
        candidate_file_path_process = subprocess_popen(
            shlex.split("gzip -fdc %s" % (bed_file_path)))
        candidate_file_path_output = candidate_file_path_process.stdout

        ctg_start, ctg_end = float('inf'), 0
        for row in candidate_file_path_output:
            row = row.rstrip().split('\t')
            if row[0] != ctg_name:
                continue
            position = int(row[1]) + 1
            end = int(row[2]) + 1
            ctg_start = min(position, ctg_start)
            ctg_end = max(end, ctg_end)

        candidate_file_path_output.close()
        candidate_file_path_process.wait()

    # A chunk id overrides any range derived above: the contig (length from the .fai index) is
    # split into chunk_num chunks of ceil(contig_length / chunk_num) bases.
    if chunk_id is not None:
        fai_fn = file_path_from(fasta_file_path, suffix=".fai", exit_on_not_found=True, sep='.')
        contig_length = 0
        with open(fai_fn, 'r') as fai_fp:
            for row in fai_fp:
                columns = row.strip().split("\t")

                contig_name = columns[0]
                if contig_name != ctg_name:
                    continue
                contig_length = int(columns[1])
        chunk_size = contig_length // chunk_num + 1 if contig_length % chunk_num else contig_length // chunk_num
        ctg_start = chunk_size * chunk_id  # 0-base to 1-base
        ctg_end = ctg_start + chunk_size

    is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None

    # 1-based regions [start, end] (start and end inclusive)
    ref_regions = []
    reads_regions = []
    reference_start, reference_end = None, None

    if is_ctg_range_given:
        # reads are fetched from a region padded by max_window_size on both sides
        extend_start = ctg_start - max_window_size
        extend_end = ctg_end + max_window_size
        reads_regions.append(
            region_from(ctg_name=ctg_name, ctg_start=extend_start, ctg_end=extend_end))
        reference_start, reference_end = ctg_start - param.expandReferenceRegion, ctg_end + param.expandReferenceRegion
        reference_start = 1 if reference_start < 1 else reference_start
        ref_regions.append(
            region_from(ctg_name=ctg_name, ctg_start=reference_start, ctg_end=reference_end))
    elif is_ctg_name_given:
        reads_regions.append(region_from(ctg_name=ctg_name))
        ref_regions.append(region_from(ctg_name=ctg_name))
        reference_start = 1

    reference_sequence = reference_sequence_from(
        samtools_execute_command=samtools_execute_command,
        fasta_file_path=fasta_file_path,
        regions=ref_regions)
    if reference_sequence is None or len(reference_sequence) == 0:
        sys.exit(
            "[ERROR] Failed to load reference sequence from file ({}).".format(
                fasta_file_path))

    tree = bed_tree_from(bed_file_path=bed_file_path)
    if is_bed_file_given and ctg_name not in tree:
        sys.exit("[ERROR] ctg_name({}) not exists in bed file({}).".format(
            ctg_name, bed_file_path))

    # the full-alignment region file, when given, takes precedence over extend_bed for -L
    bed_option = ' -L {}'.format(extend_bed) if extend_bed else ""
    bed_option = ' -L {}'.format(
        bed_file_path) if is_bed_file_given else bed_option
    mq_option = ' -q {}'.format(minMQ) if minMQ > 0 else ""
    samtools_view_command = "{} view -h {} {}".format(
        samtools_execute_command, bam_file_path,
        " ".join(reads_regions)) + mq_option + bed_option
    samtools_view_process = subprocess_popen(
        shlex.split(samtools_view_command))

    # NOTE(review): save_file_fp stays unbound when read_fn is falsy, yet the header write in
    # the loop below uses it unconditionally - callers presumably always pass read_fn; confirm.
    if read_fn and read_fn == 'PIPE':
        save_file_fp = TensorStdout(sys.stdout)
    elif read_fn:
        save_file_fp = subprocess_popen(shlex.split(
            "{} view -bh - -o {}".format(
                samtools_execute_command, read_fn +
                ('.{}_{}'.format(ctg_start, ctg_end)
                 if is_ctg_range_given and not test_pos else ""))),
                                        stdin=PIPE,
                                        stdout=PIPE)

    reference_start_0_based = 0 if reference_start is None else (
        reference_start - 1)

    header = []
    add_header = False
    aligned_reads = defaultdict()
    # per-position counter dict; only the "X" count is read in this function
    pileup = defaultdict(lambda: {"X": 0})
    samtools_view_generator = samtools_view_generator_from(
        samtools_view_process=samtools_view_process,
        aligned_reads=aligned_reads,
        pileup=pileup,
        ctg_name=ctg_name,
        reference_sequence=reference_sequence,
        reference_start_0_based=reference_start_0_based,
        header=header)
    # never populated in this function, so the merge step inside the loop is a no-op as written
    pre_aligned_reads = defaultdict()

    while True:
        chunk_start, chunk_end = next(samtools_view_generator)
        # the generator signals exhaustion with chunk_start None rather than StopIteration
        if chunk_start is None:
            break
        if not add_header:
            save_file_fp.stdin.write(''.join(header))
            add_header = True

        variant_allele_list = [[position, pileup[position]["X"]]
                               for position in list(pileup.keys())]
        # keep positions with enough "X" support that fall inside the expanded chunk
        candidate_position_list = [
            (position, support_allele_count)
            for position, support_allele_count in variant_allele_list
            if support_allele_count >= min_coverage and
            position >= chunk_start - region_expansion_in_bp - 1 and
            position <= chunk_end + region_expansion_in_bp - 1
        ]
        candidate_position_list.sort(key=(lambda x: x[0]))

        if not len(aligned_reads) or not len(candidate_position_list):
            continue
        if len(pre_aligned_reads):  # update the read in previous chunk
            for read_name, read in pre_aligned_reads.items():
                aligned_reads[read_name] = read

        # cut the chunk into overlapping sub-regions and bucket the candidates by sub-region
        region_dict = {}
        split_region_size = max_window_size
        region_tree = IntervalTree()
        for split_idx in range((chunk_end - chunk_start) // split_region_size):
            split_start = chunk_start + split_idx * split_region_size - region_expansion_in_bp - 1
            split_end = split_start + split_region_size + region_expansion_in_bp * 2 + 1
            region_dict[(split_start, split_end)] = []
            region_tree.addi(split_start, split_end)
        for candidate_position in candidate_position_list:
            for region in region_tree.at(candidate_position[0]):
                region_dict[(region.begin, region.end)].append(candidate_position[0])

        for key, split_candidate_position_list in region_dict.items():
            start_pos, end_pos = None, None
            windows = []
            read_windows_dict = {}
            # merge candidates closer than 2 * min_windows_distance into one window, padding
            # each window by min_windows_distance on both sides
            for pos in split_candidate_position_list:
                if start_pos is None:
                    start_pos = pos
                    end_pos = pos

                elif pos > end_pos + 2 * min_windows_distance:
                    temp_window = (start_pos - min_windows_distance,
                                   end_pos + min_windows_distance)
                    windows.append(temp_window)
                    read_windows_dict[temp_window] = []
                    start_pos = pos
                    end_pos = pos
                else:
                    end_pos = pos

            if start_pos is not None:
                temp_window = (start_pos - min_windows_distance,
                               end_pos + min_windows_distance)
                windows.append(temp_window)
                read_windows_dict[temp_window] = []
            if not len(windows):
                continue
            windows = sorted(windows, key=lambda x: x[0])
            max_window_end = max([item[1] for item in windows])
            # #find read windows overlap_pair
            # each read is attached to the single window returned by find_max_overlap_index
            for read_name, read in aligned_reads.items():
                if read.read_start > max_window_end:
                    continue
                argmax_window_idx = find_max_overlap_index(
                    (read.read_start, read.read_end), windows)
                if argmax_window_idx is not None:
                    read_windows_dict[windows[argmax_window_idx]].append(
                        read_name)

            # realignment
            for window in windows:
                start_pos, end_pos = window
                if end_pos - start_pos > max_window_size:  # or (window not in need_align_windows_set):
                    continue

                ref_start = start_pos - reference_start_0_based
                ref_end = end_pos - reference_start_0_based
                ref = reference_sequence[ref_start:ref_end]
                reads = []
                low_base_quality_pos_list = []
                # pypy binding with ctypes for DBG building
                for read_name in read_windows_dict[window]:
                    read = aligned_reads[read_name]
                    if (
                            not read.graph_mq
                    ) or read.read_start > end_pos or read.read_end < start_pos:
                        continue
                    reads.append(read.seq)
                    # indices of bases whose quality value is below 15, space separated per read
                    low_base_quality_pos_list.append(' '.join([
                        str(bq_idx)
                        for bq_idx, item in enumerate(read.base_quality)
                        if int(item) < 15
                    ]))
                totoal_read_num = len(reads)
                c_ref = byte(ref)
                read_list1 = ctypes.c_char_p(byte(','.join(reads)))
                low_base_quality_pos_array = ctypes.c_char_p(
                    byte(','.join(low_base_quality_pos_list)))

                dbg.get_consensus.restype = ctypes.POINTER(DBGPointer)
                dbg.get_consensus.argtypes = [
                    ctypes.c_char_p, ctypes.c_char_p, ctypes.c_char_p,
                    ctypes.c_int
                ]

                dbg_p = dbg.get_consensus(ctypes.c_char_p(c_ref), read_list1,
                                          low_base_quality_pos_array,
                                          totoal_read_num)

                c_consensus, consensus_size = dbg_p.contents.consensus, dbg_p.contents.consensus_size
                consensus = [
                    item.decode() for item in c_consensus[:consensus_size]
                ]

                # nothing to realign against: no consensus, only the reference itself, or no reads
                if len(consensus) == 0 or len(
                        consensus) == 1 and consensus[0] == ref or len(
                            read_windows_dict[window]) == 0:
                    continue
                min_read_start = min([
                    aligned_reads[item].read_start
                    for item in read_windows_dict[window]
                ])
                max_read_end = max([
                    aligned_reads[item].read_end
                    for item in read_windows_dict[window]
                ])
                tmp_ref_start = max(
                    0,
                    min(min_read_start, start_pos) - expand_align_ref_region)
                tmp_ref_end = max(max_read_end,
                                  end_pos) + expand_align_ref_region

                ref_prefix = get_reference_seq(reference_sequence,
                                               tmp_ref_start, start_pos,
                                               reference_start_0_based)
                ref_center = get_reference_seq(reference_sequence, start_pos,
                                               end_pos,
                                               reference_start_0_based)
                if tmp_ref_end < end_pos:
                    continue
                ref_suffix = get_reference_seq(reference_sequence, end_pos,
                                               tmp_ref_end,
                                               reference_start_0_based)
                ref_seq = ref_prefix + ref_center + ref_suffix

                # pypy binding with ctypes for realignment
                read_name_list = []
                # at most max_region_reads_num reads of the window are passed to the realigner
                totoal_read_num = min(max_region_reads_num,
                                      len(read_windows_dict[window]))
                seq_list = (ctypes.c_char_p * totoal_read_num)()
                position_list = (ctypes.c_int * totoal_read_num)()
                cigars_list = (ctypes.c_char_p * totoal_read_num)()

                for read_idx, read_name in enumerate(
                        read_windows_dict[window]):
                    read = aligned_reads[read_name]
                    if read_idx >= totoal_read_num:
                        break
                    seq_list[read_idx] = byte(read.seq.upper())
                    position_list[read_idx] = read.read_start
                    cigars_list[read_idx] = byte(read.cigar)
                    read_name_list.append(read_name)
                # candidate haplotypes: each consensus embedded between the reference flanks
                haplotypes_list = [
                    ref_prefix + cons + ref_suffix for cons in consensus
                ]
                haplotypes = ' '.join(haplotypes_list)

                realigner.realign_reads.restype = ctypes.POINTER(StructPointer)
                realigner.realign_reads.argtypes = [
                    ctypes.c_char_p * totoal_read_num,
                    ctypes.c_int * totoal_read_num,
                    ctypes.c_char_p * totoal_read_num, ctypes.c_char_p,
                    ctypes.c_char_p, ctypes.c_int, ctypes.c_int, ctypes.c_int,
                    ctypes.c_int
                ]

                realigner_p = realigner.realign_reads(
                    seq_list, position_list, cigars_list,
                    ctypes.c_char_p(byte(ref_seq)),
                    ctypes.c_char_p(byte(haplotypes)), tmp_ref_start,
                    len(ref_prefix), len(ref_suffix), totoal_read_num)

                realign_positions, realign_cigars = realigner_p.contents.position, realigner_p.contents.cigar_string
                read_position_list = realign_positions[:totoal_read_num]
                read_cigar_list = [
                    item.decode() for item in realign_cigars[:totoal_read_num]
                ]

                if len(read_name_list):
                    for read_id, read_name in enumerate(read_name_list):
                        # skip reads with no realignment result or an unchanged alignment
                        if read_cigar_list[read_id] == "" or (
                                aligned_reads[read_name].cigar ==
                                read_cigar_list[read_id] and
                                aligned_reads[read_name].read_start ==
                                read_position_list[read_id]):
                            continue
                        # update cigar and read start position
                        aligned_reads[read_name].test_pos = test_pos
                        realignment_start = read_position_list[read_id]
                        realignment_cigar = read_cigar_list[read_id].replace(
                            'X', 'M')
                        if realignment_cigar == aligned_reads[
                                read_name].cigar and realignment_start == aligned_reads[
                                    read_name].read_start:
                            continue
                        # NOTE(review): split_start is the value left over from the LAST
                        # iteration of the split loop above, not this region's own start
                        # (key[0]) - confirm this is intended.
                        aligned_reads[read_name].set_realignment_info(
                            split_start, read_cigar_list[read_id],
                            read_position_list[read_id])

                # release the result structure returned by realign_reads
                realigner.free_memory.restype = ctypes.POINTER(ctypes.c_void_p)
                realigner.free_memory.argtypes = [
                    ctypes.POINTER(StructPointer), ctypes.c_int
                ]
                realigner.free_memory(realigner_p, totoal_read_num)
            # # realignment end

        if read_fn:
            # write, in best_pos order, the reads far enough behind the current chunk that they
            # can no longer be touched by realignment, and drop them from memory
            sorted_key = sorted([(key, item.best_pos)
                                 for key, item in aligned_reads.items()],
                                key=lambda x: x[1])
            for read_name, read_start in sorted_key:
                read = aligned_reads[read_name]
                if read_start < chunk_start - region_expansion_in_bp - max_window_size:  # safe distance for save reads
                    phasing_info = 'HP:i:{}'.format(
                        read.phasing) if read.phasing else ""
                    pass
                    read_str = '\t'.join([
                        read_name, read.flag, ctg_name,
                        str(read_start + 1),
                        str(read.mapping_quality), read.best_cigar, read.RNEXT,
                        read.PNEXT, read.TLEN, read.seq, read.raw_base_quality,
                        phasing_info
                    ])
                    save_file_fp.stdin.write(read_str + '\n')
                    del aligned_reads[read_name]
            # NOTE(review): nesting of this pruning loop under `if read_fn:` is reconstructed
            # from collapsed source; verify against upstream.
            for pile_pos in list(pileup.keys()):
                if pile_pos < chunk_start - region_expansion_in_bp - max_window_size:
                    del pileup[pile_pos]

    # flush every read still held in memory after the last chunk
    if read_fn and aligned_reads:
        sorted_key = sorted([(key, item.best_pos)
                             for key, item in aligned_reads.items()],
                            key=lambda x: x[1])
        for read_name, read_start in sorted_key:
            read = aligned_reads[read_name]
            phasing_info = 'HP:i:{}'.format(
                read.phasing) if read.phasing else ""
            read_str = '\t'.join([
                read_name, read.flag, ctg_name,
                str(read_start + 1),
                str(read.mapping_quality), read.best_cigar, read.RNEXT,
                read.PNEXT, read.TLEN, read.seq, read.raw_base_quality,
                phasing_info
            ])
            save_file_fp.stdin.write(read_str + '\n')
            del aligned_reads[read_name]
        # NOTE(review): nesting of this close/wait under the `if` above is reconstructed from
        # collapsed source; as placed, the writer is not closed when no reads remain - verify.
        if read_fn != 'PIPE':
            save_file_fp.stdin.close()
            save_file_fp.wait()
    samtools_view_process.stdout.close()
    samtools_view_process.wait()

    # test_pos is set to None at the top of this function and not reassigned here, so this
    # indexing step only runs if something else sets the global in the meantime.
    if test_pos:
        save_file_fp = subprocess_popen(shlex.split("samtools index {}".format(
            read_fn + ('.{}_{}'.format(ctg_start, ctg_end)
                       if is_ctg_range_given and not test_pos else ""))),
                                        stdin=PIPE,
                                        stdout=PIPE)
        save_file_fp.stdin.close()
        save_file_fp.wait()
def Run(args):
    """Run variant calling for one contig as a chain of three subprocesses.

    The first stage is ``GetTruth`` when ``args.vcf_fn`` resolves to a file,
    otherwise ``ExtractVariantCandidates``. Its stdout is fed to
    ``CreateTensor``, whose stdout is fed to ``call_var``. The stdout of
    ``call_var`` is redirected to this process's stderr.

    Args:
        args: parsed command-line namespace. The attributes read here are the
            tool paths (``pypy``, ``samtools``), the input files (``bam_fn``,
            ``ref_fn``, ``vcf_fn``, ``bed_fn``, ``chkpnt_fn``), the region
            (``ctgName``, ``ctgStart``, ``ctgEnd``) and the calling flags that
            are forwarded to the child commands.

    Raises:
        SystemExit: when ``--ctgName`` is missing or a child process cannot
            be started.
        KeyboardInterrupt, Exception: re-raised after attempting to terminate
            all child processes.
    """
    basedir = dirname(__file__)
    EVCBin = basedir + "/../clair.py ExtractVariantCandidates"
    GTBin = basedir + "/../clair.py GetTruth"
    CTBin = basedir + "/../clair.py CreateTensor"
    CVBin = basedir + "/../clair.py call_var"

    pypyBin = executable_command_string_from(args.pypy, exit_on_not_found=True)
    samtoolsBin = executable_command_string_from(args.samtools, exit_on_not_found=True)

    chkpnt_fn = file_path_from(args.chkpnt_fn, suffix=".meta", exit_on_not_found=True)
    bam_fn = file_path_from(args.bam_fn, exit_on_not_found=True)
    ref_fn = file_path_from(args.ref_fn, exit_on_not_found=True)
    vcf_fn = file_path_from(args.vcf_fn)
    bed_fn = file_path_from(args.bed_fn)

    dcov = args.dcov
    call_fn = args.call_fn
    af_threshold = args.threshold
    minCoverage = int(args.minCoverage)
    sampleName = args.sampleName
    ctgName = args.ctgName
    if ctgName is None:
        sys.exit(
            "--ctgName must be specified. You can call variants on multiple chromosomes simultaneously."
        )

    stop_consider_left_edge = command_option_from(args.stop_consider_left_edge, 'stop_consider_left_edge')
    log_path = command_option_from(args.log_path, 'log_path', option_value=args.log_path)
    pysam_for_all_indel_bases = command_option_from(
        args.pysam_for_all_indel_bases, 'pysam_for_all_indel_bases')
    haploid_precision_mode = command_option_from(args.haploid_precision, 'haploid_precision')
    haploid_sensitive_mode = command_option_from(args.haploid_sensitive, 'haploid_sensitive')
    output_for_ensemble = command_option_from(args.output_for_ensemble, 'output_for_ensemble')
    pipe_line = command_option_from(args.pipe_line, 'pipe_line')
    store_loaded_mini_match = command_option_from(args.store_loaded_mini_match, 'store_loaded_mini_match')
    only_prediction = command_option_from(args.only_prediction, 'only_prediction')
    debug = command_option_from(args.debug, 'debug')
    qual = command_option_from(args.qual, 'qual', option_value=args.qual)
    fast_plotting = command_option_from(args.fast_plotting, 'fast_plotting')

    # The region is only forwarded when both ends are given and ordered.
    ctgStart = None
    ctgEnd = None
    if args.ctgStart is not None and args.ctgEnd is not None and int(args.ctgStart) <= int(args.ctgEnd):
        ctgStart = CommandOption('ctgStart', args.ctgStart)
        ctgEnd = CommandOption('ctgEnd', args.ctgEnd)

    # Never request more CPUs than the machine reports.
    maxCpus = multiprocessing.cpu_count()
    numCpus = maxCpus if args.threads is None else min(args.threads, maxCpus)

    # Pin the caller to a random subset of CPUs, but only if `taskset` exists.
    _cpuSet = ",".join(str(x) for x in random.sample(range(0, maxCpus), numCpus))
    taskSet = "taskset -c %s" % (_cpuSet)
    try:
        subprocess.check_output("which %s" % ("taskset"), shell=True)
    except Exception:
        # Not a bare `except:` so Ctrl-C / SystemExit are not swallowed here.
        taskSet = ""

    if args.delay > 0:
        delay = random.randrange(0, args.delay)
        print("Delay %d seconds before starting variant calling ..." % (delay), file=sys.stderr)
        sleep(delay)

    extract_variant_candidate_command_options = [
        pypyBin,
        EVCBin,
        CommandOption('bam_fn', bam_fn),
        CommandOption('ref_fn', ref_fn),
        CommandOption('bed_fn', bed_fn),
        CommandOption('ctgName', ctgName),
        ctgStart,
        ctgEnd,
        CommandOption('threshold', af_threshold),
        CommandOption('minCoverage', minCoverage),
        CommandOption('samtools', samtoolsBin),
    ]
    get_truth_command_options = [
        pypyBin,
        GTBin,
        CommandOption('vcf_fn', vcf_fn),
        CommandOption('ref_fn', ref_fn),
        CommandOption('ctgName', ctgName),
        ctgStart,
        ctgEnd,
    ]
    create_tensor_command_options = [
        pypyBin,
        CTBin,
        CommandOption('bam_fn', bam_fn),
        CommandOption('ref_fn', ref_fn),
        CommandOption('ctgName', ctgName),
        ctgStart,
        ctgEnd,
        stop_consider_left_edge,
        CommandOption('samtools', samtoolsBin),
        CommandOption('dcov', dcov),
    ]
    call_variant_command_options = [
        taskSet,
        ExecuteCommand('python', CVBin),
        CommandOption('chkpnt_fn', chkpnt_fn),
        CommandOption('call_fn', call_fn),
        CommandOption('bam_fn', bam_fn),
        CommandOption('sampleName', sampleName),
        CommandOption('time_counter_file_name', args.time_counter_file_name),
        CommandOption('threads', numCpus),
        CommandOption('ref_fn', ref_fn),
        pysam_for_all_indel_bases,
        haploid_precision_mode,
        haploid_sensitive_mode,
        output_for_ensemble,
        pipe_line,
        store_loaded_mini_match,
        only_prediction,
        qual,
        debug,
    ]
    call_variant_with_activation_command_options = [
        CommandOptionWithNoValue('activation_only'),
        log_path,
        CommandOption('max_plot', args.max_plot),
        CommandOption('parallel_level', args.parallel_level),
        CommandOption('workers', args.workers),
        fast_plotting,
    ] if args.activation_only else []

    # A known VCF switches the first stage from candidate extraction to GetTruth.
    is_true_variant_call = vcf_fn is not None
    try:
        c.extract_variant_candidate = subprocess_popen(
            shlex.split(
                command_string_from(
                    get_truth_command_options if is_true_variant_call
                    else extract_variant_candidate_command_options)))
        c.create_tensor = subprocess_popen(
            shlex.split(command_string_from(create_tensor_command_options)),
            stdin=c.extract_variant_candidate.stdout)
        c.call_variant = subprocess_popen(
            shlex.split(
                command_string_from(
                    call_variant_command_options + call_variant_with_activation_command_options)),
            stdin=c.create_tensor.stdout,
            stdout=sys.stderr)
    except Exception as e:
        print(e, file=sys.stderr)
        sys.exit("Failed to start required processes. Exiting...")

    # Periodic health check of the children while we block in wait().
    signal.signal(signal.SIGALRM, check_return_code)
    signal.alarm(2)

    try:
        c.call_variant.wait()
        # Cancel the pending alarm once the last stage is done, as the other
        # pipeline drivers in this code base do.
        signal.alarm(0)
        c.create_tensor.stdout.close()
        c.create_tensor.wait()
        c.extract_variant_candidate.stdout.close()
        c.extract_variant_candidate.wait()
    except KeyboardInterrupt:
        print(
            "KeyboardInterrupt received when waiting at CallVarBam, terminating all scripts."
        )
        try:
            c.call_variant.terminate()
            c.create_tensor.terminate()
            c.extract_variant_candidate.terminate()
        except Exception as terminate_error:
            print(terminate_error)
        raise
    except Exception as e:
        print(
            "Exception received when waiting at CallVarBam, terminating all scripts."
        )
        print(e)
        try:
            c.call_variant.terminate()
            c.create_tensor.terminate()
            c.extract_variant_candidate.terminate()
        except Exception as terminate_error:
            # A distinct name keeps the outer exception bound; reusing `e`
            # here would unbind it and break the re-raise below.
            print(terminate_error)
        raise
def CheckEnvs(args):
    """Validate inputs and tools, then write the contig and chunk work lists.

    Steps performed, in order:
      * resolve the BAM, reference, ``.fai`` and BAM index paths, exiting when
        a mandatory one is missing;
      * create the output folder tree under ``args.output_fn_prefix``;
      * range-check the numeric options and check external tool versions;
      * work out which contigs to call from ``--ctg_name``, ``--bed_fn``,
        ``--vcf_fn`` and the reference index;
      * write ``tmp/CONTIGS`` (one contig per line) and ``tmp/CHUNK_LIST``
        (``contig chunk_id chunk_num`` per line).

    When no contig is left to call, a header-only ``merge_output.vcf`` is
    written and compressed, ``CONTIGS`` is emptied and the function returns.

    Args:
        args: parsed command-line namespace.

    Raises:
        SystemExit: when no BAM index is found or when both ``--vcf_fn`` and
            ``--bed_fn`` are provided (helpers called with
            ``exit_on_not_found`` / ``exit_out_of_range`` may exit as well).
    """
    bam_fn = file_path_from(args.bam_fn, exit_on_not_found=True)
    ref_fn = file_path_from(args.ref_fn, exit_on_not_found=True)
    fai_fn = file_path_from(args.ref_fn, suffix=".fai", exit_on_not_found=True, sep='.')
    bai_fn = file_path_from(args.bam_fn, suffix=".bai", sep='.')
    csi_fn = file_path_from(args.bam_fn, suffix=".csi", sep='.')
    if bai_fn is None and csi_fn is None:
        # Report the BAM path here; the previous code referenced an undefined
        # name and died with NameError instead of printing this message.
        sys.exit(log_error("[ERROR] Neither Bam index file {} or {} not found".format(
            bam_fn + '.bai', bam_fn + '.csi')))

    bed_fn = file_path_from(args.bed_fn)
    vcf_fn = file_path_from(args.vcf_fn)
    tree = bed_tree_from(bed_file_path=bed_fn)

    # create temp file folder
    # Several folders are created only for their side effect; later pipeline
    # stages presumably expect them to exist.
    output_fn_prefix = folder_path_from(args.output_fn_prefix, create_not_found=True)
    folder_path_from(os.path.join(output_fn_prefix, 'log'), create_not_found=True)
    tmp_file_path = folder_path_from(os.path.join(output_fn_prefix, 'tmp'), create_not_found=True)
    split_bed_path = folder_path_from(
        os.path.join(tmp_file_path, 'split_beds'), create_not_found=True) if bed_fn or vcf_fn else None
    folder_path_from(os.path.join(tmp_file_path, 'pileup_output'), create_not_found=True)
    folder_path_from(os.path.join(tmp_file_path, 'merge_output'), create_not_found=True)
    phase_output_path = folder_path_from(os.path.join(tmp_file_path, 'phase_output'), create_not_found=True)
    folder_path_from(os.path.join(tmp_file_path, 'gvcf_tmp_output'), create_not_found=True)
    full_alignment_output_path = folder_path_from(
        os.path.join(tmp_file_path, 'full_alignment_output'), create_not_found=True)
    folder_path_from(os.path.join(phase_output_path, 'phase_vcf'), create_not_found=True)
    folder_path_from(os.path.join(phase_output_path, 'phase_bam'), create_not_found=True)
    folder_path_from(os.path.join(full_alignment_output_path, 'candidate_bed'), create_not_found=True)

    # environment parameters
    pypy = args.pypy
    samtools = args.samtools
    whatshap = args.whatshap
    parallel = args.parallel
    qual = args.qual
    var_pct_full = args.var_pct_full
    ref_pct_full = args.ref_pct_full
    snp_min_af = args.snp_min_af
    indel_min_af = args.indel_min_af
    min_contig_size = args.min_contig_size
    sample_name = args.sampleName
    contig_name_list = os.path.join(tmp_file_path, 'CONTIGS')
    chunk_list = os.path.join(tmp_file_path, 'CHUNK_LIST')

    legal_range_from(param_name="qual", x=qual, min_num=0, exit_out_of_range=True)
    legal_range_from(param_name="var_pct_full", x=var_pct_full, min_num=0, max_num=1, exit_out_of_range=True)
    legal_range_from(param_name="ref_pct_full", x=ref_pct_full, min_num=0, max_num=1, exit_out_of_range=True)
    legal_range_from(param_name="snp_min_af", x=snp_min_af, min_num=0, max_num=1, exit_out_of_range=True)
    legal_range_from(param_name="indel_min_af", x=indel_min_af, min_num=0, max_num=1, exit_out_of_range=True)
    if ref_pct_full > 0.3:
        print(log_warning(
            "[WARNING] For efficiency, we use a maximum 30% reference candidates for full-alignment calling"))

    tool_version = {
        'python': LooseVersion(sys.version.split()[0]),
        'pypy': check_version(tool=pypy, pos=0, is_pypy=True),
        'samtools': check_version(tool=samtools, pos=1),
        'whatshap': check_version(tool=whatshap, pos=1),
        'parallel': check_version(tool=parallel, pos=2),
    }
    check_tools_version(tool_version, required_tool_version)

    is_include_all_contigs = args.include_all_ctgs
    is_bed_file_provided = bed_fn is not None
    is_known_vcf_file_provided = vcf_fn is not None

    if is_known_vcf_file_provided and is_bed_file_provided:
        sys.exit(log_error("[ERROR] Please provide either --vcf_fn or --bed_fn only"))

    if is_known_vcf_file_provided:
        know_vcf_contig_set = split_extend_vcf(vcf_fn=vcf_fn, output_fn=split_bed_path)

    ctg_name_list = args.ctg_name
    is_ctg_name_list_provided = ctg_name_list is not None and ctg_name_list != "EMPTY"
    contig_set = set(ctg_name_list.split(',')) if is_ctg_name_list_provided else set()

    if is_ctg_name_list_provided and is_bed_file_provided:
        print(log_warning(
            "[WARNING] both --ctg_name and --bed_fn provided, will only proceed contigs in intersection"))

    if is_ctg_name_list_provided and is_known_vcf_file_provided:
        print(log_warning(
            "[WARNING] both --ctg_name and --vcf_fn provided, will only proceed contigs in intersection"))

    # An explicit contig list restricts the BED/VCF contigs (intersection);
    # without one, the BED/VCF contigs define the set (union with empty set).
    if is_ctg_name_list_provided:
        if is_bed_file_provided:
            contig_set = contig_set.intersection(set(tree.keys()))
        if is_known_vcf_file_provided:
            contig_set = contig_set.intersection(know_vcf_contig_set)
    else:
        if is_bed_file_provided:
            contig_set = contig_set.union(set(tree.keys()))
        if is_known_vcf_file_provided:
            contig_set = contig_set.union(know_vcf_contig_set)

    # if each split region is too small(long) for given default chunk num, will increase(decrease) the total chunk num
    default_chunk_num = args.chunk_num
    DEFAULT_CHUNK_SIZE = args.chunk_size
    contig_length_list = []
    contig_chunk_num = {}
    is_any_filter_provided = (
        is_bed_file_provided or is_ctg_name_list_provided or is_known_vcf_file_provided)

    with open(fai_fn, 'r') as fai_fp:
        for row in fai_fp:
            columns = row.strip().split("\t")
            contig_name, contig_length = columns[0], int(columns[1])
            # With no filter at all, only the major contigs are called.
            if not is_include_all_contigs and not is_any_filter_provided and str(
                    contig_name) not in major_contigs:
                continue

            if is_bed_file_provided and contig_name not in tree:
                continue
            if is_ctg_name_list_provided and contig_name not in contig_set:
                continue
            if is_known_vcf_file_provided and contig_name not in contig_set:
                continue

            if min_contig_size > 0 and contig_length < min_contig_size:
                print(log_warning(
                    "[WARNING] {} contig length {} is smaller than minimum contig size {}, will skip it!".format(
                        contig_name, contig_length, min_contig_size)))
                if contig_name in contig_set:
                    contig_set.remove(contig_name)
                continue

            contig_set.add(contig_name)
            contig_length_list.append(contig_length)
            # Number of DEFAULT_CHUNK_SIZE pieces, rounded up, at least one.
            chunk_num = int(contig_length / float(DEFAULT_CHUNK_SIZE))
            if contig_length % DEFAULT_CHUNK_SIZE:
                chunk_num += 1
            contig_chunk_num[contig_name] = max(chunk_num, 1)

    # Guard against an empty list: min()/max() would raise before the
    # "no contig found" fallback below had a chance to run.
    has_contig_length = bool(contig_length_list)
    min_chunk_length = max_chunk_length = None
    if default_chunk_num > 0 and has_contig_length:
        min_chunk_length = min(contig_length_list) / float(default_chunk_num)
        max_chunk_length = max(contig_length_list) / float(default_chunk_num)

    # Major contigs first in their canonical order, the rest after. The rank
    # of a name is its first position, exactly what list.index() returned.
    contig_rank = {}
    for rank, name in enumerate(major_contigs_order + list(contig_set)):
        contig_rank.setdefault(name, rank)
    sorted_contig_list = sorted(contig_set, key=contig_rank.__getitem__)

    found_contig = True
    if not len(contig_set):
        if is_bed_file_provided:
            all_contig_in_bed = ' '.join(list(tree.keys()))
            print(log_warning(
                "[WARNING] No contig intersection found by --bed_fn, contigs in BED {}: {}".format(
                    bed_fn, all_contig_in_bed)))
        if is_known_vcf_file_provided:
            all_contig_in_vcf = ' '.join(list(know_vcf_contig_set))
            print(log_warning(
                "[WARNING] No contig intersection found by --vcf_fn, contigs in VCF {}: {}".format(
                    vcf_fn, all_contig_in_vcf)))
        if is_ctg_name_list_provided:
            all_contig_in_ctg_name = ' '.join(ctg_name_list.split(','))
            print(log_warning(
                "[WARNING] No contig intersection found by --ctg_name, contigs in contigs list: {}".format(
                    all_contig_in_ctg_name)))
        found_contig = False
    else:
        for contig in sorted_contig_list:
            if contig not in contig_chunk_num:
                print(log_warning(
                    ("[WARNING] Contig {} given but not found in reference fai file".format(contig))))

        # check contig in bam have support reads
        sorted_contig_list, found_contig = check_contig_in_bam(
            bam_fn=bam_fn, sorted_contig_list=sorted_contig_list, samtools=samtools)

    if not found_contig:
        # output header only to merge_output.vcf.gz
        output_fn = os.path.join(output_fn_prefix, "merge_output.vcf")
        output_header(output_fn=output_fn, reference_file_path=ref_fn, sample_name=sample_name)
        compress_index_vcf(output_fn)
        print(log_warning(
            ("[WARNING] No contig intersection found, output header only in {}").format(output_fn + ".gz")))
        with open(contig_name_list, 'w') as output_file:
            output_file.write("")
        return

    print('[INFO] Call variant in contigs: {}'.format(' '.join(sorted_contig_list)))
    print('[INFO] Chunk number for each contig: {}'.format(
        ' '.join([str(contig_chunk_num[contig]) for contig in sorted_contig_list])))

    if max_chunk_length is not None and max_chunk_length > MAX_CHUNK_LENGTH:
        # Report the maximum here (the minimum was printed by mistake before).
        print(log_warning(
            '[WARNING] Current maximum chunk size {} is larger than default maximum chunk size {}, You may set a larger chunk_num by setting --chunk_num=$ for better parallelism.'.format(
                max_chunk_length, MAX_CHUNK_LENGTH)))

    elif min_chunk_length is not None and min_chunk_length < MIN_CHUNK_LENGTH:
        print(log_warning(
            '[WARNING] Current minimum chunk size {} is smaller than default minimum chunk size {}, You may set a smaller chunk_num by setting --chunk_num=$.'.format(
                min_chunk_length, MIN_CHUNK_LENGTH)))

    if default_chunk_num == 0 and has_contig_length and max(contig_length_list) < DEFAULT_CHUNK_SIZE / 5:
        print(log_warning(
            '[WARNING] Current maximum contig length {} is much smaller than default chunk size {}, You may set a smaller chunk size by setting --chunk_size=$ for better parallelism.'.format(
                max(contig_length_list), DEFAULT_CHUNK_SIZE)))

    if is_bed_file_provided:
        split_extend_bed(bed_fn=bed_fn, output_fn=split_bed_path, contig_set=contig_set)

    with open(contig_name_list, 'w') as output_file:
        output_file.write('\n'.join(sorted_contig_list))

    # One "contig chunk_id chunk_num" line per chunk, chunk ids are 1-based.
    with open(chunk_list, 'w') as output_file:
        for contig_name in sorted_contig_list:
            chunk_num = contig_chunk_num[contig_name]
            for chunk_id in range(1, chunk_num + 1):
                output_file.write(contig_name + ' ' + str(chunk_id) + ' ' + str(chunk_num) + '\n')
def Run(args):
    """Create training tensors and compress them into a binary file.

    Starts a tensor-creation subprocess (``CreateTensorPileup`` when
    ``args.pileup`` is set, otherwise ``CreateTensorFullAlignment``) and feeds
    its stdout to ``Tensor2Bin``, whose stdout is redirected to this process's
    stderr.

    Returns early, after a warning, when not in pileup mode and the BAM path
    does not resolve.

    Args:
        args: parsed command-line namespace.

    Raises:
        SystemExit: on an unsupported platform, a missing ``--ctgName`` or a
            failure to start the child processes.
        KeyboardInterrupt, Exception: re-raised after attempting to terminate
            both child processes.
    """
    basedir = dirname(__file__)
    CTP_Bin = basedir + "/../clair3.py CreateTensorPileup"
    CTFA_Bin = basedir + "/../clair3.py CreateTensorFullAlignment"
    T2B_Bin = basedir + "/../clair3.py Tensor2Bin"

    if args.delay > 0:
        delay = random.randrange(0, args.delay)
        print("[INFO] Delay %d seconds before starting tensor creation ..." % (delay))
        sleep(delay)

    pypyBin = executable_command_string_from(args.pypy, exit_on_not_found=True)
    pythonBin = executable_command_string_from(args.python, exit_on_not_found=True)
    samtoolsBin = executable_command_string_from(args.samtools, exit_on_not_found=True)

    if args.pileup:
        bam_fn = file_path_from(args.bam_fn, exit_on_not_found=True)
    else:
        bam_fn = file_path_from(args.bam_fn)
        if bam_fn is None or bam_fn == "":
            print(
                log_warning(
                    "[WARNING] Skip full-alignment variant calling for empty full-alignment regions"
                ))
            return

    ref_fn = file_path_from(args.ref_fn, exit_on_not_found=True)
    bed_fn = file_path_from(args.bed_fn)
    vcf_fn = file_path_from(args.vcf_fn)
    var_fn = file_path_from(args.var_fn, exit_on_not_found=True)
    bin_fn = args.bin_fn
    extend_bed = file_path_from(args.extend_bed)
    full_aln_regions = file_path_from(args.full_aln_regions)

    platform = args.platform
    if not platform or platform not in param.support_platform:
        sys.exit(
            "[ERROR] Provided platform are not in support platform list [ont, hifi, ilmn]"
        )

    pileup = args.pileup
    ctgName = args.ctgName
    # Fall back to the per-platform default when no min_af is given.
    min_af = args.min_af if args.min_af else param.min_af_dict[platform]
    snp_min_af = args.snp_min_af
    indel_min_af = args.indel_min_af

    if ctgName is None:
        sys.exit(
            "--ctgName must be specified. You can call variants on multiple chromosomes simultaneously."
        )

    pileup_mode = command_option_from(args.pileup, 'pileup')
    phasing_info_mode = command_option_from(args.phasing_info_in_bam, 'phasing_info_in_bam')
    add_no_phasing_mode = command_option_from(
        args.add_no_phasing_data_training, 'add_no_phasing_data_training')
    allow_duplicate_mode = command_option_from(args.allow_duplicate_chr_pos, 'allow_duplicate_chr_pos')
    maximum_non_variant_ratio = CommandOption('maximum_non_variant_ratio', args.maximum_non_variant_ratio)
    shuffle_mode = command_option_from(args.shuffle, 'shuffle')

    # Region / chunk options are only forwarded when both values are given
    # and consistently ordered.
    ctgStart = None
    ctgEnd = None
    chunk_id = None
    chunk_num = None
    if args.ctgStart is not None and args.ctgEnd is not None and int(args.ctgStart) <= int(args.ctgEnd):
        ctgStart = CommandOption('ctgStart', args.ctgStart)
        ctgEnd = CommandOption('ctgEnd', args.ctgEnd)

    if args.chunk_id is not None and args.chunk_num is not None and int(args.chunk_id) <= int(args.chunk_num):
        chunk_id = CommandOption('chunk_id', args.chunk_id)
        chunk_num = CommandOption('chunk_num', args.chunk_num)

    CT_Bin = CTP_Bin if pileup else CTFA_Bin
    create_tensor_command_options = [
        pypyBin,
        CT_Bin,
        CommandOption('bam_fn', bam_fn),
        CommandOption('ref_fn', ref_fn),
        CommandOption('vcf_fn', vcf_fn),
        CommandOption('ctgName', ctgName),
        CommandOption('platform', platform),
        CommandOption('samtools', samtoolsBin),
        CommandOption('bed_fn', bed_fn),
        CommandOption('extend_bed', extend_bed),
        CommandOption('min_af', min_af),
        CommandOption('snp_min_af', snp_min_af),
        CommandOption('indel_min_af', indel_min_af),
        ctgStart,
        ctgEnd,
        chunk_id,
        chunk_num,
    ]

    if not pileup:
        create_tensor_command_options.append(phasing_info_mode)
        create_tensor_command_options.append(add_no_phasing_mode)
        create_tensor_command_options.append(
            CommandOption('full_aln_regions', full_aln_regions))

    compress_tensor_command_options = [
        pythonBin,
        T2B_Bin,
        CommandOption('platform', platform),
        CommandOption('var_fn', var_fn),
        CommandOption('bin_fn', bin_fn),
        CommandOption('bed_fn', bed_fn),
        chunk_id,
        chunk_num,
        allow_duplicate_mode,
        maximum_non_variant_ratio,
        shuffle_mode,
    ]
    if pileup:
        compress_tensor_command_options.append(pileup_mode)

    try:
        c.create_tensor = subprocess_popen(
            shlex.split(command_string_from(create_tensor_command_options)), )

        c.compress_tensor = subprocess_popen(
            shlex.split(command_string_from(compress_tensor_command_options)),
            stdin=c.create_tensor.stdout,
            stdout=sys.stderr)
    except Exception as e:
        print(e, file=sys.stderr)
        sys.exit("Failed to start required processes. Exiting...")

    # Periodic health check of the children while we block in wait().
    signal.signal(signal.SIGALRM, check_return_code)
    signal.alarm(2)

    try:
        c.compress_tensor.wait()
        signal.alarm(0)
        c.create_tensor.stdout.close()
        c.create_tensor.wait()
    except KeyboardInterrupt:
        print(
            "KeyboardInterrupt received when waiting at Tensor2Bin, terminating all scripts."
        )
        try:
            c.compress_tensor.terminate()
            c.create_tensor.terminate()
        except Exception as terminate_error:
            print(terminate_error)
        raise
    except Exception as e:
        print(
            "Exception received when waiting at CreateTensor, terminating all scripts."
        )
        print(e)
        try:
            c.compress_tensor.terminate()
            c.create_tensor.terminate()
        except Exception as terminate_error:
            # A distinct name keeps the outer exception bound; reusing `e`
            # here would unbind it and break the re-raise below.
            print(terminate_error)
        raise
def OutputVariant(args):
    """Extract the truth variants of one contig from a VCF.

    Each retained record is written as one space-separated line
    ``chromosome position reference alternate genotype_1 genotype_2`` with the
    two genotype indices in ascending order. Output goes through ``gzip -c``
    into ``args.var_fn``, or to a ``TruthStdout`` wrapper around ``sys.stdout``
    when ``args.var_fn`` is ``"PIPE"``.

    The VCF is read with ``tabix`` when a region is given, a ``.tbi`` index
    exists and ``tabix`` is found; otherwise with ``gzip -fdc``.

    Args:
        args: parsed command-line namespace providing ``var_fn``, ``vcf_fn``,
            ``ctgName``, ``ctgStart`` and ``ctgEnd``.
    """
    var_fn = args.var_fn
    vcf_fn = args.vcf_fn
    ctg_name = args.ctgName
    ctg_start = args.ctgStart
    ctg_end = args.ctgEnd

    is_file_output = args.var_fn != "PIPE"
    if is_file_output:
        var_fpo = open(var_fn, "wb")
        var_fp = subprocess_popen(shlex.split("gzip -c"), stdin=PIPE, stdout=var_fpo)
    else:
        var_fp = TruthStdout(sys.stdout)

    try:
        is_ctg_region_provided = ctg_start is not None and ctg_end is not None
        if (
            is_ctg_region_provided and
            file_path_from("%s.tbi" % (vcf_fn)) is not None and
            executable_command_string_from("tabix") is not None
        ):
            vcf_fp = subprocess_popen(
                shlex.split("tabix -f -p vcf %s %s:%s-%s" % (vcf_fn, ctg_name, ctg_start, ctg_end)))
        else:
            vcf_fp = subprocess_popen(shlex.split("gzip -fdc %s" % (vcf_fn)))

        for row in vcf_fp.stdout:
            columns = row.strip().split()
            # Skip blank lines (no columns at all) and header lines.
            if not columns or columns[0][0] == "#":
                continue

            # position in vcf is 1-based
            chromosome, position = columns[0], columns[1]
            if chromosome != ctg_name:
                continue
            if is_ctg_region_provided and not (ctg_start <= int(position) <= ctg_end):
                continue
            reference, alternate, last_column = columns[3], columns[4], columns[-1]

            # normal GetTruth
            # Phased and unphased separators are treated alike and a missing
            # allele "." is read as the reference allele "0".
            genotype = last_column.split(":")[0].replace("/", "|").replace(".", "0").split("|")
            genotype_1, genotype_2 = genotype

            # 1000 Genome GetTruth (format problem) (no genotype is given)
            # genotype_1, genotype_2 = "1", "1"
            # if alternate.find(',') >= 0:
            #     genotype_1, genotype_2 = "1", "2"

            if int(genotype_1) > int(genotype_2):
                genotype_1, genotype_2 = genotype_2, genotype_1

            var_fp.stdin.write(" ".join((chromosome, position, reference, alternate, genotype_1, genotype_2)))
            var_fp.stdin.write("\n")

        vcf_fp.stdout.close()
        vcf_fp.wait()
    finally:
        # Always flush the compressor and release the output file, even when
        # reading or parsing the VCF failed part-way.
        if is_file_output:
            var_fp.stdin.close()
            var_fp.wait()
            var_fpo.close()
def Run(args):
    """Call variants for one contig or chunk as a chain of subprocesses.

    The chain is ``[RealignReads |] CreateTensor{Pileup,FullAlignment} |
    CallVariants``. Read realignment is only inserted for the ``ilmn``
    platform in full-alignment mode when ``args.need_realignment`` is set.
    The stdout of ``CallVariants`` is redirected to this process's stderr.

    Returns early, after a warning, when not in pileup mode and the BAM path
    does not resolve.

    Args:
        args: parsed command-line namespace.

    Raises:
        SystemExit: on an unsupported platform, a missing ``--ctgName`` or a
            failure to start the child processes.
        KeyboardInterrupt, Exception: re-raised after attempting to terminate
            all child processes.
    """
    basedir = dirname(__file__)
    CTP_Bin = basedir + "/../clair3.py CreateTensorPileup"
    CTFA_Bin = basedir + "/../clair3.py CreateTensorFullAlignment"
    RR_Bin = basedir + "/../clair3.py RealignReads"
    CVBin = basedir + "/../clair3.py CallVariants"

    if args.delay > 0:
        delay = random.randrange(0, args.delay)
        print("[INFO] Delay %d seconds before starting variant calling ..." % (delay))
        sleep(delay)

    pypyBin = executable_command_string_from(args.pypy, exit_on_not_found=True)
    pythonBin = executable_command_string_from(args.python, exit_on_not_found=True)
    samtoolsBin = executable_command_string_from(args.samtools, exit_on_not_found=True)

    chkpnt_fn = args.chkpnt_fn
    if args.pileup:
        bam_fn = file_path_from(args.bam_fn, exit_on_not_found=True)
    else:
        bam_fn = file_path_from(args.bam_fn)
        if bam_fn is None or bam_fn == "":
            print(log_warning(
                "[WARNING] Skip full-alignment variant calling for empty full-alignment regions"))
            return

    ref_fn = file_path_from(args.ref_fn, exit_on_not_found=True)
    bed_fn = file_path_from(args.bed_fn)
    vcf_fn = file_path_from(args.vcf_fn)
    extend_bed = file_path_from(args.extend_bed)
    full_aln_regions = file_path_from(args.full_aln_regions)

    platform = args.platform
    if not platform or platform not in param.support_platform:
        sys.exit("[ERROR] Provided platform are not in support platform list [ont, hifi, ilmn]")

    pileup = args.pileup
    call_fn = args.call_fn
    sampleName = args.sampleName
    ctgName = args.ctgName
    need_realignment = args.need_realignment and platform == 'ilmn' and not pileup
    # Fall back to the per-platform default when no min_af is given.
    min_af = args.min_af if args.min_af else param.min_af_dict[platform]
    snp_min_af = args.snp_min_af
    indel_min_af = args.indel_min_af

    if ctgName is None:
        sys.exit("--ctgName must be specified. You can call variants on multiple chromosomes simultaneously.")

    haploid_precise_mode = command_option_from(args.haploid_precise, 'haploid_precise')
    haploid_sensitive_mode = command_option_from(args.haploid_sensitive, 'haploid_sensitive')
    output_for_ensemble = command_option_from(args.output_for_ensemble, 'output_for_ensemble')
    showRef_mode = command_option_from(args.showRef, 'showRef')
    qual = command_option_from(args.qual, 'qual', option_value=args.qual)

    add_indel_length_mode = CommandOption('add_indel_length', args.add_indel_length)
    phasing_info_in_bam_mode = command_option_from(args.phasing_info_in_bam, 'phasing_info_in_bam')
    need_phasing_mode = command_option_from(args.need_phasing, 'need_phasing')
    is_from_tables_mode = command_option_from(args.is_from_tables, 'is_from_tables')
    pileup_mode = command_option_from(args.pileup, 'pileup')
    gvcf_mode = CommandOption('gvcf', args.gvcf)
    fast_mode = CommandOption('fast_mode', args.fast_mode)
    call_snp_only_mode = CommandOption('call_snp_only', args.call_snp_only)
    enable_long_indel_mode = CommandOption('enable_long_indel', args.enable_long_indel)

    # Region / chunk options are only forwarded when both values are given
    # and consistently ordered.
    ctgStart = None
    ctgEnd = None
    chunk_id = None
    chunk_num = None
    if args.ctgStart is not None and args.ctgEnd is not None and int(args.ctgStart) <= int(args.ctgEnd):
        ctgStart = CommandOption('ctgStart', args.ctgStart)
        ctgEnd = CommandOption('ctgEnd', args.ctgEnd)

    if args.chunk_id is not None and args.chunk_num is not None and int(args.chunk_id) <= int(args.chunk_num):
        chunk_id = CommandOption('chunk_id', args.chunk_id)
        chunk_num = CommandOption('chunk_num', args.chunk_num)

    if machine() in {"aarch64", "arm64"} or system() == "Darwin":
        # No CPU pinning on ARM machines or macOS.
        taskSet = ""
    else:
        # Pin the caller to a random subset of the CPUs this process may use.
        sched_getaffinity_list = list(os.sched_getaffinity(0))
        maxCpus = len(sched_getaffinity_list)
        if args.tensorflow_threads is None:
            numCpus = maxCpus
        else:
            numCpus = min(args.tensorflow_threads, maxCpus)

        _cpuSet = ",".join(str(x) for x in random.sample(sched_getaffinity_list, numCpus))

        taskSet = "taskset -c %s" % (_cpuSet)
        try:
            subprocess.check_output("which %s" % ("taskset"), shell=True)
        except Exception:
            # Not a bare `except:` so Ctrl-C / SystemExit are not swallowed.
            taskSet = ""

    if need_realignment:
        realign_reads_command_options = [
            pypyBin,
            RR_Bin,
            CommandOption('bam_fn', bam_fn),
            CommandOption('ref_fn', ref_fn),
            CommandOption('ctgName', ctgName),
            ctgStart,
            ctgEnd,
            chunk_id,
            chunk_num,
            CommandOption('samtools', samtoolsBin),
            CommandOption('extend_bed', extend_bed),
            CommandOption('full_aln_regions', full_aln_regions),
        ]
        # The tensor step then reads the realigned reads from its stdin.
        bam_fn = "PIPE"

    CT_Bin = CTP_Bin if pileup else CTFA_Bin
    create_tensor_command_options = [
        pypyBin,
        CT_Bin,
        CommandOption('bam_fn', bam_fn),
        CommandOption('ref_fn', ref_fn),
        CommandOption('vcf_fn', vcf_fn),
        CommandOption('ctgName', ctgName),
        CommandOption('min_af', min_af),
        CommandOption('platform', platform),
        CommandOption('samtools', samtoolsBin),
        CommandOption('bed_fn', bed_fn),
        CommandOption('extend_bed', extend_bed),
        CommandOption('sampleName', args.sampleName),
        CommandOption('minCoverage', args.minCoverage),
        CommandOption('minMQ', args.minMQ),
        ctgStart,
        ctgEnd,
        chunk_id,
        chunk_num,
        gvcf_mode,
    ]

    if not pileup:
        create_tensor_command_options.append(phasing_info_in_bam_mode)
        create_tensor_command_options.append(need_phasing_mode)
        create_tensor_command_options.append(CommandOption('full_aln_regions', full_aln_regions))
    else:
        create_tensor_command_options.append(CommandOption('snp_min_af', snp_min_af))
        create_tensor_command_options.append(CommandOption('indel_min_af', indel_min_af))
        create_tensor_command_options.append(fast_mode)
        create_tensor_command_options.append(call_snp_only_mode)

        # NOTE(review): gVCF options are attached in pileup mode only -
        # confirm against the options accepted by CreateTensorFullAlignment.
        if (args.gvcf):
            create_tensor_command_options.append(CommandOption('base_err', args.base_err))
            create_tensor_command_options.append(CommandOption('gq_bin_size', args.gq_bin_size))
            create_tensor_command_options.append(CommandOption('temp_file_dir', args.temp_file_dir))
            if args.bp_resolution:
                create_tensor_command_options.append(CommandOptionWithNoValue('bp_resolution'))

    call_variant_command_options = [
        taskSet,
        pythonBin,
        CVBin,
        CommandOption('chkpnt_fn', chkpnt_fn),
        CommandOption('call_fn', call_fn),
        CommandOption('sampleName', sampleName),
        CommandOption('ref_fn', ref_fn),
        CommandOption('platform', platform),
        CommandOption('ctgName', ctgName),
        CommandOption('temp_file_dir', args.temp_file_dir),
        haploid_precise_mode,
        haploid_sensitive_mode,
        output_for_ensemble,
        qual,
        add_indel_length_mode,
        showRef_mode,
        is_from_tables_mode,
        pileup_mode,
        chunk_id,
        chunk_num,
        gvcf_mode,
        enable_long_indel_mode,
    ]

    try:
        if need_realignment:
            c.realign_reads = subprocess_popen(
                shlex.split(command_string_from(realign_reads_command_options)),
            )
            c.create_tensor = subprocess_popen(
                shlex.split(command_string_from(create_tensor_command_options)),
                stdin=c.realign_reads.stdout)
        else:
            c.create_tensor = subprocess_popen(
                shlex.split(command_string_from(create_tensor_command_options)),
            )

        c.call_variant = subprocess_popen(
            shlex.split(command_string_from(call_variant_command_options)),
            stdin=c.create_tensor.stdout, stdout=sys.stderr
        )
    except Exception as e:
        print(e, file=sys.stderr)
        sys.exit("Failed to start required processes. Exiting...")

    # Periodic health check of the children while we block in wait().
    signal.signal(signal.SIGALRM, check_return_code)
    signal.alarm(2)

    try:
        c.call_variant.wait()
        signal.alarm(0)
        c.create_tensor.stdout.close()
        c.create_tensor.wait()
        if need_realignment:
            c.realign_reads.stdout.close()
            c.realign_reads.wait()
    except KeyboardInterrupt:
        print("KeyboardInterrupt received when waiting at CallVarBam, terminating all scripts.")
        try:
            c.call_variant.terminate()
            c.create_tensor.terminate()
            if need_realignment:
                c.realign_reads.terminate()
        except Exception as terminate_error:
            print(terminate_error)
        raise
    except Exception as e:
        print("Exception received when waiting at CallVarBam, terminating all scripts.")
        print(e)
        try:
            c.call_variant.terminate()
            c.create_tensor.terminate()
            if need_realignment:
                c.realign_reads.terminate()
        except Exception as terminate_error:
            # A distinct name keeps the outer exception bound; reusing `e`
            # here would unbind it and break the re-raise below.
            print(terminate_error)
        raise
def _chunk_size_for(total_size, chunk_num):
    """Return ``total_size / chunk_num`` rounded up to the next integer."""
    quotient, remainder = divmod(total_size, chunk_num)
    return quotient + 1 if remainder else quotient


def _contig_length_from_fai(fai_fn, ctg_name):
    """Return the length listed for ``ctg_name`` in a .fai index, or 0 if it is absent."""
    with open(fai_fn, 'r') as fai_fp:
        for row in fai_fp:
            columns = row.strip().split("\t")
            if columns[0] == ctg_name:
                return int(columns[1])
    return 0


def CreateTensorPileup(args):
    """
    Create pileup tensors for pileup model training or calling.

    The candidate region is scanned once with ``pileup_counts_clair3``; every
    candidate it reports is then filtered (BED region, known-variant VCF,
    flanking coverage) and sliced into a ``no_of_positions``-wide window
    centred on the candidate position.

    The region to scan is resolved in this order, later steps overriding
    earlier ones:

    1. ``args.ctgStart`` / ``args.ctgEnd`` as given;
    2. if ``args.chunk_id`` is set and no BED file is given, an equal-sized
       slice of the contig (length read from the reference ``.fai``);
    3. if ``args.chunk_id`` is set and a BED file is given, an equal-sized
       slice of the BED span returned by ``bed_tree_from``;
    4. if ``args.chunk_id`` is set and a known-variant VCF is given, the
       min/max position of this chunk's share of the known variants.

    Args:
        args: parsed command-line namespace. Attributes read here: ctgStart,
            ctgEnd, ref_fn, ctgName, bam_fn, chunk_id (1-based), chunk_num,
            snp_min_af, indel_min_af, minCoverage, minMQ, platform, vcf_fn,
            extend_bed, fast_mode, call_snp_only, enable_long_indel, gvcf and,
            in gVCF mode, temp_file_dir, bp_resolution, sampleName, base_err,
            gq_bin_size, samtools.

    Returns:
        A tuple ``(np_pileup_data, all_position_info, all_alt_info)`` where
        ``np_pileup_data`` is an ``np.int32`` array built from the accepted
        tensors and the two lists hold the matching position / alternative
        information. When a known-variant VCF is chunked and this chunk holds
        no variants, three empty lists are returned instead.

    Raises:
        ValueError: if the contig name or the start/end range cannot be
            determined (previously this surfaced as a ``NameError``).
    """
    ctg_start = args.ctgStart
    ctg_end = args.ctgEnd
    fasta_file_path = args.ref_fn
    ctg_name = args.ctgName
    bam_file_path = args.bam_fn
    chunk_id = args.chunk_id - 1 if args.chunk_id else None  # 1-base to 0-base
    chunk_num = args.chunk_num
    minimum_snp_af_for_candidate = args.snp_min_af
    minimum_indel_af_for_candidate = args.indel_min_af
    min_coverage = args.minCoverage
    min_mapping_quality = args.minMQ
    platform = args.platform

    vcf_fn = file_path_from(args.vcf_fn)
    is_known_vcf_file_provided = vcf_fn is not None
    # The confident BED and the extended BED are both resolved from
    # args.extend_bed, so a single lookup serves both names.
    extend_bed = file_path_from(args.extend_bed)
    confident_bed_fn = extend_bed
    is_confident_bed_file_given = confident_bed_fn is not None
    fast_mode = args.fast_mode
    call_snp_only = args.call_snp_only
    enable_long_indel = args.enable_long_indel

    # 1-based regions [start, end] (start and end inclusive)
    _, bed_start, bed_end = bed_tree_from(bed_file_path=extend_bed, contig_name=ctg_name, return_bed_region=True)

    fai_fn = file_path_from(fasta_file_path, suffix=".fai", exit_on_not_found=True, sep='.')

    # Fast mode only applies to the ont platform; it raises the SNP allele
    # frequency and coverage thresholds.
    fast_mode = platform == 'ont' and fast_mode
    if fast_mode:
        minimum_snp_af_for_candidate = max(minimum_snp_af_for_candidate, param.min_af_dict[platform])
        min_coverage = max(min_coverage, 4)

    if enable_long_indel:
        max_indel_length = param.maximum_variant_length_that_need_infer_include_long_indel
    else:
        max_indel_length = param.maximum_variant_length_that_need_infer

    if chunk_id is not None:
        if is_confident_bed_file_given:
            # Split the BED span into chunk_num equal slices.
            chunk_size = _chunk_size_for(bed_end - bed_start, chunk_num)
            ctg_start = bed_start + 1 + chunk_size * chunk_id  # 0-base to 1-base
            ctg_end = ctg_start + chunk_size
        else:
            # Split the whole contig into chunk_num equal slices.
            contig_length = _contig_length_from_fai(fai_fn, ctg_name)
            chunk_size = _chunk_size_for(contig_length, chunk_num)
            ctg_start = chunk_size * chunk_id
            ctg_end = ctg_start + chunk_size

    # Always bind the known-variant set when a VCF is supplied, so the
    # candidate filter below works with or without chunking.
    known_variants_set = set()
    if is_known_vcf_file_provided:
        known_variants_list = vcf_candidates_from(vcf_fn=vcf_fn, contig_name=ctg_name)
        if chunk_id is not None:
            chunk_variants_size = _chunk_size_for(len(known_variants_list), chunk_num)
            chunk_start_pos = chunk_id * chunk_variants_size
            known_variants_set = set(
                known_variants_list[chunk_start_pos:chunk_start_pos + chunk_variants_size])
            if len(known_variants_set) == 0:
                return [], [], []
            ctg_start, ctg_end = min(known_variants_set), max(known_variants_set)
        else:
            known_variants_set = set(known_variants_list)

    if ctg_name is None or ctg_start is None or ctg_end is None:
        raise ValueError(
            "Cannot determine the pileup region: ctgName and a ctgStart/ctgEnd range "
            "(explicit or derived from chunk_id/chunk_num) are required")

    # Extend the region on both sides so that candidates near the edges still
    # get a full flanking window.
    ctg_start = max(1, ctg_start)
    extend_start = max(1, ctg_start - no_of_positions)
    extend_end = ctg_end + no_of_positions
    region = Region.from_string("{}:{}-{}".format(ctg_name, extend_start, extend_end))

    confident_bed_tree = bed_tree_from(bed_file_path=confident_bed_fn,
                                       contig_name=ctg_name,
                                       bed_ctg_start=extend_start,
                                       bed_ctg_end=extend_end)

    if args.gvcf:
        from preprocess.utils import variantInfoCalculator
        nonVariantCaller = variantInfoCalculator(
            gvcfWritePath=args.temp_file_dir,
            ref_path=args.ref_fn,
            bp_resolution=args.bp_resolution,
            ctgName=ctg_name,
            sample_name='.'.join([args.sampleName, ctg_name, str(ctg_start), str(ctg_end)]),
            p_err=args.base_err,
            gq_bin_size=args.gq_bin_size)

    chunk_result, all_alt_info_list, gvcf_output = pileup_counts_clair3(
        region,
        bam=bam_file_path,
        fasta=fasta_file_path,
        min_depth=min_coverage,
        min_snp_af=minimum_snp_af_for_candidate,
        min_indel_af=minimum_indel_af_for_candidate,
        min_mq=min_mapping_quality,
        max_indel_length=max_indel_length,
        call_snp_only=call_snp_only,
        max_depth=param.max_depth,
        gvcf=args.gvcf)

    # slice all candidates tensor according to the alternative information
    np_pileup_data, all_position_info, all_alt_info = [], [], []
    for pos, pos_info, alt_info in all_alt_info_list:
        pos = int(pos)
        if is_confident_bed_file_given and not is_region_in(
                tree=confident_bed_tree,
                contig_name=ctg_name,
                region_start=pos - 1,
                region_end=pos + 1):
            continue
        if is_known_vcf_file_provided and pos not in known_variants_set:
            continue

        start, end = pos - flanking_base_num, pos + flanking_base_num + 1
        for result in chunk_result:
            # result[0] is the tensor matrix; the first element of each
            # result[1] entry is compared against reference positions.
            first_pos, last_pos = result[1][0][0], result[1][-1][0]
            if start - 1 < first_pos or end > last_pos:
                continue
            offset = start - first_pos - 1
            tensor = result[0][offset:offset + no_of_positions]
            # mainly because no coverage in flanking windows
            if tensor.shape != (no_of_positions, channel_size):
                continue
            # skip windows containing any all-zero row (an empty flanking position)
            if np.sum(np.sum(tensor == 0, axis=1) == channel_size) > 0:
                continue
            np_pileup_data.append(tensor)
            all_position_info.append(pos_info)
            all_alt_info.append(alt_info)
    np_pileup_data = np.array(np_pileup_data, dtype=np.int32)

    if args.gvcf:
        from shared.utils import reference_sequence_from, region_from
        reference_start = ctg_start - param.expandReferenceRegion
        reference_end = ctg_end + param.expandReferenceRegion
        reference_start = 1 if reference_start < 1 else reference_start
        ref_regions = [region_from(ctg_name=ctg_name, ctg_start=reference_start, ctg_end=reference_end)]
        reference_sequence = reference_sequence_from(
            samtools_execute_command=args.samtools,
            fasta_file_path=fasta_file_path,
            regions=ref_regions)

        # NOTE(review): the extra offset of 1 is skipped only when ctg_start == 1;
        # presumably this aligns the count arrays with extend_start -- confirm
        # against pileup_counts_clair3.
        offset = 0 if ctg_start == 1 else 1
        start = ctg_start - extend_start + offset
        end = ctg_end + 1 - extend_start + offset
        # gvcf_output[0] / gvcf_output[1] are read below as per-position
        # reference and total counts.
        empty_pileup_flag = sum(gvcf_output[1][start:end]) == 0

        if not empty_pileup_flag:
            for pos in range(ctg_start, ctg_end):
                ref_count = gvcf_output[0][pos - extend_start + offset]
                total_count = gvcf_output[1][pos - extend_start + offset]
                if pos - reference_start >= len(reference_sequence):
                    continue
                reference_base = reference_sequence[pos - reference_start]
                if ref_count == 0 and total_count == 0:
                    # Uncovered site: report plain integer zeros.
                    n_total, n_ref = 0, 0
                else:
                    n_total, n_ref = total_count, ref_count
                nonVariantCaller.make_gvcf_online({
                    'chr': ctg_name,
                    'pos': pos,
                    'ref': reference_base,
                    'n_total': n_total,
                    'n_ref': n_ref,
                })

        # Flush whatever block is still pending in the writer.
        if len(nonVariantCaller.current_block) != 0:
            nonVariantCaller.write_to_gvcf_batch(nonVariantCaller.current_block,
                                                 nonVariantCaller.cur_min_DP,
                                                 nonVariantCaller.cur_raw_gq)

        if empty_pileup_flag:
            nonVariantCaller.write_empty_pileup(ctg_name, ctg_start, ctg_end)
        nonVariantCaller.close_vcf_writer()

    return np_pileup_data, all_position_info, all_alt_info