def check_vcf_gq_field(
    vcf_path: Union[Path, str],
) -> None:
    """Check that the VCF at ``vcf_path`` declares a ``GQ`` field.

    The file is opened with cyvcf2, passing ``settings.cyvcf_threads`` as the
    ``threads`` argument, and the handle is closed again before returning.

    Args:
        vcf_path: Location of the VCF file to inspect.

    Raises:
        VCFParserError: If ``GQ`` is not found in the file.
    """
    vcf_file = VCF(vcf_path, threads=settings.cyvcf_threads)
    try:
        has_gq = vcf_file.contains("GQ")
    finally:
        # Release the underlying handle whether or not the lookup succeeded.
        vcf_file.close()
    if not has_gq:
        raise VCFParserError(f"GQ not found in {vcf_path}")
def process_vep_scores(vcf_file, is_clinvar=False):
    """Extract per-variant annotations and tool scores from a VEP-annotated VCF.

    For every record a list is stored under the key ``"<ID>_<POS>"`` holding,
    in this order:

    1. CHROM, POS, REF, ALT, ID, var_type and var_subtype of the record;
    2. the ``Existing_variation``, ``HGVSc`` and ``Consequence`` values of the
       first VEP annotation block (collection stops at the first of these
       columns that the header format does not declare);
    3. ``(name, value)`` tuples for scores read straight from INFO, only when
       at least one entry of ``vep_plugins`` is missing from the VEP format;
    4. ``(plugin, value)`` tuples for the plugins found in the VEP format;
    5. ``CLNREVSTAT`` and ``CLNSIG`` tuples when ``is_clinvar`` is true.

    The VEP annotation is read from ``ANN`` when the header declares it,
    otherwise from ``CSQ``.

    :param vcf_file: path of the VCF to read
    :param bool is_clinvar: also collect the ClinVar INFO fields
    :return: ``defaultdict(list)`` keyed by variant
    :raises SystemExit: with status 1 when neither ``ANN`` nor ``CSQ`` is
        declared in the header
    """
    # (label stored in the output, INFO key read from the record)
    info_level_scores = (
        ("GERP", "GERP"),
        ("phyloP", "phyloP"),
        ("phastCons", "phastCons"),
        ("SiPhy", "SiPhy"),
        ("fitcons", "fitcons"),
        ("LINSIGHT", "linsight_g"),
        ("ReMM", "ReMM"),
        ("DANN", "DANN"),
        ("GWAVA", "GWAVA"),
        ("FATHMM-MKL", "fathmmMKL"),
        ("Eigen", "Eigen"),
        ("Eigen-PC", "Eigen-PC"),
        ("funseq2", "funseq2"),
        ("dpsi_zscore", "dpsi_zscore"),
        ("traP", "traP"),
    )

    indexes, scores, absent = OrderedDict(), defaultdict(list), []
    vcf_data = VCF(vcf_file)
    try:
        if vcf_data.contains("ANN"):
            vep_tag = "ANN"
        elif vcf_data.contains("CSQ"):
            vep_tag = "CSQ"
        else:
            print("Program requires ANN field to be present in the INFO field of the input VCF file.\n")
            raise SystemExit(1)

        tools = []
        spliceai_scores_fromtool = False
        for field in vcf_data.header_iter():
            if field["HeaderType"] != "INFO":
                continue
            if field["ID"] == vep_tag:
                # The sub-field names follow "Format:" in the description;
                # the last character of the description is dropped.
                tools = field["Description"].split("Format:")[1][:-1].strip().split("|")
                for plugin in vep_plugins:
                    if plugin in tools:
                        indexes[plugin] = tools.index(plugin)
                    else:
                        absent.append(plugin)
            elif field["ID"] == "SpliceAI":
                spliceai_scores_fromtool = True

        # Resolve the fixed VEP columns once instead of once per record.
        # Stop at the first missing column so the output layout matches what
        # a per-record lookup that aborts on the first failure would produce.
        basic_columns = []
        for column in ("Existing_variation", "HGVSc", "Consequence"):
            if column not in tools:
                break
            basic_columns.append(tools.index(column))

        for record in vcf_data:
            key = record.ID + "_" + str(record.POS)
            entry = scores[key]
            entry.extend([record.CHROM, record.POS, record.REF, record.ALT,
                          record.ID, record.var_type, record.var_subtype])

            annotation = record.INFO.get(vep_tag)
            # Only the first annotation block (first transcript) is used.
            info = annotation.split(",")[0].split("|") if annotation else None

            if info is not None:
                for column_index in basic_columns:
                    entry.append(info[column_index])

            # NOTE(review): these scores come from INFO rather than from the
            # VEP block, so they are collected even for records without a VEP
            # annotation -- confirm this matches the intended nesting.
            if absent:
                for label, info_key in info_level_scores:
                    entry.append((label, record.INFO.get(info_key)))
                if spliceai_scores_fromtool:
                    entry.append(("SpliceAI", record.INFO.get("SpliceAI")))
                else:
                    entry.append(("SpliceAI", format_spliceai_fields(record)))

            # Plugin values live inside the VEP block, so a record without one
            # contributes no plugin tuples (never the previous record's).
            if info is not None:
                for pl, i in indexes.items():
                    entry.append((pl, info[i]))

            if is_clinvar:
                entry.append(('CLNREVSTAT', record.INFO.get("CLNREVSTAT")))
                entry.append(('CLNSIG', record.INFO.get("CLNSIG")))
    finally:
        # Also reached on the SystemExit path, so the handle never leaks.
        vcf_data.close()
    return scores
def _collect_onco_xrefs(raw_xref, xref_map):
    """Parse a raw PCGR_ONCO_XREF value into {transcript_id: {annotation: value}}."""
    parsed = {}
    for transcript_onco_xref in raw_xref.split(','):
        xrefs = transcript_onco_xref.split('|')
        per_transcript = {}
        parsed[str(xrefs[0])] = per_transcript
        for annotation, annotation_index in xref_map.items():
            # Entries may carry fewer columns than the map describes.
            if annotation_index > (len(xrefs) - 1):
                continue
            if xrefs[annotation_index] != '':
                per_transcript[annotation] = xrefs[annotation_index]
    return parsed


def _annotate_picked_consequence(rec, csq_fields, index2fields, onco_xref, xref_map):
    """Copy the non-empty elements of one picked CSQ block into INFO tags of rec."""
    flag_annotations = ('TUMOR_SUPPRESSOR', 'ONCOGENE', 'NETWORK_CG',
                        'CANCER_PREDISPOSITION')
    for j, value in enumerate(csq_fields):
        if j not in index2fields or value == '':
            continue
        tag = index2fields[j]
        rec.INFO[tag] = str(value)

        if tag == 'Feature':
            ensembl_transcript_id = str(value)
            if ensembl_transcript_id in onco_xref:
                transcript_xref = onco_xref[ensembl_transcript_id]
                for annotation in xref_map:
                    if annotation in ('CHORUM_ID', 'UNIPROT_ACC'):
                        continue
                    if annotation in transcript_xref:
                        if annotation in flag_annotations:
                            rec.INFO[annotation] = True
                        else:
                            rec.INFO[annotation] = transcript_xref[annotation]

        if tag == 'DOMAINS':
            for domain in str(value).split('&'):
                if domain.startswith('Pfam_domain'):
                    # Drop the 'Pfam_domain:' prefix and a trailing '.<digits>'.
                    rec.INFO['PFAM_DOMAIN'] = str(
                        re.sub(r'\.[0-9]{1,}$', '',
                               re.sub(r'Pfam_domain:', '', domain)))

        if tag == 'Existing_variation':
            cosmic_identifiers = []
            dbsnp_identifiers = []
            for var_id in str(value).split('&'):
                if var_id.startswith('COSM'):
                    cosmic_identifiers.append(var_id)
                if var_id.startswith('rs'):
                    dbsnp_identifiers.append(re.sub('^rs', '', var_id))
            if len(cosmic_identifiers) > 0:
                rec.INFO['COSMIC_MUTATION_ID'] = '&'.join(cosmic_identifiers)
            if len(dbsnp_identifiers) > 0:
                rec.INFO['DBSNPRSID'] = '&'.join(dbsnp_identifiers)


def _run_external_tool(command):
    """Run an argv list without a shell; log a warning instead of raising on failure."""
    import subprocess

    try:
        returncode = subprocess.call(command)
    except OSError as err:
        logger.warning('Could not run ' + str(command[0]) + ': ' + str(err))
        return
    if returncode != 0:
        logger.warning(str(command[0]) + ' exited with status ' + str(returncode))


def extend_vcf_annotations(query_vcf, pcgr_db_directory, pcgr_predispose):
    """
    Function that reads VEP/vcfanno-annotated VCF and extends the VCF INFO column with tags from
    1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Cancer-relevant gene annotations, e.g. known oncogenes/tumor suppressors, known antineoplastic drugs interacting with a given protein etc.
    3. Protein-relevant annotations, e.g. cancer hotspot mutations, functional protein features etc.
    4. Variant effect predictions

    The extended records are written to ``<query_vcf stem>.annotated.vcf``;
    if that file is non-empty it is compressed with ``bgzip``, indexed with
    ``tabix`` and handed to ``write_pass_vcf``. Records without a CSQ tag are
    logged and skipped.

    :param query_vcf: path of the input VCF (``.vcf`` or ``.vcf.gz``)
    :param pcgr_db_directory: directory holding the ``pcgr_infotags*.tsv`` files
    :param pcgr_predispose: when ``True`` the predisposition infotag file is used
    """
    ## read VEP and PCGR tags to be appended to VCF file
    infotags_filename = 'pcgr_infotags.tsv'
    if pcgr_predispose is True:
        infotags_filename = 'pcgr_infotags_predisposition.tsv'
    pcgr_vcf_infotags_meta = pcgrutils.read_infotag_file(
        os.path.join(pcgr_db_directory, infotags_filename))

    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    # VEP allele-frequency sub-tags and the INFO tag names they are stored under.
    vep_to_pcgr_af = {
        'gnomAD_AMR_AF': 'AMR_AF_GNOMAD',
        'gnomAD_AFR_AF': 'AFR_AF_GNOMAD',
        'gnomAD_EAS_AF': 'EAS_AF_GNOMAD',
        'gnomAD_NFE_AF': 'NFE_AF_GNOMAD',
        'gnomAD_AF': 'GLOBAL_AF_GNOMAD',
        'gnomAD_SAS_AF': 'SAS_AF_GNOMAD',
        'gnomAD_OTH_AF': 'OTH_AF_GNOMAD',
        'gnomAD_ASJ_AF': 'ASJ_AF_GNOMAD',
        'gnomAD_FIN_AF': 'FIN_AF_GNOMAD',
        'AFR_AF': 'AFR_AF_1KG',
        'AMR_AF': 'AMR_AF_1KG',
        'SAS_AF': 'SAS_AF_1KG',
        'EUR_AF': 'EUR_AF_1KG',
        'EAS_AF': 'EAS_AF_1KG',
        'AF': 'GLOBAL_AF_1KG',
    }

    vcf = VCF(query_vcf)
    vep_csq_index2fields = {}
    vep_csq_fields2index = {}
    dbnsfp_prediction_algorithms = []

    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' not in header_element:
            continue
        identifier = str(header_element['ID'])
        if identifier != 'CSQ' and identifier != 'DBNSFP':
            continue
        description = str(header_element['Description'])
        if 'Format: ' not in description:
            continue
        subtags = description.split('Format: ')[1].split('|')
        if identifier == 'CSQ':
            for i, t in enumerate(subtags):
                v = str(vep_to_pcgr_af[t]) if t in vep_to_pcgr_af else t
                # Only sub-tags described in the infotag file are expanded.
                if v in pcgr_vcf_infotags_meta:
                    vep_csq_index2fields[i] = v
                    vep_csq_fields2index[v] = i
        if identifier == 'DBNSFP':
            # Algorithm names start at the eighth DBNSFP sub-tag.
            dbnsfp_prediction_algorithms.extend(
                str(re.sub(r'((_score)|(_pred))"*$', '', subtag))
                for subtag in subtags[7:])

    for tag in pcgr_vcf_infotags_meta:
        if not vcf.contains(tag):
            vcf.add_info_to_header({
                'ID': tag,
                'Description': str(pcgr_vcf_infotags_meta[tag]['description']),
                'Type': str(pcgr_vcf_infotags_meta[tag]['type']),
                'Number': str(pcgr_vcf_infotags_meta[tag]['number'])
            })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0
    # Column position of each annotation inside a '|'-separated
    # PCGR_ONCO_XREF entry (column 0 is the transcript identifier).
    pcgr_onco_xref_map = {
        'SYMBOL': 1,
        'ENTREZ_ID': 2,
        'UNIPROT_ID': 3,
        'APPRIS': 4,
        'UNIPROT_ACC': 5,
        'CHORUM_ID': 6,
        'TUMOR_SUPPRESSOR': 7,
        'ONCOGENE': 8,
        'NETWORK_CG': 9,
        'DISGENET_CUI': 10,
        'CHEMBL_COMPOUND_ID': 11,
        'INTOGEN_DRIVER': 12,
        'ONCOSCORE': 13,
        'CANCER_PREDISPOSITION': 14
    }

    for rec in vcf:
        all_transcript_consequences = []
        chrom = str(rec.CHROM)
        if chrom != current_chrom:
            if current_chrom is not None:
                logger.info(
                    'Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
            current_chrom = chrom
            num_chromosome_records_processed = 0

        csq_value = rec.INFO.get('CSQ')
        if csq_value is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                rec.REF) + '>' + alt_allele
            logger.warning(
                'Variant record ' + str(variant_id) +
                ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?) - variant will be skipped'
            )
            continue

        num_chromosome_records_processed += 1
        raw_onco_xref = rec.INFO.get('PCGR_ONCO_XREF')
        pcgr_onco_xref = {}
        if raw_onco_xref is not None:
            pcgr_onco_xref = _collect_onco_xrefs(raw_onco_xref,
                                                 pcgr_onco_xref_map)

        for csq in csq_value.split(','):
            csq_fields = csq.split('|')
            ## only consider the primary/picked consequence when expanding with annotation tags
            if csq_fields[vep_csq_fields2index['PICK']] == "1":
                _annotate_picked_consequence(rec, csq_fields,
                                             vep_csq_index2fields,
                                             pcgr_onco_xref,
                                             pcgr_onco_xref_map)
                set_coding_change(rec)

            # Every transcript block, picked or not, is summarised as
            # Consequence:SYMBOL:Feature_type:Feature:BIOTYPE.
            symbol = '.'
            if csq_fields[vep_csq_fields2index['SYMBOL']] != "":
                symbol = str(csq_fields[vep_csq_fields2index['SYMBOL']])
            consequence_entry = ':'.join([
                str(csq_fields[vep_csq_fields2index['Consequence']]),
                str(symbol),
                str(csq_fields[vep_csq_fields2index['Feature_type']]),
                str(csq_fields[vep_csq_fields2index['Feature']]),
                str(csq_fields[vep_csq_fields2index['BIOTYPE']]),
            ])
            all_transcript_consequences.append(consequence_entry)

        if rec.INFO.get('DBNSFP') is not None:
            map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        rec.INFO['VEP_ALL_CONSEQUENCE'] = ','.join(all_transcript_consequences)
        w.write_record(rec)

    w.close()
    logger.info('Completed summary of functional annotations for ' +
                str(num_chromosome_records_processed) +
                ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        # Argument lists (no shell) keep paths with spaces or shell
        # metacharacters from being split or interpreted.
        _run_external_tool(['bgzip', '-f', str(out_vcf)])
        _run_external_tool(['tabix', '-f', '-p', 'vcf', str(out_vcf) + '.gz'])
        annotated_vcf = out_vcf + '.gz'
        write_pass_vcf(annotated_vcf)
    else:
        pcgrutils.pcgr_error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)',
            logger)
def process_vcf(vcf: str,
                tools_config: dict,
                thresholds: list,
                af_col: str,
                is_clinvar: bool = False):
    """
    Scans a VEP annotated VCF file and extracts all the tools scores

    The header is parsed first to locate every configured field, either
    inside the VEP annotation (``ANN`` or ``CSQ``) or as a standalone INFO
    field. Tools with at least one field missing from the header are dropped.
    Files with more than 5000 variants are split with ``bcftools`` and the
    chunks are processed by a ``multiprocessing.Pool``; smaller files are
    processed in this process.

    :param str vcf: VCF file
    :param dict tools_config: Dict with the
    map between available tools and VCF annotations
    (already parsed, only existing fields should be
    passed here).
    :param list thresholds: List of tools to process
    based on the tools config file as well as the
    selected scope for the analysis
    :param str af_col: VCF field that measures
    population frequency of variants.
    :param bool is_clinvar: Whether `vcf` is
    from clinvar database.

    :return dict: Dict with scores keyed by variant
    :return list: List with the tools provided in the
    config that were not found in the VCF, each listed once.
    (Will be used to update the config_tools variable)
    :raises ValueError: if the header declares neither ``ANN`` nor ``CSQ``,
    or if ``is_clinvar`` is set and no ClinVar INFO field is declared.
    """
    logging.info("Extracting predictions from VCF.")

    # List of required INFO fields when
    # dataset is from Clinvar
    clinvar_fields = ['CLNREVSTAT', 'CLNSIG']
    clinvar_confirmed = False

    # List of tools belonging to the
    # provided scope that will be analysed
    # Requires to be present in the updated config
    # (some tools might be removed from config after processing first file)
    tools_to_analyse = [t[0] for t in thresholds if t[0] in tools_config]

    # MAP with the list of VCF fields per tool
    tools_config_map = {
        k: v[0]
        for k, v in tools_config.items() if k in tools_to_analyse
    }

    # List with the VCF fields to check
    # Present fields in the header will be
    # popped out sequentially
    missing_vcf_fields_flat = [
        f for fields in tools_config_map.values() for f in fields
    ]

    # Missing tools from VCF to return
    # and exclude from analysis
    missing_vcf_tools = []

    # Indexes is an ordered dict with the index(s)
    # where information about a given tool is stored
    # in the VEP annotation field
    vep_indexes = OrderedDict()

    # Dict with the list of tools and corresponding
    # VCF fields found in the INFO field as a single
    # annotation
    present_in_INFO = defaultdict(list)

    # Names of the VEP annotation subfields, in header order
    all_vep_annotations = []

    # Parse header and count number of variants
    vcf_data = VCF(vcf)
    try:
        if vcf_data.contains("ANN"):
            vep_tag = "ANN"
        elif vcf_data.contains("CSQ"):
            vep_tag = "CSQ"
        else:
            raise ValueError(
                "VETA requires VEP annotations. ANN or CSQ field was not found in "
                "the INFO field of the input VCF file. Exiting.\n")

        for field in vcf_data.header_iter():
            if field["HeaderType"] != "INFO":
                continue
            field_id = field["ID"]

            # If field is in VEP annotations
            if field_id == vep_tag:
                all_vep_annotations = field["Description"].split(
                    "Format:")[1][:-1].strip().split("|")

                if af_col in all_vep_annotations:
                    vep_indexes[af_col] = [all_vep_annotations.index(af_col)]

                # Looks only for scores belonging
                # to the specific scope set
                for _tool in tools_to_analyse:
                    _present, _absent = _check_if_field_exists(
                        tools_config_map[_tool], all_vep_annotations)
                    if _present:
                        vep_indexes[_tool] = [v[1] for v in _present]
                        for v in _present:
                            missing_vcf_fields_flat.remove(v[0])
            else:
                if field_id in missing_vcf_fields_flat:
                    _tool_name = next(_t
                                      for _t, v in tools_config_map.items()
                                      if field_id in v)
                    present_in_INFO[_tool_name].append(field_id)
                    missing_vcf_fields_flat.remove(field_id)

                if field_id in clinvar_fields:
                    clinvar_confirmed = True

                if (field_id == af_col and af_col not in vep_indexes
                        and af_col not in present_in_INFO):
                    present_in_INFO[af_col].append(field_id)

        if is_clinvar and not clinvar_confirmed:
            raise ValueError(
                'Input VCF is not from Clinvar ({}) fields were '
                'not found in the VCF INFO annotations. They are '
                'required for clinical significance extraction.\n'
                'Additionally, be aware that when running in '
                'benchmark mode, non-Clinvar files must be passed '
                'via an input directory where target files '
                'referring to each class are located.'.format(clinvar_fields))

        # if there are VCF fields from
        # the config absent in VCF header
        # LOG absent fields
        if missing_vcf_fields_flat:
            for tool, fields in tools_config_map.items():
                for _f in fields:
                    if _f not in missing_vcf_fields_flat:
                        continue
                    logging.info("'{}' field does not exist "
                                 "in VCF. Corresponding tool "
                                 "('{}') will be discarded from "
                                 "analysis.".format(_f, tool))
                    # Report each tool once, even when several of its
                    # fields are missing.
                    if tool not in missing_vcf_tools:
                        missing_vcf_tools.append(tool)

                    # Delete tool from analysis
                    # if at least one subfield
                    # is not present
                    vep_indexes.pop(tool, None)
                    present_in_INFO.pop(tool, None)

        n_variants = sum(1 for _ in vcf_data)
    finally:
        # Close the handle on the error paths as well.
        vcf_data.close()

    # Pack kwargs
    args = {
        'VEP_TAG': vep_tag,
        'all_vep_annotations': all_vep_annotations,
        'present_in_INFO': present_in_INFO,
        'vep_indexes': vep_indexes,
        'is_clinvar': is_clinvar
    }

    # Variant processing
    if n_variants > 5000:
        # The context manager closes the pipe and waits for bcftools.
        with subprocess.Popen(["bcftools", "view", "-h", vcf],
                              stdout=subprocess.PIPE) as header_proc:
            header = header_proc.stdout.readlines()

        # The header-less body is only needed until it has been split.
        with tempfile.NamedTemporaryFile() as body_variants:
            subprocess.call(["bcftools", "view", "-H", vcf],
                            stdout=body_variants)
            list_files = osutils.split_file_in_chunks(body_variants.name,
                                                      header, n_variants)

        try:
            with multiprocessing.Pool() as p:
                dict_list = list(
                    tqdm(p.imap(partial(_iter_variants, **args), list_files)))

            logging.info("Merging data from parallel VCF processing.")
            scores = defaultdict(list)
            # A key present in several chunks keeps the value of the last
            # chunk; repeated variants are not detected here.
            for d in dict_list:
                scores.update(d)
        finally:
            # Remove the chunk files even if a worker failed.
            for f in list_files:
                os.remove(f)
        logging.info("Done.")
    else:
        scores = _iter_variants(vcf, **args)

    return scores, missing_vcf_tools
def main(
    in_vcf: str,
    out_vcf: str,
    overwrite: bool,
    verbose: bool,
    min_qual: float,
    min_depth: int,
    min_fed: float,
    max_depth: int,
    min_strand_bias: int,
    min_bqb: float,
    min_mqb: float,
    min_rpb: float,
    min_rpbz: Optional[float],
    max_rpbz: Optional[float],
    max_scbz: Optional[float],
    max_sgb: float,
    min_vdb: float,
    hist: bool,
    min_frs: float,
    min_mq: int,
):
    """Apply the following filters to a VCF:\n
    - Minimum proportion of the expected (median) depth\n
    - Maximum proportion of the expected (median) depth\n
    - Minimum QUAL threshold\n
    - Minimum Strand bias percentage
    """
    # NOTE(review): the docstring above is left untouched because it looks
    # like CLI help text -- confirm before expanding it.
    #
    # The input is read twice: a first pass collects depths and QUALs to
    # derive the expected (median) depth, a second pass applies the filters
    # and writes every record to ``out_vcf``. Raises DepthTagError when the
    # depth tag is not declared in the header.
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s]: %(message)s", level=log_level
    )

    # First pass: header checks and depth/QUAL collection.
    vcf_reader = VCF(in_vcf)
    try:
        if not vcf_reader.contains(Tags.Depth.value):
            raise DepthTagError(f"Depth tag {Tags.Depth} not found in header")

        if (not vcf_reader.contains(str(Tags.StrandDepth))) and min_strand_bias:
            logging.warning(
                f"Strand depth tag {Tags.StrandDepth} not found in header. "
                f"Turning off strand bias filter..."
            )
            min_strand_bias = 0

        logging.info("Calculating expected (median) depth...")
        depths = []
        quals = []
        for v in vcf_reader:
            depths.append(get_depth(v))
            quals.append(v.QUAL or 0)
    finally:
        # The file is reopened for the second pass, so release this handle.
        vcf_reader.close()

    # np.median of an empty list is NaN, which int() rejects; a VCF with no
    # records therefore uses 0 and simply produces a header-only output.
    median_depth = np.median(depths) if depths else 0.0
    logging.info(f"Expected depth: {median_depth}")

    if hist:
        import histoprint

        tick_format = "% .1f"
        logging.info("Depth histogram:")
        histoprint.print_hist(
            np.histogram(depths, bins=HIST_BINS),
            title="Depth histogram",
            summary=True,
            tick_format=tick_format,
            file=click.get_text_stream("stderr"),
        )
        logging.info("QUAL histogram")
        histoprint.print_hist(
            np.histogram(quals, bins=HIST_BINS),
            title="QUAL histogram",
            summary=True,
            tick_format=tick_format,
            file=click.get_text_stream("stderr"),
        )

    assessor = Filter(
        expected_depth=int(median_depth),
        min_qual=min_qual,
        min_depth=min_depth,
        min_fed=min_fed,
        max_depth=max_depth,
        min_strand_bias=min_strand_bias,
        min_bqb=min_bqb,
        min_mqb=min_mqb,
        min_rpb=min_rpb,
        max_sgb=max_sgb,
        min_vdb=min_vdb,
        min_frs=min_frs,
        min_mq=min_mq,
        min_rpbz=min_rpbz,
        max_rpbz=max_rpbz,
        max_scbz=max_scbz,
    )

    # Second pass: annotate FILTER and write every record.
    vcf_reader = VCF(in_vcf)
    assessor.add_filters_to_header(vcf_reader)

    Path(out_vcf).parent.mkdir(exist_ok=True, parents=True)
    vcf_writer = Writer(out_vcf, tmpl=vcf_reader)

    stats = Counter()
    logging.info("Filtering variants...")
    try:
        for variant in vcf_reader:
            filter_status = assessor.filter_status(variant)

            if (
                (not overwrite)
                and variant.FILTER is not None
                and filter_status != str(Tags.Pass)
            ):
                # Keep the existing FILTER value and append the new failures.
                current_filter = variant.FILTER.rstrip(";")
                variant.FILTER = f"{current_filter};{filter_status}"
            else:
                variant.FILTER = filter_status

            vcf_writer.write_record(variant)
            stats.update(filter_status.split(";"))
    finally:
        vcf_reader.close()
        vcf_writer.close()

    logging.info("FILTER STATISTICS")
    logging.info("=================")
    for filt, count in stats.items():
        logging.info(f"Filter: {filt}\tCount: {count}")

    logging.info("Done!")
def main(arguments=None):
    """Compare the variants of a VCF against a reference panel and write the result.

    Arguments come from ``parse_arguments()``. Each variant gets an ``AF``
    INFO value if it has none. Variants found in the panel (keyed by
    ``"<CHROM>_<end>"``) are dropped when they have no ALT allele or, if
    requested, an allele frequency strictly between the two ambiguity
    thresholds; otherwise they may be ref/alt-swapped or strand-flipped and
    are annotated with ``UPD``, ``PFD``, ``MISS`` and ``MAF``. Every kept
    variant is written to the output file and reported on stderr via
    ``print_variant_toml``. A summary plot is created when at least one
    variant was found in the panel.
    """
    args = parse_arguments()

    vcf = VCF(args["vcf_file"], threads=args["threads"])
    for info_definition in (UPDATED, PANEL_FREQ_DIFF, MISSINGNES, MAF):
        vcf.add_info_to_header(info_definition)
    if not vcf.contains("AF"):
        vcf.add_info_to_header(AF)
    w = Writer(args["output"], vcf)

    panel = generate_panel_data(
        panel_file=args["reference_panel"],
        chr=args["chromosomes"],
        annotation=args["chromosome_annotation"],
        panel_type=args["reference_panel_type"],
        separator=args["separator"],
    )
    vcf_summary = VCFSummary()

    header_values = (
        args["outlier_threshold"],
        args["min_ambigious_threshold"],
        args["max_ambigious_threshold"],
        args["reference_panel_type"],
        args["chromosome_annotation"],
        args["ambigious"],
        args["fix_complement_ref_alt"],
    )
    print(toml_header % header_values, file=sys.stderr)

    for variant in vcf:
        status, reason = "unchanged", "None"
        panel_variant_freq = None
        variant_id_end = f"{variant.CHROM}_{variant.end}"

        if not variant.INFO.get("AF"):
            variant.INFO["AF"] = variant.aaf

        if variant_id_end in panel:
            variant.INFO["UPD"] = 0
            panel_variant = panel[variant_id_end]
            panel_variant_freq = panel_variant["freq"]
            vcf_summary.n_in_panel += 1

            # Variants without an ALT allele are reported and dropped.
            if not variant.ALT:
                print_variant_toml(variant, panel_variant_freq, "removed",
                                   "unknown_alt/monomorphic")
                vcf_summary.unknown_alt += 1
                continue

            # Optionally drop variants whose frequency is in the ambiguous band.
            in_ambiguous_band = (args["min_ambigious_threshold"] < variant.aaf
                                 < args["max_ambigious_threshold"])
            if args["ambigious"] and in_ambiguous_band:
                vcf_summary.ambigious += 1
                print_variant_toml(variant, panel_variant_freq, "removed",
                                   "ambigious_frequency")
                continue

            if should_recode(variant, panel_variant):
                swap_ref_alt(variant)
                variant.INFO["UPD"] = 1
                vcf_summary.updated += 1
                status = "updated"
                reason = "ref_alt_swapped"

            if (should_flipstrand(variant, panel_variant)
                    and args["fix_complement_ref_alt"]):
                flipstrand(variant, panel_variant["freq"])
                variant.INFO["UPD"] = 1
                vcf_summary.flipped += 1
                status = "strand_flipped"
                reason = "ref/alt_not_in_panel_nucleotides"

            vcf_summary.add_variant(variant_id_end)

            v_freq = variant.INFO.get("AF")
            variant.INFO["PFD"] = abs(v_freq - panel_variant["freq"])

            gt_types = variant.gt_types
            variant.INFO["MISS"] = np.sum(gt_types == 2) / len(gt_types)

            if v_freq < 0.5:
                variant.INFO["MAF"] = v_freq
            else:
                variant.INFO["MAF"] = 1 - v_freq

            summary_entry = {
                "freq": variant.aaf,
                "panel_freq": panel_variant["freq"],
                "updated_freq": v_freq,
                "MAF": variant.INFO.get("MAF"),
                "MISS": variant.INFO.get("MISS"),
                "PFD": variant.INFO.get("PFD"),
                "v_id": variant.ID,
                "updated": variant.INFO.get("UPD"),
            }
            vcf_summary.VARIANTS[variant_id_end].update(summary_entry)

        print_variant_toml(variant, panel_variant_freq, status, reason)
        vcf_summary.kept += 1
        w.write_record(variant)

    w.close()
    vcf.close()

    # The plot is named after the output file, with a "png" extension.
    plot_file = re.sub(r"(vcf|bcf)(\.gz)*$", "png", args["output"])
    if vcf_summary.n_in_panel != 0:
        create_summary_plot(vcf_summary,
                            outfile=plot_file,
                            threshold=args["outlier_threshold"])

    print(vcf_summary, file=sys.stderr)
    print("n_reference_panel_size=%d" % len(panel.keys()), file=sys.stderr)