def extend_vcf_annotations(query_vcf, pcgr_db_directory, logger, pon_annotation, cpsr):
    """
    Extend the INFO column of a VEP/vcfanno-annotated VCF with PCGR/CPSR tags.

    The tags originate from
    1. CSQ elements within the primary transcript consequence picked by VEP,
       e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Cancer-relevant gene annotations (PCGR_ONCO_XREF), e.g. known
       oncogenes/tumor suppressors, known antineoplastic drugs interacting
       with a given protein etc.
    3. Protein-relevant annotations, e.g. cancer hotspot mutations,
       functional protein features etc.
    4. Variant effect predictions
    5. Panel-of-normal (blacklisted variants) annotation

    The list of INFO tags to be produced is provided by the 'infotags' files
    in the pcgr_db_directory.

    The annotated records are written to '<query>.annotated.vcf', which is then
    bgzip-compressed and tabix-indexed before being handed to
    annoutils.write_pass_vcf. Records without a CSQ tag are logged and skipped
    (they are not written to the output).

    Args:
        query_vcf: Path to the input VCF ('.vcf' or '.vcf.gz').
        pcgr_db_directory: Directory holding 'pcgr_infotags.tsv' and
            'cpsr_infotags.tsv'.
        logger: Object exposing info() and warning().
        pon_annotation: When equal to 0, INFO tags whose name starts with
            'PANEL_OF_NORMALS' are not added to the header.
        cpsr: Must be the literal True to select the CPSR tag file and the
            CPSR transcript selection (the check is `cpsr is True`).
    """
    # Only one tag definition file is used per run, so read just that one
    # (previously the PCGR file was read and then thrown away in CPSR mode).
    infotags_filename = 'cpsr_infotags.tsv' if cpsr is True else 'pcgr_infotags.tsv'
    vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(pcgr_db_directory, infotags_filename))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info['dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']

    vcf = VCF(query_vcf)
    skip_pon_tags = pon_annotation == 0
    for tag in sorted(vcf_infotags_meta):
        if skip_pon_tags and tag.startswith('PANEL_OF_NORMALS'):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number']),
        })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0

    # Column position of each annotation within one '|'-separated
    # PCGR_ONCO_XREF entry (consumed by annoutils.make_transcript_xref_map).
    pcgr_onco_xref_map = {
        'ENSEMBL_TRANSCRIPT_ID': 0,
        'ENSEMBL_GENE_ID': 1,
        'ENSEMBL_PROTEIN_ID': 2,
        'SYMBOL': 3,
        'SYMBOL_ENTREZ': 4,
        'ENTREZ_ID': 5,
        'UNIPROT_ID': 6,
        'APPRIS': 7,
        'UNIPROT_ACC': 8,
        'REFSEQ_MRNA': 9,
        'CORUM_ID': 10,
        'TUMOR_SUPPRESSOR': 11,
        'TUMOR_SUPPRESSOR_EVIDENCE': 12,
        'ONCOGENE': 13,
        'ONCOGENE_EVIDENCE': 14,
        'NETWORK_CG': 15,
        'DISGENET_CUI': 16,
        'CHEMBL_COMPOUND_ID': 17,
        'CHEMBL_COMPOUND_ID_EARLY_PHASE': 18,
        'INTOGEN_DRIVER': 19,
        'TCGA_DRIVER': 20,
        'ONCOSCORE': 21,
        'MIM_PHENOTYPE_ID': 22,
        'CANCER_PREDISPOSITION_SOURCE': 23,
        'CANCER_SUSCEPTIBILITY_CUI': 24,
        'CANCER_SYNDROME_CUI': 25,
        'CANCER_PREDISPOSITION_MOI': 26,
        'CANCER_PREDISPOSITION_MOD': 27,
        'SIGNALING_PATHWAY': 28,
        'OPENTARGETS_DISEASE_ASSOCS': 29,
        'OPENTARGETS_TRACTABILITY_COMPOUND': 30,
        'OPENTARGETS_TRACTABILITY_ANTIBODY': 31,
        'GE_PANEL_ID': 32,
        'ACTIONABLE_TARGET': 33,
        'GENCODE_GENE_STATUS': 34,
        'PROB_HAPLOINSUFFICIENCY': 35,
        'PROB_EXAC_LOF_INTOLERANT': 36,
        'PROB_EXAC_LOF_INTOLERANT_HOM': 37,
        'PROB_EXAC_LOF_TOLERANT_NULL': 38,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT': 39,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT_HOM': 40,
        'PROB_EXAC_NONTCGA_LOF_TOLERANT_NULL': 41,
        'PROB_GNOMAD_LOF_INTOLERANT': 42,
        'PROB_GNOMAD_LOF_INTOLERANT_HOM': 43,
        'PROB_GNOMAD_LOF_TOLERANT_NULL': 44,
        'ESSENTIAL_GENE_CRISPR': 45,
        'ESSENTIAL_GENE_CRISPR2': 46,
    }

    # Header ID -> declared VCF type; read after the new tags were added so
    # that the freshly declared tags are included.
    vcf_info_element_types = {}
    for e in vcf.header_iter():
        header_element = e.info()
        if all(key in header_element for key in ('ID', 'HeaderType', 'Type')):
            vcf_info_element_types[str(header_element['ID'])] = str(header_element['Type'])

    chrom_done_msg = 'Completed summary of functional annotations for {} variants on chromosome {}'

    for rec in vcf:
        # Emit a per-chromosome progress line whenever the chromosome changes.
        chrom = str(rec.CHROM)
        if chrom != current_chrom:
            if current_chrom is not None:
                logger.info(chrom_done_msg.format(num_chromosome_records_processed, current_chrom))
            current_chrom = chrom
            num_chromosome_records_processed = 0

        if rec.INFO.get('CSQ') is None:
            # rec.start is 0-based; report the 1-based position.
            variant_id = 'g.{}:{}{}>{}'.format(rec.CHROM, rec.start + 1, rec.REF, ','.join(rec.ALT))
            logger.warning('Variant record ' + variant_id + ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?) - variant will be skipped')
            continue

        num_chromosome_records_processed += 1
        pcgr_onco_xref = annoutils.make_transcript_xref_map(
            rec, pcgr_onco_xref_map, xref_tag="PCGR_ONCO_XREF")
        csq_record_results = annoutils.parse_vep_csq(
            rec, pcgr_onco_xref, vep_csq_fields_map, logger,
            pick_only=True, csq_identifier='CSQ')

        if 'vep_all_csq' in csq_record_results:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(csq_record_results['vep_all_csq'])

        if 'vep_block' in csq_record_results:
            vep_csq_records = csq_record_results['vep_block']
            # First block by default; in CPSR mode the helper chooses the index.
            block_idx = 0
            if cpsr is True:
                block_idx = annoutils.get_correct_cpg_transcript(vep_csq_records)
            record = vep_csq_records[block_idx]
            for key in record:
                # Only tags declared in the VCF header are copied to INFO.
                if key not in vcf_info_element_types:
                    continue
                value = record[key]
                if vcf_info_element_types[key] == "Flag" and value == "1":
                    rec.INFO[key] = True
                elif value is not None:
                    rec.INFO[key] = value

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)

    w.close()
    if current_chrom is not None:
        logger.info(chrom_done_msg.format(num_chromosome_records_processed, current_chrom))
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        check_subprocess(logger, 'bgzip -f ' + str(out_vcf))
        check_subprocess(logger, 'tabix -f -p vcf ' + str(out_vcf) + '.gz')
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message('No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)', logger)
def extend_vcf_annotations(query_vcf, gvanno_db_directory, lof_prediction=0):
    """
    Extend the INFO column of a VEP/vcfanno-annotated VCF with gvanno tags.

    The tags originate from
    1. CSQ elements within the primary transcript consequence picked by VEP,
       e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Gene annotations, e.g. known oncogenes/tumor suppressors, curated
       disease associations (DisGenet), MIM phenotype associations etc.
    3. Protein-relevant annotations, e.g. functional protein features etc.
    4. Variant effect predictions

    The annotated records are written to '<query>.annotated.vcf', which is then
    bgzip-compressed and tabix-indexed before being handed to
    annoutils.write_pass_vcf. Records without a CSQ tag are logged and skipped
    (they are not written to the output).

    NOTE: this function takes no logger argument; it uses the name `logger`
    from the enclosing module scope.

    Args:
        query_vcf: Path to the input VCF ('.vcf' or '.vcf.gz').
        gvanno_db_directory: Directory holding 'gvanno_infotags.tsv'.
        lof_prediction: When equal to 0 (the default), INFO tags whose name
            starts with 'LoF' are not added to the header.

    Raises:
        subprocess.CalledProcessError: If bgzip or tabix exits with a non-zero
            status (previously such failures were silently ignored).
    """
    import subprocess

    ## read VEP and gvanno tags to be appended to VCF file
    vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(gvanno_db_directory, 'gvanno_infotags.tsv'))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info['dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']

    vcf = VCF(query_vcf)
    skip_lof_tags = lof_prediction == 0
    for tag in vcf_infotags_meta:
        if skip_lof_tags and tag.startswith('LoF'):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number']),
        })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0

    # Column position of each annotation within one '|'-separated GVANNO_XREF
    # entry (consumed by annoutils.make_transcript_xref_map).
    gvanno_xref_map = {
        'ENSEMBL_TRANSCRIPT_ID': 0,
        'ENSEMBL_GENE_ID': 1,
        'ENSEMBL_PROTEIN_ID': 2,
        'SYMBOL': 3,
        'SYMBOL_ENTREZ': 4,
        'ENTREZ_ID': 5,
        'UNIPROT_ID': 6,
        'UNIPROT_ACC': 7,
        'REFSEQ_MRNA': 8,
        'CORUM_ID': 9,
        'TUMOR_SUPPRESSOR': 10,
        'TUMOR_SUPPRESSOR_EVIDENCE': 11,
        'ONCOGENE': 12,
        'ONCOGENE_EVIDENCE': 13,
        'MIM_PHENOTYPE_ID': 14,
        'OPENTARGETS_DISEASE_ASSOCS': 15,
        'OPENTARGETS_TRACTABILITY_COMPOUND': 16,
        'OPENTARGETS_TRACTABILITY_ANTIBODY': 17,
        'PROB_HAPLOINSUFFICIENCY': 18,
        'PROB_EXAC_LOF_INTOLERANT': 19,
        'PROB_EXAC_LOF_INTOLERANT_HOM': 20,
        'PROB_EXAC_LOF_TOLERANT_NULL': 21,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT': 22,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT_HOM': 23,
        'PROB_EXAC_NONTCGA_LOF_TOLERANT_NULL': 24,
        'PROB_GNOMAD_LOF_INTOLERANT': 25,
        'PROB_GNOMAD_LOF_INTOLERANT_HOM': 26,
        'PROB_GNOMAD_LOF_TOLERANT_NULL': 27,
        'ESSENTIAL_GENE_CRISPR': 28,
        'ESSENTIAL_GENE_CRISPR2': 29,
    }

    # Header ID -> declared VCF type; read after the new tags were added so
    # that the freshly declared tags are included.
    vcf_info_element_types = {}
    for e in vcf.header_iter():
        header_element = e.info()
        if all(key in header_element for key in ('ID', 'HeaderType', 'Type')):
            vcf_info_element_types[str(header_element['ID'])] = str(header_element['Type'])

    chrom_done_msg = 'Completed summary of functional annotations for {} variants on chromosome {}'

    for rec in vcf:
        # Emit a per-chromosome progress line whenever the chromosome changes.
        chrom = str(rec.CHROM)
        if chrom != current_chrom:
            if current_chrom is not None:
                logger.info(chrom_done_msg.format(num_chromosome_records_processed, current_chrom))
            current_chrom = chrom
            num_chromosome_records_processed = 0

        if rec.INFO.get('CSQ') is None:
            # rec.start is 0-based; report the 1-based position.
            variant_id = 'g.{}:{}{}>{}'.format(rec.CHROM, rec.start + 1, rec.REF, ','.join(rec.ALT))
            logger.warning('Variant record ' + variant_id + ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?) - variant will be skipped')
            continue

        num_chromosome_records_processed += 1
        gvanno_xref = annoutils.make_transcript_xref_map(
            rec, gvanno_xref_map, xref_tag="GVANNO_XREF")
        csq_record_results = annoutils.parse_vep_csq(
            rec, gvanno_xref, vep_csq_fields_map, logger,
            pick_only=True, csq_identifier='CSQ')

        if 'vep_all_csq' in csq_record_results:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(csq_record_results['vep_all_csq'])

        if 'vep_block' in csq_record_results:
            # Only the first consequence block is expanded into INFO tags.
            record = csq_record_results['vep_block'][0]
            for key in record:
                # Only tags declared in the VCF header are copied to INFO.
                if key not in vcf_info_element_types:
                    continue
                value = record[key]
                if vcf_info_element_types[key] == "Flag" and value == "1":
                    rec.INFO[key] = True
                elif value is not None:
                    rec.INFO[key] = value

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)

    w.close()
    # Skip the summary line when the VCF held no records at all (it would
    # otherwise report "chromosome None").
    if current_chrom is not None:
        logger.info(chrom_done_msg.format(num_chromosome_records_processed, current_chrom))
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        # Argument lists avoid shell quoting problems with unusual paths, and
        # check=True surfaces a failing bgzip/tabix instead of ignoring it.
        subprocess.run(['bgzip', '-f', str(out_vcf)], check=True)
        subprocess.run(['tabix', '-f', '-p', 'vcf', str(out_vcf) + '.gz'], check=True)
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (gvanno-writer)',
            logger)
def extend_vcf_annotations(query_vcf, pcgr_db_directory, logger, cpsr):
    """
    Extend the INFO column of a VEP/vcfanno-annotated VCF with PCGR/CPSR tags.

    The tags originate from
    1. CSQ elements within the primary transcript consequence picked by VEP,
       e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Cancer-relevant gene annotations, e.g. known oncogenes/tumor
       suppressors, known antineoplastic drugs interacting with a given
       protein etc.
    3. Protein-relevant annotations, e.g. cancer hotspot mutations,
       functional protein features etc.
    4. Variant effect predictions

    The annotated records are written to '<query>.annotated.vcf', which is then
    bgzip-compressed and tabix-indexed before being handed to
    annoutils.write_pass_vcf. Records without a CSQ tag are logged and skipped
    (they are not written to the output).

    Args:
        query_vcf: Path to the input VCF ('.vcf' or '.vcf.gz').
        pcgr_db_directory: Directory holding 'pcgr_infotags.tsv' and
            'cpsr_infotags.tsv'.
        logger: Object exposing info() and warning().
        cpsr: Must be the literal True to select the CPSR tag file and to
            copy the 'CANCER_*' cross-reference annotations to INFO (the
            check is `cpsr is True`).

    Raises:
        KeyError: If one of PICK, SYMBOL, Consequence, Feature_type, Feature
            or BIOTYPE is not both a CSQ subfield in the VCF header and a tag
            in the infotags file.
        subprocess.CalledProcessError: If bgzip or tabix exits with a non-zero
            status (previously such failures were silently ignored).
    """
    import subprocess

    # Only one tag definition file is used per run, so read just that one
    # (previously the PCGR file was read and then thrown away in CPSR mode).
    infotags_filename = 'cpsr_infotags.tsv' if cpsr is True else 'pcgr_infotags.tsv'
    pcgr_vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(pcgr_db_directory, infotags_filename))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    # VEP allele-frequency subfield name -> INFO tag name used in the output.
    vep_to_pcgr_af = {
        'gnomAD_AMR_AF': 'AMR_AF_GNOMAD',
        'gnomAD_AFR_AF': 'AFR_AF_GNOMAD',
        'gnomAD_EAS_AF': 'EAS_AF_GNOMAD',
        'gnomAD_NFE_AF': 'NFE_AF_GNOMAD',
        'gnomAD_AF': 'GLOBAL_AF_GNOMAD',
        'gnomAD_SAS_AF': 'SAS_AF_GNOMAD',
        'gnomAD_OTH_AF': 'OTH_AF_GNOMAD',
        'gnomAD_ASJ_AF': 'ASJ_AF_GNOMAD',
        'gnomAD_FIN_AF': 'FIN_AF_GNOMAD',
        'AFR_AF': 'AFR_AF_1KG',
        'AMR_AF': 'AMR_AF_1KG',
        'SAS_AF': 'SAS_AF_1KG',
        'EUR_AF': 'EUR_AF_1KG',
        'EAS_AF': 'EAS_AF_1KG',
        'AF': 'GLOBAL_AF_1KG',
    }

    vcf = VCF(query_vcf)

    # Parse the 'Format: a|b|c' part of the CSQ and DBNSFP header descriptions.
    vep_csq_index2fields = {}
    vep_csq_fields2index = {}
    dbnsfp_prediction_algorithms = []
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' not in header_element:
            continue
        identifier = str(header_element['ID'])
        if identifier not in ('CSQ', 'DBNSFP'):
            continue
        description = str(header_element['Description'])
        if 'Format: ' not in description:
            continue
        subtags = description.split('Format: ')[1].split('|')
        if identifier == 'CSQ':
            # Keep only the CSQ subfields that are declared in the infotags
            # file, remembering their position within a CSQ block.
            for i, subtag in enumerate(subtags):
                field = vep_to_pcgr_af.get(subtag, subtag)
                if field in pcgr_vcf_infotags_meta:
                    vep_csq_index2fields[i] = field
                    vep_csq_fields2index[field] = i
        else:
            # DBNSFP subfields from index 7 onwards are taken as prediction
            # columns; the '_score'/'_pred' suffix (and any trailing quote
            # characters) is stripped to obtain the algorithm name.
            for subtag in subtags[7:]:
                dbnsfp_prediction_algorithms.append(
                    re.sub(r'((_score)|(_pred))"*$', '', subtag))

    for tag in pcgr_vcf_infotags_meta:
        tag_meta = pcgr_vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number']),
        })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0

    # Column position of each annotation within one '|'-separated
    # PCGR_ONCO_XREF entry (column 0 is the Ensembl transcript identifier).
    pcgr_onco_xref_map = {
        'SYMBOL': 1,
        'ENTREZ_ID': 2,
        'UNIPROT_ID': 3,
        'APPRIS': 4,
        'UNIPROT_ACC': 5,
        'CHORUM_ID': 6,
        'TUMOR_SUPPRESSOR': 7,
        'ONCOGENE': 8,
        'NETWORK_CG': 9,
        'DISGENET_CUI': 10,
        'CHEMBL_COMPOUND_ID': 11,
        'INTOGEN_DRIVER': 12,
        'ONCOSCORE': 13,
        'CANCER_PREDISPOSITION_SOURCE': 15,
        'CANCER_SUSCEPTIBILITY_CUI': 16,
        'CANCER_SYNDROME_CUI': 17,
        'CANCER_PREDISPOSITION_MOI': 18,
    }

    chrom_done_msg = 'Completed summary of functional annotations for {} variants on chromosome {}'

    for rec in vcf:
        all_transcript_consequences = []

        # Emit a per-chromosome progress line whenever the chromosome changes.
        chrom = str(rec.CHROM)
        if chrom != current_chrom:
            if current_chrom is not None:
                logger.info(chrom_done_msg.format(num_chromosome_records_processed, current_chrom))
            current_chrom = chrom
            num_chromosome_records_processed = 0

        csq_info = rec.INFO.get('CSQ')
        if csq_info is None:
            # rec.start is 0-based; report the 1-based position.
            variant_id = 'g.{}:{}{}>{}'.format(rec.CHROM, rec.start + 1, rec.REF, ','.join(rec.ALT))
            logger.warning('Variant record ' + variant_id + ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?) - variant will be skipped')
            continue

        num_chromosome_records_processed += 1

        # transcript id -> {annotation name: non-empty value} for this record
        pcgr_onco_xref = {}
        xref_info = rec.INFO.get('PCGR_ONCO_XREF')
        if xref_info is not None:
            for transcript_onco_xref in xref_info.split(','):
                xrefs = transcript_onco_xref.split('|')
                transcript_annotations = {}
                pcgr_onco_xref[str(xrefs[0])] = transcript_annotations
                for annotation, annotation_index in pcgr_onco_xref_map.items():
                    if annotation_index < len(xrefs) and xrefs[annotation_index] != '':
                        transcript_annotations[annotation] = xrefs[annotation_index]

        for csq in csq_info.split(','):
            csq_fields = csq.split('|')

            ## only consider the primary/picked consequence when expanding with annotation tags
            if csq_fields[vep_csq_fields2index['PICK']] == "1":
                for j, value in enumerate(csq_fields):
                    if j not in vep_csq_index2fields or value == '':
                        continue
                    field = vep_csq_index2fields[j]
                    rec.INFO[field] = str(value)

                    if field == 'Feature':
                        # Pull in the gene cross-references of the picked transcript.
                        if value in pcgr_onco_xref:
                            transcript_annotations = pcgr_onco_xref[value]
                            for annotation in pcgr_onco_xref_map:
                                if annotation in ('CHORUM_ID', 'UNIPROT_ACC'):
                                    continue
                                if annotation not in transcript_annotations:
                                    continue
                                if annotation in ('TUMOR_SUPPRESSOR', 'ONCOGENE', 'NETWORK_CG'):
                                    # Presence of a value is recorded as a flag.
                                    rec.INFO[annotation] = True
                                elif annotation.startswith('CANCER_'):
                                    # Predisposition annotations are CPSR-only.
                                    if cpsr is True:
                                        rec.INFO[annotation] = transcript_annotations[annotation]
                                else:
                                    rec.INFO[annotation] = transcript_annotations[annotation]
                    elif field == 'DOMAINS':
                        for domain in str(value).split('&'):
                            if domain.startswith('Pfam_domain'):
                                # Drop the 'Pfam_domain:' prefix and a trailing '.<digits>'.
                                rec.INFO['PFAM_DOMAIN'] = str(
                                    re.sub(r'\.[0-9]{1,}$', '',
                                           re.sub(r'Pfam_domain:', '', domain)))
                    elif field == 'Existing_variation':
                        var_identifiers = str(value).split('&')
                        cosmic_identifiers = [v for v in var_identifiers if v.startswith('COSM')]
                        dbsnp_identifiers = [v for v in var_identifiers if v.startswith('rs')]
                        if cosmic_identifiers:
                            rec.INFO['COSMIC_MUTATION_ID'] = '&'.join(cosmic_identifiers)
                        if dbsnp_identifiers:
                            rec.INFO['DBSNPRSID'] = '&'.join(dbsnp_identifiers)

                # NOTE(review): nesting reconstructed from a whitespace-mangled
                # source - confirm this call belongs inside the PICK branch.
                annoutils.set_coding_change(rec)

            # Every consequence block (picked or not) contributes one
            # 'Consequence:SYMBOL:Feature_type:Feature:BIOTYPE' summary entry.
            symbol = '.'
            if csq_fields[vep_csq_fields2index['SYMBOL']] != "":
                symbol = str(csq_fields[vep_csq_fields2index['SYMBOL']])
            consequence_entry = ':'.join([
                str(csq_fields[vep_csq_fields2index['Consequence']]),
                str(symbol),
                str(csq_fields[vep_csq_fields2index['Feature_type']]),
                str(csq_fields[vep_csq_fields2index['Feature']]),
                str(csq_fields[vep_csq_fields2index['BIOTYPE']]),
            ])
            all_transcript_consequences.append(consequence_entry)

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        rec.INFO['VEP_ALL_CONSEQUENCE'] = ','.join(all_transcript_consequences)
        w.write_record(rec)

    w.close()
    # Skip the summary line when the VCF held no records at all (it would
    # otherwise report "chromosome None").
    if current_chrom is not None:
        logger.info(chrom_done_msg.format(num_chromosome_records_processed, current_chrom))
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        # Argument lists avoid shell quoting problems with unusual paths, and
        # check=True surfaces a failing bgzip/tabix instead of ignoring it.
        subprocess.run(['bgzip', '-f', str(out_vcf)], check=True)
        subprocess.run(['tabix', '-f', '-p', 'vcf', str(out_vcf) + '.gz'], check=True)
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)',
            logger)
def extend_vcf_annotations(query_vcf, gvanno_db_directory, lof_prediction=0):
    """
    Extend the INFO column of a VEP/vcfanno-annotated VCF with gvanno tags.

    The tags originate from
    1. CSQ elements within the primary transcript consequence picked by VEP,
       e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Gene annotations, e.g. known oncogenes/tumor suppressors, curated
       disease associations (DisGenet), MIM phenotype associations etc.
    3. Protein-relevant annotations, e.g. functional protein features etc.
    4. Variant effect predictions

    The annotated records are written to '<query>.annotated.vcf', which is then
    bgzip-compressed and tabix-indexed before being handed to
    annoutils.write_pass_vcf. Records without a CSQ tag are logged and skipped
    (they are not written to the output).

    NOTE: this function takes no logger argument; it uses the name `logger`
    from the enclosing module scope.

    Args:
        query_vcf: Path to the input VCF ('.vcf' or '.vcf.gz').
        gvanno_db_directory: Directory holding 'gvanno_infotags.tsv'.
        lof_prediction: When equal to 0 (the default), INFO tags whose name
            starts with 'LoF' are not added to the header.

    Raises:
        KeyError: If one of PICK, SYMBOL, Consequence, Feature_type, Feature
            or BIOTYPE is missing from the 'vep_csq_fields2index' map.
        subprocess.CalledProcessError: If bgzip or tabix exits with a non-zero
            status (previously such failures were silently ignored).
    """
    import subprocess

    ## read VEP and gvanno tags to be appended to VCF file
    vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(gvanno_db_directory, 'gvanno_infotags.tsv'))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
    # Position <-> name maps for the subfields of one CSQ block.
    vep_csq_index2fields = meta_vep_dbnsfp_info['vep_csq_index2fields']
    vep_csq_fields2index = meta_vep_dbnsfp_info['vep_csq_fields2index']
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info['dbnsfp_prediction_algorithms']

    vcf = VCF(query_vcf)
    skip_lof_tags = lof_prediction == 0
    for tag in vcf_infotags_meta:
        if skip_lof_tags and tag.startswith('LoF'):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number']),
        })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0

    # Column position of each annotation within one '|'-separated GVANNO_XREF
    # entry.
    gvanno_xref_map = {
        'ENSEMBL_TRANSCRIPT_ID': 0,
        'ENSEMBL_GENE_ID': 1,
        'SYMBOL': 2,
        'ENTREZ_ID': 3,
        'UNIPROT_ID': 4,
        'APPRIS': 5,
        'UNIPROT_ACC': 6,
        'REFSEQ_MRNA': 7,
        'CORUM_ID': 8,
        'TUMOR_SUPPRESSOR': 9,
        'ONCOGENE': 10,
        'DISGENET_CUI': 11,
        'MIM_PHENOTYPE_ID': 12,
    }

    chrom_done_msg = 'Completed summary of functional annotations for {} variants on chromosome {}'

    for rec in vcf:
        all_transcript_consequences = []

        # Emit a per-chromosome progress line whenever the chromosome changes.
        chrom = str(rec.CHROM)
        if chrom != current_chrom:
            if current_chrom is not None:
                logger.info(chrom_done_msg.format(num_chromosome_records_processed, current_chrom))
            current_chrom = chrom
            num_chromosome_records_processed = 0

        csq_info = rec.INFO.get('CSQ')
        if csq_info is None:
            # rec.start is 0-based; report the 1-based position.
            variant_id = 'g.{}:{}{}>{}'.format(rec.CHROM, rec.start + 1, rec.REF, ','.join(rec.ALT))
            logger.warning('Variant record ' + variant_id + ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?) - variant will be skipped')
            continue

        num_chromosome_records_processed += 1

        # transcript id -> {annotation name: non-empty value} for this record
        gvanno_xref = {}
        xref_info = rec.INFO.get('GVANNO_XREF')
        if xref_info is not None:
            for transcript_xref in xref_info.split(','):
                xrefs = transcript_xref.split('|')
                transcript_annotations = {}
                gvanno_xref[str(xrefs[0])] = transcript_annotations
                for annotation, annotation_index in gvanno_xref_map.items():
                    if annotation_index < len(xrefs) and xrefs[annotation_index] != '':
                        transcript_annotations[annotation] = xrefs[annotation_index]

        for csq in csq_info.split(','):
            csq_fields = csq.split('|')

            ## only consider the primary/picked consequence when expanding with annotation tags
            if csq_fields[vep_csq_fields2index['PICK']] == "1":
                for j, value in enumerate(csq_fields):
                    if j not in vep_csq_index2fields or value == '':
                        continue
                    field = vep_csq_index2fields[j]
                    rec.INFO[field] = str(value)

                    if field == 'Feature':
                        # Pull in the gene cross-references of the picked transcript.
                        if value in gvanno_xref:
                            transcript_annotations = gvanno_xref[value]
                            for annotation in gvanno_xref_map:
                                if annotation not in transcript_annotations:
                                    continue
                                if annotation in ('TUMOR_SUPPRESSOR', 'ONCOGENE'):
                                    # Presence of a value is recorded as a flag.
                                    rec.INFO[annotation] = True
                                else:
                                    rec.INFO[annotation] = transcript_annotations[annotation]
                    elif field == 'DOMAINS':
                        for domain in str(value).split('&'):
                            if domain.startswith('Pfam_domain'):
                                # Drop the 'Pfam_domain:' prefix and a trailing '.<digits>'.
                                rec.INFO['PFAM_DOMAIN'] = str(
                                    re.sub(r'\.[0-9]{1,}$', '',
                                           re.sub(r'Pfam_domain:', '', domain)))
                    elif field == 'Existing_variation':
                        var_identifiers = str(value).split('&')
                        cosmic_identifiers = [v for v in var_identifiers if v.startswith('COSM')]
                        dbsnp_identifiers = [v for v in var_identifiers if v.startswith('rs')]
                        if cosmic_identifiers:
                            rec.INFO['COSMIC_MUTATION_ID'] = '&'.join(cosmic_identifiers)
                        if dbsnp_identifiers:
                            rec.INFO['DBSNPRSID'] = '&'.join(dbsnp_identifiers)

                # NOTE(review): nesting reconstructed from a whitespace-mangled
                # source - confirm this call belongs inside the PICK branch.
                annoutils.set_coding_change(rec)

            # Every consequence block (picked or not) contributes one
            # 'Consequence:SYMBOL:Feature_type:Feature:BIOTYPE' summary entry.
            symbol = '.'
            if csq_fields[vep_csq_fields2index['SYMBOL']] != "":
                symbol = str(csq_fields[vep_csq_fields2index['SYMBOL']])
            consequence_entry = ':'.join([
                str(csq_fields[vep_csq_fields2index['Consequence']]),
                str(symbol),
                str(csq_fields[vep_csq_fields2index['Feature_type']]),
                str(csq_fields[vep_csq_fields2index['Feature']]),
                str(csq_fields[vep_csq_fields2index['BIOTYPE']]),
            ])
            all_transcript_consequences.append(consequence_entry)

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        rec.INFO['VEP_ALL_CONSEQUENCE'] = ','.join(all_transcript_consequences)
        w.write_record(rec)

    w.close()
    # Skip the summary line when the VCF held no records at all (it would
    # otherwise report "chromosome None").
    if current_chrom is not None:
        logger.info(chrom_done_msg.format(num_chromosome_records_processed, current_chrom))
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        # Argument lists avoid shell quoting problems with unusual paths, and
        # check=True surfaces a failing bgzip/tabix instead of ignoring it.
        subprocess.run(['bgzip', '-f', str(out_vcf)], check=True)
        subprocess.run(['tabix', '-f', '-p', 'vcf', str(out_vcf) + '.gz'], check=True)
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (gvanno-writer)',
            logger)
def extend_vcf_annotations(query_vcf, pcgr_db_dir, logger, pon_annotation, regulatory_annotation, cpsr, debug):
    """
    Extend the INFO column of a VEP/vcfanno-annotated VCF with PCGR/CPSR tags.

    The tags originate from
    1. CSQ elements within the primary transcript consequence picked by VEP,
       e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Cancer-relevant gene annotations (PCGR_ONCO_XREF), e.g. known
       oncogenes/tumor suppressors, known antineoplastic drugs interacting
       with a given protein etc.
    3. Protein-relevant annotations, e.g. cancer hotspot mutations,
       functional protein features etc.
    4. Variant effect predictions
    5. Panel-of-normal (blacklisted variants) annotation

    The list of INFO tags to be produced is provided by the 'infotags' files
    in the pcgr_db_dir.

    The annotated records are written to '<query>.annotated.vcf', which is then
    bgzip-compressed and tabix-indexed before being handed to
    annoutils.write_pass_vcf. Records without a CSQ tag are skipped (not
    written); a single warning with their count is logged afterwards and up to
    100 of their identifiers are printed to stdout.

    Args:
        query_vcf: Path to the input VCF ('.vcf' or '.vcf.gz').
        pcgr_db_dir: Directory holding 'pcgr_infotags.tsv',
            'cpsr_infotags.tsv' and
            'pcgr_onco_xref/pcgr_onco_xref_namemap.tsv'.
        logger: Object exposing info() and warning().
        pon_annotation: 0/1 switch; together with regulatory_annotation it
            decides whether 'PANEL_OF_NORMALS*' tags are added to the header.
        regulatory_annotation: 0/1 switch; decides whether 'REGULATORY_*' tags
            are added to the header and, when equal to 1, whether the
            REGULATORY_ANNOTATION tag is computed from all CSQ blocks.
        cpsr: Must be the literal True to select the CPSR tag file and the
            CPSR transcript selection (the check is `cpsr is True`).
        debug: Forwarded to check_subprocess for the bgzip/tabix calls
            (previously accepted but ignored in favour of a fixed False).
    """
    # Only one tag definition file is used per run, so read just that one
    # (previously the PCGR file was read and then thrown away in CPSR mode).
    infotags_filename = 'cpsr_infotags.tsv' if cpsr is True else 'pcgr_infotags.tsv'
    vcf_infotags_meta = annoutils.read_infotag_file(os.path.join(pcgr_db_dir, infotags_filename))
    pcgr_onco_xref_map = annoutils.read_genexref_namemap(
        os.path.join(pcgr_db_dir, 'pcgr_onco_xref', 'pcgr_onco_xref_namemap.tsv'))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info['dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']

    # Decide once which tag families are left out of the header. Any switch
    # combination other than the three listed adds every tag.
    if pon_annotation == 0 and regulatory_annotation == 0:
        excluded_prefixes = ('PANEL_OF_NORMALS', 'REGULATORY_')
    elif pon_annotation == 1 and regulatory_annotation == 0:
        excluded_prefixes = ('REGULATORY_',)
    elif pon_annotation == 0 and regulatory_annotation == 1:
        excluded_prefixes = ('PANEL_OF_NORMALS',)
    else:
        excluded_prefixes = ()

    vcf = cyvcf2.VCF(query_vcf)
    for tag in sorted(vcf_infotags_meta):
        # str.startswith with an empty tuple is always False, i.e. nothing is
        # excluded in that case.
        if tag.startswith(excluded_prefixes):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number']),
        })

    w = cyvcf2.Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0

    # Header ID -> declared VCF type; read after the new tags were added so
    # that the freshly declared tags are included.
    vcf_info_element_types = {}
    for e in vcf.header_iter():
        header_element = e.info()
        if all(key in header_element for key in ('ID', 'HeaderType', 'Type')):
            vcf_info_element_types[str(header_element['ID'])] = str(header_element['Type'])

    vars_no_csq = []
    for rec in vcf:
        # Emit a per-chromosome progress line whenever the chromosome changes.
        chrom = str(rec.CHROM)
        if chrom != current_chrom:
            if current_chrom is not None:
                logger.info(f"Completed summary of functional annotations for {num_chromosome_records_processed} variants on chr{current_chrom}")
            current_chrom = chrom
            num_chromosome_records_processed = 0

        if rec.INFO.get('CSQ') is None:
            # rec.start is 0-based; report the 1-based position.
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            vars_no_csq.append(f"g.{rec.CHROM}:{pos}{rec.REF}>{alt_allele}")
            continue

        num_chromosome_records_processed += 1
        pcgr_onco_xref = annoutils.make_transcript_xref_map(
            rec, pcgr_onco_xref_map, xref_tag="PCGR_ONCO_XREF")

        if regulatory_annotation == 1:
            # Regulatory annotation is derived from all consequence blocks,
            # not only the picked one.
            csq_record_results_all = annoutils.parse_vep_csq(
                rec, pcgr_onco_xref, vep_csq_fields_map, logger,
                pick_only=False, csq_identifier='CSQ')
            if 'vep_block' in csq_record_results_all:
                rec.INFO['REGULATORY_ANNOTATION'] = annoutils.map_regulatory_variant_annotations(
                    csq_record_results_all['vep_block'])

        csq_record_results_pick = annoutils.parse_vep_csq(
            rec, pcgr_onco_xref, vep_csq_fields_map, logger,
            pick_only=True, csq_identifier='CSQ')

        if 'vep_all_csq' in csq_record_results_pick:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(csq_record_results_pick['vep_all_csq'])

        if 'vep_block' in csq_record_results_pick:
            vep_csq_records = csq_record_results_pick['vep_block']
            # First block by default; in CPSR mode the helper chooses the index.
            block_idx = 0
            if cpsr is True:
                block_idx = annoutils.get_correct_cpg_transcript(vep_csq_records)
            record = vep_csq_records[block_idx]
            for key in record:
                # Only tags declared in the VCF header are copied to INFO.
                if key not in vcf_info_element_types:
                    continue
                value = record[key]
                if vcf_info_element_types[key] == "Flag" and value == "1":
                    rec.INFO[key] = True
                elif value is not None:
                    rec.INFO[key] = value

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)

    if vars_no_csq:
        logger.warning(f"There were {len(vars_no_csq)} records with no CSQ tag from VEP (was --vep_no_intergenic flag set?). Skipping them and showing (up to) the first 100:")
        print('----')
        print(', '.join(vars_no_csq[:100]))
        print('----')

    w.close()
    if current_chrom is not None:
        logger.info(f"Completed summary of functional annotations for {num_chromosome_records_processed} variants on chr{current_chrom}")
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        # Honour the caller's debug switch instead of a hard-coded False.
        check_subprocess(logger, f'bgzip -f {out_vcf}', debug=debug)
        check_subprocess(logger, f'tabix -f -p vcf {out_vcf}.gz', debug=debug)
        annotated_vcf = f'{out_vcf}.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        error_message('No remaining PASS variants found in query VCF - exiting and skipping STEP 4', logger)