def main(clinvar_xml, zooma_feedback):
    """Write a tab-separated feedback file covering every record of a ClinVar dataset.

    The output file is created (or truncated) and a header row is written first. Each record yielded by
    ``clinvar_xml_utils.ClinVarDataset`` is then handed to ``process_clinvar_record`` together with the open
    output handle.

    :param clinvar_xml: ClinVar XML source, passed as is to ``clinvar_xml_utils.ClinVarDataset``.
    :param zooma_feedback: Path of the output file, opened in text write mode.
    """
    header_row = 'STUDY\tBIOENTITY\tPROPERTY_TYPE\tPROPERTY_VALUE\tSEMANTIC_TAG\tANNOTATOR\tANNOTATION_DATE\n'
    with open(zooma_feedback, 'wt') as outfile:
        outfile.write(header_row)
        # The dataset is only constructed once the output file is open and the header is written.
        dataset = clinvar_xml_utils.ClinVarDataset(clinvar_xml)
        for record in dataset:
            process_clinvar_record(record, outfile)
def load_clinvar_data(clinvar_xml):
    """Load ClinVar data, preprocess, and return it as a Pandas dataframe.

    :param clinvar_xml: ClinVar XML source, passed as is to ``clinvar_xml_utils.ClinVarDataset``.
    :return: A tuple ``(variants, stats)``. ``variants`` is a dataframe with the columns ``Name``,
        ``RCVaccession``, ``GeneSymbol`` and ``HGNC_ID`` (one row per gene symbol of each repeat expansion
        record), de-duplicated and sorted by ``Name``. ``stats`` is a Counter of the microsatellite category of
        every record which contains variant information.
    """
    variant_data = []  # To populate the return dataframe (see columns below)
    stats = Counter()

    def count_repeat_expansion_candidates():
        """Return how many repeat expansion variant candidates have been seen so far."""
        return (stats[clinvar_xml_utils.ClinVarRecordMeasure.MS_REPEAT_EXPANSION]
                + stats[clinvar_xml_utils.ClinVarRecordMeasure.MS_NO_COMPLETE_COORDS])

    # Tracked separately from the loop index so that the final summary is correct (index + 1) and does not fail
    # with an unbound loop variable when the dataset yields no records at all.
    total_records = 0

    # Iterate through ClinVar XML records
    for i, clinvar_record in enumerate(clinvar_xml_utils.ClinVarDataset(clinvar_xml)):
        total_records = i + 1
        if i and i % 100000 == 0:
            total_repeat_expansion_variants = count_repeat_expansion_candidates()
            logger.info(
                f'Processed {i} records, collected {total_repeat_expansion_variants} repeat expansion variant '
                f'candidates')

        # Skip a record if it does not contain variant information
        if not clinvar_record.measure:
            continue

        # Repeat expansion events come in two forms: with explicit coordinates and allele sequences
        # (CHROM/POS/REF/ALT), or without them. In the first case we can compute the explicit variant length as
        # len(ALT) - len(REF). In the second case, which is more rare but still important, we have to resort to
        # parsing HGVS-like variant names.
        stats[clinvar_record.measure.microsatellite_category] += 1

        # Skip the record if it's a deletion or a short insertion
        if not clinvar_record.measure.is_repeat_expansion_variant:
            continue

        # Extract gene symbol(s). Here and below, dashes are sometimes assigned to be compatible with the variant
        # summary format which was used previously.
        gene_symbols = clinvar_record.measure.preferred_gene_symbols
        if not gene_symbols:
            gene_symbols = ['-']

        # Extract HGNC ID. It is only used when both the ID and the gene symbol are unambiguous.
        hgnc_ids = clinvar_record.measure.hgnc_ids
        hgnc_id = hgnc_ids[0] if len(hgnc_ids) == 1 and len(gene_symbols) == 1 else '-'

        # Append data strings
        for gene_symbol in gene_symbols:
            variant_data.append([
                clinvar_record.measure.name,
                clinvar_record.accession,
                gene_symbol,
                hgnc_id
            ])

    total_repeat_expansion_variants = count_repeat_expansion_candidates()
    logger.info(
        f'Done. A total of {total_records} records, {total_repeat_expansion_variants} repeat expansion variant '
        f'candidates')

    variants = pd.DataFrame(variant_data, columns=('Name', 'RCVaccession', 'GeneSymbol', 'HGNC_ID'))

    # Since the same record can have coordinates in multiple builds, it can be repeated. Remove duplicates
    variants = variants.drop_duplicates()

    # Sort values by variant name
    return variants.sort_values(by=['Name']), stats
def parse_trait_names(filepath: str) -> list:
    """Collect the trait names used in a ClinVar XML file and build one Trait object per name.

    Every Trait carries the lower-cased trait name, its occurrence count, and a flag telling whether the name
    was seen at least once on a record whose measure is a repeat expansion variant.

    Occurrences are counted per record: the names of a record are de-duplicated before counting, so the count
    is the number of records (i.e. unique (RCV, trait name) pairs) a name appears in. This roughly matches the
    number of evidence strings the name will contribute to, which is what matters when prioritising curation.
    Names linked to NT (nucleotide repeat) expansion variants are flagged separately because their curation is
    of highest importance even when they are linked to only a few records.

    :param filepath: Path to a gzipped file containing ClinVar XML dump.
    :return: A list of Trait objects. The placeholder name '-' is reported on stdout and left out.
    """
    # Number of records in which each trait name appears
    occurrences = Counter()
    # Names seen at least once on a record with a repeat expansion variant
    expansion_linked_names = set()

    for record in clinvar_xml_utils.ClinVarDataset(filepath):
        names_in_record = {
            trait.preferred_or_other_valid_name.lower()
            for trait in record.traits_with_valid_names
        }
        # A set contributes each of its names exactly once
        occurrences.update(names_in_record)
        if record.measure and record.measure.is_repeat_expansion_variant:
            expansion_linked_names.update(names_in_record)

    # Turn the collected counts into Trait objects
    collected = []
    for name, count in occurrences.items():
        if name != '-':
            collected.append(
                Trait(name=name,
                      frequency=count,
                      associated_with_nt_expansion=name in expansion_linked_names))
        else:
            print('Skipped {} missing trait names'.format(count))
    return collected
#!/usr/bin/env python3
# Reads a ClinVar XML dump and prints, one per line, every variant which has complete coordinates in the
# CHROM:POS:REF:ALT format.

import argparse

from eva_cttv_pipeline import clinvar_xml_utils

# The text has to be passed as `description`. The first positional parameter of ArgumentParser is `prog`, so
# passing the sentence positionally would make it the program name shown in usage and error messages.
parser = argparse.ArgumentParser(
    description='Processes ClinVar XML dump and extract all variants, in CHROM:POS:REF:ALT format, '
                'for processing by the VEP mapping pipeline.')
parser.add_argument('--clinvar-xml', required=True, help='Path to the ClinVar XML file')
args = parser.parse_args()

for clinvar_record in clinvar_xml_utils.ClinVarDataset(args.clinvar_xml):
    # Records without a measure, or whose measure lacks complete coordinates, cannot be printed in this format
    if clinvar_record.measure is None or not clinvar_record.measure.has_complete_coordinates:
        continue
    m = clinvar_record.measure
    print(f'{m.chr}:{m.vcf_pos}:{m.vcf_ref}:{m.vcf_alt}')
def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema,
                                output_evidence_strings):
    """Generate evidence strings for the records of a ClinVar dataset and write them out, one JSON per line.

    Every generated evidence string is validated against the JSON schema and written immediately, so the full
    set is never held in memory. Counters describing processed, skipped and failed records are collected in a
    Report object.

    :param string_to_efo_mappings: Trait mappings, passed to ``Report`` and ``group_diseases_by_efo_mapping``.
    :param variant_to_gene_mappings: Consequence mappings, passed to ``Report`` and ``get_consequence_types``.
    :param clinvar_xml: ClinVar XML source, passed as is to ``clinvar_xml_utils.ClinVarDataset``.
    :param ot_schema: Path to a JSON file with the schema used to validate each evidence string.
    :param output_evidence_strings: Path of the output file, opened in text write mode.
    :return: The populated Report object.
    :raises AssertionError: If a record passes all checks but still yields an empty attribute list or no
        evidence strings.
    """
    report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings)

    # Both files are handled by context managers, so they are closed even if validation or one of the
    # assertions below raises half way through the dataset.
    with open(ot_schema) as ot_schema_file:
        ot_schema_contents = json.load(ot_schema_file)

    with open(output_evidence_strings, 'wt') as output_evidence_strings_file:
        logger.info('Processing ClinVar records')
        for clinvar_record in clinvar_xml_utils.ClinVarDataset(clinvar_xml):
            report.clinvar_total += 1
            if report.clinvar_total % 1000 == 0:
                logger.info(f'{report.clinvar_total} records processed')
            # Repeat expansion variants are counted before any of the failure checks below are applied
            if clinvar_record.measure and clinvar_record.measure.is_repeat_expansion_variant:
                report.repeat_expansion_variants += len(
                    get_consequence_types(clinvar_record.measure, variant_to_gene_mappings))

            # Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one
            # valid, potentially mappable name).
            if not clinvar_record.traits_with_valid_names:
                report.clinvar_fatal_no_valid_traits += 1
                continue

            # Failure mode 2 (skip). A ClinVar record contains an unsupported variation type.
            if clinvar_record.measure is None:
                report.clinvar_skip_unsupported_variation += 1
                continue

            # Within each ClinVar record, an evidence string is generated for all possible permutations of
            # (1) valid allele origins, (2) EFO mappings, and (3) genes where the variant has effect.
            grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins)
            consequence_types = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
            grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
                                                             string_to_efo_mappings)

            # Failure mode 3 (skip). No functional consequences are available.
            if not consequence_types:
                report.clinvar_skip_no_functional_consequences += 1
                continue

            # Failure mode 4 (skip). A ClinVar record has at least one trait with at least one valid name, but
            # no suitable EFO mappings were found in the database. This will still generate an evidence string,
            # but is tracked as a failure so we can continue to measure mapping coverage.
            if not any(group[-1] for group in grouped_diseases):
                report.clinvar_skip_missing_efo_mapping += 1
                unmapped_trait_name = clinvar_record.traits_with_valid_names[0].preferred_or_other_valid_name
                report.unmapped_trait_names[unmapped_trait_name] += 1

            assert grouped_allele_origins and grouped_diseases and consequence_types, \
                'Some of the attribute lists are still empty even after passing all checks.'

            complete_evidence_strings_generated = 0
            evidence_strings_generated = 0
            for allele_origins, disease_attributes, consequence_attributes in itertools.product(
                    grouped_allele_origins, grouped_diseases, consequence_types):
                disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes
                evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name,
                                                           disease_source_id, disease_mapped_efo_id,
                                                           consequence_attributes)

                # Validate and immediately output the evidence string (not keeping everything in memory).
                validate_evidence_string(evidence_string, ot_schema_contents)
                output_evidence_strings_file.write(json.dumps(evidence_string) + '\n')

                # Record some evidence string and trait metrics.
                evidence_strings_generated += 1
                if disease_mapped_efo_id is not None:
                    complete_evidence_strings_generated += 1
                    report.used_trait_mappings.add((disease_name, disease_mapped_efo_id))

            assert evidence_strings_generated != 0, \
                'No evidence strings generated despite all attributes passing checks.'
            if complete_evidence_strings_generated == 1:
                report.clinvar_done_one_complete_evidence_string += 1
            elif complete_evidence_strings_generated > 1:
                report.clinvar_done_multiple_complete_evidence_strings += 1

            report.complete_evidence_string_count += complete_evidence_strings_generated
            report.evidence_string_count += evidence_strings_generated

    return report
def get_test_clinvar_record(filename='test_clinvar_record.xml.gz'):
    """Return the first ClinVar record parsed from a file in the test ``resources`` directory.

    The default test file contains an extract of ClinVar XML for the record RCV000002127.

    :param filename: Name of the file inside the ``resources`` directory under ``test_dir``.
    :return: The first record yielded by ``clinvar_xml_utils.ClinVarDataset`` for that file.
    """
    resource_path = os.path.join(test_dir, 'resources', filename)
    # The dataset is read in full before the first record is picked
    parsed_records = list(clinvar_xml_utils.ClinVarDataset(resource_path))
    return parsed_records[0]
def get_test_clinvar_record():
    """Return the first record of the test file, which contains an extract of ClinVar XML for RCV000002127."""
    # The dataset is read in full before the first record is picked
    parsed_records = list(clinvar_xml_utils.ClinVarDataset(test_clinvar_record_file))
    first_record = parsed_records[0]
    return first_record