def main(clinvar_xml, zooma_feedback):
    """Produce a tab-separated ZOOMA feedback file from a ClinVar XML dump.

    The column header is written first. Every record yielded by ``clinvar_xml_utils.ClinVarDataset`` is then handed
    to ``process_clinvar_record`` together with the open output handle.

    :param clinvar_xml: Path passed to ``clinvar_xml_utils.ClinVarDataset``.
    :param zooma_feedback: Path of the output file, opened in text write mode (an existing file is overwritten).
    """
    header_line = 'STUDY\tBIOENTITY\tPROPERTY_TYPE\tPROPERTY_VALUE\tSEMANTIC_TAG\tANNOTATOR\tANNOTATION_DATE\n'
    with open(zooma_feedback, 'wt') as feedback_file:
        feedback_file.write(header_line)
        # The dataset is only created once the output file exists and has its header.
        records = clinvar_xml_utils.ClinVarDataset(clinvar_xml)
        for record in records:
            process_clinvar_record(record, feedback_file)
def load_clinvar_data(clinvar_xml):
    """Load ClinVar data, preprocess, and return it as a Pandas dataframe.

    :param clinvar_xml: Path passed to ``clinvar_xml_utils.ClinVarDataset``.
    :return: A tuple ``(variants, stats)``. ``variants`` is a dataframe with the columns Name, RCVaccession,
        GeneSymbol and HGNC_ID, with duplicate rows removed and sorted by Name. ``stats`` is a Counter keyed by the
        microsatellite category of every record which has a measure.
    """
    variant_data = []  # To populate the return dataframe (see columns below)
    stats = Counter()

    def count_repeat_expansion_candidates():
        # Candidates are records in either of the two repeat expansion categories: with complete coordinates, or
        # without them (see the comment in the loop below).
        return (stats[clinvar_xml_utils.ClinVarRecordMeasure.MS_REPEAT_EXPANSION]
                + stats[clinvar_xml_utils.ClinVarRecordMeasure.MS_NO_COMPLETE_COORDS])

    # Tracked separately from the loop index: the index is zero-based (so it is one less than the number of records
    # seen) and is never bound at all if the dataset yields nothing.
    total_records = 0

    # Iterate through ClinVar XML records
    for i, clinvar_record in enumerate(clinvar_xml_utils.ClinVarDataset(clinvar_xml)):
        total_records = i + 1
        # Progress report every 100000 records; at this point exactly `i` records have been fully processed.
        if i and i % 100000 == 0:
            logger.info(
                f'Processed {i} records, collected {count_repeat_expansion_candidates()} repeat expansion variant '
                f'candidates')

        # Skip a record if it does not contain variant information
        if not clinvar_record.measure:
            continue

        # Repeat expansion events come in two forms: with explicit coordinates and allele sequences (CHROM/POS/REF/ALT),
        # or without them. In the first case we can compute the explicit variant length as len(ALT) - len(REF). In the
        # second case, which is more rare but still important, we have to resort to parsing HGVS-like variant names.
        stats[clinvar_record.measure.microsatellite_category] += 1
        # Skip the record if it's a deletion or a short insertion
        if not clinvar_record.measure.is_repeat_expansion_variant:
            continue

        # Extract gene symbol(s). Here and below, dashes are sometimes assigned to be compatible with the variant
        # summary format which was used previously.
        gene_symbols = clinvar_record.measure.preferred_gene_symbols
        if not gene_symbols:
            gene_symbols = ['-']

        # Extract HGNC ID. It is only kept when there is exactly one HGNC ID and exactly one gene symbol; otherwise a
        # dash is used.
        hgnc_ids = clinvar_record.measure.hgnc_ids
        hgnc_id = hgnc_ids[0] if len(hgnc_ids) == 1 and len(gene_symbols) == 1 else '-'

        # Append data rows, one per gene symbol
        for gene_symbol in gene_symbols:
            variant_data.append([
                clinvar_record.measure.name, clinvar_record.accession,
                gene_symbol, hgnc_id
            ])

    logger.info(
        f'Done. A total of {total_records} records, {count_repeat_expansion_candidates()} repeat expansion variant '
        f'candidates'
    )

    variants = pd.DataFrame(variant_data,
                            columns=('Name', 'RCVaccession', 'GeneSymbol',
                                     'HGNC_ID'))
    # Since the same record can have coordinates in multiple builds, it can be repeated. Remove duplicates
    variants = variants.drop_duplicates()
    # Sort values by variant name
    return variants.sort_values(by=['Name']), stats
def parse_trait_names(filepath: str) -> list:
    """Collect the traits found in a ClinVar XML file and return them as a list of Trait objects.

    Every Trait carries the (lowercased) trait name, the number of records in which that name occurs, and a flag
    telling whether the name was seen at least once on a record whose variant is a repeat expansion variant.

    The occurrence count is per record: a name is counted at most once for each ClinVar record, i.e. the count is the
    number of unique (RCV, trait name) tuples. Each such tuple generally corresponds to one output evidence string, so
    the count indicates how important a trait name is to curate.

    The repeat expansion flag exists because curation of those traits is of highest importance even when they are
    linked to only a few records.

    :param filepath: Path to a gzipped file containing ClinVar XML dump.
    :return: A list of Trait objects."""

    # Number of records each trait name appears in
    occurrences = Counter()

    # Names seen at least once on a record with a repeat expansion variant
    expansion_linked_names = set()

    for record in clinvar_xml_utils.ClinVarDataset(filepath):
        # A set, so that a name repeated within a single record is only counted once
        names_in_record = {
            trait.preferred_or_other_valid_name.lower()
            for trait in record.traits_with_valid_names
        }
        occurrences.update(names_in_record)
        measure = record.measure
        if measure and measure.is_repeat_expansion_variant:
            expansion_linked_names.update(names_in_record)

    # Turn the counts into Trait objects, leaving out the placeholder name used for missing trait names
    result = []
    for name, frequency in occurrences.items():
        if name != '-':
            result.append(
                Trait(name=name,
                      frequency=frequency,
                      associated_with_nt_expansion=name in expansion_linked_names))
        else:
            print('Skipped {} missing trait names'.format(frequency))

    return result
Exemple #4
0
#!/usr/bin/env python3

import argparse

from eva_cttv_pipeline import clinvar_xml_utils

# Command line interface. The help text is passed explicitly as `description`: the first positional parameter of
# ArgumentParser is `prog`, so passing the text positionally would replace the program name in the usage output.
parser = argparse.ArgumentParser(
    description='Processes ClinVar XML dump and extract all variants, in CHROM:POS:REF:ALT format, '
                'for processing by the VEP mapping pipeline.')
parser.add_argument('--clinvar-xml',
                    required=True,
                    help='Path to the ClinVar XML file')
args = parser.parse_args()

# Print one CHROM:POS:REF:ALT line to standard output for every record which has a measure with complete coordinates.
for clinvar_record in clinvar_xml_utils.ClinVarDataset(args.clinvar_xml):
    m = clinvar_record.measure
    if m is None or not m.has_complete_coordinates:
        continue
    print(f'{m.chr}:{m.vcf_pos}:{m.vcf_ref}:{m.vcf_alt}')
Exemple #5
0
def clinvar_to_evidence_strings(string_to_efo_mappings,
                                variant_to_gene_mappings, clinvar_xml,
                                ot_schema, output_evidence_strings):
    """Generate evidence strings for all ClinVar records and write them out, one JSON document per line.

    For every record, one evidence string is generated per combination of grouped allele origins, grouped diseases
    and consequence types. Each evidence string is passed to ``validate_evidence_string`` and written immediately.
    Records which hit one of the failure modes described in the body are counted in the report and skipped.

    :param string_to_efo_mappings: Trait mappings, passed to ``Report`` and ``group_diseases_by_efo_mapping``.
    :param variant_to_gene_mappings: Consequence mappings, passed to ``Report`` and ``get_consequence_types``.
    :param clinvar_xml: Path passed to ``clinvar_xml_utils.ClinVarDataset``.
    :param ot_schema: Path to a JSON file with the schema used for validation.
    :param output_evidence_strings: Path of the output file, opened in text write mode.
    :return: The ``Report`` object populated with the counters collected during processing.
    """
    report = Report(trait_mappings=string_to_efo_mappings,
                    consequence_mappings=variant_to_gene_mappings)
    # Context managers make sure both files are closed, including when validation or an assertion fails midway.
    with open(ot_schema) as ot_schema_file:
        ot_schema_contents = json.load(ot_schema_file)

    with open(output_evidence_strings, 'wt') as output_evidence_strings_file:
        logger.info('Processing ClinVar records')
        for clinvar_record in clinvar_xml_utils.ClinVarDataset(clinvar_xml):
            report.clinvar_total += 1
            if report.clinvar_total % 1000 == 0:
                logger.info(f'{report.clinvar_total} records processed')
            # Repeat expansion variants are counted before any of the failure checks below, so the counter also
            # includes records which end up being skipped.
            if clinvar_record.measure and clinvar_record.measure.is_repeat_expansion_variant:
                report.repeat_expansion_variants += len(
                    get_consequence_types(clinvar_record.measure,
                                          variant_to_gene_mappings))

            # Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
            # potentially mappable name).
            if not clinvar_record.traits_with_valid_names:
                report.clinvar_fatal_no_valid_traits += 1
                continue

            # Failure mode 2 (skip). A ClinVar record contains an unsupported variation type.
            if clinvar_record.measure is None:
                report.clinvar_skip_unsupported_variation += 1
                continue

            # Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid
            # allele origins, (2) EFO mappings, and (3) genes where the variant has effect.
            grouped_allele_origins = convert_allele_origins(
                clinvar_record.valid_allele_origins)
            consequence_types = get_consequence_types(clinvar_record.measure,
                                                      variant_to_gene_mappings)
            grouped_diseases = group_diseases_by_efo_mapping(
                clinvar_record.traits_with_valid_names, string_to_efo_mappings)

            # Failure mode 3 (skip). No functional consequences are available.
            if not consequence_types:
                report.clinvar_skip_no_functional_consequences += 1
                continue

            # Failure mode 4 (tracked, not skipped). A ClinVar record has at least one trait with at least one valid
            # name, but no suitable EFO mappings were found in the database. This will still generate an evidence
            # string, but is tracked as a failure so we can continue to measure mapping coverage.
            if not any(group[-1] for group in grouped_diseases):
                report.clinvar_skip_missing_efo_mapping += 1
                unmapped_trait_name = clinvar_record.traits_with_valid_names[
                    0].preferred_or_other_valid_name
                report.unmapped_trait_names[unmapped_trait_name] += 1

            assert grouped_allele_origins and grouped_diseases and consequence_types, \
                'Some of the attribute lists are still empty even after passing all checks.'

            complete_evidence_strings_generated = 0
            evidence_strings_generated = 0
            for allele_origins, disease_attributes, consequence_attributes in itertools.product(
                    grouped_allele_origins, grouped_diseases, consequence_types):
                disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes
                evidence_string = generate_evidence_string(clinvar_record,
                                                           allele_origins,
                                                           disease_name,
                                                           disease_source_id,
                                                           disease_mapped_efo_id,
                                                           consequence_attributes)

                # Validate and immediately output the evidence string (not keeping everything in memory).
                validate_evidence_string(evidence_string, ot_schema_contents)
                output_evidence_strings_file.write(
                    json.dumps(evidence_string) + '\n')

                # Record some evidence string and trait metrics. An evidence string counts as complete when its
                # disease has a mapped EFO ID.
                evidence_strings_generated += 1
                if disease_mapped_efo_id is not None:
                    complete_evidence_strings_generated += 1
                    report.used_trait_mappings.add(
                        (disease_name, disease_mapped_efo_id))

            assert evidence_strings_generated != 0, \
                'No evidence strings generated despite all attributes passing checks.'
            if complete_evidence_strings_generated == 1:
                report.clinvar_done_one_complete_evidence_string += 1
            elif complete_evidence_strings_generated > 1:
                report.clinvar_done_multiple_complete_evidence_strings += 1

            report.complete_evidence_string_count += complete_evidence_strings_generated
            report.evidence_string_count += evidence_strings_generated

    return report
def get_test_clinvar_record(filename='test_clinvar_record.xml.gz'):
    """Return the first ClinVar record found in a file from the test ``resources`` directory.

    The default test file contains an extract of ClinVar XML for the record RCV000002127.

    :param filename: Name of the file inside ``<test_dir>/resources``.
    :return: The first record yielded by ``clinvar_xml_utils.ClinVarDataset`` for that file.
    """
    resource_path = os.path.join(test_dir, 'resources', filename)
    # All records are read before the first one is picked.
    all_records = list(clinvar_xml_utils.ClinVarDataset(resource_path))
    return all_records[0]
Exemple #7
0
def get_test_clinvar_record():
    """Return the first ClinVar record found in the test file.

    The test file contains an extract of ClinVar XML for the record RCV000002127.
    """
    # All records are read before the first one is picked.
    all_records = list(clinvar_xml_utils.ClinVarDataset(test_clinvar_record_file))
    return all_records[0]