def test_output_trait_mapping(self):
    tempfile_path = tempfile.mkstemp()[1]
    with open(tempfile_path, "w", newline='') as mapping_file:
        mapping_writer = csv.writer(mapping_file, delimiter="\t")
        mapping_writer.writerow(["#clinvar_trait_name", "uri", "label"])
        test_trait = Trait('aprt deficiency, japanese type', 11)
        # Normally a set, but changed to a list for predictable output order in test
        test_trait.finished_mapping_set = [
            OntologyEntry('http://www.orpha.net/ORDO/Orphanet_976',
                          'Adenine phosphoribosyltransferase deficiency'),
            OntologyEntry('http://www.orpha.net/ORDO/Orphanet_977',
                          'Adenine phosphoribosyltransferase deficiency type A')
        ]
        output.output_trait_mapping(test_trait, mapping_writer)
    with open(tempfile_path, "rt", newline='') as mapping_file:
        mapping_reader = csv.reader(mapping_file, delimiter="\t")
        next(mapping_reader)
        self.assertEqual(['aprt deficiency, japanese type',
                          'http://www.orpha.net/ORDO/Orphanet_976',
                          'Adenine phosphoribosyltransferase deficiency'],
                         next(mapping_reader))
        self.assertEqual(['aprt deficiency, japanese type',
                          'http://www.orpha.net/ORDO/Orphanet_977',
                          'Adenine phosphoribosyltransferase deficiency type A'],
                         next(mapping_reader))
def test_output_for_curation(self):
    tempfile_path = tempfile.mkstemp()[1]
    with open(tempfile_path, "wt") as curation_file:
        curation_writer = csv.writer(curation_file, delimiter="\t")
        test_trait = Trait("transitional cell carcinoma of the bladder", 276)
        test_oxo_result = OxOResult("HP:0006740",
                                    "Transitional cell carcinoma of the bladder",
                                    "HP:0006740")
        test_oxo_mapping = OxOMapping("bladder transitional cell carcinoma",
                                      "EFO:0006544", 2, "HP:0006740")
        test_oxo_mapping.in_efo = test_oxo_mapping.is_current = True
        test_oxo_mapping.ontology_label = "bladder transitional cell carcinoma"
        test_oxo_result.mapping_list = [test_oxo_mapping]
        test_trait.oxo_result_list = [test_oxo_result]
        output.output_for_curation(test_trait, curation_writer)
    with open(tempfile_path, "rt") as curation_file:
        curation_reader = csv.reader(curation_file, delimiter="\t")
        self.assertEqual(
            ["transitional cell carcinoma of the bladder", "276",
             "http://www.ebi.ac.uk/efo/EFO_0006544|bladder transitional cell carcinoma|2|HP:0006740"],
            next(curation_reader))
def test_output_for_curation(self):
    tempfile_path = tempfile.mkstemp()[1]
    with open(tempfile_path, "wt") as curation_file:
        curation_writer = csv.writer(curation_file, delimiter="\t")
        test_trait = Trait("transitional cell carcinoma of the bladder", 276)
        test_oxo_result = OxOResult(
            "HP:0006740", "Transitional cell carcinoma of the bladder", "HP:0006740")
        test_oxo_mapping = OxOMapping(
            "bladder transitional cell carcinoma", "EFO:0006544", 2, "HP:0006740")
        test_oxo_mapping.in_efo = test_oxo_mapping.is_current = True
        test_oxo_mapping.ontology_label = "bladder transitional cell carcinoma"
        test_oxo_result.mapping_list = [test_oxo_mapping]
        test_trait.oxo_result_list = [test_oxo_result]
        output.output_for_curation(test_trait, curation_writer)
    with open(tempfile_path, "rt") as curation_file:
        curation_reader = csv.reader(curation_file, delimiter="\t")
        expected_record = [
            "transitional cell carcinoma of the bladder", "276",
            "http://www.ebi.ac.uk/efo/EFO_0006544|bladder transitional cell carcinoma|2|HP:0006740|EFO_CURRENT"
        ]
        self.assertEqual(expected_record, next(curation_reader))
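# For reference, the third column of the expected curation record above packs each OxO mapping
# into a single pipe-delimited cell: mapped term URI, ontology label, OxO distance, query ID,
# and (in the second version of the test) an EFO_CURRENT flag. The helper below is only a sketch
# of how such a cell could be assembled -- the function name and the curie-to-URI expansion rule
# are assumptions for illustration, not the project's actual output code.
def _sketch_curation_cell(uri, label, distance, query_id, in_efo, is_current):
    fields = [uri, label, str(distance), query_id]
    if in_efo and is_current:
        fields.append("EFO_CURRENT")
    return "|".join(fields)

# An EFO curie such as "EFO:0006544" is assumed to expand to
# "http://www.ebi.ac.uk/efo/EFO_0006544" before being written out.
# _sketch_curation_cell("http://www.ebi.ac.uk/efo/" + "EFO:0006544".replace(":", "_"),
#                       "bladder transitional cell carcinoma", 2, "HP:0006740",
#                       in_efo=True, is_current=True)
# -> 'http://www.ebi.ac.uk/efo/EFO_0006544|bladder transitional cell carcinoma|2|HP:0006740|EFO_CURRENT'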
def main(input_filepath, output_mappings_filepath, output_curation_filepath, filters,
         zooma_host, oxo_target_list, oxo_distance):
    logger.info('Started parsing trait names')
    trait_names_list = parse_trait_names(input_filepath)
    trait_names_counter = Counter(trait_names_list)
    logger.info("Loaded {} trait names".format(len(trait_names_counter)))

    with open(output_mappings_filepath, "w", newline='') as mapping_file, \
            open(output_curation_filepath, "wt") as curation_file:
        mapping_writer = csv.writer(mapping_file, delimiter="\t")
        mapping_writer.writerow(["#clinvar_trait_name", "uri", "label"])
        curation_writer = csv.writer(curation_file, delimiter="\t")

        logger.info('Processing trait names in parallel')
        trait_list = [Trait(trait_name, freq) for trait_name, freq in trait_names_counter.items()]
        trait_process_pool = multiprocessing.Pool(processes=12)
        # Submit all traits asynchronously so the pool actually runs them in parallel
        # (Pool.apply blocks until each call completes, which would serialise the work).
        async_results = [
            trait_process_pool.apply_async(
                process_trait, args=(trait, filters, zooma_host, oxo_target_list, oxo_distance))
            for trait in trait_list
        ]
        processed_trait_list = [result.get() for result in async_results]

        for trait in processed_trait_list:
            output_trait(trait, mapping_writer, curation_writer)

    logger.info('Finished processing trait names')
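# Illustrative invocation of main() above. The filter keys and every value below are placeholder
# assumptions for this sketch (file names, Zooma filter fields, Zooma host, OxO targets, distance);
# consult the project's actual command-line wrapper for the real options.
def _example_run():
    example_filters = {
        'ontologies': 'efo,ordo,hp',      # assumed Zooma filter fields, for illustration only
        'required': 'cttv,eva-clinvar',
        'preferred': 'eva-clinvar,cttv',
    }
    main(input_filepath='clinvar_summary.txt.gz',
         output_mappings_filepath='trait_mappings.tsv',
         output_curation_filepath='traits_requiring_curation.tsv',
         filters=example_filters,
         zooma_host='www.ebi.ac.uk',
         oxo_target_list=['Orphanet', 'efo', 'hp'],
         oxo_distance=3)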
def parse_trait_names(filepath: str) -> list:
    """
    For a file containing ClinVar records in the TSV format, return a list of Traits for the
    records in the file. Each Trait object contains the trait name, how many times it occurs in
    the input file, and whether it is linked to an NT expansion variant.

    Trait occurrence count is calculated based on all unique (AlleleID, RCV, trait name) tuples in
    the input file. This is because each such tuple will, generally speaking, correspond to one
    output evidence string. So if we want to gauge which trait names are more important to curate,
    we need to consider how many such tuples each one appears in. The reason we need to keep track
    of only *unique* tuples is that some (most) alleles will appear twice in the document, with
    coordinates for GRCh37 and GRCh38, and we don't want to count them twice.

    Traits which are implicated in "NT expansion" variants are marked using a special field,
    because their curation is of the highest importance even if the number of records they are
    linked to is low.

    :param filepath: Path to a gzipped file containing the ClinVar TSV summary.
    :return: A list of Trait objects.
    """

    # Tracks unique (AlleleID, RCV, trait name) tuples
    unique_association_tuples = set()

    # Tracks all traits which are at least once implicated in "NT expansion", or nucleotide repeat
    # expansion, variants. Their curation is of the highest importance regardless of how many
    # records they are actually associated with.
    nt_expansion_traits = set()

    with gzip.open(filepath, "rt") as clinvar_summary:
        header = clinvar_summary.readline().rstrip().split('\t')
        for line in clinvar_summary:
            values = line.rstrip().split('\t')
            data = dict(zip(header, values))

            # Extract relevant fields
            is_nt_expansion_variant = data['Type'] == 'NT expansion'
            allele_id = data['#AlleleID']
            traits = set(data['PhenotypeList'].split(';'))
            rcv_ids = set(data['RCVaccession'].split(';'))

            # Process all (trait, rcv) records
            for trait, rcv_id in zip(traits, rcv_ids):
                unique_association_tuples.add((trait, rcv_id, allele_id))
                if is_nt_expansion_variant:
                    nt_expansion_traits.add(trait)

    # Count trait occurrences
    trait_names = [t[0] for t in unique_association_tuples]
    traits = []
    for trait_name, trait_frequency in Counter(trait_names).items():
        if trait_name == '-':
            print('Skipped {} missing trait names'.format(trait_frequency))
            continue
        associated_with_nt_expansion = trait_name in nt_expansion_traits
        traits.append(Trait(name=trait_name.lower(), frequency=trait_frequency,
                            associated_with_nt_expansion=associated_with_nt_expansion))

    return traits
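# A minimal, self-contained check of the de-duplication behaviour described in the docstring
# above: the same (AlleleID, RCV, trait name) combination listed once per assembly (GRCh37 and
# GRCh38) should only be counted once. The column subset and values are made up for illustration
# (a real variant_summary file has many more columns), and the asserts assume the Trait class
# exposes a frequency attribute.
def _demo_assembly_deduplication():
    import gzip
    import tempfile
    header = ['#AlleleID', 'Type', 'PhenotypeList', 'RCVaccession', 'Assembly']
    rows = [
        ['15041', 'single nucleotide variant', 'APRT deficiency, Japanese type', 'RCV000000017', 'GRCh37'],
        ['15041', 'single nucleotide variant', 'APRT deficiency, Japanese type', 'RCV000000017', 'GRCh38'],
    ]
    tempfile_path = tempfile.mkstemp(suffix='.txt.gz')[1]
    with gzip.open(tempfile_path, 'wt') as summary:
        summary.write('\t'.join(header) + '\n')
        for row in rows:
            summary.write('\t'.join(row) + '\n')
    traits = parse_trait_names(tempfile_path)
    assert len(traits) == 1
    assert traits[0].frequency == 1  # counted once despite appearing for both assemblies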
def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: list,
                  oxo_distance: int) -> Trait:
    """
    Process a single trait. Find any mappings in Zooma. If there are no high confidence Zooma
    mappings that are in EFO then query OxO with any high confidence mappings not in EFO.

    :param trait: The trait to be processed.
    :param filters: A dictionary of filters to use for querying Zooma.
    :param zooma_host: A string with the hostname to use for querying Zooma.
    :param oxo_target_list: A list of strings, each being an OxO ID for an ontology. Used to
                            specify which ontologies should be queried using OxO.
    :param oxo_distance: int specifying the maximum number of steps to use to query OxO, i.e.
                         OxO's "distance" parameter.
    :return: The original trait after querying Zooma and possibly OxO, with any results found.
    """
    trait.zooma_result_list = get_zooma_results(trait.name, filters, zooma_host)
    trait.process_zooma_results()
    if (trait.is_finished
            or len(trait.zooma_result_list) == 0
            or any(entry.is_current
                   for mapping in trait.zooma_result_list
                   for entry in mapping.mapping_list)):
        return trait

    uris_for_oxo_set = get_uris_for_oxo(trait.zooma_result_list)
    if len(uris_for_oxo_set) == 0:
        return trait

    oxo_input_id_list = uris_to_oxo_format(uris_for_oxo_set)
    trait.oxo_result_list = get_oxo_results(oxo_input_id_list, oxo_target_list, oxo_distance)
    trait.process_oxo_mappings()

    return trait
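# uris_to_oxo_format() above converts full ontology URIs into the compact IDs that OxO accepts.
# The real implementation lives elsewhere in the project; the function below is only a plausible
# sketch of that conversion (take the last path segment and replace the underscore with a colon),
# shown for illustration.
def _sketch_uri_to_oxo_id(uri: str) -> str:
    term = uri.rstrip('/').rsplit('/', 1)[-1]   # e.g. 'Orphanet_976' or 'EFO_0006544'
    return term.replace('_', ':', 1)            # e.g. 'Orphanet:976' or 'EFO:0006544'

# _sketch_uri_to_oxo_id('http://www.orpha.net/ORDO/Orphanet_976') -> 'Orphanet:976'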
def parse_trait_names(filepath: str) -> list:
    """
    For a file containing ClinVar records in the XML format, return a list of Traits for the
    records in the file. Each Trait object contains the trait name, how many times it occurs in
    the input file, and whether it is linked to an NT expansion variant.

    Trait occurrence count is calculated based on all unique (RCV, trait name) tuples in the input
    file. This is because each such tuple will, generally speaking, correspond to one output
    evidence string. So if we want to gauge which trait names are more important to curate, we
    need to consider how many such tuples it appears in.

    Traits which are implicated in "Microsatellite" variants are marked using a special field,
    because a subset of microsatellites are NT expansion variants, and their curation is of
    highest importance even if the number of records which they are linked to is low.

    :param filepath: Path to a gzipped file containing ClinVar XML dump.
    :return: A list of Trait objects.
    """

    # Tracks how many times a trait name occurs in ClinVar
    trait_name_counter = Counter()

    # Tracks all traits which are at least once implicated in "NT expansion", or nucleotide repeat
    # expansion, variants. Their curation is of highest importance regardless of how many records
    # they are actually associated with.
    nt_expansion_traits = set()

    for clinvar_record in clinvar_xml_utils.ClinVarDataset(filepath):
        trait_names = set(trait.preferred_or_other_valid_name.lower()
                          for trait in clinvar_record.traits_with_valid_names)
        for trait_name in trait_names:
            trait_name_counter[trait_name] += 1
        if clinvar_record.measure and clinvar_record.measure.is_repeat_expansion_variant:
            nt_expansion_traits |= trait_names

    # Count trait occurrences
    traits = []
    for trait_name, trait_frequency in trait_name_counter.items():
        if trait_name == '-':
            print('Skipped {} missing trait names'.format(trait_frequency))
            continue
        associated_with_nt_expansion = trait_name in nt_expansion_traits
        traits.append(Trait(name=trait_name, frequency=trait_frequency,
                            associated_with_nt_expansion=associated_with_nt_expansion))

    return traits
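# The associated_with_nt_expansion flag set above exists so that downstream curation can
# prioritise repeat-expansion traits even when they occur rarely. The ordering below is one way
# a consumer of this list might rank traits for manual review; it is an illustration under that
# assumption, not part of the pipeline itself.
def _order_traits_for_curation(traits: list) -> list:
    # NT expansion traits first, then by descending occurrence count
    return sorted(traits, key=lambda t: (not t.associated_with_nt_expansion, -t.frequency))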
def main(input_filepath, output_mappings_filepath, output_curation_filepath, filters,
         zooma_host, oxo_target_list, oxo_distance):
    trait_names_list = parse_trait_names(input_filepath)
    trait_names_counter = Counter(trait_names_list)

    with open(output_mappings_filepath, "w", newline='') as mapping_file, \
            open(output_curation_filepath, "wt") as curation_file:
        mapping_writer = csv.writer(mapping_file, delimiter="\t")
        mapping_writer.writerow(["#clinvar_trait_name", "uri", "label"])
        curation_writer = csv.writer(curation_file, delimiter="\t")

        bar = progressbar.ProgressBar(max_value=len(trait_names_counter),
                                      widgets=[progressbar.AdaptiveETA(samples=1000)])
        for trait_name, freq in bar(trait_names_counter.items()):
            trait = Trait(trait_name, freq)
            trait = process_trait(trait, filters, zooma_host, oxo_target_list, oxo_distance)
            output_trait(trait, mapping_writer, curation_writer)