def __get_translated_filter_and_variant_annotation(
    cls,
    call: SimpleCall,
    panel: Panel,
    call_reference_assembly: ReferenceAssembly,
    translated_reference_allele: Optional[str],
) -> Tuple[FullCallFilter, str]:
    """Determine the translated filter and variant annotation for a call.

    Three situations are handled, checked in this order:

    * The panel has a ref seq difference annotation for the call's gene,
      start coordinate, reference allele and reference assembly. The result
      then depends on two checks over all annotated alleles (see the
      comments in the body).
    * Otherwise, the panel contains an rs id matching the call: the call's
      own annotation is kept, and the filter is PASS or NO_CALL depending
      on ``call.is_pass()``.
    * Otherwise the variant is unknown: the call's annotation gets a "?"
      appended, the filter is UNKNOWN and a warning is logged.

    Returns:
        A ``(translated_filter, translated_variant_annotation)`` tuple.
    """
    annotated_alleles = cls.__get_annotated_alleles(
        call, call_reference_assembly, translated_reference_allele)

    location_has_ref_seq_difference = panel.has_ref_seq_difference_annotation(
        call.gene,
        call.start_coordinate,
        call.reference_allele,
        call_reference_assembly,
    )

    if not location_has_ref_seq_difference:
        if panel.contains_rs_id_matching_call(call, call_reference_assembly):
            # known variant and no ref seq differences involved
            translated_variant_annotation = call.variant_annotation
            known_variant_filter = (
                FullCallFilter.PASS if call.is_pass() else FullCallFilter.NO_CALL)
            return known_variant_filter, translated_variant_annotation

        # unknown variant, no ref seq difference involved
        translated_variant_annotation = call.variant_annotation + "?"
        logging.warning(
            f"Unknown variant. Check whether annotation is correct: "
            f"found alleles=({annotated_alleles[0]}, {annotated_alleles[1]}), "
            f"annotation={translated_variant_annotation}")
        return FullCallFilter.UNKNOWN, translated_variant_annotation

    # Both checks are evaluated up front, before any branching on them.
    every_allele_is_ref_after_translation = all(
        cls.__is_ref_allele_to_opposite_assembly_due_to_ref_sequence_difference(
            allele, call_reference_assembly)
        for allele in annotated_alleles)
    every_allele_is_ref_to_some_assembly = all(
        cls.__allele_is_ref_to_a_reference_assembly(
            allele, call_reference_assembly)
        for allele in annotated_alleles)

    if every_allele_is_ref_after_translation:
        return FullCallFilter.PASS, REF_CALL_ANNOTATION_STRING

    if every_allele_is_ref_to_some_assembly:
        translated_variant_annotation = panel.get_ref_seq_difference_annotation(
            call.gene,
            call.start_coordinate,
            call.reference_allele,
            call_reference_assembly,
        )
        inferred_filter = (
            FullCallFilter.PASS if call.is_pass() else FullCallFilter.INFERRED_PASS)
        return inferred_filter, translated_variant_annotation

    # At least one allele is not ref to any reference assembly.
    translated_variant_annotation = call.variant_annotation + "?"
    logging.warning(
        f"Unexpected allele in ref seq difference location. Check whether annotation is correct: "
        f"found alleles=({annotated_alleles[0]}, {annotated_alleles[1]}), "
        f"annotation={translated_variant_annotation}")
    return FullCallFilter.UNKNOWN, translated_variant_annotation
def get_genotype_tsv_text(cls, pgx_analysis: PgxAnalysis, panel: Panel, version: str) -> str:
    """Render the haplotype calls of an analysis as TSV text.

    The first line is the header built from ``cls.GENOTYPE_TSV_COLUMNS``.
    Genes are emitted in sorted order. A gene with haplotype calls gets one
    row per call (sorted by haplotype name); a gene without calls gets a
    single row with the "unresolved" placeholders. Every row holds: gene,
    haplotype name, zygosity, function, drug names, drug prescription urls,
    panel id and ``version``. The text ends with a newline.

    The set of genes in the analysis must equal the set of genes in the
    panel; this is checked with an ``assert``.
    """
    calls_per_gene = pgx_analysis.get_gene_to_haplotype_calls()

    genes_in_analysis = set(calls_per_gene.keys())
    assert genes_in_analysis == panel.get_genes(), (
        f"Gene lists inconsistent.\n"
        f"From analysis={sorted(list(genes_in_analysis))}\n"
        f"From panel={sorted(list(panel.get_genes()))}")

    # Per gene: (joined drug names, joined prescription urls), with the
    # drugs ordered by (name, url) so both columns line up.
    drug_columns_per_gene = {}
    for gene_info in panel.get_gene_infos():
        ordered_drugs = list(gene_info.drugs)
        ordered_drugs.sort(key=lambda drug: (drug.name, drug.url_prescription_info))
        joined_names = cls.DRUG_SEPARATOR.join(drug.name for drug in ordered_drugs)
        joined_urls = cls.DRUG_SEPARATOR.join(
            drug.url_prescription_info for drug in ordered_drugs)
        drug_columns_per_gene[gene_info.gene] = (joined_names, joined_urls)

    rows = [cls.TSV_SEPARATOR.join(cls.GENOTYPE_TSV_COLUMNS)]
    for gene in sorted(calls_per_gene):
        calls_for_gene = calls_per_gene[gene]

        if not calls_for_gene:
            # No haplotype could be resolved for this gene.
            fields = [
                gene,
                cls.UNRESOLVED_HAPLOTYPE_STRING,
                cls.NOT_APPLICABLE_ZYGOSITY_STRING,
                UNKNOWN_FUNCTION_STRING,
                *drug_columns_per_gene[gene],
                panel.get_id(),
                version,
            ]
            rows.append(cls.TSV_SEPARATOR.join(fields))
            continue

        for haplotype_call in sorted(calls_for_gene, key=lambda call: call.haplotype_name):
            fields = [
                gene,
                haplotype_call.haplotype_name,
                cls.__get_zygosity(haplotype_call),
                panel.get_haplotype_function(gene, haplotype_call.haplotype_name),
                *drug_columns_per_gene[gene],
                panel.get_id(),
                version,
            ]
            rows.append(cls.TSV_SEPARATOR.join(fields))

    return "\n".join(rows) + "\n"
def __get_translated_reference_allele(
    cls,
    call: SimpleCall,
    panel: Panel,
    call_reference_assembly: ReferenceAssembly,
) -> Optional[str]:
    """Look up the reference allele of a call in the opposite reference assembly.

    Returns ``None`` when the panel contains no rs id matching the call.
    Otherwise the matching rs id info is fetched from the panel, the call's
    rs ids are checked against it, and the info's reference allele for
    ``call_reference_assembly.opposite()`` is returned.
    """
    if not panel.contains_rs_id_matching_call(call, call_reference_assembly):
        # unknown variant
        return None

    matching_info = panel.get_matching_rs_id_info(
        call.start_coordinate,
        call.reference_allele,
        call_reference_assembly,
    )
    cls.__assert_rs_id_call_matches_info(call.rs_ids, (matching_info.rs_id, ))

    opposite_assembly = call_reference_assembly.opposite()
    return matching_info.get_reference_allele(opposite_assembly)
def __get_example_panel(cls) -> Panel:
    """Build the example panel fixture with the genes DPYD, FAKE and FAKE2.

    The panel is named "Panel" with version "0.2". Each gene uses "*1" as
    its reference haplotype name.
    """
    # --- DPYD ---------------------------------------------------------
    variant_2a = Variant("rs3918290", "T")
    variant_2b = Variant("rs1801159", "C")
    variant_3 = Variant("rs72549303", "TG")
    dpyd_gene_info = GeneInfo(
        "DPYD",
        "*1",
        frozenset([
            Haplotype("*2A", "No Function", frozenset([variant_2a])),
            Haplotype("*2B", "No Function", frozenset([variant_2a, variant_2b])),
            Haplotype("*3", "Normal Function", frozenset([variant_3])),
        ]),
        frozenset([
            RsIdInfo("rs3918290", "C", "C",
                     GeneCoordinate("1", 97915614), GeneCoordinate("chr1", 97450058)),
            RsIdInfo("rs72549309", "GATGA", "GATGA",
                     GeneCoordinate("1", 98205966), GeneCoordinate("chr1", 97740410)),
            RsIdInfo("rs1801159", "T", "T",
                     GeneCoordinate("1", 97981395), GeneCoordinate("chr1", 97515839)),
            RsIdInfo("rs72549303", "TG", "TC",
                     GeneCoordinate("1", 97915621), GeneCoordinate("chr1", 97450065)),
        ]),
        frozenset([
            DrugInfo(
                "5-Fluorouracil",
                "https://www.pharmgkb.org/chemical/PA128406956/guidelineAnnotation/PA166104939"),
            DrugInfo(
                "Capecitabine",
                "https://www.pharmgkb.org/chemical/PA448771/guidelineAnnotation/PA166104963"),
        ]),
        {"rs72549303": Annotation("6744CA>GA", "6744GA>CA")},
    )

    # --- FAKE (no ref seq difference annotations) ------------------------
    no_difference_annotations: Dict[str, Annotation] = {}
    fake_gene_info = GeneInfo(
        "FAKE",
        "*1",
        frozenset([
            Haplotype("*4A", "Reduced Function",
                      frozenset([Variant("rs1212125", "C")])),
        ]),
        frozenset([
            RsIdInfo("rs1212125", "T", "T",
                     GeneCoordinate("5", 97915617), GeneCoordinate("chr5", 97450060)),
        ]),
        frozenset([DrugInfo("Aspirin", "https://www.pharmgkb.org/some_other_url")]),
        no_difference_annotations,
    )

    # --- FAKE2 -----------------------------------------------------------
    fake2_gene_info = GeneInfo(
        "FAKE2",
        "*1",
        frozenset([
            Haplotype("*4A", "Reduced Function",
                      frozenset([Variant("rs1212127", "C")])),
        ]),
        frozenset([
            RsIdInfo("rs1212127", "C", "T",
                     GeneCoordinate("16", 97915617), GeneCoordinate("chr16", 97450060)),
        ]),
        frozenset([DrugInfo("Aspirin", "https://www.pharmgkb.org/some_other_url")]),
        {"rs1212127": Annotation("1324C>T", "1324T>C")},
    )

    all_gene_infos = frozenset([dpyd_gene_info, fake_gene_info, fake2_gene_info])
    return Panel("Panel", "0.2", all_gene_infos)
def test_haplotype_reporter_empty(self) -> None:
    """An empty analysis with an empty panel yields only the header line"""
    empty_analysis = PgxAnalysis(FullCallData(frozenset()), {})
    empty_panel = Panel("EmptyPanel", "0.3", frozenset())

    header_only_text = (
        "gene\thaplotype\tzygosity\tfunction\tlinked_drugs\turl_prescription_info\tpanel_version\trepo_version\n"
    )
    actual_text = HaplotypeReporter.get_genotype_tsv_text(empty_analysis, empty_panel, "V1")

    self.assertEqual(header_only_text, actual_text)
def load_panel(panel_path: str) -> Panel: """ Load manually annotated JSON panel file """ try: with open(panel_path, "r+", encoding="utf-8") as json_file: return Panel.from_json(json.load(json_file)) except IOError: raise FileNotFoundError( f"Panel file {panel_path} not found or cannot be opened.")
def test_panel_with_repeat_gene_names(self) -> None:
    """Error when panel has info for gene multiple times"""
    panel_name = "FakePanel"
    panel_version = "1.0"

    no_haplotypes: FrozenSet[Haplotype] = frozenset()
    no_rs_id_infos: FrozenSet[RsIdInfo] = frozenset()
    no_drugs: FrozenSet[DrugInfo] = frozenset()
    no_annotations: Dict[str, Annotation] = dict()

    def make_gene_info(gene: str, reference_haplotype_name: str) -> GeneInfo:
        # All gene infos in this test differ only in gene name and
        # reference haplotype name.
        return GeneInfo(
            gene,
            reference_haplotype_name,
            no_haplotypes,
            no_rs_id_infos,
            no_drugs,
            no_annotations,
        )

    fake_gene_info = make_gene_info("FAKE", "*1")
    other_gene_info = make_gene_info("OTHER", "*1")
    fake_gene_info_again = make_gene_info("FAKE", "*1_something else")

    # Two different genes are accepted.
    Panel(panel_name, panel_version, frozenset([fake_gene_info, other_gene_info]))

    # The same gene twice is rejected.
    with self.assertRaises(ValueError):
        Panel(panel_name, panel_version, frozenset([fake_gene_info, fake_gene_info_again]))
def get_gene_to_haplotypes_call(
    cls,
    full_call_data: FullCallData,
    panel: Panel,
) -> Dict[str, Set[HaplotypeCall]]:
    """Call haplotypes for every gene in the panel.

    Returns a dict from gene name to the haplotype calls computed for that
    gene from ``full_call_data``. An info message is logged per gene before
    its haplotypes are called.
    """
    calls_per_gene = {}
    for info in panel.get_gene_infos():
        logging.info(f"Calling haplotypes for {info.gene}")
        haplotype_calls = cls.__get_haplotypes_call(full_call_data, info)
        calls_per_gene[info.gene] = haplotype_calls
    return calls_per_gene
def __coordinates_of_call_overlap_with_panel_coordinates(
    cls,
    call_index: int,
    panel: Panel,
    variants: Dict[str, Any],
    vcf_reference_assembly: ReferenceAssembly,
) -> bool:
    """Tell whether a call's coordinates overlap with coordinates in the panel.

    The chromosome, position and reference allele of the call at
    ``call_index`` are read from ``variants`` and turned into the relevant
    coordinates. Returns True as soon as the panel contains an rs id with
    one of those coordinates for ``vcf_reference_assembly``, else False.
    """
    chromosome = cls.__get_chromosome_from_variants(call_index, variants)
    position = cls.__get_position_from_variants(call_index, variants)
    reference_allele = cls.__get_reference_allele_from_variants(call_index, variants)

    for coordinate in cls.__get_relevant_coordinates(chromosome, position, reference_allele):
        if panel.contains_rs_id_with_coordinate(coordinate, vcf_reference_assembly):
            return True
    return False
def __fill_in_rs_ids_if_needed(
    cls,
    call: SimpleCall,
    panel: Panel,
    reference_assembly: ReferenceAssembly,
) -> SimpleCall:
    """Fill in the rs id of a call from the panel when the call has none.

    A call whose rs ids are exactly ``(".", )`` and that matches an rs id in
    the panel is replaced by a copy carrying the panel's rs id. Any other
    call is returned unchanged.
    """
    rs_id_is_present = call.rs_ids != (".", )
    if rs_id_is_present or not panel.contains_rs_id_matching_call(call, reference_assembly):
        return call

    matching_info = panel.get_matching_rs_id_info(
        call.start_coordinate,
        call.reference_allele,
        reference_assembly,
    )
    filled_in_rs_ids: Tuple[str, ...] = (matching_info.rs_id, )
    return SimpleCall(
        call.start_coordinate,
        call.reference_allele,
        call.alleles,
        call.gene,
        filled_in_rs_ids,
        call.variant_annotation,
        call.filter,
    )
def __get_calls_for_panel_variants_without_calls(
    cls,
    simple_call_data: SimpleCallData,
    panel: Panel,
) -> FrozenSet[SimpleCall]:
    """Create reference calls for panel variants that have no call.

    A panel rs id gets such a call when it does not occur among the rs ids
    of the found calls and none of its relevant coordinates is covered by a
    found call. The created call is REF/REF relative to the reference
    assembly of ``simple_call_data`` and has its filter set to NO_CALL.
    """
    assembly = simple_call_data.reference_assembly

    # rs ids seen in the found calls; "." is skipped.
    observed_rs_ids = set()
    for found_call in simple_call_data.calls:
        observed_rs_ids.update(
            rs_id for rs_id in found_call.rs_ids if rs_id != ".")

    covered_coordinates = set()
    for found_call in simple_call_data.calls:
        covered_coordinates.update(found_call.get_relevant_coordinates())

    inferred_calls = set()
    for gene_info in panel.get_gene_infos():
        for rs_id_info in gene_info.rs_id_infos:
            overlap = rs_id_info.get_relevant_coordinates(assembly).intersection(
                covered_coordinates)
            if overlap or rs_id_info.rs_id in observed_rs_ids:
                # Already (partially) handled by a found call.
                continue

            # Assuming REF/REF relative to reference assembly
            start_coordinate = rs_id_info.get_start_coordinate(assembly)
            reference_allele = rs_id_info.get_reference_allele(assembly)
            inferred_calls.add(
                SimpleCall(
                    start_coordinate,
                    reference_allele,
                    (reference_allele, reference_allele),
                    gene_info.gene,
                    (rs_id_info.rs_id, ),
                    REF_CALL_ANNOTATION_STRING,
                    SimpleCallFilter.NO_CALL,
                ))

    return frozenset(inferred_calls)
def __rs_id_exists_in_panel(cls, call_index: int, panel: Panel, variants: Dict[str, Any]) -> bool:
    """Tell whether any rs id of the call at ``call_index`` is in the panel."""
    for rs_id in cls.__get_rs_ids_from_variants(call_index, variants):
        if panel.contains_rs_id(rs_id):
            return True
    return False
def __assert_gene_in_panel(cls, gene: str, panel: Panel) -> None:
    """Raise ``ValueError`` when ``gene`` is not one of the panel's genes."""
    if gene in panel.get_genes():
        return
    raise ValueError(f"Call for unknown gene:\ngene={gene}")
def test_panel_with_overlapping_rs_id_infos_for_different_genes(self) -> None:
    """Error when panel has overlapping rs id infos for different genes, but not when they are exactly the same"""
    panel_name = "FakePanel"
    panel_version = "1.0"
    v37_chromosome = "X"
    v38_chromosome = "chrX"

    no_haplotypes: FrozenSet[Haplotype] = frozenset()
    no_drugs: FrozenSet[DrugInfo] = frozenset()
    no_annotations: Dict[str, Annotation] = dict()

    def make_gene_info(gene: str, rs_id_infos: FrozenSet[RsIdInfo]) -> GeneInfo:
        # The gene infos in this test differ only in gene name and rs id infos.
        return GeneInfo(gene, "*1", no_haplotypes, rs_id_infos, no_drugs, no_annotations)

    shared_info = RsIdInfo(
        "rs294924", "AT", "AT",
        GeneCoordinate(v37_chromosome, 499593),
        GeneCoordinate(v38_chromosome, 399483),
    )
    unrelated_info = RsIdInfo(
        "rs3949923", "C", "C",
        GeneCoordinate(v37_chromosome, 293993),
        GeneCoordinate(v38_chromosome, 1388323),
    )
    overlapping_info = RsIdInfo(
        "rs12993", "GG", "GG",
        GeneCoordinate(v37_chromosome, 499592),
        GeneCoordinate(v38_chromosome, 399482),
    )

    fake_gene_info = make_gene_info("FAKE", frozenset([shared_info]))
    other_with_same_info = make_gene_info("OTHER", frozenset([shared_info, unrelated_info]))
    other_with_overlap = make_gene_info("OTHER", frozenset([overlapping_info]))

    # Exactly the same rs id info in two genes is accepted.
    Panel(panel_name, panel_version, frozenset([fake_gene_info, other_with_same_info]))

    # A different, overlapping rs id info in another gene is rejected.
    with self.assertRaises(ValueError):
        Panel(panel_name, panel_version, frozenset([fake_gene_info, other_with_overlap]))
def test_load_panel(self) -> None:
    """Load panel from json"""
    loaded_panel = load_panel(str(get_test_resource("test_panel.json")))

    # --- expected DPYD ------------------------------------------------
    variant_2a = Variant("rs3918290", "T")
    variant_2b = Variant("rs1801159", "C")
    variant_3 = Variant("rs72549303", "TG")
    dpyd_expected = GeneInfo(
        "DPYD",
        "*1",
        frozenset([
            Haplotype("*2A", "No Function", frozenset([variant_2a])),
            Haplotype("*2B", "No Function", frozenset([variant_2a, variant_2b])),
            Haplotype("*3", "Normal Function", frozenset([variant_3])),
        ]),
        frozenset([
            RsIdInfo("rs3918290", "C", "C",
                     GeneCoordinate("1", 97915614), GeneCoordinate("chr1", 97450058)),
            RsIdInfo("rs72549309", "GATGA", "GATGA",
                     GeneCoordinate("1", 98205966), GeneCoordinate("chr1", 97740410)),
            RsIdInfo("rs1801159", "T", "T",
                     GeneCoordinate("1", 97981395), GeneCoordinate("chr1", 97515839)),
            RsIdInfo("rs72549303", "TG", "TC",
                     GeneCoordinate("1", 97915621), GeneCoordinate("chr1", 97450065)),
            RsIdInfo("rs1801265", "G", "A",
                     GeneCoordinate("1", 98348885), GeneCoordinate("chr1", 97883329)),
        ]),
        frozenset([
            DrugInfo("5-Fluorouracil", "https://www.source_url.org/5-Fluorouracil"),
            DrugInfo("Capecitabine", "https://www.source_url.org/Capecitabine"),
        ]),
        {
            "rs72549303": Annotation("6744CA>GA", "6744GA>CA"),
            "rs1801265": Annotation("85C>T", "85T>C"),
        },
    )

    # --- expected FAKE (no ref seq difference annotations) -----------------
    no_difference_annotations: Dict[str, Annotation] = {}
    fake_expected = GeneInfo(
        "FAKE",
        "*1",
        frozenset([
            Haplotype("*4A", "Reduced Function",
                      frozenset([Variant("rs1212125", "C")])),
        ]),
        frozenset([
            RsIdInfo("rs1212125", "T", "T",
                     GeneCoordinate("5", 97915617), GeneCoordinate("chr5", 97450060)),
        ]),
        frozenset([DrugInfo("Aspirin", "https://www.source_url.org/Aspirin")]),
        no_difference_annotations,
    )

    # --- expected FAKE2 ----------------------------------------------------
    fake2_expected = GeneInfo(
        "FAKE2",
        "*1",
        frozenset([
            Haplotype("*4A", "Reduced Function",
                      frozenset([Variant("rs1212127", "C")])),
        ]),
        frozenset([
            RsIdInfo("rs1212127", "C", "T",
                     GeneCoordinate("16", 97915617), GeneCoordinate("chr16", 97450060)),
        ]),
        frozenset([DrugInfo("Aspirin", "https://www.source_url.org/Aspirin")]),
        {"rs1212127": Annotation("1324C>T", "1324T>C")},
    )

    expected_panel = Panel(
        "fake_panel",
        "0.3",
        frozenset([dpyd_expected, fake_expected, fake2_expected]),
    )
    self.assertEqual(expected_panel, loaded_panel)