Ejemplo n.º 1
0
 def __get_translated_filter_and_variant_annotation(
     cls,
     call: SimpleCall,
     panel: Panel,
     call_reference_assembly: ReferenceAssembly,
     translated_reference_allele: Optional[str],
 ) -> Tuple[FullCallFilter, str]:
     """
     Determine the filter and variant annotation of a call after translation.

     Three situations are distinguished:
     * the panel has a ref seq difference annotation for the call location,
     * the call matches an rs id of the panel (no ref seq difference),
     * the call is unknown to the panel.

     In the cases where the alleles are not as expected, a "?" is appended to
     the annotation of the call, the filter is UNKNOWN and a warning is logged.

     Returns a (filter, variant annotation) tuple.
     """
     annotated_alleles = cls.__get_annotated_alleles(
         call, call_reference_assembly, translated_reference_allele)

     location_has_ref_seq_difference = panel.has_ref_seq_difference_annotation(
         call.gene, call.start_coordinate, call.reference_allele,
         call_reference_assembly)

     if not location_has_ref_seq_difference:
         if panel.contains_rs_id_matching_call(call, call_reference_assembly):
             # Known variant and no ref seq differences involved:
             # the annotation of the call is kept unchanged.
             known_annotation = call.variant_annotation
             if call.is_pass():
                 return FullCallFilter.PASS, known_annotation
             return FullCallFilter.NO_CALL, known_annotation

         # Unknown variant, no ref seq difference involved.
         unknown_annotation = call.variant_annotation + "?"
         logging.warning(
             f"Unknown variant. Check whether annotation is correct: "
             f"found alleles=({annotated_alleles[0]}, {annotated_alleles[1]}), "
             f"annotation={unknown_annotation}")
         return FullCallFilter.UNKNOWN, unknown_annotation

     # Both checks are evaluated up front, before any branch is chosen.
     every_allele_is_ref_to_opposite_assembly = all(
         cls.__is_ref_allele_to_opposite_assembly_due_to_ref_sequence_difference(
             allele, call_reference_assembly)
         for allele in annotated_alleles)
     every_allele_is_ref_to_some_assembly = all(
         cls.__allele_is_ref_to_a_reference_assembly(
             allele, call_reference_assembly)
         for allele in annotated_alleles)

     if every_allele_is_ref_to_opposite_assembly:
         return FullCallFilter.PASS, REF_CALL_ANNOTATION_STRING

     if every_allele_is_ref_to_some_assembly:
         difference_annotation = panel.get_ref_seq_difference_annotation(
             call.gene, call.start_coordinate, call.reference_allele,
             call_reference_assembly)
         if call.is_pass():
             return FullCallFilter.PASS, difference_annotation
         return FullCallFilter.INFERRED_PASS, difference_annotation

     # At least one allele is not a reference allele of a reference assembly.
     unexpected_annotation = call.variant_annotation + "?"
     logging.warning(
         f"Unexpected allele in ref seq difference location. Check whether annotation is correct: "
         f"found alleles=({annotated_alleles[0]}, {annotated_alleles[1]}), "
         f"annotation={unexpected_annotation}")
     return FullCallFilter.UNKNOWN, unexpected_annotation
Ejemplo n.º 2
0
    def get_genotype_tsv_text(cls, pgx_analysis: PgxAnalysis, panel: Panel,
                              version: str) -> str:
        """
        Build the genotype TSV text for the haplotype calls of an analysis.

        The header line is followed by one line per haplotype call, ordered by
        gene and then by haplotype name. A gene without haplotype calls gets a
        single line holding the unresolved-haplotype placeholder values.

        :param pgx_analysis: analysis providing the haplotype calls per gene
        :param panel: panel providing the genes, drugs, haplotype functions and id
        :param version: version string written as the last column of every line
        :return: the TSV text, ending with a newline
        :raises AssertionError: if the genes of the analysis differ from the
            genes of the panel
        """
        gene_to_haplotype_calls = pgx_analysis.get_gene_to_haplotype_calls()

        genes_in_analysis = set(gene_to_haplotype_calls.keys())
        genes_in_panel = panel.get_genes()
        # Raised explicitly instead of using `assert`, so that the check is not
        # stripped when Python runs with -O. The exception type is kept so that
        # existing callers observe the same error.
        if genes_in_analysis != genes_in_panel:
            raise AssertionError(
                f"Gene lists inconsistent.\n"
                f"From analysis={sorted(genes_in_analysis)}\n"
                f"From panel={sorted(genes_in_panel)}")

        # Per gene: (joined drug names, joined prescription urls), both in the
        # same drug order so that the two columns line up.
        gene_to_drug_info = {}
        for gene_info in panel.get_gene_infos():
            sorted_drugs = sorted(
                gene_info.drugs,
                key=lambda info: (info.name, info.url_prescription_info))
            gene_to_drug_info[gene_info.gene] = (
                cls.DRUG_SEPARATOR.join(drug.name for drug in sorted_drugs),
                cls.DRUG_SEPARATOR.join(
                    drug.url_prescription_info for drug in sorted_drugs),
            )

        header = cls.TSV_SEPARATOR.join(cls.GENOTYPE_TSV_COLUMNS)
        lines = [header]
        for gene in sorted(gene_to_haplotype_calls):
            haplotype_calls = gene_to_haplotype_calls[gene]
            if haplotype_calls:
                for haplotype_call in sorted(
                        haplotype_calls,
                        key=lambda call: call.haplotype_name):
                    lines.append(
                        cls.TSV_SEPARATOR.join([
                            gene,
                            haplotype_call.haplotype_name,
                            cls.__get_zygosity(haplotype_call),
                            panel.get_haplotype_function(
                                gene, haplotype_call.haplotype_name),
                            gene_to_drug_info[gene][0],
                            gene_to_drug_info[gene][1],
                            panel.get_id(),
                            version,
                        ]))
            else:
                # No haplotype could be called for this gene.
                lines.append(
                    cls.TSV_SEPARATOR.join([
                        gene,
                        cls.UNRESOLVED_HAPLOTYPE_STRING,
                        cls.NOT_APPLICABLE_ZYGOSITY_STRING,
                        UNKNOWN_FUNCTION_STRING,
                        gene_to_drug_info[gene][0],
                        gene_to_drug_info[gene][1],
                        panel.get_id(),
                        version,
                    ]))
        return "\n".join(lines) + "\n"
Ejemplo n.º 3
0
 def __get_translated_reference_allele(
         cls, call: SimpleCall, panel: Panel,
         call_reference_assembly: ReferenceAssembly) -> Optional[str]:
     """
     Look up the reference allele of the call for the opposite reference assembly.

     Returns None when the call does not match an rs id of the panel.
     """
     if not panel.contains_rs_id_matching_call(call, call_reference_assembly):
         # Unknown variant: the panel has no rs id info to translate with.
         return None

     matching_info = panel.get_matching_rs_id_info(
         call.start_coordinate, call.reference_allele,
         call_reference_assembly)
     cls.__assert_rs_id_call_matches_info(call.rs_ids,
                                          (matching_info.rs_id, ))
     opposite_assembly = call_reference_assembly.opposite()
     return matching_info.get_reference_allele(opposite_assembly)
Ejemplo n.º 4
0
    def __get_example_panel(cls) -> Panel:
        """Build the example panel "Panel" (version "0.2") with genes DPYD, FAKE and FAKE2."""
        # Variants referenced by the haplotypes below.
        variant_2a = Variant("rs3918290", "T")
        variant_2b = Variant("rs1801159", "C")
        variant_3 = Variant("rs72549303", "TG")
        variant_fake = Variant("rs1212125", "C")
        variant_fake2 = Variant("rs1212127", "C")

        dpyd_gene_info = GeneInfo(
            "DPYD",
            "*1",
            frozenset({
                Haplotype("*2A", "No Function", frozenset({variant_2a})),
                Haplotype("*2B", "No Function",
                          frozenset({variant_2a, variant_2b})),
                Haplotype("*3", "Normal Function", frozenset({variant_3})),
            }),
            frozenset({
                RsIdInfo("rs3918290", "C", "C",
                         GeneCoordinate("1", 97915614),
                         GeneCoordinate("chr1", 97450058)),
                RsIdInfo("rs72549309", "GATGA", "GATGA",
                         GeneCoordinate("1", 98205966),
                         GeneCoordinate("chr1", 97740410)),
                RsIdInfo("rs1801159", "T", "T",
                         GeneCoordinate("1", 97981395),
                         GeneCoordinate("chr1", 97515839)),
                RsIdInfo("rs72549303", "TG", "TC",
                         GeneCoordinate("1", 97915621),
                         GeneCoordinate("chr1", 97450065)),
            }),
            frozenset({
                DrugInfo(
                    "5-Fluorouracil",
                    "https://www.pharmgkb.org/chemical/PA128406956/guidelineAnnotation/PA166104939"
                ),
                DrugInfo(
                    "Capecitabine",
                    "https://www.pharmgkb.org/chemical/PA448771/guidelineAnnotation/PA166104963"
                ),
            }),
            {"rs72549303": Annotation("6744CA>GA", "6744GA>CA")},
        )

        # FAKE has no ref seq difference annotations at all.
        no_difference_annotations: Dict[str, Annotation] = {}
        fake_gene_info = GeneInfo(
            "FAKE",
            "*1",
            frozenset({
                Haplotype("*4A", "Reduced Function",
                          frozenset({variant_fake})),
            }),
            frozenset({
                RsIdInfo("rs1212125", "T", "T",
                         GeneCoordinate("5", 97915617),
                         GeneCoordinate("chr5", 97450060)),
            }),
            frozenset({
                DrugInfo("Aspirin", "https://www.pharmgkb.org/some_other_url"),
            }),
            no_difference_annotations,
        )

        fake2_gene_info = GeneInfo(
            "FAKE2",
            "*1",
            frozenset({
                Haplotype("*4A", "Reduced Function",
                          frozenset({variant_fake2})),
            }),
            frozenset({
                RsIdInfo("rs1212127", "C", "T",
                         GeneCoordinate("16", 97915617),
                         GeneCoordinate("chr16", 97450060)),
            }),
            frozenset({
                DrugInfo("Aspirin", "https://www.pharmgkb.org/some_other_url"),
            }),
            {"rs1212127": Annotation("1324C>T", "1324T>C")},
        )

        all_gene_infos = frozenset(
            {dpyd_gene_info, fake_gene_info, fake2_gene_info})
        return Panel("Panel", "0.2", all_gene_infos)
Ejemplo n.º 5
0
    def test_haplotype_reporter_empty(self) -> None:
        """An empty analysis with an empty panel gives only the header line"""
        empty_analysis = PgxAnalysis(FullCallData(frozenset()), {})
        empty_panel = Panel("EmptyPanel", "0.3", frozenset())

        actual_text = HaplotypeReporter.get_genotype_tsv_text(
            empty_analysis, empty_panel, "V1")

        expected_text = "gene\thaplotype\tzygosity\tfunction\tlinked_drugs\turl_prescription_info\tpanel_version\trepo_version\n"
        self.assertEqual(expected_text, actual_text)
Ejemplo n.º 6
0
def load_panel(panel_path: str) -> Panel:
    """
    Load a manually annotated JSON panel file.

    :param panel_path: path of the JSON panel file
    :return: the panel built from the parsed JSON by Panel.from_json
    :raises FileNotFoundError: if the file cannot be opened or read; the
        underlying OSError is attached as the cause
    """
    try:
        # Read-only mode: the file is never written, and "r+" would needlessly
        # fail on files without write permission.
        with open(panel_path, "r", encoding="utf-8") as json_file:
            panel_json = json.load(json_file)
    except OSError as error:
        raise FileNotFoundError(
            f"Panel file {panel_path} not found or cannot be opened."
        ) from error
    # Kept outside the try block so that only file access errors are translated.
    return Panel.from_json(panel_json)
Ejemplo n.º 7
0
    def test_panel_with_repeat_gene_names(self) -> None:
        """Error when the same gene appears more than once in a panel"""
        panel_name = "FakePanel"
        panel_version = "1.0"

        # All gene infos share the same empty contents; only the gene name and
        # the reference haplotype name vary.
        no_haplotypes: FrozenSet[Haplotype] = frozenset()
        no_rs_id_infos: FrozenSet[RsIdInfo] = frozenset()
        no_drugs: FrozenSet[DrugInfo] = frozenset()
        no_annotations: Dict[str, Annotation] = {}

        def make_gene_info(gene: str, reference_haplotype_name: str) -> GeneInfo:
            return GeneInfo(
                gene,
                reference_haplotype_name,
                no_haplotypes,
                no_rs_id_infos,
                no_drugs,
                no_annotations,
            )

        fake_gene_info = make_gene_info("FAKE", "*1")
        other_gene_info = make_gene_info("OTHER", "*1")
        fake_gene_info_again = make_gene_info("FAKE", "*1_something else")

        distinct_genes = frozenset([fake_gene_info, other_gene_info])
        repeated_gene = frozenset([fake_gene_info, fake_gene_info_again])

        # Different gene names: construction succeeds.
        Panel(panel_name, panel_version, distinct_genes)
        # Same gene name twice: construction fails.
        with self.assertRaises(ValueError):
            Panel(panel_name, panel_version, repeated_gene)
Ejemplo n.º 8
0
 def get_gene_to_haplotypes_call(
         cls, full_call_data: FullCallData,
         panel: Panel) -> Dict[str, Set[HaplotypeCall]]:
     """
     Call haplotypes for every gene info of the panel.

     Returns a dictionary from gene to the haplotype calling result for that
     gene. An info message is logged per gene before calling.
     """
     calls_per_gene = {}
     for info in panel.get_gene_infos():
         gene = info.gene
         logging.info(f"Calling haplotypes for {gene}")
         calls_per_gene[gene] = cls.__get_haplotypes_call(full_call_data, info)
     return calls_per_gene
Ejemplo n.º 9
0
 def __coordinates_of_call_overlap_with_panel_coordinates(
         cls, call_index: int, panel: Panel, variants: Dict[str, Any],
         vcf_reference_assembly: ReferenceAssembly) -> bool:
     """
     Tell whether any relevant coordinate of the call is a coordinate of an
     rs id in the panel, for the reference assembly of the VCF.
     """
     chromosome = cls.__get_chromosome_from_variants(call_index, variants)
     position = cls.__get_position_from_variants(call_index, variants)
     reference_allele = cls.__get_reference_allele_from_variants(
         call_index, variants)

     candidate_coordinates = cls.__get_relevant_coordinates(
         chromosome, position, reference_allele)
     for coordinate in candidate_coordinates:
         # Stop at the first coordinate that the panel knows.
         if panel.contains_rs_id_with_coordinate(coordinate,
                                                 vcf_reference_assembly):
             return True
     return False
Ejemplo n.º 10
0
 def __fill_in_rs_ids_if_needed(
         cls, call: SimpleCall, panel: Panel,
         reference_assembly: ReferenceAssembly) -> SimpleCall:
     """
     Return a copy of the call with the rs id of the matching panel entry when
     the rs ids of the call are (".", ) and the panel has a matching rs id.

     In every other case the call itself is returned unchanged.
     """
     call_lacks_rs_ids = call.rs_ids == (".", )
     if not call_lacks_rs_ids or not panel.contains_rs_id_matching_call(
             call, reference_assembly):
         return call

     matching_info = panel.get_matching_rs_id_info(call.start_coordinate,
                                                   call.reference_allele,
                                                   reference_assembly)
     rs_ids: Tuple[str, ...] = (matching_info.rs_id, )
     # Everything except the rs ids is taken over from the original call.
     return SimpleCall(
         call.start_coordinate,
         call.reference_allele,
         call.alleles,
         call.gene,
         rs_ids,
         call.variant_annotation,
         call.filter,
     )
Ejemplo n.º 11
0
    def __get_calls_for_panel_variants_without_calls(
            cls, simple_call_data: SimpleCallData,
            panel: Panel) -> FrozenSet[SimpleCall]:
        """
        Create reference calls for panel rs ids for which no call was found.

        An rs id of the panel gets such a call when its rs id does not occur in
        the found calls and none of its relevant coordinates is covered by a
        found call. The created calls are REF/REF with filter NO_CALL.
        """
        assembly = simple_call_data.reference_assembly

        # Gather what the found calls already cover.
        found_rs_ids = set()
        covered_coordinates = set()
        for found_call in simple_call_data.calls:
            found_rs_ids.update(
                rs_id for rs_id in found_call.rs_ids if rs_id != ".")
            covered_coordinates.update(found_call.get_relevant_coordinates())

        inferred_ref_calls = set()
        for gene_info in panel.get_gene_infos():
            for rs_id_info in gene_info.rs_id_infos:
                overlap_with_found_calls = rs_id_info.get_relevant_coordinates(
                    assembly).intersection(covered_coordinates)
                if overlap_with_found_calls or rs_id_info.rs_id in found_rs_ids:
                    # Already (partially) handled by a found call.
                    continue

                # Assuming REF/REF relative to reference assembly
                ref_allele = rs_id_info.get_reference_allele(assembly)
                inferred_ref_calls.add(
                    SimpleCall(
                        rs_id_info.get_start_coordinate(assembly),
                        ref_allele,
                        (ref_allele, ref_allele),
                        gene_info.gene,
                        (rs_id_info.rs_id, ),
                        REF_CALL_ANNOTATION_STRING,
                        SimpleCallFilter.NO_CALL,
                    ))
        return frozenset(inferred_ref_calls)
Ejemplo n.º 12
0
 def __rs_id_exists_in_panel(cls, call_index: int, panel: Panel,
                             variants: Dict[str, Any]) -> bool:
     """Tell whether at least one rs id of the call is contained in the panel."""
     for rs_id in cls.__get_rs_ids_from_variants(call_index, variants):
         if panel.contains_rs_id(rs_id):
             return True
     return False
Ejemplo n.º 13
0
 def __assert_gene_in_panel(cls, gene: str, panel: Panel) -> None:
     """Raise a ValueError when the gene is not one of the genes of the panel."""
     gene_is_known = gene in panel.get_genes()
     if gene_is_known:
         return
     raise ValueError(f"Call for unknown gene:\ngene={gene}")
Ejemplo n.º 14
0
    def test_panel_with_overlapping_rs_id_infos_for_different_genes(
            self) -> None:
        """Overlapping rs id infos in different genes are an error, unless they are exactly the same"""
        panel_name = "FakePanel"
        panel_version = "1.0"

        v37_chromosome = "X"
        v38_chromosome = "chrX"

        no_haplotypes: FrozenSet[Haplotype] = frozenset()
        no_drugs: FrozenSet[DrugInfo] = frozenset()
        no_annotations: Dict[str, Annotation] = {}

        def make_gene_info(gene: str,
                           rs_id_infos: FrozenSet[RsIdInfo]) -> GeneInfo:
            return GeneInfo(
                gene,
                "*1",
                no_haplotypes,
                rs_id_infos,
                no_drugs,
                no_annotations,
            )

        two_base_info = RsIdInfo(
            "rs294924",
            "AT",
            "AT",
            GeneCoordinate(v37_chromosome, 499593),
            GeneCoordinate(v38_chromosome, 399483),
        )
        unrelated_info = RsIdInfo(
            "rs3949923",
            "C",
            "C",
            GeneCoordinate(v37_chromosome, 293993),
            GeneCoordinate(v38_chromosome, 1388323),
        )
        # Starts one position before two_base_info and spans two bases.
        overlapping_info = RsIdInfo(
            "rs12993",
            "GG",
            "GG",
            GeneCoordinate(v37_chromosome, 499592),
            GeneCoordinate(v38_chromosome, 399482),
        )

        fake_gene_info = make_gene_info("FAKE", frozenset([two_base_info]))
        other_gene_info_with_same_info = make_gene_info(
            "OTHER", frozenset([two_base_info, unrelated_info]))
        other_gene_info_with_overlap = make_gene_info(
            "OTHER", frozenset([overlapping_info]))

        # Exactly the same rs id info in two genes is accepted.
        Panel(panel_name, panel_version,
              frozenset([fake_gene_info, other_gene_info_with_same_info]))
        # Overlapping rs id infos in two genes are rejected.
        with self.assertRaises(ValueError):
            Panel(panel_name, panel_version,
                  frozenset([fake_gene_info, other_gene_info_with_overlap]))
Ejemplo n.º 15
0
    def test_load_panel(self) -> None:
        """Panel loaded from the test json equals the hand-built panel"""
        loaded_panel = load_panel(str(get_test_resource("test_panel.json")))

        # Variants referenced by the expected haplotypes.
        variant_2a = Variant("rs3918290", "T")
        variant_2b = Variant("rs1801159", "C")
        variant_3 = Variant("rs72549303", "TG")
        variant_fake = Variant("rs1212125", "C")
        variant_fake2 = Variant("rs1212127", "C")

        expected_dpyd = GeneInfo(
            "DPYD",
            "*1",
            frozenset({
                Haplotype("*2A", "No Function", frozenset({variant_2a})),
                Haplotype("*2B", "No Function",
                          frozenset({variant_2a, variant_2b})),
                Haplotype("*3", "Normal Function", frozenset({variant_3})),
            }),
            frozenset({
                RsIdInfo("rs3918290", "C", "C",
                         GeneCoordinate("1", 97915614),
                         GeneCoordinate("chr1", 97450058)),
                RsIdInfo("rs72549309", "GATGA", "GATGA",
                         GeneCoordinate("1", 98205966),
                         GeneCoordinate("chr1", 97740410)),
                RsIdInfo("rs1801159", "T", "T",
                         GeneCoordinate("1", 97981395),
                         GeneCoordinate("chr1", 97515839)),
                RsIdInfo("rs72549303", "TG", "TC",
                         GeneCoordinate("1", 97915621),
                         GeneCoordinate("chr1", 97450065)),
                RsIdInfo("rs1801265", "G", "A",
                         GeneCoordinate("1", 98348885),
                         GeneCoordinate("chr1", 97883329)),
            }),
            frozenset({
                DrugInfo("5-Fluorouracil",
                         "https://www.source_url.org/5-Fluorouracil"),
                DrugInfo("Capecitabine",
                         "https://www.source_url.org/Capecitabine"),
            }),
            {
                "rs72549303": Annotation("6744CA>GA", "6744GA>CA"),
                "rs1801265": Annotation("85C>T", "85T>C"),
            },
        )

        # FAKE is expected without any ref seq difference annotation.
        no_difference_annotations: Dict[str, Annotation] = {}
        expected_fake = GeneInfo(
            "FAKE",
            "*1",
            frozenset({
                Haplotype("*4A", "Reduced Function",
                          frozenset({variant_fake})),
            }),
            frozenset({
                RsIdInfo("rs1212125", "T", "T",
                         GeneCoordinate("5", 97915617),
                         GeneCoordinate("chr5", 97450060)),
            }),
            frozenset({
                DrugInfo("Aspirin", "https://www.source_url.org/Aspirin"),
            }),
            no_difference_annotations,
        )

        expected_fake2 = GeneInfo(
            "FAKE2",
            "*1",
            frozenset({
                Haplotype("*4A", "Reduced Function",
                          frozenset({variant_fake2})),
            }),
            frozenset({
                RsIdInfo("rs1212127", "C", "T",
                         GeneCoordinate("16", 97915617),
                         GeneCoordinate("chr16", 97450060)),
            }),
            frozenset({
                DrugInfo("Aspirin", "https://www.source_url.org/Aspirin"),
            }),
            {"rs1212127": Annotation("1324C>T", "1324T>C")},
        )

        expected_panel = Panel(
            "fake_panel",
            "0.3",
            frozenset({expected_dpyd, expected_fake, expected_fake2}),
        )

        self.assertEqual(expected_panel, loaded_panel)