def test_region_df():
    """Check region-based column selection through the DataFrame ``genomics`` accessor.

    Four single-variant columns (var1..var4) are built, then ``in_regions`` and
    ``not_in_regions`` are compared against the expected column subsets.
    """
    # (chromosome, position) for var1..var4, in column order
    loci = [("chr1", 999), ("chr1", 6789), ("chr2", 999), ("chr3", 25622)]
    variants = [
        Variant(chrom, position=pos, ref="A", alt=["a"]) for chrom, pos in loci
    ]

    columns = {}
    for number, variant in enumerate(variants, start=1):
        columns[f"var{number}"] = GenotypeArray(
            [variant.make_genotype_from_str("A/A")] * 10
        )
    df = pd.DataFrame(columns)

    # A single region keeps only the variant located inside it
    inside = df.genomics.in_regions(Region("chr1", 900, 1000))
    assert_frame_equal(inside, df[["var1"]])

    # The complement keeps every other column
    outside = df.genomics.not_in_regions(Region("chr1", 900, 1000))
    assert_frame_equal(outside, df[["var2", "var3", "var4"]])

    # A list of regions keeps variants found in any of them
    inside_either = df.genomics.in_regions(
        [Region("chr1", 900, 1000), Region("chr2", 900, 1000)]
    )
    assert_frame_equal(inside_either, df[["var1", "var3"]])
def test_region_series():
    """Check ``contained_by`` on the Series ``genomics`` accessor.

    The series holds genotypes of a variant at chr1:999.
    """
    variant = Variant("chr1", position=999, ref="A", alt=["a"])
    homozygous_ref = variant.make_genotype_from_str("A/A")
    series = pd.Series(GenotypeArray([homozygous_ref] * 10))

    # Same chromosome, position within the bounds
    in_matching_region = series.genomics.contained_by(Region("chr1", 900, 1000))
    assert in_matching_region

    # Different chromosome
    in_other_chromosome = series.genomics.contained_by(Region("chr2", 900, 1000))
    assert not in_other_chromosome

    # Region ending at 999 is expected not to contain position 999
    in_region_ending_at_position = series.genomics.contained_by(
        Region("chr1", 900, 999)
    )
    assert not in_region_ending_at_position
def __get_data_for_encoding():
    """Build a five-element GenotypeArray: A/A, A/T, T/T, T/C and one genotype
    created without any alleles."""
    variant = Variant(id=None, ref="A", alt=["T", "C"])
    # The empty tuple produces the ``make_genotype()`` call with no alleles
    allele_sets = [("A", "A"), ("A", "T"), ("T", "T"), ("T", "C"), ()]
    return GenotypeArray([variant.make_genotype(*alleles) for alleles in allele_sets])
def data_missing():
    """Length-2 array with [NA, Valid]"""
    variant_fields = dict(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
    )
    variant = Variant(**variant_fields)
    # A genotype created without alleles serves as the NA entry
    missing = variant.make_genotype()
    valid = variant.make_genotype("T", "T")
    return GenotypeArray(values=[missing, valid])
def ga_AA_Aa_aa_BB_Bb_bb():
    """Six-genotype array: A/A, A/a, a/a, B/B, B/b, b/b (in that order)."""
    var = Variant("chr1", ref="A", alt=["a", "B", "b"])
    genotype_strings = ("A/A", "A/a", "a/a", "B/B", "B/b", "b/b")
    return GenotypeArray(
        [var.make_genotype_from_str(gt_str) for gt_str in genotype_strings]
    )
def ga_nothwe():
    """1000-sample array not in HWE"""
    var = Variant("chr1", ref="A", alt=["a"])
    # (genotype string, number of samples): no heterozygotes at all
    composition = (("A/A", 800), ("A/a", 0), ("a/a", 200))
    genotypes = []
    for gt_str, n_samples in composition:
        # One genotype object per class, repeated n_samples times
        genotypes = genotypes + [var.make_genotype_from_str(gt_str)] * n_samples
    return GenotypeArray(genotypes)
def na_value():
    """The scalar missing value for this type. Default 'None'"""
    variant_fields = dict(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
        score=30,
    )
    # A genotype made without alleles is used as the missing scalar
    missing_genotype = Variant(**variant_fields).make_genotype()
    return missing_genotype
def ga_inhwe():
    """
    1000-sample array in HWE
    """
    var = Variant("chr1", ref="A", alt=["a"])
    # (genotype string, number of samples)
    composition = (("A/A", 640), ("A/a", 320), ("a/a", 40))
    genotypes = []
    for gt_str, n_samples in composition:
        # One genotype object per class, repeated n_samples times
        genotypes = genotypes + [var.make_genotype_from_str(gt_str)] * n_samples
    return GenotypeArray(genotypes)
def _validate_params(pen_table, penetrance_base, penetrance_diff, snp1, snp2): """Validate parameters and calculate final penetrance table""" # Process Enum if type(pen_table) is PenetranceTables: pen_table = np.array(pen_table.value).reshape((3, 3)) elif isinstance(pen_table, np.ndarray): if pen_table.shape != (3, 3): raise ValueError(f"Incorrect shape for pen_table, must be 3x3") else: raise ValueError( f"pen_table must be a 3x3 numpy array or PenetranceTables enum, not {type(pen_table)}" ) if (pen_table < 0).any(): raise ValueError(f"Penetrance table values cannot be negative.") # Scale penetrance table if needed if (pen_table.min() != 0) or (pen_table.max() != 1): pen_table_min = pen_table.min() pen_table_range = pen_table.max() - pen_table_min if pen_table_range > 0: pen_table = (pen_table - pen_table_min) / pen_table_range # Otherwise the penetrance table is flat, i.e. a null model # Process base and diff if (penetrance_base < 0) or (penetrance_base > 1): raise ValueError( f"penetrance_base must be in [0,1], {penetrance_base} was outside this range" ) if penetrance_diff is None: penetrance_diff = 1 - (2 * penetrance_base) elif penetrance_diff < 0: raise ValueError("penetrance_diff must be > 0") elif (penetrance_diff + penetrance_base) > 1: raise ValueError(f"penetrance_base + penetrance_diff must be <= 1") # SNPs if snp1 is None: snp1 = Variant(id="rs1", ref="A", alt=["a"]) if snp2 is None: snp2 = Variant(id="rs2", ref="B", alt=["b"]) if len(snp1.alt) != 1: raise ValueError(f"SNP1 is not Bialleleic: {snp1}") if len(snp2.alt) != 1: raise ValueError(f"SNP2 is not Bialleleic: {snp2}") # Create final pen_table pen_table = penetrance_base + penetrance_diff * pen_table return pen_table, snp1, snp2
def data_missing_for_sorting():
    """Length-3 array with a known sort order.

    This should be three items [B, NA, A] with
    A < B and NA missing.
    """
    variant_fields = dict(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
    )
    variant = Variant(**variant_fields)
    smaller = variant.make_genotype("A", "A")  # "A" in the docstring
    larger = variant.make_genotype("A", "T")  # "B" in the docstring
    missing = variant.make_genotype()
    ordered = [larger, missing, smaller]
    return GenotypeArray(values=ordered)
def data_for_grouping():
    """Data for factorization, grouping, and unique tests.

    Expected to be like [B, B, NA, NA, A, A, B, C]
    Where A < B < C and NA is missing
    """
    variant_fields = dict(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
    )
    variant = Variant(**variant_fields)
    # One genotype object per label; the layout below reuses them
    by_label = {
        "A": variant.make_genotype("A", "A"),
        "B": variant.make_genotype("A", "T"),
        "C": variant.make_genotype("T", "T"),
        "NA": variant.make_genotype(),
    }
    layout = ("B", "B", "NA", "NA", "A", "A", "B", "C")
    return GenotypeArray([by_label[label] for label in layout])
def var_complete_triploid():
    """Return a Variant with ploidy 3, ref "A" and alternate alleles C, G, T."""
    alt_alleles = ["C", "G", "T"]
    # NOTE(review): the third positional value reads "complete_diploid" even
    # though ploidy=3 - confirm whether that is intentional.
    triploid = Variant(
        "12", 12345678, "complete_diploid", ref="A", alt=alt_alleles, ploidy=3
    )
    return triploid
def data_for_sorting():
    """Length-3 array with a known sort order.

    This should be three items [B, C, A] with
    A < B < C
    """
    variant_fields = dict(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
        score=30,
    )
    variant = Variant(**variant_fields)
    smallest = variant.make_genotype("A", "A")  # "A" in the docstring
    middle = variant.make_genotype("A", "T")  # "B" in the docstring
    largest = variant.make_genotype("T", "T")  # "C" in the docstring
    ordered = [middle, largest, smallest]
    return GenotypeArray(values=ordered)
def test_HWE(ga_inhwe, ga_nothwe):
    """Check ``GenotypeArray.hwe_pval`` for several arrays.

    Parameters
    ----------
    ga_inhwe, ga_nothwe
        Fixture-provided arrays expected to give a p-value of exactly 1.0 and
        a p-value below 1e-20 respectively.
    """
    var = Variant("chr1", ref="A", alt=["a"])

    # A single genotype: the p-value is expected to be NaN.
    # np.isnan is used instead of ``is np.nan`` because an identity check only
    # passes when the implementation returns the np.nan singleton itself.
    ga_onevar = GenotypeArray([var.make_genotype_from_str("A/A")])
    assert np.isnan(ga_onevar.hwe_pval)

    assert ga_inhwe.hwe_pval == 1.0
    assert ga_nothwe.hwe_pval < 1e-20

    # NaN for non-diploid
    var = Variant("chr1", ref="A", alt=["B", "C"], ploidy=3)
    ga_triploid = GenotypeArray(
        [var.make_genotype_from_str("A/A/A")] * 50
        + [var.make_genotype_from_str("A/A/B")] * 50,
    )
    assert np.isnan(ga_triploid.hwe_pval)
def encoding_df():
    """
    5 variants, 5 genotypes each:
    Homozygous Ref
    Heterozygous
    Homozygous Alt
    Missing one allele
    Missing both alleles
    """
    columns = {}
    for idx, ref_allele in enumerate("ABCDE"):
        # The alternate allele is the lower-case form of the reference allele
        alt_allele = ref_allele.lower()
        var = Variant(
            chromosome="chr1",
            position=idx + 1,
            id=f"rs{idx+1}",
            ref=ref_allele,
            alt=[alt_allele],
        )
        genotypes = [
            var.make_genotype(ref_allele, ref_allele),
            var.make_genotype(ref_allele, alt_allele),
            var.make_genotype(alt_allele, alt_allele),
            var.make_genotype(ref_allele),
            var.make_genotype(),
        ]
        columns[f"var{idx}"] = GenotypeArray(genotypes)
    return pd.DataFrame(columns)
def dtype():
    """Return a GenotypeDtype built around a chr1:123456 variant (ref A, alt T/G, score 30)."""
    variant_fields = dict(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
        score=30,
    )
    return GenotypeDtype(variant=Variant(**variant_fields))
def __init__(self, variant: Optional[Variant] = None):
    """Store the variant and derive the numpy record layout from its ploidy.

    Parameters
    ----------
    variant : Variant, optional
        Variant described by this dtype; a default ``Variant()`` is created
        when omitted.
    """
    self.variant = Variant() if variant is None else variant

    # Data backing the GenotypeArray is stored as a numpy structured array:
    #  - one unsigned integer per allele in the genotype, indexing the list
    #    of possible alleles
    #  - one unsigned integer for the genotype score (255 if missing)
    allele_field = ("allele_idxs", np.uint8, (self.variant.ploidy,))
    score_field = ("gt_score", np.uint8)
    self._record_type = np.dtype([allele_field, score_field])
    self.itemsize = self._record_type.itemsize
def data():
    """Length-100 array for this type.

    * data[0] and data[1] should both be non missing
    * data[0] and data[1] should not be equal
    """
    alleles = ["A", "T", "G"]
    variant_fields = dict(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
        score=30,
    )
    variant = Variant(**variant_fields)

    # The first two entries are fixed and differ from each other
    genotypes = [variant.make_genotype("A", "T"), variant.make_genotype("T", "T")]
    # The remaining 98 entries pair two randomly chosen alleles
    genotypes.extend(
        variant.make_genotype(random.choice(alleles), random.choice(alleles))
        for _ in range(98)
    )
    return GenotypeArray(values=genotypes)
def construct_from_string(cls, string):
    """
    Construct a GenotypeDtype from a string.

    Parameters
    ----------
    string : str
        The string alias for this GenotypeDtype.
        Should be formatted like `genotype(<ploidy>n)[<chromosome>; <position>; <id>; <ref>; <alt>]`
        The pattern in ``cls._match`` also exposes an optional ``score`` group,
        which is converted to an int when present.

    Raises
    ------
    TypeError
        If `string` is not a str, does not match ``cls._match``, or the matched
        fields cannot be turned into a Variant. In the last case the underlying
        error is chained as the cause.

    Examples
    --------
    >>> GenotypeDtype.construct_from_string('genotype(2n)[chr1; 123456; rs12345; A; T,G]')
    genotype(2n)[chr1; 123456; rs12345; A; T,G]
    """
    if not isinstance(string, str):
        raise TypeError(
            f"'construct_from_string' expects a string, got {type(string)}"
        )

    msg = f"Cannot construct a 'GenotypeDtype' from '{string}'"
    try:
        match = cls._match.match(string)
        if match is not None:
            d = match.groupdict()
            # Score is optional, so it may be None
            score = d["score"]
            if score is not None:
                score = int(score)
            variant = Variant(
                chromosome=d["chromosome"],
                position=int(d["position"]),
                id=d["id"],
                ref=d["ref"],
                alt=d["alt"].split(","),
                ploidy=int(d["ploidy"]),
                score=score,
            )
            return cls(variant=variant)
    except Exception as err:
        # Any failure is reported as TypeError, keeping the original error as
        # the cause for debugging.
        raise TypeError(msg) from err
    # The string did not match the expected pattern
    raise TypeError(msg)
def test_is_missing():
    """Check ``GenotypeArray.is_missing`` with and without a missing genotype."""
    var = Variant("chr1", ref="A", alt=["T", "C"])

    # No missing genotypes
    ga_fff = GenotypeArray([var.make_genotype_from_str("A/A") for _ in range(3)])
    expected_none_missing = np.array([False, False, False])
    assert (ga_fff.is_missing == expected_none_missing).all()

    # The middle genotype is created without alleles
    first = var.make_genotype_from_str("A/A")
    middle = var.make_genotype()
    last = var.make_genotype_from_str("A/A")
    ga_ftf = GenotypeArray([first, middle, last])
    expected_middle_missing = np.array([False, True, False])
    assert (ga_ftf.is_missing == expected_middle_missing).all()

    # Comparing against the "./." string is expected to give the same mask
    matches_missing_string = ga_ftf == "./."
    assert (ga_ftf.is_missing == matches_missing_string).all()
def var_min():
    """Return a Variant with ref "A" and an empty list of alternate alleles."""
    no_alt_alleles = []
    minimal = Variant("13", 12345678, "min", ref="A", alt=no_alt_alleles)
    return minimal
def var_two_allele():
    """Return a Variant with ref "A" and a single alternate allele "C"."""
    alt_alleles = ["C"]
    two_allele = Variant("12", 12345678, "complete", ref="A", alt=alt_alleles)
    return two_allele
def test_create_variant():
    """A Variant's ``alleles`` lists the ref allele followed by the alt alleles."""
    ref_allele = "C"
    alt_alleles = ["T"]
    variant = Variant("12", 112161652, "rs12462", ref=ref_allele, alt=alt_alleles)
    expected_alleles = ["C", "T"]
    assert variant.alleles == expected_alleles
def test_methods():
    """Exercise Variant position comparison, allele lookup and allele addition."""

    def build_variant():
        # A fresh alt list per call, so the two variants never share state
        return Variant("12", 112161652, "rs12462", ref="C", alt=["T"])

    variant = build_variant()
    variant_also = build_variant()
    assert variant.is_same_position(variant_also)

    # Get Allele Index
    existing_idx = variant.get_idx_from_allele("T")
    assert existing_idx == 1
    added_idx = variant.get_idx_from_allele("G", add=True)
    assert added_idx == 2
    assert len(variant.alleles) == 3

    # Add Allele
    variant.add_allele("GT")
    assert len(variant.alleles) == 4

    # Is Valid Allele Index
    assert variant.is_valid_allele_idx(1)
    assert not variant.is_valid_allele_idx(10)

    # Same variant despite adding additional alleles
    assert variant.is_same_position(variant_also)

    # But variant not equal
    still_equal = variant == variant_also
    assert not still_equal
""" Test GenotypeDtype """ import pandas as pd import pytest from pandas._testing import assert_series_equal, assert_extension_array_equal from pandas_genomics.arrays import GenotypeDtype from pandas_genomics.scalars import Variant TEST_VAR = Variant( chromosome="12", position=112161652, id="rs12462", ref="T", alt=["C"] ) @pytest.mark.parametrize( "input_str,variant", [ ( "genotype(2n)[12; 112161652; rs12462; T; C]", Variant( chromosome="12", position=112161652, id="rs12462", ref="T", alt=["C"] ), ), ( "genotype(3n)[12; 112161652; rs12462; T; C]", Variant( chromosome="12", position=112161652, id="rs12462", ref="T",
6, 5, marks=pytest.mark.xfail( raises=TypeError, strict=True, reason="chromosome not string"), ), ], ) def test_create_region(chromosome, start, end): region = Region(chromosome, start, end) @pytest.mark.parametrize( "variant,region,result", [ ( Variant(chromosome="chr1", position=1), Region(chromosome="chr1", start=1, end=2), True, ), ( Variant(chromosome="chr1", position=1), Region(chromosome="chr2", start=1, end=2), False, ), ( Variant(chromosome="chr1", position=1), Region(chromosome="1", start=1, end=2), False, ), ( Variant(chromosome="chr1", position=99),
def test():
    """Smoke test: ``sim.generate_random_gt`` is called for a variant with one
    alternate allele and for a variant with two; the results are not inspected."""
    # Single alternate allele with a scalar frequency
    single_alt = Variant(chromosome="1", position=123456, ref="T", alt=["A"])
    gta = sim.generate_random_gt(single_alt, alt_allele_freq=0.3)

    # Two alternate alleles with one frequency per alternate allele
    double_alt = Variant(chromosome="1", position=223456, ref="T", alt=["A", "C"])
    alt_freqs = [0.25, 0.05]
    gta_2 = sim.generate_random_gt(double_alt, alt_allele_freq=alt_freqs)
def test_maf():
    """Check ``GenotypeArray.maf`` for several allele compositions.

    NaN results are checked with ``np.isnan`` rather than ``is np.nan`` (an
    identity check only passes for the np.nan singleton), and the 1/3 case is
    compared with ``np.isclose`` to tolerate floating-point rounding.
    """
    # Zero
    var = Variant("chr1", ref="A", alt=["T", "C"])
    ga_zero = GenotypeArray(
        [
            var.make_genotype_from_str("A/A"),
            var.make_genotype_from_str("A/A"),
            var.make_genotype_from_str("A/A"),
        ]
    )
    assert ga_zero.maf == 0.0

    # Only Missing
    missing = GenotypeArray([var.make_genotype()] * 3)
    assert np.isnan(missing.maf)

    # 50%
    ga_50 = GenotypeArray(
        [
            var.make_genotype_from_str("A/A"),
            var.make_genotype_from_str("T/T"),
            var.make_genotype_from_str("T/A"),
        ]
    )
    assert ga_50.maf == 0.50

    # 2nd of 3 alleles
    ga_2nd = GenotypeArray(
        [
            var.make_genotype_from_str("A/C"),
            var.make_genotype_from_str("C/C"),
            var.make_genotype_from_str("T/T"),
        ]
    )
    assert ga_2nd.maf == 0.5

    # Triploid
    var = Variant("chr1", ref="A", alt=["T", "C"], ploidy=3)
    ga_33 = GenotypeArray(
        [
            var.make_genotype_from_str("A/A/T"),
            var.make_genotype_from_str("A/T/C"),
            var.make_genotype_from_str("A/A/T"),
        ]
    )
    assert np.isclose(ga_33.maf, 1 / 3)