Example #1
0
def test_region_df():
    """Check column selection by genomic region on a DataFrame.

    Builds a frame with one GenotypeArray column per variant (``var1`` ..
    ``var4``) and verifies which columns ``in_regions`` and
    ``not_in_regions`` keep for single and multiple regions.
    """
    variants = [
        Variant("chr1", position=999, ref="A", alt=["a"]),
        Variant("chr1", position=6789, ref="A", alt=["a"]),
        Variant("chr2", position=999, ref="A", alt=["a"]),
        Variant("chr3", position=25622, ref="A", alt=["a"]),
    ]
    columns = {}
    for number, variant in enumerate(variants, start=1):
        # Ten copies of the same homozygous-reference genotype per column
        columns[f"var{number}"] = GenotypeArray(
            [variant.make_genotype_from_str("A/A")] * 10
        )
    df = pd.DataFrame(columns)

    # Only var1 (chr1:999) lies inside the chr1 window
    kept = df.genomics.in_regions(Region("chr1", 900, 1000))
    assert_frame_equal(kept, df[["var1"]])

    # The complement keeps every other column
    dropped = df.genomics.not_in_regions(Region("chr1", 900, 1000))
    assert_frame_equal(dropped, df[["var2", "var3", "var4"]])

    # A list of regions selects the columns falling in any of them
    windows = [Region("chr1", 900, 1000), Region("chr2", 900, 1000)]
    kept_multi = df.genomics.in_regions(windows)
    assert_frame_equal(kept_multi, df[["var1", "var3"]])
Example #2
0
def test_region_series():
    """Check ``contained_by`` on a Series of genotypes for a chr1:999 variant."""
    variant = Variant("chr1", position=999, ref="A", alt=["a"])
    genotype = variant.make_genotype_from_str("A/A")
    series = pd.Series(GenotypeArray([genotype] * 10))

    # Same chromosome, position inside the window
    inside = Region("chr1", 900, 1000)
    assert series.genomics.contained_by(inside)

    # Same window on a different chromosome
    other_chromosome = Region("chr2", 900, 1000)
    assert not series.genomics.contained_by(other_chromosome)

    # Window whose end equals the variant position does not contain it
    ends_at_position = Region("chr1", 900, 999)
    assert not series.genomics.contained_by(ends_at_position)
Example #3
0
 def __get_data_for_encoding():
     """Return a GenotypeArray holding A/A, A/T, T/T, T/C and one genotype built with no alleles."""
     variant = Variant(id=None, ref="A", alt=["T", "C"])
     # The empty tuple produces ``make_genotype()`` with no alleles (the "na" entry)
     allele_sets = [("A", "A"), ("A", "T"), ("T", "T"), ("T", "C"), ()]
     genotypes = [variant.make_genotype(*alleles) for alleles in allele_sets]
     return GenotypeArray(genotypes)
Example #4
0
def data_missing():
    """Return a length-2 array laid out as [NA, Valid]."""
    variant = Variant(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
    )
    # A genotype made with no alleles is the NA entry
    missing = variant.make_genotype()
    valid = variant.make_genotype("T", "T")
    return GenotypeArray(values=[missing, valid])
Example #5
0
def ga_AA_Aa_aa_BB_Bb_bb():
    """Return a six-genotype array (A/A, A/a, a/a, B/B, B/b, b/b) for one four-allele variant."""
    var = Variant("chr1", ref="A", alt=["a", "B", "b"])
    genotype_strings = ("A/A", "A/a", "a/a", "B/B", "B/b", "b/b")
    return GenotypeArray(
        [var.make_genotype_from_str(gt) for gt in genotype_strings]
    )
Example #6
0
def ga_nothwe():
    """Return a 1000-sample array that is not in HWE.

    Composition: 800 A/A, 0 A/a and 200 a/a genotypes.
    """
    var = Variant("chr1", ref="A", alt=["a"])
    counts = (("A/A", 800), ("A/a", 0), ("a/a", 200))
    genotypes = []
    for gt_string, n_samples in counts:
        # One genotype object per class, repeated n_samples times
        genotypes.extend([var.make_genotype_from_str(gt_string)] * n_samples)
    return GenotypeArray(genotypes)
Example #7
0
def na_value():
    """Return the scalar missing value for this type: a genotype made with no alleles."""
    variant_fields = {
        "chromosome": "chr1",
        "position": 123456,
        "id": "rs12345",
        "ref": "A",
        "alt": ["T", "G"],
        "score": 30,
    }
    return Variant(**variant_fields).make_genotype()
Example #8
0
def ga_inhwe():
    """Return a 1000-sample array in HWE.

    Composition: 640 A/A, 320 A/a and 40 a/a genotypes.
    """
    var = Variant("chr1", ref="A", alt=["a"])
    counts = (("A/A", 640), ("A/a", 320), ("a/a", 40))
    genotypes = []
    for gt_string, n_samples in counts:
        # One genotype object per class, repeated n_samples times
        genotypes.extend([var.make_genotype_from_str(gt_string)] * n_samples)
    return GenotypeArray(genotypes)
    def _validate_params(pen_table, penetrance_base, penetrance_diff, snp1,
                         snp2):
        """Validate parameters and calculate final penetrance table

        Parameters
        ----------
        pen_table : PenetranceTables or numpy.ndarray
            Either a PenetranceTables enum member (its value is reshaped to
            3x3) or a 3x3 numpy array with no negative values.
        penetrance_base : float
            Baseline penetrance, must be in [0, 1].
        penetrance_diff : float or None
            Amount added to the baseline where the scaled table equals 1.
            When None it is set to ``1 - (2 * penetrance_base)``.
        snp1, snp2 : Variant or None
            Variants with exactly one alt allele. Defaults (``rs1`` A/a and
            ``rs2`` B/b) are created when None.

        Returns
        -------
        tuple
            ``(pen_table, snp1, snp2)`` where ``pen_table`` is
            ``penetrance_base + penetrance_diff * scaled_table``.

        Raises
        ------
        ValueError
            If pen_table has the wrong type or shape or contains negative
            values, if penetrance_base is outside [0, 1], if penetrance_diff
            is negative or pushes the total above 1, or if either SNP does not
            have exactly one alt allele.
        """
        # Resolve the table to a 3x3 array
        if isinstance(pen_table, PenetranceTables):
            pen_table = np.array(pen_table.value).reshape((3, 3))
        elif isinstance(pen_table, np.ndarray):
            if pen_table.shape != (3, 3):
                raise ValueError("Incorrect shape for pen_table, must be 3x3")
        else:
            raise ValueError(
                f"pen_table must be a 3x3 numpy array or PenetranceTables enum, not {type(pen_table)}"
            )

        if (pen_table < 0).any():
            raise ValueError("Penetrance table values cannot be negative.")

        # Scale the table onto [0, 1] unless it already spans exactly that range
        pen_table_min = pen_table.min()
        pen_table_max = pen_table.max()
        if (pen_table_min != 0) or (pen_table_max != 1):
            pen_table_range = pen_table_max - pen_table_min
            # A zero range means the table is flat, i.e. a null model: leave it as-is
            if pen_table_range > 0:
                pen_table = (pen_table - pen_table_min) / pen_table_range

        # Process base and diff
        if (penetrance_base < 0) or (penetrance_base > 1):
            raise ValueError(
                f"penetrance_base must be in [0,1], {penetrance_base} was outside this range"
            )
        if penetrance_diff is None:
            # NOTE(review): this default is negative when penetrance_base > 0.5
            # and is not range-checked - confirm that is intended.
            penetrance_diff = 1 - (2 * penetrance_base)
        elif penetrance_diff < 0:
            # Zero is accepted, so the bound reported to the caller is inclusive
            raise ValueError("penetrance_diff must be >= 0")
        elif (penetrance_diff + penetrance_base) > 1:
            raise ValueError("penetrance_base + penetrance_diff must be <= 1")

        # SNPs
        if snp1 is None:
            snp1 = Variant(id="rs1", ref="A", alt=["a"])
        if snp2 is None:
            snp2 = Variant(id="rs2", ref="B", alt=["b"])

        if len(snp1.alt) != 1:
            raise ValueError(f"SNP1 is not Bialleleic: {snp1}")
        if len(snp2.alt) != 1:
            raise ValueError(f"SNP2 is not Bialleleic: {snp2}")

        # Create final pen_table
        pen_table = penetrance_base + penetrance_diff * pen_table

        return pen_table, snp1, snp2
Example #10
0
def data_missing_for_sorting():
    """Return a length-3 array with a known sort order.

    The items are [B, NA, A] where A < B and NA is missing.
    """
    variant = Variant(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
    )
    smaller = variant.make_genotype("A", "A")
    larger = variant.make_genotype("A", "T")
    missing = variant.make_genotype()
    return GenotypeArray(values=[larger, missing, smaller])
Example #11
0
def data_for_grouping():
    """Return data for factorization, grouping, and unique tests.

    The layout is [B, B, NA, NA, A, A, B, C] where A < B < C and NA is
    missing.
    """
    variant = Variant(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
    )
    by_label = {
        "A": variant.make_genotype("A", "A"),
        "B": variant.make_genotype("A", "T"),
        "C": variant.make_genotype("T", "T"),
        "NA": variant.make_genotype(),
    }
    layout = ("B", "B", "NA", "NA", "A", "A", "B", "C")
    return GenotypeArray([by_label[label] for label in layout])
Example #12
0
def var_complete_triploid():
    """Return a ploidy-3 Variant on chromosome "12" with ref A and alts C, G, T."""
    alt_alleles = ["C", "G", "T"]
    triploid = Variant(
        "12",
        12345678,
        "complete_diploid",
        ref="A",
        alt=alt_alleles,
        ploidy=3,
    )
    return triploid
Example #13
0
def data_for_sorting():
    """Return a length-3 array with a known sort order.

    The items are [B, C, A] where A < B < C.
    """
    variant = Variant(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
        score=30,
    )
    smallest = variant.make_genotype("A", "A")
    middle = variant.make_genotype("A", "T")
    largest = variant.make_genotype("T", "T")
    return GenotypeArray(values=[middle, largest, smallest])
Example #14
0
def test_HWE(ga_inhwe, ga_nothwe):
    """Check ``hwe_pval`` for arrays in and out of HWE and for NaN cases.

    Parameters
    ----------
    ga_inhwe
        Array expected to give a p-value of exactly 1.0.
    ga_nothwe
        Array expected to give a p-value below 1e-20.
    """
    var = Variant("chr1", ref="A", alt=["a"])
    # A single genotype: the p-value can't be calculated
    ga_onevar = GenotypeArray(
        [
            var.make_genotype_from_str("A/A"),
        ]
    )
    # np.isnan rather than ``is np.nan``: identity only holds for the exact
    # np.nan object, not for an equivalent NaN produced by a calculation.
    assert np.isnan(ga_onevar.hwe_pval)
    assert ga_inhwe.hwe_pval == 1.0
    assert ga_nothwe.hwe_pval < 1e-20

    # NaN for non-diploid
    var = Variant("chr1", ref="A", alt=["B", "C"], ploidy=3)
    ga_triploid = GenotypeArray(
        [
            var.make_genotype_from_str("A/A/A"),
        ]
        * 50
        + [
            var.make_genotype_from_str("A/A/B"),
        ]
        * 50,
    )
    assert np.isnan(ga_triploid.hwe_pval)
Example #15
0
def encoding_df():
    """
    Build a DataFrame of 5 variants (columns ``var0`` .. ``var4``) with 5
    genotypes each:
      Homozygous Ref
      Heterozygous
      Homozygous Alt
      Missing one allele
      Missing both alleles
    """
    columns = {}
    for idx, ref_allele in enumerate("ABCDE"):
        # The alt allele is the lowercase form of the reference allele
        alt_allele = ref_allele.lower()
        var = Variant(
            chromosome="chr1",
            position=idx + 1,
            id=f"rs{idx+1}",
            ref=ref_allele,
            alt=[alt_allele],
        )
        genotypes = [
            var.make_genotype(ref_allele, ref_allele),
            var.make_genotype(ref_allele, alt_allele),
            var.make_genotype(alt_allele, alt_allele),
            var.make_genotype(ref_allele),
            var.make_genotype(),
        ]
        columns[f"var{idx}"] = GenotypeArray(genotypes)
    return pd.DataFrame(columns)
Example #16
0
def dtype():
    """Return a GenotypeDtype for the chr1:123456 rs12345 variant (ref A, alts T and G, score 30)."""
    variant_fields = {
        "chromosome": "chr1",
        "position": 123456,
        "id": "rs12345",
        "ref": "A",
        "alt": ["T", "G"],
        "score": 30,
    }
    return GenotypeDtype(variant=Variant(**variant_fields))
Example #17
0
    def __init__(self, variant: Optional[Variant] = None):
        """Store the variant and build the numpy record type for its genotypes.

        Parameters
        ----------
        variant : Variant, optional
            Variant described by this dtype. A default ``Variant()`` is used
            when None.
        """
        self.variant = Variant() if variant is None else variant

        # Backing data is a numpy structured array with two fields:
        # - one unsigned integer per allele, indexing the list of possible alleles
        # - one unsigned integer for the genotype score (255 if missing)
        allele_field = ("allele_idxs", np.uint8, (self.variant.ploidy, ))
        score_field = ("gt_score", np.uint8)
        self._record_type = np.dtype([allele_field, score_field])
        self.itemsize = self._record_type.itemsize
Example #18
0
def data():
    """Return a length-100 array for this type.

    * data[0] and data[1] are both non missing
    * data[0] and data[1] are not equal
    The remaining 98 genotypes are drawn at random from the variant's alleles.
    """
    variant = Variant(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
        score=30,
    )
    alleles = ["A", "T", "G"]
    # Two fixed, distinct genotypes lead the array
    genotypes = [
        variant.make_genotype("A", "T"),
        variant.make_genotype("T", "T"),
    ]
    genotypes.extend(
        variant.make_genotype(random.choice(alleles), random.choice(alleles))
        for _ in range(98)
    )
    return GenotypeArray(values=genotypes)
Example #19
0
    def construct_from_string(cls, string):
        """
        Construct a GenotypeDtype from a string.

        Parameters
        ----------
        string : str
            The string alias for this GenotypeDtype.
            Should be formatted like `genotype(<ploidy>n)[<chromosome>; <position>; <id>; <ref>; <alt>]`

        Returns
        -------
        GenotypeDtype
            A dtype whose variant is built from the fields parsed out of `string`.

        Raises
        ------
        TypeError
            If `string` is not a str, does not match the expected pattern, or
            its fields cannot be converted into a Variant.

        Examples
        --------
        >>> GenotypeDtype.construct_from_string('genotype(2n)[chr1; 123456; rs12345; A; T,G]')
        genotype(2n)[chr1; 123456; rs12345; A; T,G]
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        msg = f"Cannot construct a 'GenotypeDtype' from '{string}'"
        try:
            match = cls._match.match(string)
            if match is None:
                raise ValueError("string does not match the GenotypeDtype pattern")
            d = match.groupdict()
            # Score is optional, so it may be None
            score = d["score"]
            if score is not None:
                score = int(score)
            variant = Variant(
                chromosome=d["chromosome"],
                position=int(d["position"]),
                id=d["id"],
                ref=d["ref"],
                alt=d["alt"].split(","),
                ploidy=int(d["ploidy"]),
                score=score,
            )
            return cls(variant=variant)
        except Exception as err:
            # Every parsing failure is reported as TypeError with the same
            # message; the original error is kept as the cause for debugging.
            raise TypeError(msg) from err
Example #20
0
def test_is_missing():
    """Check ``is_missing`` for arrays with and without a missing genotype."""
    var = Variant("chr1", ref="A", alt=["T", "C"])

    # No missing genotypes
    ga_fff = GenotypeArray(
        [var.make_genotype_from_str("A/A") for _ in range(3)]
    )
    none_missing = np.array([False, False, False])
    assert (ga_fff.is_missing == none_missing).all()

    # Middle genotype is built with no alleles
    first = var.make_genotype_from_str("A/A")
    middle = var.make_genotype()
    last = var.make_genotype_from_str("A/A")
    ga_ftf = GenotypeArray([first, middle, last])
    middle_missing = np.array([False, True, False])
    assert (ga_ftf.is_missing == middle_missing).all()

    # Comparing against the "./." string gives the same mask
    assert (ga_ftf.is_missing == (ga_ftf == "./.")).all()
Example #21
0
def var_min():
    """Return a Variant on chromosome "13" with ref A and an empty alt list."""
    no_alt_alleles = []
    return Variant("13", 12345678, "min", ref="A", alt=no_alt_alleles)
Example #22
0
def var_two_allele():
    """Return a Variant on chromosome "12" with ref A and a single alt allele C."""
    single_alt = ["C"]
    return Variant("12", 12345678, "complete", ref="A", alt=single_alt)
Example #23
0
def test_create_variant():
    """Check that ``alleles`` lists the ref allele followed by the alt allele."""
    created = Variant("12", 112161652, "rs12462", ref="C", alt=["T"])
    expected_alleles = ["C", "T"]
    assert created.alleles == expected_alleles
Example #24
0
def test_methods():
    """Exercise Variant helper methods on two identically constructed variants.

    The first variant gains alleles during the test; the second is left
    untouched for comparison.
    """
    mutated = Variant("12", 112161652, "rs12462", ref="C", alt=["T"])
    untouched = Variant("12", 112161652, "rs12462", ref="C", alt=["T"])
    assert mutated.is_same_position(untouched)

    # Looking up an existing allele returns its index
    assert mutated.get_idx_from_allele("T") == 1
    # Looking up an unknown allele with add=True appends it
    assert mutated.get_idx_from_allele("G", add=True) == 2
    assert len(mutated.alleles) == 3

    # Adding an allele explicitly grows the list again
    mutated.add_allele("GT")
    assert len(mutated.alleles) == 4

    # Allele index validity
    assert mutated.is_valid_allele_idx(1)
    assert not mutated.is_valid_allele_idx(10)

    # Still the same position despite the additional alleles
    assert mutated.is_same_position(untouched)
    # But the variants no longer compare equal
    assert not mutated == untouched
Example #25
0
"""
Test GenotypeDtype
"""
import pandas as pd
import pytest
from pandas._testing import assert_series_equal, assert_extension_array_equal

from pandas_genomics.arrays import GenotypeDtype
from pandas_genomics.scalars import Variant

# Module-level Variant: chromosome "12", position 112161652, id "rs12462",
# ref T, alt C.
TEST_VAR = Variant(
    chromosome="12", position=112161652, id="rs12462", ref="T", alt=["C"]
)


@pytest.mark.parametrize(
    "input_str,variant",
    [
        (
            "genotype(2n)[12; 112161652; rs12462; T; C]",
            Variant(
                chromosome="12", position=112161652, id="rs12462", ref="T", alt=["C"]
            ),
        ),
        (
            "genotype(3n)[12; 112161652; rs12462; T; C]",
            Variant(
                chromosome="12",
                position=112161652,
                id="rs12462",
                ref="T",
Example #26
0
            6,
            5,
            marks=pytest.mark.xfail(
                raises=TypeError, strict=True, reason="chromosome not string"),
        ),
    ],
)
def test_create_region(chromosome, start, end):
    """Construct a Region from the parametrized chromosome, start and end.

    There are no assertions: a case passes when construction does not raise.
    The visible part of the parametrization marks at least one case as a
    strict xfail that expects a TypeError.
    """
    region = Region(chromosome, start, end)


@pytest.mark.parametrize(
    "variant,region,result",
    [
        (
            Variant(chromosome="chr1", position=1),
            Region(chromosome="chr1", start=1, end=2),
            True,
        ),
        (
            Variant(chromosome="chr1", position=1),
            Region(chromosome="chr2", start=1, end=2),
            False,
        ),
        (
            Variant(chromosome="chr1", position=1),
            Region(chromosome="1", start=1, end=2),
            False,
        ),
        (
            Variant(chromosome="chr1", position=99),
Example #27
0
def test():
    """Smoke test: generate random genotypes for a two-allele and a three-allele variant.

    There are no assertions; the test passes when neither call raises.
    """
    one_alt = Variant(chromosome="1", position=123456, ref="T", alt=["A"])
    # A single alt allele takes a scalar frequency
    sim.generate_random_gt(one_alt, alt_allele_freq=0.3)

    two_alts = Variant(chromosome="1", position=223456, ref="T", alt=["A", "C"])
    # Two alt alleles take one frequency per alt allele
    sim.generate_random_gt(two_alts, alt_allele_freq=[0.25, 0.05])
Example #28
0
def test_maf():
    """Check ``maf`` for all-reference, all-missing, mixed, multi-allelic and triploid arrays."""
    # Zero
    var = Variant("chr1", ref="A", alt=["T", "C"])
    ga_zero = GenotypeArray(
        [
            var.make_genotype_from_str("A/A"),
            var.make_genotype_from_str("A/A"),
            var.make_genotype_from_str("A/A"),
        ]
    )
    assert ga_zero.maf == 0.0

    # Only Missing
    missing = GenotypeArray([var.make_genotype()] * 3)
    # np.isnan rather than ``is np.nan``: identity only holds for the exact
    # np.nan object, not for an equivalent NaN produced by a calculation.
    assert np.isnan(missing.maf)

    # 50%
    ga_50 = GenotypeArray(
        [
            var.make_genotype_from_str("A/A"),
            var.make_genotype_from_str("T/T"),
            var.make_genotype_from_str("T/A"),
        ]
    )
    assert ga_50.maf == 0.50

    # 2nd of 3 alleles
    ga_2nd = GenotypeArray(
        [
            var.make_genotype_from_str("A/C"),
            var.make_genotype_from_str("C/C"),
            var.make_genotype_from_str("T/T"),
        ]
    )
    assert ga_2nd.maf == 0.5

    # Triploid
    var = Variant("chr1", ref="A", alt=["T", "C"], ploidy=3)
    ga_33 = GenotypeArray(
        [
            var.make_genotype_from_str("A/A/T"),
            var.make_genotype_from_str("A/T/C"),
            var.make_genotype_from_str("A/A/T"),
        ]
    )
    # 1/3 is not exactly representable, so compare with a tolerance
    assert np.isclose(ga_33.maf, 1 / 3)