Exemple #1
0
def test_main_mixture(capsys):
    """Run the `seq` CLI on the yellow mixture profile and inspect stdout.

    The command is given two seeds, two proportions, and a request for 500
    reads. The number of output lines divided by four must be within 25 of
    500, and the third line from the end must equal a fixed sequence.
    """
    options = [
        "--seeds", "42", "1776",
        "--proportions", "0.8", "0.2",
        "--num-reads", "500",
    ]
    inputs = [
        data_file("def/yellow-offsets.tsv"),
        data_file("refr/yellow-refr.fasta.gz"),
        data_file("prof/yellow-mix-gt.json"),
    ]
    parser = microhapulator.cli.get_parser()
    microhapulator.cli.seq.main(parser.parse_args(["seq", *options, *inputs]))
    captured = capsys.readouterr()
    lines = captured.out.strip().split("\n")
    # Four output lines are counted as one record.
    record_count = len(lines) / 4
    assert record_count == pytest.approx(500, abs=25)
    expected_sequence = (
        "TCAATTCAATTTCTACCCTCAGCATCAAGGCAGGGGTTCATCATAATGGGTATTGGAGGCTCAAAGAAA"
        "ATTTAGGCTCAGCACACACACACACACACACACACACACACAGCGATTTTTAATGCTGGTACAATCACA"
        "GGAGACTGCGACCCAGCCCTCCTCAGCGCCTCGGGTGCTCACGGGCACTCCTGGAGTCTCGGCCACACT"
        "AAGTCCCCCTGGTGGCCACACAGAAGAAGAGGTGGTAAAACTTTCTGGGAGTGAGATCAAAAATTTTAG"
        "GAGTCTAAAAACATACTTTTCTAAG"
    )
    assert lines[-3] == expected_sequence
Exemple #2
0
def test_convert_counts(tmp_path, counts, expfile):
    """Dump a typing result to CSV and compare it with the expected table.

    `counts` is forwarded to `TypingResult.dump_csv`; `expfile` names the
    reference CSV under the test data directory.
    """
    destination = str(tmp_path / "out.csv")
    typing_result = TypingResult(fromfile=data_file("prof/deep-filt-clean.json"))
    typing_result.dump_csv(destination, "MySample", counts=counts)
    actual_table = pd.read_csv(destination)
    reference_table = pd.read_csv(data_file(expfile))
    assert actual_table.equals(reference_table)
Exemple #3
0
def test_mixture_failure_modes():
    """Check that `mhapi.seq` raises ValueError for inconsistent mixture arguments.

    Three simulated profiles are sequenced with, in turn, too few seeds, too
    many proportions, and proportions paired with a total read count; each
    case must raise ValueError with the corresponding message.
    """
    freqs = microhapulator.load_marker_frequencies(data_file("freq/russ4-freq.tsv"))
    markers = microhapulator.load_marker_definitions(data_file("def/russ4-offsets.tsv"))
    seqs = microhapulator.load_marker_reference_sequences(
        data_file("refr/russ4-refr.fasta.gz")
    )
    profiles = [mhapi.sim(freqs) for _ in range(3)]
    # Each entry pairs the expected error pattern with the offending keyword arguments.
    failure_cases = (
        (
            r"number of profiles must match number of seeds",
            {"seeds": [42, 1776]},
        ),
        (
            r"mismatch between contributor number and proportions",
            {"proportions": [0.5, 0.3, 0.1, 0.1]},
        ),
        (
            r"specified proportions result in 0 reads for 1 or more individuals",
            {"totalreads": 500, "proportions": [1, 100, 10000]},
        ),
    )
    for pattern, kwargs in failure_cases:
        with pytest.raises(ValueError, match=pattern):
            # The result is iterated so that the error is triggered inside the block.
            for _ in mhapi.seq(profiles, markers, seqs, **kwargs):
                pass
Exemple #4
0
def test_filter_config_file():
    """Filter a raw typing result using a config table and check haplotype counts."""
    thresholds = pd.read_csv(data_file("filters.csv"), sep=None, engine="python")
    typing_result = TypingResult(fromfile=data_file("prof/deep-raw.json"))
    typing_result.filter(static=5, dynamic=0.02, config=thresholds)
    expected_counts = (
        ("mh01XYZ-1", 8),
        ("mh02XYZ-2", 2),
        ("mh02XYZ-3", 2),
    )
    for marker, count in expected_counts:
        assert len(typing_result.haplotypes(marker)) == count
def test_unite_basic(momgt, dadgt, kidgt, seed):
    """Unite two parent profiles under a fixed NumPy seed and compare with the expected child.

    `momgt`, `dadgt` and `kidgt` are file names under `prof/` in the test
    data directory; `seed` is passed to `numpy.random.seed` before uniting.
    """
    mother, father, expected_child = (
        Profile(fromfile=data_file(f"prof/{name}")) for name in (momgt, dadgt, kidgt)
    )
    numpy.random.seed(seed)
    assert Profile.unite(mother, father) == expected_child
def test_unite_unshared(capsys):
    """Check that `Profile.unite` reports excluded markers on stderr.

    The Swedish mom and dad profiles are united and the captured stderr must
    contain the "markers not common" message.
    """
    mom = Profile(fromfile=data_file("prof/swedish-mom.json"))
    dad = Profile(fromfile=data_file("prof/swedish-dad.json"))
    # Only the stderr message is checked, so the returned profile is not kept.
    Profile.unite(mom, dad)
    terminal = capsys.readouterr()
    message = "markers not common to mom and dad profiles are excluded"
    assert message in terminal.err
Exemple #7
0
def test_filter_simple():
    """Type the pashtun-sim reads, apply thresholds, and compare with the stored result."""
    typed = mhapi.type(
        data_file("pashtun-sim/aligned-reads.bam"),
        data_file("pashtun-sim/tiny-panel.tsv"),
    )
    typed.filter(static=10, dynamic=0.05)
    reference = TypingResult(fromfile=data_file("pashtun-sim/test-output.json"))
    assert typed == reference
Exemple #8
0
def test_diff_basic():
    """Diff two simulated profiles and check the two differing markers reported."""
    left = SimulatedProfile(fromfile=data_file("prof/diff-comp-1.json"))
    right = SimulatedProfile(fromfile=data_file("prof/diff-comp-2.json"))
    # Each entry: (marker, haplotypes reported for the first profile, for the second).
    expected_differences = [
        ("MHDBL000140", {"C,C,A,A"}, {"C,C,T,A"}),
        ("MHDBL000163", {"A,A,G,A,T"}, {"C,G,A,A,T"}),
    ]
    observed_differences = list(mhapi.diff(left, right))
    assert observed_differences == expected_differences
def test_interlocus_balance_basic(capfd):
    """Check the statistic, table, and printed bar produced by `mhapi.interlocus_balance`."""
    three_contrib = Profile(fromfile=data_file("prof/three-contrib-log.json"))
    statistic, observed_table = mhapi.interlocus_balance(three_contrib)
    expected_table = pd.read_csv(data_file("three-contrib-log-balance.csv"))
    assert observed_table.equals(expected_table)
    assert statistic == pytest.approx(0.00928395)
    captured = capfd.readouterr()
    expected_bar = "MHDBL000212: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 50.00"
    assert expected_bar in captured.out
def test_heterozygote_balance_basic(tmp_path):
    """Check the statistic, table, and figure file produced by `mhapi.heterozygote_balance`."""
    figure_path = tmp_path / "figure.png"
    single_contrib = Profile(fromfile=data_file("prof/single-contrib-2.json"))
    statistic, observed_table = mhapi.heterozygote_balance(
        single_contrib, tofile=figure_path
    )
    assert statistic == pytest.approx(3.90845)
    expected_table = pd.read_csv(data_file("het-balance.tsv"), sep="\t")
    assert observed_table.equals(expected_table)
    # The call was asked to write a figure; confirm the file now exists.
    assert figure_path.is_file()
def test_dist_even_mixture():
    """An observed typing result and a BED-derived simulated profile must match exactly."""
    observed_path = data_file("murica/x-obs-genotype.json")
    with microhapulator.open(observed_path, "r") as handle:
        observed = TypingResult(handle)
    simulated = SimulatedProfile.populate_from_bed(
        data_file("murica/x-sim-genotype.bed")
    )
    assert mhapi.dist(observed, simulated) == 0
    assert observed == simulated
Exemple #12
0
def test_filter_dupl_marker():
    """Filtering with the redundant config table must raise ValueError."""
    redundant_config = pd.read_csv(
        data_file("filters-redundant.csv"), sep=None, engine="python"
    )
    typing_result = TypingResult(fromfile=data_file("prof/deep-raw.json"))
    expected_error = "filter config file contains duplicate entries for some markers"
    with pytest.raises(ValueError, match=expected_error):
        typing_result.filter(static=5, dynamic=0.02, config=redundant_config)
Exemple #13
0
def test_diff_nonmatching_alleles():
    """Diff the strict and relaxed red profiles and check the expected differences.

    For both reported markers the first haplotype set is empty and the second
    is not.
    """
    strict = SimulatedProfile(fromfile=data_file("prof/red-strict-profile.json"))
    relaxed = SimulatedProfile(fromfile=data_file("prof/red-relaxed-profile.json"))
    differences = list(mhapi.diff(strict, relaxed))
    print(differences)
    expected_differences = [
        ("mh07CP-004", set(), {"T,T,T,A,T", "A,A,T,A,T"}),
        ("mh09KK-157", set(), {"G,C,C,A,T"}),
    ]
    assert differences == expected_differences
Exemple #14
0
def test_filter_missing_column():
    """Filtering with the `filters-missing.csv` config must raise ValueError naming `Static`."""
    incomplete_config = pd.read_csv(
        data_file("filters-missing.csv"), sep=None, engine="python"
    )
    typing_result = TypingResult(fromfile=data_file("prof/deep-raw.json"))
    expected_error = r"filter config file missing column\(s\): Static"
    with pytest.raises(ValueError, match=expected_error):
        typing_result.filter(static=5, dynamic=0.02, config=incomplete_config)
Exemple #15
0
def test_contain_cli(capsys):
    """Run the `contain` CLI on two simulated profiles and check the reported value."""
    profile_paths = [
        data_file("prof/one-brit-sim.json"),
        data_file("prof/one-italian-sim.json"),
    ]
    parser = microhapulator.cli.get_parser()
    microhapulator.cli.contain.main(parser.parse_args(["contain", *profile_paths]))
    captured = capsys.readouterr()
    assert '"containment": 0.4444' in captured.out
def test_haplotypes():
    """Check `haplotypes()` lookups on a simulated profile and a typing result.

    Covers an unknown marker, a known marker, and lookups by `index`.
    """
    simulated = SimulatedProfile.populate_from_bed(data_file("gttest.bed.gz"))
    typed = TypingResult(fromfile=data_file("prof/gttest.json"))
    both = (simulated, typed)
    # An unknown marker yields an empty set for both profile kinds.
    for profile in both:
        assert profile.haplotypes("BoGuSlOcUs") == set()
    # Without an index, both profile kinds report the same two haplotypes.
    for profile in both:
        assert profile.haplotypes("MHDBL000135") == {"G,C,T", "G,T,C"}
    assert simulated.haplotypes("MHDBL000135", index=0) == {"G,C,T"}
    assert simulated.haplotypes("MHDBL000135", index=1) == {"G,T,C"}
    assert typed.haplotypes("MHDBL000135", index=0) == set()
Exemple #17
0
def test_complex_genotype(capsys):
    """Sequence an unmixed mixture profile and count per-individual seed messages.

    The profile is split with `unmix()`, passed to `mhapi.seq` with 200 total
    reads, and the captured stderr must contain "Individual seed=" exactly
    three times.
    """
    profile = Profile(fromfile=data_file("prof/mixture-genotype.json"))
    markers = microhapulator.load_marker_definitions(
        data_file("def/russ4-offsets.tsv"))
    seqs = microhapulator.load_marker_reference_sequences(
        data_file("refr/russ4-refr.fasta.gz"))
    sequencer = mhapi.seq(list(profile.unmix()), markers, seqs, totalreads=200)
    # The reads themselves are not inspected; the iterable is exhausted only
    # so that everything it writes to stderr is captured.
    for _ in sequencer:
        pass
    terminal = capsys.readouterr()
    assert terminal.err.count("Individual seed=") == 3
Exemple #18
0
def test_type_filter_threshold():
    """Type the same reads twice and filter with two different static thresholds.

    With `static=4` marker MHDBL000018 reports two more haplotypes than with
    `static=10`; MHDBL000156 reports the same pair in both runs.
    """
    bam = data_file("bam/dyncut-test-reads.bam")
    tsv = data_file("def/dyncut-panel.tsv")
    # Each entry: (static threshold, expected haplotypes for MHDBL000018).
    cases = (
        (10, {"C,A,C,T,G", "T,G,C,T,G"}),
        (4, {"C,A,C,T,G", "T,G,C,T,G", "C,A,C,T,A", "T,G,C,T,A"}),
    )
    for static_threshold, expected_018 in cases:
        typed = mhapi.type(bam, tsv)
        typed.filter(static=static_threshold, dynamic=0.005)
        assert typed.haplotypes("MHDBL000018") == expected_018
        assert typed.haplotypes("MHDBL000156") == {"T,C,A,C", "T,C,G,G"}
def test_locbalance_cli(tmp_path, capfd):
    """Run the `locbalance` CLI and check both its CSV output and terminal output."""
    csv_path = str(tmp_path / "balance.csv")
    profile_path = data_file("prof/three-contrib-log.json")
    parser = microhapulator.cli.get_parser()
    args = parser.parse_args(["locbalance", "--csv", csv_path, profile_path])
    microhapulator.cli.locbalance.main(args)
    observed_table = pd.read_csv(csv_path)
    expected_table = pd.read_csv(data_file("three-contrib-log-balance.csv"))
    assert observed_table.equals(expected_table)
    captured = capfd.readouterr()
    print(captured.out)
    expected_fragments = (
        "Extent of imbalance (chi-square statistic): 0.0093",
        "MHDBL000212: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 50.00",
    )
    for fragment in expected_fragments:
        assert fragment in captured.out
Exemple #20
0
def test_diff_cli():
    """Run the `diff` CLI on two profiles and compare its output file with a stored one."""

    def read_stripped(path):
        # Read a whole file through microhapulator.open and trim surrounding whitespace.
        with microhapulator.open(path, "r") as handle:
            return handle.read().strip()

    first_profile = data_file("prof/diff-comp-1.json")
    second_profile = data_file("prof/diff-comp-3.json")
    with NamedTemporaryFile(suffix=".json") as tmp:
        parser = microhapulator.cli.get_parser()
        args = parser.parse_args(["diff", "-o", tmp.name, first_profile, second_profile])
        microhapulator.cli.diff.main(args)
        produced = read_stripped(tmp.name)
        reference = read_stripped(data_file("diff-comp-1-3.txt"))
        assert produced == reference
def test_dist_cli():
    """Run the `dist` CLI on two Gujarati profiles and check the JSON it writes."""
    with NamedTemporaryFile() as tmp:
        profile_paths = [
            data_file("prof/gujarati-ind2-gt.json"),
            data_file("prof/gujarati-ind3-gt.json"),
        ]
        parser = microhapulator.cli.get_parser()
        args = parser.parse_args(["dist", "--out", tmp.name, *profile_paths])
        microhapulator.cli.dist.main(args)
        with open(tmp.name, "r") as handle:
            report = json.load(handle)
        assert report == {"hamming_distance": 3}
Exemple #22
0
def test_type_cli_simple(tmp_path):
    """Run the `type` CLI and check the unfiltered result for two markers.

    For both markers `haplotypes()` must be empty while the stored
    `typing_result` mapping matches the expected counts.
    """
    result_path = str(tmp_path / "typing-result.json")
    inputs = [
        data_file("pashtun-sim/tiny-panel.tsv"),
        data_file("pashtun-sim/aligned-reads.bam"),
    ]
    parser = microhapulator.cli.get_parser()
    microhapulator.cli.type.main(parser.parse_args(["type", "--out", result_path, *inputs]))
    typed = TypingResult(fromfile=result_path)
    counts_mh13 = {
        "C,T,C,G": 1, "C,T,T,T": 1, "G,T,C,T": 1, "T,A,C,T": 1, "T,G,C,T": 3,
        "T,G,T,T": 2, "T,T,A,T": 5, "T,T,C,A": 1, "T,T,C,C": 2, "T,T,C,G": 2,
        "T,T,C,T": 1178, "T,T,G,T": 2, "T,T,T,A": 2, "T,T,T,G": 6, "T,T,T,T": 1170,
    }
    counts_mh21 = {
        "G,A,A,A": 1, "G,A,C,A": 3, "G,A,G,A": 3, "G,A,T,A": 1075, "G,A,T,C": 1,
        "G,A,T,G": 1, "G,A,T,T": 2, "G,C,C,A": 1, "G,C,T,A": 4, "G,G,A,A": 2,
        "G,G,A,T": 1, "G,G,C,A": 1075, "G,G,C,C": 3, "G,G,C,G": 12, "G,G,C,T": 5,
        "G,G,T,A": 4, "G,T,C,A": 1, "T,G,C,A": 1,
    }
    expectations = (
        ("mh13KK-218", counts_mh13),
        ("mh21KK-320", counts_mh21),
    )
    for marker, expected_counts in expectations:
        assert typed.haplotypes(marker) == set()
        assert typed.data["markers"][marker]["typing_result"] == expected_counts
Exemple #23
0
def test_diff_large():
    """Diff two simulated profiles and check the 40 differing markers reported.

    The full ordered list of marker names is checked, plus the complete diff
    entries at positions 9, 17 and 21.
    """
    first = SimulatedProfile(fromfile=data_file("prof/diff-comp-1.json"))
    second = SimulatedProfile(fromfile=data_file("prof/diff-comp-3.json"))
    differences = list(mhapi.diff(first, second))
    reported_markers = [entry[0] for entry in differences]
    print(differences[9], differences[17], differences[21])
    # Marker names are "MHDBL" followed by a zero-padded six-digit number.
    expected_numbers = (
        2, 3, 7, 13, 17, 18, 30, 36, 38, 47,
        58, 61, 76, 79, 82, 85, 88, 101, 106, 108,
        111, 112, 122, 124, 128, 129, 135, 136, 138, 140,
        144, 152, 154, 163, 181, 183, 194, 210, 211, 212,
    )
    expected_markers = [f"MHDBL{number:06d}" for number in expected_numbers]
    assert reported_markers == expected_markers
    spot_checks = (
        (9, ("MHDBL000047", set(), {"T,T"})),
        (17, ("MHDBL000101", {"C,C,C,T"}, {"T,C,C,C"})),
        (21, ("MHDBL000112", {"G,G,A,C"}, set())),
    )
    for position, expected_entry in spot_checks:
        assert differences[position] == expected_entry
def test_dist_sim_vs_obs(hdist):
    """Run the `dist` CLI on an observed/simulated pair and check the reported distance.

    `hdist` selects the observed genotype file by name and is also the
    distance the CLI is expected to report.
    """
    observed_name = f"murica/z-obs-genotype-dist{hdist:d}.json"
    with NamedTemporaryFile() as tmp:
        profile_paths = [
            data_file(observed_name),
            data_file("murica/z-sim-genotype.json"),
        ]
        parser = microhapulator.cli.get_parser()
        args = parser.parse_args(["dist", "--out", tmp.name, *profile_paths])
        microhapulator.cli.dist.main(args)
        with open(tmp.name, "r") as handle:
            report = json.load(handle)
        assert report == {"hamming_distance": hdist}
Exemple #25
0
def test_main_out_stdout(capsys):
    """Run the `seq` CLI without an output file and count the records written to stdout.

    The number of stdout lines divided by four must be within 5 of the 100
    reads requested.
    """
    inputs = [
        data_file("def/orange-offsets.tsv"),
        data_file("refr/orange-refr.fasta"),
        data_file("prof/orange-sim-profile.json"),
    ]
    parser = microhapulator.cli.get_parser()
    args = parser.parse_args(["seq", "--num-reads", "100", *inputs])
    microhapulator.cli.seq.main(args)
    captured = capsys.readouterr()
    lines = captured.out.strip().split("\n")
    # Four output lines are counted as one record.
    record_count = len(lines) / 4
    assert record_count == pytest.approx(100, abs=5)
Exemple #26
0
def test_main(tmp_path):
    """Run the `sim` CLI with a fixed seed and compare the profile with a stored one."""
    profile_path = str(tmp_path / "profile.json")
    frequencies = data_file("freq/ceu50-freq.tsv")
    parser = microhapulator.cli.get_parser()
    args = parser.parse_args(["sim", "--out", profile_path, "--seed", "1985", frequencies])
    microhapulator.cli.sim.main(args)
    simulated = SimulatedProfile(fromfile=profile_path)
    reference = SimulatedProfile(fromfile=data_file("prof/bitusa-profile.json"))
    assert simulated == reference
def test_unite_cli():
    """Run the `unite` CLI with a fixed seed and compare the child with a stored profile."""
    with NamedTemporaryFile(suffix=".json") as tmp:
        parent_paths = [
            data_file("prof/green-mom-3-gt.json"),
            data_file("prof/green-dad-3-gt.json"),
        ]
        parser = microhapulator.cli.get_parser()
        args = parser.parse_args(
            ["unite", "--seed", "113817", "--out", tmp.name, *parent_paths]
        )
        microhapulator.cli.unite.main(args)
        child = Profile(fromfile=tmp.name)
        expected_child = Profile(fromfile=data_file("prof/green-kid-3-gt.json"))
        assert child == expected_child
def test_merge_sim_genotypes():
    """Merge three hand-built simulated profiles and check the BED text of the result."""

    def build_profile(haplotypes_by_marker):
        # Add each marker's haplotypes in order, using the position as the index.
        built = SimulatedProfile()
        for marker, haplotypes in haplotypes_by_marker:
            for index, haplotype in enumerate(haplotypes):
                built.add(index, marker, haplotype)
        return built

    contributors = [
        build_profile(spec)
        for spec in (
            (("mh11CP-004", ("C,G,G", "C,G,G")), ("mh05KK-123", ("A,C", "A,T"))),
            (("mh11CP-004", ("C,T,A", "C,T,G")), ("mh05KK-123", ("A,T", "A,T"))),
            (("mh11CP-004", ("C,G,G", "T,G,G")), ("mh05KK-123", ("G,C", "G,T"))),
        )
    ]
    merged = SimulatedProfile.merge(contributors)
    definitions = pd.read_csv(data_file("def/loc2-offsets.tsv"), sep="\t")
    bed_text = merged.bedstr(definitions)
    print(bed_text)
    expected_bed = (
        "mh05KK-123\t121\t122\tA|A|A|A|G|G\n"
        "mh05KK-123\t228\t229\tC|T|T|T|C|T\n"
        "mh11CP-004\t162\t163\tC|C|C|C|C|T\n"
        "mh11CP-004\t163\t164\tG|G|T|T|G|G\n"
        "mh11CP-004\t187\t188\tG|G|A|G|G|G\n"
    )
    assert bed_text == expected_bed
def test_bed_error():
    """`bedstr` must raise ValueError for a marker named 'BOGUS'."""
    bogus_profile = SimulatedProfile()
    for index in (0, 1):
        bogus_profile.add(index, "BOGUS", "A,C,C")
    definitions = pd.read_csv(data_file("def/loc2-offsets.tsv"), sep="\t")
    expected_error = r"unknown marker identifier 'BOGUS'"
    with pytest.raises(ValueError, match=expected_error):
        print(bogus_profile.bedstr(definitions))
Exemple #30
0
def test_mix_main():
    """Run the `mix` CLI on three simulated profiles and compare with the stored combined profile."""
    with NamedTemporaryFile(suffix=".json.gz") as tmp:
        contributor_paths = [
            data_file("prof/green-sim-gt-1.json.gz"),
            data_file("prof/green-sim-gt-2.json.gz"),
            data_file("prof/green-sim-gt-3.json.gz"),
        ]
        parser = microhapulator.cli.get_parser()
        args = parser.parse_args(["mix", "--out", tmp.name, *contributor_paths])
        microhapulator.cli.mix.main(args)
        mixed = SimulatedProfile(fromfile=tmp.name)
        reference = SimulatedProfile(
            fromfile=data_file("prof/green-sim-gt-combined.json.gz")
        )
        assert mixed == reference