Ejemplo n.º 1
0
def test_bed_error():
    """bedstr must raise ValueError naming the marker identifier 'BOGUS'."""
    profile = SimulatedProfile()
    for haploindex in (0, 1):
        profile.add(haploindex, "BOGUS", "A,C,C")
    marker_defs = pd.read_csv(data_file("def/loc2-offsets.tsv"), sep="\t")
    expected_message = r"unknown marker identifier 'BOGUS'"
    with pytest.raises(ValueError, match=expected_message):
        print(profile.bedstr(marker_defs))
Ejemplo n.º 2
0
def sim(frequencies, seed=None):
    """Simulate a diploid genotype from the specified microhaplotype frequencies

    For every marker, the listed haplotype frequencies are rescaled to sum to 1
    and one haplotype is sampled for each of the two haplotype copies. The seed
    that was used is stored in the profile metadata under ``HaploSeed``, and a
    short progress message is written to standard error.

    Note that this function reseeds NumPy's global random number generator.

    :param pandas.DataFrame frequencies: population haplotype frequencies; the
        ``Marker``, ``Haplotype``, and ``Frequency`` columns are read
    :param int seed: seed for random number generator; chosen at random when omitted
    :returns: a simulated genotype profile for all markers specified in the haplotype frequencies
    :rtype: microhapulator.profile.SimulatedProfile
    """
    profile = SimulatedProfile(ploidy=2)
    if seed is None:
        # Ask for a 64-bit draw explicitly: the default integer dtype of
        # randint is platform dependent, and where it is 32 bits wide an upper
        # bound of 2**32 - 1 is rejected with a ValueError. The int() call
        # keeps the stored seed a plain Python integer.
        seed = int(np.random.randint(2**32 - 1, dtype=np.int64))
    profile.data["metadata"] = {
        "HaploSeed": seed,
    }
    np.random.seed(seed)
    markers = sorted(frequencies.Marker.unique())
    # Build each marker's sampling distribution once; it is identical for both
    # haplotype copies, so there is no need to re-filter the table per copy.
    distributions = {}
    for marker in markers:
        haplofreqs = frequencies[frequencies.Marker == marker]
        freqs = list(haplofreqs.Frequency)
        total = sum(freqs)  # computed once rather than once per element
        distributions[marker] = (
            list(haplofreqs.Haplotype),
            [x / total for x in freqs],
        )
    # The draw order (haplotype copy outermost, then markers in sorted order)
    # determines which haplotypes a given seed produces, so it is kept as is.
    for haploindex in range(2):
        for marker in markers:
            haplotypes, freqs = distributions[marker]
            sampled_haplotype = np.random.choice(haplotypes, p=freqs)
            profile.add(haploindex, marker, sampled_haplotype)
    message = f"simulated microhaplotype variation at {len(markers)} markers"
    print("[MicroHapulator::sim]", message, file=sys.stderr)
    return profile
Ejemplo n.º 3
0
def test_diff_basic():
    """diff reports the two markers at which the two comparison profiles disagree."""
    first = SimulatedProfile(fromfile=data_file("prof/diff-comp-1.json"))
    second = SimulatedProfile(fromfile=data_file("prof/diff-comp-2.json"))
    observed = list(mhapi.diff(first, second))
    expected = [
        ("MHDBL000140", {"C,C,A,A"}, {"C,C,T,A"}),
        ("MHDBL000163", {"A,A,G,A,T"}, {"C,G,A,A,T"}),
    ]
    assert observed == expected
Ejemplo n.º 4
0
def test_diff_nonmatching_alleles():
    """diff lists the haplotypes found only in the relaxed profile, per marker."""
    strict = SimulatedProfile(fromfile=data_file("prof/red-strict-profile.json"))
    relaxed = SimulatedProfile(fromfile=data_file("prof/red-relaxed-profile.json"))
    observed = list(mhapi.diff(strict, relaxed))
    print(observed)
    expected = [
        ("mh07CP-004", set(), {"T,T,T,A,T", "A,A,T,A,T"}),
        ("mh09KK-157", set(), {"G,C,C,A,T"}),
    ]
    assert observed == expected
Ejemplo n.º 5
0
def test_diff_large():
    """diff over two larger profiles: check every reported marker and spot-check three entries."""
    first = SimulatedProfile(fromfile=data_file("prof/diff-comp-1.json"))
    second = SimulatedProfile(fromfile=data_file("prof/diff-comp-3.json"))
    differences = list(mhapi.diff(first, second))
    observed_loci = [entry[0] for entry in differences]
    print(differences[9], differences[17], differences[21])
    # Numeric suffixes of the expected marker identifiers, in report order.
    expected_suffixes = (
        2, 3, 7, 13, 17, 18, 30, 36, 38, 47,
        58, 61, 76, 79, 82, 85, 88, 101, 106, 108,
        111, 112, 122, 124, 128, 129, 135, 136, 138, 140,
        144, 152, 154, 163, 181, 183, 194, 210, 211, 212,
    )
    expected_loci = [f"MHDBL{suffix:06d}" for suffix in expected_suffixes]
    assert observed_loci == expected_loci
    spot_checks = (
        (9, ("MHDBL000047", set(), {"T,T"})),
        (17, ("MHDBL000101", {"C,C,C,T"}, {"T,C,C,C"})),
        (21, ("MHDBL000112", {"G,G,A,C"}, set())),
    )
    for position, expected_entry in spot_checks:
        assert differences[position] == expected_entry
Ejemplo n.º 6
0
def test_main(tmp_path):
    """Run the `sim` subcommand with a fixed seed and compare its output to the stored profile."""
    destination = str(tmp_path / "profile.json")
    freqfile = data_file("freq/ceu50-freq.tsv")
    parser = microhapulator.cli.get_parser()
    args = parser.parse_args(["sim", "--out", destination, "--seed", "1985", freqfile])
    microhapulator.cli.sim.main(args)
    observed = SimulatedProfile(fromfile=destination)
    expected = SimulatedProfile(fromfile=data_file("prof/bitusa-profile.json"))
    assert observed == expected
Ejemplo n.º 7
0
def test_pipe_gbr_usc10(tmp_path):
    """Run the `pipe` subcommand end to end and check the typing result, report, and summary table."""
    # A copy of the small reference, indexed with bwa, is passed as --hg38.
    hg38_standin = str(tmp_path / "hg38-placeholder.fasta")
    copyfile(data_file("refr/usc10-refr.fna"), hg38_standin)
    run(["bwa", "index", hg38_standin])

    positional = [
        data_file("refr/usc10-refr.fna"),
        data_file("def/usc10-offsets.tsv"),
        data_file(""),
        "gbr-usc",
    ]
    options = ["--workdir", str(tmp_path), "--threads", "1", "--hg38", hg38_standin]
    args = microhapulator.cli.get_parser().parse_args(["pipe"] + positional + options)
    microhapulator.cli.pipe.main(args)

    # The typing result must not differ from the simulated profile.
    expected_profile = SimulatedProfile(fromfile=data_file("prof/gbr-usc10-sim.json"))
    result_path = tmp_path / "analysis" / "gbr-usc" / "gbr-usc-type.json"
    observed_result = TypingResult(fromfile=result_path)
    differences = list(mhapi.diff(observed_result, expected_profile))
    assert len(differences) == 0

    assert (tmp_path / "report.html").is_file()

    expected_summary = pd.read_csv(data_file("gbr-usc-summary.tsv"), sep="\t")
    observed_summary = pd.read_csv(tmp_path / "analysis" / "summary.tsv", sep="\t")
    assert observed_summary.equals(expected_summary)
Ejemplo n.º 8
0
def test_mix_main():
    """Run the `mix` subcommand on three profiles and compare the output to the stored combined profile."""
    inputs = [data_file(f"prof/green-sim-gt-{number}.json.gz") for number in (1, 2, 3)]
    with NamedTemporaryFile(suffix=".json.gz") as outfile:
        arglist = ["mix", "--out", outfile.name, *inputs]
        args = microhapulator.cli.get_parser().parse_args(arglist)
        microhapulator.cli.mix.main(args)
        observed = SimulatedProfile(fromfile=outfile.name)
        expected = SimulatedProfile(fromfile=data_file("prof/green-sim-gt-combined.json.gz"))
        assert observed == expected
Ejemplo n.º 9
0
def test_profile_roundtrip(tmp_path):
    """A simulated profile that is dumped and reloaded equals the original, as an object and as a string."""
    random_seed = numpy.random.randint(1, 2**32 - 1)
    frequencies = pd.read_csv(data_file("freq/asw5-freq.tsv"), sep="\t")
    original = mhapi.sim(frequencies, seed=random_seed)
    dumpfile = tmp_path / "profile.json"
    original.dump(dumpfile)
    reloaded = SimulatedProfile(fromfile=dumpfile)
    assert original == reloaded
    assert str(original) == str(reloaded)
Ejemplo n.º 10
0
def test_dist_even_mixture():
    """The observed and simulated 'x' genotypes are at distance 0 and compare equal."""
    observed_path = data_file("murica/x-obs-genotype.json")
    with microhapulator.open(observed_path, "r") as fh:
        observed = TypingResult(fh)
    simulated_bed = data_file("murica/x-sim-genotype.bed")
    simulated = SimulatedProfile.populate_from_bed(simulated_bed)
    assert mhapi.dist(observed, simulated) == 0
    assert observed == simulated
Ejemplo n.º 11
0
def test_haplotypes():
    """Check haplotypes() on both profile types, with and without a haplotype index."""
    simulated = SimulatedProfile.populate_from_bed(data_file("gttest.bed.gz"))
    typed = TypingResult(fromfile=data_file("prof/gttest.json"))
    unknown_marker = "BoGuSlOcUs"
    known_marker = "MHDBL000135"
    assert simulated.haplotypes(unknown_marker) == set()
    assert typed.haplotypes(unknown_marker) == set()
    assert simulated.haplotypes(known_marker) == {"G,C,T", "G,T,C"}
    assert typed.haplotypes(known_marker) == {"G,C,T", "G,T,C"}
    assert simulated.haplotypes(known_marker, index=0) == {"G,C,T"}
    assert simulated.haplotypes(known_marker, index=1) == {"G,T,C"}
    assert typed.haplotypes(known_marker, index=0) == set()
Ejemplo n.º 12
0
def test_sim_obs_profile_not_equal():
    """Profiles compare unequal to unrelated values and to profiles loaded from mismatching files."""
    sim_mismatch1 = SimulatedProfile.populate_from_bed(data_file("gttest-mismatch1.bed.gz"))
    assert sim_mismatch1 is not None
    for unrelated in (42, 3.14159, "A,C,C,T"):
        assert sim_mismatch1 != unrelated

    typed = TypingResult(fromfile=data_file("prof/gttest.json"))
    assert sim_mismatch1 != typed
    assert typed != sim_mismatch1
    for unrelated in (1985, 98.6):
        assert typed != unrelated

    sim_mismatch2 = SimulatedProfile.populate_from_bed(data_file("gttest-mismatch2.bed.gz"))
    assert sim_mismatch1 != sim_mismatch2
    assert sim_mismatch2 != typed
    assert typed != sim_mismatch2

    typed_altered = TypingResult(fromfile=data_file("prof/gttest-altered.json"))
    assert typed != typed_altered
Ejemplo n.º 13
0
def test_main_haplo_seq(tmp_path):
    """Run `sim` with --haplo-seq and check both the profile and the haplotype FASTA output."""
    profile_path = str(tmp_path / "profile.json")
    fasta_path = str(tmp_path / "haplo.fasta")
    options = [
        "--seed", "293847",
        "--out", profile_path,
        "--haplo-seq", fasta_path,
        "--sequences", data_file("refr/orange-refr.fasta"),
        "--markers", data_file("def/orange-offsets.tsv"),
    ]
    arglist = ["sim", *options, data_file("freq/asw2-freq.tsv")]
    args = microhapulator.cli.get_parser().parse_args(arglist)
    microhapulator.cli.sim.main(args)
    observed = SimulatedProfile(fromfile=profile_path)
    expected = SimulatedProfile(fromfile=data_file("prof/orange-sim-profile.json"))
    assert observed == expected
    assert filecmp.cmp(fasta_path, data_file("orange-haplo.fasta"))
Ejemplo n.º 14
0
def test_dist_log_mixture():
    """The observed and simulated 'y' genotypes are at distance 19 and compare unequal."""
    observed = TypingResult(data_file("murica/y-obs-genotype.json"))
    simulated_bed = data_file("murica/y-sim-genotype.bed")
    simulated = SimulatedProfile.populate_from_bed(simulated_bed)
    assert mhapi.dist(observed, simulated) == 19
    assert observed != simulated
Ejemplo n.º 15
0
def test_merge_sim_genotypes():
    """Merge three hand-built two-marker profiles and check the BED rendering of the result."""
    # One entry per profile: (marker, (haplotype 0, haplotype 1)) in insertion order.
    genotypes = [
        [("mh11CP-004", ("C,G,G", "C,G,G")), ("mh05KK-123", ("A,C", "A,T"))],
        [("mh11CP-004", ("C,T,A", "C,T,G")), ("mh05KK-123", ("A,T", "A,T"))],
        [("mh11CP-004", ("C,G,G", "T,G,G")), ("mh05KK-123", ("G,C", "G,T"))],
    ]
    profiles = []
    for genotype in genotypes:
        prof = SimulatedProfile()
        for marker, haplotypes in genotype:
            for haploindex, haplotype in enumerate(haplotypes):
                prof.add(haploindex, marker, haplotype)
        profiles.append(prof)
    merged = SimulatedProfile.merge(profiles)
    marker_defs = pd.read_csv(data_file("def/loc2-offsets.tsv"), sep="\t")
    observed = merged.bedstr(marker_defs)
    print(observed)
    expected_lines = [
        "mh05KK-123\t121\t122\tA|A|A|A|G|G\n",
        "mh05KK-123\t228\t229\tC|T|T|T|C|T\n",
        "mh11CP-004\t162\t163\tC|C|C|C|C|T\n",
        "mh11CP-004\t163\t164\tG|G|T|T|G|G\n",
        "mh11CP-004\t187\t188\tG|G|A|G|G|G\n",
    ]
    assert observed == "".join(expected_lines)
Ejemplo n.º 16
0
def test_sim_obs_profile_equality():
    """A simulated profile and a typing result for the same genotype compare equal in both directions."""
    simulated = SimulatedProfile.populate_from_bed(data_file("gttest.bed.gz"))
    typed = TypingResult(fromfile=data_file("prof/gttest.json"))
    assert simulated == typed
    assert typed == simulated
Ejemplo n.º 17
0
def test_haploindexes():
    """haploindexes() gives {0, 1} for the simulated profile and an empty set for the typing result."""
    simulated_bed = data_file("gttest-mismatch1.bed.gz")
    simulated = SimulatedProfile.populate_from_bed(simulated_bed)
    assert simulated.haploindexes() == {0, 1}
    typed = TypingResult(data_file("pashtun-sim/test-output.json"))
    assert typed.haploindexes() == set()
Ejemplo n.º 18
0
def test_diff2():
    """diff between the simulated and inferred profiles reports a single marker."""
    simulated = SimulatedProfile(fromfile=data_file("prof/euramer-sim-gt.json"))
    inferred = SimulatedProfile(fromfile=data_file("prof/euramer-inf-gt.json"))
    observed = list(mhapi.diff(simulated, inferred))
    expected = [("MHDBL000018", set(), {"T,G,C,T,A"})]
    assert observed == expected
Ejemplo n.º 19
0
def test_meaning_of_life():
    """Simulating with seed 42 reproduces the stored reference profile."""
    frequencies = pd.read_csv(data_file("freq/ceu50-freq.tsv"), sep="\t")
    simulated = mhapi.sim(frequencies, seed=42)
    reference_path = data_file("prof/meaning-of-life.json.gz")
    reference = SimulatedProfile(fromfile=reference_path)
    assert simulated == reference
Ejemplo n.º 20
0
def main(args):
    """Load every profile in ``args.profiles``, merge them, and dump the result to ``args.out``."""
    loaded = []
    for path in args.profiles:
        loaded.append(SimulatedProfile(path))
    merged = SimulatedProfile.merge(loaded)
    with mhopen(args.out, "w") as outstream:
        merged.dump(outstream)