Ejemplo n.º 1
0
def run_simulation(ts, n, Ne, theta, rho):
    """Infer a tree sequence from ``ts`` and date both the input and the result.

    ``n`` and ``Ne`` are accepted but are not used by this function.

    Returns a 3-tuple ``(ts, dated_ts, dated_inferred_ts_mut)``: the input
    tree sequence as passed in, the output of ``tsdate.age_inference`` on
    ``ts``, and the output of ``tsdate.age_inference`` on the tree sequence
    that tsinfer inferred from ``ts``.
    """
    inferred_ts = tsinfer.infer(
        tsinfer.formats.SampleData.from_tree_sequence(ts))
    # Both dating calls share the same theta/rho settings.
    dating_params = {"theta": theta, "rho": rho}
    dated_ts = tsdate.age_inference(ts, **dating_params)
    dated_inferred_ts_mut = tsdate.age_inference(inferred_ts, **dating_params)
    return ts, dated_ts, dated_inferred_ts_mut
Ejemplo n.º 2
0
 def test_random_data_inferred_no_simplify(self):
     """Infer from random example data without simplifying, then verify.

     The tree sequence returned by ``self.verify`` must have the same
     genotype matrix as the inferred tree sequence that was passed to it.
     """
     scaled_range = 10 * np.arange(10)
     samples = self.get_random_data_example(
         scaled_range, num_samples=10, seed=2)
     inferred_ts = tsinfer.infer(samples, simplify=False)
     ts = self.verify(inferred_ts, 55, 57)
     genotypes_match = np.array_equal(
         ts.genotype_matrix(), inferred_ts.genotype_matrix())
     self.assertTrue(genotypes_match)
Ejemplo n.º 3
0
 def test_simple_sim_multi_tree(self):
     """Dating an inferred multi-tree sequence must not change haplotypes.

     A simulation with recombination (asserted to contain more than one
     tree) is converted to sample data with ``use_sites_time`` both on and
     off, inferred, and dated first with ``method="maximization"`` and then
     with tsdate's default method. Each dated result is compared
     haplotype-by-haplotype against the simulated tree sequence.
     """
     ts = msprime.simulate(
         8, mutation_rate=5, recombination_rate=5, random_seed=2)
     self.assertGreater(ts.num_trees, 1)
     # Extra keyword arguments for each dating run, in execution order.
     dating_options = ({"method": "maximization"}, {})
     for use_times in (True, False):
         sample_data = tsinfer.SampleData.from_tree_sequence(
             ts, use_sites_time=use_times)
         inferred_ts = tsinfer.infer(sample_data)
         for options in dating_options:
             dated_ts = tsdate.date(
                 inferred_ts, Ne=1, mutation_rate=5, **options)
             haplotype_pairs = zip(ts.haplotypes(), dated_ts.haplotypes())
             self.assertTrue(
                 all(original == dated
                     for original, dated in haplotype_pairs))
Ejemplo n.º 4
0
 def test_simple_sim_1_tree(self):
     """Dating an inferred tree sequence must not change haplotypes.

     The simulation is run without a recombination rate. Sample data is
     built with ``use_times`` both on and off; in each case the dated,
     inferred tree sequence is compared haplotype-by-haplotype against the
     simulated one.
     """
     ts = msprime.simulate(8, mutation_rate=5, random_seed=2)
     for use_times in (True, False):
         inferred_ts = tsinfer.infer(
             tsinfer.SampleData.from_tree_sequence(ts, use_times=use_times))
         dated_ts = tsdate.date(inferred_ts, Ne=1, mutation_rate=5)
         haplotype_pairs = zip(ts.haplotypes(), dated_ts.haplotypes())
         self.assertTrue(
             all(original == dated for original, dated in haplotype_pairs))
Ejemplo n.º 5
0
 def test_inferred_no_simplify(self):
     """Run ``self.verify`` on an unsimplified tree sequence from tsinfer.

     The sample data comes from an msprime simulation with recombination
     and is created with ``use_times=False``.
     """
     simulated = msprime.simulate(
         10, recombination_rate=2, mutation_rate=10, random_seed=3)
     samples = tsinfer.SampleData.from_tree_sequence(
         simulated, use_times=False)
     self.verify(tsinfer.infer(samples, simplify=False))
Ejemplo n.º 6
0
 def test_two_populations_high_migration_inferred_no_simplify(self):
     """Verify an unsimplified inference of the two-population example.

     The inferred tree sequence must report the same number of populations
     as the example; it is then passed to ``self.verify`` together with the
     sample sets of populations 0 and 1.
     """
     source_ts = self.two_populations_high_migration_example()
     inferred_ts = tsinfer.infer(
         tsinfer.SampleData.from_tree_sequence(source_ts), simplify=False)
     assert inferred_ts.num_populations == source_ts.num_populations
     sample_sets = [inferred_ts.samples(population) for population in (0, 1)]
     self.verify(inferred_ts, sample_sets)
Ejemplo n.º 7
0
def infer_ts(filename):
    """Infer a tree sequence from the samples read from ``filename``.

    The samples are obtained via ``read_samples(filename)``. Every tree of
    the inferred tree sequence is printed as a unicode drawing before the
    tree sequence is returned.

    Args:
        filename: Passed unchanged to ``read_samples``.

    Returns:
        The tree sequence produced by ``tsinfer.infer``.
    """
    inferred_ts = tsinfer.infer(read_samples(filename))
    for drawing in (t.draw(format="unicode") for t in inferred_ts.trees()):
        print(drawing)
    return inferred_ts
Ejemplo n.º 8
0
 def test_inferred(self):
     """Run ``self.verify`` on a tree sequence inferred with default settings.

     The sample data comes from an msprime simulation with recombination.
     """
     simulated = msprime.simulate(
         10, recombination_rate=2, mutation_rate=10, random_seed=3)
     samples = tsinfer.SampleData.from_tree_sequence(simulated)
     self.verify(tsinfer.infer(samples))
Ejemplo n.º 9
0
 def test_random_data_inferred_simplify(self):
     """Infer from random example data with simplification, then verify.

     The tree sequence returned by ``self.verify`` must have the same
     genotype matrix as the inferred tree sequence that was passed to it.
     """
     scaled_range = 5 * np.arange(10)
     samples = self.get_random_data_example(
         scaled_range, num_samples=10, seed=2)
     inferred_ts = tsinfer.infer(samples, simplify=True)
     ts = self.verify(inferred_ts, 12, 15)
     genotypes_match = np.array_equal(
         ts.genotype_matrix(), inferred_ts.genotype_matrix())
     assert genotypes_match
Ejemplo n.º 10
0
 def test_two_populations_high_migration_inferred(self):
     """Verify a default inference of the two-population example.

     The inferred tree sequence must report the same number of populations
     as the example; it is then passed to ``self.verify`` together with the
     sample sets of populations 0 and 1.
     """
     source_ts = self.two_populations_high_migration_example()
     inferred_ts = tsinfer.infer(
         tsinfer.SampleData.from_tree_sequence(source_ts))
     self.assertEqual(
         inferred_ts.num_populations, source_ts.num_populations)
     sample_sets = [inferred_ts.samples(population) for population in (0, 1)]
     self.verify(inferred_ts, sample_sets)
Ejemplo n.º 11
0
 def test_equivalance(self):
     """Inference with 1 thread and with 5 threads must give equal results."""
     ts = msprime.simulate(
         5, mutation_rate=2, recombination_rate=2, random_seed=2)
     # Inputs shared by both inference runs; only num_threads differs.
     shared_inputs = dict(
         genotypes=ts.genotype_matrix(),
         positions=[site.position for site in ts.sites()],
         sequence_length=ts.sequence_length,
     )
     ts1 = tsinfer.infer(num_threads=1, **shared_inputs)
     ts2 = tsinfer.infer(num_threads=5, **shared_inputs)
     self.assertTreeSequencesEqual(ts1, ts2)
Ejemplo n.º 12
0
 def test_inferred_random_data(self):
     """Run ``self.verify`` on a tree sequence inferred from random genotypes.

     A 40 x 8 matrix of random 0/1 values (NumPy seed 10) is loaded one row
     per site, with the row index used as the first ``add_site`` argument.
     """
     np.random.seed(10)
     shape = (40, 8)  # (number of sites, number of samples)
     G = np.random.randint(2, size=shape).astype(np.int8)
     with tsinfer.SampleData() as sample_data:
         for site_index, site_genotypes in enumerate(G):
             sample_data.add_site(site_index, site_genotypes)
     self.verify(tsinfer.infer(sample_data))
Ejemplo n.º 13
0
 def verify_from_inferred(self, remove_leaves):
     """Rebuild an ancestors tree sequence from an inferred one and re-match.

     A simulated tree sequence is inferred with tsinfer, converted to an
     ancestors tree sequence with ``tsinfer.make_ancestors_ts`` and checked
     with ``tsinfer.check_ancestors_ts``. The samples are then matched
     against it with both the Python and the C engine, and each result is
     verified against the sample data.

     Args:
         remove_leaves: Forwarded to ``tsinfer.make_ancestors_ts``.
     """
     ts = msprime.simulate(15, recombination_rate=1, mutation_rate=2, random_seed=3)
     samples = tsinfer.SampleData.from_tree_sequence(ts)
     inferred = tsinfer.infer(samples)
     ancestors_ts = tsinfer.make_ancestors_ts(
         samples, inferred, remove_leaves=remove_leaves)
     tsinfer.check_ancestors_ts(ancestors_ts)
     for engine in [tsinfer.PY_ENGINE, tsinfer.C_ENGINE]:
         final_ts = tsinfer.match_samples(samples, ancestors_ts, engine=engine)
         # Verify inside the loop so the output of every engine is checked,
         # not only that of the engine that happens to run last.
         tsinfer.verify(samples, final_ts)
Ejemplo n.º 14
0
def infer_from_msprime(simulation):
    """Infer a tree sequence with tsinfer from an msprime simulation result.

    Every variant of ``simulation`` is added as a site (position, genotypes
    and alleles) to a new ``tsinfer.SampleData`` that uses the simulation's
    sequence length.

    Args:
        simulation: Object providing ``sequence_length`` and ``variants()``.

    Returns:
        The tree sequence produced by ``tsinfer.infer``.
    """
    with tsinfer.SampleData(
            sequence_length=simulation.sequence_length,
            num_flush_threads=2) as sample_data:
        for variant in simulation.variants():
            position = variant.site.position
            sample_data.add_site(position, variant.genotypes, variant.alleles)
    return tsinfer.infer(sample_data)
def main():
    """Command-line entry point: infer a tree sequence from a samples file.

    Parses the command line, loads the samples file and writes the resulting
    tree sequence to the output path. When ``--inject-real-ancestors-from-ts``
    is given, ancestors are built from that tree sequence file and matched
    explicitly; otherwise ``tsinfer.infer`` runs the whole pipeline.

    ``--verbosity`` and ``--length`` are parsed but not used in this function.

    Raises:
        ValueError: If the samples path is not an existing file, or if the
            sample data yields no genotypes for inference sites.
    """
    description = """Simple CLI wrapper for tsinfer
        tskit version: {}
        tsinfer version: {}""".format(tskit.__version__, tsinfer.__version__)

    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description)
    cli.add_argument("--verbosity", "-v", action="count", default=0)
    cli.add_argument(
        "samples",
        help="The samples file name, as saved by tsinfer.SampleData.initialise()")
    cli.add_argument("output", help="The path to write the output file to")
    cli.add_argument(
        "-l", "--length", type=int, default=None,
        help="The total sequence length")
    cli.add_argument(
        "-t", "--threads", type=int, default=1,
        help="The number of worker threads to use")
    cli.add_argument(
        "-m", "--method", choices=["C", "P"], default="C",
        help="Which implementation to use, [C] (faster) or [P]ython (more debuggable)")
    cli.add_argument(
        "--inject-real-ancestors-from-ts", default=None,
        help="Instead of inferring ancestors, construct known ones from this tree sequence file path")
    cli.add_argument("-V", "--version", action="version", version=description)
    args = cli.parse_args()

    engine = tsinfer.C_ENGINE if args.method != "P" else tsinfer.PY_ENGINE

    if not os.path.isfile(args.samples):
        raise ValueError("No samples file")
    sample_data = tsinfer.load(args.samples)
    # Only the first item (if any) is pulled from the iterator.
    has_inference_sites = any(
        True for _ in sample_data.genotypes(inference_sites=True))
    if not has_inference_sites:
        raise ValueError("No inference sites")

    true_ancestors_path = args.inject_real_ancestors_from_ts
    if true_ancestors_path is None:
        ts = tsinfer.infer(
            sample_data, num_threads=args.threads, engine=engine)
    else:
        ancestor_data = tsinfer.AncestorData.initialise(
            sample_data, compressor=None)
        orig_ts = tskit.load(true_ancestors_path)
        eval_util.build_simulated_ancestors(sample_data, ancestor_data, orig_ts)
        ancestor_data.finalise()
        ancestors_ts = tsinfer.match_ancestors(
            sample_data, ancestor_data, engine=engine)
        ts = tsinfer.match_samples(
            sample_data, ancestors_ts, engine=engine, simplify=True)
    ts.dump(args.output)
Ejemplo n.º 16
0
def run_infer(args):
    """Run the full tsinfer pipeline for the ``args`` namespace and save it.

    Reads ``args.progress``, ``args.samples``, ``args.num_threads`` and
    ``args.output_trees``. The inferred tree sequence is written to the path
    returned by ``get_output_trees_path``; ``summarise_usage`` is called last.
    """
    setup_logging(args)
    monitor = ProgressMonitor(
        enabled=args.progress,
        generate_ancestors=True,
        match_ancestors=True,
        match_samples=True,
    )
    samples = tsinfer.SampleData.load(args.samples)
    inferred = tsinfer.infer(
        samples, num_threads=args.num_threads, progress_monitor=monitor)
    destination = get_output_trees_path(args.output_trees, args.samples)
    logger.info("Writing output tree sequence to {}".format(destination))
    inferred.dump(destination)
    summarise_usage()
Ejemplo n.º 17
0
 def test_no_error(self):
     """Each site of the inferred tree sequence carries one 0 -> 1 mutation.

     Runs inference with both the "python" and the "c" method and checks the
     site and mutation counts, the ancestral and derived states, and that
     the single mutation per site has no parent (-1).
     """
     num_sites = 10
     G, positions = get_random_data_example(5, num_sites)
     for method in ("python", "c"):
         ts = tsinfer.infer(
             genotypes=G,
             positions=positions,
             sequence_length=num_sites,
             method=method,
         )
         self.assertEqual(ts.num_sites, num_sites)
         self.assertEqual(ts.num_mutations, num_sites)
         for site in ts.sites():
             self.assertEqual(site.ancestral_state, "0")
             self.assertEqual(len(site.mutations), 1)
             only_mutation = site.mutations[0]
             self.assertEqual(only_mutation.derived_state, "1")
             self.assertEqual(only_mutation.parent, -1)
Ejemplo n.º 18
0
 def verify_data_round_trip(self,
                            genotypes,
                            positions,
                            sequence_length=None):
     """Check that inference reproduces the input positions and genotypes.

     Inference is run with both the "python" and the "C" method. For each
     result the sequence length, the number of sites, and every variant's
     position and genotypes are compared with the inputs.

     Args:
         genotypes: Indexed by variant index to get the expected genotypes.
         positions: Indexed by variant index to get the expected position.
         sequence_length: Defaults to the last position plus one.
     """
     if sequence_length is None:
         sequence_length = positions[-1] + 1
     expected_num_sites = len(positions)
     for method in ("python", "C"):
         ts = tsinfer.infer(
             genotypes=genotypes,
             positions=positions,
             sequence_length=sequence_length,
             method=method,
         )
         self.assertEqual(ts.sequence_length, sequence_length)
         self.assertEqual(ts.num_sites, expected_num_sites)
         for variant in ts.variants():
             self.assertEqual(variant.position, positions[variant.index])
             same_genotypes = np.array_equal(
                 genotypes[variant.index], variant.genotypes)
             self.assertTrue(same_genotypes)
def iteration_tsdate(constr_sample_data,
                     constr_sites,
                     Ne,
                     mut_rate,
                     adjust_priors=True):
    """Infer, simplify and date a tree sequence, optionally constraining priors.

    Args:
        constr_sample_data: Sample data passed to ``tsinfer.infer``.
        constr_sites: Mapping from a site position to a limit; may be empty
            or ``None``, in which case the priors are left untouched.
        Ne: Passed to ``tsdate.get_dates`` and used to scale the returned
            dates.
        mut_rate: Passed to ``tsdate.get_dates`` as ``mutation_rate``.
        adjust_priors: When false, ``constr_sites`` is ignored.

    Returns:
        ``(iter_infer, dates)`` where ``iter_infer`` is the simplified
        inferred tree sequence and ``dates`` is the first output of
        ``tsdate.get_dates`` multiplied by ``2 * Ne``.
    """
    iter_infer = tsinfer.infer(constr_sample_data).simplify()
    priors = tsdate.build_prior_grid(iter_infer)
    if adjust_priors and constr_sites:
        for mut_pos, limit in constr_sites.items():
            # Index of the first site whose position equals mut_pos.
            matching_sites = np.where(
                mut_pos == iter_infer.tables.sites.position)
            infer_mut_pos = matching_sites[0][0]
            # NOTE(review): a site index is used to index the mutation table;
            # presumably assumes exactly one mutation per site - confirm.
            mutation_node = iter_infer.tables.mutations.node[infer_mut_pos]
            node = mutation_node - iter_infer.num_samples
            # NOTE(review): 20000 is hard-coded; presumably 2 * Ne for
            # Ne = 10000 - confirm against callers.
            scaled_timepoints = priors.timepoints * 20000
            cutoff = np.abs(scaled_timepoints - limit).argmin()
            # Zero the prior for every timepoint before the one closest to
            # the limit.
            priors.grid_data[node][:cutoff] = 0
    dating_output = tsdate.get_dates(
        iter_infer, Ne=Ne, mutation_rate=mut_rate, priors=priors)
    iter_dates, _, _, _, _ = dating_output
    return iter_infer, iter_dates * 2 * Ne
Ejemplo n.º 20
0
def run_infer(args):
    """Run the full tsinfer pipeline for the ``args`` namespace and save it.

    Reads ``args.samples``, ``args.progress``, ``args.num_threads`` and
    ``args.output_trees``. The inferred tree sequence is written to the path
    returned by ``get_output_trees_path``; ``summarise_usage`` is called last.

    Raises:
        exceptions.FileFormatError: If the samples file cannot be loaded as
            sample data. When the file loads as a tskit tree sequence
            instead, the message explains how to create sample data from it.
    """
    setup_logging(args)
    try:
        sample_data = tsinfer.SampleData.load(args.samples)
    except exceptions.FileFormatError as e:
        # Check if the user has tried to infer a tree sequence, a common basic mistake
        try:
            tskit.load(args.samples)
        except tskit.FileFormatError:
            raise e  # Re-raise the original error
        raise exceptions.FileFormatError(
            "Expecting a sample data file, not a tree sequence (you can create one "
            "via the Python function `tsinfer.SampleData.from_tree_sequence()`)."
        )
    # The sample data loaded above is reused; the file is not loaded twice.
    ts = tsinfer.infer(sample_data,
                       progress_monitor=args.progress,
                       num_threads=args.num_threads)
    output_trees = get_output_trees_path(args.output_trees, args.samples)
    logger.info("Writing output tree sequence to {}".format(output_trees))
    ts.dump(output_trees)
    summarise_usage()
Ejemplo n.º 21
0
 def test_random_data_inferred_no_simplify(self):
     """Verify an unsimplified inference from random data with two sample sets.

     The sample IDs of the inferred tree sequence are split into the first
     ID and all remaining IDs.
     """
     sample_data = self.get_random_data_example(num_sites=20, num_samples=3)
     inferred_ts = tsinfer.infer(sample_data, simplify=False)
     sample_ids = inferred_ts.samples()
     first_set, second_set = sample_ids[:1], sample_ids[1:]
     self.verify(inferred_ts, [first_set, second_set])
Ejemplo n.º 22
0
 def test_infer(self):
     """Validate a tree sequence inferred from a simulation with >1 site."""
     simulated = msprime.simulate(10, mutation_rate=1, random_seed=1)
     assert simulated.num_sites > 1
     sample_data = tsinfer.SampleData.from_tree_sequence(simulated)
     self.validate_ts(tsinfer.infer(sample_data))
Ejemplo n.º 23
0
import os
import sys

import msprime

sys.path.insert(0, os.path.abspath(".."))
import tsinfer  # noqa


# Simulate a small tree sequence and print its site count and first tree.
ts = msprime.simulate(5, mutation_rate=0.7, random_seed=10)
tree = ts.first()
print(ts.num_sites)
print(tree.draw(format="unicode"))

# Hand-written toy data: one (position, genotypes, alleles) entry per site,
# with one genotype per sample (five samples).
toy_sites = (
    (10, [0, 1, 0, 0, 0], ["A", "T"]),
    (12, [0, 0, 0, 1, 1], ["G", "C"]),
    (23, [0, 1, 1, 0, 0], ["C", "A"]),
    (37, [0, 1, 1, 0, 0], ["G", "C"]),
    (40, [0, 0, 0, 1, 1], ["A", "C"]),
    (50, [0, 1, 0, 0, 0], ["T", "G"]),
)
with tsinfer.SampleData(path="toy.samples") as sample_data:
    for position, genotypes, alleles in toy_sites:
        sample_data.add_site(position, genotypes, alleles)

print(sample_data)

# Infer a tree sequence from the toy data and draw every tree in it.
inferred_ts = tsinfer.infer(sample_data)
for tree in inferred_ts.trees():
    print(tree.draw(format="unicode"))

# One line per sample: its ID and haplotype, separated by a tab.
for sample_id, h in enumerate(inferred_ts.haplotypes()):
    print(sample_id, h, sep="\t")
Ejemplo n.º 24
0
    pickle.dump(M, f)

# Pickle list_of_root_and_kids into the output directory.
with open(os.path.join(write_loc, 'pickles_root_kids_list'), 'wb') as f:
    pickle.dump(list_of_root_and_kids, f)

## loading example
#with open(file_loc + 'pickled_pop_list.pickle', 'rb') as f:
#    pop_list = pickle.load(f)

########### inference on truncation part ###########

# ts infer

# Build tsinfer sample data from the truncated tree sequence.
sd = tsinfer.SampleData.from_tree_sequence(truncated_ts, use_times=False)

# Infer without simplification; simplification is done explicitly below.
ts_inferred = tsinfer.infer(sd, simplify=False)

# NOTE(review): going by the parameter names, this keeps all sites and unary
# nodes while simplifying - confirm against the tskit documentation.
ts_inferred = ts_inferred.simplify(filter_sites=False, keep_unary=True)

# Bare expression: has no effect in a script (presumably notebook residue).
ts_inferred
# Save the inferred tree sequence next to the pickles.
ts_inferred.dump(os.path.join(write_loc, 'inferred_tree.trees'))
#Out[43]: <tskit.trees.TreeSequence at 0x1ed55ca9710>

# Range bounds and counter used by the tree-drawing loop that follows:
# trees with X < i <= Y are displayed.
X = 18
Y = 20
i = 0
for tree in ts_inferred.trees():
    if i > X and i <= Y:
        display(
            SVG(tree.draw(height=800, width=2000, tree_height_scale='rank')))
        print("Tree {} covers [{:.2f}, {:.2f}); TMRCA = {:.4f}".format(
Ejemplo n.º 25
0
def evaluate_tsdate_accuracy(
    parameter,
    parameters_arr,
    node_mut=False,
    inferred=True,
    prior_distr="lognorm",
    progress=True,
):
    """Collect true and tsdate-estimated times while varying one parameter.

    For every value in ``parameters_arr`` and for random seeds 1 to 5, a tree
    sequence is simulated with msprime, mutations are regenerated with
    ``msprime.mutate``, the topology is optionally re-inferred with tsinfer,
    and the result is dated with both the "inside_outside" and the
    "maximization" method of ``tsdate.date``.

    Args:
        parameter: Name of the setting that each value overrides. The names
            handled specially are "sample_size", "length", "mutation_rate"
            and "timepoints"; any other name leaves all defaults in place.
        parameters_arr: Values to test; ``str(value)`` is the result key.
        node_mut: If true, compare the times of all non-sample nodes;
            otherwise compare the times of the nodes above each mutation.
        inferred: If true, date a simplified tsinfer-inferred tree sequence
            instead of the simulated one.
        prior_distr: Prior distribution name passed to
            ``tsdate.build_prior_grid`` (used only for "timepoints").
        progress: Whether to show the tqdm progress bar.

    Returns:
        ``(all_results, prior_distr, inferred, node_mut)`` where
        ``all_results[str(value)]`` holds the lists "io", "max" and
        "true_times", each with one array per random seed.

    Raises:
        ValueError: If both ``node_mut`` and ``inferred`` are true.
    """
    Ne = 10000
    if node_mut and inferred:
        raise ValueError(
            "cannot evaluate node accuracy on inferred tree sequence")
    mutation_rate = 1e-8
    recombination_rate = 1e-8
    all_results = {
        str(value): {key: [] for key in ("io", "max", "true_times")}
        for value in parameters_arr
    }

    random_seeds = range(1, 6)

    inferred_progress = "using tsinfer" if inferred else "true topology"
    if node_mut:
        node_mut_progress = "comparing true and estimated node times"
    else:
        node_mut_progress = "comparing true and estimated mutation times"
    description = ("Testing " + parameter + " " + inferred_progress +
                   ". Evaluation by " + node_mut_progress)

    for param in tqdm(
            parameters_arr,
            desc=description,
            total=len(parameters_arr),
            disable=not progress,
    ):
        # Resolve once per value which setting (if any) param overrides.
        sample_size = param if parameter == "sample_size" else 100
        length = param if parameter == "length" else 1e6
        # Rate used both to regenerate mutations and to date the result.
        tested_mutation_rate = (
            param if parameter == "mutation_rate" else mutation_rate)

        for random_seed in random_seeds:
            # Simulate once with the final length; a separate default-length
            # simulation would be discarded whenever "length" is tested.
            ts = msprime.simulate(
                sample_size=sample_size,
                Ne=Ne,
                length=length,
                mutation_rate=mutation_rate,
                recombination_rate=recombination_rate,
                random_seed=random_seed,
            )
            mutated_ts = msprime.mutate(ts,
                                        rate=tested_mutation_rate,
                                        random_seed=random_seed)
            if inferred:
                sample_data = tsinfer.formats.SampleData.from_tree_sequence(
                    mutated_ts, use_times=False)
                target_ts = tsinfer.infer(sample_data).simplify()
            else:
                target_ts = mutated_ts

            # Arguments shared by both dating methods.
            date_kwargs = {
                "mutation_rate": tested_mutation_rate,
                "Ne": Ne,
                "progress": False,
            }
            if parameter == "timepoints":
                date_kwargs["prior"] = tsdate.build_prior_grid(
                    target_ts,
                    timepoints=param,
                    approximate_prior=True,
                    prior_distribution=prior_distr,
                    progress=False,
                )
            io_dated = tsdate.date(
                target_ts, method="inside_outside", **date_kwargs)
            max_dated = tsdate.date(
                target_ts, method="maximization", **date_kwargs)

            results = all_results[str(param)]
            if node_mut and not inferred:
                # Times of every node after the samples.
                first_non_sample = ts.num_samples
                results["true_times"].append(
                    mutated_ts.tables.nodes.time[first_non_sample:])
                results["io"].append(
                    io_dated.tables.nodes.time[first_non_sample:])
                results["max"].append(
                    max_dated.tables.nodes.time[first_non_sample:])
            else:
                # Time of the node above each mutation.
                results["true_times"].append(
                    mutated_ts.tables.nodes.time[
                        mutated_ts.tables.mutations.node])
                results["io"].append(
                    io_dated.tables.nodes.time[io_dated.tables.mutations.node])
                results["max"].append(
                    max_dated.tables.nodes.time[
                        max_dated.tables.mutations.node])

    return all_results, prior_distr, inferred, node_mut
Ejemplo n.º 26
0
 def test_tsinfer_output(self, small_sd_fixture):
     """``check_ancestors_ts`` must reject a full tsinfer output."""
     inferred_ts = tsinfer.infer(small_sd_fixture)
     with pytest.raises(ValueError):
         tsinfer.check_ancestors_ts(inferred_ts)
Ejemplo n.º 27
0
 def test_inferred_no_simplify(self, medium_sd_fixture):
     """Run ``self.verify`` on an unsimplified inference of the fixture."""
     inferred_ts = tsinfer.infer(medium_sd_fixture, simplify=False)
     self.verify(inferred_ts)
Ejemplo n.º 28
0
 def test_inferred(self, medium_sd_fixture):
     """Run ``self.verify`` on a default inference of the fixture."""
     inferred_ts = tsinfer.infer(medium_sd_fixture)
     self.verify(inferred_ts)
Ejemplo n.º 29
0
 def test_random_data_inferred(self):
     """Verify an inference from random data with two equal-sized sample sets.

     The sample IDs of the inferred tree sequence are split at the midpoint
     of the 20 requested samples.
     """
     n = 20
     sample_data = self.get_random_data_example(num_sites=52, num_samples=n)
     inferred_ts = tsinfer.infer(sample_data)
     sample_ids = inferred_ts.samples()
     half = n // 2
     self.verify(inferred_ts, [sample_ids[:half], sample_ids[half:]])
Ejemplo n.º 30
0
 def test_tsinfer_output(self):
     """``check_ancestors_ts`` must reject a full tsinfer output."""
     simulated = msprime.simulate(10, mutation_rate=1, random_seed=1)
     sample_data = tsinfer.SampleData.from_tree_sequence(simulated)
     inferred_ts = tsinfer.infer(sample_data)
     with self.assertRaises(ValueError):
         tsinfer.check_ancestors_ts(inferred_ts)