def visualise(ts, recombination_rate, error_rate, engine="C", box_size=8,
              perfect_ancestors=False, path_compression=False, time_chunking=False):
    # Re-infer `ts` with tsinfer, draw the copying paths used during matching,
    # then print tree-pair comparisons between the source and inferred ts.
    # NOTE(review): recombination_rate and error_rate are accepted but never
    # used in this body -- presumably kept for signature parity; confirm.
    sample_data = tsinfer.SampleData.from_tree_sequence(ts)
    if perfect_ancestors:
        # Build the "true" ancestors directly from the source tree sequence
        # instead of inferring them from the sample genotypes.
        ancestor_data = tsinfer.AncestorData(sample_data)
        tsinfer.build_simulated_ancestors(
            sample_data, ancestor_data, ts, time_chunking=time_chunking)
        ancestor_data.finalise()
    else:
        ancestor_data = tsinfer.generate_ancestors(sample_data, engine=engine)
    ancestors_ts = tsinfer.match_ancestors(
        sample_data, ancestor_data, engine=engine,
        path_compression=path_compression, extended_checks=True)
    # First match without simplifying so the visualiser can draw every node.
    inferred_ts = tsinfer.match_samples(
        sample_data, ancestors_ts, engine=engine, simplify=False,
        path_compression=path_compression, extended_checks=True)
    prefix = "tmp__NOBACKUP__/"
    visualiser = Visualiser(
        ts, sample_data, ancestor_data, inferred_ts, box_size=box_size)
    visualiser.draw_copying_paths(os.path.join(prefix, "copying_{}.png"))
    # tsinfer.print_tree_pairs(ts, inferred_ts, compute_distances=False)
    # Re-match with simplification and stable node ordering before computing
    # tree distances.
    inferred_ts = tsinfer.match_samples(
        sample_data, ancestors_ts, engine=engine, simplify=True,
        path_compression=False, stabilise_node_ordering=True)
    tsinfer.print_tree_pairs(ts, inferred_ts, compute_distances=True)
    sys.stdout.flush()
    print("num_sites = ", inferred_ts.num_sites,
          "num_mutations= ", inferred_ts.num_mutations)
    for site in inferred_ts.sites():
        if len(site.mutations) > 1:
            print("Multiple mutations at ", site.id, "over",
                  [mut.node for mut in site.mutations])
def setUp(self):
    # Test fixture: simulate a small tree sequence and run the full tsinfer
    # pipeline, writing every intermediate artefact into a temp directory so
    # the CLI tests have real input files to operate on.
    self.tempdir = tempfile.TemporaryDirectory(prefix="tsinfer_cli_test")
    self.sample_file = str(
        pathlib.Path(self.tempdir.name, "input-data.samples"))
    self.ancestor_file = str(
        pathlib.Path(self.tempdir.name, "input-data.ancestors"))
    self.ancestor_trees = str(
        pathlib.Path(self.tempdir.name, "input-data.ancestors.trees"))
    self.output_trees = str(
        pathlib.Path(self.tempdir.name, "input-data.trees"))
    self.input_ts = msprime.simulate(
        10, mutation_rate=10, recombination_rate=10, random_seed=10)
    sample_data = tsinfer.SampleData(
        sequence_length=self.input_ts.sequence_length, path=self.sample_file)
    for var in self.input_ts.variants():
        sample_data.add_site(var.site.position, var.genotypes, var.alleles)
    sample_data.finalise()
    # Small chunk_size to exercise the chunked-storage code paths.
    tsinfer.generate_ancestors(
        sample_data, path=self.ancestor_file, chunk_size=10)
    ancestor_data = tsinfer.load(self.ancestor_file)
    ancestors_ts = tsinfer.match_ancestors(sample_data, ancestor_data)
    ancestors_ts.dump(self.ancestor_trees)
    ts = tsinfer.match_samples(sample_data, ancestors_ts)
    ts.dump(self.output_trees)
    sample_data.close()
def infer(self, ts, method, path_compression=False):
    # Round-trip `ts` through tsinfer using perfect ("simulated") ancestors
    # built from the source topology; returns the simplified inferred ts.
    sample_data = tsinfer.SampleData.initialise(
        num_samples=ts.num_samples, sequence_length=ts.sequence_length,
        compressor=None)
    for v in ts.variants():
        sample_data.add_variant(v.site.position, v.alleles, v.genotypes)
    sample_data.finalise()
    ancestor_data = tsinfer.AncestorData.initialise(sample_data, compressor=None)
    tsinfer.build_simulated_ancestors(sample_data, ancestor_data, ts)
    ancestor_data.finalise()
    ancestors_ts = tsinfer.match_ancestors(
        sample_data, ancestor_data, method=method,
        path_compression=path_compression, extended_checks=True)
    inferred_ts = tsinfer.match_samples(
        sample_data, ancestors_ts, method=method, simplify=True,
        path_compression=path_compression, extended_checks=True)
    return inferred_ts
def run_match_samples(sample_data, ancestors_ts, num_threads):
    """Match samples against an ancestors tree sequence, showing progress.

    Simplification is deliberately disabled so the raw matched copying
    paths are preserved in the returned tree sequence.
    """
    monitor = tsinfer.cli.ProgressMonitor(enabled=True, match_samples=True)
    return tsinfer.match_samples(
        sample_data,
        ancestors_ts,
        num_threads=num_threads,
        simplify=False,
        progress_monitor=monitor,
    )
def verify_from_source(self, remove_leaves):
    """Check that an ancestors ts derived from a source ts verifies cleanly.

    Runs the sample-matching step under both the Python and C engines and
    verifies each result against the sample data.
    """
    source_ts = msprime.simulate(
        15, recombination_rate=1, mutation_rate=2, random_seed=3)
    samples = tsinfer.SampleData.from_tree_sequence(source_ts)
    ancestors_ts = tsinfer.make_ancestors_ts(
        samples, source_ts, remove_leaves=remove_leaves)
    tsinfer.check_ancestors_ts(ancestors_ts)
    for engine in (tsinfer.PY_ENGINE, tsinfer.C_ENGINE):
        matched_ts = tsinfer.match_samples(samples, ancestors_ts, engine=engine)
        tsinfer.verify(samples, matched_ts)
def main():
    # Simple command-line driver: load a samples file, infer a tree sequence
    # (optionally injecting known ancestors from a source ts) and dump it.
    # NOTE(review): original was whitespace-collapsed; the line breaks inside
    # the description string are reconstructed -- confirm against history.
    description = """Simple CLI wrapper for tsinfer
        tskit version: {}
        tsinfer version: {}""".format(tskit.__version__, tsinfer.__version__)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--verbosity', '-v', action='count', default=0)
    parser.add_argument(
        "samples",
        help="The samples file name, as saved by tsinfer.SampleData.initialise()")
    parser.add_argument(
        "output",
        help="The path to write the output file to")
    parser.add_argument(
        "-l", "--length", default=None, type=int,
        help="The total sequence length")
    parser.add_argument(
        "-t", "--threads", default=1, type=int,
        help="The number of worker threads to use")
    parser.add_argument(
        "-m", "--method", default="C", choices=['C','P'],
        help="Which implementation to use, [C] (faster) or [P]ython (more debuggable)")
    parser.add_argument(
        "--inject-real-ancestors-from-ts", default=None,
        help="Instead of inferring ancestors, construct known ones from this tree sequence file path")
    parser.add_argument(
        "-V", "--version", action='version', version=description)
    args = parser.parse_args()
    engine = tsinfer.PY_ENGINE if args.method == "P" else tsinfer.C_ENGINE
    if not os.path.isfile(args.samples):
        raise ValueError("No samples file")
    sample_data = tsinfer.load(args.samples)
    # genotypes() is a generator: this checks it yields at least one site.
    if all(False for _ in sample_data.genotypes(inference_sites=True)):
        raise ValueError("No inference sites")
    if args.inject_real_ancestors_from_ts is not None:
        # Build known ancestors from the supplied source tree sequence
        # instead of inferring them from the genotypes.
        ancestor_data = tsinfer.AncestorData.initialise(sample_data, compressor=None)
        orig_ts = tskit.load(args.inject_real_ancestors_from_ts)
        eval_util.build_simulated_ancestors(sample_data, ancestor_data, orig_ts)
        ancestor_data.finalise()
        ancestors_ts = tsinfer.match_ancestors(
            sample_data, ancestor_data, engine=engine)
        ts = tsinfer.match_samples(
            sample_data, ancestors_ts, engine=engine, simplify=True)
    else:
        ts = tsinfer.infer(
            sample_data, num_threads=args.threads, engine=engine)
    ts.dump(args.output)
def run_match_samples(args):
    """CLI entry point: match samples against a stored ancestors ts and dump."""
    setup_logging(args)
    samples = tsinfer.SampleData.load(args.input)
    ancestors_path = get_ancestors_ts(args.ancestors_ts, args.input)
    output_path = get_output_ts(args.output_ts, args.input)
    logger.info("Loading ancestral genealogies from {}".format(ancestors_path))
    ancestors = msprime.load(ancestors_path)
    use_path_compression = not args.no_path_compression
    result_ts = tsinfer.match_samples(
        samples,
        ancestors,
        num_threads=args.num_threads,
        path_compression=use_path_compression,
        progress=args.progress,
    )
    logger.info("Writing output tree sequence to {}".format(output_path))
    result_ts.dump(output_path)
def match_samples(samples_fn, inferred_anc_ts, num_threads, r_prob, m_prob, precision, prefix):
    """Match the samples in `samples_fn` against `inferred_anc_ts`.

    The unsimplified result is dumped to ``<prefix>.nosimplify.trees``
    and returned.
    """
    loaded_samples = tsinfer.load(samples_fn)
    matching_kwargs = dict(
        num_threads=num_threads,
        recombination=r_prob,
        mismatch=m_prob,
        precision=precision,
        progress_monitor=True,
        force_sample_times=True,
        simplify=False,
    )
    result_ts = tsinfer.match_samples(
        loaded_samples, inferred_anc_ts, **matching_kwargs)
    result_ts.dump(prefix + ".nosimplify.trees")
    return result_ts
def infer_with_mismatch(
    sample_data,
    path_to_genetic_map,
    ma_mismatch=1,
    ms_mismatch=1,
    precision=15,
    num_threads=1,
    path_compression=True,
    progress_monitor=False,
):
    # Full inference pipeline using a HapMap-format genetic map to derive
    # per-site recombination probabilities, with a constant per-site mismatch
    # probability derived from the median genetic distance.
    # NOTE(review): ma_mismatch and ms_mismatch are accepted but unused; the
    # mismatch array is built with a fixed ratio of 1 -- confirm intended.
    ancestors = tsinfer.generate_ancestors(
        sample_data, num_threads=num_threads, progress_monitor=progress_monitor
    )
    gmap = msprime.RateMap.read_hapmap(
        path_to_genetic_map, sequence_length=ancestors.sequence_length
    )
    # Genetic distances between adjacent inference sites, from the rate map.
    genetic_dists = tsinfer.Matcher.recombination_rate_to_dist(
        gmap, ancestors.sites_position[:]
    )
    recombination = tsinfer.Matcher.recombination_dist_to_prob(genetic_dists)
    # Zero probabilities break the matcher's likelihood computation; floor them.
    recombination[recombination == 0] = 1e-20
    mismatch = np.full(
        len(ancestors.sites_position[:]),
        tsinfer.Matcher.mismatch_ratio_to_prob(1, np.median(genetic_dists), 2),
    )
    ancestors_ts = tsinfer.match_ancestors(
        sample_data,
        ancestors,
        recombination=recombination,
        mismatch=mismatch,
        precision=precision,
        num_threads=num_threads,
        path_compression=path_compression,
        progress_monitor=progress_monitor,
    )
    return tsinfer.match_samples(
        sample_data,
        ancestors_ts,
        recombination=recombination,
        mismatch=mismatch,
        precision=precision,
        num_threads=num_threads,
        path_compression=path_compression,
        progress_monitor=progress_monitor,
    )
def tsinfer_dev(n, L, seed, num_threads=1, recombination_rate=1e-8, error_rate=0,
                method="C", log_level="WARNING", debug=True, progress=False,
                path_compression=True):
    # Development driver using the older "method"/initialise-based tsinfer API:
    # simulate with msprime, add (possibly error-prone) genotypes, infer, and
    # report the number of inferred edges.
    # NOTE(review): num_threads, log_level, progress and path_compression are
    # accepted but unused in this body.
    np.random.seed(seed)
    random.seed(seed)
    # L is given in megabases.
    L_megabases = int(L * 10**6)
    # daiquiri.setup(level=log_level)
    ts = msprime.simulate(
        n, Ne=10**4, length=L_megabases,
        recombination_rate=recombination_rate, mutation_rate=1e-8,
        random_seed=seed)
    if debug:
        print("num_sites = ", ts.num_sites)
    assert ts.num_sites > 0
    # Genotype matrix with simulated sequencing error applied.
    G = generate_samples(ts, error_rate)
    sample_data = tsinfer.SampleData.initialise(
        num_samples=ts.num_samples, sequence_length=ts.sequence_length)
    for site, genotypes in zip(ts.sites(), G):
        sample_data.add_variant(site.position, ["0", "1"], genotypes)
    sample_data.finalise()
    ancestor_data = tsinfer.AncestorData.initialise(sample_data)
    tsinfer.build_ancestors(sample_data, ancestor_data, method=method)
    ancestor_data.finalise()
    print(ancestor_data)
    ancestors_ts = tsinfer.match_ancestors(
        sample_data, ancestor_data, method=method)
    output_ts = tsinfer.match_samples(
        sample_data, ancestors_ts, method=method)
    print("inferred_num_edges = ", output_ts.num_edges)
def run_infer(args):
    """CLI entry point: run the complete inference pipeline on a samples file."""
    setup_logging(args)
    samples = tsinfer.SampleData.load(args.input)
    ancestors = tsinfer.AncestorData.initialise(samples)
    tsinfer.build_ancestors(samples, ancestors, progress=args.progress)
    ancestors.finalise()
    ancestors_ts = tsinfer.match_ancestors(
        samples,
        ancestors,
        num_threads=args.num_threads,
        progress=args.progress,
    )
    destination = get_output_ts(args.output_ts, args.input)
    final_ts = tsinfer.match_samples(
        samples,
        ancestors_ts,
        num_threads=args.num_threads,
        progress=args.progress,
    )
    logger.info("Writing output tree sequence to {}".format(destination))
    final_ts.dump(destination)
def run_match_samples(args):
    """CLI entry point: match samples to existing ancestors trees and dump."""
    setup_logging(args)
    samples = tsinfer.SampleData.load(args.samples)
    ancestors_path = get_ancestors_trees_path(args.ancestors_trees, args.samples)
    output_path = get_output_trees_path(args.output_trees, args.samples)
    logger.info(f"Loading ancestral genealogies from {ancestors_path}")
    ancestors_ts = tskit.load(ancestors_path)
    matched_ts = tsinfer.match_samples(
        samples,
        ancestors_ts,
        num_threads=args.num_threads,
        path_compression=not args.no_path_compression,
        simplify=not args.no_simplify,
        progress_monitor=args.progress,
    )
    logger.info(f"Writing output tree sequence to {output_path}")
    matched_ts.dump(output_path)
    summarise_usage()
def verify_inserted_ancestors(self, ts):
    # Verifies that we can round-trip the specified tree sequence
    # using the generated ancestors. NOTE: this must be an SMC
    # consistent tree sequence!
    sample_data = formats.SampleData.initialise(
        num_samples=ts.num_samples, sequence_length=ts.sequence_length,
        compressor=None)
    for v in ts.variants():
        sample_data.add_variant(v.position, v.alleles, v.genotypes)
    sample_data.finalise()
    ancestor_data = formats.AncestorData.initialise(sample_data, compressor=None)
    tsinfer.build_simulated_ancestors(sample_data, ancestor_data, ts)
    ancestor_data.finalise()
    # Reconstruct the dense ancestral haplotype matrix A (sites x ancestors)
    # from the ragged per-ancestor storage: ancestor j covers the half-open
    # site interval [start[j], end[j]).
    A = np.zeros(
        (ancestor_data.num_sites, ancestor_data.num_ancestors), dtype=np.uint8)
    start = ancestor_data.start[:]
    end = ancestor_data.end[:]
    ancestors = ancestor_data.ancestor[:]
    for j in range(ancestor_data.num_ancestors):
        A[start[j]:end[j], j] = ancestors[j]
    # Check both the Python and C matching engines.
    for method in ["P", "C"]:
        ancestors_ts = tsinfer.match_ancestors(
            sample_data, ancestor_data, method=method)
        self.assertEqual(ancestor_data.num_sites, ancestors_ts.num_sites)
        self.assertEqual(ancestor_data.num_ancestors, ancestors_ts.num_samples)
        # The ancestors ts must reproduce the ancestral haplotypes exactly.
        self.assertTrue(np.array_equal(ancestors_ts.genotype_matrix(), A))
        inferred_ts = tsinfer.match_samples(
            sample_data, ancestors_ts, method=method)
        # Round-trip: inferred genotypes must equal the source genotypes.
        self.assertTrue(
            np.array_equal(inferred_ts.genotype_matrix(), ts.genotype_matrix()))
def run(params):
    """
    Run a single inference, with the specified rates
    """
    # Per-site recombination probabilities; the first entry is dropped
    # (no recombination possible before the first site).
    rho = params.rec_rate[1:]
    base_rec_prob = np.quantile(rho, 0.5)
    # Relative mismatch rates are fixed at 1 in this variant of the script.
    ma_mis_rate = ms_mis_rate = 1.0
    if params.precision is None:
        # Smallest recombination rate
        min_rho = int(np.ceil(-np.min(np.log10(rho))))
        # Smallest mean
        av_min = int(
            np.ceil(
                -np.log10(min(1, ma_mis_rate, ms_mis_rate) * base_rec_prob)))
        precision = max(min_rho, av_min) + 3
    else:
        precision = params.precision
    # Absolute mismatch rates, scaled from the median recombination prob.
    ma_mis = base_rec_prob * ma_mis_rate
    ms_mis = base_rec_prob * ms_mis_rate
    print(
        f"Starting {params.cutoff_power}, trim_oldest={params.trim_oldest}",
        f"with base rho {base_rec_prob:.5g}",
        f"(mean {np.mean(rho):.4g} median {np.quantile(rho, 0.5):.4g}",
        f"min {np.min(rho):.4g}, 2.5% quantile {np.quantile(rho, 0.025):.4g})",
        f"precision {precision}")
    prefix = None
    if params.sample_data.path is not None:
        assert params.sample_data.path.endswith(".samples")
        prefix = params.sample_data.path[0:-len(".samples")]
    # NOTE(review): if prefix is None this embeds the literal "None" in the
    # output filenames -- confirm samples are always file-backed here.
    inf_prefix = "{}_rma{}_rms{}_N{}_{}_p{}".format(
        prefix, ma_mis_rate, ms_mis_rate, params.cutoff_power,
        "trim" if params.trim_oldest else "norm", precision)
    start_time = time.process_time()
    anc = tsinfer.generate_ancestors(
        params.sample_data,
        cutoff_power=params.cutoff_power,
        trim_oldest=params.trim_oldest,
        num_threads=params.num_threads,
        path=None if inf_prefix is None else inf_prefix + ".ancestors",
    )
    print(f"GA done (rel_ma_mis:{ma_mis_rate}, rel_ms_mis:{ms_mis_rate})")
    inferred_anc_ts = tsinfer.match_ancestors(
        params.sample_data,
        anc,
        num_threads=params.num_threads,
        precision=precision,
        recombination_rate=params.rec_rate,
        mismatch_rate=ma_mis,
    )
    inferred_anc_ts.dump(path=inf_prefix + ".atrees")
    print(f"MA done: abs_ma_mis rate = {ma_mis}")
    inferred_ts = tsinfer.match_samples(
        params.sample_data, inferred_anc_ts,
        num_threads=params.num_threads,
        precision=precision,
        recombination_rate=params.rec_rate,
        mismatch_rate=ms_mis)
    process_time = time.process_time() - start_time
    ts_path = inf_prefix + ".trees"
    inferred_ts.dump(path=ts_path)
    print(f"MS done: abs_ms_mis rate = {ms_mis}")
    simplified_inferred_ts = inferred_ts.simplify()  # Remove unary nodes
    # Calculate mean num children (polytomy-measure) for internal nodes
    nc_sum = 0
    nc_sum_sq = 0
    nc_tot = 0
    # NOTE(review): root_lengths is never used below -- leftover scaffolding.
    root_lengths = collections.defaultdict(float)
    for tree in simplified_inferred_ts.trees():
        for n in tree.nodes():
            n_children = tree.num_children(n)
            if n_children > 0:  # exclude leaves/samples
                # Weight each node's child count by the tree's genomic span.
                nc_sum += n_children * tree.span
                nc_sum_sq += (n_children**2) * tree.span
                nc_tot += tree.span
    nc_mean = nc_sum / nc_tot
    nc_var = nc_sum_sq / nc_tot - (nc_mean**2)  # can't be bothered to adjust for n
    # Calculate span of root nodes in simplified tree
    # Calculate KC
    try:
        kc = simplified_inferred_ts.kc_distance(tskit.load(prefix + ".trees"))
    except FileNotFoundError:
        # No simulated "truth" ts alongside the samples file.
        kc = None
    return Results(
        abs_ma_mis=ma_mis, abs_ms_mis=ms_mis,
        rel_ma_mis=ma_mis_rate, rel_ms_mis=ms_mis_rate,
        cutoff_power=params.cutoff_power, trim_oldest=params.trim_oldest,
        precision=precision,
        edges=inferred_ts.num_edges,
        muts=inferred_ts.num_mutations,
        num_trees=inferred_ts.num_trees,
        kc=kc,
        mean_node_children=nc_mean,
        var_node_children=nc_var,
        process_time=process_time,
        ts_size=os.path.getsize(ts_path),
        ts_path=ts_path)
def run(params):
    """
    Run a single inference, with the specified rates.

    Reuses any on-disk intermediates (.ancestors / .atrees) left by a
    previous run, but raises if the final output already exists. Returns a
    Results record with per-stage timings and summary statistics.
    """
    prefix = None
    if params.sample_data.path is not None:
        assert params.sample_data.path.endswith(".samples")
        prefix = params.sample_data.path[0:-len(".samples")]
    start_time = time.process_time()
    ga_start_time = time.process_time()
    # Generate ancestors, unless a previous run already saved them.
    # (Fix: compare truthiness with `not`, not `== False` -- PEP 8 E712.)
    if not os.path.isfile(prefix + ".ancestors"):
        anc = tsinfer.generate_ancestors(
            params.sample_data,
            num_threads=params.num_threads,
            path=prefix + ".ancestors",
            progress_monitor=tsinfer.cli.ProgressMonitor(1, 1, 0, 0, 0),
        )
        print(
            f"GA done (ma_mut: {params.ma_mut_rate}, ms_mut: {params.ms_mut_rate})"
        )
    else:
        anc = tsinfer.load(prefix + ".ancestors")
    ga_process_time = time.process_time() - ga_start_time
    anc_w_proxy = anc.insert_proxy_samples(params.sample_data, allow_mutation=True)
    # If any proxy ancestors were added, save the proxy ancestors file and use for matching
    if anc_w_proxy.num_ancestors != anc.num_ancestors:
        anc = anc_w_proxy.copy(path=prefix + ".proxy.ancestors")
        anc.finalise()
        # Path compression is disabled when proxy ancestors are present.
        path_compression = False
    else:
        path_compression = True
    rec_rate = get_rho(anc, params.filename)
    # Drop the leading entry: no recombination before the first site.
    rho = rec_rate[1:]
    base_rec_prob = np.quantile(rho, 0.5)
    if params.precision is None:
        # Smallest recombination rate
        min_rho = int(np.ceil(-np.min(np.log10(rho))))
        # Smallest mean
        av_min = int(
            np.ceil(-np.log10(
                min(1, params.ma_mut_rate, params.ms_mut_rate) * base_rec_prob)))
        precision = max(min_rho, av_min) + 3
    else:
        precision = params.precision
    print(
        f"Starting {params.ma_mut_rate} {params.ms_mut_rate}",
        f"with base rho {base_rec_prob:.5g}",
        f"(mean {np.mean(rho):.4g} median {np.quantile(rho, 0.5):.4g}",
        f"min {np.min(rho):.4g}, 2.5% quantile {np.quantile(rho, 0.025):.4g})",
        f"precision {precision}")
    ma_start_time = time.process_time()
    # Match ancestors, unless a previous run already saved the ancestors ts.
    if not os.path.isfile(prefix + ".atrees"):
        inferred_anc_ts = tsinfer.match_ancestors(
            params.sample_data,
            anc,
            num_threads=params.num_threads,
            precision=precision,
            recombination_rate=rec_rate,
            mismatch_rate=base_rec_prob * params.ma_mut_rate,
            path_compression=path_compression,
            progress_monitor=tsinfer.cli.ProgressMonitor(1, 0, 1, 0, 0),
        )
        inferred_anc_ts.dump(path=prefix + ".atrees")
        print(
            f"MA done (ma_mut:{params.ma_mut_rate} ms_mut{params.ms_mut_rate})"
        )
    else:
        inferred_anc_ts = tskit.load(prefix + ".atrees")
    ma_process_time = time.process_time() - ma_start_time
    ms_start_time = time.process_time()
    # NOTE(review): the guard checks ".trees" but the output is dumped to
    # ".nosimplify.trees" -- confirm this asymmetry is intended.
    if not os.path.isfile(prefix + ".trees"):
        inferred_ts = tsinfer.match_samples(
            params.sample_data,
            inferred_anc_ts,
            num_threads=params.num_threads,
            precision=precision,
            recombination_rate=rec_rate,
            mismatch_rate=base_rec_prob * params.ms_mut_rate,
            progress_monitor=tsinfer.cli.ProgressMonitor(1, 0, 0, 0, 1),
            force_sample_times=True,
            simplify=False)
        print(f"MS done: ms_mut rate = {params.ms_mut_rate})")
        process_time = time.process_time() - start_time
        ms_process_time = time.process_time() - ms_start_time
        ts_path = prefix + ".nosimplify.trees"
        inferred_ts.dump(path=ts_path)
    else:
        raise ValueError("Inferred tree sequence already present")
    return Results(
        ma_mut=params.ma_mut_rate,
        ms_mut=params.ms_mut_rate,
        precision=precision,
        edges=inferred_ts.num_edges,
        muts=inferred_ts.num_mutations,
        num_trees=inferred_ts.num_trees,
        process_time=process_time,
        ga_process_time=ga_process_time,
        ma_process_time=ma_process_time,
        ms_process_time=ms_process_time,
        ts_size=os.path.getsize(ts_path),
        ts_path=ts_path)
def tsinfer_dev(
    n,
    L,
    seed,
    num_threads=1,
    recombination_rate=1e-8,
    error_rate=0,
    engine="C",
    log_level="WARNING",
    precision=None,
    debug=True,
    progress=False,
    path_compression=True,
):
    # Development driver: simulate data with msprime, run the full tsinfer
    # pipeline with explicit recombination/mutation probabilities, and verify
    # the result against the sample data.
    # NOTE(review): error_rate, log_level, progress and path_compression are
    # accepted but unused in this body.
    np.random.seed(seed)
    random.seed(seed)
    # L is given in megabases.
    L_megabases = int(L * 10**6)
    # daiquiri.setup(level=log_level)
    ts = msprime.simulate(
        n,
        Ne=10**4,
        length=L_megabases,
        recombination_rate=recombination_rate,
        mutation_rate=1e-8,
        random_seed=seed,
    )
    if debug:
        print("num_sites = ", ts.num_sites)
    assert ts.num_sites > 0
    # ts = msprime.mutate(ts, rate=1e-8, random_seed=seed,
    #         model=msprime.InfiniteSites(msprime.NUCLEOTIDES))
    samples = tsinfer.SampleData.from_tree_sequence(ts)
    # Constant per-site recombination and mutation probabilities for matching.
    rho = recombination_rate
    mu = 1e-3  # 1e-15
    # num_alleles = samples.num_alleles(inference_sites=True)
    # num_sites = samples.num_inference_sites
    # with tsinfer.AncestorData(samples) as ancestor_data:
    #     t = np.sum(num_alleles) + 1
    #     for j in range(num_sites):
    #         for allele in range(num_alleles[j]):
    #             ancestor_data.add_ancestor(j, j + 1, t, [j], [allele])
    #             t -= 1
    ancestor_data = tsinfer.generate_ancestors(
        samples, engine=engine, num_threads=num_threads)
    ancestors_ts = tsinfer.match_ancestors(
        samples,
        ancestor_data,
        engine=engine,
        path_compression=True,
        extended_checks=False,
        precision=precision,
        recombination_rate=rho,
        mutation_rate=mu,
    )
    # print(ancestors_ts.tables)
    # print("ancestors ts")
    # for tree in ancestors_ts.trees():
    #     print(tree.draw_text())
    #     for site in tree.sites():
    #         if len(site.mutations) > 1:
    #             print(site.id)
    #             for mutation in site.mutations:
    #                 print("\t", mutation.node, mutation.derived_state)
    # for var in ancestors_ts.variants():
    #     print(var.genotypes)
    # print(ancestors_ts.tables)
    # ancestors_ts = tsinfer.augment_ancestors(samples, ancestors_ts,
    #         [5, 6, 7], engine=engine)
    ts = tsinfer.match_samples(
        samples,
        ancestors_ts,
        recombination_rate=rho,
        mutation_rate=mu,
        path_compression=False,
        engine=engine,
        precision=precision,
        simplify=False,
    )
    print("num_edges = ", ts.num_edges)
    # # print(ts.draw_text())
    # for tree in ts.trees():
    #     print(tree.draw_text())
    #     for site in tree.sites():
    #         if len(site.mutations) > 1:
    #             print(site.id)
    #             for mutation in site.mutations:
    #                 print("\t", mutation.node, mutation.derived_state)
    # # print(ts.tables.edges)
    # print(ts.dump_tables())
    # simplified = ts.simplify()
    # print("edges before = ", simplified.num_edges)
    # new_ancestors_ts = insert_srb_ancestors(ts)
    # ts = tsinfer.match_samples(samples, new_ancestors_ts,
    #         path_compression=False, engine=engine,
    #         simplify=True)
    # for tree in ts.trees():
    #     print(tree.interval)
    #     print(tree.draw(format="unicode"))
    # print(ts.tables.edges)
    # for tree in ts.trees():
    #     print(tree.draw(format="unicode"))
    tsinfer.verify(samples, ts)
def visualise(ts, recombination_rate, error_rate, method="C", box_size=8,
              perfect_ancestors=False, path_compression=False, time_chunking=False):
    # Older "method"-API version of the copying-path visualiser: re-infer
    # `ts`, draw the copying paths, then print tree-pair distances.
    # NOTE(review): recombination_rate and error_rate are accepted but unused.
    sample_data = tsinfer.SampleData.initialise(
        num_samples=ts.num_samples, sequence_length=ts.sequence_length,
        compressor=None)
    for v in ts.variants():
        sample_data.add_variant(v.site.position, v.alleles, v.genotypes)
    sample_data.finalise()
    ancestor_data = tsinfer.AncestorData.initialise(sample_data, compressor=None)
    if perfect_ancestors:
        # Build the "true" ancestors from the source topology.
        tsinfer.build_simulated_ancestors(
            sample_data, ancestor_data, ts, time_chunking=time_chunking)
    else:
        tsinfer.build_ancestors(sample_data, ancestor_data, method=method)
    ancestor_data.finalise()
    ancestors_ts = tsinfer.match_ancestors(
        sample_data, ancestor_data, method=method,
        path_compression=path_compression, extended_checks=True)
    # Unsimplified match so the visualiser can draw every node.
    inferred_ts = tsinfer.match_samples(
        sample_data, ancestors_ts, method=method, simplify=False,
        path_compression=path_compression, extended_checks=True)
    prefix = "tmp__NOBACKUP__/"
    visualiser = Visualiser(
        ts, sample_data, ancestor_data, inferred_ts, box_size=box_size)
    visualiser.draw_copying_paths(os.path.join(prefix, "copying_{}.png"))
    # tsinfer.print_tree_pairs(ts, inferred_ts, compute_distances=False)
    # Re-match with simplification and stable node ordering for distances.
    inferred_ts = tsinfer.match_samples(
        sample_data, ancestors_ts, method=method, simplify=True,
        path_compression=False, stabilise_node_ordering=True)
    tsinfer.print_tree_pairs(ts, inferred_ts, compute_distances=True)
    sys.stdout.flush()
    print("num_sites = ", inferred_ts.num_sites,
          "num_mutations= ", inferred_ts.num_mutations)
    for site in inferred_ts.sites():
        if len(site.mutations) > 1:
            print("Multiple mutations at ", site.id, "over",
                  [mut.node for mut in site.mutations])
def run(params):
    """
    Run a single inference, with the specified rates.

    When ``params.skip_existing`` is set, existing .atrees/.trees files are
    reused instead of recomputed. The results dict is stored in the output
    tree sequence's metadata and returned.
    """
    precision = params.precision
    logger.info(
        f"Starting {params.ma_mis_ratio} {params.ms_mis_ratio}. Precision {precision}"
    )
    prefix = None
    assert params.sample_file.endswith(".samples")
    assert params.anc_file.endswith(".ancestors")
    samples = tsinfer.load(params.sample_file)
    ancestors = tsinfer.load(params.anc_file)
    start_time = time.process_time()
    prefix = params.sample_file[0:-len(".samples")]
    inf_prefix = "{}_rma{:g}_rms{:g}_p{}".format(
        prefix, params.ma_mis_ratio, params.ms_mis_ratio, precision)

    ats_path = inf_prefix + ".atrees"
    if params.skip_existing and os.path.exists(ats_path):
        logger.info(
            f"Ancestors ts file {ats_path} already exists, loading that.")
        inferred_anc_ts = tskit.load(ats_path)
        # The provenance record stores the UUID of the source ancestors file.
        prov = json.loads(inferred_anc_ts.provenances()[-1].record.encode())
        if ancestors.uuid != prov['parameters']['source']['uuid']:
            logger.warning(
                "The loaded ancestors ts does not match the ancestors file. "
                "Checking the site positions, and will abort if they don't match!"
            )
            # We might be re-running this, but the simulation file is the same
            # So double-check that the positions in the ats are a subset of those in the
            # used sample data file
            assert np.all(
                np.isin(inferred_anc_ts.tables.sites.position,
                        samples.sites_position[:]))
    else:
        logger.info(f"MA running: will save to {ats_path}")
        inferred_anc_ts = tsinfer.match_ancestors(
            samples,
            ancestors,
            num_threads=params.num_threads,
            precision=precision,
            recombination_rate=params.rec_rate,
            mismatch_ratio=params.ma_mis_ratio)
        inferred_anc_ts.dump(ats_path)
        logger.info(f"MA done: mismatch ratio = {params.ma_mis_ratio}")

    ts_path = inf_prefix + ".trees"
    if params.skip_existing and os.path.exists(ts_path):
        logger.info(
            f"Inferred ts file {ts_path} already exists, loading that.")
        inferred_ts = tskit.load(ts_path)
        try:
            user_data = inferred_ts.metadata['user_data']
            try:
                assert np.allclose(params.kc_max, user_data['kc_max'])
            except (KeyError, TypeError):
                pass  # could be NaN e.g. if this is real data
            return user_data
        except (TypeError, KeyError):
            # BUG FIX: the original passed a plain string (missing the `f`
            # prefix), logging the literal "{ts_path}" placeholder, and used
            # the root `logging` module instead of the module-level `logger`
            # used everywhere else in this function.
            logger.warning(
                f"No metadata in {ts_path}: re-inferring these parameters")
    # Otherwise finish off the inference
    logger.info(
        f"MS running with {params.num_threads} threads: will save to {ts_path}"
    )
    inferred_ts = tsinfer.match_samples(
        samples,
        inferred_anc_ts,
        num_threads=params.num_threads,
        precision=precision,
        recombination_rate=params.rec_rate,
        mismatch_ratio=params.ms_mis_ratio)
    process_time = time.process_time() - start_time
    logger.info(f"MS done: mismatch ratio = {params.ms_mis_ratio}")
    simplified_inferred_ts = inferred_ts.simplify()  # Remove unary nodes
    # Calculate mean num children (polytomy-measure) for internal nodes.
    # (Removed an unused `root_lengths` defaultdict left over from earlier.)
    nc_sum = 0
    nc_sum_sq = 0
    nc_tot = 0
    for tree in simplified_inferred_ts.trees():
        for n in tree.nodes():
            n_children = tree.num_children(n)
            if n_children > 0:  # exclude leaves/samples
                # Weight each node's child count by the tree's genomic span.
                nc_sum += n_children * tree.span
                nc_sum_sq += (n_children**2) * tree.span
                nc_tot += tree.span
    arity_mean = nc_sum / nc_tot
    arity_var = nc_sum_sq / nc_tot - (arity_mean**2)  # can't be bothered to adjust for n

    sim_ts_bytes = sim_ts_min_bytes = None
    kc_poly = kc_split = None
    if params.ts_file is not None:
        try:
            simulated_ts = tskit.load(params.ts_file + ".trees")
            logger.info(f"Calculating KC distances for {ts_path}")
            sim_ts_bytes = simulated_ts.nbytes
            sim_ts_min_bytes = simulated_ts.simplify(
                keep_unary=True, reduce_to_site_topology=True,
                filter_sites=False).nbytes
            kc_poly = simplified_inferred_ts.kc_distance(simulated_ts)
            logger.debug("KC poly calculated")
            # Span-weighted KC distance with polytomies randomly resolved
            # (seeded by the interval's left coordinate for determinism).
            kc_split = 0
            for interval, orig_tree, new_tree in simulated_ts.coiterate(
                    simplified_inferred_ts, sample_lists=True):
                kc_split += interval.span * orig_tree.kc_distance(
                    new_tree.split_polytomies(
                        random_seed=int(interval.left), sample_lists=True))
            kc_split /= simulated_ts.sequence_length
            logger.debug("KC split calculated")
        except FileNotFoundError:
            # No simulated "truth" ts available; leave KC stats as None.
            pass

    results = {
        'arity_mean': arity_mean,
        'arity_var': arity_var,
        'edges': inferred_ts.num_edges,
        'error': params.error,
        'kc_max_split': params.kc_max_split,
        'kc_max': params.kc_max,
        'kc_poly': kc_poly,
        'kc_split': kc_split,
        'muts': inferred_ts.num_mutations,
        'n': inferred_ts.num_samples,
        'num_sites': inferred_ts.num_sites,
        'num_trees': inferred_ts.num_trees,
        'precision': precision,
        'proc_time': process_time,
        'ma_mis_ratio': params.ma_mis_ratio,
        'ms_mis_ratio': params.ms_mis_ratio,
        'seed': params.seed,
        'sim_ts_min_bytes': sim_ts_min_bytes,
        'sim_ts_bytes': sim_ts_bytes,
        'source': params.source,
        'ts_bytes': inferred_ts.nbytes,
        'ts_path': ts_path,
    }
    # Save the results into the ts metadata - this should allow us to reconstruct the
    # results table should anything go awry, or if we need to add more
    tables = inferred_ts.dump_tables()
    if tables.metadata_schema != tskit.MetadataSchema({"codec": "json"}):
        if tables.metadata:
            raise RuntimeError(
                "Metadata already exists in the ts, and is not JSON")
        tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        tables.metadata = {}
    tables.metadata = {"user_data": results, **tables.metadata}
    tables.tree_sequence().dump(ts_path)
    return results
def run(params):
    """
    Run a single inference, with the specified rates
    """
    # Per-site recombination probabilities.
    rho = params.rec_rate
    av_rho = np.quantile(rho, 0.5)
    # Absolute mismatch rates, scaled from the median recombination prob.
    ma_mis = av_rho * params.ma_mis_rate
    ms_mis = av_rho * params.ms_mis_rate
    if params.precision is None:
        # Smallest nonzero recombination rate
        min_rho = int(np.ceil(-np.min(np.log10(rho[rho > 0]))))
        # Smallest mean
        av_min = int(np.ceil(-np.log10(min(ma_mis, ms_mis))))
        precision = max(min_rho, av_min) + 3
    else:
        precision = params.precision
    print(
        f"Starting {params.ma_mis_rate} {params.ms_mis_rate}",
        f"with av rho {av_rho:.5g}",
        f"(mean {np.mean(rho):.4g}, median {np.quantile(rho, 0.5):.4g}, ",
        f"nonzero min {np.min(rho[rho > 0]):.4g}, ",
        f"2.5% quantile {np.quantile(rho, 0.025):.4g}) precision {precision}")
    prefix = None
    if params.sample_data.path is not None:
        assert params.sample_data.path.endswith(".samples")
        prefix = params.sample_data.path[0:-len(".samples")]
    inf_prefix = "{}_ma{}_ms{}_N{}_p{}".format(
        prefix, params.ma_mis_rate, params.ms_mis_rate,
        params.cutoff_exponent, precision)
    start_time = time.process_time()
    extra_params = dict(num_threads=params.num_threads)
    if params.cutoff_exponent is not None:
        extra_params['cutoff_power'] = params.cutoff_exponent
    anc = tsinfer.generate_ancestors(
        params.sample_data,
        path=None if inf_prefix is None else inf_prefix + ".ancestors",
        progress_monitor=tsinfer.cli.ProgressMonitor(1, 1, 0, 0, 0),
        **extra_params,
    )
    # NOTE(review): the printed messages below are missing closing parens in
    # the literals -- cosmetic only, left untouched here.
    print(f"GA done (cutoff exponent: {params.cutoff_exponent}")
    # Rebind extra_params for the two matching stages.
    extra_params = dict(
        num_threads=params.num_threads,
        recombination_rate=rho,
        precision=precision,
    )
    inferred_anc_ts = tsinfer.match_ancestors(
        params.sample_data,
        anc,
        mismatch_rate=ma_mis,
        progress_monitor=tsinfer.cli.ProgressMonitor(1, 0, 1, 0, 0),
        **extra_params,
    )
    inferred_anc_ts.dump(path=inf_prefix + ".atrees")
    print(f"MA done (ma_mis:{ma_mis}")
    inferred_ts = tsinfer.match_samples(
        params.sample_data,
        inferred_anc_ts,
        mismatch_rate=ms_mis,
        progress_monitor=tsinfer.cli.ProgressMonitor(1, 0, 0, 0, 1),
        **extra_params,
    )
    process_time = time.process_time() - start_time
    ts_path = inf_prefix + ".trees"
    inferred_ts.dump(path=ts_path)
    print(f"MS done: ms_mis rate = {ms_mis})")
    simplified_inferred_ts = inferred_ts.simplify()  # Remove unary nodes
    # Calculate mean num children (polytomy-measure) for internal nodes
    nc_sum = 0
    nc_sum_sq = 0
    nc_tot = 0
    # NOTE(review): root_lengths is never used below -- leftover scaffolding.
    root_lengths = collections.defaultdict(float)
    for tree in simplified_inferred_ts.trees():
        for n in tree.nodes():
            n_children = tree.num_children(n)
            if n_children > 0:  # exclude leaves/samples
                # Weight each node's child count by the tree's genomic span.
                nc_sum += n_children * tree.span
                nc_sum_sq += (n_children**2) * tree.span
                nc_tot += tree.span
    nc_mean = nc_sum / nc_tot
    nc_var = nc_sum_sq / nc_tot - (nc_mean**2)  # can't be bothered to adjust for n
    # Calculate span of root nodes in simplified tree
    # Calculate KC
    try:
        kc = simplified_inferred_ts.kc_distance(tskit.load(prefix + ".trees"))
    except FileNotFoundError:
        # No simulated "truth" ts alongside the samples file.
        kc = None
    return Results(
        abs_ma_mis=ma_mis, abs_ms_mis=ms_mis,
        rel_ma_mis=params.ma_mis_rate, rel_ms_mis=params.ms_mis_rate,
        precision=precision,
        edges=inferred_ts.num_edges,
        muts=inferred_ts.num_mutations,
        num_trees=inferred_ts.num_trees,
        kc=kc,
        cutoff_exponent=params.cutoff_exponent,
        mean_node_children=nc_mean,
        var_node_children=nc_var,
        process_time=process_time,
        ts_size=os.path.getsize(ts_path),
        ts_path=ts_path)
def tsinfer_dev(n, L, seed, num_threads=1, recombination_rate=1e-8, error_rate=0,
                engine="C", log_level="WARNING", debug=True, progress=False,
                path_compression=True):
    # Development driver: simulate, infer, augment the ancestors ts with the
    # copying paths of a few samples, re-match, and verify the result.
    # NOTE(review): error_rate, log_level, progress and path_compression are
    # accepted but unused in this body.
    np.random.seed(seed)
    random.seed(seed)
    # L is given in megabases.
    L_megabases = int(L * 10**6)
    # daiquiri.setup(level=log_level)
    ts = msprime.simulate(
        n, Ne=10**4, length=L_megabases,
        recombination_rate=recombination_rate, mutation_rate=1e-8,
        random_seed=seed)
    if debug:
        print("num_sites = ", ts.num_sites)
    assert ts.num_sites > 0
    samples = tsinfer.SampleData.from_tree_sequence(ts)
    ancestor_data = tsinfer.generate_ancestors(
        samples, engine=engine, num_threads=num_threads)
    ancestors_ts = tsinfer.match_ancestors(
        samples, ancestor_data, engine=engine, path_compression=True,
        extended_checks=False)
    # Augment the ancestors ts with the copying paths of samples 5-7.
    ancestors_ts = tsinfer.augment_ancestors(
        samples, ancestors_ts, [5, 6, 7], engine=engine)
    ts = tsinfer.match_samples(
        samples, ancestors_ts, path_compression=False, engine=engine,
        simplify=True)
    # print(ts.tables.edges)
    # print(ts.dump_tables())
    # simplified = ts.simplify()
    # print("edges before = ", simplified.num_edges)
    # new_ancestors_ts = insert_srb_ancestors(ts)
    # ts = tsinfer.match_samples(samples, new_ancestors_ts,
    #         path_compression=False, engine=engine,
    #         simplify=True)
    # for tree in ts.trees():
    #     print(tree.interval)
    #     print(tree.draw(format="unicode"))
    # print(ts.tables.edges)
    # for tree in ts.trees():
    #     print(tree.draw(format="unicode"))
    tsinfer.verify(samples, ts)
alleles=v.alleles, genotypes=v.genotypes[ancient_sample_indices]) ancient_samples.finalise() #%% # Infer and date tree from modern samples primary_ts = ts.simplify(modern_sample_indices, filter_sites=False) primary_samples = tsinfer.SampleData.from_tree_sequence(primary_ts) ancestors = tsinfer.generate_ancestors(primary_samples) ancestors_ts = tsinfer.match_ancestors( primary_samples, ancestors) # This only has inference sites primary_inferred_ts = tsinfer.match_samples(primary_samples, ancestors_ts, simplify=False) primary_inferred_ts_simplified = primary_inferred_ts.simplify( np.where(primary_inferred_ts.tables.nodes.flags == 1)[0], keep_unary=True) tsdate.date(primary_inferred_ts_simplified, Ne=stable_pop_size, mutation_rate=2.5e-5) #%% # rest of inference- augmenting older samples in augment_samples = ancient_samples ## re-inserting older samples augment_samples = augment_samples.copy()