def run_benchmark_tskit(args):
    """
    Time loading, tree iteration and variant decoding for ``args.input``.

    Variant decoding stops once ``args.num_variants`` variants have been
    counted. All results are printed to stdout.
    """
    started = time.perf_counter()
    ts = msprime.load(args.input)
    elapsed = time.perf_counter() - started
    print("Loaded in {:.2f}s".format(elapsed))
    print("num_nodes = ", ts.num_nodes)
    print("num_edges = ", ts.num_edges)
    print("num_trees = ", ts.num_trees)
    file_size = os.path.getsize(args.input)
    print("size = ", humanize.naturalsize(file_size, binary=True))

    started = time.perf_counter()
    tree_count = sum(1 for _ in ts.trees(sample_counts=False))
    assert tree_count == ts.num_trees
    elapsed = time.perf_counter() - started
    print("Iterated over trees in {:.2f}s".format(elapsed))

    started = time.perf_counter()
    decoded = 0
    # As of msprime 0.6.1, it's a little bit more efficient to specify the full
    # samples and use the tree traversal based decoding algorithm than the full
    # sample-lists for UKBB trees. This'll be fixed in the future.
    for _variant in ts.variants(samples=ts.samples()):
        if decoded == args.num_variants:
            break
        decoded += 1
    elapsed = time.perf_counter() - started
    mega_genotypes = (ts.num_samples * decoded) / 10**6
    print("Iterated over {} variants in {:.2f}s @ {:.2f} M genotypes/s".format(
        decoded, elapsed, mega_genotypes / elapsed))
def run_snip_centromere(args):
    """
    Snip the topology across the centromere of ``args.chrom``.

    The centromere coordinates are looked up in the CSV file
    ``args.centromeres``; the result is written to ``args.output``.

    :raises ValueError: if no CSV row matches ``args.chrom``.
    """
    with open(args.centromeres) as csvfile:
        matching = next(
            (row for row in csv.DictReader(csvfile)
             if row["chrom"] == args.chrom),
            None)
        if matching is None:
            raise ValueError("Did not find row")
        start = int(matching["start"])
        end = int(matching["end"])

    ts = msprime.load(args.input)
    position = ts.tables.sites.position
    s_index = np.searchsorted(position, start)
    e_index = np.searchsorted(position, end)
    # We have a bunch of sites within the centromere. Get the largest
    # distance between these and call these the start and end. Probably
    # pointless having the centromere coordinates as input in the first place,
    # since we're just searching for the largest gap anyway. However, it can
    # be useful in UKBB, since it's perfectly possible that the largest
    # gap between sites isn't in the centromere.
    window = position[s_index:e_index + 1]
    widest = np.argmax(np.diff(window))
    real_start = window[widest] + 1
    real_end = window[widest + 1]
    print("Centromere at", start, end, "Snipping topology from ", real_start, real_end)
    tsinfer.snip_centromere(ts, real_start, real_end).dump(args.output)
def run_trees(args):
    """
    Cross-check the local tree/leaf-count generators against msprime.

    Trees and counts produced by ``generate_trees`` and ``count_leaves`` are
    printed and then asserted equal to what the loaded tree sequence reports.
    """
    tree_sequence = msprime.load(args.history_file)
    N = tree_sequence.get_num_nodes()
    records = list(tree_sequence.records())
    # Split the records into one column per field (first five fields only).
    l, r, u, c, times = ([rec[k] for rec in records] for k in range(5))

    local_trees = []
    print("Trees:")
    for pi in generate_trees(l, r, u, c, times):
        local_trees.append(list(pi))
        print("\t", pi)

    local_counts = []
    S = set(range(1, tree_sequence.get_sample_size() + 1))
    print("Counts:")
    for pi, beta in count_leaves(l, r, u, c, times, S):
        local_counts.append(list(beta))
        print("\t", beta)

    msp_trees = []
    msp_counts = []
    node_ids = range(N + 1)
    for tree in tree_sequence.trees():
        msp_trees.append([tree.get_parent(j) for j in node_ids])
        msp_counts.append([tree.get_num_leaves(j) for j in node_ids])
    assert msp_trees == local_trees
    assert msp_counts == local_counts
def ts_to_bcf_single(ts_file, out_file, runner):
    """
    Convert the tree sequence stored in ``ts_file`` to BCF at ``out_file``.

    If the tree sequence has no individuals the conversion is delegated to a
    ``tskit vcf | bcftools view`` shell pipeline run through ``runner.run``.
    Otherwise the VCF is written from Python directly into the stdin of a
    ``bcftools view -O b`` subprocess whose stdout is ``out_file``.

    :param ts_file: Path of the tree sequence file to convert.
    :param out_file: Path of the BCF file to write.
    :param runner: Object whose ``run(cmd)`` method executes a shell command.
    :raises RuntimeError: If bcftools exits with a non-zero status.
    """
    # Need to remove non-sample individuals from ts or else tskit gets confused
    ts = msprime.load(ts_file)
    if ts.num_individuals == 0:
        bcf_cmd = "tskit vcf --ploidy 2 {} | bcftools view -O b > {}".format(
            ts_file, out_file)
        runner.run(bcf_cmd)
        return

    ts_only_sample_inds = ts_clean_inds(ts)
    # TODO: This will probably fail if metadata isn't present
    ids = [
        ind.metadata.decode('utf8')
        for ind in ts_only_sample_inds.individuals()
    ]
    # BCF is binary, so the output file is opened in binary mode. Letting
    # Popen own the pipe means the parent never holds the read end open (which
    # would make the writer block forever if bcftools died early), and the
    # context manager closes stdin and reaps the child even when write_vcf
    # raises.
    with open(out_file, "wb") as f:
        with subprocess.Popen(
                ["bcftools", "view", "-O", "b"],
                stdin=subprocess.PIPE, stdout=f,
                universal_newlines=True) as proc:
            ts_only_sample_inds.write_vcf(proc.stdin, individual_names=ids)
    if proc.returncode != 0:
        raise RuntimeError(
            "bcftools failed with status: {}".format(proc.returncode))
def run_simplify_subsample_size_benchmark(args):
    """
    Benchmark ``simplify`` on random subsamples of increasing size.

    Mean process time per subsample size is printed, saved to
    ``data/simplify_subsample.dat`` and plotted to
    ``simplify_subsample_perf.pdf``.
    """
    ts = msprime.load(args.file)
    np.random.seed(1)
    print("Running simplify benchmarks")
    N = 20
    num_replicates = 5
    subsample_size = np.logspace(1, 5, N).astype(int)
    print(subsample_size)
    T = np.zeros(N)
    for j, size in enumerate(subsample_size):
        timings = np.zeros(num_replicates)
        for rep in range(num_replicates):
            chosen = np.random.choice(ts.num_samples, size, replace=False)
            chosen = chosen.astype(np.int32)
            started = time.process_time()
            simplified = ts.simplify(chosen)
            timings[rep] = time.process_time() - started
        T[j] = np.mean(timings)
        print(size, T[j])
    frame = pd.DataFrame({"subsample_size": subsample_size, "time": T})
    frame.to_csv("data/simplify_subsample.dat")

    plt.semilogx(subsample_size, T, marker="o")
    plt.xlabel("Subsample size")
    plt.ylabel("Time to simplify (s)")
    plt.savefig("simplify_subsample_perf.pdf", format='pdf')
def run_benchmark():
    """
    Run the benchmark suite on the ``10**7`` tree sequence under ``data_prefix``.

    Times loading, tree iteration and allele frequency computation, and calls
    the newick and BCF benchmarks. Results are printed to stdout.
    """
    print("msprime version:", msprime.__version__)
    started = time.perf_counter()
    filename = os.path.join(data_prefix, "{}.trees".format(10**7))
    ts = msprime.load(filename)
    elapsed = time.perf_counter() - started
    file_size = humanize.naturalsize(os.path.getsize(filename), binary=True)
    print("loaded {} tree sequence in {:.2f}s".format(file_size, elapsed))

    run_benchmark_newick(ts, 2)

    size = ts.num_samples * ts.num_sites
    print("Total size of genotype matrix = ",
          humanize.naturalsize(size, binary=True))

    started = time.perf_counter()
    tree_count = sum(
        1 for _ in ts.trees(sample_counts=False, sample_lists=False))
    assert tree_count == ts.num_trees
    elapsed = time.perf_counter() - started
    print("Iterated over {} trees in {:.2f}s".format(ts.num_trees, elapsed))

    started = time.perf_counter()
    samples = np.arange(10**6)
    freq = np.zeros(ts.num_sites)
    for tree in ts.trees(tracked_samples=samples):
        for site in tree.sites():
            # Only the first mutation at each site is considered.
            first_node = site.mutations[0].node
            freq[site.id] = tree.num_tracked_samples(first_node)
    elapsed = time.perf_counter() - started
    print("Computed {} allele frequencies in {:.2f}s".format(ts.num_sites, elapsed))
    benchmark_bcf(ts)
def main(args):
    """
    Run the simulation / GWAS / PRS pipeline described by ``args``.

    A coalescent simulation is generated (and dumped next to ``args.out``)
    unless ``args.tree`` names an existing tree sequence to load. The true
    PRS, case/control assignment, GWAS, clumping and inferred PRS steps are
    then run in order and summaries are written via ``write_summaries``.
    """
    # Materialise as a list: on Python 3 ``map`` returns a one-shot iterator,
    # which would be exhausted after its first use and silently empty for
    # every later step that receives ``nhaps``.
    nhaps = [int(x) for x in args.nhaps.split(',')]
    recomb = args.recomb_map

    # generate/load coalescent simulations
    if args.tree is None:
        (pop_config, mig_mat, demog) = out_of_africa(nhaps)
        simulation = simulate_ooa(pop_config, mig_mat, demog, recomb)
        simulation.dump(
            args.out + '_nhaps_' + '_'.join(map(str, nhaps)) + '.hdf5', True)
    else:
        simulation = msprime.load(args.tree)

    eprint(simulation)
    eprint('Number of haplotypes: ' + ','.join(map(str, nhaps)))
    eprint('Number of trees: ' + str(simulation.get_num_trees()))
    eprint('Number of mutations: ' + str(simulation.get_num_mutations()))
    eprint('Sequence length: ' + str(simulation.get_sequence_length()))

    prs_true = true_prs(simulation, args.ncausal, args.h2, nhaps, args.out)
    cases_diploid, controls_diploid, prs_norm, environment = case_control(
        prs_true, args.h2, nhaps, args.prevalence, args.ncontrols, args.out)
    summary_stats, cases_haploid, controls_haploid = run_gwas(
        simulation, cases_diploid, controls_diploid, args.p_threshold,
        args.cc_maf)
    clumped_snps, usable_positions = clump_variants(
        simulation, summary_stats, nhaps, args.r2, args.window_size)
    prs_infer = infer_prs(
        simulation, nhaps, clumped_snps, summary_stats, usable_positions,
        args.h2, args.ncausal, args.out)
    write_summaries(
        args.out, prs_true, prs_infer, nhaps, cases_diploid, controls_diploid,
        args.h2, args.ncausal, environment)
def newick_example():
    """
    Write every tree of ``example.hdf5`` to ``example.newick``.

    Each output line is the first element of the pair yielded by
    ``newick_trees`` in square brackets, followed by the newick string.
    """
    tree_sequence = msprime.load("example.hdf5")
    with open("example.newick", "w") as out:
        for span, newick in tree_sequence.newick_trees(8):
            print("[{0}]".format(span), newick, sep="", file=out)
def verify_round_trip(self, ts, version):
    """
    Dump ``ts`` in legacy format ``version`` and check it survives both a
    legacy load and a subsequent dump/load in the current format.
    """
    path = self.temp_file
    msprime.dump_legacy(ts, path, version=version)
    with silence_stderr():
        from_legacy = msprime.load_legacy(path)
    self.verify_tree_sequences_equal(ts, from_legacy)
    from_legacy.dump(path)
    reloaded = msprime.load(path)
    self.verify_tree_sequences_equal(ts, reloaded)
def test_simulate_short_args(self):
    """Run ``simulate`` with short option flags and check the saved output."""
    argv = [
        "simulate", "100", self._history_file,
        "-m", "1e2", "-r", "5", "-u", "2"]
    _stdout, _stderr = capture_output(cli.msp_main, argv)
    saved = msprime.load(self._history_file)
    self.assertEqual(saved.get_sample_size(), 100)
    self.assertEqual(saved.get_num_loci(), 100)
    self.assertGreater(saved.get_num_mutations(), 0)
def verify_round_trip(self, ts, version):
    """
    Dump ``ts`` in legacy format ``version`` and check it survives both a
    legacy load and a subsequent dump/load in the current format.

    The comparison is made with ``simplify=True`` for versions below 10.
    """
    path = self.temp_file
    msprime.dump_legacy(ts, path, version=version)
    from_legacy = msprime.load_legacy(path)
    needs_simplify = version < 10
    self.verify_tree_sequences_equal(ts, from_legacy, simplify=needs_simplify)
    from_legacy.dump(path)
    reloaded = msprime.load(path)
    self.verify_tree_sequences_equal(ts, reloaded, simplify=needs_simplify)
def run_dump_provenances(args):
    """
    Print the provenance records of ``args.history_file``.

    With ``args.human`` set, each record is pretty-printed as indented JSON;
    otherwise the raw text dump is written to stdout.
    """
    tree_sequence = msprime.load(args.history_file)
    if not args.human:
        tree_sequence.dump_text(provenances=sys.stdout)
        return
    for provenance in tree_sequence.provenances():
        pretty = json.dumps(json.loads(provenance.record), indent=4)
        print("id={}, timestamp={}, record={}".format(
            provenance.id, provenance.timestamp, pretty))
def convert_haplotypes():
    """
    Load the tree sequence named by ``sys.argv[1]``, generate mutations on
    it and count the haplotypes it yields.
    """
    import sys
    ts = msprime.load(sys.argv[1])
    ts.generate_mutations(0.0001, 1)
    print("Generated mutations", ts.get_num_mutations())
    # The haplotypes themselves are discarded; only the count is reported.
    haplotype_count = sum(1 for _ in ts.haplotypes())
    print("generated ", haplotype_count, "haplotypes")
def verify_output(self, output_path):
    """
    Check the tree sequence at ``output_path`` agrees with ``self.input_ts``
    on sample count, sequence length, site count and genotype matrix.
    """
    output_ts = msprime.load(output_path)
    expected = self.input_ts
    for attr in ("num_samples", "sequence_length", "num_sites"):
        self.assertEqual(getattr(output_ts, attr), getattr(expected, attr))
    self.assertGreater(output_ts.num_sites, 1)
    same_genotypes = np.array_equal(
        output_ts.genotype_matrix(), expected.genotype_matrix())
    self.assertTrue(same_genotypes)
def test_optional_provenance(self):
    """A file with its ``provenance`` entry removed must still load."""
    ts = single_locus_no_mutation_example()
    with tempfile.NamedTemporaryFile() as tmp:
        ts.dump(tmp.name)
        # Strip the provenance entry directly from the HDF5 file.
        with h5py.File(tmp.name, "r+") as hfile:
            del hfile["provenance"]
        reloaded = msprime.load(tmp.name)
        self.assertEqual(reloaded.get_provenance(), [])
def run_newick(args):
    """
    Estimate newick storage for ``args.file`` from its first tree.

    Prints the newick size of the first tree in MiB and that size multiplied
    by the number of trees in TiB.
    """
    ts = msprime.load(args.file)
    first_tree = next(ts.trees())
    size = len(first_tree.newick())
    megabyte = 1024 * 1024
    terabyte = megabyte * 1024 * 1024
    print("newick size 1 tree = {:.2f} MiB".format(size / megabyte))
    print("newick size all trees = {:.2f} TiB".format(
        size * ts.num_trees / terabyte))
def dump_example():
    """
    Simulate a tree sequence, dump it to ``example.hdf5`` and check that the
    haplotypes read back from that file are unchanged.
    """
    simulated = msprime.simulate(
        sample_size=10,
        num_loci=1000,
        scaled_recombination_rate=0.1,
        scaled_mutation_rate=0.01,
        random_seed=1)
    expected = list(simulated.haplotypes())
    simulated.dump("example.hdf5")
    # Now, load another tree sequence instance from this file
    reloaded = msprime.load("example.hdf5")
    assert expected == list(reloaded.haplotypes())
def test_optional_provenance(self):
    """A file with its ``provenance`` entry removed must still load."""
    path = self.temp_file
    single_locus_no_mutation_example().dump(path)
    # Strip the provenance entry directly from the HDF5 file.
    with h5py.File(path, "r+") as hfile:
        del hfile["provenance"]
    reloaded = msprime.load(path)
    self.assertEqual(reloaded.get_provenance(), [])
    self.verify_tree_dump_format(reloaded)
def load(cls, path):
    '''
    Build an instance of this class from a .trees file on disk.

    The file is read with ``msprime.load`` and the result is passed to the
    class constructor.

    :param string path: The path to a .trees file.
    :rtype: SlimTreeSequence
    '''
    # roundabout way to load just the tables
    return cls(msprime.load(path))
def write_treeseq(chrom):
    """
    Add mutations to the tree sequence for ``chrom`` and save the result.

    Input path, mutation rate, seed and output path are read from the
    module-level ``args`` and ``seeds``. Always returns True.
    """
    treefile = args.tree_file[chrom]
    logfile.write("Simulating mutations on " + treefile + "\n")
    logfile.flush()
    mutated_ts = msprime.mutate(
        msprime.load(treefile),
        rate=args.mut_rate[chrom],
        random_seed=seeds[chrom],
        keep=True)
    outfile = args.outfile[chrom]
    logfile.write("Saving to" + outfile + "\n")
    mutated_ts.dump(outfile)
    return True
def verify_dump_load(self, tree_sequence):
    """
    Round-trip the tree sequence through a temporary file and check that the
    records read back match the originals.
    """
    with tempfile.NamedTemporaryFile() as tmp:
        tree_sequence.dump(tmp.name)
        reloaded = msprime.load(tmp.name)
    expected = list(tree_sequence.records())
    self.assertEqual(expected, list(reloaded.records()))
def write_vcf(chrom):
    """
    Add mutations to the tree sequence for ``chrom`` and write it as VCF.

    Input path, mutation rate, seed and output path are read from the
    module-level ``args`` and ``seeds``. Always returns True.
    """
    treefile = args.tree_file[chrom]
    # The output file is still opened before any work is done, but the
    # ``with`` block guarantees it is flushed and closed, including when
    # loading or mutating fails.
    with open(args.vcffile[chrom], "w") as vcf:
        mut_rate = args.mut_rate[chrom]
        seed = seeds[chrom]
        logfile.write("Simulating mutations on" + treefile + "\n")
        ts = msprime.load(treefile)
        mutated_ts = msprime.mutate(ts, rate=mut_rate, random_seed=seed)
        logfile.write("Saving to" + args.vcffile[chrom] + "\n")
        mutated_ts.write_vcf(vcf, ploidy=1)
    return True
def gwas_example():
    """
    Write PLINK association input for ``tmp__NOBACKUP__/gqt.hdf5``, treating
    half of the samples as cases.
    """
    # n = 100
    # ts = msprime.simulate(
    #     n, 1000, scaled_recombination_rate=0.1, scaled_mutation_rate=10,
    #     random_seed=1)
    ts = msprime.load("tmp__NOBACKUP__/gqt.hdf5")
    num_cases = ts.get_sample_size() // 2
    # write_ped(ts, num_cases, "tmp__NOBACKUP__/test")
    # write_ped(ts, num_cases, "tmp__NOBACKUP__/plink/gqt")
    write_plink_assoc(ts, num_cases)
def tw_find_segment(chunk):
    """
    Run ``find_segment`` for every entry of ``chunk['List']``.

    The data file named by ``chunk['Data']`` is first copied to a randomly
    named temporary file in the current directory, and the data is loaded
    from that private copy.

    :param chunk: Mapping with a ``'Data'`` file path and a ``'List'`` of
        items passed one by one to ``find_segment``.
    :return: List of ``find_segment`` results, in input order.
    """
    alphabet = string.ascii_uppercase + string.ascii_lowercase + string.digits
    rng = random.SystemRandom()
    tmpf = "tmp." + ''.join(rng.choice(alphabet) for _ in range(32)) + ".hdf5"
    copyfile(chunk['Data'], tmpf)
    try:
        # Load from the private copy: the original made the copy but then
        # loaded chunk['Data'] again, leaving the copy unused.
        data = msp.load(tmpf)
        return [find_segment(line, data) for line in chunk['List']]
    finally:
        # Remove the temporary copy even if loading or searching fails.
        os.remove(tmpf)
def test_run_defaults(self):
    """``simulate`` with default options is silent and saves the expected output."""
    sample_size = 10
    argv = ["simulate", str(sample_size), self._history_file]
    stdout, stderr = capture_output(cli.msp_main, argv)
    self.assertEqual(len(stderr), 0)
    self.assertEqual(len(stdout), 0)

    saved = msprime.load(self._history_file)
    self.assertEqual(saved.get_sample_size(), sample_size)
    self.assertEqual(saved.get_num_loci(), 1)
    self.assertEqual(saved.get_num_mutations(), 0)
def run_dump_macs(args):
    """
    Print the variants of ``args.history_file`` in macs format, for import
    into pbwt. Positions are divided by the sequence length.
    """
    tree_sequence = msprime.load(args.history_file)
    n = tree_sequence.get_sample_size()
    m = tree_sequence.get_sequence_length()
    print("COMMAND:\tnot_macs {} {}".format(n, m))
    print("SEED:\tASEED")
    for site, (position, variant) in enumerate(tree_sequence.variants()):
        print("SITE:", site, position / m, 0.0, variant, sep="\t")
def run_dump_macs(args):
    """
    Print the variants of ``args.history_file`` in macs format, for import
    into pbwt. Positions are divided by the sequence length.
    """
    tree_sequence = msprime.load(args.history_file)
    n = tree_sequence.get_sample_size()
    m = tree_sequence.get_sequence_length()
    print("COMMAND:\tnot_macs {} {}".format(n, m))
    print("SEED:\tASEED")
    for variant in tree_sequence.variants(as_bytes=True):
        relative_position = variant.position / m
        genotypes = "{}".format(variant.genotypes.decode())
        print("SITE:", variant.index, relative_position, 0.0, genotypes,
              sep="\t")
def verify_dump_load(self, tree_sequence):
    """
    Round-trip the tree sequence through ``self.temp_file`` and check that
    the reloaded copy has a file UUID and identical edges and haplotypes.
    """
    path = self.temp_file
    tree_sequence.dump(path)
    reloaded = msprime.load(path)
    self.assertIsNotNone(reloaded.file_uuid)

    expected_edges = list(tree_sequence.edges())
    self.assertEqual(expected_edges, list(reloaded.edges()))

    expected_haplotypes = list(tree_sequence.haplotypes())
    self.assertEqual(expected_haplotypes, list(reloaded.haplotypes()))
def run_mcmc(args):
    """
    Configure and run the MCMC sampler from parsed command-line ``args``.

    If ``args.tsfull`` is given, the full tree sequence is loaded from it
    (either an object with a ``name`` attribute, such as an open file, or a
    plain path); otherwise ``None`` is passed to ``MCMC`` (real-data case).
    Optionally plots the trace (``args.plot``) and prints the final state
    (``args.verbose``).
    """
    input_data_path = args.input_path
    haplotype_data_name = args.haplotype_name
    ancAllele_data_name = args.ancAllele_name
    snpPos_data_name = args.snpPos_name
    iteration = args.iteration
    thin = args.thin
    burn = args.burn
    n = args.sample_size
    seq_length = args.seq_length
    mu = args.mutation_rate
    r = args.recombination_rate
    Ne = args.Ne
    outpath = args.outpath

    tsfull = None
    if args.tsfull is not None:  # else real data
        # args.tsfull may be a file handle (use its name) or a path. Resolve
        # that up front rather than wrapping the load in
        # ``except AttributeError``, which would also swallow an
        # AttributeError raised inside msprime.load and retry silently.
        tsfull = msprime.load(getattr(args.tsfull, "name", args.tsfull))
    # random.seed(args.random_seed)
    # np.random.seed(args.random_seed+1)
    mcmc = MCMC(tsfull, n, Ne, seq_length, mu, r,
                input_data_path, haplotype_data_name,
                ancAllele_data_name, snpPos_data_name, outpath, args.verbose)
    mcmc.run(iteration, thin, burn, args.verify)
    if args.plot:
        # p= comparison.plot.Trace(outpath, name= "summary")
        p = Trace(outpath)
        p.arginfer_trace()
    # if args.plot:
    #     p = plot_summary(outpath)
    #     p.plot()
    if args.verbose:
        mcmc.print_state()
def vcf_example():
    """
    Time variant decoding and VCF output for
    ``tmp__NOBACKUP__/populations.hdf5``.

    Decodes every variant, then writes the tree sequence as VCF with ploidy
    1 and ploidy 2, printing throughput for each step.
    """
    # n = 6  # 3 diploid samples from each pop
    # t = 100
    # ts = msprime.simulate(
    #     Ne=10**4,
    #     population_configurations=[
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n)],
    #     demographic_events=[
    #         msprime.MassMigration(time=t, source=1, destination=0),
    #         msprime.MassMigration(time=t, source=2, destination=0),
    #         msprime.MassMigration(time=t, source=3, destination=0),
    #         msprime.MassMigration(time=t, source=4, destination=0)],
    #     length=1 * 1e6,
    #     recombination_rate=2e-8,
    #     mutation_rate=2e-8,
    #     random_seed=1)
    # with open("test.vcf", "w") as f:
    #     ts.write_vcf(f, ploidy=2)
    ts = msprime.load("tmp__NOBACKUP__/populations.hdf5")

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement used for these timings.
    before = time.perf_counter()
    num_genotypes = 0
    for variant in ts.variants():
        num_genotypes += len(variant.genotypes)
    print(num_genotypes, ts.get_sample_size() * ts.get_num_mutations())
    duration = time.perf_counter() - before
    print("Done in ", duration, " gives ",
          num_genotypes * 1e-6 / duration, " MGenotypes decoded per second")
    print(num_genotypes)

    before = time.perf_counter()
    with open("tmp__NOBACKUP__/tmp_1.vcf", "w") as f:
        ts.write_vcf(f, ploidy=1)
        size = f.tell()
    duration = time.perf_counter() - before
    print("wrote vcf in ", duration, "seconds", (size / 2**20) / duration, "MB/s")

    before = time.perf_counter()
    with open("tmp__NOBACKUP__/tmp_2.vcf", "w") as f:
        ts.write_vcf(f, ploidy=2)
        # Measure this file's own size; previously the ploidy=1 size was
        # reused, so the reported MB/s for this step was wrong.
        size = f.tell()
    duration = time.perf_counter() - before
    print("wrote vcf in ", duration, "seconds", (size / 2**20) / duration, "MB/s")
def write_vcf(chrom):
    """
    Generate mutations on the tree sequence for ``chrom`` and write it as VCF.

    Mutations are generated directly on the dumped tables with a
    ``MutationGenerator``. Input path, mutation rate, seed and output path
    are read from the module-level ``args`` and ``seeds``. Always returns
    True.
    """
    treefile = args.tree_file[chrom]
    # The output file is still opened before any work is done, but the
    # ``with`` block guarantees it is flushed and closed, including when a
    # later step fails.
    with open(args.vcffile[chrom], "w") as vcf:
        mut_rate = args.mut_rate[chrom]
        seed = seeds[chrom]
        logfile.write("Simulating mutations on" + treefile + "\n")
        ts = msprime.load(treefile)
        tables = ts.dump_tables()
        rng = msprime.RandomGenerator(seed)
        mutgen = msprime.MutationGenerator(rng, mut_rate)
        mutgen.generate(
            tables.nodes, tables.edges, tables.sites, tables.mutations)
        logfile.write("Saving to" + args.vcffile[chrom] + "\n")
        mutated_ts = msprime.load_tables(**tables.asdict())
        mutated_ts.write_vcf(vcf, ploidy=1)
    return True
def run_match_samples(args):
    """
    Match samples against an ancestors tree sequence and save the result.

    Input and output paths are resolved from ``args`` via
    ``get_ancestors_ts`` and ``get_output_ts``.
    """
    setup_logging(args)
    sample_data = tsinfer.SampleData.load(args.input)
    ancestors_path = get_ancestors_ts(args.ancestors_ts, args.input)
    output_path = get_output_ts(args.output_ts, args.input)

    logger.info("Loading ancestral genealogies from {}".format(ancestors_path))
    ancestors_ts = msprime.load(ancestors_path)
    inferred = tsinfer.match_samples(
        sample_data,
        ancestors_ts,
        num_threads=args.num_threads,
        path_compression=not args.no_path_compression,
        progress=args.progress)

    logger.info("Writing output tree sequence to {}".format(output_path))
    inferred.dump(output_path)
def write_indivs(chrom):
    """
    Write a tab-separated table of individuals for ``chrom``.

    Each row holds the individual's ``ped_id``, ``age``, ``subpop``, its
    location values and the ids of the nodes that reference it. Paths are
    read from the module-level ``args``; the header row comes from the
    module-level ``header``. Always returns True.
    """
    treefile = args.tree_file[chrom]
    # ``with`` closes the output file even if loading or writing fails.
    with open(args.indivfile[chrom], "w") as out:
        logfile.write("Reading trees from " + treefile + "\n")
        ts = msprime.load(treefile)
        # Evaluate the table accessors once instead of once per individual.
        tables = ts.tables
        node_individual = tables.nodes.individual
        individual_table = tables.individuals
        num_individuals = individual_table.num_rows
        node_inds = [
            np.where(node_individual == u)[0] for u in range(num_individuals)]
        individuals = [
            slimIndividual(individual_table[k])
            for k in range(num_individuals)]
        logfile.write("Saving to" + args.indivfile[chrom] + "\n")
        out.write("\t".join(header) + "\n")
        for k, ind in enumerate(individuals):
            data = ([ind.ped_id, ind.age, ind.subpop]
                    + list(ind.location) + list(node_inds[k]))
            out.write("\t".join(map(str, data)) + "\n")
    return True
def run_dump_macs(args):
    """
    Print the variants of ``args.history_file`` in macs format, for import
    into pbwt. Positions are divided by the sequence length.
    """
    ts = msprime.load(args.history_file)
    sample_size = ts.get_sample_size()
    length = ts.get_sequence_length()
    print("COMMAND:\tnot_macs {} {}".format(sample_size, length))
    print("SEED:\tASEED")
    for variant in ts.variants(as_bytes=True):
        fields = (
            "SITE:",
            variant.index,
            variant.position / length,
            0.0,
            "{}".format(variant.genotypes.decode()),
        )
        print(*fields, sep="\t")
def run_convert_files():
    # Convert the benchmark tree sequence files under ``data_prefix`` (sample
    # sizes 10**1 .. 10**7) to VCF and gzip-compressed VCF. Stops at the first
    # size whose ``.trees`` file does not exist.
    for k in range(1, 8):
        n = 10**k
        filename = os.path.join(data_prefix, "{}.trees".format(n))
        if not os.path.exists(filename):
            break
        ts = msprime.load(filename)
        # NOTE(review): this value is overwritten below whenever k < 7 and is
        # otherwise unused if the gzip step belongs to the ``if`` body, as
        # assumed here. The original nesting could not be recovered from the
        # collapsed source — confirm against the upstream file.
        filename += ".gz"
        if k < 7:
            filename = os.path.join(data_prefix, "{}.vcf".format(n))
            with open(filename, "w") as vcf_file:
                ts.write_vcf(vcf_file, 2)
            print("Wrote ", filename)
            gz_filename = filename + ".gz"
            # The command goes through the shell so the ``>`` redirect works;
            # the paths are interpolated unquoted.
            subprocess.check_call("gzip -c {} > {}".format(filename, gz_filename), shell=True)
            print("Wrote ", gz_filename)
def run_compute_ukbb_gnn(args):
    """
    Compute genealogical nearest neighbours for the samples of ``args.input``.

    Reference sets are the sample nodes grouped by the ``CentreName`` field
    of their individual's JSON metadata, excluding the nodes returned by
    ``get_augmented_samples``. Results are written as CSV to ``args.output``.
    """
    ts = msprime.load(args.input)
    tables = ts.tables
    started = time.time()
    augmented_samples = set(get_augmented_samples(tables))
    elapsed = time.time() - started
    print("Got augmented:", len(augmented_samples), "in ", elapsed)

    reference_sets_map = collections.defaultdict(list)
    ind_metadata = [None] * ts.num_individuals
    all_samples = []
    for ind in ts.individuals():
        md = json.loads(ind.metadata.decode())
        ind_metadata[ind.id] = md
        for node in ind.nodes:
            if node in augmented_samples:
                continue
            reference_sets_map[md["CentreName"]].append(node)
            all_samples.append(node)
    reference_set_names = list(reference_sets_map.keys())
    reference_sets = [reference_sets_map[key] for key in reference_set_names]

    # Metadata of the individual behind each sample node, in sample order.
    sample_metadata = [ind_metadata[ts.node(u).individual] for u in all_samples]
    cols = {
        "centre": [md["CentreName"] for md in sample_metadata],
        "sample_id": [md["SampleID"] for md in sample_metadata],
        "ethnicity": [md["Ethnicity"] for md in sample_metadata],
    }
    print("Computing GNNs for ", len(all_samples), "samples")
    started = time.time()
    A = ts.genealogical_nearest_neighbours(
        all_samples, reference_sets, num_threads=args.num_threads)
    elapsed = time.time() - started
    print("Done in {:.2f} mins".format(elapsed / 60))

    for column, name in enumerate(reference_set_names):
        cols[name] = A[:, column]
    pd.DataFrame(cols).to_csv(args.output)
def __init__(self, max_time, ts=None, ts_file=None):
    """
    Set up per-pair bookkeeping for a tree sequence.

    :param max_time: Stored as ``self.max_time``.
    :param ts: Tree sequence to use; takes precedence over ``ts_file``.
    :param ts_file: Path loaded with ``msprime.load`` when ``ts`` is None.
    :raises ValueError: If neither ``ts`` nor ``ts_file`` is given.
    """
    if ts is None:
        if ts_file is None:
            # Carry the explanation in the exception itself rather than
            # printing it and raising a bare ValueError.
            raise ValueError("One of ts or ts_file must be specified")
        ts = msprime.load(ts_file)
    self.ts = ts
    self.max_time = max_time
    self.bps = list(self.ts.breakpoints())
    self.node_times = self.get_node_times()
    # One (num_samples x num_samples) sparse matrix per tracked quantity.
    shape = (ts.num_samples, ts.num_samples)
    self.ca_times = scipy.sparse.lil_matrix(shape)
    self.ca_last = scipy.sparse.lil_matrix(shape)
    self.ca_count = scipy.sparse.lil_matrix(shape)
    self.ibd_list = []
def load_tree_sequence(args, log):
    """
    Load a single saved tree sequence in place of running a simulation.

    The file ``args.load_tree_sequence`` is loaded, filtered with
    ``get_common_mutations_ts`` and, when ``args.geno_prop`` is set, thinned
    with ``ts.set_mutations_in_tree``. ``args.n`` is overwritten with half the
    sample size. Returns the tree sequence lists together with the mutation
    counts and offsets, ``N`` and ``n_pops``.
    """
    # Create a list to fill with tree_sequences.
    (args, tree_sequence_list, tree_sequence_list_geno, m_total,
     m_geno_total, rec_map, m, m_start, m_geno, m_geno_start) = initialise(args)

    tree_sequence_list.append(msprime.load(args.load_tree_sequence))
    args.n = int(tree_sequence_list[0].get_sample_size() / 2)
    N = args.n
    n_pops = 1

    log.log(
        "Warning: load tree sequence was included for debugging, we don't support more than 1 population, and more than 1 chromosome."
    )

    common_mutations = []
    n_haps = tree_sequence_list[0].get_sample_size()

    # Get the mutations > MAF.
    tree_sequence_list[0] = get_common_mutations_ts(
        args, tree_sequence_list[0], log)

    m[0] = int(tree_sequence_list[0].get_num_mutations())
    m_start[0] = 0
    m_total = m[0]

    log.log('Number of mutations above MAF in the generated data: {m}'.format(
        m=m[0]))
    log.log('Running total of sites > MAF cutoff: {m}'.format(m=m_total))

    if args.geno_prop is None:
        # Every site is treated as genotyped: reuse the filtered sequence.
        tree_sequence_list_geno.append(tree_sequence_list[0])
        m_geno[0] = m[0]
        m_geno_start[0] = m_start[0]
        m_geno_total = m_total
    else:
        # If genotyped proportion is < 1.
        geno_ts, geno_count = ts.set_mutations_in_tree(
            tree_sequence_list[0], args.geno_prop)
        tree_sequence_list_geno.append(geno_ts)
        m_geno[0] = int(geno_count)
        m_geno_start[0] = m_geno_total
        m_geno_total = m_geno[0]
        log.log('Number of sites genotyped in the generated data: {m}'.format(
            m=m_geno[0]))
        log.log('Running total of sites genotyped: {m}'.format(m=m_geno_total))

    return (tree_sequence_list, tree_sequence_list_geno, m, m_start, m_total,
            m_geno, m_geno_start, m_geno_total, N, n_pops)
def allele_frequency_example():
    """
    Count the mutations in ``tmp__NOBACKUP__/gqt.hdf5`` whose leaf frequency
    is below 0.0001 and print the totals.
    """
    # n = 10000
    # ts = msprime.simulate(
    #     n, 100000, scaled_recombination_rate=0.1, scaled_mutation_rate=0.1,
    #     random_seed=1)
    ts = msprime.load("tmp__NOBACKUP__/gqt.hdf5")
    n = ts.get_sample_size()
    min_frequency = 0.0001
    rare_count = 0
    num_trees = 0
    for tree in ts.trees():
        num_trees += 1
        rare_count += sum(
            1 for _pos, node in tree.mutations()
            if tree.get_num_leaves(node) / n < min_frequency)
    print("num_mutatinos = ", rare_count, "\t", rare_count / ts.get_num_mutations())
    print("total_mutations = ", ts.get_num_mutations())
    print("num_trees = ", num_trees)
def build_profile_inputs(n, num_megabases):
    """
    Create the profiling inputs for ``n`` samples over ``num_megabases`` Mb.

    The simulated tree sequence is cached under ``tmp__NOBACKUP__``; the
    matching ``.samples`` file is always rebuilt from its variants.
    """
    L = num_megabases * 10**6
    input_file = "tmp__NOBACKUP__/profile-n={}-m={}.input.trees".format(
        n, num_megabases)
    if not os.path.exists(input_file):
        ts = msprime.simulate(
            n,
            length=L,
            Ne=10**4,
            recombination_rate=1e-8,
            mutation_rate=1e-8,
            random_seed=10,
        )
        print(
            "Ran simulation: n = ",
            n,
            " num_sites = ",
            ts.num_sites,
            "num_trees =",
            ts.num_trees,
        )
        ts.dump(input_file)
    else:
        ts = msprime.load(input_file)

    filename = "tmp__NOBACKUP__/profile-n={}-m={}.samples".format(
        n, num_megabases)
    # Remove any previous samples file so it is rebuilt from scratch.
    if os.path.exists(filename):
        os.unlink(filename)
    # daiquiri.setup(level="DEBUG")
    sample_data_kwargs = dict(
        sequence_length=ts.sequence_length, path=filename, num_flush_threads=4)
    with tsinfer.SampleData(**sample_data_kwargs) as sample_data:
        # progress_monitor = tqdm.tqdm(total=ts.num_samples)
        # for j in range(ts.num_samples):
        #     sample_data.add_sample(metadata={"name": "sample_{}".format(j)})
        #     progress_monitor.update()
        # progress_monitor.close()
        progress_monitor = tqdm.tqdm(total=ts.num_sites)
        for variant in ts.variants():
            sample_data.add_site(variant.site.position, variant.genotypes)
            progress_monitor.update()
        progress_monitor.close()
    print(sample_data)
def run_dump_macs(args):
    """
    Print the mutations of ``args.history_file`` in macs format, for import
    into pbwt. Positions are divided by the number of loci.
    """
    tree_sequence = msprime.load(args.history_file)
    n = tree_sequence.get_sample_size()
    m = tree_sequence.get_num_loci()
    print("COMMAND:\tnot_macs {} {}".format(n, m))
    print("SEED:\tASEED")
    site = 0
    for tree in tree_sequence.trees():
        for position, node in tree.mutations():
            # Mark the leaves below the mutated node; leaf ids are 1-based.
            haplotype = ['0'] * n
            for leaf in tree.leaves(node):
                haplotype[leaf - 1] = '1'
            print("SITE:", site, position / m, 0.0, "".join(haplotype),
                  sep="\t")
            site += 1
def examine():
    """
    Print record and tree statistics for
    ``tmp__NOBACKUP__/bottleneck-example.hdf5``.

    Reports the number of records with more than two children, the largest
    child count, the number of trees, and a histogram of node counts per tree.
    """
    ts = msprime.load("tmp__NOBACKUP__/bottleneck-example.hdf5")
    print("num_records = ", ts.get_num_records())

    non_binary_records = 0
    max_record_length = 0
    for record in ts.records():
        child_count = len(record.children)
        if child_count > 2:
            non_binary_records += 1
        if child_count > max_record_length:
            max_record_length = child_count
    print("non_binary_records = ", non_binary_records)
    print("max_record_length = ", max_record_length)

    num_nodes = collections.Counter()
    num_trees = 0
    for tree in ts.trees():
        nodes_in_tree = len(list(tree.nodes(tree.get_root())))
        num_nodes[nodes_in_tree] += 1
        num_trees += 1
    print("num_trees = ", num_trees)
    for node_count, occurrences in num_nodes.items():
        print(node_count, "->", occurrences)
def ld_dev():
    # Development driver: split the mutations of the tree sequence named by
    # sys.argv[1] into intervals and run ``ld_worker`` on each interval in its
    # own thread, each with its own LdCalculator.
    # ts = msprime.simulate(100, recombination_rate=10, mutation_rate=5,
    #     random_seed=1)
    num_threads = 10
    ts = msprime.load(sys.argv[1])
    print("num trees = ", ts.get_num_trees())
    print("num mutations = ", ts.get_num_mutations())
    # num_mutations = min(ts.get_num_mutations(), 100000)
    # num_mutations = ts.get_num_mutations()
    num_mutations = 1000
    # One calculator per thread, so workers do not share one.
    ld_calcs = [
        _msprime.LdCalculator(ts._ll_tree_sequence) for _ in range(num_threads)]
    # Target number of trees per interval.
    k = ts.get_num_trees() // num_threads
    start = 0
    next_block = k
    intervals = []
    # Close an interval at the last mutation of the first tree at or past each
    # block boundary that has any mutations.
    # NOTE(review): the nesting of ``next_block += k`` could not be recovered
    # from the collapsed source; it is assumed to sit inside the inner ``if``.
    for t in ts.trees():
        if t.get_index() >= next_block:
            mutations = list(t.mutations())
            if len(mutations) > 0:
                stop = mutations[-1].index
                intervals.append((start, stop))
                start = stop
                next_block += k
    threads = []
    lock = threading.Lock()
    progress = [0 for j in range(num_threads)]
    # NOTE(review): this indexes ``intervals[j]`` for every j < num_threads,
    # so it raises IndexError if fewer than num_threads intervals were built.
    for j in range(num_threads):
        start, stop = intervals[j]
        t = threading.Thread(
            name="ld_worker_{}".format(j), target=ld_worker,
            args=(ld_calcs[j], start, stop, num_mutations, j, lock, progress))
        t.start()
        threads.append(t)
    print("Main thread joining")
    for t in threads:
        t.join()
    print("Main thread done")
def pop_example():
    """
    Time ``get_pairwise_diversity`` on each of the five populations stored
    in ``populations.hdf5``.

    The disabled ``if False`` branch holds the simulation that generates
    that file.
    """
    if False:
        t = 100
        ts = msprime.simulate(
            Ne=10**4,
            population_configurations=[
                msprime.PopulationConfiguration(sample_size=1000),
                msprime.PopulationConfiguration(sample_size=1000),
                msprime.PopulationConfiguration(sample_size=1000),
                msprime.PopulationConfiguration(sample_size=1000),
                msprime.PopulationConfiguration(sample_size=1000)],
            demographic_events=[
                msprime.MassMigration(time=t, source=1, destination=0),
                msprime.MassMigration(time=t, source=2, destination=0),
                msprime.MassMigration(time=t, source=3, destination=0),
                msprime.MassMigration(time=t, source=4, destination=0)],
            length=100 * 1e6,
            recombination_rate=2e-8,
            mutation_rate=2e-8,
            random_seed=1)
        ts.dump("populations.hdf5")
        print(
            ts.get_sample_size(), ts.get_num_trees(),
            ts.get_num_mutations())
    else:
        ts = msprime.load("populations.hdf5")

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement used for this timing.
    before = time.perf_counter()
    R = 1
    for i in range(R):
        for j in range(5):
            samples = ts.get_samples(population_id=j)
            pi = ts.get_pairwise_diversity(samples)
            # pi2 = ts.get_pairwise_diversity2(samples)
            # print(j, pi, pi2, pi == pi2)
            # print(j, pi2)
    duration = time.perf_counter() - before
    print("duration = ", duration, " per call = ", duration / (5 * R))
assert tail is None else: x = head while x.next is not None: x = x.next assert x == tail x = head.next while x is not None: assert x.left < x.right if x.next is not None: assert x.right <= x.next.left # We should also not have any squashable segments. if x.right == x.next.left: assert x.node != x.next.node x = x.next if __name__ == "__main__": # Simple CLI for running simplifier above. ts = msprime.load(sys.argv[1]) samples = list(map(int, sys.argv[2:])) s = Simplifier(ts, samples) # s.print_state() tss, _ = s.simplify() tables = tss.dump_tables() print("Output:") print(tables.nodes) print(tables.edges) print(tables.sites) print(tables.mutations)
def run_dump_mutations(args):
    """
    Write the mutations of ``args.history_file`` to stdout, passing
    ``args.header`` and ``args.precision`` through to ``write_mutations``.
    """
    msprime.load(args.history_file).write_mutations(
        sys.stdout, args.header, args.precision)
def run_dump_vcf(args):
    """
    Write ``args.history_file`` to stdout as VCF, passing ``args.ploidy``
    through to ``write_vcf``.
    """
    msprime.load(args.history_file).write_vcf(sys.stdout, args.ploidy)
def run_dump_variants(args):
    """
    Print one line per variant of ``args.history_file``: the position, a
    tab, then the decoded genotypes.
    """
    ts = msprime.load(args.history_file)
    for variant in ts.variants(as_bytes=True):
        genotypes = "{}".format(variant.genotypes.decode())
        print(variant.position, genotypes, sep="\t")
def run_dump_haplotypes(args):
    """Print each haplotype of ``args.history_file`` on its own line."""
    haplotypes = msprime.load(args.history_file).haplotypes()
    for haplotype in haplotypes:
        print(haplotype)
def run_dump_newick(args):
    """
    Print the newick string of every tree in ``args.history_file``, using
    ``args.precision``.
    """
    ts = msprime.load(args.history_file)
    # The first element of each yielded pair is not needed here.
    for _span, newick in ts.newick_trees(args.precision):
        print(newick)
def test_single_locus_example_recombination(self):
    """Finish the single-locus SLiM example with recombination enabled."""
    source_ts = msprime.load("tests/data/SLiM/single-locus-example.trees")
    completed = self.finish_simulation(
        source_ts, recombination_rate=0.1, seed=1)
    self.verify_completed(source_ts, completed)
def run_dump_mutations(args):
    """
    Print the mutations of ``args.history_file`` as tab-separated position
    and node, with an ``x``/``u`` header line when ``args.header`` is set.
    """
    ts = msprime.load(args.history_file)
    if args.header:
        print("x", "u", sep="\t")
    for mutation in ts.mutations():
        position, node = mutation
        print(position, node, sep="\t")
def run_dump_records(args):
    """
    Print the records of ``args.history_file`` as tab-separated columns,
    with a header line when ``args.header`` is set.

    Only the first two children of each record are printed.
    """
    ts = msprime.load(args.history_file)
    if args.header:
        print("l", "r", "u", "c1", "c2", "t", sep="\t")
    for left, right, node, children, time_ in ts.records():
        first_child, second_child = children[0], children[1]
        print(left, right, node, first_child, second_child, time_, sep="\t")
def test_minimal_example_no_recombination(self):
    """Finishing the minimal SLiM example with zero recombination must fail."""
    source_ts = msprime.load("tests/data/SLiM/minimal-example.trees")
    # Zero recombination rates result in an error as we can't
    # remap coordinates into the genetic map.
    with self.assertRaises(_msprime.InputError):
        self.finish_simulation(source_ts, recombination_rate=0, seed=1)
def dump_file(filename):
    """Print every record of the tree sequence stored in ``filename``."""
    records = msprime.load(filename).records()
    for record in records:
        print(record)