def make_sampledata_compatible(args): """ Make a list of sampledata files compatible with the first file. """ # Load all the sampledata files into a list print("Subset sites with {} sampledata files".format( len(args.input_sampledata) - 1)) for index, fn in enumerate(args.input_sampledata): fn = fn.rstrip("\n") if index == 0: target_sd = tsinfer.load(fn) print("Loaded First sampledata file") continue cur_sd = tsinfer.load(fn) print("Loaded sampledata file # {}".format(index)) keep_sites = np.where( np.isin(cur_sd.sites_position[:], target_sd.sites_position[:]))[0] print("Subsetting to {} sites".format(len(keep_sites))) small_cur_sd = cur_sd.subset(sites=keep_sites) print("Done with subset") newname = fn[:-len(".samples")] + ".subset.samples" small_cur_sd_copy = small_cur_sd.copy(newname) small_cur_sd_copy.finalise() print( "Subsetted to {} sites from {}. Output can be found at {}.".format( len(keep_sites), fn, newname))
def get_ancient_constraints_tgp(args):
    """
    Build a table of lower-bound ages for derived alleles carried by ancient
    samples and merge it onto the TGP mutation age estimates.

    Writes "all-data/ancient_constraints.csv" and
    "all-data/tgp_muts_constraints.csv".

    :raises ValueError: if "all-data/tgp_mutations.csv" does not exist.
    """
    # Reuse the ancients-only file if a previous run already produced it.
    if os.path.exists("all-data/1kg_ancients_only_chr20.samples"):
        ancient_samples = tsinfer.load(
            "all-data/1kg_ancients_only_chr20.samples")
    else:
        ancient_samples = tsinfer.load("all-data/1kg_ancients_chr20.samples")
        print("Subsetting SampleData file to only keep ancient samples")
        # Ancient individuals/samples are those with a nonzero time.
        ancient_indiv_ids = np.where(
            ancient_samples.individuals_time[:] != 0)[0]
        ancient_sample_ids = np.where(ancient_samples.individuals_time[:][
            ancient_samples.samples_individual] != 0)[0]
        ancient_genos = ancient_samples.sites_genotypes[:]
        # Keep only sites where at least one ancient carries the derived
        # allele (genotype value 1).
        ancient_sites = np.where(
            np.any(ancient_genos[:, ancient_sample_ids] == 1, axis=1))[0]
        ancient_samples = ancient_samples.subset(
            individuals=ancient_indiv_ids, sites=ancient_sites)
        # Cache the result so later runs take the branch above.
        copy = ancient_samples.copy("all-data/1kg_ancients_only_chr20.samples")
        copy.finalise()
        print("Subsetted to {} samples and {} sites".format(
            ancient_samples.num_samples, ancient_samples.num_sites))
    genotypes = ancient_samples.sites_genotypes[:]
    positions = ancient_samples.sites_position[:]
    alleles = ancient_samples.sites_alleles[:]
    min_site_times = ancient_samples.min_site_times(individuals_only=True)
    # One row per site: position, ref/alt alleles, the oldest lower bound
    # from ancient carriers, and the count of ancient derived copies
    # (np.sum(geno[geno == 1]) counts the 1 entries).
    lower_bound = [(pos, allele[0], allele[1], age, np.sum(geno[geno == 1]))
                   for pos, allele, age, geno in zip(
                       positions, alleles, min_site_times, genotypes)]
    constraint_df = pd.DataFrame(
        lower_bound,
        columns=[
            "Position",
            "Reference Allele",
            "Alternative Allele",
            "Ancient Bound",
            "Number of Ancients",
        ],
    )
    constraint_df = constraint_df.astype({
        "Position": "int64",
        "Ancient Bound": "float64",
        "Number of Ancients": "int32"
    })
    # Sites with a zero bound carry no constraint information.
    constraint_df = constraint_df[constraint_df["Ancient Bound"] != 0]
    constraint_df.to_csv("all-data/ancient_constraints.csv")
    try:
        tgp_mut_ests = pd.read_csv("all-data/tgp_mutations.csv", index_col=0)
    except FileNotFoundError:
        raise ValueError(
            "tgp_mutations.csv does not exist. Must run tgp_dates first")
    # Left-join so every TGP mutation is kept, with ancient bounds where
    # a matching position + allele pair exists.
    tgp_muts_constraints = pd.merge(
        tgp_mut_ests,
        constraint_df,
        how="left",
        left_on=[
            "Position", "tsdate_ancestral_allele", "tsdate_derived_allele"
        ],
        right_on=["Position", "Reference Allele", "Alternative Allele"],
    )
    tgp_muts_constraints.to_csv("all-data/tgp_muts_constraints.csv")
def combined_ts_constrained_samples(args):
    """
    Constrain site times estimated from a dated tree sequence with lower
    bounds derived from ancient samples, then write a dated sampledata copy
    of the high-coverage samples file to args.output.
    """
    modern_samples = tsinfer.load(args.modern)
    high_cov_samples = tsinfer.load(args.high_cov)
    all_ancient_samples = tsinfer.load(args.all_samples)
    dated_hgdp_1kg_sgdp_ts = tskit.load(args.dated_ts)
    sites_time = tsdate.sites_time_from_ts(dated_hgdp_1kg_sgdp_ts)
    # Only look at sites where the same alleles are found in ancients and
    # moderns. This means the site must be biallelic in moderns and the
    # derived allele is shared between moderns and ancients
    alleles_equal = np.full(high_cov_samples.num_sites, False, dtype=bool)
    for index, (modern_alleles, high_cov_alleles,
                all_ancient_alleles) in enumerate(
                    zip(
                        modern_samples.sites_alleles[:],
                        high_cov_samples.sites_alleles[:],
                        all_ancient_samples.sites_alleles[:],
                    )):
        # Drop empty allele slots before comparing the allele lists.
        modern_alleles = [i for i in modern_alleles if i]
        high_cov_alleles = [i for i in high_cov_alleles if i]
        all_ancient_alleles = [i for i in all_ancient_alleles if i]
        if modern_alleles == high_cov_alleles == all_ancient_alleles:
            alleles_equal[index] = True
    # Get the ancient bounds from sampledata file of all ancients
    all_ancient_samples_bound = all_ancient_samples.min_site_times(
        individuals_only=True)
    high_cov_samples_bound = high_cov_samples.min_site_times(
        individuals_only=True)
    # Assert that the all ancient samples file has same or older ancient
    # bounds than the high-coverage-only file.
    assert np.all(all_ancient_samples_bound >= high_cov_samples_bound)
    print(
        "Number of ancient lower bounds (with multiallelic sites): ",
        np.sum(all_ancient_samples_bound != 0),
    )
    # Set time of non-biallelic sites to 0
    all_ancient_samples_bound[~alleles_equal] = 0
    # If args.transversions_only is True, set time of all transversions to 0
    if args.transversions_only:
        transversions = get_transversions(all_ancient_samples)
        all_ancient_samples_bound[~transversions] = 0
    # Constrain the estimated ages from tree sequence with ancient bounds
    constrained_sites_time = np.maximum(sites_time, all_ancient_samples_bound)
    # Add constrained times to sampledata file with moderns and high cov
    # ancients
    dated_samples = tsdate.add_sampledata_times(high_cov_samples,
                                               constrained_sites_time)
    # Record number of constrained sites
    print("Total number of sites: ", sites_time.shape[0])
    print(
        "Number of ancient lower bounds: ",
        np.sum(all_ancient_samples_bound != 0),
    )
    print("Number of corrected times: ",
          np.sum(dated_samples.sites_time[:] != sites_time))
    high_cov_samples_copy = dated_samples.copy(args.output)
    high_cov_samples_copy.finalise()
def __init__(self, data_file, ancestral_states, samples, target_samples=None):
    """
    Initialise the converter state and zero all diagnostic counters.

    :param data_file: the input data file (consumed elsewhere in the class —
        treated as opaque here).
    :param ancestral_states: ancestral-state data used during conversion.
    :param samples: the samples object being populated.
    :param target_samples: optional path to a tsinfer samples file; when
        given, its site positions form the target set for conversion.
    """
    self.data_file = data_file
    self.ancestral_states = ancestral_states
    self.samples = samples
    if target_samples is not None:
        # Set of site positions that conversion should be restricted to.
        self.target_sites_pos = set(
            tsinfer.load(target_samples).sites_position[:])
    else:
        self.target_sites_pos = None
    self.num_samples = -1  # -1 marks "not yet known"
    self.num_sites = 0
    # ancestral states counters.
    self.num_no_ancestral_state = 0
    self.num_low_confidence_ancestral_state = 0
    # Counters for genotypes and sites.
    self.num_unphased = 0
    self.num_missing_data = 0
    self.num_invariant = 0
    self.num_indels = 0
    self.num_non_biallelic = 0
    self.num_singletons = 0
    # (n - 1)-tons
    self.num_nmo_tons = 0
def generate_ancestors(samples_fn, num_threads, prefix):
    """
    Generate an ancestors file from a samples file, insert proxy samples for
    any historical (nonzero-time) individuals, and truncate ancestral
    haplotypes around the middle of the time range.

    :param samples_fn: path to the tsinfer .samples file.
    :param num_threads: worker threads for ancestor generation.
    :param prefix: output path prefix; writes "<prefix>.ancestors" and
        "<prefix>.truncated.ancestors" (plus "<prefix>.proxy.ancestors"
        when historical individuals are present).
    :return: the truncated AncestorData object.
    """
    sample_data = tsinfer.load(samples_fn)
    anc = tsinfer.generate_ancestors(
        sample_data,
        num_threads=num_threads,
        path=prefix + ".ancestors",
        progress_monitor=True,
    )
    # Historical individuals (time != 0) are inserted as proxy ancestors so
    # they can act as copying sources during matching.
    if np.any(sample_data.individuals_time[:] != 0):
        anc_w_proxy = anc.insert_proxy_samples(sample_data,
                                               allow_mutation=True)
        anc = anc_w_proxy.copy(path=prefix + ".proxy.ancestors")
        anc.finalise()
    maximum_time = np.max(anc.ancestors_time[:])
    if (maximum_time < 3
        ):  # hacky way of checking if we used frequency to order ancestors
        # Presumably frequency-ordered times lie in [0, 1], so use absolute
        # truncation bounds — TODO confirm against the ordering used upstream.
        anc = anc.truncate_ancestors(0.4,
                                     0.6,
                                     length_multiplier=1,
                                     path=prefix + ".truncated.ancestors")
    else:
        # Otherwise truncate between 40% and 60% of the maximum time.
        upper_time_limit = maximum_time * 0.6
        lower_time_limit = maximum_time * 0.4
        anc = anc.truncate_ancestors(
            lower_time_limit,
            upper_time_limit,
            length_multiplier=1,
            path=prefix + ".truncated.ancestors",
        )
    return anc
def setUp(self):
    """
    Build a complete set of tsinfer artefacts (samples, ancestors,
    ancestors-trees, and inferred trees) from a small deterministic
    simulation, all inside a temporary directory.
    """
    self.tempdir = tempfile.TemporaryDirectory(prefix="tsinfer_cli_test")
    self.sample_file = str(
        pathlib.Path(self.tempdir.name, "input-data.samples"))
    self.ancestor_file = str(
        pathlib.Path(self.tempdir.name, "input-data.ancestors"))
    self.ancestor_trees = str(
        pathlib.Path(self.tempdir.name, "input-data.ancestors.trees"))
    self.output_trees = str(
        pathlib.Path(self.tempdir.name, "input-data.trees"))
    # Fixed random_seed makes the fixture reproducible across test runs.
    self.input_ts = msprime.simulate(10,
                                     mutation_rate=10,
                                     recombination_rate=10,
                                     random_seed=10)
    sample_data = tsinfer.SampleData(
        sequence_length=self.input_ts.sequence_length, path=self.sample_file)
    for var in self.input_ts.variants():
        sample_data.add_site(var.site.position, var.genotypes, var.alleles)
    sample_data.finalise()
    # Run the full generate/match pipeline so every artefact exists on disk.
    tsinfer.generate_ancestors(sample_data,
                               path=self.ancestor_file,
                               chunk_size=10)
    ancestor_data = tsinfer.load(self.ancestor_file)
    ancestors_ts = tsinfer.match_ancestors(sample_data, ancestor_data)
    ancestors_ts.dump(self.ancestor_trees)
    ts = tsinfer.match_samples(sample_data, ancestors_ts)
    ts.dump(self.output_trees)
    sample_data.close()
def setup_sample_file(args):
    """
    Return a Thousand Genomes Project sample data file, the corresponding
    recombination rate array, a prefix to use for files, and None.

    The recombination distances are genetic-map based when a map is supplied
    (or a "chrNN" token is found in the filename), otherwise physical
    distances scaled by the sequence length.

    :raises ValueError: if the sample file does not end with ".samples".
    """
    filename = args.sample_file
    # Renamed from `map`/`chr` below: those names shadow Python builtins.
    gmap = args.genetic_map
    if not filename.endswith(".samples"):
        raise ValueError("Sample data file must end with '.samples'")
    sd = tsinfer.load(filename)
    # Positions of the sites actually used for inference.
    inference_pos = sd.sites_position[:][sd.sites_inference[:]]
    match = re.search(r'(chr\d+)', filename)
    if match or gmap is not None:
        if gmap is not None:
            chr_map = msprime.RecombinationMap.read_hapmap(gmap)
        else:
            chrom = match.group(1)
            print(
                f"Using {chrom} from HapMapII_GRCh37 for the recombination map")
            gmap = stdpopsim.get_species("HomSap").get_genetic_map(
                id="HapMapII_GRCh37")
            if not gmap.is_cached():
                gmap.download()
            chr_map = gmap.get_chromosome_map(chrom)
        inference_distances = physical_to_genetic(chr_map, inference_pos)
        d = np.diff(inference_distances)
        # First site gets zero recombination distance by convention.
        rho = np.concatenate(([0.0], d))
    else:
        inference_distances = inference_pos
        d = np.diff(inference_distances)
        rho = np.concatenate(([0.0], d / sd.sequence_length))
    return sd, rho, filename[:-len(".samples")], None
def split_chromosome(args):
    """
    Split a sampledata file into one chromosome arm ("p" or "q") at the
    midpoint of the centromere, as read from a UCSC-style centromeres CSV.
    """
    match = re.search(r"(chr\d+)", args.chrom)
    if match is None:
        raise ValueError("chr must be in filename")
    chrom = match.group(1)
    with open(args.centromeres) as csvfile:
        reader = csv.DictReader(csvfile)
        centromere_positions = list()
        for row in reader:
            if row["chrom"] == chrom:
                centromere_positions.append(int(row["chromStart"]))
                centromere_positions.append(int(row["chromEnd"]))
    start = np.min(centromere_positions)
    end = np.max(centromere_positions)
    # Split halfway through the centromere span.
    split_point = (start + end) / 2
    samples = tsinfer.load(args.input)
    position = samples.sites_position[:]
    print(f"Splitting at {split_point}")
    # NOTE(review): sites exactly at split_point are excluded from both
    # arms (strict < and >) — confirm that is intended.
    if args.arm == "p":
        keep_sites = np.where(position < split_point)[0]
        print(f"Keeping {keep_sites.shape[0]} sites")
        arm = samples.subset(sites=keep_sites)
        snipped_samples = arm.copy(path=args.output)
        # The p arm also shrinks the stored sequence length to the split.
        snipped_samples.data.attrs["sequence_length"] = split_point
    elif args.arm == "q":
        keep_sites = np.where(position > split_point)[0]
        print(f"Keeping {keep_sites.shape[0]} sites")
        arm = samples.subset(sites=keep_sites)
        snipped_samples = arm.copy(path=args.output)
    snipped_samples.finalise()
def run_get_dated_samples(args):
    """Copy a samples file, filling its sites_time from a dated tree sequence."""
    sample_data = tsinfer.load(args.samples)
    tree_seq = tskit.load(args.ts)
    assert args.samples.endswith(".samples")
    # Write alongside the input, swapping ".samples" for ".dated.samples".
    out_path = args.samples[:-len(".samples")] + ".dated.samples"
    dated_copy = sample_data.copy(out_path)
    dated_copy.sites_time[:] = tsdate.get_sites_time(tree_seq)
    dated_copy.finalise()
def remove_moderns_reich(args):
    """Keep only ancient individuals (time != 0) and sites that still carry
    a derived allele, writing the pruned sampledata to args.output."""
    sample_data = tsinfer.load(args.input)
    ancient_ids = np.where(sample_data.individuals_time[:] != 0)[0]
    ancients = sample_data.subset(individuals=ancient_ids)
    # Drop sites with no remaining derived-allele carriers.
    genotype_matrix = ancients.sites_genotypes[:]
    variant_sites = np.where((genotype_matrix == 1).sum(axis=1) != 0)[0]
    pruned = ancients.subset(sites=variant_sites)
    out_copy = pruned.copy(args.output)
    out_copy.finalise()
def setup_sample_file(args):
    """
    Return a Thousand Genomes Project sample data file and a prefix to use
    for output files (the input filename with ".samples" stripped).

    :raises ValueError: if the sample file does not end with ".samples".
    """
    filename = args.sample_file
    if not filename.endswith(".samples"):
        raise ValueError("Sample data file must end with '.samples'")
    sd = tsinfer.load(filename)
    # The trailing comma is redundant: this returns the 2-tuple (sd, prefix).
    return sd, filename[:-len(".samples")],
def main():
    """
    CLI entry point: parse arguments, then either run full tsinfer inference
    or inject known ancestors from a tree sequence before matching.
    """
    description = """Simple CLI wrapper for tsinfer
        tskit version: {}
        tsinfer version: {}""".format(tskit.__version__, tsinfer.__version__)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--verbosity', '-v', action='count', default=0)
    parser.add_argument(
        "samples",
        help="The samples file name, as saved by tsinfer.SampleData.initialise()")
    parser.add_argument(
        "output",
        help="The path to write the output file to")
    parser.add_argument(
        "-l", "--length", default=None, type=int,
        help="The total sequence length")
    parser.add_argument(
        "-t", "--threads", default=1, type=int,
        help="The number of worker threads to use")
    parser.add_argument(
        "-m", "--method", default="C", choices=['C', 'P'],
        help="Which implementation to use, [C] (faster) or [P]ython (more debuggable)")
    parser.add_argument(
        "--inject-real-ancestors-from-ts", default=None,
        help="Instead of inferring ancestors, construct known ones from this tree sequence file path")
    parser.add_argument(
        "-V", "--version", action='version', version=description)
    args = parser.parse_args()
    engine = tsinfer.PY_ENGINE if args.method == "P" else tsinfer.C_ENGINE
    if not os.path.isfile(args.samples):
        raise ValueError("No samples file")
    sample_data = tsinfer.load(args.samples)
    # Fail early when there are no sites usable for inference.
    if all(False for _ in sample_data.genotypes(inference_sites=True)):
        raise ValueError("No inference sites")
    if args.inject_real_ancestors_from_ts is not None:
        # Build "true" ancestors from a known tree sequence rather than
        # inferring them from the sample data.
        ancestor_data = tsinfer.AncestorData.initialise(sample_data,
                                                        compressor=None)
        orig_ts = tskit.load(args.inject_real_ancestors_from_ts)
        eval_util.build_simulated_ancestors(sample_data, ancestor_data,
                                            orig_ts)
        ancestor_data.finalise()
        ancestors_ts = tsinfer.match_ancestors(
            sample_data, ancestor_data, engine=engine)
        ts = tsinfer.match_samples(
            sample_data, ancestors_ts, engine=engine, simplify=True)
    else:
        ts = tsinfer.infer(
            sample_data, num_threads=args.threads, engine=engine)
    ts.dump(args.output)
def add_indiv_times(args):
    """
    Takes samples 'age' in metadata and add to individuals_time[:]
    """
    sample_data = tsinfer.load(args.input)
    updated_times = sample_data.individuals_time[:]
    out_copy = sample_data.copy(args.output)
    # Pull each individual's age out of its metadata, where present.
    for individual in sample_data.individuals():
        if "age" in individual.metadata:
            updated_times[individual.id] = int(individual.metadata["age"])
    out_copy.individuals_time[:] = updated_times
    out_copy.finalise()
def combined_ts_constrained_samples(args):
    """Date a sampledata file using site times taken from a dated tree
    sequence, then write the result to args.output."""
    sample_data = tsinfer.load(args.high_cov)
    dated_ts = tskit.load(args.dated_ts)
    site_ages = tsdate.sites_time_from_ts(dated_ts)
    constrained = tsdate.add_sampledata_times(sample_data, site_ages)
    # Record number of constrained sites
    print("Total number of sites: ", site_ages.shape[0])
    ancient_bounds = sample_data.min_site_times(individuals_only=True)
    print("Number of ancient lower bounds: ", np.sum(ancient_bounds != 0))
    print("Number of corrected times: ",
          np.sum(constrained.sites_time[:] != site_ages))
    output_copy = constrained.copy(args.output)
    output_copy.finalise()
def match_ancestors(samples_fn, anc, num_threads, precision, r_prob, m_prob,
                    prefix):
    """Match ancestors against each other and dump the resulting ancestors
    tree sequence to "<prefix>.atrees"."""
    samples = tsinfer.load(samples_fn)
    anc_ts = tsinfer.match_ancestors(
        samples,
        anc,
        num_threads=num_threads,
        precision=precision,
        recombination=r_prob,
        mismatch=m_prob,
        progress_monitor=True,
    )
    anc_ts.dump(prefix + ".atrees")
    return anc_ts
def run_list(args):
    """Summarise the file at args.path: as a tree sequence if tskit can
    load it, otherwise as a tsinfer file."""
    setup_logging(args)
    # First try to load with tskit; fall back to tsinfer on format errors.
    try:
        ts = tskit.load(args.path)
    except tskit.FileFormatError:
        ts = None
    if ts is not None:
        summarise_tree_sequence(args.path, ts)
        return
    tsinfer_file = tsinfer.load(args.path)
    print(tsinfer_file.info if args.storage else tsinfer_file)
def match_samples(samples_fn, inferred_anc_ts, num_threads, r_prob, m_prob,
                  precision, prefix):
    """Match samples against an ancestors tree sequence and dump the
    unsimplified result to "<prefix>.nosimplify.trees"."""
    samples = tsinfer.load(samples_fn)
    result_ts = tsinfer.match_samples(
        samples,
        inferred_anc_ts,
        num_threads=num_threads,
        recombination=r_prob,
        mismatch=m_prob,
        precision=precision,
        progress_monitor=True,
        force_sample_times=True,
        simplify=False,
    )
    result_ts.dump(prefix + ".nosimplify.trees")
    return result_ts
def setup_sample_file(base_filename, args, num_threads=1):
    """
    Return a sample data file, the ancestors file, a corresponding
    recombination rate (a single number or a RateMap), a prefix to use
    for files, and None
    """
    gmap = args.genetic_map
    sd = tsinfer.load(base_filename + ".samples")
    anc = tsinfer.generate_ancestors(
        sd,
        num_threads=num_threads,
        path=base_filename + ".ancestors",
    )
    logger.info("GA done")
    inference_pos = anc.sites_position[:]
    # Identify the chromosome from the filename (e.g. "chr20") so the right
    # HapMap recombination map can be selected when none is supplied.
    match = re.search(r'(chr\d+)', base_filename)
    if match or gmap is not None:
        if gmap is not None:
            logger.info(f"Using {gmap} for the recombination map")
            rho = intervals.read_hapmap(gmap)
        else:
            chr = match.group(1)
            logger.info(
                f"Using {chr} from HapMapII_GRCh37 for the recombination map")
            gmap = stdpopsim.get_species("HomSap").get_genetic_map(
                id="HapMapII_GRCh37")
            if not gmap.is_cached():
                gmap.download()
            filename = os.path.join(gmap.map_cache_dir,
                                    gmap.file_pattern.format(id=chr))
            rho = intervals.read_hapmap(filename)
    else:
        rho = 1e-8  # shouldn't matter what this is - it it relative to mismatch
    #if np.any(d==0):
    #    w = np.where(d==0)
    #    raise ValueError("Zero recombination rates at", w, inference_pos[w])
    return sd.path, anc.path, rho, "", None
def remove_outliers(args):
    """
    Remove sites with an outlying number of mutations (more than three
    standard deviations above the mean) from both a tree sequence and the
    matching sampledata file, writing args.output_ts and args.output_samples.
    """
    tree_seq = tskit.load(args.ts)
    samples = tsinfer.load(args.samples)
    # Find number of mutations per site. muts_per_site[0] holds the site IDs
    # that carry at least one mutation; muts_per_site[1] their counts.
    muts_per_site = np.unique(tree_seq.tables.mutations.site,
                              return_counts=True)
    mean_muts_per_site = np.mean(muts_per_site[1])
    std_muts_per_site = np.std(muts_per_site[1])
    print("Mean number of muts per site: ", mean_muts_per_site)
    print("Std number of muts per site: ", std_muts_per_site)
    # Find outliers: greater than 3 standard deviations from the mean number
    # of mutations per site
    outliers = muts_per_site[1] > mean_muts_per_site + 3 * std_muts_per_site
    outlier_sites = muts_per_site[0][outliers]
    # BUG FIX: previously np.where(muts_per_site[0][outliers])[0] treated the
    # integer array of site IDs as a boolean mask, so it returned positions
    # of nonzero IDs (silently dropping site 0 and yielding indices into the
    # filtered array) rather than the outlier site IDs themselves. The IDs
    # are what delete_sites expects.
    tree_seq = tree_seq.delete_sites(outlier_sites)
    tree_seq.dump(args.output_ts)
    # Keep every site in the sampledata file except the outliers (same fix:
    # pass site indices directly instead of np.where on an ID array).
    keep_sites = np.setdiff1d(np.arange(samples.num_sites), outlier_sites)
    samples_subset = samples.subset(sites=keep_sites)
    samples_subset_copy = samples_subset.copy(args.output_samples)
    samples_subset_copy.finalise()
    print(" Number of muts removed: ", np.sum(outliers))
def run_sequential_augment(args):
    """
    Iteratively augment an ancestors tree sequence with doubling subsets of
    randomly chosen samples (2, 4, 8, ... while n < num_samples // 4),
    dumping each augmented ancestors file, then match all samples against
    the final augmented ancestors.
    """
    base = ".".join(args.input.split(".")[:-1])
    sample_data = tsinfer.load(args.input)
    num_samples = sample_data.num_samples
    ancestors_ts = tskit.load(base + ".ancestors.trees")
    # Compute the total samples required.
    n = 2
    total = 0
    while n < num_samples // 4:
        total += n
        n *= 2
    np.random.seed(args.seed)
    # One fixed random draw covering every round, saved for reproducibility.
    samples = np.random.choice(np.arange(num_samples),
                               size=total,
                               replace=False)
    np.save(base + ".augmented_samples.npy", samples)
    n = 2
    j = 0
    while n < num_samples // 4:
        augmented_file = base + ".augmented_{}.ancestors.trees".format(n)
        final_file = base + ".augmented_{}.nosimplify.trees".format(n)
        # Each round consumes the next n sample IDs from the saved draw.
        subset = samples[j:j + n]
        subset.sort()
        ancestors_ts = run_augment(sample_data, ancestors_ts, subset,
                                   args.num_threads)
        ancestors_ts.dump(augmented_file)
        j += n
        n *= 2
    # NOTE(review): final_file is only assigned inside the loop, so with
    # num_samples // 4 <= 2 the loop never runs and the dump below raises
    # NameError — confirm callers always have enough samples.
    final_ts = run_match_samples(sample_data, ancestors_ts, args.num_threads)
    final_ts.dump(final_file)
def merge_sampledata_files(args):
    """
    Merge multiple sampledata files into one, writing the result to
    args.output. Before each merge, site metadata already accumulated in the
    merged file is copied into the incoming file at shared positions so the
    merged metadata wins.
    """
    samples = []
    for cur_sample in args.input_sampledata:
        samples.append(tsinfer.load(cur_sample))
    merged_samples = samples[0]
    for index, other_samples in enumerate(samples[1:]):
        print("Loaded sampledata file # {}".format(index))
        # Boolean mask over merged sites also present in the other file.
        intersect_sites = np.isin(merged_samples.sites_position[:],
                                  other_samples.sites_position[:])
        # Indices (in the other file) of sites shared with the merge so far.
        other_intersect_sites = np.where(
            np.isin(other_samples.sites_position[:],
                    merged_samples.sites_position[:]))[0]
        other_samples_metadata = other_samples.sites_metadata[:]
        # NOTE(review): pairing these two intersections with zip assumes both
        # position arrays are sorted so the shared sites line up in the same
        # order — confirm for unsorted inputs.
        for site_index, site_metadata in zip(
                other_intersect_sites,
                merged_samples.sites_metadata[:][intersect_sites]):
            other_samples_metadata[site_index] = site_metadata
        other_samples_copy = other_samples.copy()
        other_samples_copy.sites_metadata[:] = other_samples_metadata
        other_samples_copy.finalise()
        merged_samples = merged_samples.merge(other_samples_copy)
        print("Merged sampledata file # {}".format(index))
    merged_copy = merged_samples.copy(args.output)
    merged_copy.finalise()
def run_build():
    """Generate ancestors from the samples file named on the command line
    and print the resulting AncestorData summary."""
    ancestor_data = tsinfer.generate_ancestors(tsinfer.load(sys.argv[1]))
    print(ancestor_data)
def run_combine_ukbb_1kg(args):
    """
    Combine UKBB samples with a 1000 Genomes ancestors tree sequence for one
    chromosome: subset the ancestors ts to the sites shared with UKBB,
    rewrite it with a single oldest root (node 0) spanning the sequence, and
    write a new samples file restricted to the intersecting sites.
    """
    ukbb_samples_file = "ukbb_{}.samples".format(args.chromosome)
    tg_ancestors_ts_file = "1kg_{}.trees".format(args.chromosome)
    ancestors_ts_file = "1kg_ukbb_{}.ancestors.trees".format(args.chromosome)
    samples_file = "1kg_ukbb_{}.samples".format(args.chromosome)
    ukbb_samples = tsinfer.load(ukbb_samples_file)
    tg_ancestors_ts = tskit.load(tg_ancestors_ts_file)
    print("Loaded ts:", tg_ancestors_ts.num_nodes, tg_ancestors_ts.num_edges)
    # Subset the sites down to the UKBB sites.
    tables = tg_ancestors_ts.dump_tables()
    ukbb_sites = set(ukbb_samples.sites_position[:])
    ancestors_sites = set(tables.sites.position[:])
    intersecting_sites = ancestors_sites & ukbb_sites
    print("Intersecting sites = ", len(intersecting_sites))
    tables.sites.clear()
    tables.mutations.clear()
    for site in tg_ancestors_ts.sites():
        if site.position in intersecting_sites:
            # Sites must be 0/1 for the ancestors ts.
            site_id = tables.sites.add_row(position=site.position,
                                           ancestral_state="0")
            assert len(site.mutations) == 1
            mutation = site.mutations[0]
            tables.mutations.add_row(site=site_id,
                                     node=mutation.node,
                                     derived_state="1")
    # Reduce this to the site topology now to make things as quick as possible.
    tables.simplify(reduce_to_site_topology=True, filter_sites=False)
    reduced_ts = tables.tree_sequence()
    # Rewrite the nodes so that 0 is one older than all the other nodes.
    nodes = tables.nodes.copy()
    tables.nodes.clear()
    tables.nodes.add_row(flags=1, time=np.max(nodes.time) + 2)
    tables.nodes.append_columns(
        flags=np.bitwise_or(nodes.flags, 1),  # Everything is a sample.
        time=nodes.time + 1,  # Make sure that all times are > 0
        population=nodes.population,
        individual=nodes.individual,
        metadata=nodes.metadata,
        metadata_offset=nodes.metadata_offset)
    # Add one to all node references to account for this.
    tables.edges.set_columns(left=tables.edges.left,
                             right=tables.edges.right,
                             parent=tables.edges.parent + 1,
                             child=tables.edges.child + 1)
    tables.mutations.set_columns(
        node=tables.mutations.node + 1,
        site=tables.mutations.site,
        parent=tables.mutations.parent,
        derived_state=tables.mutations.derived_state,
        derived_state_offset=tables.mutations.derived_state_offset,
        metadata=tables.mutations.metadata,
        metadata_offset=tables.mutations.metadata_offset)
    # Connect the new node 0 above each distinct tree root, one edge per run
    # of trees sharing the same root.
    trees = reduced_ts.trees()
    tree = next(trees)
    left = 0
    root = tree.root
    for tree in trees:
        if tree.root != root:
            tables.edges.add_row(left, tree.interval[0], 0, root + 1)
            root = tree.root
            left = tree.interval[0]
    tables.edges.add_row(left, reduced_ts.sequence_length, 0, root + 1)
    tables.sort()
    ancestors_ts = tables.tree_sequence()
    print("Writing ancestors_ts")
    ancestors_ts.dump(ancestors_ts_file)
    # Now create a new samples file to get rid of the missing sites.
    git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"])
    git_provenance = {
        "repo": "[email protected]:mcveanlab/treeseq-inference.git",
        "hash": git_hash.decode().strip(),
        "dir": "human-data",
        "notes:": ("Use the Makefile to download and process the upstream data files")
    }
    # Optionally restrict to the first n individuals (2n haplotypes).
    n = args.num_individuals
    if n is None:
        n = ukbb_samples.num_individuals
    with tsinfer.SampleData(
            path=samples_file,
            num_flush_threads=4,
            sequence_length=ukbb_samples.sequence_length) as samples:
        iterator = tqdm.tqdm(itertools.islice(
            tqdm.tqdm(ukbb_samples.individuals()), n),
                             total=n)
        for ind in iterator:
            samples.add_individual(ploidy=2,
                                   location=ind.location,
                                   metadata=ind.metadata)
        for variant in tqdm.tqdm(ukbb_samples.variants(),
                                 total=ukbb_samples.num_sites):
            if variant.site.position in intersecting_sites:
                samples.add_site(position=variant.site.position,
                                 alleles=variant.alleles,
                                 genotypes=variant.genotypes[:2 * n],
                                 metadata=variant.site.metadata)
        # Preserve provenance from the source file and record this command.
        for timestamp, record in ukbb_samples.provenances():
            samples.add_provenance(timestamp, record)
        samples.record_provenance(command=sys.argv[0],
                                  args=sys.argv[1:],
                                  git=git_provenance)
    print(samples)
def setup_sampledata_from_simulation(prefix,
                                     random_seed,
                                     err=0,
                                     num_threads=1,
                                     cheat_breakpoints=False,
                                     use_sites_time=False,
                                     skip_existing=False):
    """
    Take the results of a simulation and return a sample data file, some
    reconstructed ancestors, a recombination rate array, a suffix to append
    to the file prefix, and the original tree sequence.

    If 'err' is 0, we do not inject any errors into the haplotypes.
    Otherwise we add empirical sequencing error and ancestral allele
    polarity error.

    If "cheat_breakpoints" is True, multiply the recombination_rate for
    known recombination locations from the simulation by 20 (currently
    unimplemented — raises NotImplementedError).

    If "use_sites_time" is True, use the known site times.

    If "skip_existing" is True, and the sample_data file and ancestors_file
    that were going to be generated already exist, then skip the actual
    simulation and just return those files and their data.
    """
    suffix = ""
    ts = tskit.load(prefix + ".trees")
    plain_samples = tsinfer.SampleData.from_tree_sequence(
        ts, use_sites_time=use_sites_time)
    if cheat_breakpoints:
        suffix += "cheat_breakpoints"
        logger.info("Cheating by using known breakpoints")
    if use_sites_time:
        suffix += "use_times"
        logger.info("Cheating by using known times")
    if err == 0:
        sd_path = prefix + suffix + ".samples"
        if skip_existing and os.path.exists(sd_path):
            logger.info(
                f"Simulation file {sd_path} already exists, loading that.")
            sd = tsinfer.load(sd_path)
        else:
            sd = plain_samples.copy(path=sd_path)  # Save the samples file
            sd.finalise()
    else:
        logger.info("Adding error")
        suffix += f"_ae{err}"
        sd_path = prefix + suffix + ".samples"
        if skip_existing and os.path.exists(sd_path):
            logger.info(f"Sample file {sd_path} already exists, loading that.")
            sd = tsinfer.load(sd_path)
        else:
            error_file = add_errors(plain_samples, err,
                                    random_seed=random_seed)
            sd = error_file.copy(path=prefix + suffix + ".samples")
            if use_sites_time:
                # Sites that were originally singletons have time 0, but
                # could have been converted to inference sites when adding
                # error. Give these a nonzero time
                sites_time = sd.sites_time
                sites_time[sites_time == 0] = np.min(
                    sites_time[sites_time > 0]) / 1000.0
                sd.sites_time[:] = sites_time
            sd.finalise()
    # Sanity-check the sample data against the simulation it came from.
    for attribute in ('sequence_length', 'num_samples', 'num_sites'):
        if getattr(sd, attribute) != getattr(ts, attribute):
            raise ValueError(
                f"{attribute} differs between original ts and sample_data: "
                f"{getattr(sd, attribute)} vs {getattr(ts, attribute)}")
    anc_path = prefix + suffix + ".ancestors"
    if skip_existing and os.path.exists(anc_path):
        logger.info(f"Ancestors file {anc_path} already exists, loading that.")
        anc = tsinfer.load(anc_path)
    else:
        anc = tsinfer.generate_ancestors(
            sd,
            num_threads=num_threads,
            path=anc_path,
        )
        logger.info("GA done")
    inference_pos = anc.sites_position[:]
    rho = 1e-8  # shouldn't matter what this is - it it relative to mismatch
    if cheat_breakpoints:
        raise NotImplementedError(
            "Need to make a RateMap with higher r at breakpoints")
        # NOTE(review): everything below in this branch is unreachable after
        # the raise above, and len(rho)/rho[breakpoints] would fail anyway
        # since rho is a scalar float here.
        breakpoint_positions = np.array(list(ts.breakpoints()))
        inference_positions = anc.sites_position[:]
        breakpoints = np.searchsorted(inference_positions,
                                      breakpoint_positions)
        # Any after the last inference position must be junked
        # (those before the first inference position make no difference)
        breakpoints = breakpoints[breakpoints != len(rho)]
        rho[breakpoints] *= 20
    return sd.path, anc.path, rho, suffix, ts
def run(params):
    """
    Run a single inference, with the specified rates.

    Matches ancestors then samples with the given mismatch ratios and
    precision, optionally reusing files from earlier runs, computes polytomy
    and (when a simulated source is available) KC-distance statistics, and
    stores the results dict both in the dumped ts metadata and as the return
    value.
    """
    precision = params.precision
    logger.info(
        f"Starting {params.ma_mis_ratio} {params.ms_mis_ratio}. Precision {precision}"
    )
    prefix = None
    assert params.sample_file.endswith(".samples")
    assert params.anc_file.endswith(".ancestors")
    samples = tsinfer.load(params.sample_file)
    ancestors = tsinfer.load(params.anc_file)
    start_time = time.process_time()
    prefix = params.sample_file[0:-len(".samples")]
    # Encode the run parameters in the output filenames.
    inf_prefix = "{}_rma{:g}_rms{:g}_p{}".format(prefix, params.ma_mis_ratio,
                                                 params.ms_mis_ratio,
                                                 precision)
    ats_path = inf_prefix + ".atrees"
    if params.skip_existing and os.path.exists(ats_path):
        logger.info(
            f"Ancestors ts file {ats_path} already exists, loading that.")
        inferred_anc_ts = tskit.load(ats_path)
        # Cross-check the loaded ancestors ts against the ancestors file via
        # the uuid recorded in its provenance.
        prov = json.loads(inferred_anc_ts.provenances()[-1].record.encode())
        if ancestors.uuid != prov['parameters']['source']['uuid']:
            logger.warning(
                "The loaded ancestors ts does not match the ancestors file. "
                "Checking the site positions, and will abort if they don't match!"
            )
            # We might be re-running this, but the simulation file is the same
            # so double-check that the positions in the ats are a subset of
            # those in the used sample data file.
            assert np.all(
                np.isin(inferred_anc_ts.tables.sites.position,
                        samples.sites_position[:]))
    else:
        logger.info(f"MA running: will save to {ats_path}")
        inferred_anc_ts = tsinfer.match_ancestors(
            samples,
            ancestors,
            num_threads=params.num_threads,
            precision=precision,
            recombination_rate=params.rec_rate,
            mismatch_ratio=params.ma_mis_ratio)
        inferred_anc_ts.dump(ats_path)
        logger.info(f"MA done: mismatch ratio = {params.ma_mis_ratio}")
    ts_path = inf_prefix + ".trees"
    if params.skip_existing and os.path.exists(ts_path):
        logger.info(
            f"Inferred ts file {ts_path} already exists, loading that.")
        inferred_ts = tskit.load(ts_path)
        try:
            user_data = inferred_ts.metadata['user_data']
            try:
                assert np.allclose(params.kc_max, user_data['kc_max'])
            except (KeyError, TypeError):
                pass  # could be NaN e.g. if this is real data
            return user_data
        except (TypeError, KeyError):
            # BUG FIX: this warning was missing the f-string prefix, so the
            # literal text "{ts_path}" was logged instead of the path.
            logging.warning(
                f"No metadata in {ts_path}: re-inferring these parameters")
    # Otherwise finish off the inference
    logger.info(
        f"MS running with {params.num_threads} threads: will save to {ts_path}"
    )
    inferred_ts = tsinfer.match_samples(samples,
                                        inferred_anc_ts,
                                        num_threads=params.num_threads,
                                        precision=precision,
                                        recombination_rate=params.rec_rate,
                                        mismatch_ratio=params.ms_mis_ratio)
    process_time = time.process_time() - start_time
    logger.info(f"MS done: mismatch ratio = {params.ms_mis_ratio}")
    simplified_inferred_ts = inferred_ts.simplify()  # Remove unary nodes
    # Calculate mean num children (polytomy-measure) for internal nodes,
    # weighting each node by the genomic span of its tree.
    nc_sum = 0
    nc_sum_sq = 0
    nc_tot = 0
    root_lengths = collections.defaultdict(float)
    for tree in simplified_inferred_ts.trees():
        for n in tree.nodes():
            n_children = tree.num_children(n)
            if n_children > 0:  # exclude leaves/samples
                nc_sum += n_children * tree.span
                nc_sum_sq += (n_children**2) * tree.span
                nc_tot += tree.span
    arity_mean = nc_sum / nc_tot
    arity_var = nc_sum_sq / nc_tot - (arity_mean**2
                                      )  # can't be bothered to adjust for n
    # Calculate KC distances against the simulated truth, if available.
    sim_ts_bytes = sim_ts_min_bytes = None
    kc_poly = kc_split = None
    if params.ts_file is not None:
        try:
            simulated_ts = tskit.load(params.ts_file + ".trees")
            logger.info(f"Calculating KC distances for {ts_path}")
            sim_ts_bytes = simulated_ts.nbytes
            sim_ts_min_bytes = simulated_ts.simplify(
                keep_unary=True,
                reduce_to_site_topology=True,
                filter_sites=False).nbytes
            kc_poly = simplified_inferred_ts.kc_distance(simulated_ts)
            logger.debug("KC poly calculated")
            # kc_split resolves polytomies randomly (seeded by interval) and
            # averages the KC distance over the genome.
            kc_split = 0
            for interval, orig_tree, new_tree in simulated_ts.coiterate(
                    simplified_inferred_ts, sample_lists=True):
                kc_split += interval.span * orig_tree.kc_distance(
                    new_tree.split_polytomies(random_seed=int(interval.left),
                                              sample_lists=True))
            kc_split /= simulated_ts.sequence_length
            logger.debug("KC split calculated")
        except FileNotFoundError:
            pass
    results = {
        'arity_mean': arity_mean,
        'arity_var': arity_var,
        'edges': inferred_ts.num_edges,
        'error': params.error,
        'kc_max_split': params.kc_max_split,
        'kc_max': params.kc_max,
        'kc_poly': kc_poly,
        'kc_split': kc_split,
        'muts': inferred_ts.num_mutations,
        'n': inferred_ts.num_samples,
        'num_sites': inferred_ts.num_sites,
        'num_trees': inferred_ts.num_trees,
        'precision': precision,
        'proc_time': process_time,
        'ma_mis_ratio': params.ma_mis_ratio,
        'ms_mis_ratio': params.ms_mis_ratio,
        'seed': params.seed,
        'sim_ts_min_bytes': sim_ts_min_bytes,
        'sim_ts_bytes': sim_ts_bytes,
        'source': params.source,
        'ts_bytes': inferred_ts.nbytes,
        'ts_path': ts_path,
    }
    # Save the results into the ts metadata - this should allow us to
    # reconstruct the results table should anything go awry, or if we need
    # to add more
    tables = inferred_ts.dump_tables()
    if tables.metadata_schema != tskit.MetadataSchema({"codec": "json"}):
        if tables.metadata:
            raise RuntimeError(
                "Metadata already exists in the ts, and is not JSON")
        tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        tables.metadata = {}
    tables.metadata = {"user_data": results, **tables.metadata}
    tables.tree_sequence().dump(ts_path)
    return results
def min_site_times_ancients(args):
    """Summarise the per-site lower time bounds implied by ancient samples.

    Loads the ancients-only SampleData file, computes the minimum site time
    per site (restricted to individuals), and writes the distribution of
    unique bound values (value, count) to a CSV file.
    """
    sample_data = tsinfer.load("all-data/1kg_ancients_noreich_chr20.samples")
    bounds = sample_data.min_site_times(individuals_only=True)
    unique_vals, counts = np.unique(bounds, return_counts=True)
    summary = pd.DataFrame((unique_vals, counts))
    summary.to_csv("data/1kg_ancients_chr20_min_site_times.csv")
def main():
    """CLI entry point: parse arguments and run the requested tsinfer step(s).

    Steps: GA (generate ancestors), MA (match ancestors), MS (match samples),
    or "infer" (all three in sequence, the default).
    """
    description = """Simple CLI wrapper for tsinfer
    tskit version: {}
    tsinfer version: {}""".format(tskit.__version__, tsinfer.__version__)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--verbosity", "-v", action="count", default=0)
    parser.add_argument(
        "samples",
        help=
        "The samples file name, as saved by tsinfer.SampleData.initialise()",
    )
    parser.add_argument("prefix", help="The prefix of the output filename")
    parser.add_argument(
        "-t",
        "--threads",
        default=1,
        type=int,
        help="The number of worker threads to use",
    )
    parser.add_argument(
        "-s",
        "--step",
        default="infer",
        # BUG FIX: "infer" was the default but absent from choices, so the
        # documented "all three (infer)" mode could never be passed explicitly
        # (argparse rejects values not in choices, but does not validate the
        # default).
        choices=["GA", "MA", "MS", "infer"],
        help=
        "Which step of the algorithm to run: generate ancestors (GA), match ancestors"
        "(MA), or match samples (MS) or all three (infer)",
    )
    parser.add_argument(
        "-m",
        "--genetic-map",
        default=None,
        help=
        "An alternative genetic map to be used for this analysis, in the format"
        "expected by msprime.RateMap.read_hapmap",
    )
    parser.add_argument(
        "-p",
        "--precision",
        default=None,
        type=int,
        help="The precision parameter to pass to the function",
    )
    parser.add_argument("-V",
                        "--version",
                        action="version",
                        version=description)
    args = parser.parse_args()
    if not os.path.isfile(args.samples):
        raise ValueError("No samples file")
    if args.step == "infer":
        anc = generate_ancestors(args.samples, args.threads, args.prefix)
        # BUG FIX: genetic_map was previously only bound when the flag was the
        # literal string "None", so any other value (including the argparse
        # default of None) left it undefined and get_rho raised NameError.
        # Mirror the if/else used by the MA and MS branches below.
        if args.genetic_map == "None":
            genetic_map = None
        else:
            genetic_map = args.genetic_map
        r_prob, m_prob = get_rho(anc, genetic_map, args.prefix)
        inferred_anc_ts = match_ancestors(args.samples, anc, args.threads,
                                          args.precision, r_prob, m_prob)
        match_samples(args.samples, inferred_anc_ts, args.threads, r_prob,
                      m_prob, args.precision)
    if args.step == "GA":
        anc = generate_ancestors(args.samples, args.threads, args.prefix)
    if args.step == "MA":
        # Ancestors are expected to have been generated (and truncated) by a
        # previous GA run with the same prefix.
        anc = tsinfer.load(args.prefix + ".truncated.ancestors")
        if args.genetic_map == "None":
            genetic_map = None
        else:
            genetic_map = args.genetic_map
        r_prob, m_prob = get_rho(anc, genetic_map, args.prefix)
        inferred_anc_ts = match_ancestors(args.samples, anc, args.threads,
                                          args.precision, r_prob, m_prob,
                                          args.prefix)
    if args.step == "MS":
        anc = tsinfer.load(args.prefix + ".truncated.ancestors")
        inferred_anc_ts = tskit.load(args.prefix + ".atrees")
        if args.genetic_map == "None":
            genetic_map = None
        else:
            genetic_map = args.genetic_map
        r_prob, m_prob = get_rho(anc, genetic_map, args.prefix)
        match_samples(
            args.samples,
            inferred_anc_ts,
            args.threads,
            r_prob,
            m_prob,
            args.precision,
            args.prefix,
        )
and bins the resulting times to the nearest 10 (unless the time is <= 1).
"""
import argparse

import numpy as np
import tsinfer
import tskit

if __name__ == "__main__":
    # Simple two-argument CLI: input and output SampleData paths.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_file",
                        help="A tsinfer sample file ending in '.samples")
    parser.add_argument("output_file",
                        help="A tsinfer sample file ending in '.samples")
    args = parser.parse_args()
    # Copy the input SampleData to the output path; the copy remains editable
    # until finalise() is called below.
    sd = tsinfer.load(args.input_file).copy(path=args.output_file)
    times = sd.sites_time[:]
    # Round every time above 1 to the nearest multiple of 10
    # (np.round with decimals=-1); times <= 1 are left untouched.
    times[times > 1] = np.round(times[times > 1], -1)
    # Bump zero times up to 1 so no site has time 0.
    times[times == 0] = 1
    # Write the discretised times back into the (un-finalised) copy.
    sd.sites_time[:] = times
    print(
        "Number of samples:",
        sd.num_samples,
        ". Number of discrete times:",
        len(np.unique(sd.sites_time[:])),
    )
    sd.finalise()
parser.add_argument("-p", "--percent_of_genome", type=float, default=10, help="The percent of the genome to include") parser.add_argument( "-s", "--genome_start_percent", type=int, default=0, help= "The genomic point at which to start the subsample, as a percentage of the" " total genome length") args = parser.parse_args() sd = tsinfer.load(args.input_file) num_samples = sd.num_samples if args.num_samples is None else args.num_samples assert num_samples <= sd.num_samples assert 0 < args.percent_of_genome <= 100 assert args.percent_of_genome + args.genome_start_percent <= 100 del_samples = np.random.choice(sd.num_samples, sd.num_samples - num_samples, replace=False) del_sites = np.ones(sd.num_sites, dtype=bool) start_keep = int(args.genome_start_percent / 100.0 * sd.num_sites) end_keep = start_keep + int(args.percent_of_genome / 100.0 * sd.num_sites) del_sites[np.arange(start_keep, end_keep)] = False small_sd = sd.delete(samples=del_samples,
def simulate_stdpopsim(
    species,
    model,
    contig,
    num_samples,
    mutation_file=None,
    seed=123,
    skip_existing=False,
    num_procs=1,
):
    """Simulate a tree sequence for ``species``/``contig`` under a stdpopsim
    demographic model, optionally re-inserting mutations at the variable-site
    positions of ``mutation_file`` (a tsinfer SampleData file), then trim to a
    ~10Mb interval and store KC-distance baselines (vs a star tree, and vs
    random split trees) in the tree sequence's top-level metadata.

    Returns a ``(base_fn, tree_fn)`` pair of filename prefixes; the tree
    sequence is dumped to ``tree_fn + ".trees"``.
    """
    base_fn = f"{model}_{contig}_n{num_samples}"
    tree_fn = f"{base_fn}_seed{seed}"
    logger.info(
        f"Using {species}:{contig} from stdpopsim using the {model} model")
    # Re-use a previous run's output if requested and present.
    if skip_existing and os.path.exists(tree_fn + ".trees"):
        logger.info(
            f"Simulation file {tree_fn}.trees already exists, returning that.")
        return base_fn, tree_fn
    sample_data = None
    species = stdpopsim.get_species(species)
    model = species.get_demographic_model(model)
    num_pops = model.num_sampling_populations
    # Samples are split evenly across the model's sampling populations.
    if num_samples < num_pops or num_samples % num_pops != 0:
        raise ValueError(
            f"num_samples must be an integer multiple of {num_pops} "
            f"(or 2 x {num_pops} if diploid sequencing error is injected)")
    pop_n = num_samples // num_pops
    logger.info(
        f"Simulating {num_pops}x{pop_n} samples, seed {seed}, file prefix '{tree_fn}'."
    )
    contig = species.get_contig(contig)
    l = contig.recombination_map.get_sequence_length()
    if mutation_file is not None:
        logger.debug(f"Loading {mutation_file}")
        sample_data = tsinfer.load(mutation_file)
        if sample_data.sequence_length != l:
            raise ValueError(
                f"Mismatching sequence_length between simulation and {mutation_file}"
            )
        # Reduce mutation rate to 0, as we will insert mutations later
        contig = stdpopsim.Contig(
            mutation_rate=0,
            recombination_map=contig.recombination_map,
            genetic_map=contig.genetic_map,
        )
    r_map = contig.recombination_map
    assert len(r_map.get_rates()) == 2  # Ensure a single rate over chr
    samples = model.get_samples(*([pop_n] * num_pops))
    engine = stdpopsim.get_engine('msprime')
    ts = engine.simulate(model, contig, samples, seed=seed)
    tables = ts.dump_tables()
    if sample_data is not None:
        # Insert one mutation for each variable-site position from the
        # SampleData file, placed on a branch chosen uniformly at random by
        # branch length within the local tree (sites/mutations are added to
        # `tables`, not to the original `ts`).
        pos = sample_data.sites_position[:]
        logger.info(
            f"Inserting {len(pos)} mutations at variable sites from {mutation_file}"
        )
        for tree in ts.trees():
            positions = pos[np.logical_and(pos >= tree.interval[0],
                                           pos < tree.interval[1])]
            if len(positions) == 0:
                continue
            # Pair each position with a random point on the cumulative branch
            # length of this tree; sorting lets us sweep branches in order.
            muts = list(
                zip(
                    np.random.uniform(0,
                                      tree.total_branch_length,
                                      size=len(positions)), positions))
            muts.sort()
            tot = 0
            # place a mutation on a random branch, proportional to branch length
            try:
                for n in tree.nodes():
                    tot += tree.branch_length(n)
                    while muts[0][0] < tot:
                        _, position = muts.pop(0)
                        s = tables.sites.add_row(position=position,
                                                 ancestral_state="0")
                        tables.mutations.add_row(node=n,
                                                 site=s,
                                                 derived_state="1")
            except IndexError:
                # No more mutations - go to next tree
                continue
        # Added rows are unsorted; restore table ordering requirements.
        tables.sort()
        # NOTE(review): this reads ts.num_mutations from the pre-insertion
        # tree sequence (simulated with mutation_rate=0), not the mutations
        # just added to `tables` — the logged density looks wrong; confirm.
        logger.debug(
            f"Inserted mutations at density {ts.num_mutations/ts.sequence_length}"
        )
    interval = [int(l * 2 / 20),
                int(l * 2 / 20) + 1e7]  # 10Mb near the start, not centromeric
    tables.keep_intervals([interval])
    tables.trim()
    logger.debug(
        f"Cut down tree seq to {interval} ({tables.sites.num_rows} sites) for speed"
    )
    # Add info to the top-level metadata
    user_data = {}
    logger.info(
        "Calculating the kc distance of the simulation against a flat tree")
    # kc_max: KC distance against a maximally-unresolved star tree over the
    # same samples and span.
    star_tree = tskit.Tree.generate_star(ts.num_samples,
                                         span=tables.sequence_length,
                                         record_provenance=False)
    user_data['kc_max'] = tables.tree_sequence().kc_distance(
        star_tree.tree_sequence)
    kc_array = []
    max_reps = 100
    ts = tables.tree_sequence()
    logger.info(
        f"Calculating KC distance of the sim against at most {max_reps} * {ts.num_trees}"
        f" random trees using {num_procs} parallel threads. This could take a while."
    )
    seeds = range(seed, seed + max_reps)
    # kc_max_split: mean KC distance against randomly resolved trees,
    # estimated in parallel (rnd_kc is a module-level helper, not in view).
    with multiprocessing.Pool(num_procs) as pool:
        for i, kc in enumerate(
                pool.imap_unordered(rnd_kc, zip(itertools.repeat(ts), seeds))):
            kc_array.append(kc)
            if i > 10:
                # Standard error of the mean over the replicates so far.
                se_mean = np.std(kc_array, ddof=1) / np.sqrt(i)
                # break if SEM < 1/100th of mean KC. This can take along time
                if se_mean / np.average(kc_array) < 0.01:
                    logger.info(
                        f"Stopped after {i} replicates as kc_max_split deemed accurate."
                    )
                    break
    user_data['kc_max_split'] = np.average(kc_array)
    # Store the results under "user_data" in the top-level metadata, requiring
    # a JSON metadata schema (refuse to clobber existing non-JSON metadata).
    if tables.metadata_schema != tskit.MetadataSchema({"codec": "json"}):
        if tables.metadata:
            raise RuntimeError("Metadata already exists, and is not JSON")
        tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        tables.metadata = {}
    tables.metadata = {"user_data": user_data, **tables.metadata}
    tables.tree_sequence().dump(tree_fn + ".trees")
    return base_fn, tree_fn