def test_format_written(self):
    """Check that a compressed file records the format name and version.

    A small simulated tree sequence is compressed to ``self.path``; the
    resulting zip store is then opened read-only and its root attributes
    are compared with the constants in ``compression``.
    """
    tree_sequence = msprime.simulate(10, random_seed=1)
    tszip.compress(tree_sequence, self.path)
    archive_name = str(self.path)
    with zarr.ZipStore(archive_name, mode="r") as zip_store:
        group = zarr.group(store=zip_store)
        self.assertEqual(
            group.attrs["format_name"], compression.FORMAT_NAME)
        self.assertEqual(
            group.attrs["format_version"], compression.FORMAT_VERSION)
def setUp(self):
    """Create a scratch directory, a simulated tree sequence and its .tsz.

    Sets ``self.tmpdir``, ``self.trees_path``, ``self.ts`` and
    ``self.compressed_path``. Only the compressed file is written here;
    ``self.trees_path`` is just the path, nothing is dumped to it.
    """
    self.tmpdir = tempfile.TemporaryDirectory(prefix="tszip_cli_")
    workdir = pathlib.Path(self.tmpdir.name)
    self.trees_path = workdir / "msprime.trees"
    self.ts = msprime.simulate(10, mutation_rate=10, random_seed=1)
    self.compressed_path = workdir / "msprime.trees.tsz"
    tszip.compress(self.ts, self.compressed_path)
def verify(self, ts):
    """Round-trip ``ts`` through tszip and compare genotype matrices.

    The tree sequence is compressed into a temporary directory,
    decompressed again, and the two genotype matrices must be equal
    element-wise.
    """
    with tempfile.TemporaryDirectory() as scratch:
        archive = pathlib.Path(scratch) / "treeseq.tsz"
        tszip.compress(ts, archive)
        restored = tszip.decompress(archive)
    expected = ts.genotype_matrix()
    observed = restored.genotype_matrix()
    matrices_match = np.array_equal(expected, observed)
    self.assertTrue(matrices_match)
def test_provenance(self):
    """Check the provenance attribute written for each compression mode.

    For ``variants_only`` True and then False, the same tree sequence is
    compressed to ``self.path`` and the stored ``provenance`` attribute
    must equal ``provenance.get_provenance_dict`` called with the
    matching parameters.
    """
    tree_sequence = msprime.simulate(10, random_seed=1)
    for flag in (True, False):
        tszip.compress(tree_sequence, self.path, variants_only=flag)
        with zarr.ZipStore(str(self.path), mode='r') as zip_store:
            group = zarr.group(store=zip_store)
            recorded = group.attrs["provenance"]
            expected = provenance.get_provenance_dict(
                {"variants_only": flag})
            self.assertEqual(recorded, expected)
def test_suffix(self):
    """Decompress a file that uses a custom suffix passed via ``-S``.

    The compressed path is re-pointed at a file with an unusual suffix,
    written, and then decompressed through ``self.run_decompress``. The
    compressed file must be gone afterwards and the file at
    ``self.trees_path`` must load with tables equal to ``self.ts``.
    """
    custom_suffix = ".XYGsdf"
    self.compressed_path = self.compressed_path.with_suffix(custom_suffix)
    tszip.compress(self.ts, self.compressed_path)
    self.assertTrue(self.compressed_path.exists())

    cli_args = [str(self.compressed_path), "-S", custom_suffix]
    self.run_decompress(cli_args)
    self.assertFalse(self.compressed_path.exists())

    restored_path = self.trees_path
    self.assertTrue(restored_path.exists())
    restored = tskit.load(str(restored_path))
    self.assertEqual(restored.tables, self.ts.tables)
def run_compress(args):
    """Compress every file named in ``args.files``.

    Each input is loaded with ``tskit.load``; a ``tskit.FileFormatError``
    is reported through ``exit``. The output path is the input name with
    ``args.suffix`` appended. ``check_output`` is called before writing
    and ``remove_input`` after a successful compression.
    """
    setup_logging(args)
    for filename in args.files:
        logger.info("Compressing {}".format(filename))
        try:
            ts = tskit.load(filename)
        except tskit.FileFormatError as err:
            exit("Error loading '{}': {}".format(filename, err))
        logger.debug("Loaded tree sequence")
        # The suffix is appended to the raw string, so an existing
        # extension on the input name is kept.
        destination = pathlib.Path(filename + args.suffix)
        check_output(destination, args)
        tszip.compress(ts, destination, variants_only=args.variants_only)
        remove_input(pathlib.Path(filename), args)
def run_compress(args):
    """Compress every file named in ``args.files``.

    Compression to stdout is rejected up front via ``exit``. Otherwise
    each input is loaded with ``tskit.load``; a missing file or a
    ``tskit.FileFormatError`` is reported through ``exit``. The output
    path is the input name with ``args.suffix`` appended.
    ``check_output`` is called before writing and ``remove_input`` after
    a successful compression.
    """
    if args.stdout:
        # NOTE: the two adjacent literals are joined as written, with no
        # separator after the semicolon.
        unsupported = (
            "Compressing to stdout not currently supported;"
            "Please see https://github.com/tskit-dev/tszip/issues/49")
        exit(unsupported)
    setup_logging(args)
    for filename in args.files:
        logger.info(f"Compressing {filename}")
        try:
            ts = tskit.load(filename)
        except (FileNotFoundError, tskit.FileFormatError) as err:
            exit(f"Error loading '{filename}': {err}")
        logger.debug("Loaded tree sequence")
        # The suffix is appended to the raw string, so an existing
        # extension on the input name is kept.
        destination = pathlib.Path(filename + args.suffix)
        check_output(destination, args)
        tszip.compress(ts, destination, variants_only=args.variants_only)
        remove_input(pathlib.Path(filename), args)
def save_ts(ts, path, tszip=False):
    """Save ts to a file.

    tszip compression can reduce file sizes, but adds time to import and
    export steps.

    Parameters
    ----------
    ts
        Object to save. It must provide ``dump(path)``; when ``tszip`` is
        truthy it is instead handed to ``tszip.compress``.
    path
        Filename to write to.
    tszip
        If this evaluates to True the output is compressed with
        ``tszip.compress(ts, path, variants_only=False)``; otherwise the
        uncompressed ``ts.dump(path)`` is used.

    Raises
    ------
    ImportError
        If compression is requested but the ``tszip`` package cannot be
        imported.
    """
    if tszip:
        # Save compressed ts. The module is imported under an alias so the
        # ``tszip`` flag parameter is not rebound to the module object.
        try:
            import tszip as tszip_module
        except ImportError as err:
            # Raise explicitly rather than ``assert False``: asserts are
            # stripped under ``python -O``, which would let execution fall
            # through to an unrelated AttributeError.
            raise ImportError(
                "tszip compression requires tszip package") from err
        tszip_module.compress(ts, path, variants_only=False)
    else:
        # Save uncompressed ts.
        ts.dump(path)
def convert_file_worker(k):
    """Convert the simulation for ``n = 10**k`` into several file formats.

    Reads ``<data_prefix>/<n>.trees`` and writes, next to it:

    * ``<n>.trees.tsz`` - tszip compression with ``variants_only=True``;
    * ``<n>.pbwt`` and ``<n>.sites`` - produced by the external ``pbwt``
      tool, fed VCF through a pipe - plus gzipped copies of both;
    * for ``k < 7`` only, ``<n>.vcf`` and ``<n>.vcf.gz``.

    ``data_prefix`` is a module-level name defined outside this function.

    Returns ``k`` unchanged.

    Raises ``ValueError`` if the ``.trees`` file does not exist,
    ``RuntimeError`` if ``pbwt`` exits with a non-zero status, and
    ``subprocess.CalledProcessError`` if any ``gzip`` command fails.
    """
    n = 10**k
    filename = os.path.join(data_prefix, "{}.trees".format(n))
    if not os.path.exists(filename):
        raise ValueError("Missing simulation")
    ts = msprime.load(filename)
    tsz_filename = filename + ".tsz"
    tszip.compress(ts, tsz_filename, variants_only=True)
    # Convert to PBWT by piping in VCF. This avoids having to write the
    # ~10TB VCF to disk.
    pbwt_filename = os.path.join(data_prefix, "{}.pbwt".format(n))
    pbwtgz_filename = pbwt_filename + ".gz"
    sites_filename = os.path.join(data_prefix, "{}.sites".format(n))
    sitesgz_filename = sites_filename + ".gz"
    # The command is run through the shell; the path to the pbwt binary is
    # relative to the current working directory.
    cmd = "./tools/pbwt/pbwt -readVcfGT - -write {} -writeSites {}".format(
        pbwt_filename, sites_filename)
    # The child process gets the read end of the pipe as its stdin; this
    # process writes the VCF text into the write end.
    read_fd, write_fd = os.pipe()
    write_pipe = os.fdopen(write_fd, "w")
    proc = subprocess.Popen(cmd, shell=True, stdin=read_fd)
    ts.write_vcf(write_pipe, ploidy=2)
    # Close the write end before waiting so the reader can see end-of-file,
    # then drop this process's copy of the read end.
    write_pipe.close()
    os.close(read_fd)
    proc.wait()
    if proc.returncode != 0:
        # NOTE: the exception is constructed with two arguments (the message
        # and the status), not a single formatted string.
        raise RuntimeError("pbwt failed with status:", proc.returncode)
    # "gzip -c" writes to stdout, so the uncompressed files are kept
    # alongside the .gz copies.
    subprocess.check_call(
        "gzip -c {} > {}".format(pbwt_filename, pbwtgz_filename), shell=True)
    subprocess.check_call(
        "gzip -c {} > {}".format(sites_filename, sitesgz_filename), shell=True)
    # An on-disk VCF (and its gzipped copy) is only written for k < 7.
    if k < 7:
        vcf_filename = os.path.join(data_prefix, "{}.vcf".format(n))
        with open(vcf_filename, "w") as vcf_file:
            ts.write_vcf(vcf_file, 2)
        print("Wrote ", vcf_filename)
        gz_filename = vcf_filename + ".gz"
        subprocess.check_call("gzip -c {} > {}".format(vcf_filename, gz_filename), shell=True)
        print("Wrote ", gz_filename)
    return k
def verify(self, ts):
    """Round-trip ``ts`` with ``variants_only=True`` and compare variants.

    Tree sequences that contain migrations are skipped. After the round
    trip the number of sites, every variant's genotypes, site position
    and alleles, and a subset of the tables must match the input.
    """
    if ts.num_migrations > 0:
        raise unittest.SkipTest("Migrations not supported")

    with tempfile.TemporaryDirectory() as scratch:
        archive = pathlib.Path(scratch) / "treeseq.tsz"
        tszip.compress(ts, archive, variants_only=True)
        restored = tszip.decompress(archive)

    self.assertEqual(ts.num_sites, restored.num_sites)
    for original_var, restored_var in zip(ts.variants(), restored.variants()):
        same_genotypes = np.array_equal(
            original_var.genotypes, restored_var.genotypes)
        self.assertTrue(same_genotypes)
        self.assertEqual(
            original_var.site.position, restored_var.site.position)
        self.assertEqual(original_var.alleles, restored_var.alleles)

    # Populations, individuals and sites should be untouched if there are no
    # unreachable individuals.
    source_tables = ts.tables
    restored_tables = restored.tables
    for attribute in ("sequence_length", "populations", "individuals", "sites"):
        self.assertEqual(
            getattr(source_tables, attribute),
            getattr(restored_tables, attribute))
    # We should be adding an extra provenance record in here due to simplify.
    self.assertEqual(
        len(source_tables.provenances),
        len(restored_tables.provenances) - 1)
def test_save_dir(self):
    """Compressing onto a directory path must raise ``OSError``."""
    tree_sequence = msprime.simulate(10, random_seed=1)
    directory = self.path.parent
    self.assertRaises(OSError, tszip.compress, tree_sequence, directory)
def verify(self, ts):
    """Round-trip ``ts`` through tszip and require identical tables."""
    with tempfile.TemporaryDirectory() as scratch:
        archive = pathlib.Path(scratch) / "treeseq.tsz"
        tszip.compress(ts, archive)
        restored = tszip.decompress(archive)
    self.assertEqual(ts.tables, restored.tables)
def sim_two_pulse(rec_map=None, L=1e9, Ne=10000, Nadmix=500, T1=4, T2=12,
                  frac1=.2, frac2=.2, seed=None, path=None, tszip=None):
    """Simulate a simple two-pulse model of admixture.

    Uses the discrete-time backwards Wright-Fisher model (``model='dtwf'``)
    with three populations of equal size and no ongoing migration.

    rec_map = valid msprime recombination map; if not given, a uniform map
        over L with rate 1e-8 is built
    L = length of genome, in base pairs (ignored if rec_map is specified)
    Ne = diploid population size for all three populations
    Nadmix = number of observed admixed diploid individuals
        (2 * Nadmix samples are drawn from population 2 at time 0)
    T1, T2 = times of the two mass migrations from population 2 into
        population 1; T2 must be greater than T1. At T2 + 1 everything
        remaining in population 2 is moved to population 0, and the
        simulation ends at T2 + 2.
    frac1, frac2 = proportions used for the mass migrations at T1 and T2
    seed = seed passed to msprime.simulate()
    path = file path, if given will write the ts to this path
    tszip = if truthy (and path is given) the file is written with
        tszip.compress, otherwise with ts.dump

    Returns the simulated tree sequence.
    """
    assert T2 > T1, "T2 must be greater than T1"

    # convert to correct dtypes and catch problems
    T1, T2, Ne, Nadmix = int(T1), int(T2), int(Ne), int(Nadmix)

    # recombination map
    if not rec_map:
        L = int(L)
        recomb_map = msprime.RecombinationMap.uniform_map(L, 1e-8, L)
    else:
        recomb_map = rec_map

    num_pops = 3
    pop_configs = [
        msprime.PopulationConfiguration(initial_size=Ne, growth_rate=0)
        for _ in range(num_pops)
    ]

    # no ongoing migration
    mig_mat = [[0] * num_pops for _ in range(num_pops)]

    first_pulse = msprime.MassMigration(
        time=T1, source=2, destination=1, proportion=frac1)
    second_pulse = msprime.MassMigration(
        time=T2, source=2, destination=1, proportion=frac2)
    final_merge = msprime.MassMigration(
        time=T2 + 1, source=2, destination=0, proportion=1.0)
    admixture_events = [first_pulse, second_pulse, final_merge]

    admixed_sample = msprime.Sample(population=2, time=0)
    samps = [admixed_sample] * (2 * Nadmix)

    ts_admix = msprime.simulate(
        population_configurations=pop_configs,
        migration_matrix=mig_mat,
        demographic_events=admixture_events,
        recombination_map=recomb_map,
        mutation_rate=0,
        model='dtwf',
        samples=samps,
        random_seed=seed,
        start_time=0,
        end_time=T2 + 2,
    )

    if not path:
        return ts_admix

    if tszip:
        # save compressed ts
        import tszip as tszip_module
        tszip_module.compress(ts_admix, path, variants_only=False)
    else:
        # save uncompressed ts
        ts_admix.dump(path)
    return ts_admix
T54=70000/generation_time # CEU joins AFR T10=200000/generation_time # Denisovan 1 joins Denisovan 0 T20=200000/generation_time # Denisovan 2 joins Denisovan 0 T03=300000/generation_time # Denisovan 0 joins Denisovan 3 (Altai) T93=400000/generation_time # Neanderthal joins Denisovan 3 (Altai) T34=600000/generation_time # Denisovan 3 (Altai) joins AFR T410=4000000/generation_time # AFR joins Chimp TA1=2500/generation_time TA2=48000/generation_time TA3=68000/generation_time TS_NEA=60000/generation_time TS_DEN3=40000/generation_time NumSamples=80 nS=[10] tS=[0,TS_DEN3,TS_NEA] f=[0.10, 0.04, 0.02] N=[1500,1500,1500,1500,15000,5000,3500,3500,3500,2000,30000] seed=None samples = set_up_pops(nS,tS) demography = set_up_demography(T78, T68, T85, T54, T10, T20, T03, T93, T34, T410, TA1, TA2, TA3, f) pops = [msp.PopulationConfiguration(initial_size = n) for n in N] ts = msp.simulate(samples=samples, Ne=N[0], population_configurations=pops, demographic_events=demography, mutation_rate=mu, length=L, recombination_rate=r, record_migrations=True, random_seed=seed) # output resulting tree sequences to compressed .tsz file tszip.compress(ts, "tree_seq_files/{}_model_{}.tsz".format(model, simrep))
def run_compress(args):
    """Compress the tree sequence in ``args.file`` to ``<file>.zarr``.

    The input is loaded with ``tskit.load`` and written with
    ``tszip.compress``; the output name is the input name with
    ``.zarr`` appended.
    """
    source = args.file
    logger.info("Compressing {}".format(source))
    tree_sequence = tskit.load(source)
    destination = source + ".zarr"
    tszip.compress(tree_sequence, destination)