Esempio n. 1
0
 def test_format_written(self):
     """Check that a compressed file records the format name and version."""
     tree_sequence = msprime.simulate(10, random_seed=1)
     tszip.compress(tree_sequence, self.path)
     with zarr.ZipStore(str(self.path), mode="r") as store:
         attrs = zarr.group(store=store).attrs
         # Compare each stored root attribute against the module constant.
         for key, expected in (
             ("format_name", compression.FORMAT_NAME),
             ("format_version", compression.FORMAT_VERSION),
         ):
             self.assertEqual(attrs[key], expected)
Esempio n. 2
0
 def setUp(self):
     """Create a scratch directory, a simulated tree sequence and its .tsz file.

     Only the compressed file is written here; ``self.trees_path`` is just
     the path an uncompressed file would have.
     """
     self.tmpdir = tempfile.TemporaryDirectory(prefix="tszip_cli_")
     workdir = pathlib.Path(self.tmpdir.name)
     self.trees_path = workdir / "msprime.trees"
     self.ts = msprime.simulate(10, mutation_rate=10, random_seed=1)
     self.compressed_path = workdir / "msprime.trees.tsz"
     tszip.compress(self.ts, self.compressed_path)
Esempio n. 3
0
 def verify(self, ts):
     """Round-trip ``ts`` through tszip and compare genotype matrices."""
     with tempfile.TemporaryDirectory() as scratch:
         archive = pathlib.Path(scratch) / "treeseq.tsz"
         tszip.compress(ts, archive)
         round_tripped = tszip.decompress(archive)
     # The scratch directory is gone by now; both tree sequences are in memory.
     original_genotypes = ts.genotype_matrix()
     restored_genotypes = round_tripped.genotype_matrix()
     matrices_match = np.array_equal(original_genotypes, restored_genotypes)
     self.assertTrue(matrices_match)
Esempio n. 4
0
 def test_provenance(self):
     """Check the stored provenance attribute for both ``variants_only`` modes."""
     tree_sequence = msprime.simulate(10, random_seed=1)
     for variants_only in (True, False):
         tszip.compress(tree_sequence, self.path, variants_only=variants_only)
         with zarr.ZipStore(str(self.path), mode="r") as store:
             stored = zarr.group(store=store).attrs["provenance"]
             expected = provenance.get_provenance_dict(
                 {"variants_only": variants_only})
             self.assertEqual(stored, expected)
Esempio n. 5
0
 def test_suffix(self):
     """Decompress a file with a custom suffix passed through ``-S``.

     After decompression the compressed file must be gone and the file at
     ``self.trees_path`` must hold tables equal to the original ones.
     """
     custom_suffix = ".XYGsdf"
     self.compressed_path = self.compressed_path.with_suffix(custom_suffix)
     tszip.compress(self.ts, self.compressed_path)
     self.assertTrue(self.compressed_path.exists())

     cli_args = [str(self.compressed_path), "-S", custom_suffix]
     self.run_decompress(cli_args)

     self.assertFalse(self.compressed_path.exists())
     self.assertTrue(self.trees_path.exists())
     restored = tskit.load(str(self.trees_path))
     self.assertEqual(restored.tables, self.ts.tables)
Esempio n. 6
0
def run_compress(args):
    """Compress every file named in ``args.files``.

    Each input is loaded with ``tskit.load`` and written with
    ``tszip.compress`` to the input name plus ``args.suffix``.
    ``check_output`` is called on the output path before writing and
    ``remove_input`` on the input path afterwards. A
    ``tskit.FileFormatError`` while loading is reported through ``exit``.
    """
    setup_logging(args)
    for filename in args.files:
        logger.info("Compressing {}".format(filename))
        try:
            tree_sequence = tskit.load(filename)
        except tskit.FileFormatError as load_error:
            exit("Error loading '{}': {}".format(filename, load_error))
        logger.debug("Loaded tree sequence")
        source_path = pathlib.Path(filename)
        # The suffix is appended to the raw argument string, not swapped in.
        destination_path = pathlib.Path(filename + args.suffix)
        check_output(destination_path, args)
        tszip.compress(
            tree_sequence, destination_path, variants_only=args.variants_only)
        remove_input(source_path, args)
Esempio n. 7
0
def run_compress(args):
    """Compress every file named in ``args.files``.

    Compressing to stdout is rejected up front. Otherwise each input is
    loaded with ``tskit.load`` and written with ``tszip.compress`` to the
    input name plus ``args.suffix``. ``check_output`` is called on the
    output path before writing and ``remove_input`` on the input path
    afterwards. A missing or malformed input is reported through ``exit``.
    """
    if args.stdout:
        # The two literals are concatenated, so the separating space must be
        # written explicitly; without it the message read "supported;Please".
        exit(
            "Compressing to stdout not currently supported; "
            "Please see https://github.com/tskit-dev/tszip/issues/49"
        )
    setup_logging(args)
    for file_arg in args.files:
        logger.info(f"Compressing {file_arg}")
        try:
            ts = tskit.load(file_arg)
        except (FileNotFoundError, tskit.FileFormatError) as ffe:
            # NOTE(review): this relies on ``exit`` not returning; confirm
            # which ``exit`` is in scope for this module.
            exit(f"Error loading '{file_arg}': {ffe}")
        logger.debug("Loaded tree sequence")
        infile = pathlib.Path(file_arg)
        # The suffix is appended to the raw argument string, not swapped in.
        outfile = pathlib.Path(file_arg + args.suffix)
        check_output(outfile, args)
        tszip.compress(ts, outfile, variants_only=args.variants_only)
        remove_input(infile, args)
Esempio n. 8
0
def save_ts(ts, path, tszip=False):
    """Save a tree sequence to a file.

    Args:
        ts: Object to save. It must provide ``dump(path)``; with ``tszip``
            set it is handed to ``tszip.compress`` instead.
        path: Destination filename.
        tszip: If truthy, write tszip-compressed output. Compression can
            reduce file sizes but adds time to import and export steps.

    Raises:
        AssertionError: If ``tszip`` is truthy but the tszip package cannot
            be imported.
    """
    if not tszip:
        # Save the uncompressed tree sequence.
        ts.dump(path)
        return

    # Import under an alias so the module does not rebind the ``tszip`` flag.
    try:
        import tszip as tszip_module
    except ImportError as err:
        # Raised explicitly (not ``assert False``) so the check survives -O.
        raise AssertionError(
            "tszip compression requires tszip package") from err
    tszip_module.compress(ts, path, variants_only=False)
def _gzip_copy(source, destination):
    """Write a gzip-compressed copy of ``source`` to ``destination``."""
    # Equivalent to ``gzip -c source > destination`` but without a shell, so
    # paths containing spaces or shell metacharacters are handled safely.
    with open(destination, "wb") as compressed:
        subprocess.check_call(["gzip", "-c", source], stdout=compressed)


def convert_file_worker(k):
    """Convert the simulation with ``10**k`` samples into the other formats.

    Reads ``<data_prefix>/<n>.trees`` (``n = 10**k``) and writes, next to it,
    a variants-only ``.trees.tsz`` file, ``.pbwt`` and ``.sites`` files
    produced by the external ``pbwt`` tool, gzipped copies of both and, for
    ``k < 7``, a ``.vcf`` file and its gzipped copy.

    Args:
        k: Base-10 exponent of the sample size identifying the simulation.

    Returns:
        ``k``, unchanged.

    Raises:
        ValueError: If the ``.trees`` file does not exist.
        RuntimeError: If ``pbwt`` exits with a non-zero status or closes its
            input before the whole VCF was written.
        subprocess.CalledProcessError: If a ``gzip`` invocation fails.
    """
    n = 10**k
    filename = os.path.join(data_prefix, "{}.trees".format(n))
    if not os.path.exists(filename):
        raise ValueError("Missing simulation")
    ts = msprime.load(filename)

    tsz_filename = filename + ".tsz"
    tszip.compress(ts, tsz_filename, variants_only=True)

    # Convert to PBWT by piping in VCF. This avoids having to write the
    # ~10TB VCF to disk.
    pbwt_filename = os.path.join(data_prefix, "{}.pbwt".format(n))
    pbwtgz_filename = pbwt_filename + ".gz"
    sites_filename = os.path.join(data_prefix, "{}.sites".format(n))
    sitesgz_filename = sites_filename + ".gz"

    pbwt_cmd = [
        "./tools/pbwt/pbwt", "-readVcfGT", "-",
        "-write", pbwt_filename,
        "-writeSites", sites_filename,
    ]
    # Let subprocess own the pipe: the parent then holds no read end, so an
    # early pbwt exit surfaces as BrokenPipeError instead of blocking the
    # writer forever once the pipe buffer fills.
    proc = subprocess.Popen(
        pbwt_cmd, stdin=subprocess.PIPE, universal_newlines=True)
    pipe_error = None
    try:
        with proc.stdin as vcf_pipe:
            ts.write_vcf(vcf_pipe, ploidy=2)
    except BrokenPipeError as err:
        pipe_error = err
    finally:
        # Always reap the child, even if writing the VCF failed.
        proc.wait()
    if proc.returncode != 0 or pipe_error is not None:
        raise RuntimeError(
            "pbwt failed with status: {}".format(proc.returncode)
        ) from pipe_error

    _gzip_copy(pbwt_filename, pbwtgz_filename)
    _gzip_copy(sites_filename, sitesgz_filename)

    if k < 7:
        vcf_filename = os.path.join(data_prefix, "{}.vcf".format(n))
        with open(vcf_filename, "w") as vcf_file:
            ts.write_vcf(vcf_file, 2)
        print("Wrote ", vcf_filename)
        gz_filename = vcf_filename + ".gz"
        _gzip_copy(vcf_filename, gz_filename)
        print("Wrote ", gz_filename)
    return k
Esempio n. 10
0
 def verify(self, ts):
     """Round-trip ``ts`` with ``variants_only=True`` and compare the results.

     Tree sequences with migrations are skipped. Variants are compared one
     by one, then the sequence length and the population, individual and
     site tables.
     """
     if ts.num_migrations > 0:
         raise unittest.SkipTest("Migrations not supported")
     with tempfile.TemporaryDirectory() as scratch:
         archive = pathlib.Path(scratch) / "treeseq.tsz"
         tszip.compress(ts, archive, variants_only=True)
         restored = tszip.decompress(archive)

     self.assertEqual(ts.num_sites, restored.num_sites)
     for expected, actual in zip(ts.variants(), restored.variants()):
         same_genotypes = np.array_equal(expected.genotypes, actual.genotypes)
         self.assertTrue(same_genotypes)
         self.assertEqual(expected.site.position, actual.site.position)
         self.assertEqual(expected.alleles, actual.alleles)

     # Populations, individuals and sites should be untouched if there are no
     # unreachable individuals.
     original_tables = ts.tables
     restored_tables = restored.tables
     for name in ("sequence_length", "populations", "individuals", "sites"):
         self.assertEqual(
             getattr(original_tables, name), getattr(restored_tables, name))
     # We should be adding an extra provenance record in here due to simplify.
     self.assertEqual(
         len(original_tables.provenances),
         len(restored_tables.provenances) - 1)
Esempio n. 11
0
 def test_save_dir(self):
     """Compressing onto a directory path must raise ``OSError``."""
     directory = self.path.parent
     tree_sequence = msprime.simulate(10, random_seed=1)
     with self.assertRaises(OSError):
         tszip.compress(tree_sequence, directory)
Esempio n. 12
0
 def verify(self, ts):
     """Round-trip ``ts`` through tszip and require identical tables."""
     with tempfile.TemporaryDirectory() as scratch:
         archive = pathlib.Path(scratch) / "treeseq.tsz"
         tszip.compress(ts, archive)
         round_tripped = tszip.decompress(archive)
     expected_tables = ts.tables
     self.assertEqual(expected_tables, round_tripped.tables)
Esempio n. 13
0
def sim_two_pulse(rec_map=None, L=1e9, Ne=10000, Nadmix=500,
                T1=4, T2=12, frac1=.2, frac2=.2,
                seed=None, path=None, tszip=None):
    """Simulate a two-pulse model of admixture.

    Uses the discrete-time backwards Wright-Fisher model (``model='dtwf'``)
    with three populations and no ongoing migration. All samples are taken
    from population 2 at time 0. Going backwards in time, a fraction
    ``frac1`` of population 2 moves to population 1 at time ``T1``, a
    fraction ``frac2`` at time ``T2``, and everything left in population 2
    moves to population 0 at time ``T2 + 1``. The simulation ends at
    ``T2 + 2``.

    rec_map = valid msprime recombination map
    L = length of genome, in base pairs (ignored if rec_map is specified)
    Ne = diploid population size for all three populations
    Nadmix = number of observed admixed diploid individuals
    T1 = time of the first (more recent) pulse; truncated to an int
    T2 = time of the second pulse; truncated to an int, must exceed T1
    frac1 = proportion moved by the pulse at T1
    frac2 = proportion moved by the pulse at T2
    seed = seed passed to msprime.simulate()
    path = file path, if given will write the ts to this path
    tszip = if truthy (and path is given), write tszip-compressed output

    Returns the simulated tree sequence.

    Raises AssertionError if T2 is not greater than T1 after truncation.
    """

    # Convert to the expected dtypes first so that validation sees the
    # values that are actually simulated (4.2 and 4.9 both truncate to 4).
    T1 = int(T1)
    T2 = int(T2)
    Ne = int(Ne)
    Nadmix = int(Nadmix)

    # Raised explicitly rather than via ``assert`` so the check survives -O.
    if not T2 > T1:
        raise AssertionError("T2 must be greater than T1")

    # recombination map
    if rec_map:
        recomb_map = rec_map
    else:
        L = int(L)
        recomb_map = msprime.RecombinationMap.uniform_map(L, 1e-8, L)

    # Three constant-size populations.
    pop_configs = [
        msprime.PopulationConfiguration(initial_size=Ne, growth_rate=0)
        for _ in range(3)
    ]

    # no ongoing migration
    mig_mat = [[0, 0, 0] for _ in range(3)]

    admixture_events = [
        msprime.MassMigration(time=T1, source=2, destination=1, proportion=frac1),
        msprime.MassMigration(time=T2, source=2, destination=1, proportion=frac2),
        msprime.MassMigration(time=T2 + 1, source=2, destination=0, proportion=1.0),
    ]

    # Two sampled genomes per admixed diploid individual.
    samps = [msprime.Sample(population=2, time=0)] * 2 * Nadmix

    ts_admix = msprime.simulate(
        population_configurations=pop_configs,
        migration_matrix=mig_mat,
        demographic_events=admixture_events,
        recombination_map=recomb_map,
        mutation_rate=0,
        model='dtwf',
        samples=samps,
        random_seed=seed,
        start_time=0,
        end_time=T2 + 2
    )

    if path:
        if tszip:
            # Save compressed ts; the alias avoids rebinding the ``tszip``
            # flag to the module.
            import tszip as tszip_module
            tszip_module.compress(ts_admix, path, variants_only=False)
        else:
            # save uncompressed ts
            ts_admix.dump(path)

    return ts_admix
Esempio n. 14
0
# Script fragment: set split/join times, sample configuration and population
# sizes, run one msprime simulation and write it as a compressed .tsz file.
# NOTE(review): generation_time, T78, T68, T85, mu, L, r, model, simrep,
# set_up_pops and set_up_demography are not defined in the visible lines;
# they are expected to come from earlier in the file.

# Event times below are divided by generation_time.
T54=70000/generation_time  # CEU joins AFR
T10=200000/generation_time # Denisovan 1 joins Denisovan 0
T20=200000/generation_time # Denisovan 2 joins Denisovan 0 
T03=300000/generation_time # Denisovan 0 joins Denisovan 3 (Altai) 
T93=400000/generation_time # Neanderthal joins Denisovan 3 (Altai) 
T34=600000/generation_time # Denisovan 3 (Altai) joins AFR
T410=4000000/generation_time # AFR joins Chimp
 
# TA1..TA3 are passed to set_up_demography together with f.
# NOTE(review): presumably admixture pulse times - confirm against that helper.
TA1=2500/generation_time   
TA2=48000/generation_time   
TA3=68000/generation_time   

# TS_DEN3 and TS_NEA are used as entries of tS below.
TS_NEA=60000/generation_time
TS_DEN3=40000/generation_time

# NumSamples is not referenced in the visible lines.
NumSamples=80
# nS and tS are handed to set_up_pops to build the sample list.
nS=[10]
tS=[0,TS_DEN3,TS_NEA]
# f is handed to set_up_demography as its last argument.
f=[0.10, 0.04, 0.02]
# One initial population size per population; N[0] is also passed as Ne.
N=[1500,1500,1500,1500,15000,5000,3500,3500,3500,2000,30000]
# seed=None is passed to msp.simulate as random_seed.
seed=None

samples = set_up_pops(nS,tS)
demography = set_up_demography(T78, T68, T85, T54, T10, T20, T03, T93, T34, T410, TA1, TA2, TA3, f)
pops = [msp.PopulationConfiguration(initial_size = n) for n in N]

# Migrations are recorded in the resulting tree sequence.
ts = msp.simulate(samples=samples, Ne=N[0], population_configurations=pops, demographic_events=demography, mutation_rate=mu, length=L, recombination_rate=r, record_migrations=True, random_seed=seed)

# output resulting tree sequences to compressed .tsz file
tszip.compress(ts, "tree_seq_files/{}_model_{}.tsz".format(model, simrep))
Esempio n. 15
0
def run_compress(args):
    """Compress the tree sequence in ``args.file`` to ``<args.file>.zarr``."""
    source = args.file
    logger.info("Compressing {}".format(source))
    tree_sequence = tskit.load(source)
    destination = source + ".zarr"
    tszip.compress(tree_sequence, destination)