コード例 #1
0
ファイル: test_compression.py プロジェクト: tskit-dev/tszip
 def test_format_written(self):
     """Check that a compressed file stores the format name and version.

     The two values are read from the attributes of the root zarr group
     and compared against the constants in ``compression``.
     """
     tree_seq = msprime.simulate(10, random_seed=1)
     tszip.compress(tree_seq, self.path)
     expected_attrs = (
         ("format_name", compression.FORMAT_NAME),
         ("format_version", compression.FORMAT_VERSION),
     )
     with zarr.ZipStore(str(self.path), mode="r") as archive:
         attrs = zarr.group(store=archive).attrs
         for key, expected in expected_attrs:
             self.assertEqual(attrs[key], expected)
コード例 #2
0
 def setUp(self):
     """Prepare a temporary directory containing a compressed simulation.

     Sets ``tmpdir``, ``trees_path``, ``ts`` and ``compressed_path`` on the
     test case; only the compressed file is actually written here.
     """
     self.tmpdir = tempfile.TemporaryDirectory(prefix="tszip_cli_")
     workdir = pathlib.Path(self.tmpdir.name)
     self.trees_path = workdir / "msprime.trees"
     self.ts = msprime.simulate(10, mutation_rate=10, random_seed=1)
     self.compressed_path = workdir / "msprime.trees.tsz"
     tszip.compress(self.ts, self.compressed_path)
コード例 #3
0
 def verify(self, ts):
     """Round-trip ``ts`` through tszip and compare genotype matrices."""
     with tempfile.TemporaryDirectory() as scratch:
         archive = pathlib.Path(scratch, "treeseq.tsz")
         tszip.compress(ts, archive)
         round_tripped = tszip.decompress(archive)
     # The decompressed tree sequence is used after the directory is removed.
     original_genotypes = ts.genotype_matrix()
     restored_genotypes = round_tripped.genotype_matrix()
     matches = np.array_equal(original_genotypes, restored_genotypes)
     self.assertTrue(matches)
コード例 #4
0
 def test_provenance(self):
     """Check the stored provenance for both ``variants_only`` settings."""
     ts = msprime.simulate(10, random_seed=1)
     for flag in (True, False):
         tszip.compress(ts, self.path, variants_only=flag)
         with zarr.ZipStore(str(self.path), mode='r') as archive:
             recorded = zarr.group(store=archive).attrs["provenance"]
             expected = provenance.get_provenance_dict({"variants_only": flag})
             self.assertEqual(recorded, expected)
コード例 #5
0
 def test_suffix(self):
     """Decompress a file with a custom suffix passed through ``-S``.

     The compressed input must be gone afterwards, and the tree sequence
     loaded from ``trees_path`` must have the same tables as the original.
     """
     custom_suffix = ".XYGsdf"
     target = self.compressed_path.with_suffix(custom_suffix)
     self.compressed_path = target
     tszip.compress(self.ts, target)
     self.assertTrue(target.exists())

     self.run_decompress([str(target), "-S", custom_suffix])
     self.assertFalse(target.exists())

     restored_path = self.trees_path
     self.assertTrue(restored_path.exists())
     restored = tskit.load(str(restored_path))
     self.assertEqual(restored.tables, self.ts.tables)
コード例 #6
0
def run_compress(args):
    """Compress every file listed in ``args.files``.

    Each input is loaded with ``tskit.load`` and written next to the input
    as ``<input><args.suffix>``. ``check_output`` and ``remove_input`` are
    called with the output and input paths respectively.

    Args:
        args: Parsed command-line arguments; this function reads ``files``,
            ``suffix`` and ``variants_only`` and passes the whole object to
            ``setup_logging``, ``check_output`` and ``remove_input``.
    """
    setup_logging(args)
    for file_arg in args.files:
        logger.info("Compressing {}".format(file_arg))
        try:
            ts = tskit.load(file_arg)
        except (FileNotFoundError, tskit.FileFormatError) as ffe:
            # A missing input is reported the same way as an unreadable one
            # instead of surfacing as an uncaught traceback.
            exit("Error loading '{}': {}".format(file_arg, ffe))
        logger.debug("Loaded tree sequence")
        infile = pathlib.Path(file_arg)
        outfile = pathlib.Path(file_arg + args.suffix)
        check_output(outfile, args)
        tszip.compress(ts, outfile, variants_only=args.variants_only)
        remove_input(infile, args)
コード例 #7
0
def run_compress(args):
    """Compress every file listed in ``args.files``.

    Each input is loaded with ``tskit.load`` and written next to the input
    as ``<input><args.suffix>``. ``check_output`` and ``remove_input`` are
    called with the output and input paths respectively.

    Args:
        args: Parsed command-line arguments; this function reads ``stdout``,
            ``files``, ``suffix`` and ``variants_only`` and passes the whole
            object to ``setup_logging``, ``check_output`` and
            ``remove_input``.
    """
    if args.stdout:
        # The two literals are concatenated, so the separating space must be
        # part of the first one.
        exit("Compressing to stdout not currently supported; "
             "Please see https://github.com/tskit-dev/tszip/issues/49")
    setup_logging(args)
    for file_arg in args.files:
        logger.info(f"Compressing {file_arg}")
        try:
            ts = tskit.load(file_arg)
        except (FileNotFoundError, tskit.FileFormatError) as ffe:
            exit(f"Error loading '{file_arg}': {ffe}")
        logger.debug("Loaded tree sequence")
        infile = pathlib.Path(file_arg)
        outfile = pathlib.Path(file_arg + args.suffix)
        check_output(outfile, args)
        tszip.compress(ts, outfile, variants_only=args.variants_only)
        remove_input(infile, args)
コード例 #8
0
ファイル: utils.py プロジェクト: rwaples/ABCadmix_dev
def save_ts(ts, path, tszip=False):
    """Save a tree sequence to a file.

    Args:
        ts: Object to save. It is written with ``ts.dump(path)`` when no
            compression is requested, otherwise it is handed to
            ``tszip.compress``.
        path: Output filename.
        tszip: If truthy, the output is compressed with the ``tszip``
            package. Compression can reduce file sizes, but adds time to
            the import and export steps.

    Raises:
        ImportError: If compression is requested but the ``tszip`` package
            cannot be imported.
    """
    if not tszip:
        # save uncompressed ts
        ts.dump(path)
        return

    # save compressed ts
    try:
        # Imported under another name so the ``tszip`` flag is not rebound.
        import tszip as tszip_module
    except ImportError as err:
        # A real exception rather than ``assert``: asserts are stripped under
        # ``python -O``, which would hide the missing dependency.
        raise ImportError("tszip compression requires tszip package") from err
    tszip_module.compress(ts, path, variants_only=False)
コード例 #9
0
def convert_file_worker(k):
    """Convert the simulation with ``10**k`` samples into other file formats.

    With ``n = 10**k``, loads ``<data_prefix>/<n>.trees`` and writes:

    * ``<n>.trees.tsz`` via ``tszip.compress(..., variants_only=True)``;
    * ``<n>.pbwt`` and ``<n>.sites`` by piping VCF into the ``pbwt`` tool,
      plus gzipped copies ``<n>.pbwt.gz`` and ``<n>.sites.gz``;
    * for ``k < 7`` only, ``<n>.vcf`` and ``<n>.vcf.gz``.

    Args:
        k: Base-10 exponent of the sample count that names the input file.

    Returns:
        ``k``, unchanged.

    Raises:
        ValueError: If the ``.trees`` file does not exist.
        RuntimeError: If ``pbwt`` exits with a non-zero status or stops
            reading its input before the VCF has been fully written.
        subprocess.CalledProcessError: If one of the ``gzip`` commands fails.
    """
    n = 10**k
    filename = os.path.join(data_prefix, "{}.trees".format(n))
    if not os.path.exists(filename):
        raise ValueError("Missing simulation")
    ts = msprime.load(filename)

    tsz_filename = filename + ".tsz"
    tszip.compress(ts, tsz_filename, variants_only=True)

    # Convert to PBWT by piping in VCF. This avoids having to write the
    # ~10TB VCF to disk.
    pbwt_filename = os.path.join(data_prefix, "{}.pbwt".format(n))
    pbwtgz_filename = pbwt_filename + ".gz"
    sites_filename = os.path.join(data_prefix, "{}.sites".format(n))
    sitesgz_filename = sites_filename + ".gz"

    cmd = "./tools/pbwt/pbwt -readVcfGT - -write {} -writeSites {}".format(
        pbwt_filename, sites_filename)
    read_fd, write_fd = os.pipe()
    try:
        proc = subprocess.Popen(cmd, shell=True, stdin=read_fd)
    except BaseException:
        os.close(write_fd)
        raise
    finally:
        # The child has its own copy of the read end. Closing ours right away
        # means a pbwt crash surfaces as BrokenPipeError on write instead of
        # blocking forever once the pipe buffer fills up.
        os.close(read_fd)
    try:
        with os.fdopen(write_fd, "w") as write_pipe:
            ts.write_vcf(write_pipe, ploidy=2)
    except BrokenPipeError as err:
        proc.wait()
        raise RuntimeError(
            "pbwt closed its input early; status:", proc.returncode) from err
    finally:
        # Always reap the child, even when writing the VCF failed.
        proc.wait()
    if proc.returncode != 0:
        raise RuntimeError("pbwt failed with status:", proc.returncode)

    subprocess.check_call(
        "gzip -c {} > {}".format(pbwt_filename, pbwtgz_filename), shell=True)

    subprocess.check_call(
        "gzip -c {} > {}".format(sites_filename, sitesgz_filename), shell=True)

    if k < 7:
        # Only the smaller simulations also get a VCF written to disk.
        vcf_filename = os.path.join(data_prefix, "{}.vcf".format(n))
        with open(vcf_filename, "w") as vcf_file:
            ts.write_vcf(vcf_file, 2)
        print("Wrote ", vcf_filename)
        gz_filename = vcf_filename + ".gz"
        subprocess.check_call("gzip -c {} > {}".format(vcf_filename, gz_filename), shell=True)
        print("Wrote ", gz_filename)
    return k
コード例 #10
0
 def verify(self, ts):
     """Round-trip ``ts`` with ``variants_only=True`` and compare the result.

     Variants (genotypes, site position, alleles) must match one-to-one and
     selected tables must be equal. Tree sequences with migrations are
     skipped.
     """
     if ts.num_migrations > 0:
         raise unittest.SkipTest("Migrations not supported")
     with tempfile.TemporaryDirectory() as scratch:
         archive = pathlib.Path(scratch, "treeseq.tsz")
         tszip.compress(ts, archive, variants_only=True)
         restored = tszip.decompress(archive)

     self.assertEqual(ts.num_sites, restored.num_sites)
     for expected, actual in zip(ts.variants(), restored.variants()):
         self.assertTrue(np.array_equal(expected.genotypes, actual.genotypes))
         self.assertEqual(expected.site.position, actual.site.position)
         self.assertEqual(expected.alleles, actual.alleles)

     # Populations, individuals and sites should be untouched if there are no
     # unreachable individuals.
     before = ts.tables
     after = restored.tables
     for name in ("sequence_length", "populations", "individuals", "sites"):
         self.assertEqual(getattr(before, name), getattr(after, name))
     # We should be adding an extra provenance record in here due to simplify.
     self.assertEqual(len(before.provenances), len(after.provenances) - 1)
コード例 #11
0
 def test_save_dir(self):
     """Compressing to the parent of ``self.path`` must raise ``OSError``."""
     tree_seq = msprime.simulate(10, random_seed=1)
     target = self.path.parent
     with self.assertRaises(OSError):
         tszip.compress(tree_seq, target)
コード例 #12
0
 def verify(self, ts):
     """Round-trip ``ts`` through tszip and require identical tables."""
     with tempfile.TemporaryDirectory() as scratch:
         archive = pathlib.Path(scratch, "treeseq.tsz")
         tszip.compress(ts, archive)
         restored = tszip.decompress(archive)
     self.assertEqual(ts.tables, restored.tables)
コード例 #13
0
ファイル: simulate.py プロジェクト: rwaples/ABCadmix_dev
def sim_two_pulse(rec_map=None, L=1e9, Ne=10000, Nadmix=500,
                T1=4, T2=12, frac1=.2, frac2=.2,
                seed=None, path=None, tszip=None):
    """Simulate a two-pulse model of admixture.

    Uses the discrete-time backwards Wright-Fisher model (``model='dtwf'``)
    with three populations and no ongoing migration. All samples are taken
    from population 2 at time 0. The demographic events are mass migrations
    from population 2 to population 1 at times ``T1`` and ``T2``, followed by
    a mass migration of proportion 1.0 from population 2 to population 0 at
    ``T2 + 1``. The simulation ends at ``T2 + 2``.

    Args:
        rec_map: Valid msprime recombination map. If falsy, a uniform map is
            built from ``L``.
        L: Length of genome, in base pairs (ignored if ``rec_map`` is
            specified).
        Ne: Diploid population size for all three populations.
        Nadmix: Number of observed admixed diploid individuals;
            ``2 * Nadmix`` samples are simulated.
        T1: Time of the first mass migration (converted with ``int``).
        T2: Time of the second mass migration (converted with ``int``);
            must be greater than ``T1``.
        frac1: Proportion used for the mass migration at ``T1``.
        frac2: Proportion used for the mass migration at ``T2``.
        seed: Seed passed to ``msprime.simulate()``.
        path: File path; if given, the tree sequence is written there.
        tszip: If truthy (and ``path`` is given), the file is written with
            ``tszip.compress``; otherwise ``dump`` is used.

    Returns:
        The simulated tree sequence.

    Raises:
        ValueError: If ``T2`` is not greater than ``T1``.
    """
    # Raise explicitly: an ``assert`` would be stripped under ``python -O``.
    if not T2 > T1:
        raise ValueError("T2 must be greater than T1")

    # convert to correct dtypes and catch problems
    T1 = int(T1)
    T2 = int(T2)
    Ne = int(Ne)
    Nadmix = int(Nadmix)

    # recombination map
    if rec_map:
        recomb_map = rec_map
    else:
        L = int(L)
        recomb_map = msprime.RecombinationMap.uniform_map(L, 1e-8, L)

    pop_configs = [
        msprime.PopulationConfiguration(initial_size=Ne, growth_rate=0),
        msprime.PopulationConfiguration(initial_size=Ne, growth_rate=0),
        msprime.PopulationConfiguration(initial_size=Ne, growth_rate=0),
    ]

    # no ongoing migration
    mig_mat = [
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
    ]

    admixture_events = [
        msprime.MassMigration(time=T1, source=2, destination=1, proportion=frac1),
        msprime.MassMigration(time=T2, source=2, destination=1, proportion=frac2),
        msprime.MassMigration(time=T2 + 1, source=2, destination=0, proportion=1.0),
    ]

    # two samples per observed diploid individual
    samps = [msprime.Sample(population=2, time=0)] * 2 * Nadmix

    ts_admix = msprime.simulate(
        population_configurations=pop_configs,
        migration_matrix=mig_mat,
        demographic_events=admixture_events,
        recombination_map=recomb_map,
        mutation_rate=0,
        model='dtwf',
        samples=samps,
        random_seed=seed,
        start_time=0,
        end_time=T2 + 2,
    )

    if path:
        if tszip:
            # save compressed ts; imported under another name so the
            # ``tszip`` flag is not rebound to the module
            import tszip as tszip_module
            tszip_module.compress(ts_admix, path, variants_only=False)
        else:
            # save uncompressed ts
            ts_admix.dump(path)

    return ts_admix
コード例 #14
0
# Event times, each divided by generation_time
# (presumably years converted to generations -- TODO confirm units).
# NOTE(review): generation_time, T78, T68, T85, mu, L, r, model and simrep are
# used below but are not defined in this excerpt.
T54=70000/generation_time  # CEU joins AFR
T10=200000/generation_time # Denisovan 1 joins Denisovan 0
T20=200000/generation_time # Denisovan 2 joins Denisovan 0 
T03=300000/generation_time # Denisovan 0 joins Denisovan 3 (Altai) 
T93=400000/generation_time # Neanderthal joins Denisovan 3 (Altai) 
T34=600000/generation_time # Denisovan 3 (Altai) joins AFR
T410=4000000/generation_time # AFR joins Chimp
 
# NOTE(review): TA1-TA3 are passed to set_up_demography; presumably admixture
# times -- confirm against that function.
TA1=2500/generation_time   
TA2=48000/generation_time   
TA3=68000/generation_time   

# Used as entries of tS below; presumably sampling times of the archaic
# samples -- confirm against set_up_pops.
TS_NEA=60000/generation_time
TS_DEN3=40000/generation_time

# NumSamples is assigned here but not used in the visible lines.
NumSamples=80
nS=[10]
tS=[0,TS_DEN3,TS_NEA]
# Passed as the last argument of set_up_demography; presumably admixture
# fractions -- confirm.
f=[0.10, 0.04, 0.02]
# One initial population size per PopulationConfiguration; N[0] is also
# passed to msp.simulate as Ne.
N=[1500,1500,1500,1500,15000,5000,3500,3500,3500,2000,30000]
seed=None

samples = set_up_pops(nS,tS)
demography = set_up_demography(T78, T68, T85, T54, T10, T20, T03, T93, T34, T410, TA1, TA2, TA3, f)
pops = [msp.PopulationConfiguration(initial_size = n) for n in N]

# Run the simulation with migration recording switched on.
ts = msp.simulate(samples=samples, Ne=N[0], population_configurations=pops, demographic_events=demography, mutation_rate=mu, length=L, recombination_rate=r, record_migrations=True, random_seed=seed)

# output resulting tree sequences to compressed .tsz file
tszip.compress(ts, "tree_seq_files/{}_model_{}.tsz".format(model, simrep))
コード例 #15
0
ファイル: cli.py プロジェクト: OrkoHunter/tszip
def run_compress(args):
    """Compress ``args.file`` into ``<args.file>.zarr``.

    The input is loaded with ``tskit.load`` and written with
    ``tszip.compress``; a log message is emitted before loading.
    """
    source = args.file
    logger.info("Compressing {}".format(source))
    tree_sequence = tskit.load(source)
    destination = source + ".zarr"
    tszip.compress(tree_sequence, destination)