Example #1
0
def run_dump_provenances(args):
    """
    Print the provenance records of the tree sequence in ``args.tree_sequence``.

    When ``args.human`` is truthy, each record is parsed as JSON and printed
    with 4-space indentation next to its id and timestamp. Otherwise the
    provenances are written to stdout through ``dump_text``.
    """
    ts = tskit.load(args.tree_sequence)
    if not args.human:
        ts.dump_text(provenances=sys.stdout)
        return
    line_template = "id={}, timestamp={}, record={}"
    for prov in ts.provenances():
        parsed_record = json.loads(prov.record)
        pretty_record = json.dumps(parsed_record, indent=4)
        print(line_template.format(prov.id, prov.timestamp, pretty_record))
Example #2
0
 def test_fileobj_multi(self, replicate_ts_fixture, fileobj):
     # Dump every replicate back-to-back into the same file object and
     # remember the stream position reached after each one.
     offsets_after_dump = []
     for written in replicate_ts_fixture:
         written.dump(fileobj)
         offsets_after_dump.append(fileobj.tell())
     fileobj.close()
     # Reading them back one at a time must give equal tables and leave the
     # stream at the same positions.
     with open(fileobj.name, "rb") as stream:
         pairs = zip(replicate_ts_fixture, offsets_after_dump)
         for expected_ts, expected_offset in pairs:
             reloaded = tskit.load(stream)
             reached_offset = stream.tell()
             assert expected_ts.tables == reloaded.tables
             assert expected_offset == reached_offset
Example #3
0
def run_mutate(args):
    """
    Add mutations to the tree sequence in ``args.tree_sequence`` and save it.

    The input is loaded with ``tskit.load``, passed to ``msprime.mutate``
    together with the rate, seed, keep flag, time bounds and discrete flag
    taken from ``args``, and the result is dumped to
    ``args.output_tree_sequence``.
    """
    source_ts = tskit.load(args.tree_sequence)
    mutate_options = {
        "rate": args.mutation_rate,
        "random_seed": args.random_seed,
        "keep": args.keep,
        "start_time": args.start_time,
        "end_time": args.end_time,
        "discrete": args.discrete,
    }
    mutated_ts = msprime.mutate(tree_sequence=source_ts, **mutate_options)
    mutated_ts.dump(args.output_tree_sequence)
Example #4
0
def run_dump_provenances(args):
    """
    Write the provenances of the tree sequence in ``args.tree_sequence``.

    With ``args.human`` set, one line per provenance is printed containing its
    id, timestamp and the JSON record re-serialised with an indent of 4;
    without it, ``dump_text`` writes the provenances to stdout.
    """
    tree_sequence = tskit.load(args.tree_sequence)
    if args.human:
        for entry in tree_sequence.provenances():
            record_json = json.dumps(json.loads(entry.record), indent=4)
            fields = (entry.id, entry.timestamp, record_json)
            print("id={}, timestamp={}, record={}".format(*fields))
    else:
        tree_sequence.dump_text(provenances=sys.stdout)
Example #5
0
def run_date(args):
    """
    Date the tree sequence in ``args.tree_sequence`` and write it to ``args.output``.

    The input is loaded with ``tskit.load``; a ``tskit.FileFormatError`` ends
    the program through ``exit`` with a message naming the file. The other
    attributes of ``args`` (``Ne``, ``mutation_rate``, ``recombination_rate``,
    ``probability_space``, ``method``, ``epsilon``, ``num_threads``,
    ``ignore_oldest`` and ``progress``) are forwarded to ``tsdate.date``.
    """
    try:
        ts = tskit.load(args.tree_sequence)
    except tskit.FileFormatError as ffe:
        # The file name is enclosed in a matching pair of single quotes.
        exit("Error loading '{}': {}".format(args.tree_sequence, ffe))
    dated_ts = tsdate.date(
        ts,
        args.Ne,
        mutation_rate=args.mutation_rate,
        recombination_rate=args.recombination_rate,
        probability_space=args.probability_space,
        method=args.method,
        eps=args.epsilon,
        num_threads=args.num_threads,
        ignore_oldest_root=args.ignore_oldest,
        progress=args.progress,
    )
    dated_ts.dump(args.output)
Example #6
0
    def ts_to_stairway(self, ts_path, num_bootstraps=1, mask_file=None):
        """
        Converts the specified tskit tree sequence to text files used by
        stairway plot.

        :param ts_path: Iterable of tree sequence file paths; each one is
            loaded with ``tskit.load`` and an SFS plot is written next to it
            with the suffix ``.sfs.pdf``.
        :param int num_bootstraps: Number of bootstrap resamples of the
            allele counts drawn per tree sequence.
        :param mask_file: Optional path to a tab-separated file without a
            header. Rows whose column 0 equals the chromosome name (the file
            name up to its first ``.``) supply interval bounds in columns 1
            and 2; sites inside those intervals are dropped.
        :return: List of the SFS file paths written under ``self.workdir``,
            the first for the observed counts followed by one per bootstrap.
        """
        derived_counts_all = [[] for _ in range(num_bootstraps + 1)]
        total_length = 0
        num_samples = 0
        for ts_p in ts_path:
            ts = tskit.load(ts_p)
            total_length += ts.sequence_length
            num_samples = ts.num_samples
            haps = ts.genotype_matrix()

            SFSs = []
            # Masking. The mask selects rows of the genotype matrix, which has
            # one row per site, so it is sized by the number of sites rather
            # than the number of mutations.
            retain = np.full(ts.num_sites, False)
            if mask_file:
                mask_table = pd.read_csv(mask_file, sep="\t", header=None)
                chrom = ts_p.split("/")[-1].split(".")[0]
                sub = mask_table[mask_table[0] == chrom]
                mask_ints = pd.IntervalIndex.from_arrays(sub[1], sub[2])
                snp_locs = [int(x.site.position) for x in ts.variants()]
                # np.any collapses the result to one flag per site whether
                # ``contains`` answers with a scalar or with one flag per
                # interval.
                tmp_bool = [np.any(mask_ints.contains(x)) for x in snp_locs]
                retain = np.logical_or(retain, tmp_bool)
                total_length -= np.sum(mask_ints.length)

            # Up to here ``retain`` marks masked sites; invert it so that it
            # marks the sites to keep.
            retain = np.logical_not(retain)
            # append unmasked SFS
            SFSs.append(allel.sfs(allel.HaplotypeArray(haps).count_alleles()[:, 1])[1:])
            # get masked allele counts and append SFS
            allele_counts = allel.HaplotypeArray(haps[retain, :]).count_alleles()
            SFSs.append(allel.sfs(allele_counts[:, 1])[1:])
            sfs_path = ts_p+".sfs.pdf"
            plots.plot_sfs(SFSs, sfs_path)
            # Bootstrap allele counts
            derived_counts_all[0].extend(allele_counts[:, 1])
            for j in range(1, num_bootstraps + 1):
                nsites = np.shape(allele_counts)[0]
                bootset = np.random.choice(np.arange(0, nsites, 1), nsites, replace=True)
                bootac = allele_counts[bootset, :]
                der_bootac = bootac[:, 1]
                derived_counts_all[j].extend(der_bootac)
        # Get the SFS minus the 0 bin and write output
        stairway_files = []
        for index, derived_counts in enumerate(derived_counts_all):
            sfs = allel.sfs(derived_counts)[1:]
            filename = self.workdir / "sfs_{}.txt".format(index)
            write_stairway_sfs(total_length, num_samples, sfs, filename)
            stairway_files.append(filename)

        return stairway_files
def main():
    """
    Command-line entry point: build a tree sequence from a tsinfer samples file.

    Parses the command line, loads the samples file and raises ``ValueError``
    if the file is missing or yields no genotypes for inference sites. With
    ``--inject-real-ancestors-from-ts`` the ancestors are built from the given
    tree sequence and matched explicitly; otherwise ``tsinfer.infer`` is run.
    The resulting tree sequence is dumped to the ``output`` path.
    """
    banner = """Simple CLI wrapper for tsinfer
        tskit version: {}
        tsinfer version: {}""".format(tskit.__version__, tsinfer.__version__)
    cli_parser = argparse.ArgumentParser(
        description=banner,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli_parser.add_argument("--verbosity", "-v", action="count", default=0)
    cli_parser.add_argument(
        "samples",
        help="The samples file name, as saved by tsinfer.SampleData.initialise()",
    )
    cli_parser.add_argument("output", help="The path to write the output file to")
    cli_parser.add_argument(
        "-l", "--length", default=None, type=int, help="The total sequence length"
    )
    cli_parser.add_argument(
        "-t",
        "--threads",
        default=1,
        type=int,
        help="The number of worker threads to use",
    )
    cli_parser.add_argument(
        "-m",
        "--method",
        default="C",
        choices=["C", "P"],
        help="Which implementation to use, [C] (faster) or [P]ython (more debuggable)",
    )
    cli_parser.add_argument(
        "--inject-real-ancestors-from-ts",
        default=None,
        help="Instead of inferring ancestors, construct known ones from this tree sequence file path",
    )
    cli_parser.add_argument("-V", "--version", action="version", version=banner)

    options = cli_parser.parse_args()

    if options.method == "P":
        engine = tsinfer.PY_ENGINE
    else:
        engine = tsinfer.C_ENGINE

    if not os.path.isfile(options.samples):
        raise ValueError("No samples file")
    sample_data = tsinfer.load(options.samples)
    # Pull at most one item from the iterator to find out whether it is empty.
    has_inference_sites = any(
        True for _ in sample_data.genotypes(inference_sites=True)
    )
    if not has_inference_sites:
        raise ValueError("No inference sites")

    known_ts_path = options.inject_real_ancestors_from_ts
    if known_ts_path is None:
        ts = tsinfer.infer(sample_data, num_threads=options.threads, engine=engine)
    else:
        ancestor_data = tsinfer.AncestorData.initialise(sample_data, compressor=None)
        orig_ts = tskit.load(known_ts_path)
        eval_util.build_simulated_ancestors(sample_data, ancestor_data, orig_ts)
        ancestor_data.finalise()
        ancestors_ts = tsinfer.match_ancestors(
            sample_data, ancestor_data, engine=engine
        )
        ts = tsinfer.match_samples(
            sample_data, ancestors_ts, engine=engine, simplify=True
        )
    ts.dump(options.output)
Example #8
0
 def verify(self, cmd, num_samples):
     # Run the CLI in a subprocess, writing into a scratch directory, and load
     # the output before that directory is removed.
     with tempfile.TemporaryDirectory() as scratch:
         out_path = pathlib.Path(scratch) / "output.trees"
         shell_line = f"python3 -m stdpopsim -q {cmd} {out_path}"
         subprocess.run(shell_line, shell=True, check=True)
         # TODO: the str() conversion is noted as unnecessary from tskit
         # 0.1.5 onwards; remove it then.
         ts = tskit.load(str(out_path))
     self.assertEqual(ts.num_samples, num_samples)
     # The last provenance record must validate and hold the arguments that
     # were passed on the command line.
     last_record = ts.provenance(ts.num_provenances - 1).record
     prov = json.loads(last_record)
     tskit.validate_provenance(prov)
     recorded_args = prov["parameters"]["args"]
     self.assertEqual(recorded_args[0], "-q")
     self.assertEqual(recorded_args[1:-1], cmd.split())
Example #9
0
 def compare_python_api(self, input_ts, cmd, Ne, mutation_rate, method):
     # Date the tree sequence through the CLI, using files in a scratch
     # directory ...
     with tempfile.TemporaryDirectory() as scratch:
         scratch_dir = pathlib.Path(scratch)
         in_path = scratch_dir / "input.trees"
         input_ts.dump(in_path)
         out_path = scratch_dir / "output.trees"
         argv = f"date {in_path} {out_path} {cmd}".split()
         cli.tsdate_main(argv)
         cli_ts = tskit.load(out_path)
     # ... and through the Python API, then require identical node times.
     api_ts = tsdate.date(
         input_ts, Ne=Ne, mutation_rate=mutation_rate, method=method
     )
     print(api_ts.tables.nodes.time, cli_ts.tables.nodes.time)
     times_match = np.array_equal(
         api_ts.tables.nodes.time, cli_ts.tables.nodes.time
     )
     self.assertTrue(times_match)
Example #10
0
def run_mutate(args):
    """
    Add mutations to the tree sequence in ``args.tree_sequence`` and save it.

    ``msprime.sim_mutations`` is called with ``keep=True`` and
    ``discrete_genome=True``; the rate, seed, time bounds and model come from
    ``args``. The result is dumped to ``args.output_tree_sequence``.
    """
    source_ts = tskit.load(args.tree_sequence)
    sim_options = {
        "rate": args.mutation_rate,
        "random_seed": args.random_seed,
        "keep": True,
        "start_time": args.start_time,
        "end_time": args.end_time,
        "discrete_genome": True,
        "model": args.model,
    }
    mutated_ts = msprime.sim_mutations(tree_sequence=source_ts, **sim_options)
    mutated_ts.dump(args.output_tree_sequence)
Example #11
0
def combined_ts_constrained_samples(args):
    """
    Attach site times from a dated tree sequence to a sample data file.

    Loads the sample data in ``args.high_cov`` and the tree sequence in
    ``args.dated_ts``, derives site times with ``tsdate.sites_time_from_ts``,
    applies them with ``tsdate.add_sampledata_times``, prints three summary
    counts and writes a finalised copy of the result to ``args.output``.
    """
    samples = tsinfer.load(args.high_cov)
    dated_ts = tskit.load(args.dated_ts)
    ts_site_times = tsdate.sites_time_from_ts(dated_ts)
    constrained = tsdate.add_sampledata_times(samples, ts_site_times)

    # Record number of constrained sites
    total_sites = ts_site_times.shape[0]
    print("Total number of sites: ", total_sites)
    lower_bounds = samples.min_site_times(individuals_only=True)
    print("Number of ancient lower bounds: ", np.sum(lower_bounds != 0))
    differs = constrained.sites_time[:] != ts_site_times
    print("Number of corrected times: ", np.sum(differs))

    output_copy = constrained.copy(args.output)
    output_copy.finalise()
Example #12
0
def run_infer(args):
    """
    Infer a tree sequence from the sample data file ``args.samples`` and save it.

    The sample data is loaded once. If loading fails with a
    ``FileFormatError`` and the file turns out to be loadable by ``tskit``,
    a ``FileFormatError`` with a more helpful message is raised instead;
    otherwise the original error is re-raised. The inferred tree sequence is
    written to the path returned by ``get_output_trees_path``.

    :raises exceptions.FileFormatError: If ``args.samples`` is not a sample
        data file.
    """
    setup_logging(args)
    try:
        sample_data = tsinfer.SampleData.load(args.samples)
    except exceptions.FileFormatError as e:
        # Check if the user has tried to infer a tree sequence, a common basic mistake
        try:
            tskit.load(args.samples)
        except tskit.FileFormatError:
            raise e  # Re-raise the original error
        raise exceptions.FileFormatError(
            "Expecting a sample data file, not a tree sequence (you can create one "
            "via the Python function `tsinfer.SampleData.from_tree_sequence()`)."
        ) from e
    # The sample data loaded in the try block is used directly; the file is
    # not read a second time.
    ts = tsinfer.infer(
        sample_data,
        progress_monitor=args.progress,
        num_threads=args.num_threads,
    )
    output_trees = get_output_trees_path(args.output_trees, args.samples)
    logger.info("Writing output tree sequence to {}".format(output_trees))
    ts.dump(output_trees)
    summarise_usage()
Example #13
0
    def test_index_columns(self):
        ts = migration_example()
        ts.dump(self.temp_file)
        with kastore.load(self.temp_file) as store:
            pristine = dict(store)

        removal_key = "indexes/edge_removal_order"
        insertion_key = "indexes/edge_insertion_order"

        # Deleting both index columns, or either one alone, must be reported
        # as a LibraryError on load.
        deletion_cases = (
            (removal_key, insertion_key),
            (removal_key,),
            (insertion_key,),
        )
        for dropped_keys in deletion_cases:
            mangled = dict(pristine)
            for key in dropped_keys:
                del mangled[key]
            kastore.dump(mangled, self.temp_file)
            with pytest.raises(exceptions.LibraryError):
                tskit.load(self.temp_file)

        # Cutting an index column down to one element must be reported as a
        # FileFormatError on load.
        for truncated_key in (insertion_key, removal_key):
            mangled = dict(pristine)
            mangled[truncated_key] = mangled[truncated_key][:1]
            kastore.dump(mangled, self.temp_file)
            with pytest.raises(exceptions.FileFormatError):
                tskit.load(self.temp_file)
Example #14
0
 def verify(self, cmd, num_samples, seed=1):
     # Run the CLI in-process with logging setup patched out; it must produce
     # no output on either stream.
     with tempfile.TemporaryDirectory() as scratch:
         out_path = pathlib.Path(scratch) / "output.trees"
         argv = f"{cmd} -q -o {out_path} --seed={seed}".split()
         with mock.patch("stdpopsim.cli.setup_logging"):
             out, err = capture_output(cli.stdpopsim_main, argv)
         self.assertEqual(len(err), 0)
         self.assertEqual(len(out), 0)
         ts = tskit.load(str(out_path))
     self.assertEqual(ts.num_samples, num_samples)
     # The first provenance record must carry the seed that was requested.
     first_record = json.loads(ts.provenance(0).record)
     recorded_seed = first_record["parameters"]["random_seed"]
     self.assertEqual(recorded_seed, seed)
Example #15
0
def run_dump_macs(args):
    """
    Write a macs formatted file so we can import into pbwt.

    Two header lines are printed, followed by one tab-separated ``SITE:``
    line per variant holding its index, its position divided by the sequence
    length, the constant 0.0 and the decoded genotypes.
    """
    ts = tskit.load(args.tree_sequence)
    sample_size = ts.get_sample_size()
    seq_length = ts.get_sequence_length()
    print("COMMAND:\tnot_macs {} {}".format(sample_size, seq_length))
    print("SEED:\tASEED")
    for var in ts.variants(as_bytes=True):
        columns = (
            "SITE:",
            var.index,
            var.position / seq_length,
            0.0,
            var.genotypes.decode(),
        )
        print(*columns, sep="\t")
Example #16
0
def compute_stats(ts_file):
    """
    Evaluate every function in ``stats_functions`` on a tree sequence file.

    Returns a dict mapping each key of ``stats_functions`` to the value its
    function returned, leaving out keys whose function returned ``None``.
    If a function raises, a warning naming ``ts_file`` is emitted and the
    exception is re-raised.
    """
    ts = tskit.load(ts_file)
    results = {}
    for name, stat_fn in stats_functions.items():
        try:
            value = stat_fn(ts)
        except Exception:
            # Name the file so that the failure is easier to trace.
            warning(f"{ts_file} triggered exception")
            raise
        if value is None:
            continue
        results[name] = value
    return results
Example #17
0
def main():
    """Run main function."""
    options = parse_args(sys.argv[1:])

    # ---- Collect the command-line values --------------------------------
    tree_path = options.tree
    out_name = options.outfile
    if out_name is None:
        # Fall back to the last path component of the tree file.
        out_name = path.split(tree_path)[-1]
    ref_file = options.ref
    tar_spec = options.tar
    windowed = options.gnn_windows
    pop_ids = options.pop_ids[0]

    # ---- Load the tree sequence -----------------------------------------
    ts = tskit.load(tree_path)  # load tree
    print("tree loaded")

    # ---- Reference sets for the comparison ------------------------------
    if not ref_file:
        # No custom file: one reference set per population.
        ref_nodes = [
            ts.samples(population=pop) for pop in range(ts.num_populations)
        ]
    else:
        # Custom file: each line is one comma-separated reference set.
        ref_nodes = []
        with open(ref_file) as handle:
            for row in handle:
                fields = row.split(",")
                assert len(fields) > 1, "recheck delimiter should be ,"
                ref_nodes.append([int(field) for field in fields])

    # ---- Target nodes ---------------------------------------------------
    if tar_spec is None:
        tar_nodes = ts.samples()
    elif tar_spec.isnumeric():
        # A number selects the samples of that population.
        tar_nodes = ts.samples(population=int(tar_spec))
    else:
        # Anything else is a file of comma-separated node ids, flattened
        # into a single list.
        tar_nodes = []
        with open(tar_spec) as handle:
            for row in handle:
                fields = row.split(",")
                assert len(fields) > 1, "recheck delimiter should be ,"
                tar_nodes.extend([int(field) for field in fields])

    # ---- Run ------------------------------------------------------------
    runner = gnn_windows_fx if windowed else gnn_fx
    runner(out_name, ts, ref_nodes, tar_nodes, pop_ids)
Example #18
0
    def test_simulate(self):
        # Remember any existing SLIM setting so it can be put back after the
        # first run.
        slim_before = os.environ.get("SLIM")
        with tempfile.NamedTemporaryFile(mode="w") as out:
            self.docmd(f"--slim-path slim HomSap -o {out.name}")
            ts = tskit.load(out.name)
        self.assertEqual(ts.num_samples, 10)
        self.assertTrue(all(tree.num_roots == 1 for tree in ts.trees()))

        if slim_before is not None:
            os.environ["SLIM"] = slim_before
        else:
            del os.environ["SLIM"]

        # The same model with recapitation, and then also burn-in, disabled.
        flag_sets = (
            "--slim-no-recapitation",
            "--slim-no-recapitation --slim-no-burnin",
        )
        for flags in flag_sets:
            with tempfile.NamedTemporaryFile(mode="w") as out:
                self.docmd(f"{flags} HomSap -o {out.name}")
                ts = tskit.load(out.name)
            self.assertEqual(ts.num_samples, 10)

        # verify sample counts for a multipopulation demographic model
        with tempfile.NamedTemporaryFile(mode="w") as out:
            cmd = (f"-e slim HomSap -o {out.name} -l 0.00001 -c chr1 -s 1234 -q "
                   "-d OutOfAfrica_3G09 0 0 8").split()
            capture_output(stdpopsim.cli.stdpopsim_main, cmd)
            ts = tskit.load(out.name)
        self.assertEqual(ts.num_populations, 3)
        per_population = [0] * 3
        for node in ts.samples():
            per_population[ts.get_population(node)] += 1
        for population, expected in enumerate((0, 0, 8)):
            self.assertEqual(per_population[population], expected)
        self.assertTrue(all(tree.num_roots == 1 for tree in ts.trees()))
Example #19
0
def run_compress(args):
    """
    Compress every tree sequence file listed in ``args.files``.

    Each file is loaded with ``tskit.load`` (a ``tskit.FileFormatError`` ends
    the program via ``exit``) and compressed with ``tszip.compress`` into a
    file named after the input plus ``args.suffix``. ``check_output`` is
    called on the output path before compressing and ``remove_input`` on the
    input path afterwards.
    """
    setup_logging(args)
    for name in args.files:
        logger.info("Compressing {}".format(name))
        try:
            ts = tskit.load(name)
        except tskit.FileFormatError as ffe:
            exit("Error loading '{}': {}".format(name, ffe))
        logger.debug("Loaded tree sequence")
        source = pathlib.Path(name)
        target = pathlib.Path(name + args.suffix)
        check_output(target, args)
        tszip.compress(ts, target, variants_only=args.variants_only)
        remove_input(source, args)
Example #20
0
def test_accuracy(reps):
    """
    Simulate ``reps`` replicates and compare mutation ages across methods.

    For every replicate a tree sequence is simulated with msprime, passed to
    ``run_simulation``, ``generate_samples`` / ``samplesdata_to_ages`` and the
    external Relate binaries. The true, dated, dated-inferred and Relate tree
    sequences are dumped to ``*_<rep>.trees`` files in the working directory,
    and the accumulated comparison table is rewritten to ``compare_df`` after
    each replicate.

    :param int reps: Number of replicates to run.
    """
    n = 100
    Ne = 10000
    mut_rate = 1e-8
    rec_rate = 1e-8
    theta = 4 * Ne * mut_rate
    rho = 4 * Ne * rec_rate
    length = 1e5

    compare_df_master = pd.DataFrame(
        columns=['truth', 'tsdate', 'tsdate_inferred', 'relate', 'geva'])

    # Location of the Relate installation; this is a machine-specific path.
    relate_path = "/Users/anthonywohns/Documents/mcvean_group/software/relate_v1.0.13_MacOSX/"

    def run_relate(ts, relate_path):
        # Convert the input to Relate's format, run Relate, then convert its
        # output to a tree sequence ("compare.trees" is loaded by the caller).
        # NOTE(review): `path` is not defined inside this function; presumably
        # it is a module-level name -- confirm.
        haps_file = relate_path + "age_compare/compare.haps"
        sample_file = relate_path + "age_compare/compare.sample"
        subprocess.check_output([
            relate_path + "bin/RelateFileFormats",
            "--mode", "ConvertFromVcf",
            "--haps", haps_file,
            "--sample", sample_file,
            "-i", path + "tmp/test",
        ])
        subprocess.check_output([
            relate_path + "bin/Relate",
            "--mode", "All",
            "-m", str(mut_rate),
            "-N", "20000",
            "--haps", haps_file,
            "--sample", sample_file,
            "--seed", "1",
            "-o", "compare",
            "--map", relate_path + "genetic_map.txt",
        ])
        subprocess.check_output([
            relate_path + "bin/RelateFileFormats",
            "--mode", "ConvertToTreeSequence",
            "-i", "compare",
            "-o", "compare",
        ])

    for rep in range(reps):
        vanilla_ts = msprime.simulate(
            sample_size=n, Ne=Ne, mutation_rate=mut_rate,
            recombination_rate=rec_rate, length=length)

        ts, dated_ts, dated_inferred_ts_mut = run_simulation(
            vanilla_ts, n, Ne, theta, rho)

        # Run GEVA
        samples = generate_samples(ts, 'testing')
        ages = samplesdata_to_ages(
            samples, Ne=Ne, length=length, mut_rate=mut_rate,
            rec_rate=rec_rate, filename=str("test"))

        # Run Relate on simulated data
        run_relate(ts, relate_path)
        relate_ts = tskit.load('compare.trees')
        # Rebuild the node table with flag 1 on the first n nodes and flag 0
        # on all others, keeping the node times.
        table_collection = relate_ts.dump_tables()
        table_collection.nodes.set_columns(
            flags=np.array(np.concatenate(
                [np.repeat(1, n),
                 np.repeat(0, relate_ts.num_nodes - n)]),
                dtype='uint32'), time=relate_ts.tables.nodes.time)
        relate_ts_fixed = table_collection.tree_sequence()

        ts.dump('true_ts_' + str(rep) + '.trees')
        dated_ts.dump('dated_ts_' + str(rep) + '.trees')
        dated_inferred_ts_mut.dump('dated_inferred_ts_' + str(rep) + '.trees')
        relate_ts_fixed.dump('relate_ts_' + str(rep) + '.trees')
        compare_dict = compare_muts(n, ts, dated_ts,
                                    dated_inferred_ts_mut, ages)
        compare_df_master = pd.concat([compare_df_master, compare_dict])
        # Rewritten on every replicate so partial results are kept on disk.
        compare_df_master.to_csv("compare_df")
def run_tsinfer_mismatch(
    sample_fn,
    length,
    num_threads=1,
    inject_real_ancestors_from_ts_fn=None,
    rho=None,
    error_probability=None,
):
    """
    Run the tsinfer executable on ``sample_fn`` and load the result.

    The command is timed with ``time_cmd`` and writes to a temporary file,
    which is loaded before it is deleted. Only ``sample_fn`` is used; the
    remaining parameters are accepted but not read by this function.

    :return: Tuple of the loaded tree sequence and the two values returned
        by ``time_cmd``.
    """
    with tempfile.NamedTemporaryFile("w+") as out_file:
        out_name = out_file.name
        command = [tsinfer_executable, out_name, "-s", "infer", sample_fn]
        cpu_time, memory_use = time_cmd(command)
        inferred_ts = tskit.load(out_name)
    return inferred_ts, cpu_time, memory_use
Example #22
0
 def test_duplicate_positions(self):
     # Every mutation position in a legacy file is overwritten with 0; the
     # upgrade run with "-d" must then succeed silently and give one site.
     ts = msprime.simulate(10, mutation_rate=10)
     for version in [2, 3]:
         tskit.dump_legacy(ts, self.legacy_file_name, version=version)
         # The context manager closes the HDF5 file even if the write fails.
         with h5py.File(self.legacy_file_name, "r+") as root:
             root['mutations/position'][:] = 0
         stdout, stderr = capture_output(
             cli.tskit_main,
             ["upgrade", "-d", self.legacy_file_name, self.current_file_name])
         self.assertEqual(stdout, "")
         tsp = tskit.load(self.current_file_name)
         self.assertEqual(tsp.sample_size, ts.sample_size)
         self.assertEqual(tsp.num_sites, 1)
Example #23
0
 def verify(self, cmd, num_samples, seed=1):
     # Run the CLI in a subprocess with the current interpreter and load the
     # output before the scratch directory is removed.
     with tempfile.TemporaryDirectory() as scratch:
         out_path = pathlib.Path(scratch) / "output.trees"
         shell_line = (
             f"{sys.executable} -m stdpopsim -q {cmd} -o {out_path} -s {seed}"
         )
         subprocess.run(shell_line, shell=True, check=True)
         ts = tskit.load(str(out_path))
     assert ts.num_samples == num_samples
     # The last provenance record must validate and hold the command line.
     last_record = ts.provenance(ts.num_provenances - 1).record
     prov = json.loads(last_record)
     tskit.validate_provenance(prov)
     recorded_args = prov["parameters"]["args"]
     assert recorded_args[0] == "-q"
     assert recorded_args[-1] == str(seed)
     assert recorded_args[-2] == "-s"
     assert recorded_args[1:-4] == cmd.split()
Example #24
0
 def verify_equal_length_columns(self, ts, table):
     ts.dump(self.temp_file)
     with kastore.load(self.temp_file) as store:
         pristine = dict(store)
     candidates = [name for name in pristine if name.startswith(table)]
     # Drop each "_offset" column together with the column it belongs to,
     # and drop metadata schema entries. Iterate over a snapshot because the
     # list is modified along the way.
     for name in tuple(candidates):
         if name.endswith("_offset"):
             base_name = name.partition("_offset")[0]
             candidates.remove(base_name)
             candidates.remove(name)
         if "metadata_schema" in name:
             candidates.remove(name)
     # The columns that are left should all have the same length, so an empty
     # or shortened column must be rejected on load.
     for name in candidates:
         shortened = pristine[name][:-1]
         for bad_val in ([], shortened):
             mangled = dict(pristine)
             mangled[name] = bad_val
             kastore.dump(mangled, self.temp_file)
             with pytest.raises(exceptions.FileFormatError):
                 tskit.load(self.temp_file)
Example #25
0
def load_tree(tree_file):
    """Load a tree sequence from a file.

    Parameters
    ----------
    tree_file : str
        Path of the tree sequence file, passed to ``tskit.load``.

    Returns
    -------
    The object returned by ``tskit.load``.

    """
    tree_sequence = tskit.load(tree_file)
    return tree_sequence
Example #26
0
 def test_conversion(self):
     original = msprime.simulate(10)
     for version in (2, 3):
         tskit.dump_legacy(original, self.legacy_file_name, version=version)
         upgrade_args = [
             "upgrade", self.legacy_file_name, self.current_file_name]
         out, err = capture_output(cli.tskit_main, upgrade_args)
         upgraded = tskit.load(self.current_file_name)
         # The upgrade must not print anything on either stream.
         self.assertEqual(out, "")
         self.assertEqual(err, "")
         # Only quick sanity checks that this is the same tree sequence.
         self.assertEqual(
             original.get_sample_size(), upgraded.get_sample_size())
         self.assertEqual(original.num_edges, upgraded.num_edges)
         self.assertEqual(
             original.get_num_trees(), upgraded.get_num_trees())
Example #27
0
def ts_to_seg(path, n=None):
    """
    Converts a tree sequence into a seg file for use by :code:`smcsmc.run_smcsmcs()`. This is especially
    useful if you are simulating data from :code:`msprime` and would like to directly
    use it in :code:`smcsmc`. For details of how to do this, please see the tutorial on simulation using :code:`msprime`.

    The output is written next to the input file. When :code:`n` is :code:`None` the output name is the
    input file name with :code:`.seg` appended. Otherwise one file is written per entry of :code:`n`,
    named with that entry as a prefix and :code:`.seg` as a suffix, after pruning the tree sequence
    with :code:`smcsmc.utils.prune_tree_sequence`. This code is adapted from PopSim.

    :param str path: Full file path to the tree sequence created by :code:`ts.dump`.
    :param list n: If more than one sample of haplotypes is being analysed simultaneously, provide it here as a list. Otherwise, simply provide the number of haplotypes as a single-element list.
    """
    dirr = os.path.dirname(path)
    name_parts = os.path.basename(path).split(".")

    if n is None:
        ts = tskit.load(path)
        output = os.path.join(dirr, ".".join(name_parts) + ".seg")
        _write_seg_file(ts, output)
    else:
        for sample_size in n:
            ts = smcsmc.utils.prune_tree_sequence(path, sample_size)
            prefixed_parts = [str(sample_size)] + name_parts
            output = os.path.join(dirr, ".".join(prefixed_parts) + ".seg")
            _write_seg_file(ts, output)
    return None


def _write_seg_file(ts, output):
    """Write one tab-separated line per variant of ``ts`` that lies beyond the previous one."""
    # The context manager closes the file even if iterating the variants fails.
    with open(output, "w") as fi:
        prev = 1
        for var in ts.variants():
            cur = int(var.site.position)
            if cur > prev:
                geno = ''.join(map(str, var.genotypes))
                fi.write(f"{prev}\t{cur-prev}\t{geno}\n")
            # Updated even when nothing was written for this variant.
            prev = cur
Example #28
0
def run_list(args):
    """
    Print a summary of the file in ``args.path``.

    The file is first tried as a tskit tree sequence and summarised with
    ``summarise_tree_sequence``. If tskit raises ``FileFormatError`` it is
    loaded with ``tsinfer.load`` instead, and either its ``info`` attribute
    (when ``args.storage`` is set) or the object itself is printed.
    """
    setup_logging(args)
    try:
        ts = tskit.load(args.path)
    except tskit.FileFormatError:
        ts = None
    if ts is not None:
        summarise_tree_sequence(args.path, ts)
        return
    tsinfer_file = tsinfer.load(args.path)
    print(tsinfer_file.info if args.storage else tsinfer_file)
def run_tsdate(input_fn, Ne, mut_rate, timepoints, method):
    """
    Run the tsdate executable on ``input_fn`` and load the dated result.

    The command is run with the current Python interpreter, timed with
    ``time_cmd`` and writes to a temporary file, which is loaded before it
    is deleted. ``timepoints`` and ``method`` are accepted but not passed on
    to the command.

    :return: Tuple of the loaded tree sequence and the two values returned
        by ``time_cmd``.
    """
    with tempfile.NamedTemporaryFile("w+") as out_file:
        out_name = out_file.name
        command = [sys.executable, tsdate_executable, input_fn, out_name, str(Ne)]
        command += ["--mutation-rate", str(mut_rate)]
        cpu_time, memory_use = time_cmd(command)
        dated_ts = tskit.load(out_name)
    return dated_ts, cpu_time, memory_use
Example #30
0
    def load(cls, path):
        '''
        Load a :class:`SlimTreeSequence` from a .trees file on disk.

        The tree sequence is loaded with ``tskit.load``. If the file's
        kastore holds a ``reference_sequence/data`` entry, its bytes are
        decoded as ASCII and passed on as the reference sequence; otherwise
        ``None`` is passed.

        :param string path: The path to a .trees file.
        :rtype SlimTreeSequence:
        '''
        ts = tskit.load(path)
        # extract the reference sequence from the kastore; the context manager
        # closes the store once the data has been read
        reference_sequence = None
        with kastore.load(path) as kas:
            if 'reference_sequence/data' in kas:
                int_rs = kas['reference_sequence/data']
                # tobytes() replaces the deprecated ndarray.tostring()
                reference_sequence = int_rs.tobytes().decode('ascii')
        return cls(ts, reference_sequence)
Example #31
0
def run_dump_macs(args):
    """
    Write a macs formatted file so we can import into pbwt.

    Output goes to stdout: a ``COMMAND:`` line with the sample size and
    sequence length, a ``SEED:`` line, then one tab-separated ``SITE:`` line
    per variant.
    """
    ts = tskit.load(args.tree_sequence)
    num_samples = ts.get_sample_size()
    total_length = ts.get_sequence_length()
    header = "COMMAND:\tnot_macs {} {}".format(num_samples, total_length)
    print(header)
    print("SEED:\tASEED")
    for site_variant in ts.variants(as_bytes=True):
        relative_position = site_variant.position / total_length
        genotype_text = "{}".format(site_variant.genotypes.decode())
        print(
            "SITE:",
            site_variant.index,
            relative_position,
            0.0,
            genotype_text,
            sep="\t",
        )
Example #32
0
def load_from_stream(q_err, q_out, file_in):
    """
    Read tree sequences from ``file_in`` one after another with
    ``tskit.load`` and put each onto ``q_out`` until ``EOFError`` is raised.
    Any other exception is put onto ``q_err`` as an
    ``(exception, formatted traceback)`` pair instead of propagating.
    """
    try:
        with open(file_in, "rb") as stream:
            while True:
                try:
                    loaded = tskit.load(stream)
                except EOFError:
                    # Nothing more to read from the stream.
                    break
                else:
                    q_out.put(loaded)
    except Exception as err:
        formatted = traceback.format_exc()
        q_err.put((err, formatted))
Example #33
0
def run_dump_mutations(args):
    """Write the mutations of ``args.tree_sequence`` to stdout via ``dump_text``."""
    ts = tskit.load(args.tree_sequence)
    text_options = {"mutations": sys.stdout, "precision": args.precision}
    ts.dump_text(**text_options)
Example #34
0
def run_dump_vcf(args):
    """Write ``args.tree_sequence`` to stdout as VCF, passing ``args.ploidy``."""
    ts = tskit.load(args.tree_sequence)
    destination = sys.stdout
    ts.write_vcf(destination, args.ploidy)
Example #35
0
def run_dump_variants(args):
    """Print one line per variant: its position, a tab, then its decoded genotypes."""
    ts = tskit.load(args.tree_sequence)
    for var in ts.variants(as_bytes=True):
        genotype_text = var.genotypes.decode()
        print(var.position, genotype_text, sep="\t")
Example #36
0
def run_dump_haplotypes(args):
    """Print every haplotype of ``args.tree_sequence`` on its own line."""
    ts = tskit.load(args.tree_sequence)
    for haplotype in ts.haplotypes():
        print(haplotype)
Example #37
0
def run_dump_newick(args):
    """Print the newick string of every tree, using ``args.precision``."""
    ts = tskit.load(args.tree_sequence)
    digits = args.precision
    for tree in ts.trees():
        print(tree.newick(precision=digits))
Example #38
0
def run_dump_sites(args):
    """Write the sites of ``args.tree_sequence`` to stdout via ``dump_text``."""
    ts = tskit.load(args.tree_sequence)
    text_options = {"sites": sys.stdout, "precision": args.precision}
    ts.dump_text(**text_options)